#!/bin/bash
#######################################
# OCR the images embedded in a PDF and print the recognised text.
# Arguments: forwarded verbatim to pdfimages (normally a single PDF path);
#            the image root is appended by this function.
# Outputs:   recognised text of every extracted image, in image order, on
#            stdout. tesseract's own chatter is discarded.
# Returns:   pdfimages' exit status (0 on success); 1 if no scratch
#            directory could be created.
#######################################
function ocrpdf2txt
{
  local workdir image status=0

  # Extract into a private scratch directory: leftovers from earlier runs in
  # the current directory must not be OCR'd, and nothing should be left behind.
  workdir=$(mktemp -d) || return 1

  # Best effort: remember a pdfimages failure but still OCR whatever it wrote.
  pdfimages "$@" "$workdir/img" || status=$?

  # The glob is expanded once, before tesseract adds its .txt files, and it
  # sorts by image number, so the text comes out in extraction order.
  for image in "$workdir"/img*; do
    case "$image" in
      # pdfimages writes colour/grey images as .ppm and monochrome as .pbm.
      *.ppm | *.pbm) ;;
      # Anything else (including the unexpanded pattern when nothing was
      # extracted) is skipped.
      *) continue ;;
    esac
    [[ -f "$image" ]] || continue

    # tesseract appends ".txt" to the output base it is given.
    tesseract "$image" "$image" > /dev/null 2>&1
    if [[ -f "$image.txt" ]]; then
      cat "$image.txt"
    fi
  done

  rm -rf -- "$workdir"
  return "$status"
}
#######################################
# Print the text of a PDF, falling back to OCR when it has no text layer.
# Arguments: $1 - path to the PDF file
# Outputs:   the document text on stdout; as a side effect pdftotext's
#            result is left in "<pdf>.txt" next to the input.
# Returns:   2 if no path is given, otherwise the status of the command
#            that produced the output (cat or ocrpdf2txt).
#######################################
function pdf2txt
{
  if [[ $# -lt 1 || -z "$1" ]]; then
    printf 'usage: pdf2txt FILE.pdf\n' >&2
    return 2
  fi

  local pdf=$1
  local txt="$pdf.txt"

  # Use pdftotext's result only if it succeeded AND produced at least one
  # line. A failed run may leave a stale "<pdf>.txt" from an earlier
  # invocation behind, which must not be mistaken for fresh output.
  if pdftotext "$pdf" "$txt" \
    && [[ -s "$txt" ]] \
    && (( $(wc -l < "$txt") > 0 )); then
    cat -- "$txt"
  else
    # No line of text could be extracted: OCR the embedded images instead.
    ocrpdf2txt "$pdf"
  fi
}
# Restrict command lookup to the system binary directories. /bin stays first
# so anything that resolved before still resolves to the same file; /usr/bin
# is added because on systems without a merged /usr the tools used here
# (wc, pdftotext, pdfimages, tesseract) are typically installed there.
export PATH=/bin:/usr/bin

# Make sure the working directory exists so the output redirection below
# cannot fail on a missing directory.
mkdir -p /pdf

# Convert the fixed input /pdf/pdf.pdf and store the text in /pdf/pdf.txt.
pdf2txt /pdf/pdf.pdf > /pdf/pdf.txt
