#!/bin/bash
# Private scratch directory for all intermediate files. Abort immediately if
# it cannot be created: later steps build paths (and an "rm -rf") from
# $PREFIX, which must therefore never be empty.
PREFIX=$(mktemp -t -d img2pdf.XXXXXXXXXX) || {
  echo "$0: unable to create a temporary directory" >&2
  exit 1
}
# Remove the scratch directory on every exit path.
trap 'rm -rf -- "$PREFIX"' EXIT
# Split unquoted expansions on newlines only, so spaces in file names do not
# break them apart.
IFS=$'\n'
# Directory the script was started from.
directory="$(pwd)"
# Defaults for the command-line options.
infile=
outfile=
webstatus="false"
deskew="false"
res="300"
renamebyzxing="false"
#Supported languages: eng ger fra rus swe spa ita ruseng ukr srp hrv pol dan por dut cze rum hun bul slo lav lit est tur
lang="eng"
# Print the command-line help text on stdout, then terminate the script
# with exit status 2.
displayusage() {
  printf '%s\n' \
    "Usage: $0 -i input-file -o output-file" \
    "input-file must be .pdf" \
    "Required" \
    " -i input file" \
    " -o output file" \
    "Options" \
    " -m web status monitoring" \
    " -d deskew and despeckel (much slower)" \
    " -r output resolution" \
    " -z enable barcode renaming"
  exit 2
}
# Publish the progress log for the web status page.
#   $1 - "true" copies /tmp/watchocr.log to /var/www/status/watchocr.log;
#        any other value makes the call a no-op that returns 0.
updatestatus() {
  [ "$1" = "true" ] || return 0
  cp /tmp/watchocr.log /var/www/status/watchocr.log
}
# Move a finished file into place without overwriting an existing file.
# If the destination already exists, the new file is stored in the same
# directory as <epoch-seconds>.<destination basename>; otherwise it is moved
# to the destination as given. The path actually used is appended to
# /tmp/watchocr.log.
# Arguments:
#   $1 - file to move
#   $2 - desired destination path
# Returns: the status of mv if the move fails (nothing is logged then).
function dontoverwrite {
  local src=$1 dest=$2
  local outbase outdir target
  outbase=$(basename -- "$dest")
  outdir=$(dirname -- "$dest")
  if [ -e "$dest" ]; then
    # Keep the existing file; park the new one beside it under a
    # timestamp-prefixed name.
    target="$outdir/$(date +%s).$outbase"
  else
    target=$dest
  fi
  mv -- "$src" "$target" || return
  printf 'Finished moving %s \n\n' "$target" >> /tmp/watchocr.log
}
# Parse the command line. -i, -o and -r take a value (input file, output
# file, output resolution); -m, -d and -z switch on web status monitoring,
# deskewing and barcode renaming. Anything getopts rejects, or a missing
# input or output file, shows the usage text.
while getopts 'i:o:r:mdz' flag; do
  case "$flag" in
    i) infile="$OPTARG" ;;
    o) outfile="$OPTARG" ;;
    r) res="$OPTARG" ;;
    m) webstatus="true" ;;
    d) deskew="true" ;;
    z) renamebyzxing="true" ;;
    *) displayusage ;;
  esac
done
[[ -n "$infile" && -n "$outfile" ]] || displayusage
#begin processing file

# Append one progress message to /tmp/watchocr.log in the log's established
# format: the message, a trailing space, then a blank line.
#   $1 - message text
function logprogress {
  printf '%s \n\n' "$1" >> /tmp/watchocr.log
}

basenm=$(basename -- "$infile")
# Work on a private copy inside the scratch directory.
cp -- "$infile" "$PREFIX/" || exit 1
infile=$PREFIX/$basenm
pagecount=$(pdfinfo "$infile" | awk '/^Pages:/ {print $2; exit}')
# Without a usable page count there is nothing to split; stop here instead
# of running the pipeline on a page that may not exist.
if ! [[ "$pagecount" =~ ^[1-9][0-9]*$ ]]; then
  echo "$0: could not determine the page count of $infile" >&2
  exit 1
fi
logprogress "Splitting $infile into $pagecount images"
updatestatus "$webstatus"

# Start empty so a value inherited from the environment cannot trigger the
# barcode-rename branch below.
zxingresult=

#process each page in a file
for ((page = 1; page <= pagecount; page++)); do
  pagefile=$PREFIX/$basenm.$page
  pagepdf=$pagefile.hocr.pdf
  # outline.txt lists the per-page PDFs (double-quoted, one per line) and is
  # handed to gs as an @argument file when the pages are recombined.
  printf '"%s"\n' "$pagepdf" >> "$PREFIX/outline.txt"

  #convert to tif
  #use unpaper to deskew if requested
  if [ "$deskew" = "true" ]; then
    gs -dNOPAUSE "-r$res" -dBATCH "-dFirstPage=$page" "-dLastPage=$page" \
      -sDEVICE=ppmraw "-sOutputFile=$pagefile" "$infile"
    unpaper "$pagefile" "$pagefile.ppm"
    convert "$pagefile.ppm" "$pagefile.tif"
    rm -f -- "$pagefile.ppm" "$pagefile"
    mv -- "$pagefile.tif" "$pagefile"
  else
    gs -dNOPAUSE "-r$res" -dBATCH "-dFirstPage=$page" "-dLastPage=$page" \
      -sDEVICE=tiff24nc "-sOutputFile=$pagefile" "$infile"
  fi
  logprogress "OCRing $pagefile"
  updatestatus "$webstatus"

  #convert to BMP
  econvert -i "$pagefile" -o "$PREFIX/scan.bmp"
  optimize2bw -n -i "$PREFIX/scan.bmp" -o "$PREFIX/bw.bmp"

  #OCR output
  cuneiform -l "$lang" -f hocr -o "$PREFIX/hocr.html" "$PREFIX/bw.bmp"

  #Capture any blank pages
  # No hocr.html means there is no text layer to embed; fall back to an
  # image-only PDF for this page.
  if [ -e "$PREFIX/hocr.html" ]; then
    hocr2pdf -s -i "$PREFIX/scan.bmp" -o "$pagepdf" < "$PREFIX/hocr.html"
  else
    tiff2pdf -o "$pagepdf" "$pagefile"
    echo "Cuneiform returned blank page."
  fi

  #Search for barcode filename with zxing
  # Only the first page is scanned. The jar paths are relative to the zxing
  # install directory, so run java from there inside a subshell; that keeps
  # this script's working directory unchanged and skips java if the cd fails.
  if [ "$renamebyzxing" = "true" ] && ((page == 1)); then
    (
      cd /usr/local/share/zxing-1.6 &&
        java -cp javase/javase.jar:core/core.jar \
          com.google.zxing.client.j2se.CommandLineRunner \
          --try_harder "$PREFIX/bw.bmp"
    ) > "$PREFIX/zxingout.txt"
    # The third line of the zxing output is taken as the barcode text.
    zxingresult="$(sed -n "3 p" "$PREFIX/zxingout.txt")"
  fi

  #cleanup
  rm -rf -- "$pagefile" "$PREFIX/scan.bmp" "$PREFIX/bw.bmp" "$PREFIX/hocr.html"
  echo "$pagepdf created"
done

logprogress "Combining $outfile"
updatestatus "$webstatus"

#recombine pages
searchable=$PREFIX/$basenm.searchable.pdf
gs -dBATCH -dNOPAUSE "-r$res" -sDEVICE=pdfwrite "-sOutputFile=$searchable" \
  "@$PREFIX/outline.txt"
chmod 777 "$searchable"

#rename based on barcode output
if [ -z "$zxingresult" ]; then
  dontoverwrite "$searchable" "$outfile"
  logprogress "Finished processing $outfile"
else
  # Barcode paths are always treated as relative to the current directory.
  # NOTE(review): the barcode text is untrusted input and is used as a path
  # as-is; ".." components are not rejected here.
  if [ "${zxingresult:0:1}" = "/" ]; then
    zxingresult=.$zxingresult
  fi
  mknewdir=$(dirname -- "$zxingresult")
  mkdir -p -- "$mknewdir"
  chmod 777 "$mknewdir"
  #add pdf file extension if missing
  # Match on the final ".pdf" suffix; looking at the text after the first
  # dot misfires on names such as "./dir/name.pdf" or "a.b.pdf".
  if [[ "$zxingresult" != *.pdf ]]; then
    zxingresult=$zxingresult.pdf
  fi
  mv -- "$searchable" "$zxingresult"
  logprogress "Finished processing $zxingresult"
fi

#cleanup
# ${PREFIX:?} aborts rather than expanding to "/*" should PREFIX be empty.
rm -rf -- "${PREFIX:?}"/*
updatestatus "$webstatus"