#! /bin/bash

# Private scratch directory that holds every intermediate file.
# Abort immediately if it cannot be created: with an empty PREFIX the later
# "$PREFIX/..." paths would resolve against the filesystem root.
PREFIX=$(mktemp -d -t img2pdf.XXXXXXXXXX) || {
	echo "$0: unable to create temporary directory" >&2
	exit 1
}
# Remove the scratch directory on every exit path. The path is quoted inside
# the trap so it is never word-split or glob-expanded when the trap fires.
trap 'rm -rf -- "$PREFIX"' EXIT

# Split unquoted expansions on newlines only, so that file names containing
# spaces stay in one piece.
IFS=$'\n'

# Directory the script was started from.
directory=$(pwd)

# Defaults; the command-line options may override them.
infile=""
outfile=""
webstatus=false
deskew=false
res=300
renamebyzxing=false

# OCR language.
# Supported languages: eng ger fra rus swe spa ita ruseng ukr srp hrv pol dan por dut cze rum hun bul slo lav lit est tur
lang=eng

# Print the command-line help text to stdout, one line per entry, and then
# terminate the script with exit status 2.
displayusage() {
	printf '%s\n' \
		"Usage: $0 -i input-file -o output-file" \
		"input-file must be .pdf" \
		"Required" \
		"     -i input file" \
		"     -o output file" \
		"Options" \
		"     -m web status monitoring" \
		"     -d deskew and despeckel (much slower)" \
		"     -r output resolution" \
		"     -z enable barcode renaming"
	exit 2
}

# Publish the current log for web status monitoring.
#   $1 - "true" copies /tmp/watchocr.log to /var/www/status/watchocr.log;
#        any other value (or no value) makes the call a no-op.
updatestatus() {
	[[ "$1" == "true" ]] || return 0
	cp /tmp/watchocr.log /var/www/status/watchocr.log
}

# Move a finished file into place without overwriting an existing file.
#   $1 - file to move
#   $2 - desired destination path
# If $2 already exists, the file is stored in the same directory as
# <epoch-seconds>.<basename of $2> instead. A successful move is recorded in
# /tmp/watchocr.log; a failed move returns mv's status and logs nothing.
dontoverwrite() {
	local src=$1 dest=$2
	local outbase outdir dtstamp target

	outbase=$(basename -- "$dest")
	outdir=$(dirname -- "$dest")

	if [[ -e "$dest" ]]; then
		# Keep the existing file and put the new one beside it under a
		# timestamp-prefixed name.
		dtstamp=$(date +%s)
		target="$outdir/$dtstamp.$outbase"
	else
		target=$dest
	fi

	mv -- "$src" "$target" || return
	printf '%s\n' "Finished moving $target <br>" >> /tmp/watchocr.log
}

# Parse the command line.
#   -i FILE  input file          -o FILE  output file
#   -r RES   output resolution   -m       web status monitoring
#   -d       deskew              -z       barcode renaming
# Any option getopts rejects prints the usage text and exits.
while getopts 'i:o:r:mdz' flag; do
	case "$flag" in
		i) infile=$OPTARG ;;
		o) outfile=$OPTARG ;;
		r) res=$OPTARG ;;
		m) webstatus="true" ;;
		d) deskew="true" ;;
		z) renamebyzxing="true" ;;
		*) displayusage ;;
	esac
done

# Both the input and the output file are mandatory.
[[ -n "$infile" && -n "$outfile" ]] || displayusage


#begin processing file

# Work on a copy inside the scratch directory; stop if the copy cannot be
# made, because every later step reads the copy.
basenm=$(basename -- "$infile")
cp -- "$infile" "$PREFIX/" || {
	echo "$0: unable to copy $infile to $PREFIX" >&2
	exit 1
}
infile="$PREFIX/$basenm"

# Take the second field of pdfinfo's "Pages:" line. The match is anchored to
# the start of the line so other lines that merely contain "Pages:" are ignored.
pagecount=$(pdfinfo "$infile" | awk '/^Pages:/ {print $2}')

# Refuse to continue without a numeric page count (pdfinfo failed or printed
# nothing usable); the page loop depends on it.
if ! [[ "$pagecount" =~ ^[0-9]+$ ]]; then
	echo "$0: unable to determine the page count of $infile" >&2
	exit 1
fi

echo "Splitting $infile into $pagecount images <br>" >> /tmp/watchocr.log

updatestatus "$webstatus"

#process each page in the file
for ((page = 1; page <= pagecount; page++)); do
	# Rasterised image of this page, and the single-page PDF built from it.
	pagefile="$PREFIX/$basenm.$page"
	pagepdf="$pagefile.hocr.pdf"

	# Record the page PDF, wrapped in double quotes, in the list file that
	# gs is given as an @file when the pages are recombined.
	printf '"%s"\n' "$pagepdf" >> "$PREFIX/outline.txt"

	#convert to tif
	#use unpaper to deskew if requested
	if [[ "$deskew" == "true" ]]; then
		gs -dNOPAUSE "-r$res" -dBATCH "-dFirstPage=$page" "-dLastPage=$page" -sDEVICE=ppmraw "-sOutputFile=$pagefile" "$infile"
		unpaper "$pagefile" "$pagefile.ppm"
		convert "$pagefile.ppm" "$pagefile.tif"
		rm -f "$pagefile.ppm" "$pagefile"
		mv "$pagefile.tif" "$pagefile"
	else
		gs -dNOPAUSE "-r$res" -dBATCH "-dFirstPage=$page" "-dLastPage=$page" -sDEVICE=tiff24nc "-sOutputFile=$pagefile" "$infile"
	fi

	echo "OCRing $pagefile <br>" >> /tmp/watchocr.log
	updatestatus "$webstatus"

	#convert to BMP
	econvert -i "$pagefile" -o "$PREFIX/scan.bmp"
	optimize2bw -n -i "$PREFIX/scan.bmp" -o "$PREFIX/bw.bmp"

	#OCR output
	cuneiform -l "$lang" -f hocr -o "$PREFIX/hocr.html" "$PREFIX/bw.bmp"

	#Capture any blank pages: when cuneiform leaves no hOCR file, the page
	#image is converted to PDF directly.
	if [[ -e "$PREFIX/hocr.html" ]]; then
		hocr2pdf -s -i "$PREFIX/scan.bmp" -o "$pagepdf" < "$PREFIX/hocr.html"
	else
		tiff2pdf -o "$pagepdf" "$pagefile"
		echo "Cuneiform returned blank page."
	fi

	#Search for barcode filename with zxing (first page only)
	if [[ "$renamebyzxing" == "true" && "$page" == "1" ]]; then
		# The relative classpath needs the zxing directory as working
		# directory. Running in a subshell keeps the script's own working
		# directory untouched, and java only starts if the cd succeeded.
		(
			cd /usr/local/share/zxing-1.6 &&
				java -cp javase/javase.jar:core/core.jar com.google.zxing.client.j2se.CommandLineRunner --try_harder "$PREFIX/bw.bmp"
		) > "$PREFIX/zxingout.txt"
		# The third line of the decoder output is used as the barcode text.
		zxingresult=$(sed -n '3p' "$PREFIX/zxingout.txt")
	fi

	#cleanup
	rm -rf "$pagefile" "$PREFIX/scan.bmp" "$PREFIX/bw.bmp" "$PREFIX/hocr.html"
	echo "$pagepdf created"
done

echo "Combining $outfile <br>" >> /tmp/watchocr.log
updatestatus "$webstatus"

#recombine pages: gs reads the list of page PDFs from outline.txt
gs -dBATCH -dNOPAUSE "-r$res" -sDEVICE=pdfwrite "-sOutputFile=$PREFIX/$basenm.searchable.pdf" "@$PREFIX/outline.txt"

# Stop here if the merge produced nothing, so that no "finished" message is
# logged for a file that does not exist.
if [[ ! -s "$PREFIX/$basenm.searchable.pdf" ]]; then
	echo "$0: failed to assemble $PREFIX/$basenm.searchable.pdf" >&2
	exit 1
fi

chmod 777 "$PREFIX/$basenm.searchable.pdf"

#rename based on barcode output

# Relative output paths are resolved against the start directory, so stop
# rather than deliver the result somewhere unexpected.
cd "$directory" || {
	echo "$0: unable to return to $directory" >&2
	exit 1
}

if [[ -z "$zxingresult" ]]; then
	# No barcode text: deliver to the requested output file.
	dontoverwrite "$PREFIX/$basenm.searchable.pdf" "$outfile"
	echo "Finished processing $outfile <br>" >> /tmp/watchocr.log
else
	# A barcode path starting with "/" is re-rooted under the current
	# directory by prefixing ".".
	if [[ "$zxingresult" == /* ]]; then
		zxingresult=".$zxingresult"
	fi

	mknewdir=$(dirname -- "$zxingresult")
	mkdir -p -- "$mknewdir"
	chmod 777 "$mknewdir"

	#add pdf file extension if missing
	# Only the end of the name is tested. Looking at the text after the FIRST
	# dot would misjudge "./dir/name.pdf" and "a.b.pdf" and append a second
	# ".pdf" to them.
	barcodefile=$zxingresult
	if [[ "$barcodefile" != *.pdf ]]; then
		barcodefile="$barcodefile.pdf"
	fi

	mv -- "$PREFIX/$basenm.searchable.pdf" "$barcodefile"
	echo "Finished processing $barcodefile <br>" >> /tmp/watchocr.log
fi

#cleanup
# ${PREFIX:?} aborts instead of expanding to nothing, so this can never
# become "rm -rf /*" when PREFIX is empty or unset.
rm -rf -- "${PREFIX:?}"/*

updatestatus "$webstatus"
