From 7ffd054124334e6f5087573616ba44785863fbb2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=B4=A2=E6=AF=85?= <944102712@qq.com>
Date: Tue, 9 Jun 2020 11:31:15 +0800
Subject: [PATCH 1/2] make it work on mac

if you wanna use this on mac:
1. brew install imagemagick
2.brew install tesseract
---
 extract_text_mac.sh | 71 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 extract_text_mac.sh

diff --git a/extract_text_mac.sh b/extract_text_mac.sh
new file mode 100644
index 0000000..29cbd85
--- /dev/null
+++ b/extract_text_mac.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+#
+# Extract plain text from every PDF in a directory (macOS variant).
+#
+# For each PDF, embedded text is first extracted with pdftotext. If that
+# yields fewer than MIN_WORDS words, the PDF is converted to a multi-page
+# TIFF with ImageMagick and OCR is performed on it with Tesseract.
+#
+# Usage: extract_text_mac.sh PDF_DIR OUT_DIR TIFF_DIR [OCR_LANG]
+#   PDF_DIR   Directory containing the PDFs.
+#   OUT_DIR   Directory for the .txt files (created if missing).
+#   TIFF_DIR  Directory for intermediate TIFFs (created if missing).
+#   OCR_LANG  Tesseract language; see man tesseract > LANGUAGES.
+#             Defaults to eng.
+
+# Print directory $1 with exactly one trailing slash so that file names can
+# be appended directly. An empty argument means the current directory.
+as_dir() {
+  local dir=${1:-.}
+  printf '%s/' "${dir%/}"
+}
+
+# Print the number of words in file $1 (0 if it cannot be read). The
+# arithmetic expansion strips the padding that BSD wc adds on macOS.
+count_words() {
+  echo $(( $(wc -w < "$1") ))
+}
+
+BPATH=$(as_dir "$1")  # Directory containing PDFs.
+OPATH=$(as_dir "$2")  # Output directory.
+TPATH=$(as_dir "$3")  # Directory for intermediate TIFF files.
+# Deliberately not called LANG: assigning to LANG would change the locale
+# of every command this script runs.
+OCR_LANG=${4:-eng}
+MIN_WORDS=5  # Number of words required to accept pdftotext result.
+
+# If the output directories do not exist, attempt to create them.
+mkdir -p "$OPATH" "$TPATH" || exit 1
+
+# Without nullglob a directory with no PDFs would yield the literal pattern.
+shopt -s nullglob
+for FILEPATH in "$BPATH"*.pdf; do
+  NAME=${FILEPATH##*/}
+  # Path to text file to be created. E.g. ./txt/myfile.pdf.txt
+  OUTFILE=$OPATH$NAME.txt
+  TIFF=$TPATH$NAME.tiff
+  # The text file will be created regardless of whether text is
+  # successfully extracted.
+  touch "$OUTFILE"
+
+  # First attempt to use pdftotext to extract embedded text.
+  echo -n "Attempting pdftotext extraction..."
+  pdftotext "$FILEPATH" "$OUTFILE"
+  FILESIZE=$(count_words "$OUTFILE")
+  echo "extracted $FILESIZE words."
+
+  # If that fails, try Tesseract.
+  if [[ $FILESIZE -lt $MIN_WORDS ]]; then
+    echo -n "Attempting OCR extraction..."
+    # Use imagemagick to convert the PDF to a high-res multi-page TIFF.
+    convert -density 300 "$FILEPATH" -depth 8 -strip -background white \
+      -alpha off "$TIFF"
+    # Then use Tesseract to perform OCR on the TIFF. Tesseract appends
+    # ".txt" to its output base itself, so strip ours to land on $OUTFILE.
+    tesseract "$TIFF" "${OUTFILE%.txt}" -l "$OCR_LANG"
+    # We don't need the intermediate TIFF file, so discard it.
+    rm -f "$TIFF"
+    FILESIZE=$(count_words "$OUTFILE")
+    echo "extracted $FILESIZE words."
+  fi
+done

From 4fce617d8eb5ca1f171a10ce5159270c44bddd35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=B4=A2=E6=AF=85?= <944102712@qq.com>
Date: Tue, 9 Jun 2020 11:32:28 +0800
Subject: [PATCH 2/2] work on mac

python shellocr_mac.py
---
 shell_ocr_mac.py | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 shell_ocr_mac.py

diff --git a/shell_ocr_mac.py b/shell_ocr_mac.py
new file mode 100644
index 0000000..55a11dd
--- /dev/null
+++ b/shell_ocr_mac.py
@@ -0,0 +1,11 @@
+import subprocess
+import shlex
+# shlex is used for passing parameters
+
+# subprocess.call(['bash', './extract_text_mac.sh'])
+# Run above if no parameters are required
+
+# Call the macOS script added alongside this file (it is the one that takes
+# a TIFF directory as its third argument). It is run through bash because
+# it is checked in without the executable bit.
+subprocess.call(shlex.split('bash ./extract_text_mac.sh ./pdf/ ./txt/ ./tiff/'))