From 7ffd054124334e6f5087573616ba44785863fbb2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=B4=A2=E6=AF=85?= <944102712@qq.com>
Date: Tue, 9 Jun 2020 11:31:15 +0800
Subject: [PATCH 1/2] make it work on mac

To use this on macOS, install the dependencies first:
1. brew install imagemagick
2. brew install tesseract
---
 extract_text_mac.sh | 49 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 extract_text_mac.sh

diff --git a/extract_text_mac.sh b/extract_text_mac.sh
new file mode 100644
index 0000000..29cbd85
--- /dev/null
+++ b/extract_text_mac.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# Extract plain text from every PDF in a directory, falling back to OCR.
+#
+# Usage: extract_text_mac.sh BPATH OPATH TPATH [LANG]
+# The three paths are used as plain prefixes, so each must end with '/'.
+# Requires pdftotext, ImageMagick (convert) and tesseract on the PATH.
+BPATH=$1  # Path to directory containing PDFs.
+OPATH=$2  # Path to output directory.
+TPATH=$3  # Path to scratch directory for intermediate TIFF files.
+# Tesseract language code (see man tesseract > LANGUAGES); defaults to eng.
+# Kept out of $LANG so the locale seen by the child programs is not clobbered.
+OCR_LANG=${4:-eng}
+MIN_WORDS=5     # Number of words required to accept pdftotext result.
+# If the output or scratch directory does not exist, attempt to create it.
+mkdir -p "$OPATH" "$TPATH"
+for FILEPATH in "$BPATH"*.pdf; do
+    # Extracts plain text content from a PDF.
+    #
+    # First, attempts to extract embedded text with pdftotext. If that fails,
+    #  converts the PDF to TIFF and attempts to perform OCR with Tesseract.
+    #
+    # An unmatched glob is left as the literal pattern; skip it.
+    [ -e "$FILEPATH" ] || continue
+    NAME=$(basename "$FILEPATH")
+    # Path to text file to be created. E.g. ./myfile.pdf.txt
+    OUTFILE="$OPATH$NAME.txt"
+    touch "$OUTFILE"    # The text file will be created regardless of whether
+                        #  text is successfully extracted.
+    # First attempt to use pdftotext to extract embedded text.
+    echo -n "Attempting pdftotext extraction..."
+    pdftotext "$FILEPATH" "$OUTFILE"
+    FILESIZE=$(wc -w < "$OUTFILE")
+    echo "extracted $FILESIZE words."
+    # If that fails, try Tesseract.
+    if [[ $FILESIZE -lt $MIN_WORDS ]]; then
+        TIFF="$TPATH$NAME.tiff"
+        echo -n "Attempting OCR extraction..."
+        # Use imagemagick to convert the PDF to a high-res multi-page TIFF.
+        convert -density 300 "$FILEPATH" -depth 8 -strip -background white \
+                -alpha off "$TIFF"
+        # Tesseract appends ".txt" to its output base, so strip ours first;
+        # the language has to be passed with -l.
+        tesseract "$TIFF" "${OUTFILE%.txt}" -l "$OCR_LANG"
+        # We don't need the intermediate TIFF file, so discard it.
+        rm -f "$TIFF"
+        FILESIZE=$(wc -w < "$OUTFILE")
+        echo "extracted $FILESIZE words."
+    fi
+done

From 4fce617d8eb5ca1f171a10ce5159270c44bddd35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=B4=A2=E6=AF=85?= <944102712@qq.com>
Date: Tue, 9 Jun 2020 11:32:28 +0800
Subject: [PATCH 2/2] work on mac

Run with: python shell_ocr_mac.py
---
 shell_ocr_mac.py | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 shell_ocr_mac.py

diff --git a/shell_ocr_mac.py b/shell_ocr_mac.py
new file mode 100644
index 0000000..55a11dd
--- /dev/null
+++ b/shell_ocr_mac.py
@@ -0,0 +1,8 @@
+"""Run the macOS text-extraction script over ./pdf/ (text to ./txt/)."""
+import shlex
+import subprocess
+
+# The script is added with mode 100644 (not executable), so it is run through
+# bash explicitly; shlex.split turns the command line into an argv list.
+COMMAND = 'bash ./extract_text_mac.sh ./pdf/ ./txt/ ./tiff/'
+subprocess.call(shlex.split(COMMAND))