updates to support dir processing

AnjanaRita · AnjanaRita · commit 5b4611ac5796 · 2022-07-02T15:27:59.000+05:30
diff --git a/ocrpy/io/reader.py b/ocrpy/io/reader.py
@@ -17,14 +17,32 @@ class DocumentReader:
     credentials = field(default=None)
 
     def read(self):
-        if self.file.endswith(".png") or self.file.endswith(".jpg"):
+        file_type = self.get_file_type()  # 'image', 'pdf' or 'unknown'
+        if file_type == 'image':
             return self._read_image(self.file)
+        elif file_type == 'pdf':
+            return self._read_pdf(self.file)
+        else:
+            raise ValueError("File type not supported")  # reached when file_type is 'unknown'
 
+    def get_file_type(self):
+        """Return 'image', 'pdf' or 'unknown' from the file extension, ignoring case."""
+        name = self.file.lower()  # so 'SCAN.JPG' and 'Doc.PDF' are recognised too
+        if name.endswith((".png", ".jpg", ".jpeg")):
+            return "image"
-        elif self.file.endswith(".pdf"):
-            return self._read_pdf(self.file)
+        if name.endswith(".pdf"):
+            return "pdf"
+        return "unknown"
 
+    def get_storage_type(self):
+        """Return 'gs', 's3' or 'local' according to the URI scheme of self.file."""
+        location = self.file
+        if location.startswith("gs://"):
+            return 'gs'
+        if location.startswith("s3://"):
+            return 's3'
-        else:
-            raise ValueError("File type not supported")
+        # Anything without a recognised cloud scheme is treated as a local path.
+        return 'local'
 
     def _read_image(self, file):
         return self._read(file)
@@ -40,10 +58,11 @@ def _read(self, file):
         return file_data.read_bytes()
 
     def _get_client(self, file):
-        if file.startswith("gs://") and self.credentials:
+        storage_type = self.get_storage_type()
+        if storage_type == "gs" and self.credentials:
             client = GSClient(application_credentials=self.credentials)
 
-        elif file.startswith("s3://") and self.credentials:
+        elif storage_type == 's3' and self.credentials:
             load_dotenv(self.credentials)
             client = S3Client(aws_access_key_id=os.getenv(
                 'aws_access_key_id'), aws_secret_access_key=os.getenv('aws_secret_access_key'))
diff --git a/ocrpy/parsers/text/aws_text.py b/ocrpy/parsers/text/aws_text.py
@@ -1,5 +1,7 @@
 import os
 import boto3
+import time
+from cloudpathlib import AnyPath
 from dotenv import load_dotenv
 from attr import define, field
 from typing import List, Dict, Any
@@ -26,6 +28,36 @@ def aws_token_formator(token):
     return token
 
 
+def is_job_complete(client, job_id):
+    """Block until the Textract job leaves IN_PROGRESS, then return its JobStatus.
+
+    Polls get_document_text_detection once per second; there is no timeout.
+    """
+    status = "IN_PROGRESS"
+    while status == "IN_PROGRESS":
+        time.sleep(1)
+        status = client.get_document_text_detection(JobId=job_id)["JobStatus"]
+    return status
+
+
+def get_job_results(client, job_id):
+    """Collect every result page of a Textract text-detection job.
+
+    Follows the NextToken cursor until a response arrives without one and
+    returns the raw responses in the order they were fetched.
+    """
+    request = {"JobId": job_id}
+    pages = []
+    while True:
+        page = client.get_document_text_detection(**request)
+        pages.append(page)
+        cursor = page.get("NextToken")
+        if not cursor:
+            break
+        request["NextToken"] = cursor
+    return pages
+
+
 @define
 class AwsLineSegmenter(AbstractLineSegmenter):
     """
@@ -41,6 +73,7 @@ def lines(self) -> List[Dict[str, Any]]:
         lines = []
         for line in self.ocr["Blocks"]:
             if line["BlockType"] == "LINE":
+
                 idx = line.get("Id")
                 text = line.get("Text")
                 region = aws_region_extractor(line)
@@ -57,8 +90,9 @@ def _aws_token_extractor(self, relationship):
         for i in relationship:
             for idx in i.get('Ids'):
                 token = self.mapper.get(idx)
-                token = aws_token_formator(token)
-                tokens.append(token)
+                if token:
+                    token = aws_token_formator(token)
+                    tokens.append(token)
         return tokens
 
 
@@ -80,37 +114,53 @@ class AwsTextOCR(AbstractTextOCR):
     def __attrs_post_init__(self):
         if self.env_file:
             load_dotenv(self.env_file)
-        self.document = self.reader.read()
         region = os.getenv('region_name')
         access_key = os.getenv('aws_access_key_id')
         secret_key = os.getenv('aws_secret_access_key')
         self.textract = boto3.client('textract', region_name=region,
                                      aws_access_key_id=access_key, aws_secret_access_key=secret_key)
-        # self.ocr = textract.detect_document_text(
-        #    Document={'Bytes': self.document})
 
     @property
     def parse(self):
         return self._process_data()
 
     def _process_data(self):
-        is_image = False
-        if isinstance(self.document, bytes):
-            self.document = [self.document]
-            is_image = True
-
         result = {}
-        for index, document in enumerate(self.document):
-            ocr = self.textract.detect_document_text(
-                Document={'Bytes': document})
-            data = dict(text=self._get_text(ocr), lines=self._get_lines(
-                ocr), blocks=self._get_blocks(ocr), tokens=self._get_tokens(ocr))
+        ocr = self._get_ocr()  # raw Textract responses to parse
+        if not isinstance(ocr, list):  # defensive: wrap a single response
+            ocr = [ocr]
+        for index, page in enumerate(ocr):
+            print("Processing page {}".format(index))  # NOTE(review): writes to stdout; consider logging
+            data = dict(text=self._get_text(page), lines=self._get_lines(
+                page), blocks=self._get_blocks(page), tokens=self._get_tokens(page))
             result[index] = data
+        return result  # always a dict keyed by index, even for a single image
+
+    def _get_ocr(self):
+        """Return the raw Textract responses for the reader's file as a list.
+
+        Files on S3 are processed in place through the asynchronous job API;
+        a job that ends as FAILED raises RuntimeError. Any other file is read
+        through the reader and each returned document is sent synchronously.
+        """
+        if self.reader.get_storage_type() == 's3':
+            path = AnyPath(self.reader.file)
+            response = self.textract.start_document_text_detection(
+                DocumentLocation={'S3Object': {'Bucket': path.bucket, 'Name': path.key}})
+            job_id = response['JobId']
+            status = is_job_complete(self.textract, job_id)
+            if status == "FAILED":
+                # Surface the failure here instead of parsing a failed job's pages.
+                raise RuntimeError("Textract job {} failed".format(job_id))
+            ocr = get_job_results(self.textract, job_id)
-        if is_image:
-            return result[0]
         else:
-            return result
+            self.document = self.reader.read()
+            if isinstance(self.document, bytes):
+                # read() may return bare bytes (e.g. one image); handle it as one item.
+                self.document = [self.document]
+            ocr = [self.textract.detect_document_text(Document={'Bytes': document})
+                   for document in self.document]
+        return ocr
 
     def _get_blocks(self, ocr):
         try: