From ceffcff29826ae0388c9472c02ce335527f041a3 Mon Sep 17 00:00:00 2001
From: Mat Werber <werberm@amazon.com>
Date: Sat, 31 Aug 2019 19:51:29 -0400
Subject: [PATCH 1/3] URL decode S3 object's key name received in event

---
 functions/textract-job-submit-async.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/functions/textract-job-submit-async.py b/functions/textract-job-submit-async.py
index 6771270..8e0e63e 100644
--- a/functions/textract-job-submit-async.py
+++ b/functions/textract-job-submit-async.py
@@ -2,7 +2,7 @@
 import time
 import boto3
 from datetime import datetime
-
+from urllib.parse import unquote_plus
 
 def attachExternalBucketPolicy(externalBucketName):
     iam = boto3.client('iam')
@@ -387,7 +387,7 @@ def lambda_handler(event, context):
         record, = event["Records"]        
         print(record)
         bucket = record['s3']['bucket']['name']
-        document = record['s3']['object']['key']
+        document = unquote_plus(record['s3']['object']['key'])
     else:
         bucket = event['ExternalBucketName']
         document = event['ExternalDocumentPrefix']   

From 54d35900c59df468fce29de2850c92259829ba5f Mon Sep 17 00:00:00 2001
From: Mat Werber <werberm@amazon.com>
Date: Sat, 31 Aug 2019 19:52:38 -0400
Subject: [PATCH 2/3] Use S3 eTag for Textract ClientRequestToken and JobTag

---
 functions/textract-job-submit-async.py | 32 +++++++++++++++-----------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/functions/textract-job-submit-async.py b/functions/textract-job-submit-async.py
index 8e0e63e..dc63718 100644
--- a/functions/textract-job-submit-async.py
+++ b/functions/textract-job-submit-async.py
@@ -116,7 +116,7 @@ def updateResponse(givenjson, updatejson, override = False):
             givenjson[key] = updatejson[key]
     return givenjson
 
-def submitDocumentAnalysisJob(bucket, document, tokenPrefix, retryInterval, maxRetryAttempt, topicArn, roleArn, table_name):
+def submitDocumentAnalysisJob(bucket, document, eTag, tokenPrefix, retryInterval, maxRetryAttempt, topicArn, roleArn, table_name):
 
     s3 = boto3.resource('s3')
     textract = boto3.client('textract')
@@ -129,21 +129,23 @@ def submitDocumentAnalysisJob(bucket, document, tokenPrefix, retryInterval, maxR
     document_path = document[:document.rfind("/")] if document.find("/") >= 0 else ""
     document_name = document[document.rfind("/")+1:document.rfind(".")] if document.find("/") >= 0 else document[:document.rfind(".")]
     document_type = document[document.rfind(".")+1:].upper()
+    client_request_token = "{}-{}".format(tokenPrefix, eTag)
+    job_tag = "{}-{}".format(tokenPrefix, eTag)
 
-    print("DocumentAnalysisJob: ClientRequestToken = {}-{}".format(tokenPrefix, document.replace("/","_").replace(".","-")))
+    print("DocumentAnalysisJob: ClientRequestToken = {}".format(client_request_token))
     print("DocumentAnalysisJob: DocumentLocation = 'S3Object': 'Bucket': {}, 'Name': {}".format(bucket, document))
     print("DocumentAnalysisJob: NotificationChannel = 'SNSTopicArn': {},'RoleArn': {}".format(topicArn, roleArn))
-    print("DocumentAnalysisJob: JobTag = {}-{}".format(tokenPrefix, document[document.rfind("/")+1:document.rfind(".")]))
+    print("DocumentAnalysisJob: JobTag = {}".format(job_tag))
             
     #Submit Document Anlysis job to Textract to extract text features    
     while retryCount >= 0 and retryCount < maxRetryAttempt:
         try:
             response = textract.start_document_analysis(
-                                    ClientRequestToken = "{}-{}".format(tokenPrefix, document.replace("/","_").replace(".","-")),
+                                    ClientRequestToken = client_request_token,
                                     DocumentLocation={'S3Object': {'Bucket': bucket, 'Name': document}},
                                     FeatureTypes=["TABLES", "FORMS"],
                                     NotificationChannel={'SNSTopicArn': topicArn,'RoleArn': roleArn},
-                                    JobTag = "{}-{}".format(tokenPrefix, document[document.rfind("/")+1:document.rfind(".")]))
+                                    JobTag = job_tag)
             jobId = response['JobId']
             jobStartTimeStamp = datetime.strptime(response['ResponseMetadata']['HTTPHeaders']['date'], '%a, %d %b %Y %H:%M:%S %Z').timestamp()
             print("Textract Request: {} submitted at {} with JobId - {}".format(
@@ -242,7 +244,7 @@ def submitDocumentAnalysisJob(bucket, document, tokenPrefix, retryInterval, maxR
 
     return jsonresponse
         
-def submitTextDetectionJob(bucket, document, tokenPrefix, retryInterval, maxRetryAttempt, topicArn, roleArn, table_name):
+def submitTextDetectionJob(bucket, document, eTag, tokenPrefix, retryInterval, maxRetryAttempt, topicArn, roleArn, table_name):
 
     s3 = boto3.resource('s3')
     textract = boto3.client('textract')
@@ -255,20 +257,22 @@ def submitTextDetectionJob(bucket, document, tokenPrefix, retryInterval, maxRetr
     document_path = document[:document.rfind("/")] if document.find("/") >= 0 else ""
     document_name = document[document.rfind("/")+1:document.rfind(".")] if document.find("/") >= 0 else document[:document.rfind(".")]
     document_type = document[document.rfind(".")+1:].upper()
-
-    print("TextDetectionsJob: ClientRequestToken = {}-{}".format(tokenPrefix, document.replace("/","_").replace(".","-")))
+    client_request_token = "{}-{}".format(tokenPrefix, eTag)
+    job_tag = "{}-{}".format(tokenPrefix, eTag)
+    
+    print("TextDetectionsJob: ClientRequestToken = {}".format(client_request_token))
     print("TextDetectionJob: DocumentLocation = 'S3Object': 'Bucket': {}, 'Name': {}".format(bucket, document))
     print("TextDetectionJob: NotificationChannel = 'SNSTopicArn': {},'RoleArn': {}".format(topicArn, roleArn))
-    print("TextDetectionJob: JobTag = {}-{}".format(tokenPrefix, document[document.rfind("/")+1:document.rfind(".")]))
+    print("TextDetectionJob: JobTag = {}".format(client_request_token))
     
     #Submit Text Detection job to Textract to detect lines of text    
     while retryCount >= 0 and retryCount < maxRetryAttempt:
         try:
             response = textract.start_document_text_detection(
-                                    ClientRequestToken = "{}-{}".format(tokenPrefix, document.replace("/","_").replace(".","-")),
+                                    ClientRequestToken = client_request_token,
                                     DocumentLocation={'S3Object': {'Bucket': bucket, 'Name': document}},
                                     NotificationChannel={'SNSTopicArn': topicArn,'RoleArn': roleArn},
-                                    JobTag = "{}-{}".format(tokenPrefix, document[document.rfind("/")+1:document.rfind(".")]))
+                                    JobTag = job_tag)
             jobId = response['JobId']
             jobStartTimeStamp = datetime.strptime(response['ResponseMetadata']['HTTPHeaders']['date'], '%a, %d %b %Y %H:%M:%S %Z').timestamp()
             print("Textract Request: {} submitted at {} with JobId - {}".format(
@@ -377,6 +381,7 @@ def lambda_handler(event, context):
     external_bucket = ""
     bucket = ""
     document = ""
+    eTag = ""
     bucketAccessPolicyArn = None
     
     if 'ExternalBucketName' in event:
@@ -388,6 +393,7 @@ def lambda_handler(event, context):
         print(record)
         bucket = record['s3']['bucket']['name']
         document = unquote_plus(record['s3']['object']['key'])
+        eTag = record['s3']['object']['eTag']
     else:
         bucket = event['ExternalBucketName']
         document = event['ExternalDocumentPrefix']   
@@ -396,14 +402,14 @@ def lambda_handler(event, context):
         print("Bucket and/or Document not specified, nothing to do.")
         return {}
 
-    documentAnalysisResponse = submitDocumentAnalysisJob(bucket, document, 
+    documentAnalysisResponse = submitDocumentAnalysisJob(bucket, document, eTag,
                                                         documentAnalysisTokenPrefix, 
                                                         retryInterval, maxRetryAttempt, 
                                                         documentAnalysisTopicArn, 
                                                         roleArn, table_name)
     print("DocumentAnalysisResponse = {}".format(documentAnalysisResponse))
 
-    textDetectionResponse = submitTextDetectionJob(bucket, document, 
+    textDetectionResponse = submitTextDetectionJob(bucket, document, eTag,
                                                     textDetectionTokenPrefix, 
                                                     retryInterval, maxRetryAttempt, 
                                                     textDetectionTopicArn, 

From ce4b6bb79da2b8ef1e7424dfd811abdc73afcfef Mon Sep 17 00:00:00 2001
From: Mat Werber <werberm@amazon.com>
Date: Mon, 2 Sep 2019 18:54:29 -0400
Subject: [PATCH 3/3] check if Relationships key exists in PAGE blocks

---
 functions/textract_util.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/functions/textract_util.py b/functions/textract_util.py
index 1bda27d..eeb3689 100644
--- a/functions/textract_util.py
+++ b/functions/textract_util.py
@@ -420,15 +420,18 @@ def extractTextBody(blocks):
     document_text = {}
     for page in blocks['PAGE']:
         document_text['Page-{0:02d}'.format(page['Page'])] = {}
-        print("Page-{} contains {} Lines".format(page['Page'], len(page['Relationships'][0]['Ids'])))
-        total_line += len(page['Relationships'][0]['Ids'])
-        for i, line_id in enumerate(page['Relationships'][0]['Ids']):
-            page_line = None
-            for line in blocks['LINE']:
-                if line['Id'] == line_id:
-                    page_line = line
-                    break
-            document_text['Page-{0:02d}'.format(page['Page'])]['Line-{0:04d}'.format(i+1)] = {}
-            document_text['Page-{0:02d}'.format(page['Page'])]['Line-{0:04d}'.format(i+1)]['Text'] = page_line['Text']
+        if 'Relationships' in page.keys():
+          print("Page-{} contains {} Lines".format(page['Page'], len(page['Relationships'][0]['Ids'])))
+          total_line += len(page['Relationships'][0]['Ids'])
+          for i, line_id in enumerate(page['Relationships'][0]['Ids']):
+              page_line = None
+              for line in blocks['LINE']:
+                  if line['Id'] == line_id:
+                      page_line = line
+                      break
+              document_text['Page-{0:02d}'.format(page['Page'])]['Line-{0:04d}'.format(i+1)] = {}
+              document_text['Page-{0:02d}'.format(page['Page'])]['Line-{0:04d}'.format(i+1)]['Text'] = page_line['Text']
+        else:
+          print("Page-{} contains no lines".format(page['Page']))
     print(total_line)
     return document_text, total_line