From ceffcff29826ae0388c9472c02ce335527f041a3 Mon Sep 17 00:00:00 2001 From: Mat Werber Date: Sat, 31 Aug 2019 19:51:29 -0400 Subject: [PATCH 1/3] URL decode S3 object's key name received in event --- functions/textract-job-submit-async.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/functions/textract-job-submit-async.py b/functions/textract-job-submit-async.py index 6771270..8e0e63e 100644 --- a/functions/textract-job-submit-async.py +++ b/functions/textract-job-submit-async.py @@ -2,7 +2,7 @@ import time import boto3 from datetime import datetime - +from urllib.parse import unquote_plus def attachExternalBucketPolicy(externalBucketName): iam = boto3.client('iam') @@ -387,7 +387,7 @@ def lambda_handler(event, context): record, = event["Records"] print(record) bucket = record['s3']['bucket']['name'] - document = record['s3']['object']['key'] + document = unquote_plus(record['s3']['object']['key']) else: bucket = event['ExternalBucketName'] document = event['ExternalDocumentPrefix'] From 54d35900c59df468fce29de2850c92259829ba5f Mon Sep 17 00:00:00 2001 From: Mat Werber Date: Sat, 31 Aug 2019 19:52:38 -0400 Subject: [PATCH 2/3] use s3 eTag for Textract request token and jobTag --- functions/textract-job-submit-async.py | 32 +++++++++++++++----------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/functions/textract-job-submit-async.py b/functions/textract-job-submit-async.py index 8e0e63e..dc63718 100644 --- a/functions/textract-job-submit-async.py +++ b/functions/textract-job-submit-async.py @@ -116,7 +116,7 @@ def updateResponse(givenjson, updatejson, override = False): givenjson[key] = updatejson[key] return givenjson -def submitDocumentAnalysisJob(bucket, document, tokenPrefix, retryInterval, maxRetryAttempt, topicArn, roleArn, table_name): +def submitDocumentAnalysisJob(bucket, document, eTag, tokenPrefix, retryInterval, maxRetryAttempt, topicArn, roleArn, table_name): s3 = boto3.resource('s3') textract = 
boto3.client('textract') @@ -129,21 +129,23 @@ def submitDocumentAnalysisJob(bucket, document, tokenPrefix, retryInterval, maxR document_path = document[:document.rfind("/")] if document.find("/") >= 0 else "" document_name = document[document.rfind("/")+1:document.rfind(".")] if document.find("/") >= 0 else document[:document.rfind(".")] document_type = document[document.rfind(".")+1:].upper() + client_request_token = "{}-{}".format(tokenPrefix, eTag) + job_tag = "{}-{}".format(tokenPrefix, eTag) - print("DocumentAnalysisJob: ClientRequestToken = {}-{}".format(tokenPrefix, document.replace("/","_").replace(".","-"))) + print("DocumentAnalysisJob: ClientRequestToken = {}".format(client_request_token)) print("DocumentAnalysisJob: DocumentLocation = 'S3Object': 'Bucket': {}, 'Name': {}".format(bucket, document)) print("DocumentAnalysisJob: NotificationChannel = 'SNSTopicArn': {},'RoleArn': {}".format(topicArn, roleArn)) - print("DocumentAnalysisJob: JobTag = {}-{}".format(tokenPrefix, document[document.rfind("/")+1:document.rfind(".")])) + print("DocumentAnalysisJob: JobTag = {}".format(job_tag)) #Submit Document Anlysis job to Textract to extract text features while retryCount >= 0 and retryCount < maxRetryAttempt: try: response = textract.start_document_analysis( - ClientRequestToken = "{}-{}".format(tokenPrefix, document.replace("/","_").replace(".","-")), + ClientRequestToken = client_request_token, DocumentLocation={'S3Object': {'Bucket': bucket, 'Name': document}}, FeatureTypes=["TABLES", "FORMS"], NotificationChannel={'SNSTopicArn': topicArn,'RoleArn': roleArn}, - JobTag = "{}-{}".format(tokenPrefix, document[document.rfind("/")+1:document.rfind(".")])) + JobTag = job_tag) jobId = response['JobId'] jobStartTimeStamp = datetime.strptime(response['ResponseMetadata']['HTTPHeaders']['date'], '%a, %d %b %Y %H:%M:%S %Z').timestamp() print("Textract Request: {} submitted at {} with JobId - {}".format( @@ -242,7 +244,7 @@ def submitDocumentAnalysisJob(bucket, 
document, tokenPrefix, retryInterval, maxR return jsonresponse -def submitTextDetectionJob(bucket, document, tokenPrefix, retryInterval, maxRetryAttempt, topicArn, roleArn, table_name): +def submitTextDetectionJob(bucket, document, eTag, tokenPrefix, retryInterval, maxRetryAttempt, topicArn, roleArn, table_name): s3 = boto3.resource('s3') textract = boto3.client('textract') @@ -255,20 +257,22 @@ def submitTextDetectionJob(bucket, document, tokenPrefix, retryInterval, maxRetr document_path = document[:document.rfind("/")] if document.find("/") >= 0 else "" document_name = document[document.rfind("/")+1:document.rfind(".")] if document.find("/") >= 0 else document[:document.rfind(".")] document_type = document[document.rfind(".")+1:].upper() - - print("TextDetectionsJob: ClientRequestToken = {}-{}".format(tokenPrefix, document.replace("/","_").replace(".","-"))) + client_request_token = "{}-{}".format(tokenPrefix, eTag) + job_tag = "{}-{}".format(tokenPrefix, eTag) + + print("TextDetectionsJob: ClientRequestToken = {}".format(client_request_token)) print("TextDetectionJob: DocumentLocation = 'S3Object': 'Bucket': {}, 'Name': {}".format(bucket, document)) print("TextDetectionJob: NotificationChannel = 'SNSTopicArn': {},'RoleArn': {}".format(topicArn, roleArn)) - print("TextDetectionJob: JobTag = {}-{}".format(tokenPrefix, document[document.rfind("/")+1:document.rfind(".")])) + print("TextDetectionJob: JobTag = {}".format(client_request_token)) #Submit Text Detection job to Textract to detect lines of text while retryCount >= 0 and retryCount < maxRetryAttempt: try: response = textract.start_document_text_detection( - ClientRequestToken = "{}-{}".format(tokenPrefix, document.replace("/","_").replace(".","-")), + ClientRequestToken = client_request_token, DocumentLocation={'S3Object': {'Bucket': bucket, 'Name': document}}, NotificationChannel={'SNSTopicArn': topicArn,'RoleArn': roleArn}, - JobTag = "{}-{}".format(tokenPrefix, 
document[document.rfind("/")+1:document.rfind(".")])) + JobTag = job_tag) jobId = response['JobId'] jobStartTimeStamp = datetime.strptime(response['ResponseMetadata']['HTTPHeaders']['date'], '%a, %d %b %Y %H:%M:%S %Z').timestamp() print("Textract Request: {} submitted at {} with JobId - {}".format( @@ -377,6 +381,7 @@ def lambda_handler(event, context): external_bucket = "" bucket = "" document = "" + eTag = "" bucketAccessPolicyArn = None if 'ExternalBucketName' in event: @@ -388,6 +393,7 @@ def lambda_handler(event, context): print(record) bucket = record['s3']['bucket']['name'] document = unquote_plus(record['s3']['object']['key']) + eTag = record['s3']['object']['eTag'] else: bucket = event['ExternalBucketName'] document = event['ExternalDocumentPrefix'] @@ -396,14 +402,14 @@ def lambda_handler(event, context): print("Bucket and/or Document not specified, nothing to do.") return {} - documentAnalysisResponse = submitDocumentAnalysisJob(bucket, document, + documentAnalysisResponse = submitDocumentAnalysisJob(bucket, document, eTag, documentAnalysisTokenPrefix, retryInterval, maxRetryAttempt, documentAnalysisTopicArn, roleArn, table_name) print("DocumentAnalysisResponse = {}".format(documentAnalysisResponse)) - textDetectionResponse = submitTextDetectionJob(bucket, document, + textDetectionResponse = submitTextDetectionJob(bucket, document, eTag, textDetectionTokenPrefix, retryInterval, maxRetryAttempt, textDetectionTopicArn, From ce4b6bb79da2b8ef1e7424dfd811abdc73afcfef Mon Sep 17 00:00:00 2001 From: Mat Werber Date: Mon, 2 Sep 2019 18:54:29 -0400 Subject: [PATCH 3/3] check if Relationships key exists in PAGE blocks --- functions/textract_util.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/functions/textract_util.py b/functions/textract_util.py index 1bda27d..eeb3689 100644 --- a/functions/textract_util.py +++ b/functions/textract_util.py @@ -420,15 +420,18 @@ def extractTextBody(blocks): document_text = {} for page in
blocks['PAGE']: document_text['Page-{0:02d}'.format(page['Page'])] = {} - print("Page-{} contains {} Lines".format(page['Page'], len(page['Relationships'][0]['Ids']))) - total_line += len(page['Relationships'][0]['Ids']) - for i, line_id in enumerate(page['Relationships'][0]['Ids']): - page_line = None - for line in blocks['LINE']: - if line['Id'] == line_id: - page_line = line - break - document_text['Page-{0:02d}'.format(page['Page'])]['Line-{0:04d}'.format(i+1)] = {} - document_text['Page-{0:02d}'.format(page['Page'])]['Line-{0:04d}'.format(i+1)]['Text'] = page_line['Text'] + if 'Relationships' in page.keys(): + print("Page-{} contains {} Lines".format(page['Page'], len(page['Relationships'][0]['Ids']))) + total_line += len(page['Relationships'][0]['Ids']) + for i, line_id in enumerate(page['Relationships'][0]['Ids']): + page_line = None + for line in blocks['LINE']: + if line['Id'] == line_id: + page_line = line + break + document_text['Page-{0:02d}'.format(page['Page'])]['Line-{0:04d}'.format(i+1)] = {} + document_text['Page-{0:02d}'.format(page['Page'])]['Line-{0:04d}'.format(i+1)]['Text'] = page_line['Text'] + else: + print("Page-{} contains no lines".format(page['Page'])) print(total_line) return document_text, total_line