Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PR for issue #4 - Check whether relationships key present in PAGE blocks of textract_util.py #5

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 21 additions & 15 deletions functions/textract-job-submit-async.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import time
import boto3
from datetime import datetime

from urllib.parse import unquote_plus

def attachExternalBucketPolicy(externalBucketName):
iam = boto3.client('iam')
Expand Down Expand Up @@ -116,7 +116,7 @@ def updateResponse(givenjson, updatejson, override = False):
givenjson[key] = updatejson[key]
return givenjson

def submitDocumentAnalysisJob(bucket, document, tokenPrefix, retryInterval, maxRetryAttempt, topicArn, roleArn, table_name):
def submitDocumentAnalysisJob(bucket, document, eTag, tokenPrefix, retryInterval, maxRetryAttempt, topicArn, roleArn, table_name):

s3 = boto3.resource('s3')
textract = boto3.client('textract')
Expand All @@ -129,21 +129,23 @@ def submitDocumentAnalysisJob(bucket, document, tokenPrefix, retryInterval, maxR
document_path = document[:document.rfind("/")] if document.find("/") >= 0 else ""
document_name = document[document.rfind("/")+1:document.rfind(".")] if document.find("/") >= 0 else document[:document.rfind(".")]
document_type = document[document.rfind(".")+1:].upper()
client_request_token = "{}-{}".format(tokenPrefix, eTag)
job_tag = "{}-{}".format(tokenPrefix, eTag)

print("DocumentAnalysisJob: ClientRequestToken = {}-{}".format(tokenPrefix, document.replace("/","_").replace(".","-")))
print("DocumentAnalysisJob: ClientRequestToken = {}".format(client_request_token))
print("DocumentAnalysisJob: DocumentLocation = 'S3Object': 'Bucket': {}, 'Name': {}".format(bucket, document))
print("DocumentAnalysisJob: NotificationChannel = 'SNSTopicArn': {},'RoleArn': {}".format(topicArn, roleArn))
print("DocumentAnalysisJob: JobTag = {}-{}".format(tokenPrefix, document[document.rfind("/")+1:document.rfind(".")]))
print("DocumentAnalysisJob: JobTag = {}".format(job_tag))

#Submit Document Analysis job to Textract to extract text features
while retryCount >= 0 and retryCount < maxRetryAttempt:
try:
response = textract.start_document_analysis(
ClientRequestToken = "{}-{}".format(tokenPrefix, document.replace("/","_").replace(".","-")),
ClientRequestToken = client_request_token,
DocumentLocation={'S3Object': {'Bucket': bucket, 'Name': document}},
FeatureTypes=["TABLES", "FORMS"],
NotificationChannel={'SNSTopicArn': topicArn,'RoleArn': roleArn},
JobTag = "{}-{}".format(tokenPrefix, document[document.rfind("/")+1:document.rfind(".")]))
JobTag = job_tag)
jobId = response['JobId']
jobStartTimeStamp = datetime.strptime(response['ResponseMetadata']['HTTPHeaders']['date'], '%a, %d %b %Y %H:%M:%S %Z').timestamp()
print("Textract Request: {} submitted at {} with JobId - {}".format(
Expand Down Expand Up @@ -242,7 +244,7 @@ def submitDocumentAnalysisJob(bucket, document, tokenPrefix, retryInterval, maxR

return jsonresponse

def submitTextDetectionJob(bucket, document, tokenPrefix, retryInterval, maxRetryAttempt, topicArn, roleArn, table_name):
def submitTextDetectionJob(bucket, document, eTag, tokenPrefix, retryInterval, maxRetryAttempt, topicArn, roleArn, table_name):

s3 = boto3.resource('s3')
textract = boto3.client('textract')
Expand All @@ -255,20 +257,22 @@ def submitTextDetectionJob(bucket, document, tokenPrefix, retryInterval, maxRetr
document_path = document[:document.rfind("/")] if document.find("/") >= 0 else ""
document_name = document[document.rfind("/")+1:document.rfind(".")] if document.find("/") >= 0 else document[:document.rfind(".")]
document_type = document[document.rfind(".")+1:].upper()

print("TextDetectionsJob: ClientRequestToken = {}-{}".format(tokenPrefix, document.replace("/","_").replace(".","-")))
client_request_token = "{}-{}".format(tokenPrefix, eTag)
job_tag = "{}-{}".format(tokenPrefix, eTag)

print("TextDetectionsJob: ClientRequestToken = {}".format(client_request_token))
print("TextDetectionJob: DocumentLocation = 'S3Object': 'Bucket': {}, 'Name': {}".format(bucket, document))
print("TextDetectionJob: NotificationChannel = 'SNSTopicArn': {},'RoleArn': {}".format(topicArn, roleArn))
print("TextDetectionJob: JobTag = {}-{}".format(tokenPrefix, document[document.rfind("/")+1:document.rfind(".")]))
print("TextDetectionJob: JobTag = {}".format(client_request_token))

#Submit Text Detection job to Textract to detect lines of text
while retryCount >= 0 and retryCount < maxRetryAttempt:
try:
response = textract.start_document_text_detection(
ClientRequestToken = "{}-{}".format(tokenPrefix, document.replace("/","_").replace(".","-")),
ClientRequestToken = client_request_token,
DocumentLocation={'S3Object': {'Bucket': bucket, 'Name': document}},
NotificationChannel={'SNSTopicArn': topicArn,'RoleArn': roleArn},
JobTag = "{}-{}".format(tokenPrefix, document[document.rfind("/")+1:document.rfind(".")]))
JobTag = job_tag)
jobId = response['JobId']
jobStartTimeStamp = datetime.strptime(response['ResponseMetadata']['HTTPHeaders']['date'], '%a, %d %b %Y %H:%M:%S %Z').timestamp()
print("Textract Request: {} submitted at {} with JobId - {}".format(
Expand Down Expand Up @@ -377,6 +381,7 @@ def lambda_handler(event, context):
external_bucket = ""
bucket = ""
document = ""
eTag = ""
bucketAccessPolicyArn = None

if 'ExternalBucketName' in event:
Expand All @@ -387,7 +392,8 @@ def lambda_handler(event, context):
record, = event["Records"]
print(record)
bucket = record['s3']['bucket']['name']
document = record['s3']['object']['key']
document = unquote_plus(record['s3']['object']['key'])
eTag = record['s3']['object']['eTag']
else:
bucket = event['ExternalBucketName']
document = event['ExternalDocumentPrefix']
Expand All @@ -396,14 +402,14 @@ def lambda_handler(event, context):
print("Bucket and/or Document not specified, nothing to do.")
return {}

documentAnalysisResponse = submitDocumentAnalysisJob(bucket, document,
documentAnalysisResponse = submitDocumentAnalysisJob(bucket, document, eTag,
documentAnalysisTokenPrefix,
retryInterval, maxRetryAttempt,
documentAnalysisTopicArn,
roleArn, table_name)
print("DocumentAnalysisResponse = {}".format(documentAnalysisResponse))

textDetectionResponse = submitTextDetectionJob(bucket, document,
textDetectionResponse = submitTextDetectionJob(bucket, document, eTag,
textDetectionTokenPrefix,
retryInterval, maxRetryAttempt,
textDetectionTopicArn,
Expand Down
23 changes: 13 additions & 10 deletions functions/textract_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,15 +420,18 @@ def extractTextBody(blocks):
document_text = {}
for page in blocks['PAGE']:
document_text['Page-{0:02d}'.format(page['Page'])] = {}
print("Page-{} contains {} Lines".format(page['Page'], len(page['Relationships'][0]['Ids'])))
total_line += len(page['Relationships'][0]['Ids'])
for i, line_id in enumerate(page['Relationships'][0]['Ids']):
page_line = None
for line in blocks['LINE']:
if line['Id'] == line_id:
page_line = line
break
document_text['Page-{0:02d}'.format(page['Page'])]['Line-{0:04d}'.format(i+1)] = {}
document_text['Page-{0:02d}'.format(page['Page'])]['Line-{0:04d}'.format(i+1)]['Text'] = page_line['Text']
if 'Relationships' in page.keys():
print("Page-{} contains {} Lines".format(page['Page'], len(page['Relationships'][0]['Ids'])))
total_line += len(page['Relationships'][0]['Ids'])
for i, line_id in enumerate(page['Relationships'][0]['Ids']):
page_line = None
for line in blocks['LINE']:
if line['Id'] == line_id:
page_line = line
break
document_text['Page-{0:02d}'.format(page['Page'])]['Line-{0:04d}'.format(i+1)] = {}
document_text['Page-{0:02d}'.format(page['Page'])]['Line-{0:04d}'.format(i+1)]['Text'] = page_line['Text']
else:
print("Page-{} contains no lines".format(page['Page']))
print(total_line)
return document_text, total_line