You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
`2024-08-27 16:25:07,893 - /dedoc_root/dedoc/dedoc_manager.py - INFO - Get file tmpggtlpvfp.pdf with parameters {'document_type': 'diploma', 'structure_type': 'tree', 'return_format': 'html', 'with_attachments': 'false', 'need_content_analysis': 'false', 'recursion_deep_attachments': '10', 'return_base64': 'false', 'need_pdf_table_analysis': 'true', 'table_type': '', 'orient_analysis_cells': 'false', 'orient_cell_angle': '90', 'pdf_with_text_layer': 'auto_tabby', 'language': 'rus', 'pages': ':', 'is_one_column_document': 'true', 'document_orientation': 'auto', 'need_header_footer_analysis': 'false', 'need_binarization': 'false', 'delimiter': None, 'encoding': None, 'html_fields': '', 'handle_invisible_table': 'true', 'attachments_dir': '/tmp/tmpkupfsv1p'}
2024-08-27 16:25:07,893 - /dedoc_root/dedoc/dedoc_manager.py - INFO - Start handle /tmp/tmpkupfsv1p/tmpggtlpvfp.pdf
2024-08-27 16:25:10,058 - /dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py - INFO - Reading PDF pages from 1 to 8
2024-08-27 16:26:01,569 - /dedoc_root/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py - INFO - Assume document 1724765107_455.pdf has a correct textual layer
2024-08-27 16:26:03,394 - /dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py - INFO - Reading PDF pages from 1 to 1801
2024-08-27 16:32:51,051 - /dedoc_root/dedoc/api/dedoc_api.py - ERROR - Exception TabbyPdfError(Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at java.base/java.util.Arrays.copyOf(Arrays.java:3745)
at java.base/java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:172)
at java.base/java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:538)
at java.base/java.lang.StringBuffer.append(StringBuffer.java:317)
at java.base/java.io.StringWriter.write(StringWriter.java:106)
at org.json.JSONObject.write(JSONObject.java:2565)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.toString(JSONObject.java:2354)
at org.json.JSONObject.toString(JSONObject.java:2319)
at writers.JsonDocumentWriter.write(JsonDocumentWriter.java:51)
at DedocTableExtractor.printJSON(DedocTableExtractor.java:185)
at DedocTableExtractor.extract(DedocTableExtractor.java:131)
at DedocTableExtractor.run(DedocTableExtractor.java:89)
at DedocTableExtractor.main(DedocTableExtractor.java:58)
)
Traceback (most recent call last):
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 296, in __run
result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.DEVNULL, check=True)
File "/usr/lib/python3.9/subprocess.py", line 528, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['java', '-jar', '/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar', '-i', '/tmp/tmpks3qpt9b/1724765107_455.pdf', '-sp', '1', '-ep', '1801']' returned non-zero exit status 1.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.9/dist-packages/starlette/middleware/exceptions.py", line 68, in call
await self.app(scope, receive, sender)
File "/usr/local/lib/python3.9/dist-packages/starlette/routing.py", line 718, in call
await route.handle(scope, receive, send)
File "/usr/local/lib/python3.9/dist-packages/starlette/routing.py", line 276, in handle
await self.app(scope, receive, send)
File "/usr/local/lib/python3.9/dist-packages/starlette/routing.py", line 66, in app
response = await func(request)
File "/usr/local/lib/python3.9/dist-packages/fastapi/routing.py", line 299, in app
raise e
File "/usr/local/lib/python3.9/dist-packages/fastapi/routing.py", line 294, in app
raw_response = await run_endpoint_function(
File "/usr/local/lib/python3.9/dist-packages/fastapi/routing.py", line 191, in run_endpoint_function
return await dependant.call(**values)
File "/dedoc_root/dedoc/api/dedoc_api.py", line 81, in upload
document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmpdir})
File "/dedoc_root/dedoc/dedoc_manager.py", line 81, in parse
raise e
File "/dedoc_root/dedoc/dedoc_manager.py", line 74, in parse
return self.__parse_no_error_handling(file_path=file_path, parameters=parameters)
File "/dedoc_root/dedoc/dedoc_manager.py", line 107, in __parse_no_error_handling
converted_file_path, unstructured_document = self.__read_with_mime_auto_detection(
File "/dedoc_root/dedoc/dedoc_manager.py", line 153, in __read_with_mime_auto_detection
converted_file_path, document = self.__parse_file(file_path=file_path, file_name=file_name, parameters=parameters, mime=mime, extension=extension)
File "/dedoc_root/dedoc/dedoc_manager.py", line 177, in __parse_file
unstructured_document = self.reader.read(file_path=converted_file_path, parameters=parameters, mime=mime, extension=extension)
File "/dedoc_root/dedoc/readers/reader_composition.py", line 39, in read
unstructured_document = reader.read(file_path=file_path, parameters=parameters)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py", line 59, in read
result = self.__handle_correct_text_layer(is_first_page_correct=txtlayer_parameters.is_first_page_correct,
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py", line 99, in __handle_correct_text_layer
result = reader.read(file_path=path, parameters=parameters)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 62, in read
lines, tables, tables_on_images, attachments, document_metadata = self.__extract(path=file_path, parameters=parameters, warnings=warnings)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 105, in __extract
document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 308, in __process_pdf
output = self.__run(path=path, start_page=start_page, end_page=end_page)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 303, in __run
raise TabbyPdfError(e.stderr.decode(encoding))
dedoc.common.exceptions.tabby_pdf_error.TabbyPdfError: TabbyPdfError(Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at java.base/java.util.Arrays.copyOf(Arrays.java:3745)
at java.base/java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:172)
at java.base/java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:538)
at java.base/java.lang.StringBuffer.append(StringBuffer.java:317)
at java.base/java.io.StringWriter.write(StringWriter.java:106)
at org.json.JSONObject.write(JSONObject.java:2565)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.toString(JSONObject.java:2354)
at org.json.JSONObject.toString(JSONObject.java:2319)
at writers.JsonDocumentWriter.write(JsonDocumentWriter.java:51)
at DedocTableExtractor.printJSON(DedocTableExtractor.java:185)
at DedocTableExtractor.extract(DedocTableExtractor.java:131)
at DedocTableExtractor.run(DedocTableExtractor.java:89)
at DedocTableExtractor.main(DedocTableExtractor.java:58)
)
INFO: 127.0.0.1:53028 - "POST /upload HTTP/1.1" 500 Internal Server Error`
v2.2.6
The text was updated successfully, but these errors were encountered:
При парсинге документа поймали джавовый OOM (java.lang.OutOfMemoryError: Java heap space):
`2024-08-27 16:25:07,893 - /dedoc_root/dedoc/dedoc_manager.py - INFO - Get file tmpggtlpvfp.pdf with parameters {'document_type': 'diploma', 'structure_type': 'tree', 'return_format': 'html', 'with_attachments': 'false', 'need_content_analysis': 'false', 'recursion_deep_attachments': '10', 'return_base64': 'false', 'need_pdf_table_analysis': 'true', 'table_type': '', 'orient_analysis_cells': 'false', 'orient_cell_angle': '90', 'pdf_with_text_layer': 'auto_tabby', 'language': 'rus', 'pages': ':', 'is_one_column_document': 'true', 'document_orientation': 'auto', 'need_header_footer_analysis': 'false', 'need_binarization': 'false', 'delimiter': None, 'encoding': None, 'html_fields': '', 'handle_invisible_table': 'true', 'attachments_dir': '/tmp/tmpkupfsv1p'}
2024-08-27 16:25:07,893 - /dedoc_root/dedoc/dedoc_manager.py - INFO - Start handle /tmp/tmpkupfsv1p/tmpggtlpvfp.pdf
2024-08-27 16:25:10,058 - /dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py - INFO - Reading PDF pages from 1 to 8
2024-08-27 16:26:01,569 - /dedoc_root/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py - INFO - Assume document 1724765107_455.pdf has a correct textual layer
2024-08-27 16:26:03,394 - /dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py - INFO - Reading PDF pages from 1 to 1801
2024-08-27 16:32:51,051 - /dedoc_root/dedoc/api/dedoc_api.py - ERROR - Exception TabbyPdfError(Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at java.base/java.util.Arrays.copyOf(Arrays.java:3745)
at java.base/java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:172)
at java.base/java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:538)
at java.base/java.lang.StringBuffer.append(StringBuffer.java:317)
at java.base/java.io.StringWriter.write(StringWriter.java:106)
at org.json.JSONObject.write(JSONObject.java:2565)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.toString(JSONObject.java:2354)
at org.json.JSONObject.toString(JSONObject.java:2319)
at writers.JsonDocumentWriter.write(JsonDocumentWriter.java:51)
at DedocTableExtractor.printJSON(DedocTableExtractor.java:185)
at DedocTableExtractor.extract(DedocTableExtractor.java:131)
at DedocTableExtractor.run(DedocTableExtractor.java:89)
at DedocTableExtractor.main(DedocTableExtractor.java:58)
)
Traceback (most recent call last):
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 296, in __run
result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.DEVNULL, check=True)
File "/usr/lib/python3.9/subprocess.py", line 528, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['java', '-jar', '/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar', '-i', '/tmp/tmpks3qpt9b/1724765107_455.pdf', '-sp', '1', '-ep', '1801']' returned non-zero exit status 1.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.9/dist-packages/starlette/middleware/exceptions.py", line 68, in call
await self.app(scope, receive, sender)
File "/usr/local/lib/python3.9/dist-packages/starlette/routing.py", line 718, in call
await route.handle(scope, receive, send)
File "/usr/local/lib/python3.9/dist-packages/starlette/routing.py", line 276, in handle
await self.app(scope, receive, send)
File "/usr/local/lib/python3.9/dist-packages/starlette/routing.py", line 66, in app
response = await func(request)
File "/usr/local/lib/python3.9/dist-packages/fastapi/routing.py", line 299, in app
raise e
File "/usr/local/lib/python3.9/dist-packages/fastapi/routing.py", line 294, in app
raw_response = await run_endpoint_function(
File "/usr/local/lib/python3.9/dist-packages/fastapi/routing.py", line 191, in run_endpoint_function
return await dependant.call(**values)
File "/dedoc_root/dedoc/api/dedoc_api.py", line 81, in upload
document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmpdir})
File "/dedoc_root/dedoc/dedoc_manager.py", line 81, in parse
raise e
File "/dedoc_root/dedoc/dedoc_manager.py", line 74, in parse
return self.__parse_no_error_handling(file_path=file_path, parameters=parameters)
File "/dedoc_root/dedoc/dedoc_manager.py", line 107, in __parse_no_error_handling
converted_file_path, unstructured_document = self.__read_with_mime_auto_detection(
File "/dedoc_root/dedoc/dedoc_manager.py", line 153, in __read_with_mime_auto_detection
converted_file_path, document = self.__parse_file(file_path=file_path, file_name=file_name, parameters=parameters, mime=mime, extension=extension)
File "/dedoc_root/dedoc/dedoc_manager.py", line 177, in __parse_file
unstructured_document = self.reader.read(file_path=converted_file_path, parameters=parameters, mime=mime, extension=extension)
File "/dedoc_root/dedoc/readers/reader_composition.py", line 39, in read
unstructured_document = reader.read(file_path=file_path, parameters=parameters)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py", line 59, in read
result = self.__handle_correct_text_layer(is_first_page_correct=txtlayer_parameters.is_first_page_correct,
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py", line 99, in __handle_correct_text_layer
result = reader.read(file_path=path, parameters=parameters)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 62, in read
lines, tables, tables_on_images, attachments, document_metadata = self.__extract(path=file_path, parameters=parameters, warnings=warnings)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 105, in __extract
document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 308, in __process_pdf
output = self.__run(path=path, start_page=start_page, end_page=end_page)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 303, in __run
raise TabbyPdfError(e.stderr.decode(encoding))
dedoc.common.exceptions.tabby_pdf_error.TabbyPdfError: TabbyPdfError(Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at java.base/java.util.Arrays.copyOf(Arrays.java:3745)
at java.base/java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:172)
at java.base/java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:538)
at java.base/java.lang.StringBuffer.append(StringBuffer.java:317)
at java.base/java.io.StringWriter.write(StringWriter.java:106)
at org.json.JSONObject.write(JSONObject.java:2565)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.toString(JSONObject.java:2354)
at org.json.JSONObject.toString(JSONObject.java:2319)
at writers.JsonDocumentWriter.write(JsonDocumentWriter.java:51)
at DedocTableExtractor.printJSON(DedocTableExtractor.java:185)
at DedocTableExtractor.extract(DedocTableExtractor.java:131)
at DedocTableExtractor.run(DedocTableExtractor.java:89)
at DedocTableExtractor.main(DedocTableExtractor.java:58)
)
INFO: 127.0.0.1:53028 - "POST /upload HTTP/1.1" 500 Internal Server Error`
v2.2.6
The text was updated successfully, but these errors were encountered: