diff --git a/megaparse/api/app.py b/megaparse/api/app.py index 49814eb..1cda49c 100644 --- a/megaparse/api/app.py +++ b/megaparse/api/app.py @@ -80,15 +80,17 @@ async def parse_file( language=language, parsing_instruction=parsing_instruction, ) - - parser = parser_builder.build(parser_config) - with tempfile.NamedTemporaryFile( - delete=False, suffix=f".{str(file.filename).split('.')[-1]}" - ) as temp_file: - temp_file.write(file.file.read()) - megaparse = MegaParse(parser=parser) - result = await megaparse.aload(file_path=temp_file.name) - return {"message": "File parsed successfully", "result": result} + try: + parser = parser_builder.build(parser_config) + with tempfile.NamedTemporaryFile( + delete=False, suffix=f".{str(file.filename).split('.')[-1]}" + ) as temp_file: + temp_file.write(file.file.read()) + megaparse = MegaParse(parser=parser) + result = await megaparse.aload(file_path=temp_file.name) + return {"message": "File parsed successfully", "result": result} + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) @app.post("/v1/url") @@ -107,9 +109,14 @@ async def upload_url( with tempfile.NamedTemporaryFile(delete=False, suffix="pdf") as temp_file: temp_file.write(response.content) - megaparse = MegaParse(parser=UnstructuredParser(strategy=StrategyEnum.AUTO)) - result = megaparse.load(temp_file.name) - return {"message": "File parsed successfully", "result": result} + try: + megaparse = MegaParse( + parser=UnstructuredParser(strategy=StrategyEnum.AUTO) + ) + result = megaparse.load(temp_file.name) + return {"message": "File parsed successfully", "result": result} + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) else: data = await playwright_loader.aload() # Now turn the data into a string diff --git a/megaparse/core/parser/unstructured_parser.py b/megaparse/core/parser/unstructured_parser.py index a3cde7e..84e20c4 100644 --- a/megaparse/core/parser/unstructured_parser.py +++ b/megaparse/core/parser/unstructured_parser.py @@ -1,13 +1,11 @@ -import asyncio -from unstructured.partition.auto import partition +import re + from dotenv import load_dotenv -from megaparse.core.parser import MegaParser from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.prompts import ChatPromptTemplate -import re -from langchain_openai import ChatOpenAI -import os +from unstructured.partition.auto import partition +from megaparse.core.parser import MegaParser from megaparse.core.parser.type import StrategyEnum diff --git a/megaparse/sdk/examples/usage_example.py b/megaparse/sdk/examples/usage_example.py index d962028..eec216b 100644 --- a/megaparse/sdk/examples/usage_example.py +++ b/megaparse/sdk/examples/usage_example.py @@ -1,7 +1,7 @@ import asyncio import os -from megaparse.sdk import MegaParseSDK +from megaparse_sdk import MegaParseSDK async def main(): diff --git a/megaparse/sdk/megaparse_sdk/client.py b/megaparse/sdk/megaparse_sdk/client.py index bd95b6e..f4c9e3c 100644 --- a/megaparse/sdk/megaparse_sdk/client.py +++ b/megaparse/sdk/megaparse_sdk/client.py @@ -5,7 +5,7 @@ class MegaParseClient: def __init__(self, api_key: str | None = None): - self.base_url = "https://megaparse.tooling.quivr.app" # to define once in production # to define once in production + self.base_url = "https://megaparse.tooling.quivr.app" self.api_key = api_key if self.api_key: diff --git a/pyproject.toml b/pyproject.toml index 67f7c0f..74dabeb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ all = [ "playwright>=1.47.0", "langchain-anthropic>=0.2.3", "python-magic>=0.4.27", - "unstructured[all-docs]>=0.15.0", + "unstructured[all-docs]==0.15.0", "langchain>=0.2.0", "langchain-community>=0.2.0", "langchain-openai>=0.1.0", diff --git a/requirements-dev.lock b/requirements-dev.lock index b671b63..c296d26 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -168,8 +168,6 @@ grpcio-status==1.67.1 h11==0.14.0 # via httpcore # via uvicorn -html5lib==1.1 - # via unstructured httpcore==1.0.6 # via httpx httpx==0.27.2 @@ -360,7 +358,6 @@ numpy==1.26.4 # via torchvision # via transformers # via unstructured - # via unstructured-inference nvidia-cublas-cu12==12.4.5.8 ; platform_machine == 'x86_64' and platform_system == 'Linux' # via nvidia-cudnn-cu12 # via nvidia-cusolver-cu12 @@ -420,6 +417,7 @@ packaging==24.1 # via matplotlib # via onnxruntime # via pikepdf + # via pytesseract # via pytest # via transformers # via unstructured-pytesseract @@ -443,8 +441,6 @@ pdfplumber==0.11.4 # via megaparse pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' # via ipython -pi-heif==0.20.0 - # via unstructured pikepdf==9.4.0 # via unstructured pillow==11.0.0 @@ -453,11 +449,14 @@ pillow==11.0.0 # via matplotlib # via pdf2image # via pdfplumber - # via pi-heif # via pikepdf + # via pillow-heif + # via pytesseract # via python-pptx # via torchvision # via unstructured-pytesseract +pillow-heif==0.20.0 + # via unstructured platformdirs==4.3.6 # via black # via jupyter-core @@ -543,6 +542,8 @@ pypdfium2==4.30.0 # via pdfplumber pyreadline3==3.5.4 ; sys_platform == 'win32' # via humanfriendly +pytesseract==0.3.13 + # via unstructured pytest==8.3.3 # via pytest-asyncio # via pytest-cov @@ -570,7 +571,7 @@ python-multipart==0.0.17 # via unstructured-inference python-oxmsg==0.0.1 # via unstructured -python-pptx==1.0.2 +python-pptx==0.6.23 # via unstructured pytz==2024.2 # via pandas @@ -628,7 +629,6 @@ setuptools==75.3.0 # via torch six==1.16.0 # via asttokens - # via html5lib # via langdetect # via python-dateutil sniffio==1.3.1 @@ -650,6 +650,8 @@ starlette==0.41.2 sympy==1.13.1 # via onnxruntime # via torch +tabulate==0.9.0 + # via unstructured tenacity==8.5.0 # via langchain # via langchain-community @@ -711,7 +713,6 @@ typing-extensions==4.12.2 # via pyee # via python-docx # via python-oxmsg - # via python-pptx # via sqlalchemy # via torch # via typing-inspect @@ -723,11 +724,11 @@ typing-inspect==0.9.0 # via unstructured-client tzdata==2024.2 # via pandas -unstructured==0.16.4 +unstructured==0.15.0 # via megaparse unstructured-client==0.27.0 # via unstructured -unstructured-inference==0.8.1 +unstructured-inference==0.7.36 # via unstructured unstructured-pytesseract==0.3.13 # via unstructured @@ -739,8 +740,6 @@ virtualenv==20.27.1 # via pre-commit wcwidth==0.2.13 # via prompt-toolkit -webencodings==0.5.1 - # via html5lib wrapt==1.16.0 # via deprecated # via llama-index-core diff --git a/requirements.lock b/requirements.lock index cb88319..57cfe60 100644 --- a/requirements.lock +++ b/requirements.lock @@ -138,8 +138,6 @@ grpcio-status==1.67.1 h11==0.14.0 # via httpcore # via uvicorn -html5lib==1.1 - # via unstructured httpcore==1.0.6 # via httpx httpx==0.27.2 @@ -305,7 +303,6 @@ numpy==1.26.4 # via torchvision # via transformers # via unstructured - # via unstructured-inference nvidia-cublas-cu12==12.4.5.8 ; platform_machine == 'x86_64' and platform_system == 'Linux' # via nvidia-cudnn-cu12 # via nvidia-cusolver-cu12 @@ -363,6 +360,7 @@ packaging==24.1 # via matplotlib # via onnxruntime # via pikepdf + # via pytesseract # via transformers # via unstructured-pytesseract pandas==2.2.3 @@ -379,8 +377,6 @@ pdfminer-six==20231228 pdfplumber==0.11.4 # via layoutparser # via megaparse -pi-heif==0.20.0 - # via unstructured pikepdf==9.4.0 # via unstructured pillow==11.0.0 @@ -389,11 +385,14 @@ pillow==11.0.0 # via matplotlib # via pdf2image # via pdfplumber - # via pi-heif # via pikepdf + # via pillow-heif + # via pytesseract # via python-pptx # via torchvision # via unstructured-pytesseract +pillow-heif==0.20.0 + # via unstructured playwright==1.48.0 # via megaparse portalocker==2.10.1 @@ -459,6 +458,8 @@ pypdfium2==4.30.0 # via pdfplumber pyreadline3==3.5.4 ; sys_platform == 'win32' # via humanfriendly +pytesseract==0.3.13 + # via unstructured python-dateutil==2.8.2 # via matplotlib # via pandas @@ -478,7 +479,7 @@ python-multipart==0.0.17 # via unstructured-inference python-oxmsg==0.0.1 # via unstructured -python-pptx==1.0.2 +python-pptx==0.6.23 # via unstructured pytz==2024.2 # via pandas @@ -529,7 +530,6 @@ scipy==1.14.1 setuptools==75.3.0 # via torch six==1.16.0 - # via html5lib # via langdetect # via python-dateutil sniffio==1.3.1 @@ -549,6 +549,8 @@ starlette==0.41.2 sympy==1.13.1 # via onnxruntime # via torch +tabulate==0.9.0 + # via unstructured tenacity==8.5.0 # via langchain # via langchain-community @@ -598,7 +600,6 @@ typing-extensions==4.12.2 # via pyee # via python-docx # via python-oxmsg - # via python-pptx # via sqlalchemy # via torch # via typing-inspect @@ -610,11 +611,11 @@ typing-inspect==0.9.0 # via unstructured-client tzdata==2024.2 # via pandas -unstructured==0.16.4 +unstructured==0.15.0 # via megaparse unstructured-client==0.27.0 # via unstructured -unstructured-inference==0.8.1 +unstructured-inference==0.7.36 # via unstructured unstructured-pytesseract==0.3.13 # via unstructured @@ -622,8 +623,6 @@ urllib3==2.2.3 # via requests uvicorn==0.32.0 # via megaparse -webencodings==0.5.1 - # via html5lib wrapt==1.16.0 # via deprecated # via llama-index-core