Skip to content

Commit

Permalink
fix: add_logs (#114)
Browse files: browse the repository at this point in the history
* fix: add_logs

* fix: fix unstructured version
  • Loading branch information
chloedia committed Nov 8, 2024
1 parent 88bafbe commit cc926c2
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 43 deletions.
31 changes: 19 additions & 12 deletions megaparse/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,17 @@ async def parse_file(
language=language,
parsing_instruction=parsing_instruction,
)

parser = parser_builder.build(parser_config)
with tempfile.NamedTemporaryFile(
delete=False, suffix=f".{str(file.filename).split('.')[-1]}"
) as temp_file:
temp_file.write(file.file.read())
megaparse = MegaParse(parser=parser)
result = await megaparse.aload(file_path=temp_file.name)
return {"message": "File parsed successfully", "result": result}
try:
parser = parser_builder.build(parser_config)
with tempfile.NamedTemporaryFile(
delete=False, suffix=f".{str(file.filename).split('.')[-1]}"
) as temp_file:
temp_file.write(file.file.read())
megaparse = MegaParse(parser=parser)
result = await megaparse.aload(file_path=temp_file.name)
return {"message": "File parsed successfully", "result": result}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))


@app.post("/v1/url")
Expand All @@ -107,9 +109,14 @@ async def upload_url(

with tempfile.NamedTemporaryFile(delete=False, suffix="pdf") as temp_file:
temp_file.write(response.content)
megaparse = MegaParse(parser=UnstructuredParser(strategy=StrategyEnum.AUTO))
result = megaparse.load(temp_file.name)
return {"message": "File parsed successfully", "result": result}
try:
megaparse = MegaParse(
parser=UnstructuredParser(strategy=StrategyEnum.AUTO)
)
result = megaparse.load(temp_file.name)
return {"message": "File parsed successfully", "result": result}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
else:
data = await playwright_loader.aload()
# Now turn the data into a string
Expand Down
6 changes: 4 additions & 2 deletions megaparse/core/parser/unstructured_parser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from typing import List
from unstructured.partition.auto import partition

from dotenv import load_dotenv
from megaparse.core.parser import MegaParser
from unstructured.documents.elements import Element
from unstructured.partition.auto import partition

from megaparse.core.parser import MegaParser
from megaparse.core.parser.type import StrategyEnum


Expand Down
2 changes: 1 addition & 1 deletion megaparse/sdk/examples/usage_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import asyncio
import os

from megaparse.sdk import MegaParseSDK
from megaparse_sdk import MegaParseSDK


async def main():
Expand Down
2 changes: 1 addition & 1 deletion megaparse/sdk/megaparse_sdk/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class MegaParseClient:
def __init__(self, api_key: str | None = None):
self.base_url = "https://megaparse.tooling.quivr.app" # to define once in production # to define once in production
self.base_url = "https://megaparse.tooling.quivr.app"

self.api_key = api_key
if self.api_key:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ all = [
"playwright>=1.47.0",
"langchain-anthropic>=0.2.3",
"python-magic>=0.4.27",
"unstructured[all-docs]>=0.15.0",
"unstructured[all-docs]==0.15.0",
"langchain>=0.2.0",
"langchain-community>=0.2.0",
"langchain-openai>=0.1.0",
Expand Down
25 changes: 12 additions & 13 deletions requirements-dev.lock
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,6 @@ grpcio-status==1.67.1
h11==0.14.0
# via httpcore
# via uvicorn
html5lib==1.1
# via unstructured
httpcore==1.0.6
# via httpx
httpx==0.27.2
Expand Down Expand Up @@ -360,7 +358,6 @@ numpy==1.26.4
# via torchvision
# via transformers
# via unstructured
# via unstructured-inference
nvidia-cublas-cu12==12.4.5.8 ; platform_machine == 'x86_64' and platform_system == 'Linux'
# via nvidia-cudnn-cu12
# via nvidia-cusolver-cu12
Expand Down Expand Up @@ -420,6 +417,7 @@ packaging==24.1
# via matplotlib
# via onnxruntime
# via pikepdf
# via pytesseract
# via pytest
# via transformers
# via unstructured-pytesseract
Expand All @@ -443,8 +441,6 @@ pdfplumber==0.11.4
# via megaparse
pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32'
# via ipython
pi-heif==0.20.0
# via unstructured
pikepdf==9.4.0
# via unstructured
pillow==11.0.0
Expand All @@ -453,11 +449,14 @@ pillow==11.0.0
# via matplotlib
# via pdf2image
# via pdfplumber
# via pi-heif
# via pikepdf
# via pillow-heif
# via pytesseract
# via python-pptx
# via torchvision
# via unstructured-pytesseract
pillow-heif==0.20.0
# via unstructured
platformdirs==4.3.6
# via black
# via jupyter-core
Expand Down Expand Up @@ -543,6 +542,8 @@ pypdfium2==4.30.0
# via pdfplumber
pyreadline3==3.5.4 ; sys_platform == 'win32'
# via humanfriendly
pytesseract==0.3.13
# via unstructured
pytest==8.3.3
# via pytest-asyncio
# via pytest-cov
Expand Down Expand Up @@ -570,7 +571,7 @@ python-multipart==0.0.17
# via unstructured-inference
python-oxmsg==0.0.1
# via unstructured
python-pptx==1.0.2
python-pptx==0.6.23
# via unstructured
pytz==2024.2
# via pandas
Expand Down Expand Up @@ -628,7 +629,6 @@ setuptools==75.3.0
# via torch
six==1.16.0
# via asttokens
# via html5lib
# via langdetect
# via python-dateutil
sniffio==1.3.1
Expand All @@ -650,6 +650,8 @@ starlette==0.41.2
sympy==1.13.1
# via onnxruntime
# via torch
tabulate==0.9.0
# via unstructured
tenacity==8.5.0
# via langchain
# via langchain-community
Expand Down Expand Up @@ -711,7 +713,6 @@ typing-extensions==4.12.2
# via pyee
# via python-docx
# via python-oxmsg
# via python-pptx
# via sqlalchemy
# via torch
# via typing-inspect
Expand All @@ -723,11 +724,11 @@ typing-inspect==0.9.0
# via unstructured-client
tzdata==2024.2
# via pandas
unstructured==0.16.4
unstructured==0.15.0
# via megaparse
unstructured-client==0.27.0
# via unstructured
unstructured-inference==0.8.1
unstructured-inference==0.7.36
# via unstructured
unstructured-pytesseract==0.3.13
# via unstructured
Expand All @@ -739,8 +740,6 @@ virtualenv==20.27.1
# via pre-commit
wcwidth==0.2.13
# via prompt-toolkit
webencodings==0.5.1
# via html5lib
wrapt==1.16.0
# via deprecated
# via llama-index-core
Expand Down
25 changes: 12 additions & 13 deletions requirements.lock
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,6 @@ grpcio-status==1.67.1
h11==0.14.0
# via httpcore
# via uvicorn
html5lib==1.1
# via unstructured
httpcore==1.0.6
# via httpx
httpx==0.27.2
Expand Down Expand Up @@ -305,7 +303,6 @@ numpy==1.26.4
# via torchvision
# via transformers
# via unstructured
# via unstructured-inference
nvidia-cublas-cu12==12.4.5.8 ; platform_machine == 'x86_64' and platform_system == 'Linux'
# via nvidia-cudnn-cu12
# via nvidia-cusolver-cu12
Expand Down Expand Up @@ -363,6 +360,7 @@ packaging==24.1
# via matplotlib
# via onnxruntime
# via pikepdf
# via pytesseract
# via transformers
# via unstructured-pytesseract
pandas==2.2.3
Expand All @@ -379,8 +377,6 @@ pdfminer-six==20231228
pdfplumber==0.11.4
# via layoutparser
# via megaparse
pi-heif==0.20.0
# via unstructured
pikepdf==9.4.0
# via unstructured
pillow==11.0.0
Expand All @@ -389,11 +385,14 @@ pillow==11.0.0
# via matplotlib
# via pdf2image
# via pdfplumber
# via pi-heif
# via pikepdf
# via pillow-heif
# via pytesseract
# via python-pptx
# via torchvision
# via unstructured-pytesseract
pillow-heif==0.20.0
# via unstructured
playwright==1.48.0
# via megaparse
portalocker==2.10.1
Expand Down Expand Up @@ -459,6 +458,8 @@ pypdfium2==4.30.0
# via pdfplumber
pyreadline3==3.5.4 ; sys_platform == 'win32'
# via humanfriendly
pytesseract==0.3.13
# via unstructured
python-dateutil==2.8.2
# via matplotlib
# via pandas
Expand All @@ -478,7 +479,7 @@ python-multipart==0.0.17
# via unstructured-inference
python-oxmsg==0.0.1
# via unstructured
python-pptx==1.0.2
python-pptx==0.6.23
# via unstructured
pytz==2024.2
# via pandas
Expand Down Expand Up @@ -529,7 +530,6 @@ scipy==1.14.1
setuptools==75.3.0
# via torch
six==1.16.0
# via html5lib
# via langdetect
# via python-dateutil
sniffio==1.3.1
Expand All @@ -549,6 +549,8 @@ starlette==0.41.2
sympy==1.13.1
# via onnxruntime
# via torch
tabulate==0.9.0
# via unstructured
tenacity==8.5.0
# via langchain
# via langchain-community
Expand Down Expand Up @@ -598,7 +600,6 @@ typing-extensions==4.12.2
# via pyee
# via python-docx
# via python-oxmsg
# via python-pptx
# via sqlalchemy
# via torch
# via typing-inspect
Expand All @@ -610,20 +611,18 @@ typing-inspect==0.9.0
# via unstructured-client
tzdata==2024.2
# via pandas
unstructured==0.16.4
unstructured==0.15.0
# via megaparse
unstructured-client==0.27.0
# via unstructured
unstructured-inference==0.8.1
unstructured-inference==0.7.36
# via unstructured
unstructured-pytesseract==0.3.13
# via unstructured
urllib3==2.2.3
# via requests
uvicorn==0.32.0
# via megaparse
webencodings==0.5.1
# via html5lib
wrapt==1.16.0
# via deprecated
# via llama-index-core
Expand Down

0 comments on commit cc926c2

Please sign in to comment.