Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: add_logs #114

Merged
merged 2 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 19 additions & 12 deletions megaparse/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,17 @@ async def parse_file(
language=language,
parsing_instruction=parsing_instruction,
)

parser = parser_builder.build(parser_config)
with tempfile.NamedTemporaryFile(
delete=False, suffix=f".{str(file.filename).split('.')[-1]}"
) as temp_file:
temp_file.write(file.file.read())
megaparse = MegaParse(parser=parser)
result = await megaparse.aload(file_path=temp_file.name)
return {"message": "File parsed successfully", "result": result}
try:
parser = parser_builder.build(parser_config)
with tempfile.NamedTemporaryFile(
delete=False, suffix=f".{str(file.filename).split('.')[-1]}"
) as temp_file:
temp_file.write(file.file.read())
megaparse = MegaParse(parser=parser)
result = await megaparse.aload(file_path=temp_file.name)
return {"message": "File parsed successfully", "result": result}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))


@app.post("/v1/url")
Expand All @@ -107,9 +109,14 @@ async def upload_url(

with tempfile.NamedTemporaryFile(delete=False, suffix="pdf") as temp_file:
temp_file.write(response.content)
megaparse = MegaParse(parser=UnstructuredParser(strategy=StrategyEnum.AUTO))
result = megaparse.load(temp_file.name)
return {"message": "File parsed successfully", "result": result}
try:
megaparse = MegaParse(
parser=UnstructuredParser(strategy=StrategyEnum.AUTO)
)
result = megaparse.load(temp_file.name)
return {"message": "File parsed successfully", "result": result}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
else:
data = await playwright_loader.aload()
# Now turn the data into a string
Expand Down
10 changes: 4 additions & 6 deletions megaparse/core/parser/unstructured_parser.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import asyncio
from unstructured.partition.auto import partition
import re

from dotenv import load_dotenv
from megaparse.core.parser import MegaParser
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_openai import ChatOpenAI
import os
from unstructured.partition.auto import partition

from megaparse.core.parser import MegaParser
from megaparse.core.parser.type import StrategyEnum


Expand Down
2 changes: 1 addition & 1 deletion megaparse/sdk/examples/usage_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import asyncio
import os

from megaparse.sdk import MegaParseSDK
from megaparse_sdk import MegaParseSDK


async def main():
Expand Down
2 changes: 1 addition & 1 deletion megaparse/sdk/megaparse_sdk/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class MegaParseClient:
def __init__(self, api_key: str | None = None):
self.base_url = "https://megaparse.tooling.quivr.app" # to define once in production # to define once in production
self.base_url = "https://megaparse.tooling.quivr.app"

self.api_key = api_key
if self.api_key:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ all = [
"playwright>=1.47.0",
"langchain-anthropic>=0.2.3",
"python-magic>=0.4.27",
"unstructured[all-docs]>=0.15.0",
"unstructured[all-docs]==0.15.0",
"langchain>=0.2.0",
"langchain-community>=0.2.0",
"langchain-openai>=0.1.0",
Expand Down
25 changes: 12 additions & 13 deletions requirements-dev.lock
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,6 @@ grpcio-status==1.67.1
h11==0.14.0
# via httpcore
# via uvicorn
html5lib==1.1
# via unstructured
httpcore==1.0.6
# via httpx
httpx==0.27.2
Expand Down Expand Up @@ -360,7 +358,6 @@ numpy==1.26.4
# via torchvision
# via transformers
# via unstructured
# via unstructured-inference
nvidia-cublas-cu12==12.4.5.8 ; platform_machine == 'x86_64' and platform_system == 'Linux'
# via nvidia-cudnn-cu12
# via nvidia-cusolver-cu12
Expand Down Expand Up @@ -420,6 +417,7 @@ packaging==24.1
# via matplotlib
# via onnxruntime
# via pikepdf
# via pytesseract
# via pytest
# via transformers
# via unstructured-pytesseract
Expand All @@ -443,8 +441,6 @@ pdfplumber==0.11.4
# via megaparse
pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32'
# via ipython
pi-heif==0.20.0
# via unstructured
pikepdf==9.4.0
# via unstructured
pillow==11.0.0
Expand All @@ -453,11 +449,14 @@ pillow==11.0.0
# via matplotlib
# via pdf2image
# via pdfplumber
# via pi-heif
# via pikepdf
# via pillow-heif
# via pytesseract
# via python-pptx
# via torchvision
# via unstructured-pytesseract
pillow-heif==0.20.0
# via unstructured
platformdirs==4.3.6
# via black
# via jupyter-core
Expand Down Expand Up @@ -543,6 +542,8 @@ pypdfium2==4.30.0
# via pdfplumber
pyreadline3==3.5.4 ; sys_platform == 'win32'
# via humanfriendly
pytesseract==0.3.13
# via unstructured
pytest==8.3.3
# via pytest-asyncio
# via pytest-cov
Expand Down Expand Up @@ -570,7 +571,7 @@ python-multipart==0.0.17
# via unstructured-inference
python-oxmsg==0.0.1
# via unstructured
python-pptx==1.0.2
python-pptx==0.6.23
# via unstructured
pytz==2024.2
# via pandas
Expand Down Expand Up @@ -628,7 +629,6 @@ setuptools==75.3.0
# via torch
six==1.16.0
# via asttokens
# via html5lib
# via langdetect
# via python-dateutil
sniffio==1.3.1
Expand All @@ -650,6 +650,8 @@ starlette==0.41.2
sympy==1.13.1
# via onnxruntime
# via torch
tabulate==0.9.0
# via unstructured
tenacity==8.5.0
# via langchain
# via langchain-community
Expand Down Expand Up @@ -711,7 +713,6 @@ typing-extensions==4.12.2
# via pyee
# via python-docx
# via python-oxmsg
# via python-pptx
# via sqlalchemy
# via torch
# via typing-inspect
Expand All @@ -723,11 +724,11 @@ typing-inspect==0.9.0
# via unstructured-client
tzdata==2024.2
# via pandas
unstructured==0.16.4
unstructured==0.15.0
# via megaparse
unstructured-client==0.27.0
# via unstructured
unstructured-inference==0.8.1
unstructured-inference==0.7.36
# via unstructured
unstructured-pytesseract==0.3.13
# via unstructured
Expand All @@ -739,8 +740,6 @@ virtualenv==20.27.1
# via pre-commit
wcwidth==0.2.13
# via prompt-toolkit
webencodings==0.5.1
# via html5lib
wrapt==1.16.0
# via deprecated
# via llama-index-core
Expand Down
25 changes: 12 additions & 13 deletions requirements.lock
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,6 @@ grpcio-status==1.67.1
h11==0.14.0
# via httpcore
# via uvicorn
html5lib==1.1
# via unstructured
httpcore==1.0.6
# via httpx
httpx==0.27.2
Expand Down Expand Up @@ -305,7 +303,6 @@ numpy==1.26.4
# via torchvision
# via transformers
# via unstructured
# via unstructured-inference
nvidia-cublas-cu12==12.4.5.8 ; platform_machine == 'x86_64' and platform_system == 'Linux'
# via nvidia-cudnn-cu12
# via nvidia-cusolver-cu12
Expand Down Expand Up @@ -363,6 +360,7 @@ packaging==24.1
# via matplotlib
# via onnxruntime
# via pikepdf
# via pytesseract
# via transformers
# via unstructured-pytesseract
pandas==2.2.3
Expand All @@ -379,8 +377,6 @@ pdfminer-six==20231228
pdfplumber==0.11.4
# via layoutparser
# via megaparse
pi-heif==0.20.0
# via unstructured
pikepdf==9.4.0
# via unstructured
pillow==11.0.0
Expand All @@ -389,11 +385,14 @@ pillow==11.0.0
# via matplotlib
# via pdf2image
# via pdfplumber
# via pi-heif
# via pikepdf
# via pillow-heif
# via pytesseract
# via python-pptx
# via torchvision
# via unstructured-pytesseract
pillow-heif==0.20.0
# via unstructured
playwright==1.48.0
# via megaparse
portalocker==2.10.1
Expand Down Expand Up @@ -459,6 +458,8 @@ pypdfium2==4.30.0
# via pdfplumber
pyreadline3==3.5.4 ; sys_platform == 'win32'
# via humanfriendly
pytesseract==0.3.13
# via unstructured
python-dateutil==2.8.2
# via matplotlib
# via pandas
Expand All @@ -478,7 +479,7 @@ python-multipart==0.0.17
# via unstructured-inference
python-oxmsg==0.0.1
# via unstructured
python-pptx==1.0.2
python-pptx==0.6.23
# via unstructured
pytz==2024.2
# via pandas
Expand Down Expand Up @@ -529,7 +530,6 @@ scipy==1.14.1
setuptools==75.3.0
# via torch
six==1.16.0
# via html5lib
# via langdetect
# via python-dateutil
sniffio==1.3.1
Expand All @@ -549,6 +549,8 @@ starlette==0.41.2
sympy==1.13.1
# via onnxruntime
# via torch
tabulate==0.9.0
# via unstructured
tenacity==8.5.0
# via langchain
# via langchain-community
Expand Down Expand Up @@ -598,7 +600,6 @@ typing-extensions==4.12.2
# via pyee
# via python-docx
# via python-oxmsg
# via python-pptx
# via sqlalchemy
# via torch
# via typing-inspect
Expand All @@ -610,20 +611,18 @@ typing-inspect==0.9.0
# via unstructured-client
tzdata==2024.2
# via pandas
unstructured==0.16.4
unstructured==0.15.0
# via megaparse
unstructured-client==0.27.0
# via unstructured
unstructured-inference==0.8.1
unstructured-inference==0.7.36
# via unstructured
unstructured-pytesseract==0.3.13
# via unstructured
urllib3==2.2.3
# via requests
uvicorn==0.32.0
# via megaparse
webencodings==0.5.1
# via html5lib
wrapt==1.16.0
# via deprecated
# via llama-index-core
Expand Down