Skip to content

Commit

Permalink
Update requirements, add bench script
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Oct 14, 2024
1 parent 859302c commit 4f8c50c
Show file tree
Hide file tree
Showing 8 changed files with 291 additions and 165 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/cla.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# CLA workflow: runs the contributor-assistant action when a pull request is
# opened, closed or synchronized, and when a comment is created on an
# issue/PR.
name: "Tabled CLA Assistant"
on:
  issue_comment:
    types: [created]
  pull_request_target:
    types: [opened, closed, synchronize]

# explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings
permissions:
  actions: write
  contents: write
  pull-requests: write
  statuses: write

jobs:
  CLAAssistant:
    runs-on: ubuntu-latest
    steps:
      - name: "Tabled CLA Assistant"
        # Only act on the two recognised comment bodies ('recheck' or the
        # exact signing sentence) or on pull_request_target events; any other
        # comment leaves this step skipped.
        if: >-
          (github.event.comment.body == 'recheck' ||
          github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') ||
          github.event_name == 'pull_request_target'
        # NOTE(review): the action reference must have the form
        # contributor-assistant/github-action@<tag>. The tag could not be
        # recovered from the obfuscated original; pinned to v2.6.1 - confirm
        # this is the intended release.
        uses: contributor-assistant/github-action@v2.6.1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # the below token should have repo scope and must be manually added by you in the repository's secret
          # This token is required only if you have configured to store the signatures in a remote repository/organization
          PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
        with:
          path-to-signatures: 'signatures/version1/cla.json'
          path-to-document: 'https://github.com/VikParuchuri/tabled/blob/master/CLA.md'
          # branch should not be protected
          branch: 'master'
          allowlist: VikParuchuri
27 changes: 27 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Release workflow: on every pushed tag matching v*.*.*, build the package
# with Poetry and publish it using the PYPI_TOKEN repository secret.
name: Python package
on:
  push:
    tags:
      - "v*.*.*"
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version a string; an unquoted value is
          # parsed as a float (e.g. 3.10 would silently become 3.1).
          python-version: "3.11"
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install
      - name: Build package
        run: |
          poetry build
      - name: Publish package
        env:
          # Passed through the environment so the token is expanded by the
          # shell rather than interpolated into the script text.
          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
        run: |
          poetry config pypi-token.pypi "$PYPI_TOKEN"
          poetry publish
30 changes: 30 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Integration test workflow: on every push, install the project with a
# CPU-only torch wheel, run the benchmark on at most 5 items, then run the
# score verification script on the benchmark output.
name: Integration test

on: [push]

env:
  TORCH_DEVICE: "cpu"

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version a string; an unquoted value is
          # parsed as a float (e.g. 3.10 would silently become 3.1).
          python-version: "3.11"
      - name: Install apt dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
      - name: Install python dependencies
        # Swaps the torch build installed by Poetry for the wheel served from
        # the PyTorch CPU index.
        run: |
          pip install poetry
          poetry install
          poetry remove torch
          poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu
      - name: Run benchmark test
        run: |
          poetry run python benchmarks/benchmark.py --max 5 temp.json
          poetry run python scripts/verify_benchmark_scores.py temp.json
18 changes: 10 additions & 8 deletions benchmarks/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import time

import click
import datasets
from surya.input.pdflines import get_table_blocks
from tabulate import tabulate
Expand All @@ -14,13 +15,14 @@
from tabled.inference.recognition import recognize_tables


def main():
parser = argparse.ArgumentParser(description="Benchmark table conversion.")
parser.add_argument("out_file", help="Output filename for results")
parser.add_argument("--dataset", type=str, help="Dataset to use", default="vikp/table_bench2")
args = parser.parse_args()

ds = datasets.load_dataset(args.dataset, split="train")
@click.command()
@click.argument("out_file", type=str)
@click.option("--dataset", type=str, default="vikp/table_bench2", help="Dataset to use")
@click.option("--max", type=int, default=None, help="Max number of tables to process")
def main(out_file, dataset, max):
ds = datasets.load_dataset(dataset, split="train")
if max:
ds = ds[:max]

rec_models = load_recognition_models()

Expand Down Expand Up @@ -70,7 +72,7 @@ def main():
print(table)
print("Avg score computed by aligning table cell text with GPT-4 table cell text.")

with open(args.out_file, "w+") as f:
with open(out_file, "w+") as f:
json.dump(results, f, indent=2)


Expand Down
2 changes: 1 addition & 1 deletion extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from tabled.inference.models import load_detection_models, load_recognition_models


@click.command()
@click.command(help="Extract tables from PDFs")
@click.argument("in_path", type=click.Path(exists=True))
@click.argument("out_folder", type=click.Path())
@click.option("--save_json", is_flag=True, help="Save row/column/cell information in json format")
Expand Down
Loading

0 comments on commit 4f8c50c

Please sign in to comment.