Skip to content

Commit

Permalink
Remove pandas dep and fix argument checksum on CLI (#79)
Browse files Browse the repository at this point in the history
  • Loading branch information
J535D165 authored Jun 24, 2024
1 parent 9abbe92 commit 3c00afa
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 36 deletions.
29 changes: 26 additions & 3 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,34 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python package
name: Test repositories
on: [push, pull_request]

jobs:
build:
test-repositories:

runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install .
python -m pip install pytest pytest-xdist
- name: Test with pytest
run: |
pytest -n 4 --ignore=tests/test_repositories_plus.py
test-repositories-plus:

runs-on: ubuntu-latest
strategy:
@@ -26,4 +49,4 @@ jobs:
python -m pip install pytest pytest-xdist
- name: Test with pytest
run: |
pytest -n 4
pytest -n 4 tests/test_repositories_plus.py
3 changes: 1 addition & 2 deletions datahugger/__main__.py
Original file line number Diff line number Diff line change
@@ -66,8 +66,7 @@ def main():
parser.add_argument("--no-unzip", dest="unzip", action="store_false")
parser.set_defaults(unzip=True)

parser.add_argument("--checksum", dest="checksum", action="store_false")
parser.set_defaults(checksum=False)
parser.add_argument("--checksum", dest="checksum", action="store_true")

parser.add_argument("--no-progress", dest="progress", action="store_false")
parser.set_defaults(progress=True)
Expand Down
27 changes: 12 additions & 15 deletions datahugger/base.py
Original file line number Diff line number Diff line change
@@ -10,7 +10,6 @@
from typing import Union
from urllib.parse import urlparse

import pandas as pd
import requests
from jsonpath_ng import parse
from scitree import scitree
@@ -224,24 +223,22 @@ def _check_checksums(self, output_folder, files_info):
try:
checksums = {}

df = pd.DataFrame(files_info)

# loop through the downloaded files in the output_folder
for subdir, dirs, files in os.walk(output_folder):
logging.info(f"Not using the dirs: {dirs}")
for file in files:
filepath = os.path.join(subdir, file)
df2 = df[df["name"] == file].reset_index()

file_comp = list(filter(lambda x: x["name"] == file, files_info))

try:
hash = df2["hash"][0]
except Exception as e:
logging.info(f"Setting hash to None: {e}")
hash = file_comp[0]["hash"]
hash_type = file_comp[0]["hash_type"]
except IndexError:
logging.info("Setting hash and hash_type to None")
hash = None
try:
hash_type = df2["hash_type"][0]
except Exception as e:
logging.info(f"Setting hash_type to None: {e}")
hash_type = None

newhash = None
with open(filepath, "rb") as f:
if hash_type == "md5":
@@ -257,10 +254,10 @@ def _check_checksums(self, output_folder, files_info):
if hash_type == "sha512":
newhash = hashlib.sha512(f.read()).hexdigest()
hash_match = hash == newhash

if hash is not None and hash_type is not None:
status = f"---> Checksum match: {hash_match} - {file}"
print(status)
logging.info(status)
print(f"Checksum match: {hash_match} - {file}")
logging.info(f"Checksum match: {hash_match} - {file}")
checksums[file] = hash_match

try:
@@ -398,7 +395,7 @@ def _get(
file_hash=f["hash"],
file_hash_type=f["hash_type"],
)
# if checksum==True do checking of checksum

if self.checksum:
self._check_checksums(output_folder=output_folder, files_info=files_info)

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -24,7 +24,7 @@ datahugger = "datahugger.__main__:main"

[project.optional-dependencies]
all = ["datasets"]
benchmark = ["pandas", "requests", "tabulate"]
benchmark = ["pandas", "tabulate"]
lint = ["ruff"]
test = ["pytest"]
docs = ["mkdocs-material"]
Expand Down
15 changes: 0 additions & 15 deletions tests/test_repositories.py
Original file line number Diff line number Diff line change
@@ -53,8 +53,6 @@
"https://repositorioinstitucional.ceu.es/handle/10637/2741",
"Aquaporin_1_JAMartin_et_al_MedSport_2009.pdf",
),
# huggingface
# ("10.57967/hf/0034", "test.csv"),
# Pangaea
("https://doi.org/10.1594/PANGAEA.954547", "Gubbio_age.tab"),
("https://doi.pangaea.de/10.1594/PANGAEA.954543", "AA_age.tab"),
@@ -99,16 +97,3 @@ def test_info_without_loading(tmpdir):
dh_info = datahugger.info("https://osf.io/wdzh5/")

assert dh_get.dataset.files == dh_info.files


def test_huggingface(tmpdir):
datahugger.get(
"https://huggingface.co/datasets/wikitext",
tmpdir,
params={"name": "wikitext-2-v1"},
)


def test_huggingface_without_params(tmpdir):
with pytest.raises(ValueError):
datahugger.get("https://huggingface.co/datasets/wikitext", tmpdir)
16 changes: 16 additions & 0 deletions tests/test_repositories_plus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pytest

import datahugger


def test_huggingface(tmpdir):
datahugger.get(
"https://huggingface.co/datasets/wikitext",
tmpdir,
params={"name": "wikitext-2-v1"},
)


def test_huggingface_without_params(tmpdir):
with pytest.raises(ValueError):
datahugger.get("https://huggingface.co/datasets/wikitext", tmpdir)

0 comments on commit 3c00afa

Please sign in to comment.