Skip to content

Commit

Permalink
add histogram
Browse files Browse the repository at this point in the history
  • Loading branch information
changhiskhan committed Aug 16, 2022
1 parent 514421f commit 0e02cbb
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 8 deletions.
20 changes: 20 additions & 0 deletions python/benchmarks/oxford_pet.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Optional

import duckdb
import numpy as np
import pandas as pd

import lance
Expand Down Expand Up @@ -44,6 +45,17 @@ def filter_data(base_uri: str, fmt: str, flavor: Optional[str]):
return scanner.to_table().to_pandas()


@oxford_pet_benchmarks.benchmark("area_histogram", key=['fmt', 'flavor'])
def compute_histogram(base_uri: str, fmt: str, flavor: Optional[str]):
    """Benchmark: histogram of image area (width * height) over the dataset.

    Parameters
    ----------
    base_uri : str
        Directory that holds the dataset files (or the raw dataset root).
    fmt : str
        Storage format. ``"raw"`` delegates to ``area_histogram_raw``; any
        other value is used as the file extension and passed on to
        ``_get_dataset``.
    flavor : Optional[str]
        Optional variant name; when truthy it is appended to the file stem
        as ``_<flavor>``.

    Returns
    -------
    The query result converted to a DataFrame via ``.to_df()`` (or whatever
    ``area_histogram_raw`` returns for the raw format).
    """
    if fmt != "raw":
        name_suffix = f'_{flavor}' if flavor else ''
        uri = os.path.join(base_uri, f'oxford_pet{name_suffix}.{fmt}')
        # NOTE: the SQL text below refers to `ds` by name -- presumably DuckDB
        # resolves it from this local variable, so do not rename or inline it.
        ds = _get_dataset(uri, fmt)
        return duckdb.query(
            "SELECT histogram(size.width * size.height) FROM ds"
        ).to_df()
    return area_histogram_raw(base_uri)


def _get_dataset(uri, fmt):
if fmt == "parquet":
return pa.dataset.dataset(uri)
Expand All @@ -68,6 +80,14 @@ def get_pets_filtered_data(base_uri, klass="pug", offset=20, limit=50):
return limited.assign(images=download_uris(pd.Series(uris)))


def area_histogram_raw(base_uri):
    """Compute the image-area histogram straight from the raw dataset metadata.

    Reads the metadata through ``OxfordPetConverter(base_uri).read_metadata()``,
    flattens its ``size`` column with ``pd.json_normalize`` and runs a DuckDB
    ``histogram(width * height)`` query over the flattened table.

    Parameters
    ----------
    base_uri
        Location handed to ``OxfordPetConverter``.

    Returns
    -------
    The query result converted to a DataFrame via ``.to_df()``.
    """
    metadata = OxfordPetConverter(base_uri).read_metadata()
    # NOTE: the SQL text below refers to `sz` by name -- presumably DuckDB
    # resolves it from this local variable, so do not rename or inline it.
    # Assumes each `size` entry is a mapping with `width` and `height` keys
    # (the query needs those columns) -- confirm against read_metadata.
    sz = pd.json_normalize(metadata['size'])
    return duckdb.query("SELECT histogram(width * height) FROM sz").to_df()


# Script entry point: when this module is run directly, ask the benchmark
# suite object for its entry function (via `create_main()`) and invoke it.
if __name__ == "__main__":
    main = oxford_pet_benchmarks.create_main()
    main()
17 changes: 9 additions & 8 deletions python/benchmarks/parse_pet.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ def read_metadata(self, check_quality=False) -> pd.DataFrame:
no_index = pd.Index(names.values).difference(df.filename)
self._data_quality_issues["missing_index"] = no_index

with_xmls['segmented'] = with_xmls.segmented.astype(bool)
# TODO lance doesn't support writing booleans yet
with_xmls['segmented'] = with_xmls.segmented.astype(pd.Int8Dtype())
return with_xmls

def _get_index(self, name: str) -> pd.DataFrame:
Expand Down Expand Up @@ -132,16 +133,16 @@ def get_schema(self):
object_schema = pa.list_(pa.struct([
pa.field("name", pa.string()),
pa.field("pose", pa.string()),
pa.field("truncated", pa.bool_()),
pa.field("occluded", pa.bool_()),
pa.field("truncated", pa.uint8()),
pa.field("occluded", pa.uint8()),
pa.field("bndbox", bbox),
pa.field("difficult", pa.bool_())
pa.field("difficult", pa.uint8())
]))
names = ["filename", "class", "species", "breed", "split",
"folder", "source", "size", "segmented", "object"]
types = [pa.string(), pa.string(), pa.string(), pa.int16(),
pa.string(), pa.string(), source_schema, size_schema,
pa.bool_(), object_schema]
pa.uint8(), object_schema]
return pa.schema([pa.field(name, dtype)
for name, dtype in zip(names, types)])

Expand All @@ -158,9 +159,9 @@ def _get_xml(uri):
sz['height'] = int(sz['height'])
sz['depth'] = int(sz['depth'])
for obj in dd['object']:
obj['truncated'] = bool(obj['truncated'])
obj['occluded'] = bool(obj['occluded'])
obj['difficult'] = bool(obj['difficult'])
obj['truncated'] = int(obj['truncated'])
obj['occluded'] = int(obj['occluded'])
obj['difficult'] = int(obj['difficult'])
obj['bndbox'] = {
'xmin': int(obj['bndbox']['xmin']),
'xmax': int(obj['bndbox']['xmax']),
Expand Down

0 comments on commit 0e02cbb

Please sign in to comment.