Skip to content

Commit

Permalink
feat: drop int64 fields from cxg_schema in the obs array (#653)
Browse files Browse the repository at this point in the history
* drop int64 fields

add fallback if cxg_schema isn't there

* add dummy test

* set up test case

* remove unnecessary things

* rm more things

* move int64 call to top
  • Loading branch information
joyceyan authored Sep 22, 2023
1 parent 198a8a4 commit 7a96def
Show file tree
Hide file tree
Showing 92 changed files with 419 additions and 5 deletions.
9 changes: 6 additions & 3 deletions server/common/utils/type_conversion_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,15 @@ def get_dtype_and_schema_of_array(array: Union[np.ndarray, pd.Series, pd.Index])
return _get_type_info(array)


def get_schema_type_hint_from_dtype(dtype) -> dict:
res = _get_type_info_from_dtype(dtype)
def get_schema_type_hint_from_dtype(dtype: np.dtype, allow_int64=False) -> dict:
res = _get_type_info_from_dtype(dtype=dtype, allow_int64=allow_int64)
if res is None:
raise TypeError(f"Annotations of type {dtype} are unsupported.")
else:
return res[1]


def _get_type_info_from_dtype(dtype) -> Union[Tuple[np.dtype, dict], None]:
def _get_type_info_from_dtype(dtype: np.dtype, allow_int64=False) -> Union[Tuple[np.dtype, dict], None]:
"""
Best-effort to determine encoding type and schema hint from a dtype.
If this is not possible, or the type is unsupported, return None.
Expand All @@ -81,6 +81,9 @@ def _get_type_info_from_dtype(dtype) -> Union[Tuple[np.dtype, dict], None]:
_get_type_info(). The latter should be preferred if the array (values)
are available for typing.
"""
if allow_int64 and dtype.kind in ["i", "u"] and np.can_cast(dtype, np.int64):
return (np.int64, {"type": "int64"})

if dtype.kind == "b":
return (np.uint8, {"type": "boolean"})

Expand Down
13 changes: 11 additions & 2 deletions server/dataset/cxg_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,12 @@ def _get_schema(self):
shape = self.get_shape()
dtype = self.get_X_array_dtype()

dataframe = {"nObs": shape[0], "nVar": shape[1], **get_schema_type_hint_from_dtype(dtype)}
dataframe = {
"nObs": shape[0],
"nVar": shape[1],
# Allow int64 fields to be generated in the schema hint so that we can filter later
**get_schema_type_hint_from_dtype(dtype=dtype, allow_int64=True),
}

annotations = {}
for ax in ("obs", "var"):
Expand All @@ -406,6 +411,10 @@ def _get_schema(self):
type_hint = schema_hints.get(attr.name, {})
# type hints take precedence
if "type" in type_hint:
if type_hint["type"] in ["int64", "uint64"] and ax == "obs":
# Skip over int64 and uint64 fields in the obs array when generating the schema
continue

schema["type"] = type_hint["type"]
if schema["type"] == "boolean" and ax == "obs":
# convert boolean to categorical
Expand All @@ -416,7 +425,7 @@ def _get_schema(self):
elif schema["type"] == "categorical" and "categories" in type_hint:
schema["categories"] = type_hint["categories"]
else:
schema.update(get_schema_type_hint_from_dtype(attr.dtype))
schema.update(get_schema_type_hint_from_dtype(dtype=attr.dtype))
cols.append(schema)

annotations[ax] = dict(columns=cols)
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit 7a96def

Please sign in to comment.