Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: cxg conversion script updates for uns #6904

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions backend/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@
"dev": "https://api.cellxgene.dev.single-cell.czi.technology",
}

UNS_META_KEYS = ["spatial"]

DATA_SUBMISSION_POLICY_VERSION = "2.0"
33 changes: 32 additions & 1 deletion backend/common/utils/cxg_generation_utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import json
import logging
import pickle

import numpy as np
import pandas as pd
import tiledb

from backend.common.constants import UNS_META_KEYS
from backend.common.utils.type_conversion_utils import get_dtype_and_schema_of_array
from backend.common.utils.uns import filter_spatial_data


def convert_dictionary_to_cxg_group(cxg_container, metadata_dict, group_metadata_name="cxg_group_metadata", ctx=None):
Expand All @@ -31,6 +34,29 @@
metadata_array.meta[key] = value


def convert_uns_to_cxg_group(cxg_container, metadata_dict, group_metadata_name="cxg_group_metadata", ctx=None):
    """
    Write the supported entries of an ``uns`` (unstructured) mapping into a CXG metadata array.

    A one-element dense array is created at ``{cxg_container}/{group_metadata_name}`` and,
    for every top-level key of ``metadata_dict`` that is listed in ``UNS_META_KEYS``, a
    pickled dictionary is stored in that array's metadata under the same key. Keys that are
    not listed in ``UNS_META_KEYS`` are ignored.

    :param cxg_container: path/URI of the CXG output directory.
    :param metadata_dict: the ``uns`` mapping; every supported key must map to a mapping of
        ``object_id -> content``.
    :param group_metadata_name: name of the array created inside ``cxg_container``.
    :param ctx: TileDB context used when opening the array for writing.
    """
    from collections.abc import Mapping

    array_name = f"{cxg_container}/{group_metadata_name}"

    # Create a minimal placeholder array whose metadata will carry the uns entries.
    tiledb.from_numpy(array_name, np.zeros((1,)))

    with tiledb.open(array_name, mode="w", ctx=ctx) as metadata_array:
        for key, value in metadata_dict.items():
            if key not in UNS_META_KEYS:
                continue

            # Start from an empty dict for each key so entries never leak from one key into the next.
            object_filtered = {}
            for object_id, content in value.items():
                if key == "spatial":
                    if not isinstance(content, Mapping):
                        # filter_spatial_data indexes into a per-library mapping, so a scalar
                        # entry would raise TypeError. Presumably this covers schema flags such
                        # as "is_single" -- TODO confirm whether they should be stored instead.
                        continue
                    # Merge instead of reassigning so that every library is retained,
                    # not only the last one iterated.
                    object_filtered.update(filter_spatial_data(content, object_id))
                else:
                    # NOTE(review): Codecov reported this branch as not covered by tests.
                    object_filtered[object_id] = content

            metadata_array.meta[key] = pickle.dumps(object_filtered)


def convert_dataframe_to_cxg_array(cxg_container, dataframe_name, dataframe, index_column_name, ctx):
"""
Saves the contents of the dataframe to the CXG output directory specified.
Expand All @@ -52,7 +78,12 @@
tdb_attrs = []

for column_name, column_values in dataframe.items():
dtype, hints = get_dtype_and_schema_of_array(column_values)
# Cast 'in_tissue' column values as boolean to make it categorical
# https://github.com/chanzuckerberg/single-cell-explorer/issues/841
if column_name == "in_tissue":
dtype, hints = get_dtype_and_schema_of_array(column_values.astype(bool))

Check warning on line 84 in backend/common/utils/cxg_generation_utils.py

View check run for this annotation

Codecov / codecov/patch

backend/common/utils/cxg_generation_utils.py#L84

Added line #L84 was not covered by tests
else:
dtype, hints = get_dtype_and_schema_of_array(column_values)
if "categories" in hints and len(hints.get("categories", [])) > 0.75 * dataframe.shape[0]:
hints["type"] = "string"
del hints["categories"]
Expand Down
18 changes: 18 additions & 0 deletions backend/common/utils/uns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
def filter_spatial_data(content, library_id):
    """
    Reduce one library's entry under the "spatial" key to the sub-items that are retained.

    From ``content["images"]`` only "hires" is copied, and "fullres" is always returned as an
    empty list. From ``content["scalefactors"]`` only "spot_diameter_fullres" and
    "tissue_hires_scalef" are copied. The result is a single-entry dictionary keyed by
    ``library_id``. A missing key in ``content`` raises ``KeyError``.
    https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#uns-dataset-metadata
    """
    source_images = content["images"]
    kept_images = {
        "hires": source_images["hires"],  # Omit hires data once deep zooming feature is implemented
        "fullres": [],  # fullres data is not included for now, since the deep zooming feature is coming soon
    }

    kept_scalefactors = {
        "spot_diameter_fullres": content["scalefactors"]["spot_diameter_fullres"],
        "tissue_hires_scalef": content["scalefactors"]["tissue_hires_scalef"],
    }

    return {library_id: {"images": kept_images, "scalefactors": kept_scalefactors}}
4 changes: 4 additions & 0 deletions backend/layers/processing/h5ad_data_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
convert_dictionary_to_cxg_group,
convert_matrices_to_cxg_arrays,
convert_ndarray_to_cxg_dense_array,
convert_uns_to_cxg_group,
)
from backend.common.utils.matrix_utils import is_matrix_sparse
from backend.common.utils.tiledb import consolidation_buffer_size
Expand Down Expand Up @@ -79,6 +80,9 @@ def to_cxg(self, output_cxg_directory, sparse_threshold, convert_anndata_colors_
convert_dataframe_to_cxg_array(output_cxg_directory, "var", self.var, self.var_index_column_name, ctx)
logging.info("\t...dataset var dataframe saved")

convert_uns_to_cxg_group(output_cxg_directory, self.anndata.uns, "uns", ctx)
logging.info("\t...dataset uns dataframe saved")

self.write_anndata_embeddings_to_cxg(output_cxg_directory, ctx)
logging.info("\t...dataset embeddings saved")

Expand Down
31 changes: 30 additions & 1 deletion tests/unit/backend/layers/utils/test_cxg_generation_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import pickle
import unittest
from os import mkdir, path
from shutil import rmtree
Expand All @@ -13,6 +14,7 @@
convert_dictionary_to_cxg_group,
convert_matrices_to_cxg_arrays,
convert_ndarray_to_cxg_dense_array,
convert_uns_to_cxg_group,
)
from tests.unit.backend.fixtures.environment_setup import fixture_file_path

Expand All @@ -28,7 +30,7 @@ def tearDown(self):

def test__convert_dictionary_to_cxg_group__writes_successfully(self):
random_dictionary = {"cookies": "chocolate_chip", "brownies": "chocolate", "cake": "double chocolate"}
dictionary_name = "favorite_desserts"
dictionary_name = "spatial"
expected_array_directory = f"{self.testing_cxg_temp_directory}/{dictionary_name}"

convert_dictionary_to_cxg_group(
Expand All @@ -42,6 +44,33 @@ def test__convert_dictionary_to_cxg_group__writes_successfully(self):
self.assertTrue(isinstance(array, tiledb.DenseArray))
self.assertEqual(random_dictionary, actual_stored_metadata)

def test__convert_uns_to_cxg_group__writes_successfully(self):
    """
    Writing an ``uns`` mapping with a "spatial" entry creates a dense TileDB array whose
    "spatial" metadata unpickles back to the retained spatial fields.
    """
    random_dictionary = {
        "spatial": {
            "abcd": {
                "images": {
                    "hires": "123",
                    "fullres": [],
                },
                "scalefactors": {
                    "spot_diameter_fullres": "123",
                    "tissue_hires_scalef": "123",
                },
            }
        }
    }
    dictionary_name = "uns"
    expected_array_directory = f"{self.testing_cxg_temp_directory}/{dictionary_name}"

    convert_uns_to_cxg_group(
        self.testing_cxg_temp_directory, random_dictionary, group_metadata_name=dictionary_name
    )

    self.assertTrue(path.isdir(expected_array_directory))

    # Open through a context manager so the array handle is released even when an
    # assertion fails; the metadata is copied out before the array is closed.
    with tiledb.open(expected_array_directory) as array:
        self.assertIsInstance(array, tiledb.DenseArray)
        actual_stored_metadata = dict(array.meta.items())

    self.assertEqual(random_dictionary["spatial"], pickle.loads(actual_stored_metadata["spatial"]))

def test__convert_dataframe_to_cxg_array__writes_successfully(self):
random_int_category = Series(data=[3, 1, 2, 4], dtype=np.int64)
random_bool_category = Series(data=[True, True, False, True], dtype=np.bool_)
Expand Down
Loading