Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HDF5 refactor #310

Merged
merged 9 commits into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions dascore/core/attrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ def _get_dims(data_dict):
coord_info, new_attrs = separate_coord_info(
data_dict, dims, required=("min", "max")
)
if "dims" not in new_attrs and dims is not None:
new_attrs["dims"] = dims

new_attrs["coords"] = {i: dc.core.CoordSummary(**v) for i, v in coord_info.items()}
return new_attrs

Expand Down
5 changes: 2 additions & 3 deletions dascore/core/patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from dascore.core.attrs import PatchAttrs
from dascore.core.coordmanager import CoordManager, get_coord_manager
from dascore.core.coords import BaseCoord
from dascore.io import PatchIO
from dascore.utils.display import array_to_text, attrs_to_text, get_dascore_text
from dascore.utils.models import ArrayLike
from dascore.viz import VizPatchNameSpace
Expand Down Expand Up @@ -268,6 +267,6 @@ def tran(self) -> Self:
return self

@property
def io(self) -> PatchIO:
def io(self) -> dc.io.PatchIO:
"""Return a patch IO object for saving patches to various formats."""
return PatchIO(self)
return dc.io.PatchIO(self)
3 changes: 1 addition & 2 deletions dascore/core/spool.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from typing_extensions import Self

import dascore as dc
import dascore.io
from dascore.constants import (
ExecutorType,
PatchType,
Expand Down Expand Up @@ -601,7 +600,7 @@ def _spool_from_str(path, **kwargs):
# Return a FileSpool (lazy file reader), else return DirectorySpool.
elif path.exists(): # a single file path was passed.
_format, _version = dc.get_format(path, **kwargs)
formatter = dascore.io.FiberIO.manager.get_fiberio(_format, _version)
formatter = dc.io.FiberIO.manager.get_fiberio(_format, _version)
if formatter.implements_scan:
from dascore.clients.filespool import FileSpool

Expand Down
1 change: 1 addition & 0 deletions dascore/data_registry.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ h5_simple_1.h5 52803b26a5738da541cc9b32b867434cad0a845686c47dd93551b8eb431f8bc0
h5_simple_2.h5 8a70b873c5c2c172871ecd63760d24fa2c305c015e2ca1c84018c6864d2fb304 https://github.com/dasdae/test_data/raw/master/das/h5_simple_2.h5
conoco_segy_1.sgy 3944297d7c27dd265b40d56d4797f1a14caa5c2bed9f0af020b0f6ea193d4dfd https://github.com/dasdae/test_data/raw/master/das/conoco_segy_1.sgy
dispersion_event.h5 598c8baa2a5610c930e1c003f2ba02da13f8d8686e3ccf2a034e94bfc5e1990c https://github.com/dasdae/test_data/raw/master/das/dispersion_event.h5
PoroTomo_iDAS_1.h5 967a2885e79937ac0426b2022a9c03d5f24790ecf3abbaa9a16eb28055566fc6 https://github.com/dasdae/test_data/raw/master/das/PoroTomo_iDAS_1.h5
10 changes: 9 additions & 1 deletion dascore/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,15 @@
"""
from __future__ import annotations
from dascore.io.core import FiberIO, read, scan, scan_to_df, write, PatchFileSummary
from dascore.utils.io import BinaryReader, BinaryWriter, HDF5Reader, HDF5Writer
from dascore.utils.io import BinaryReader, BinaryWriter
from dascore.utils.hdf5 import (
HDF5Writer,
HDF5Reader,
PyTablesWriter,
PyTablesReader,
H5Reader,
H5Writer,
)
from dascore.utils.misc import MethodNameSpace


Expand Down
2 changes: 1 addition & 1 deletion dascore/io/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,7 @@ def scan(
file_format_, file_version_ = file_format, file_version
formatter = FiberIO.manager.get_fiberio(file_format_, file_version_)
req_type = getattr(formatter.scan, "_required_type", None)
# this will get an open file handle to past to get_resource
# this will get an open file handle to pass to get_resource
patch_thing = man.get_resource(req_type)
for attr in formatter.scan(patch_thing, _pre_cast=True):
out.append(dc.PatchAttrs.from_dict(attr))
Expand Down
19 changes: 12 additions & 7 deletions dascore/io/dasdae/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,13 @@

import dascore as dc
from dascore.constants import SpoolType
from dascore.io import FiberIO, HDF5Reader, HDF5Writer
from dascore.utils.hdf5 import HDFPatchIndexManager, NodeError
from dascore.io import FiberIO
from dascore.utils.hdf5 import (
HDFPatchIndexManager,
NodeError,
PyTablesReader,
PyTablesWriter,
)
from dascore.utils.patch import get_default_patch_name

from .utils import (
Expand Down Expand Up @@ -44,7 +49,7 @@ class DASDAEV1(FiberIO):
preferred_extensions = ("h5", "hdf5")
version = "1"

def write(self, spool: SpoolType, resource: HDF5Writer, index=False, **kwargs):
def write(self, spool: SpoolType, resource: PyTablesWriter, index=False, **kwargs):
"""
Write a collection of patches to a DASDAE file.

Expand Down Expand Up @@ -90,7 +95,7 @@ def _get_patch_summary(self, patches) -> pd.DataFrame:
)
return df

def get_format(self, resource: HDF5Reader) -> tuple[str, str] | bool:
def get_format(self, resource: PyTablesReader) -> tuple[str, str] | bool:
"""Return the format from a dasdae file."""
is_dasdae, version = False, "" # NOQA
with contextlib.suppress(KeyError):
Expand All @@ -100,7 +105,7 @@ def get_format(self, resource: HDF5Reader) -> tuple[str, str] | bool:
return (self.name, dasdae_file_version)
return False

def read(self, resource: HDF5Reader, **kwargs) -> SpoolType:
def read(self, resource: PyTablesReader, **kwargs) -> SpoolType:
"""Read a dascore file."""
patches = []
try:
Expand All @@ -111,7 +116,7 @@ def read(self, resource: HDF5Reader, **kwargs) -> SpoolType:
patches.append(_read_patch(patch_group, **kwargs))
return dc.spool(patches)

def scan(self, resource: HDF5Reader):
def scan(self, resource: PyTablesReader):
"""
Get the patch info from the file.

Expand All @@ -121,7 +126,7 @@ def scan(self, resource: HDF5Reader):

Parameters
----------
Path
resource
A path to the file.
"""
indexer = HDFPatchIndexManager(resource.filename)
Expand Down
11 changes: 11 additions & 0 deletions dascore/io/dashdf5/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""
Basic support for DAS-HDF5, a subset of CF (climate and forecasting).

This was created mainly for reading PoroTomo data from Brady Hotsprings.

More info on PoroTomo here:
https://github.com/openEDI/documentation/tree/main/PoroTomo
"""
from __future__ import annotations

from .core import DASHDF5
74 changes: 74 additions & 0 deletions dascore/io/dashdf5/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""IO module for reading prodML data."""
from __future__ import annotations

import numpy as np

import dascore as dc
from dascore.constants import opt_timeable_types
from dascore.io import FiberIO
from dascore.utils.hdf5 import H5Reader
from dascore.utils.models import UnitQuantity, UTF8Str

from .utils import _get_cf_attrs, _get_cf_coords, _get_cf_version_str


class ProdMLPatchAttrs(dc.PatchAttrs):
    """
    Patch attributes carried by ProdML-style HDF5 data.

    NOTE(review): this class lives in the dashdf5 module but is named
    ProdML and is not referenced by DASHDF5 below — it looks copied from
    the prodml IO module; confirm whether it is still needed here.
    """

    # Use np.nan: the np.NAN / np.NaN aliases were removed in NumPy 2.0.
    pulse_width: float = np.nan
    pulse_width_units: UnitQuantity | None = None
    gauge_length: float = np.nan
    gauge_length_units: UnitQuantity | None = None
    schema_version: UTF8Str = ""


class DASHDF5(FiberIO):
    """IO Support for DASHDF5 which uses CF version 1.7."""

    name = "DASHDF5"
    preferred_extensions = ("hdf5", "h5")
    version = "1.0"

    def get_format(self, resource: H5Reader) -> tuple[str, str] | bool:
        """
        Return (name, version) if the resource is a DAS-HDF5 file.

        Returns None (implicitly) when the file does not match; callers
        treat any falsy value as "not this format".

        Parameters
        ----------
        resource
            An open HDF5 file which may contain DAS-HDF5 (CF) data.
        """
        version_str = _get_cf_version_str(resource)
        if version_str:
            return self.name, version_str

    def scan(self, resource: H5Reader) -> list[dc.PatchAttrs]:
        """
        Get metadata from file without reading the data array.

        Builds coordinates from the file's coordinate groups, then attaches
        path/format/version info so the result can be indexed.
        """
        coords = _get_cf_coords(resource)
        extras = {
            "path": resource.filename,
            "file_format": self.name,
            "file_version": str(self.version),
        }
        attrs = _get_cf_attrs(resource, coords, extras=extras)
        return [attrs]

    def read(
        self,
        resource: H5Reader,
        time: tuple[opt_timeable_types, opt_timeable_types] | None = None,
        channel: tuple[float | None, float | None] | None = None,
        **kwargs,
    ):
        """
        Read a DAS-HDF5 (CF) file and return a spool containing one Patch.

        Parameters
        ----------
        time
            Optional (start, stop) range used to trim the time coordinate.
        channel
            Optional (start, stop) range used to trim the channel coordinate.
        """
        coords = _get_cf_coords(resource)
        # select trims both the coordinate manager and the data array so
        # they stay aligned after any time/channel filtering.
        coords_new, data = coords.select(
            array=resource["das"],
            time=time,
            channel=channel,
        )
        attrs = _get_cf_attrs(resource, coords_new)
        patch = dc.Patch(
            data=data, attrs=attrs, coords=coords_new, dims=coords_new.dims
        )
        return dc.spool(patch)
87 changes: 87 additions & 0 deletions dascore/io/dashdf5/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""Utilities for terra15."""
from __future__ import annotations

import dascore as dc
from dascore.core import get_coord

# --- Getting format/version

_REQUIRED_GROUPS = frozenset({"channel", "trace", "das", "t", "x", "y", "z"})
_COORD_GROUPS = ("channel", "trace", "t", "x", "y", "z")


# maps attributes on DAS group to attrs stored in patch.
_ROOT_ATTR_MAPPING = {"project": "project"}
_DAS_ATTR_MAPPING = {"long_name": "data_type"}
_CRS_MAPPING = {"epsg_code": "epsg_code"}


def _get_cf_version_str(hdf_fi) -> str | bool:
"""Return the version string for dashdf5 files."""
conventions = hdf_fi.attrs.get("Conventions", [])
cf_str = [x for x in conventions if x.startswith("CF-")]
das_hdf_str = [x for x in conventions if x.startswith("DAS-HDF5")]
has_req_groups = _REQUIRED_GROUPS.issubset(set(hdf_fi))
# if CF convention not found or not all
if len(cf_str) == 0 or len(das_hdf_str) == 0 or not has_req_groups:
return False
return das_hdf_str[0].replace("DAS-HDF5-", "")


def _get_cf_coords(hdf_fi, minimal=False) -> dc.core.CoordManager:
    """
    Get a coordinate manager of full file range.

    Parameters
    ----------
    minimal
        If True, only return queryable parameters.
        NOTE(review): this flag is currently never read by the body — the
        full coordinate set is always returned; confirm intended behavior.

    """

    def _get_spatialcoord(hdf_fi, code):
        """Get spatial coord (values plus the units stored on the group)."""
        return get_coord(
            values=hdf_fi[code],
            units=hdf_fi[code].attrs["units"],
        )

    # One coordinate per file group; "t" values are converted to
    # numpy datetime64 via dascore's helper and exposed as "time".
    coords_map = {
        "channel": get_coord(values=hdf_fi["channel"][:]),
        "trace": get_coord(values=hdf_fi["trace"][:]),
        "time": get_coord(values=dc.to_datetime64(hdf_fi["t"][:])),
        "x": _get_spatialcoord(hdf_fi, "x"),
        "y": _get_spatialcoord(hdf_fi, "y"),
        "z": _get_spatialcoord(hdf_fi, "z"),
    }
    # Map each coordinate onto its dimension: trace varies along time,
    # while x, y, z vary along channel.
    dim_map = {
        "time": ("time",),
        "trace": ("time",),
        "channel": ("channel",),
        "x": ("channel",),
        "y": ("channel",),
        "z": ("channel",),
    }
    dims = ("channel", "time")
    cm = dc.core.CoordManager(
        coord_map=coords_map,
        dim_map=dim_map,
        dims=dims,
    )
    # a bit of a hack to make sure data and coords align: if the assumed
    # (channel, time) order disagrees with the data array, transpose.
    if cm.shape != hdf_fi["das"].shape:
        cm = cm.transpose()
    return cm


def _get_cf_attrs(hdf_fi, coords=None, extras=None):
    """
    Assemble a PatchAttrs for a CF/DAS-HDF5 file.

    Pulls selected attributes from the file root and from the "das" and
    "crs" groups (a missing group or attribute simply yields None), merges
    in the coordinate summary and any extras, and returns dc.PatchAttrs.
    """
    attr_dict = {"coords": coords or _get_cf_coords(hdf_fi)}
    attr_dict.update(extras or {})
    # Pair each attribute namespace with the name mapping applied to it;
    # getattr(..., "attrs", {}) keeps lookups safe when a group is absent.
    sources = (
        (hdf_fi.attrs, _ROOT_ATTR_MAPPING),
        (getattr(hdf_fi.get("das", {}), "attrs", {}), _DAS_ATTR_MAPPING),
        (getattr(hdf_fi.get("crs", {}), "attrs", {}), _CRS_MAPPING),
    )
    for namespace, mapping in sources:
        for patch_key, source_key in mapping.items():
            attr_dict[patch_key] = namespace.get(source_key)
    return dc.PatchAttrs(**attr_dict)
9 changes: 5 additions & 4 deletions dascore/io/h5simple/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

import dascore as dc
from dascore.constants import SpoolType
from dascore.io import FiberIO, HDF5Reader
from dascore.io import FiberIO
from dascore.utils.hdf5 import PyTablesReader

from .utils import _get_attrs_coords_and_data, _is_h5simple, _maybe_trim_data

Expand All @@ -15,13 +16,13 @@ class H5Simple(FiberIO):
preferred_extensions = ("hdf5", "h5")
version = "1"

def get_format(self, resource: HDF5Reader) -> tuple[str, str] | bool:
def get_format(self, resource: PyTablesReader) -> tuple[str, str] | bool:
"""Determine if is simple h5 format."""
if _is_h5simple(resource):
return self.name, self.version
return False

def read(self, resource: HDF5Reader, snap=True, **kwargs) -> SpoolType:
def read(self, resource: PyTablesReader, snap=True, **kwargs) -> SpoolType:
"""
Read a simple h5 file.

Expand All @@ -39,7 +40,7 @@ def read(self, resource: HDF5Reader, snap=True, **kwargs) -> SpoolType:
patch = dc.Patch(coords=new_cm, data=new_data[:], attrs=attrs)
return dc.spool([patch])

def scan(self, resource: HDF5Reader, snap=True) -> list[dc.PatchAttrs]:
def scan(self, resource: PyTablesReader, snap=True) -> list[dc.PatchAttrs]:
"""Get the attributes of a h5simple file."""
attrs, cm, data = _get_attrs_coords_and_data(resource, snap, self)
attrs["coords"] = cm.to_summary_dict()
Expand Down
9 changes: 5 additions & 4 deletions dascore/io/prodml/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@

import dascore as dc
from dascore.constants import opt_timeable_types
from dascore.io import FiberIO, HDF5Reader
from dascore.io import FiberIO
from dascore.utils.models import UnitQuantity, UTF8Str

from ...utils.hdf5 import PyTablesReader
from .utils import _get_prodml_attrs, _get_prodml_version_str, _read_prodml


Expand All @@ -28,7 +29,7 @@ class ProdMLV2_0(FiberIO):
preferred_extensions = ("hdf5", "h5")
version = "2.0"

def get_format(self, resource: HDF5Reader) -> tuple[str, str] | bool:
def get_format(self, resource: PyTablesReader) -> tuple[str, str] | bool:
"""
Return True if file contains terra15 version 2 data else False.

Expand All @@ -41,7 +42,7 @@ def get_format(self, resource: HDF5Reader) -> tuple[str, str] | bool:
if version_str:
return (self.name, version_str)

def scan(self, resource: HDF5Reader) -> list[dc.PatchAttrs]:
def scan(self, resource: PyTablesReader) -> list[dc.PatchAttrs]:
"""Scan a prodml file, return summary information about the file's contents."""
file_version = _get_prodml_version_str(resource)
extras = {
Expand All @@ -54,7 +55,7 @@ def scan(self, resource: HDF5Reader) -> list[dc.PatchAttrs]:

def read(
self,
resource: HDF5Reader,
resource: PyTablesReader,
time: tuple[opt_timeable_types, opt_timeable_types] | None = None,
distance: tuple[float | None, float | None] | None = None,
**kwargs,
Expand Down
Loading
Loading