Sentek format (#320)

* adding Sentek test data * Add sentek data format * new contributor added * contributor added * restore deleted doc file and correct docstring * improve auto-format detection --------- Co-authored-by: Derrick Chambers <[email protected]>
DASDAE · Dec 21, 2023 · 6d38342 · 6d38342
1 parent 910f0eb
commit 6d38342
Show file tree

Hide file tree

Showing 8 changed files with 184 additions and 1 deletion.
diff --git a/dascore/data_registry.txt b/dascore/data_registry.txt
@@ -16,3 +16,4 @@ conoco_segy_1.sgy 3944297d7c27dd265b40d56d4797f1a14caa5c2bed9f0af020b0f6ea193d4d
 brady_hs_DAS_DTS_coords.csv b2e766136aac6516ddbb757d7dc26a8df0d5de48af03c8be769cc50b6816648f https://github.com/dasdae/test_data/raw/master/meta/brady_hs_DAS_DTS_coords.csv
 dispersion_event.h5 598c8baa2a5610c930e1c003f2ba02da13f8d8686e3ccf2a034e94bfc5e1990c https://github.com/dasdae/test_data/raw/master/das/dispersion_event.h5
 PoroTomo_iDAS_1.h5 967a2885e79937ac0426b2022a9c03d5f24790ecf3abbaa9a16eb28055566fc6 https://github.com/dasdae/test_data/raw/master/das/PoroTomo_iDAS_1.h5
+DASDMSShot00_20230328155653619.das 12ac53f78b32d8b0e32cc674c43ff5b4c79a6c8b19de2ad577fd481679b2b7b3 https://github.com/dasdae/test_data/raw/master/das/DASDMSShot00_20230328155653619.das
diff --git a/dascore/io/sentek/__init__.py b/dascore/io/sentek/__init__.py
@@ -0,0 +1,11 @@
+"""
+Module for reading DAS data recorded by Sentek interrogator
+
+Examples
+--------
+
+import dascore as dc
+
+data_sentek = dc.spool('path_to_file.das')
+"""
+from .core import SentekV5
diff --git a/dascore/io/sentek/core.py b/dascore/io/sentek/core.py
@@ -0,0 +1,50 @@
+"""IO module for reading Sentek's DAS data format."""
+from __future__ import annotations
+
+import numpy as np
+
+import dascore as dc
+from dascore.io import BinaryReader
+from dascore.io.core import FiberIO
+
+from .utils import _get_patch_attrs, _get_version
+
+
+class SentekV5(FiberIO):
+    """Support for Sentek Instrument data format."""
+
+    name = "sentek"
+    version = "5"
+    preferred_extensions = ("das",)
+
+    def read(
+        self,
+        resource: BinaryReader,
+        time=None,
+        distance=None,
+        **kwargs,
+    ) -> dc.BaseSpool:
+        """Read a Sentek das file, return a DataArray."""
+        attrs, coords, offsets = _get_patch_attrs(resource)
+        resource.seek(offsets[0])
+        array = np.fromfile(resource, dtype=np.float32, count=offsets[1] * offsets[2])
+        array = np.reshape(array, (offsets[1], offsets[2])).T
+        patch = dc.Patch(data=array, attrs=attrs, coords=coords, dims=coords.dims)
+        # Note: we are being a bit sloppy here in that selecting on time/distance
+        # doesn't actually affect how much data is read from the binary file. This
+        # is probably ok though since Sentek files tend to be quite small.
+        return dc.spool(patch).select(time=time, distance=distance)
+
+    def get_format(self, resource: BinaryReader) -> tuple[str, str] | bool:
+        """Auto detect sentek format."""
+        return _get_version(resource)
+
+    def scan(self, resource: BinaryReader):
+        """Extract metadata from sentek file."""
+        extras = {
+            "file_format": self.name,
+            "file_version": self.version,
+            "path": resource.name,
+        }
+
+        return [_get_patch_attrs(resource, extras=extras)[0]]
diff --git a/dascore/io/sentek/utils.py b/dascore/io/sentek/utils.py
@@ -0,0 +1,97 @@
+"""Utilities for Sentek data format."""
+
+from __future__ import annotations
+
+import numpy as np
+
+import dascore as dc
+from dascore.core import get_coord, get_coord_manager
+
+
+def _get_version(fid):
+    """Determine if Sentek file."""
+    name = fid.name
+    # Sentek files cannot change the extension, or file name.
+    sw_data = name.endswith(".das")
+    fid.seek(0)
+    # There isn't anything in the header particularly useful for determining
+    # if it is a Sentek file, so we do what we can here.
+    # First check if sensor_num and measurement_count are positive and nearly
+    # ints.
+    sensor_num = np.fromfile(fid, dtype=np.float32, count=1)[0]
+    measurement_count = np.fromfile(fid, dtype=np.float32, count=1)[0]
+    _ = np.fromfile(fid, dtype=np.float32, count=1)[0]  # sampling_interval
+    is_positive = (sensor_num > 1) and (measurement_count > 1)
+    sens_nearly_int = np.round(sensor_num, 5) == np.round(sensor_num)
+    meas_nearly_int = np.round(measurement_count, 5) == np.round(measurement_count)
+    nearly_ints = sens_nearly_int and meas_nearly_int
+    # Then check if strain_rate value is valid.
+    strain_rate = int(np.fromfile(fid, dtype=np.float32, count=1)[0])
+    proper_strain_rate = strain_rate in {0, 1}
+    # Note: We will need to modify this later for different versions of the
+    # sentek data, but for now we only support 5.
+    if sw_data and is_positive and proper_strain_rate and nearly_ints:
+        return ("sentek", "5")
+    return False
+
+
+def _get_time_from_file_name(name) -> np.datetime64:
+    """Extract time contained in the file name.
+
+    example file name: DASDMSShot00_20230328155652124.das
+    """
+    time_str = name.split("_")[1].split(".")[0]
+    year = time_str[:4]
+    month = time_str[4:6]
+    day = time_str[6:8]
+    hour = time_str[8:10]
+    minute = time_str[10:12]
+    second = float(time_str[12:]) / 1_000
+    iso = f"{year}-{month}-{day}T{hour}:{minute}:{second:02f}"
+    return np.datetime64(iso)
+
+
+def _get_patch_attrs(fid, extras=None):
+    """Extracts patch metadata.
+
+    A few important fields in the header and their meaning:
+
+    sensor_num: number of channels in the sensing fiber
+    measurement_count: number of measurements in ONE single file
+    sampling_interval: sampling interval in nanosecond (delta t)
+    strain_rate: flag that is set when the loaded data represents strain rate
+    trigger_position: index position where the trigger occurs
+    decimation_factor: decimation factor (integer)
+    """
+    fid.seek(0)
+    sensor_num = np.fromfile(fid, dtype=np.float32, count=1)[0]
+    measurement_count = np.fromfile(fid, dtype=np.float32, count=1)[0]
+    _ = np.fromfile(fid, dtype=np.float32, count=1)[0]  # sampling_interval
+    strain_rate = np.fromfile(fid, dtype=np.float32, count=1)[0]
+    _ = np.fromfile(fid, dtype=np.float32, count=1)[0]  # trigger_position
+    _ = np.fromfile(fid, dtype=np.float32, count=1)[0]  # decimation_factor
+    # create distance coordinate
+    distance_start = np.fromfile(fid, dtype=np.float32, count=1)[0]
+    fid.seek(int(sensor_num - 1) * 4)
+    distance_stop = np.fromfile(fid, dtype=np.float32, count=1)[0]
+    distance_step = (distance_stop - distance_start) / sensor_num
+    dist = get_coord(start=distance_start, stop=distance_stop, step=distance_step)
+    # create time coord
+    file_time = _get_time_from_file_name(fid.name)
+    offset_start = np.fromfile(fid, dtype=np.float32, count=1)[0]
+    fid.seek(int(measurement_count - 1) * 4)
+    offset_stop = np.fromfile(fid, dtype=np.float32, count=1)[0]
+    time_start = file_time + dc.to_timedelta64(offset_start)
+    time_stop = file_time + dc.to_timedelta64(offset_stop)
+    time_step = (time_stop - time_start) / measurement_count
+    time = get_coord(start=time_start, stop=time_stop, step=time_step)
+
+    data_type = "strain_rate" if strain_rate else "strain"
+    coord_manager = get_coord_manager(
+        {"time": time, "distance": dist}, dims=("distance", "time")
+    )
+    attrs = dc.PatchAttrs(
+        coords=coord_manager, data_type=data_type, **({} if extras is None else extras)
+    )
+    offsets = fid.tell(), int(measurement_count), int(sensor_num)
+    return attrs, coord_manager, offsets
diff --git a/docs/contributors.md b/docs/contributors.md
@@ -25,6 +25,8 @@ A Huge thanks to all of our contributors:
 
 [Thomas Cullison](https:/github.com/code-cullison)
 
+[Shihao Yuan](https://github.com/Shihao-Yuan)
+
 [Tomas Snyder](https:/github.com/quasiStellar45)
 
 You can find more contributor information

diff --git a/pyproject.toml b/pyproject.toml
@@ -102,6 +102,7 @@ H5SIMPLE__V1_0 = "dascore.io.h5simple.core:H5Simple"
 PICKLE = "dascore.io.pickle.core:PickleIO"
 PRODML__V2_0 = "dascore.io.prodml.core:ProdMLV2_0"
 PRODML__V2_1 = "dascore.io.prodml.core:ProdMLV2_1"
+SENTEK__V5 = "dascore.io.sentek.core:SentekV5"
 TDMS__V4713 = "dascore.io.tdms.core:TDMSFormatterV4713"
 TERRA15__V4 = "dascore.io.terra15.core:Terra15FormatterV4"
 TERRA15__V5 = "dascore.io.terra15.core:Terra15FormatterV5"
@@ -152,7 +153,7 @@ target-version = "py310"
 fixable = ["ALL"]
 
 # List of codes to ignore
-ignore = ["D105", "D107", "D401", "D205"]
+ignore = ["D105", "D107", "D401", "D205", "D200"]
 
 [tool.ruff.mccabe]
 # Unlike Flake8, default to a complexity level of 10.

diff --git a/tests/test_io/test_common_io.py b/tests/test_io/test_common_io.py
@@ -27,6 +27,7 @@
 from dascore.io.pickle import PickleIO
 from dascore.io.prodml import ProdMLV2_0, ProdMLV2_1
 from dascore.io.segy import SegyV2
+from dascore.io.sentek import SentekV5
 from dascore.io.tdms import TDMSFormatterV4713
 from dascore.io.terra15 import (
     Terra15FormatterV4,
@@ -62,6 +63,7 @@
     Terra15FormatterV6(): ("terra15_v6_test_file.hdf5",),
     SegyV2(): ("conoco_segy_1.sgy",),
     DASHDF5(): ("PoroTomo_iDAS_1.h5",),
+    SentekV5(): ("DASDMSShot00_20230328155653619.das",),
 }
 
 # This tuple is for fiber io which support a write method and can write

diff --git a/tests/test_io/test_sentek/test_sentek.py b/tests/test_io/test_sentek/test_sentek.py
@@ -0,0 +1,19 @@
+"""
+Tests specific to the Sentek format.
+"""
+import numpy as np
+
+from dascore.io.sentek import SentekV5
+
+
+class TestSentekV5:
+    """Tests for Sentek format that aren;t covered by common tests."""
+
+    def test_das_extension_not_sentek(self, tmp_path_factory):
+        """Ensure a non-sentek file with a das extension isn't id as sentek."""
+        path = tmp_path_factory.mktemp("sentek_test") / "not_sentek.das"
+        ar = np.random.random(10)
+        with path.open("wb") as fi:
+            np.save(fi, ar)
+        sentek = SentekV5()
+        assert not sentek.get_format(path)