Skip to content

Commit

Permalink
Refactor sliderule io (#26)
Browse files Browse the repository at this point in the history
* refactor sliderule io

* better maxar authentication handling

* add docs

* fix maxar test logic

* add docs
  • Loading branch information
scottyhq authored Nov 22, 2024
1 parent 478116f commit 1034493
Show file tree
Hide file tree
Showing 14 changed files with 4,733 additions and 1,596 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ repos:
rev: "v4.6.0"
hooks:
- id: check-added-large-files
args: ["--maxkb=1000"]
- id: check-case-conflict
- id: check-merge-conflict
- id: check-symlinks
Expand Down
13 changes: 13 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,16 @@ Datasets
general.Dataset
maxar.open_browse
maxar.plot_browse


IO
--

.. currentmodule:: coincident.io

.. autosummary::
:toctree: generated/

sliderule.subset_gedi02a
sliderule.subset_atl06
sliderule.sample_3dep
1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,4 @@
always_document_param_types = True
# autodoc_typehints = "none"
nb_execution_mode = "auto" # off, on
nb_execution_excludepatterns = ["sliderule.ipynb"]
1 change: 1 addition & 0 deletions docs/examples/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ This section contains Jupyter Notebooks with narrative workflows using the
quickstart
cascading_search
sliderule
```
483 changes: 483 additions & 0 deletions docs/examples/sliderule.ipynb

Large diffs are not rendered by default.

5,590 changes: 4,049 additions & 1,541 deletions pixi.lock

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ docs = [
"myst-nb",
"myst_parser>=0.13",
"pydata-sphinx-theme>=0.16.0,<0.17",
#"rpds-py>=0.21.0,<0.22",
"sphinx>=7.0",
"sphinx_autodoc_typehints",
"sphinx_copybutton",
Expand Down Expand Up @@ -120,7 +121,7 @@ report.exclude_also = [
]

[tool.codespell]
skip="pixi.lock"
skip="pixi.lock,docs/examples/sliderule.ipynb"

[tool.mypy]
files = ["src", "tests"]
Expand Down Expand Up @@ -220,6 +221,7 @@ rioxarray = "*"
#s3fs = "*"
stac-geoparquet = "*"
pyarrow = "*"
jsonschema = ">=4.23.0,<5"
#nbconvert = ">=7.16.4,<8"
#cloudpathlib-s3 = ">=0.20.0,<0.21"
#matplotlib-base = ">=3.9.2,<4"
Expand All @@ -236,6 +238,7 @@ mypy = "*"
# Testing additional dependencies
#rich = ">=13.8.1,<14" # Optional. convenient for rich.print(dataset)
#xvec = ">=0.3.0,<0.4"
sliderule = ">=4.7.1,<5"

[tool.pixi.pypi-dependencies]
coincident = { path = ".", editable = false }
Expand Down
1 change: 0 additions & 1 deletion src/coincident/datasets/maxar.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ async def download_item(
return item # noqa: RET504


@depends_on_optional("matplotlib")
def open_browse(item: pystac.Item, overview_level: int = 0) -> xr.DataArray:
"""
Open a browse image from a STAC item using the specified overview level.
Expand Down
4 changes: 2 additions & 2 deletions src/coincident/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@

from __future__ import annotations

from coincident.io.sliderule import load_gedi, load_icesat2
from coincident.io import sliderule

__all__ = ["load_icesat2", "load_gedi"]
__all__ = ["sliderule"]
188 changes: 138 additions & 50 deletions src/coincident/io/sliderule.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,158 @@
"""
Read ICESat-2 & GEDI using Sliderule
Subset and Sample using Sliderule
https://slideruleearth.io
"""

from __future__ import annotations

import warnings
from pathlib import Path
from typing import Any

import geopandas as gpd
import pandas as pd

try:
from sliderule import gedi, icesat2, toregion
from sliderule import earthdata, gedi, icesat2, raster, toregion
except ImportError:
warnings.warn(
"'sliderule' package not found. Install for GEDI & ICESat2 functionality: https://slideruleearth.io/web/rtd/getting_started/Install.html",
stacklevel=2,
)
from coincident._utils import depends_on_optional

# TODO: make these configurable?
default_params = {
"ats": 20.0,
"cnt": 5,
"len": 40.0,
"res": 20.0,
"maxi": 6,
}


@depends_on_optional("sliderule.gedi")
def load_gedi(
aoi: gpd.GeoDataFrame,
start_datetime: pd.Timestamp,
end_datetime: pd.Timestamp,
quality_flags: bool = True,
**kwargs: Any,

def _gdf_to_sliderule_polygon(gf: gpd.GeoDataFrame) -> list[dict[str, float]]:
    """Convert a GeoDataFrame to the sliderule 'poly' region format.

    All geometries are dissolved into a single (multi)polygon first,
    since sliderule expects one region definition per request.
    """
    # toregion ships without type stubs, so mypy needs the ignore here
    merged = gf[["geometry"]].dissolve()
    region = toregion(merged)
    return region["poly"]  # type: ignore[no-any-return]


def _gdf_to_sliderule_params(gf: gpd.GeoDataFrame) -> dict[str, Any]:
    """Build baseline sliderule request parameters from a search GeoDataFrame.

    Returns a dict with the dissolved AOI polygon ('poly') and the overall
    time window ('t0'/'t1') spanned by the rows of ``gf``.
    """
    # Timestamps are made naive: sliderule expects ISO strings without offsets
    t0 = gf.start_datetime.min().tz_localize(None)
    t1 = gf.end_datetime.max().tz_localize(None)
    return {
        "poly": _gdf_to_sliderule_polygon(gf),
        "t0": t0.isoformat(),
        "t1": t1.isoformat(),
    }


def _granule_from_assets(assets: gpd.GeoDataFrame) -> str:
# NOTE: change to while loop in case tons of assets?
for _k, v in assets.items():
if v.get("roles") == "data":
granule = Path(v.get("href")).name

return granule


@depends_on_optional("sliderule")
def subset_gedi02a(
gf: gpd.GeoDataFrame,
aoi: gpd.GeoDataFrame = None,
# NOTE: investigate B006 best-practice
sliderule_params: dict[str, Any] = {}, # noqa: B006
) -> gpd.GeoDataFrame:
# NOTE: allow passing private cluster?
gedi.init("slideruleearth.io")
params = default_params.copy()
if quality_flags:
params["degrade_flag"] = 0
params["l2_quality_flag"] = 1
params["poly"] = toregion(aoi)["poly"]
params["t0"] = start_datetime.tz_localize(None).isoformat()
params["t1"] = end_datetime.tz_localize(None).isoformat()
params.update(kwargs)
return gedi.gedi02ap(params)


@depends_on_optional("sliderule.icesat2")
def load_icesat2(
aoi: gpd.GeoDataFrame,
start_datetime: pd.Timestamp,
end_datetime: pd.Timestamp,
**kwargs: Any,
"""
Subsets GEDI L2A data based on a given GeoDataFrame and optional area of interest (AOI).
Parameters
----------
gf : gpd.GeoDataFrame
A GeoDataFrame of results from coincident.search.search(dataset='gedi')
aoi : gpd.GeoDataFrame, optional
A GeoDataFrame with a POLYGON to subset. If not provided the union of geometries in gf is used.
sliderule_params : dict[Any], optional
A dictionary of additional parameters to be passed to the sliderule `gedi.gedi02ap`.
Returns
-------
gpd.GeoDataFrame
A GeoDataFrame containing the subsetted GEDI L2A data.
Notes
-----
The function sets `degrade_filter=True` and `l2_quality_filter=True` by default.
"""
params = _gdf_to_sliderule_params(gf)

granule_names = gf.assets.apply(_granule_from_assets).to_list()

if aoi is not None:
params["poly"] = _gdf_to_sliderule_polygon(aoi)

params.update(
{
"degrade_filter": True,
"l2_quality_filter": True,
}
)

# User-provided parameters take precedence
params.update(sliderule_params)

return gedi.gedi02ap(params, resources=granule_names)


@depends_on_optional("sliderule")
def subset_atl06(
gf: gpd.GeoDataFrame,
aoi: gpd.GeoDataFrame = None,
dropna: bool = True,
sliderule_params: dict[str, Any] = {}, # noqa: B006
) -> gpd.GeoDataFrame:
# NOTE: allow passing private cluster?
# NOTE: deal with kwargs
icesat2.init("slideruleearth.io")
params = default_params.copy()
params["srt"] = icesat2.SRT_LAND
params["cnf"] = icesat2.CNF_SURFACE_HIGH
params["poly"] = toregion(aoi)["poly"]
params["t0"] = start_datetime.tz_localize(None).isoformat()
params["t1"] = end_datetime.tz_localize(None).isoformat()
params.update(kwargs)
return icesat2.atl06p(params)
params = _gdf_to_sliderule_params(gf)

    # Not necessary but avoids another CMR search
granule_names = gf.assets.apply(_granule_from_assets).to_list()

if aoi is not None:
params["poly"] = _gdf_to_sliderule_polygon(aoi)

# User-provided parameters take precedence
params.update(sliderule_params)

data = icesat2.atl06sp(params, resources=granule_names)

# NOTE: add server-side filtering for NaNs in sliderule.atl06sp?
if dropna:
return data.dropna(subset=["h_li"])
return data


@depends_on_optional("sliderule")
def sample_worldcover(gf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Sample the ESA WorldCover 10 m land-cover raster at POINT geometries."""
    # sliderule expects [[lon, lat], ...] coordinate pairs
    coords = [list(xy) for xy in zip(gf.geometry.x, gf.geometry.y, strict=False)]
    return raster.sample("esa-worldcover-10meter", coords)


@depends_on_optional("sliderule")
def sample_3dep(gf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Sample 3DEP 1-meter DEM with POINT geometries in GeoDataFrame.

    Points should be (longitude, latitude) not UTM

    Parameters
    ----------
    gf : gpd.GeoDataFrame
        A GeoDataFrame containing POINT geometries

    Returns
    -------
    gpd.GeoDataFrame
        A GeoDataFrame with sampled elevation data from the 3DEP 1-meter DEM.
    """
    # sliderule expects [[lon, lat], ...] coordinate pairs
    coords = [list(xy) for xy in zip(gf.geometry.x, gf.geometry.y, strict=False)]

    # Restrict the raster catalog to 1m DEM tiles intersecting the points' footprint
    region = _gdf_to_sliderule_polygon(gf)
    catalog = earthdata.tnm(
        short_name="Digital Elevation Model (DEM) 1 meter", polygon=region
    )

    # NOTE: make sliderule aware of 3dep polygons to not sample NaN regions for 1m grid?
    sampled = raster.sample("usgs3dep-1meter-dem", coords, {"catalog": catalog})

    # Drop points that fell outside available 1m DEM coverage
    return sampled[sampled.value.notna()]
5 changes: 5 additions & 0 deletions src/coincident/search/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,11 @@ def search(
**kwargs,
)

# Keep track of dataset alias in geodataframe metadata
    # NOTE: attrs not always retained and not saved to file
# https://github.com/geopandas/geopandas/issues/3320
gf.attrs["dataset"] = dataset.alias

return gf


Expand Down
18 changes: 18 additions & 0 deletions src/coincident/search/stac.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,21 @@
except maxar_platform.session.NoSessionCredentials:
msg_noauth = "Unable to authenticate with Maxar API. Please set MAXAR_API_KEY environment variable."
warnings.warn(msg_noauth, stacklevel=2)
except maxar_platform.exceptions.UnAuthorizedException:
msg_noauth = (
"Unable to authenticate with Maxar API. Please check MAXAR_API_KEY is valid."
)
warnings.warn(msg_noauth, stacklevel=2)


def _filter_assets(assets: gpd.GeoDataFrame) -> dict[str, str]:
"""Remove key:None pairs from assets"""
keep_keys = []
for k, v in assets.items():
if v is not None:
keep_keys.append(k)

return {key: assets[key] for key in keep_keys}


def to_geopandas(
Expand Down Expand Up @@ -59,6 +74,9 @@ def to_geopandas(
record_batch_reader = stac_geoparquet.arrow.parse_stac_items_to_arrow(collection)
gf = gpd.GeoDataFrame.from_arrow(record_batch_reader) # doesn't keep arrow dtypes

# Workaround stac-geoparquet limitation https://github.com/stac-utils/stac-geoparquet/issues/82
gf["assets"] = gf["assets"].apply(_filter_assets)

# Additional columns for convenience
# NOTE: these become entries under STAC properties
gf["dayofyear"] = gf["datetime"].dt.dayofyear
Expand Down
2 changes: 2 additions & 0 deletions src/coincident/search/wesm.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ def stacify_column_names(gf: GeoDataFrame) -> GeoDataFrame:
"collect_end": "end_datetime",
}
gf = gf.rename(columns=name_map)
# Add collection name to match STAC (used to identify geodataframe contents in other functions)
gf["collection"] = "3DEP"
gf["start_datetime"] = pd.to_datetime(gf["start_datetime"])
gf["end_datetime"] = pd.to_datetime(gf["end_datetime"])
duration = gf.end_datetime - gf.start_datetime
Expand Down
Loading

0 comments on commit 1034493

Please sign in to comment.