allow differnet time dimension sizes

btschwertfeger · Mar 8, 2024 · a496ae5 · a496ae5
1 parent 1d7e0b8
commit a496ae5
Show file tree

Hide file tree

Showing 5 changed files with 204 additions and 11 deletions.
diff --git a/cmethods/core.py b/cmethods/core.py
@@ -50,26 +50,45 @@ def apply_ufunc(
     if method not in __METHODS_FUNC__:
         raise UnknownMethodError(method, __METHODS_FUNC__.keys())
 
+    if kwargs.get("input_core_dims"):
+        if not isinstance(kwargs["input_core_dims"], dict):
+            raise TypeError("input_core_dims must be an object of type 'dict'")
+        if not len(kwargs["input_core_dims"]) == 3 or any(
+            not isinstance(value, str) for value in kwargs["input_core_dims"].values()
+        ):
+            raise ValueError(
+                "input_core_dims must have three key-value pairs like: "
+                '{"obs": "time", "simh": "time", "simp": "time"}',
+            )
+
+        input_core_dims = kwargs["input_core_dims"]
+    else:
+        input_core_dims = {"obs": "time", "simh": "time", "simp": "time"}
+
     result: XRData = xr.apply_ufunc(
         __METHODS_FUNC__[method],
         obs,
         simh,
         # Need to spoof a fake time axis since 'time' coord on full dataset is different
         # than 'time' coord on training dataset.
-        simp.rename({"time": "t2"}),
+        simp.rename({input_core_dims["simp"]: "__t_simp__"}),
         dask="parallelized",
         vectorize=True,
         # This will vectorize over the time dimension, so will submit each grid cell
         # independently
-        input_core_dims=[["time"], ["time"], ["t2"]],
+        input_core_dims=[
+            [input_core_dims["obs"]],
+            [input_core_dims["simh"]],
+            ["__t_simp__"],
+        ],
         # Need to denote that the final output dataset will be labeled with the
         # spoofed time coordinate
-        output_core_dims=[["t2"]],
+        output_core_dims=[["__t_simp__"]],
         kwargs=dict(kwargs),
     )
 
     # Rename to proper coordinate name.
-    result = result.rename({"t2": "time"})
+    result = result.rename({"__t_simp__": input_core_dims["simp"]})
 
     # ufunc will put the core dimension to the end (time), so want to preserve original
     # order where time is commonly first.
@@ -90,6 +109,14 @@ def adjust(
 
     See https://python-cmethods.readthedocs.io/en/latest/src/methods.html
 
+
+    The time dimension of ``obs``, ``simh`` and ``simp`` must be named ``time``.
+
+    If the sizes of time dimensions of the input data sets differ, you have to
+    pass the hidden ``input_core_dims`` parameter, see
+    https://python-cmethods.readthedocs.io/en/latest/src/getting_started.html#advanced-usage
+    for more information.
+
     :param method: Technique to apply
     :type method: str
     :param obs: The reference/observational data set
@@ -127,14 +154,28 @@ def adjust(
         )
 
     # Grouped correction | scaling-based technique
-    group: str = kwargs["group"]
-    del kwargs["group"]
+    group: str | dict[str, str] = kwargs["group"]
+    if isinstance(group, str):
+        # only for same sized time dimensions
+        obs_group = group
+        simh_group = group
+        simp_group = group
+    elif isinstance(group, dict):
+        if any(key not in {"obs", "simh", "simp"} for key in group):
+            raise ValueError(
+                "group must either be a string like 'time' or a dict like "
+                '{"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"}',
+            )
+        # for different sized time dimensions
+        obs_group = group["obs"]
+        simh_group = group["simh"]
+        simp_group = group["simp"]
 
     result: Optional[XRData] = None
     for (_, obs_gds), (_, simh_gds), (_, simp_gds) in zip(
-        obs.groupby(group),
-        simh.groupby(group),
-        simp.groupby(group),
+        obs.groupby(obs_group),
+        simh.groupby(simh_group),
+        simp.groupby(simp_group),
     ):
         monthly_result = apply_ufunc(
             method,

diff --git a/cmethods/distribution.py b/cmethods/distribution.py
@@ -62,6 +62,11 @@ def quantile_mapping(
 
     cdf_obs = get_cdf(obs, xbins)
     cdf_simh = get_cdf(simh, xbins)
+    cdf_simh = np.interp(
+        cdf_simh,
+        (cdf_simh.min(), cdf_simh.max()),
+        (cdf_obs.min(), cdf_obs.max()),
+    )
 
     if kind in ADDITIVE:
         epsilon = np.interp(simp, xbins, cdf_simh)  # Eq. 1
@@ -124,6 +129,11 @@ def detrended_quantile_mapping(
 
     cdf_obs = get_cdf(obs, xbins)
     cdf_simh = get_cdf(simh, xbins)
+    cdf_simh = np.interp(
+        cdf_simh,
+        (cdf_simh.min(), cdf_simh.max()),
+        (cdf_obs.min(), cdf_obs.max()),
+    )
 
     # detrended => shift mean of $X_{sim,p}$ to range of $X_{sim,h}$ to adjust extremes
     res = np.zeros(len(simp.values))

diff --git a/doc/src/getting_started.rst b/doc/src/getting_started.rst
@@ -56,3 +56,60 @@ method specific documentation.
         n_quaniles=1000,
         kind="+",
     )
+
+
+Advanced Usage
+--------------
+
+In some cases the time dimension of input data sets have different sizes. In
+such case, the hidden parameter ``input_core_dims`` must be passed to the
+``adjust`` call.
+
+It defines the dimension names of the input data sets, i.e. if the time
+dimensions of ``obs`` and ``simp`` have the length, but the time dimension of
+``simh`` is somewhat smaller, you have to define this as follows:
+
+
+.. code-block:: python
+    :linenos:
+    :caption: Bias Adjustments for data sets with different time dimension lengths pt. 1
+
+    from cmethods import adjust
+    import xarray as xr
+
+    obs = xr.open_dataset("examples/input_data/observations.nc")["tas"]
+    simp = xr.open_dataset("examples/input_data/control.nc")["tas"]
+    simh = simp.copy(deep=True)[3650:]
+
+    bc = adjust(
+        method="quantile_mapping",
+        obs=obs,
+        simh=simh.rename({"time": "t_simh"}),
+        simp=simh,
+        kind="+",
+        input_core_dims={"obs": "time", "simh": "t_simh", "simp": "time"}
+    )
+
+In case you are applying a scaling based technique using grouping, you have to
+adjust the group names accordingly to the time dimension names.
+
+.. code-block:: python
+    :linenos:
+    :caption: Bias Adjustments for data sets with different time dimension lengths pt. 2
+
+    from cmethods import adjust
+    import xarray as xr
+
+    obs = xr.open_dataset("examples/input_data/observations.nc")["tas"]
+    simp = xr.open_dataset("examples/input_data/control.nc")["tas"]
+    simh = simp.copy(deep=True)[3650:]
+
+    bc = adjust(
+        method="linear_scaling",
+        obs=obs,
+        simh=simh.rename({"time": "t_simh"}),
+        simp=simh,
+        kind="+",
+        group={"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"},
+        input_core_dims={"obs": "time", "simh": "t_simh", "simp": "time"}
+    )
diff --git a/tests/test_methods.py b/tests/test_methods.py
@@ -107,6 +107,55 @@ def test_3d_scaling(
     assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)
 
 
+@pytest.mark.parametrize(
+    ("method", "kind"),
+    [
+        ("linear_scaling", "+"),
+        ("variance_scaling", "+"),
+        ("linear_scaling", "*"),
+    ],
+)
+def test_3d_scaling_different_time_span(
+    datasets: dict,
+    method: str,
+    kind: str,
+) -> None:
+    obsh: XRData_t = datasets[kind]["obsh"]
+    obsp: XRData_t = datasets[kind]["obsp"]
+    simh: XRData_t = datasets[kind]["simh"]
+    simp: XRData_t = datasets[kind]["simp"]
+    simh = simh.sel(time=slice(simh.time[1], None)).rename({"time": "t_simh"})
+
+    time_names = {"obs": "time", "simh": "t_simh", "simp": "time"}
+
+    # not grouped
+    result: XRData_t = adjust(
+        method=method,
+        obs=obsh,
+        simh=simh,
+        simp=simp,
+        kind=kind,
+        input_core_dims=time_names,
+    )
+
+    assert isinstance(result, XRData_t)
+    assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)
+
+    # grouped
+    result: XRData_t = adjust(
+        method=method,
+        obs=obsh,
+        simh=simh,
+        simp=simp,
+        kind=kind,
+        group={"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"},
+        input_core_dims=time_names,
+    )
+
+    assert isinstance(result, XRData_t)
+    assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)
+
+
 @pytest.mark.parametrize(
     ("method", "kind"),
     [
@@ -171,6 +220,42 @@ def test_3d_distribution(
     assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)
 
 
+@pytest.mark.parametrize(
+    ("method", "kind"),
+    [
+        ("quantile_mapping", "+"),
+        ("quantile_delta_mapping", "+"),
+        ("quantile_mapping", "*"),
+        ("quantile_delta_mapping", "*"),
+    ],
+)
+def test_3d_distribution_different_time_span(
+    datasets: dict,
+    method: str,
+    kind: str,
+) -> None:
+    obsh: XRData_t = datasets[kind]["obsh"]
+    obsp: XRData_t = datasets[kind]["obsp"]
+    simh: XRData_t = datasets[kind]["simh"]
+    simp: XRData_t = datasets[kind]["simp"]
+
+    simh = simh.sel(time=slice(simh.time[1], None)).rename({"time": "t_simh"})
+    time_names = {"obs": "time", "simh": "t_simh", "simp": "time"}
+
+    result: XRData_t = adjust(
+        method=method,
+        obs=obsh,
+        simh=simh,
+        simp=simp,
+        kind=kind,
+        n_quantiles=N_QUANTILES,
+        input_core_dims=time_names,
+    )
+
+    assert isinstance(result, XRData_t)
+    assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)
+
+
 def test_1d_detrended_quantile_mapping_add(datasets: dict) -> None:
     kind: str = "+"
     obsh: XRData_t = datasets[kind]["obsh"][:, 0, 0]

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -39,10 +39,10 @@
 def test_quantile_mapping_single_nan() -> None:
     obs, simh, simp = list(np.arange(10)), list(np.arange(10)), list(np.arange(10))
     obs[0] = np.nan
-    expected = np.array([0.0, 1.9, 2.9, 3.9, 4.9, 5.9, 6.9, 7.9, 8.9, 9.0])
+    expected = np.array([0.0, 1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1, 9.0])
 
     res = quantile_mapping(obs=obs, simh=simh, simp=simp, n_quantiles=5)
-    assert np.allclose(res, expected)
+    assert np.allclose(res, expected), res
 
 
 @pytest.mark.filterwarnings("ignore:All-NaN slice encountered")