Skip to content

Commit

Permalink
allow differnet time dimension sizes
Browse files Browse the repository at this point in the history
  • Loading branch information
btschwertfeger committed Mar 8, 2024
1 parent 1d7e0b8 commit a496ae5
Show file tree
Hide file tree
Showing 5 changed files with 204 additions and 11 deletions.
59 changes: 50 additions & 9 deletions cmethods/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,26 +50,45 @@ def apply_ufunc(
if method not in __METHODS_FUNC__:
raise UnknownMethodError(method, __METHODS_FUNC__.keys())

if kwargs.get("input_core_dims"):
if not isinstance(kwargs["input_core_dims"], dict):
raise TypeError("input_core_dims must be an object of type 'dict'")

Check warning on line 55 in cmethods/core.py

View check run for this annotation

Codecov / codecov/patch

cmethods/core.py#L55

Added line #L55 was not covered by tests
if not len(kwargs["input_core_dims"]) == 3 or any(
not isinstance(value, str) for value in kwargs["input_core_dims"].values()
):
raise ValueError(

Check warning on line 59 in cmethods/core.py

View check run for this annotation

Codecov / codecov/patch

cmethods/core.py#L59

Added line #L59 was not covered by tests
"input_core_dims must have three key-value pairs like: "
'{"obs": "time", "simh": "time", "simp": "time"}',
)

input_core_dims = kwargs["input_core_dims"]
else:
input_core_dims = {"obs": "time", "simh": "time", "simp": "time"}

result: XRData = xr.apply_ufunc(
__METHODS_FUNC__[method],
obs,
simh,
# Need to spoof a fake time axis since 'time' coord on full dataset is different
# than 'time' coord on training dataset.
simp.rename({"time": "t2"}),
simp.rename({input_core_dims["simp"]: "__t_simp__"}),
dask="parallelized",
vectorize=True,
# This will vectorize over the time dimension, so will submit each grid cell
# independently
input_core_dims=[["time"], ["time"], ["t2"]],
input_core_dims=[
[input_core_dims["obs"]],
[input_core_dims["simh"]],
["__t_simp__"],
],
# Need to denote that the final output dataset will be labeled with the
# spoofed time coordinate
output_core_dims=[["t2"]],
output_core_dims=[["__t_simp__"]],
kwargs=dict(kwargs),
)

# Rename to proper coordinate name.
result = result.rename({"t2": "time"})
result = result.rename({"__t_simp__": input_core_dims["simp"]})

# ufunc will put the core dimension to the end (time), so want to preserve original
# order where time is commonly first.
Expand All @@ -90,6 +109,14 @@ def adjust(
See https://python-cmethods.readthedocs.io/en/latest/src/methods.html
The time dimension of ``obs``, ``simh`` and ``simp`` must be named ``time``.
If the sizes of time dimensions of the input data sets differ, you have to
pass the hidden ``input_core_dims`` parameter, see
https://python-cmethods.readthedocs.io/en/latest/src/getting_started.html#advanced-usage
for more information.
:param method: Technique to apply
:type method: str
:param obs: The reference/observational data set
Expand Down Expand Up @@ -127,14 +154,28 @@ def adjust(
)

# Grouped correction | scaling-based technique
group: str = kwargs["group"]
del kwargs["group"]
group: str | dict[str, str] = kwargs["group"]
if isinstance(group, str):
# only for same sized time dimensions
obs_group = group
simh_group = group
simp_group = group
elif isinstance(group, dict):
if any(key not in {"obs", "simh", "simp"} for key in group):
raise ValueError(

Check warning on line 165 in cmethods/core.py

View check run for this annotation

Codecov / codecov/patch

cmethods/core.py#L165

Added line #L165 was not covered by tests
"group must either be a string like 'time' or a dict like "
'{"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"}',
)
# for different sized time dimensions
obs_group = group["obs"]
simh_group = group["simh"]
simp_group = group["simp"]

result: Optional[XRData] = None
for (_, obs_gds), (_, simh_gds), (_, simp_gds) in zip(
obs.groupby(group),
simh.groupby(group),
simp.groupby(group),
obs.groupby(obs_group),
simh.groupby(simh_group),
simp.groupby(simp_group),
):
monthly_result = apply_ufunc(
method,
Expand Down
10 changes: 10 additions & 0 deletions cmethods/distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,11 @@ def quantile_mapping(

cdf_obs = get_cdf(obs, xbins)
cdf_simh = get_cdf(simh, xbins)
cdf_simh = np.interp(
cdf_simh,
(cdf_simh.min(), cdf_simh.max()),
(cdf_obs.min(), cdf_obs.max()),
)

if kind in ADDITIVE:
epsilon = np.interp(simp, xbins, cdf_simh) # Eq. 1
Expand Down Expand Up @@ -124,6 +129,11 @@ def detrended_quantile_mapping(

cdf_obs = get_cdf(obs, xbins)
cdf_simh = get_cdf(simh, xbins)
cdf_simh = np.interp(
cdf_simh,
(cdf_simh.min(), cdf_simh.max()),
(cdf_obs.min(), cdf_obs.max()),
)

# detrended => shift mean of $X_{sim,p}$ to range of $X_{sim,h}$ to adjust extremes
res = np.zeros(len(simp.values))
Expand Down
57 changes: 57 additions & 0 deletions doc/src/getting_started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,60 @@ method specific documentation.
n_quaniles=1000,
kind="+",
)
Advanced Usage
--------------

In some cases the time dimension of input data sets have different sizes. In
such case, the hidden parameter ``input_core_dims`` must be passed to the
``adjust`` call.

It defines the dimension names of the input data sets, i.e. if the time
dimensions of ``obs`` and ``simp`` have the length, but the time dimension of
``simh`` is somewhat smaller, you have to define this as follows:


.. code-block:: python
:linenos:
:caption: Bias Adjustments for data sets with different time dimension lengths pt. 1
from cmethods import adjust
import xarray as xr
obs = xr.open_dataset("examples/input_data/observations.nc")["tas"]
simp = xr.open_dataset("examples/input_data/control.nc")["tas"]
simh = simp.copy(deep=True)[3650:]
bc = adjust(
method="quantile_mapping",
obs=obs,
simh=simh.rename({"time": "t_simh"}),
simp=simh,
kind="+",
input_core_dims={"obs": "time", "simh": "t_simh", "simp": "time"}
)
In case you are applying a scaling based technique using grouping, you have to
adjust the group names accordingly to the time dimension names.
.. code-block:: python
:linenos:
:caption: Bias Adjustments for data sets with different time dimension lengths pt. 2
from cmethods import adjust
import xarray as xr
obs = xr.open_dataset("examples/input_data/observations.nc")["tas"]
simp = xr.open_dataset("examples/input_data/control.nc")["tas"]
simh = simp.copy(deep=True)[3650:]
bc = adjust(
method="linear_scaling",
obs=obs,
simh=simh.rename({"time": "t_simh"}),
simp=simh,
kind="+",
group={"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"},
input_core_dims={"obs": "time", "simh": "t_simh", "simp": "time"}
)
85 changes: 85 additions & 0 deletions tests/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,55 @@ def test_3d_scaling(
assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)


@pytest.mark.parametrize(
("method", "kind"),
[
("linear_scaling", "+"),
("variance_scaling", "+"),
("linear_scaling", "*"),
],
)
def test_3d_scaling_different_time_span(
datasets: dict,
method: str,
kind: str,
) -> None:
obsh: XRData_t = datasets[kind]["obsh"]
obsp: XRData_t = datasets[kind]["obsp"]
simh: XRData_t = datasets[kind]["simh"]
simp: XRData_t = datasets[kind]["simp"]
simh = simh.sel(time=slice(simh.time[1], None)).rename({"time": "t_simh"})

time_names = {"obs": "time", "simh": "t_simh", "simp": "time"}

# not grouped
result: XRData_t = adjust(
method=method,
obs=obsh,
simh=simh,
simp=simp,
kind=kind,
input_core_dims=time_names,
)

assert isinstance(result, XRData_t)
assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)

# grouped
result: XRData_t = adjust(
method=method,
obs=obsh,
simh=simh,
simp=simp,
kind=kind,
group={"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"},
input_core_dims=time_names,
)

assert isinstance(result, XRData_t)
assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)


@pytest.mark.parametrize(
("method", "kind"),
[
Expand Down Expand Up @@ -171,6 +220,42 @@ def test_3d_distribution(
assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)


@pytest.mark.parametrize(
("method", "kind"),
[
("quantile_mapping", "+"),
("quantile_delta_mapping", "+"),
("quantile_mapping", "*"),
("quantile_delta_mapping", "*"),
],
)
def test_3d_distribution_different_time_span(
datasets: dict,
method: str,
kind: str,
) -> None:
obsh: XRData_t = datasets[kind]["obsh"]
obsp: XRData_t = datasets[kind]["obsp"]
simh: XRData_t = datasets[kind]["simh"]
simp: XRData_t = datasets[kind]["simp"]

simh = simh.sel(time=slice(simh.time[1], None)).rename({"time": "t_simh"})
time_names = {"obs": "time", "simh": "t_simh", "simp": "time"}

result: XRData_t = adjust(
method=method,
obs=obsh,
simh=simh,
simp=simp,
kind=kind,
n_quantiles=N_QUANTILES,
input_core_dims=time_names,
)

assert isinstance(result, XRData_t)
assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)


def test_1d_detrended_quantile_mapping_add(datasets: dict) -> None:
kind: str = "+"
obsh: XRData_t = datasets[kind]["obsh"][:, 0, 0]
Expand Down
4 changes: 2 additions & 2 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@
def test_quantile_mapping_single_nan() -> None:
obs, simh, simp = list(np.arange(10)), list(np.arange(10)), list(np.arange(10))
obs[0] = np.nan
expected = np.array([0.0, 1.9, 2.9, 3.9, 4.9, 5.9, 6.9, 7.9, 8.9, 9.0])
expected = np.array([0.0, 1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1, 9.0])

res = quantile_mapping(obs=obs, simh=simh, simp=simp, n_quantiles=5)
assert np.allclose(res, expected)
assert np.allclose(res, expected), res


@pytest.mark.filterwarnings("ignore:All-NaN slice encountered")
Expand Down

0 comments on commit a496ae5

Please sign in to comment.