diff --git a/cmethods/core.py b/cmethods/core.py index 1860729..94dd706 100644 --- a/cmethods/core.py +++ b/cmethods/core.py @@ -50,26 +50,45 @@ def apply_ufunc( if method not in __METHODS_FUNC__: raise UnknownMethodError(method, __METHODS_FUNC__.keys()) + if kwargs.get("input_core_dims"): + if not isinstance(kwargs["input_core_dims"], dict): + raise TypeError("input_core_dims must be an object of type 'dict'") + if not len(kwargs["input_core_dims"]) == 3 or any( + not isinstance(value, str) for value in kwargs["input_core_dims"].values() + ): + raise ValueError( + "input_core_dims must have three key-value pairs like: " + '{"obs": "time", "simh": "time", "simp": "time"}', + ) + + input_core_dims = kwargs["input_core_dims"] + else: + input_core_dims = {"obs": "time", "simh": "time", "simp": "time"} + result: XRData = xr.apply_ufunc( __METHODS_FUNC__[method], obs, simh, # Need to spoof a fake time axis since 'time' coord on full dataset is different # than 'time' coord on training dataset. - simp.rename({"time": "t2"}), + simp.rename({input_core_dims["simp"]: "__t_simp__"}), dask="parallelized", vectorize=True, # This will vectorize over the time dimension, so will submit each grid cell # independently - input_core_dims=[["time"], ["time"], ["t2"]], + input_core_dims=[ + [input_core_dims["obs"]], + [input_core_dims["simh"]], + ["__t_simp__"], + ], # Need to denote that the final output dataset will be labeled with the # spoofed time coordinate - output_core_dims=[["t2"]], + output_core_dims=[["__t_simp__"]], kwargs=dict(kwargs), ) # Rename to proper coordinate name. - result = result.rename({"t2": "time"}) + result = result.rename({"__t_simp__": input_core_dims["simp"]}) # ufunc will put the core dimension to the end (time), so want to preserve original # order where time is commonly first. @@ -90,6 +109,14 @@ def adjust( See https://python-cmethods.readthedocs.io/en/latest/src/methods.html + + The time dimension of ``obs``, ``simh`` and ``simp`` must be named ``time``. + + If the sizes of time dimensions of the input data sets differ, you have to + pass the hidden ``input_core_dims`` parameter, see + https://python-cmethods.readthedocs.io/en/latest/src/getting_started.html#advanced-usage + for more information. + :param method: Technique to apply :type method: str :param obs: The reference/observational data set @@ -127,14 +154,28 @@ def adjust( ) # Grouped correction | scaling-based technique - group: str = kwargs["group"] - del kwargs["group"] + group: str | dict[str, str] = kwargs["group"] + if isinstance(group, str): + # only for same sized time dimensions + obs_group = group + simh_group = group + simp_group = group + elif isinstance(group, dict): + if any(key not in {"obs", "simh", "simp"} for key in group): + raise ValueError( + "group must either be a string like 'time' or a dict like " + '{"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"}', + ) + # for different sized time dimensions + obs_group = group["obs"] + simh_group = group["simh"] + simp_group = group["simp"] result: Optional[XRData] = None for (_, obs_gds), (_, simh_gds), (_, simp_gds) in zip( - obs.groupby(group), - simh.groupby(group), - simp.groupby(group), + obs.groupby(obs_group), + simh.groupby(simh_group), + simp.groupby(simp_group), ): monthly_result = apply_ufunc( method, diff --git a/cmethods/distribution.py b/cmethods/distribution.py index b5d8fca..cc54c1a 100644 --- a/cmethods/distribution.py +++ b/cmethods/distribution.py @@ -62,6 +62,11 @@ def quantile_mapping( cdf_obs = get_cdf(obs, xbins) cdf_simh = get_cdf(simh, xbins) + cdf_simh = np.interp( + cdf_simh, + (cdf_simh.min(), cdf_simh.max()), + (cdf_obs.min(), cdf_obs.max()), + ) if kind in ADDITIVE: epsilon = np.interp(simp, xbins, cdf_simh) # Eq. 1 @@ -124,6 +129,11 @@ def detrended_quantile_mapping( cdf_obs = get_cdf(obs, xbins) cdf_simh = get_cdf(simh, xbins) + cdf_simh = np.interp( + cdf_simh, + (cdf_simh.min(), cdf_simh.max()), + (cdf_obs.min(), cdf_obs.max()), + ) # detrended => shift mean of $X_{sim,p}$ to range of $X_{sim,h}$ to adjust extremes res = np.zeros(len(simp.values)) diff --git a/doc/src/getting_started.rst b/doc/src/getting_started.rst index 8cc5cf4..152d2bd 100644 --- a/doc/src/getting_started.rst +++ b/doc/src/getting_started.rst @@ -56,3 +56,60 @@ method specific documentation. n_quaniles=1000, kind="+", ) + + +Advanced Usage +-------------- + +In some cases the time dimension of input data sets have different sizes. In +such case, the hidden parameter ``input_core_dims`` must be passed to the +``adjust`` call. + +It defines the dimension names of the input data sets, i.e. if the time +dimensions of ``obs`` and ``simp`` have the length, but the time dimension of +``simh`` is somewhat smaller, you have to define this as follows: + + +.. code-block:: python + :linenos: + :caption: Bias Adjustments for data sets with different time dimension lengths pt. 1 + + from cmethods import adjust + import xarray as xr + + obs = xr.open_dataset("examples/input_data/observations.nc")["tas"] + simp = xr.open_dataset("examples/input_data/control.nc")["tas"] + simh = simp.copy(deep=True)[3650:] + + bc = adjust( + method="quantile_mapping", + obs=obs, + simh=simh.rename({"time": "t_simh"}), + simp=simh, + kind="+", + input_core_dims={"obs": "time", "simh": "t_simh", "simp": "time"} + ) + +In case you are applying a scaling based technique using grouping, you have to +adjust the group names accordingly to the time dimension names. + +.. code-block:: python + :linenos: + :caption: Bias Adjustments for data sets with different time dimension lengths pt. 2 + + from cmethods import adjust + import xarray as xr + + obs = xr.open_dataset("examples/input_data/observations.nc")["tas"] + simp = xr.open_dataset("examples/input_data/control.nc")["tas"] + simh = simp.copy(deep=True)[3650:] + + bc = adjust( + method="linear_scaling", + obs=obs, + simh=simh.rename({"time": "t_simh"}), + simp=simh, + kind="+", + group={"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"}, + input_core_dims={"obs": "time", "simh": "t_simh", "simp": "time"} + ) diff --git a/tests/test_methods.py b/tests/test_methods.py index f7eeee6..315c09c 100644 --- a/tests/test_methods.py +++ b/tests/test_methods.py @@ -107,6 +107,55 @@ def test_3d_scaling( assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp) +@pytest.mark.parametrize( + ("method", "kind"), + [ + ("linear_scaling", "+"), + ("variance_scaling", "+"), + ("linear_scaling", "*"), + ], +) +def test_3d_scaling_different_time_span( + datasets: dict, + method: str, + kind: str, +) -> None: + obsh: XRData_t = datasets[kind]["obsh"] + obsp: XRData_t = datasets[kind]["obsp"] + simh: XRData_t = datasets[kind]["simh"] + simp: XRData_t = datasets[kind]["simp"] + simh = simh.sel(time=slice(simh.time[1], None)).rename({"time": "t_simh"}) + + time_names = {"obs": "time", "simh": "t_simh", "simp": "time"} + + # not grouped + result: XRData_t = adjust( + method=method, + obs=obsh, + simh=simh, + simp=simp, + kind=kind, + input_core_dims=time_names, + ) + + assert isinstance(result, XRData_t) + assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp) + + # grouped + result: XRData_t = adjust( + method=method, + obs=obsh, + simh=simh, + simp=simp, + kind=kind, + group={"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"}, + input_core_dims=time_names, + ) + + assert isinstance(result, XRData_t) + assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp) + + @pytest.mark.parametrize( ("method", "kind"), [ @@ -171,6 +220,42 @@ def test_3d_distribution( assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp) +@pytest.mark.parametrize( + ("method", "kind"), + [ + ("quantile_mapping", "+"), + ("quantile_delta_mapping", "+"), + ("quantile_mapping", "*"), + ("quantile_delta_mapping", "*"), + ], +) +def test_3d_distribution_different_time_span( + datasets: dict, + method: str, + kind: str, +) -> None: + obsh: XRData_t = datasets[kind]["obsh"] + obsp: XRData_t = datasets[kind]["obsp"] + simh: XRData_t = datasets[kind]["simh"] + simp: XRData_t = datasets[kind]["simp"] + + simh = simh.sel(time=slice(simh.time[1], None)).rename({"time": "t_simh"}) + time_names = {"obs": "time", "simh": "t_simh", "simp": "time"} + + result: XRData_t = adjust( + method=method, + obs=obsh, + simh=simh, + simp=simp, + kind=kind, + n_quantiles=N_QUANTILES, + input_core_dims=time_names, + ) + + assert isinstance(result, XRData_t) + assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp) + + def test_1d_detrended_quantile_mapping_add(datasets: dict) -> None: kind: str = "+" obsh: XRData_t = datasets[kind]["obsh"][:, 0, 0] diff --git a/tests/test_utils.py b/tests/test_utils.py index 2b5cb8e..2587231 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -39,10 +39,10 @@ def test_quantile_mapping_single_nan() -> None: obs, simh, simp = list(np.arange(10)), list(np.arange(10)), list(np.arange(10)) obs[0] = np.nan - expected = np.array([0.0, 1.9, 2.9, 3.9, 4.9, 5.9, 6.9, 7.9, 8.9, 9.0]) + expected = np.array([0.0, 1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1, 9.0]) res = quantile_mapping(obs=obs, simh=simh, simp=simp, n_quantiles=5) - assert np.allclose(res, expected) + assert np.allclose(res, expected), res @pytest.mark.filterwarnings("ignore:All-NaN slice encountered")