From 15f78c1729c1be03509d5e77eb7a899065fdf783 Mon Sep 17 00:00:00 2001 From: "Benjamin T. Schwertfeger" Date: Sun, 10 Mar 2024 07:06:22 +0100 Subject: [PATCH] Resolve "Adjustments using `adjust` require the input data of the control period to have the same size for the time dimension" (#67) --- CHANGELOG.md | 36 +++++++++++++++- README.md | 1 - cmethods/core.py | 59 +++++++++++++++++++++---- cmethods/distribution.py | 10 +++++ doc/links.rst | 3 -- doc/src/getting_started.rst | 57 +++++++++++++++++++++++++ doc/src/introduction.rst | 2 +- tests/test_methods.py | 85 +++++++++++++++++++++++++++++++++++++ tests/test_utils.py | 4 +- 9 files changed, 241 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e617208..aef6031 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,36 @@ # Changelog -## [v2.0.0](https://github.com/btschwertfeger/python-cmethods/tree/v2.0.0) (2023-01-23) +## [Unreleased](https://github.com/btschwertfeger/python-cmethods/tree/HEAD) + +[Full Changelog](https://github.com/btschwertfeger/python-cmethods/compare/v2.0.2...HEAD) + +**Merged pull requests:** + +- Fix typos and update pre-commit hooks [\#64](https://github.com/btschwertfeger/python-cmethods/pull/64) ([btschwertfeger](https://github.com/btschwertfeger)) + +## [v2.0.2](https://github.com/btschwertfeger/python-cmethods/tree/v2.0.2) (2024-02-02) + +[Full Changelog](https://github.com/btschwertfeger/python-cmethods/compare/v2.0.1...v2.0.2) + +**Merged pull requests:** + +- Update documentation -- QM and QDM formulas [\#62](https://github.com/btschwertfeger/python-cmethods/pull/62) ([btschwertfeger](https://github.com/btschwertfeger)) +- Bump GitHub action versions [\#59](https://github.com/btschwertfeger/python-cmethods/pull/59) ([btschwertfeger](https://github.com/btschwertfeger)) + +## [v2.0.1](https://github.com/btschwertfeger/python-cmethods/tree/v2.0.1) (2024-02-01) + +[Full Changelog](https://github.com/btschwertfeger/python-cmethods/compare/v2.0.0...v2.0.1) + +**Closed issues:** + +- The latest documentation still describes the legacy max_scaling_factor [\#60](https://github.com/btschwertfeger/python-cmethods/issues/60) + +**Merged pull requests:** + +- adjust CI workflows [\#58](https://github.com/btschwertfeger/python-cmethods/pull/58) ([btschwertfeger](https://github.com/btschwertfeger)) +- Resolve "The latest documentation still describes the legacy max scaling factor" [\#61](https://github.com/btschwertfeger/python-cmethods/pull/61) ([btschwertfeger](https://github.com/btschwertfeger)) + +## [v2.0.0](https://github.com/btschwertfeger/python-cmethods/tree/v2.0.0) (2024-01-23) [Full Changelog](https://github.com/btschwertfeger/python-cmethods/compare/v1.0.3...v2.0.0) @@ -13,6 +43,10 @@ - Optimization for `adjust_3d` [\#47](https://github.com/btschwertfeger/python-cmethods/issues/47) - Find a solution to process large data sets more efficient [\#6](https://github.com/btschwertfeger/python-cmethods/issues/6) +**Merged pull requests:** + +- Fix the CodeQL workflow execution; prepare v2.0.0 release [\#50](https://github.com/btschwertfeger/python-cmethods/pull/50) ([btschwertfeger](https://github.com/btschwertfeger)) + ## [v1.0.3](https://github.com/btschwertfeger/python-cmethods/tree/v1.0.3) (2023-08-09) [Full Changelog](https://github.com/btschwertfeger/python-cmethods/compare/v1.0.2...v1.0.3) diff --git a/README.md b/README.md index 5cd4555..df83a7f 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,6 @@ [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-orange.svg)](https://www.gnu.org/licenses/gpl-3.0) [![Downloads](https://pepy.tech/badge/python-cmethods)](https://pepy.tech/project/python-cmethods) -![CodeQL](https://github.com/btschwertfeger/python-cmethods/actions/workflows/codeql.yaml/badge.svg) [![CI/CD](https://github.com/btschwertfeger/python-cmethods/actions/workflows/cicd.yaml/badge.svg?branch=master)](https://github.com/btschwertfeger/python-cmethods/actions/workflows/cicd.yaml) [![codecov](https://codecov.io/github/btschwertfeger/python-cmethods/branch/master/graph/badge.svg?token=OSO4PAABPD)](https://codecov.io/github/btschwertfeger/python-cmethods) diff --git a/cmethods/core.py b/cmethods/core.py index 1860729..f76c929 100644 --- a/cmethods/core.py +++ b/cmethods/core.py @@ -50,26 +50,45 @@ def apply_ufunc( if method not in __METHODS_FUNC__: raise UnknownMethodError(method, __METHODS_FUNC__.keys()) + if kwargs.get("input_core_dims"): + if not isinstance(kwargs["input_core_dims"], dict): + raise TypeError("input_core_dims must be an object of type 'dict'") + if not len(kwargs["input_core_dims"]) == 3 or any( + not isinstance(value, str) for value in kwargs["input_core_dims"].values() + ): + raise ValueError( + "input_core_dims must have three key-value pairs like: " + '{"obs": "time", "simh": "time", "simp": "time"}', + ) + + input_core_dims = kwargs["input_core_dims"] + else: + input_core_dims = {"obs": "time", "simh": "time", "simp": "time"} + result: XRData = xr.apply_ufunc( __METHODS_FUNC__[method], obs, simh, # Need to spoof a fake time axis since 'time' coord on full dataset is different # than 'time' coord on training dataset. - simp.rename({"time": "t2"}), + simp.rename({input_core_dims["simp"]: "__t_simp__"}), dask="parallelized", vectorize=True, # This will vectorize over the time dimension, so will submit each grid cell # independently - input_core_dims=[["time"], ["time"], ["t2"]], + input_core_dims=[ + [input_core_dims["obs"]], + [input_core_dims["simh"]], + ["__t_simp__"], + ], # Need to denote that the final output dataset will be labeled with the # spoofed time coordinate - output_core_dims=[["t2"]], + output_core_dims=[["__t_simp__"]], kwargs=dict(kwargs), ) # Rename to proper coordinate name. - result = result.rename({"t2": "time"}) + result = result.rename({"__t_simp__": input_core_dims["simp"]}) # ufunc will put the core dimension to the end (time), so want to preserve original # order where time is commonly first. @@ -90,6 +109,14 @@ def adjust( See https://python-cmethods.readthedocs.io/en/latest/src/methods.html + + The time dimension of ``obs``, ``simh`` and ``simp`` must be named ``time``. + + If the sizes of time dimensions of the input data sets differ, you have to + pass the hidden ``input_core_dims`` parameter, see + https://python-cmethods.readthedocs.io/en/latest/src/getting_started.html#advanced-usage + for more information. + :param method: Technique to apply :type method: str :param obs: The reference/observational data set @@ -127,14 +154,30 @@ def adjust( ) # Grouped correction | scaling-based technique - group: str = kwargs["group"] + group: str | dict[str, str] = kwargs["group"] + if isinstance(group, str): + # only for same sized time dimensions + obs_group = group + simh_group = group + simp_group = group + elif isinstance(group, dict): + if any(key not in {"obs", "simh", "simp"} for key in group): + raise ValueError( + "group must either be a string like 'time' or a dict like " + '{"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"}', + ) + # for different sized time dimensions + obs_group = group["obs"] + simh_group = group["simh"] + simp_group = group["simp"] + del kwargs["group"] result: Optional[XRData] = None for (_, obs_gds), (_, simh_gds), (_, simp_gds) in zip( - obs.groupby(group), - simh.groupby(group), - simp.groupby(group), + obs.groupby(obs_group), + simh.groupby(simh_group), + simp.groupby(simp_group), ): monthly_result = apply_ufunc( method, diff --git a/cmethods/distribution.py b/cmethods/distribution.py index b5d8fca..cc54c1a 100644 --- a/cmethods/distribution.py +++ b/cmethods/distribution.py @@ -62,6 +62,11 @@ def quantile_mapping( cdf_obs = get_cdf(obs, xbins) cdf_simh = get_cdf(simh, xbins) + cdf_simh = np.interp( + cdf_simh, + (cdf_simh.min(), cdf_simh.max()), + (cdf_obs.min(), cdf_obs.max()), + ) if kind in ADDITIVE: epsilon = np.interp(simp, xbins, cdf_simh) # Eq. 1 @@ -124,6 +129,11 @@ def detrended_quantile_mapping( cdf_obs = get_cdf(obs, xbins) cdf_simh = get_cdf(simh, xbins) + cdf_simh = np.interp( + cdf_simh, + (cdf_simh.min(), cdf_simh.max()), + (cdf_obs.min(), cdf_obs.max()), + ) # detrended => shift mean of $X_{sim,p}$ to range of $X_{sim,h}$ to adjust extremes res = np.zeros(len(simp.values)) diff --git a/doc/links.rst b/doc/links.rst index 14f0e67..a06fa68 100644 --- a/doc/links.rst +++ b/doc/links.rst @@ -27,9 +27,6 @@ .. |Downloads badge| image:: https://static.pepy.tech/personalized-badge/python-cmethods?period=total&units=abbreviation&left_color=grey&right_color=orange&left_text=downloads :target: https://pepy.tech/project/python-cmethods -.. |CodeQL badge| image:: https://github.com/btschwertfeger/python-cmethods/actions/workflows/codeql.yaml/badge.svg?branch=master - :target: https://github.com/btschwertfeger/python-cmethods/actions/workflows/codeql.yaml - .. |CI/CD badge| image:: https://github.com/btschwertfeger/python-cmethods/actions/workflows/cicd.yaml/badge.svg?branch=master :target: https://github.com/btschwertfeger/python-cmethods/actions/workflows/cicd.yaml diff --git a/doc/src/getting_started.rst b/doc/src/getting_started.rst index 8cc5cf4..152d2bd 100644 --- a/doc/src/getting_started.rst +++ b/doc/src/getting_started.rst @@ -56,3 +56,60 @@ method specific documentation. n_quaniles=1000, kind="+", ) + + +Advanced Usage +-------------- + +In some cases the time dimension of input data sets have different sizes. In +such case, the hidden parameter ``input_core_dims`` must be passed to the +``adjust`` call. + +It defines the dimension names of the input data sets, i.e. if the time +dimensions of ``obs`` and ``simp`` have the length, but the time dimension of +``simh`` is somewhat smaller, you have to define this as follows: + + +.. code-block:: python + :linenos: + :caption: Bias Adjustments for data sets with different time dimension lengths pt. 1 + + from cmethods import adjust + import xarray as xr + + obs = xr.open_dataset("examples/input_data/observations.nc")["tas"] + simp = xr.open_dataset("examples/input_data/control.nc")["tas"] + simh = simp.copy(deep=True)[3650:] + + bc = adjust( + method="quantile_mapping", + obs=obs, + simh=simh.rename({"time": "t_simh"}), + simp=simh, + kind="+", + input_core_dims={"obs": "time", "simh": "t_simh", "simp": "time"} + ) + +In case you are applying a scaling based technique using grouping, you have to +adjust the group names accordingly to the time dimension names. + +.. code-block:: python + :linenos: + :caption: Bias Adjustments for data sets with different time dimension lengths pt. 2 + + from cmethods import adjust + import xarray as xr + + obs = xr.open_dataset("examples/input_data/observations.nc")["tas"] + simp = xr.open_dataset("examples/input_data/control.nc")["tas"] + simh = simp.copy(deep=True)[3650:] + + bc = adjust( + method="linear_scaling", + obs=obs, + simh=simh.rename({"time": "t_simh"}), + simp=simh, + kind="+", + group={"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"}, + input_core_dims={"obs": "time", "simh": "t_simh", "simp": "time"} + ) diff --git a/doc/src/introduction.rst b/doc/src/introduction.rst index dabc0e2..42ee927 100644 --- a/doc/src/introduction.rst +++ b/doc/src/introduction.rst @@ -4,7 +4,7 @@ Introduction ============= |GitHub badge| |License badge| |PyVersions badge| |Downloads badge| -|CodeQL badge| |CI/CD badge| |codecov badge| +|CI/CD badge| |codecov badge| |Release date badge| |Release version badge| |DOI badge| |Docs stable| About diff --git a/tests/test_methods.py b/tests/test_methods.py index f7eeee6..315c09c 100644 --- a/tests/test_methods.py +++ b/tests/test_methods.py @@ -107,6 +107,55 @@ def test_3d_scaling( assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp) +@pytest.mark.parametrize( + ("method", "kind"), + [ + ("linear_scaling", "+"), + ("variance_scaling", "+"), + ("linear_scaling", "*"), + ], +) +def test_3d_scaling_different_time_span( + datasets: dict, + method: str, + kind: str, +) -> None: + obsh: XRData_t = datasets[kind]["obsh"] + obsp: XRData_t = datasets[kind]["obsp"] + simh: XRData_t = datasets[kind]["simh"] + simp: XRData_t = datasets[kind]["simp"] + simh = simh.sel(time=slice(simh.time[1], None)).rename({"time": "t_simh"}) + + time_names = {"obs": "time", "simh": "t_simh", "simp": "time"} + + # not grouped + result: XRData_t = adjust( + method=method, + obs=obsh, + simh=simh, + simp=simp, + kind=kind, + input_core_dims=time_names, + ) + + assert isinstance(result, XRData_t) + assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp) + + # grouped + result: XRData_t = adjust( + method=method, + obs=obsh, + simh=simh, + simp=simp, + kind=kind, + group={"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"}, + input_core_dims=time_names, + ) + + assert isinstance(result, XRData_t) + assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp) + + @pytest.mark.parametrize( ("method", "kind"), [ @@ -171,6 +220,42 @@ def test_3d_distribution( assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp) +@pytest.mark.parametrize( + ("method", "kind"), + [ + ("quantile_mapping", "+"), + ("quantile_delta_mapping", "+"), + ("quantile_mapping", "*"), + ("quantile_delta_mapping", "*"), + ], +) +def test_3d_distribution_different_time_span( + datasets: dict, + method: str, + kind: str, +) -> None: + obsh: XRData_t = datasets[kind]["obsh"] + obsp: XRData_t = datasets[kind]["obsp"] + simh: XRData_t = datasets[kind]["simh"] + simp: XRData_t = datasets[kind]["simp"] + + simh = simh.sel(time=slice(simh.time[1], None)).rename({"time": "t_simh"}) + time_names = {"obs": "time", "simh": "t_simh", "simp": "time"} + + result: XRData_t = adjust( + method=method, + obs=obsh, + simh=simh, + simp=simp, + kind=kind, + n_quantiles=N_QUANTILES, + input_core_dims=time_names, + ) + + assert isinstance(result, XRData_t) + assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp) + + def test_1d_detrended_quantile_mapping_add(datasets: dict) -> None: kind: str = "+" obsh: XRData_t = datasets[kind]["obsh"][:, 0, 0] diff --git a/tests/test_utils.py b/tests/test_utils.py index 2b5cb8e..2587231 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -39,10 +39,10 @@ def test_quantile_mapping_single_nan() -> None: obs, simh, simp = list(np.arange(10)), list(np.arange(10)), list(np.arange(10)) obs[0] = np.nan - expected = np.array([0.0, 1.9, 2.9, 3.9, 4.9, 5.9, 6.9, 7.9, 8.9, 9.0]) + expected = np.array([0.0, 1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1, 9.0]) res = quantile_mapping(obs=obs, simh=simh, simp=simp, n_quantiles=5) - assert np.allclose(res, expected) + assert np.allclose(res, expected), res @pytest.mark.filterwarnings("ignore:All-NaN slice encountered")