-
Notifications
You must be signed in to change notification settings - Fork 13
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
QDM not working with longer simp length #101
Comments
Could you please provide a minimal reproducible example using the example data sets provided in this package? Which version of python-cmethods are you using? |
I could not reproduce the issue with simp being longer than simh or obs, but I found out that the computation of the CDF is not correct when the temporal resolutions are not uniform. To show the error, I'm using dummy data, see below.

import xarray as xr
import numpy as np
from cmethods.utils import get_cdf, get_inverse_of_cdf, ensure_dividable
from cmethods import adjust
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
def get_datasets(kind: str) -> tuple[xr.Dataset, xr.Dataset, xr.Dataset, xr.Dataset]:
    """Build dummy (obsh, obsp, simh, simp) data sets, mirroring the unit tests.

    ``kind == "+"`` produces additive (temperature-like) series; any other
    value produces ratio-based (precipitation-like) series.
    """
    hist_time = xr.cftime_range(
        "1971-01-01",
        "2000-12-31",
        freq="D",
        calendar="noleap",
    )
    fut_time = xr.cftime_range(
        "2001-01-01",
        "2030-12-31",
        freq="D",
        calendar="noleap",
    )
    lats = np.arange(23, 27, 1)

    def fake_temperature(lat: int) -> list[float]:
        """Fake seasonal temperature series for one latitude: seasonal cycle,
        random noise, and a slow warming trend."""
        seasonal = lat * np.cos(2 * np.pi * hist_time.dayofyear / 365)
        noise = 2 * np.random.random_sample((hist_time.size,))
        trend = 0.1 * (hist_time - hist_time[0]).days / 365
        return 273.15 - (seasonal + noise + 273.15 + trend)

    def fake_precipitation() -> list[float]:
        """Fake ratio-based precipitation series with dry days injected."""
        cycle = np.cos(2 * np.pi * hist_time.dayofyear / 365)
        pr = cycle * cycle * np.random.random_sample((hist_time.size,))
        pr *= 0.0004 / pr.max()  # scale to a realistic magnitude
        # Zero out a fixed count of days so it does not rain every day:
        # days_without_rain_per_year (239) * years (30).
        dry_days = 239 * 30
        pr.ravel()[np.random.choice(pr.size, dry_days, replace=False)] = 0
        return pr

    def as_dataset(values, time, kind: str) -> xr.Dataset:
        """Wrap raw values into a (time, lat, lon) Dataset named after *kind*."""
        arr = xr.DataArray(
            values,
            dims=("lon", "lat", "time"),
            coords={"time": time, "lat": lats, "lon": [0, 1, 3]},
        )
        return arr.transpose("time", "lat", "lon").to_dataset(name=kind)

    if kind == "+":  # additive / temperature-like
        per_lat = [fake_temperature(lat) for lat in lats]
        data = np.array(
            [
                np.array(per_lat),
                np.array(per_lat) + 0.5,
                np.array(per_lat) + 1,
            ],
        )
        obsh = as_dataset(data, hist_time, kind=kind)
        obsp = as_dataset(data + 1, hist_time, kind=kind)
        simh = as_dataset(data - 2, hist_time, kind=kind)
        simp = as_dataset(data - 1, fut_time, kind=kind)
    else:  # ratio / precipitation-like
        per_lat = [fake_precipitation() for _ in lats]
        data = np.array(
            [per_lat, np.array(per_lat) + np.random.rand(), np.array(per_lat)],
        )
        obsh = as_dataset(data, hist_time, kind=kind)
        obsp = as_dataset(data * 1.02, hist_time, kind=kind)
        simh = as_dataset(data * 0.98, hist_time, kind=kind)
        simp = as_dataset(data * 0.09, fut_time, kind=kind)
    return obsh, obsp, simh, simp
def is_1d_rmse_better(result, obsp, simp) -> bool:
    """Return True if *result* is closer to *obsp* (by RMSE) than *simp* is.

    Uses plain NumPy instead of pulling in scikit-learn for a one-line
    computation; for 1D inputs ``np.mean((a - b) ** 2)`` is exactly
    ``sklearn.metrics.mean_squared_error(a, b)``.

    :param result: bias-adjusted 1D series
    :param obsp: observed reference 1D series (same length)
    :param simp: raw, unadjusted modeled 1D series (same length)
    :return: whether the adjustment reduced the RMSE against *obsp*
    """
    result = np.asarray(result, dtype=float)
    obsp = np.asarray(obsp, dtype=float)
    simp = np.asarray(simp, dtype=float)
    # sqrt is strictly monotonic, so comparing RMSEs is equivalent to
    # comparing MSEs; sqrt kept so the compared quantities stay RMSEs.
    rmse_adjusted = np.sqrt(np.mean((result - obsp) ** 2))
    rmse_raw = np.sqrt(np.mean((simp - obsp) ** 2))
    return bool(rmse_adjusted < rmse_raw)
# --- Manual Quantile Delta Mapping (QDM) reproduction, additive kind ---
n_quantiles = 100
kind="+"
# Generate dummy data and select a single (lat, lon) cell as 1D series.
obs, obsp, simh, simp = get_datasets(kind)
obs = obs[kind][:,0,0]
obsp = obsp[kind][:,0,0]
# simh is truncated to 7300 days (20 noleap years) so that simh and simp
# have DIFFERENT lengths — the condition under investigation in this issue.
simh = simh[kind][:7300,0,0]
simp = simp[kind][:,0,0]
obs, simh, simp = (
np.array(obs),
np.array(simh),
np.array(simp),
)
# Shared histogram bins spanning the combined range of obs and simh.
global_max = max(np.nanmax(obs), np.nanmax(simh))
global_min = min(np.nanmin(obs), np.nanmin(simh))
wide = abs(global_max - global_min) / n_quantiles
xbins = np.arange(global_min, global_max + wide, wide)
# Empirical CDFs. NOTE(review): without the commented-out normalization by
# series length, CDFs built from series of different lengths end on
# different scales — this is the suspected bug (see #102).
cdf_obs = get_cdf(obs, xbins) #/ len(obs)
cdf_simh = get_cdf(simh, xbins) #/ len(simh)
cdf_simp = get_cdf(simp, xbins) #/len(simp)
# calculate exact CDF values of $F_{sim,p}[T_{sim,p}(t)]$
epsilon = np.interp(simp, xbins, cdf_simp) # Eq. 1.1
QDM1 = get_inverse_of_cdf(cdf_obs, epsilon, xbins) # Eq. 1.2
delta = simp - get_inverse_of_cdf(cdf_simh, epsilon, xbins) # Eq. 1.3
result = QDM1 + delta # Eq. 1.4
if not is_1d_rmse_better(result, obsp, simp):
    raise ValueError

... which will raise the value error, since the bias adjusted time series is not better at all. When plotting the CDFs we see why:

plt.plot(cdf_obs, label="$F_{obs,h}$")
plt.plot(cdf_simh, label="$F_{sim,h}$")
plt.legend()

... we can see that the CDFs are not correct due to wrong scaling. That is no problem for data sets with the same temporal resolution. This could be fixed by scaling those. I'm working on it in #102
Describe the bug
Whenever I use the Quantile Delta Mapping method with the same lengths of data, the method works really well. Nevertheless, when the data to be corrected is longer than the obs and simh, the tool does not correct the simp data.
To Reproduce
Input a simp with longer length than obs and simh
The text was updated successfully, but these errors were encountered: