Set data when building Linearmodel #249

Merged
10 commits merged on Nov 13, 2023
4 changes: 2 additions & 2 deletions pymc_experimental/linearmodel.py
@@ -67,8 +67,6 @@ def build_model(self, X: pd.DataFrame, y: pd.Series):
"""
cfg = self.model_config

# The model is built with placeholder data.
# Actual data will be set by _data_setter when fitting or evaluating the model.
# Data array size can change but number of dimensions must stay the same.
with pm.Model() as self.model:
x = pm.MutableData("x", np.zeros((1,)), dims="observation")
@@ -94,6 +92,8 @@ def build_model(self, X: pd.DataFrame, y: pd.Series):
dims="observation",
)

self._data_setter(X, y)

def _data_setter(self, X: pd.DataFrame, y: Optional[Union[pd.DataFrame, pd.Series]] = None):
with self.model:
pm.set_data({"x": X.squeeze()})
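For context, the `self._data_setter(X, y)` call added at the end of `build_model` works because the model's inputs are registered as shared data: `build_model` creates `pm.MutableData` containers holding placeholders, and `_data_setter` swaps in the real values with `pm.set_data`. A minimal standalone sketch of that pattern, with variable names simplified relative to `LinearModel`:

```python
import numpy as np
import pymc as pm

# Sketch of the MutableData / set_data pattern used by build_model and _data_setter.
with pm.Model() as model:
    x = pm.MutableData("x", np.zeros((1,)))  # placeholder; length can change later
    slope = pm.Normal("slope", 0, 1)
    mu = pm.Deterministic("mu", slope * x)

# Replace the placeholder with real inputs; the array size may change,
# but the number of dimensions must stay the same.
with model:
    pm.set_data({"x": np.array([0.1, 0.5, 0.9])})
    prior = pm.sample_prior_predictive(10)
```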
29 changes: 14 additions & 15 deletions pymc_experimental/model_builder.py
@@ -299,8 +299,8 @@ def sample_model(self, **kwargs):
with self.model:
sampler_args = {**self.sampler_config, **kwargs}
idata = pm.sample(**sampler_args)
idata.extend(pm.sample_prior_predictive())
idata.extend(pm.sample_posterior_predictive(idata))
idata.extend(pm.sample_prior_predictive(), join="right")
idata.extend(pm.sample_posterior_predictive(idata), join="right")

idata = self.set_idata_attrs(idata)
return idata
@@ -595,23 +595,22 @@ def sample_prior_predictive(
Prior predictive samples for each input X_pred
"""
if y_pred is None:
y_pred = np.zeros(len(X_pred))
y_pred = pd.Series(np.zeros(len(X_pred)), name=self.output_var)
if samples is None:
samples = self.sampler_config.get("draws", 500)

if self.model is None:
self.build_model(X_pred, y_pred)

self._data_setter(X_pred, y_pred)
if self.model is not None:
with self.model: # sample with new input data
prior_pred: az.InferenceData = pm.sample_prior_predictive(samples, **kwargs)
self.set_idata_attrs(prior_pred)
if extend_idata:
if self.idata is not None:
self.idata.extend(prior_pred)
else:
self.idata = prior_pred
else:
self._data_setter(X_pred, y_pred)
with self.model: # sample with new input data
prior_pred: az.InferenceData = pm.sample_prior_predictive(samples, **kwargs)
self.set_idata_attrs(prior_pred)
if extend_idata:
if self.idata is not None:
self.idata.extend(prior_pred, join="right")
Member commented:

Can we test that all these join="right" calls are doing the right thing (i.e., discarding the old values and replacing them with the new ones), and that extend_idata=False is being respected?

Contributor Author replied:

It should be possible. I'll work on it and update the PR accordingly.

else:
self.idata = prior_pred

prior_predictive_samples = az.extract(prior_pred, "prior_predictive", combined=combined)

@@ -641,7 +640,7 @@ def sample_posterior_predictive(self, X_pred, extend_idata, combined, **kwargs):
with self.model: # sample with new input data
post_pred = pm.sample_posterior_predictive(self.idata, **kwargs)
if extend_idata:
self.idata.extend(post_pred)
self.idata.extend(post_pred, join="right")

posterior_predictive_samples = az.extract(
post_pred, "posterior_predictive", combined=combined
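A rough sketch of the kind of check the review comment above asks for, assuming `az.InferenceData.extend(..., join="right")` replaces colliding variables with the incoming draws; `model` and `X_new` are hypothetical stand-ins for a fitted model-builder instance and fresh inputs:

```python
import numpy as np

# Hypothetical check: join="right" should replace the stored predictive draws,
# and extend_idata=False should leave the stored idata untouched.
before = model.idata.prior_predictive[model.output_var].values.copy()

pred = model.sample_prior_predictive(X_new, combined=False, extend_idata=True)
after = model.idata.prior_predictive[model.output_var].values
np.testing.assert_array_equal(after, pred[model.output_var].values)  # replaced

model.sample_prior_predictive(X_new, combined=False, extend_idata=False)
np.testing.assert_array_equal(
    model.idata.prior_predictive[model.output_var].values, after  # unchanged
)
```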
39 changes: 33 additions & 6 deletions pymc_experimental/tests/test_linearmodel.py
@@ -36,6 +36,15 @@
sklearn_available = False


@pytest.fixture(scope="module")
def toy_actual_params():
return {
"intercept": 3,
"slope": 5,
"obs_error": 0.5,
}


@pytest.fixture(scope="module")
def toy_X():
x = np.linspace(start=0, stop=1, num=100)
@@ -44,23 +53,24 @@ def toy_X():


@pytest.fixture(scope="module")
def toy_y(toy_X):
y = 5 * toy_X["input"] + 3
y = y + np.random.normal(0, 1, size=len(toy_X))
def toy_y(toy_X, toy_actual_params):
y = toy_actual_params["slope"] * toy_X["input"] + toy_actual_params["intercept"]
rng = np.random.default_rng(427)
y = y + rng.normal(0, toy_actual_params["obs_error"], size=len(toy_X))
y = pd.Series(y, name="output")
return y


@pytest.fixture(scope="module")
def fitted_linear_model_instance(toy_X, toy_y):
sampler_config = {
"draws": 500,
"tune": 300,
"draws": 50,
"tune": 30,
"chains": 2,
"target_accept": 0.95,
}
model = LinearModel(sampler_config=sampler_config)
model.fit(toy_X, toy_y)
model.fit(toy_X, toy_y, random_seed=312)
return model


@@ -101,6 +111,23 @@ def test_fit(fitted_linear_model_instance):
assert isinstance(post_pred, xr.DataArray)


def test_parameter_fit(toy_X, toy_y, toy_actual_params):
"""Check that the fit model recovered the data-generating parameters."""
# Fit the model with a sufficient number of samples
sampler_config = {
"draws": 500,
"tune": 300,
"chains": 2,
"target_accept": 0.95,
}
model = LinearModel(sampler_config=sampler_config)
model.fit(toy_X, toy_y, random_seed=312)
fit_params = model.idata.posterior.mean()
np.testing.assert_allclose(fit_params["intercept"], toy_actual_params["intercept"], rtol=0.1)
np.testing.assert_allclose(fit_params["slope"], toy_actual_params["slope"], rtol=0.1)
np.testing.assert_allclose(fit_params["σ_model_fmc"], toy_actual_params["obs_error"], rtol=0.1)


def test_predict(fitted_linear_model_instance):
model = fitted_linear_model_instance
X_pred = pd.DataFrame({"input": np.random.uniform(low=0, high=1, size=100)})
62 changes: 62 additions & 0 deletions pymc_experimental/tests/test_model_builder.py
@@ -220,6 +220,68 @@ def test_sample_posterior_predictive(fitted_model_instance, combined):
assert np.issubdtype(pred[fitted_model_instance.output_var].dtype, np.floating)


@pytest.mark.parametrize("extend_idata", [True, False])
def test_sample_prior_extend_idata_param(fitted_model_instance, extend_idata):
ricardoV94 (Member) commented on Nov 10, 2023:

I'm concerned about mutating the fitted_model_instance in these tests, as it may be used in other tests. In general I am not a fan of fixtures that return mutable objects that we will probably want to mutate further to investigate the behavior. I suggest creating a dummy model in this test and passing a copy of the fitted idata at the beginning.

Also, can we parametrize the sample method to merge this and the posterior predictive tests? They seem to share most of the logic anyway.

Otherwise I think it's more than ready.

Contributor Author replied:

I see what you're saying about modifying the fitted_model_instance object. My thought is to make the fitted_model_instance fixture return a copy of the model instance, so individual tests can do what they like with it without any possibility of affecting other tests. The overhead of making a copy doesn't seem to add much to the test run time.

I originally wrote the prior and posterior predictive tests as a single test, but there were so many if/else branches that I decided to split them. But then I ended up introducing intermediate variables to clean up the code, so it turned out much the same after all, and I'm combining them again. Thanks for the suggestion.

output_var = fitted_model_instance.output_var
idata_prev = fitted_model_instance.idata.prior_predictive[output_var]

# Since coordinates are provided, the dimension must match
n_pred = 100 # Must match toy_x
x_pred = np.random.uniform(0, 1, n_pred)

prediction_data = pd.DataFrame({"input": x_pred})
pred = fitted_model_instance.sample_prior_predictive(
prediction_data["input"], combined=False, extend_idata=extend_idata
)

pred_unstacked = pred[output_var].values
idata_now = fitted_model_instance.idata.prior_predictive[output_var].values

if extend_idata:
# After sampling, data in the model should be the same as the predictions
np.testing.assert_array_equal(idata_now, pred_unstacked)
# Data in the model should NOT be the same as before
if idata_now.shape == idata_prev.values.shape:
assert np.sum(np.abs(idata_now - idata_prev.values) < 1e-5) <= 2
else:
# After sampling, data in the model should be the same as it was before
np.testing.assert_array_equal(idata_now, idata_prev.values)
# Data in the model should NOT be the same as the predictions
if idata_now.shape == pred_unstacked.shape:
assert np.sum(np.abs(idata_now - pred_unstacked) < 1e-5) <= 2


@pytest.mark.parametrize("extend_idata", [True, False])
def test_sample_posterior_extend_idata_param(fitted_model_instance, extend_idata):
output_var = fitted_model_instance.output_var
idata_prev = fitted_model_instance.idata.posterior_predictive[output_var]

# Since coordinates are provided, the dimension must match
n_pred = 100 # Must match toy_x
x_pred = np.random.uniform(0, 1, n_pred)

prediction_data = pd.DataFrame({"input": x_pred})
pred = fitted_model_instance.sample_posterior_predictive(
prediction_data["input"], combined=False, extend_idata=extend_idata
)

pred_unstacked = pred[output_var].values
idata_now = fitted_model_instance.idata.posterior_predictive[output_var].values

if extend_idata:
# After sampling, data in the model should be the same as the predictions
np.testing.assert_array_equal(idata_now, pred_unstacked)
# Data in the model should NOT be the same as before
if idata_now.shape == idata_prev.values.shape:
assert np.sum(np.abs(idata_now - idata_prev.values) < 1e-5) <= 2
else:
# After sampling, data in the model should be the same as it was before
np.testing.assert_array_equal(idata_now, idata_prev.values)
# Data in the model should NOT be the same as the predictions
if idata_now.shape == pred_unstacked.shape:
assert np.sum(np.abs(idata_now - pred_unstacked) < 1e-5) <= 2

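A sketch of the direction discussed in the review thread above, assuming the fitted instance (including its InferenceData) deep-copies cleanly; the fixture name, the copying strategy, and the merged test are illustrative, not the final implementation:

```python
import copy

import numpy as np
import pandas as pd
import pytest


@pytest.fixture
def fitted_model_copy(fitted_model_instance):
    # Give each test its own copy so mutating idata cannot leak into other tests.
    return copy.deepcopy(fitted_model_instance)


@pytest.mark.parametrize("group", ["prior_predictive", "posterior_predictive"])
@pytest.mark.parametrize("extend_idata", [True, False])
def test_sample_extend_idata_param(fitted_model_copy, group, extend_idata):
    # Shared logic for both sampling methods, selected via `group`.
    output_var = fitted_model_copy.output_var
    previous = fitted_model_copy.idata[group][output_var].values.copy()

    x_pred = pd.DataFrame({"input": np.random.uniform(0, 1, 100)})["input"]
    sample_method = getattr(fitted_model_copy, f"sample_{group}")
    pred = sample_method(x_pred, combined=False, extend_idata=extend_idata)

    current = fitted_model_copy.idata[group][output_var].values
    if extend_idata:
        # join="right" should have replaced the stored group with the new draws.
        np.testing.assert_array_equal(current, pred[output_var].values)
    else:
        # The stored group should be untouched.
        np.testing.assert_array_equal(current, previous)
```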

def test_model_config_formatting():
model_config = {
"a": {