
Set data when building Linearmodel #249

Merged · 10 commits · Nov 13, 2023
4 changes: 2 additions & 2 deletions pymc_experimental/linearmodel.py
@@ -67,8 +67,6 @@ def build_model(self, X: pd.DataFrame, y: pd.Series):
"""
cfg = self.model_config

# The model is built with placeholder data.
# Actual data will be set by _data_setter when fitting or evaluating the model.
# Data array size can change but number of dimensions must stay the same.
with pm.Model() as self.model:
x = pm.MutableData("x", np.zeros((1,)), dims="observation")
@@ -94,6 +92,8 @@ def build_model(self, X: pd.DataFrame, y: pd.Series):
dims="observation",
)

self._data_setter(X, y)

def _data_setter(self, X: pd.DataFrame, y: Optional[Union[pd.DataFrame, pd.Series]] = None):
with self.model:
pm.set_data({"x": X.squeeze()})
29 changes: 14 additions & 15 deletions pymc_experimental/model_builder.py
@@ -299,8 +299,8 @@ def sample_model(self, **kwargs):
with self.model:
sampler_args = {**self.sampler_config, **kwargs}
idata = pm.sample(**sampler_args)
idata.extend(pm.sample_prior_predictive())
idata.extend(pm.sample_posterior_predictive(idata))
idata.extend(pm.sample_prior_predictive(), join="right")
idata.extend(pm.sample_posterior_predictive(idata), join="right")

idata = self.set_idata_attrs(idata)
return idata
@@ -595,23 +595,22 @@ def sample_prior_predictive(
Prior predictive samples for each input X_pred
"""
if y_pred is None:
y_pred = np.zeros(len(X_pred))
y_pred = pd.Series(np.zeros(len(X_pred)), name=self.output_var)
if samples is None:
samples = self.sampler_config.get("draws", 500)

if self.model is None:
self.build_model(X_pred, y_pred)

self._data_setter(X_pred, y_pred)
if self.model is not None:
with self.model: # sample with new input data
prior_pred: az.InferenceData = pm.sample_prior_predictive(samples, **kwargs)
self.set_idata_attrs(prior_pred)
if extend_idata:
if self.idata is not None:
self.idata.extend(prior_pred)
else:
self.idata = prior_pred
else:
self._data_setter(X_pred, y_pred)
with self.model: # sample with new input data
prior_pred: az.InferenceData = pm.sample_prior_predictive(samples, **kwargs)
self.set_idata_attrs(prior_pred)
if extend_idata:
if self.idata is not None:
self.idata.extend(prior_pred, join="right")
Member commented:
Can we test that all these join="right" calls are doing the right thing (i.e., discarding the old value and replacing it with the new one), and that extend_idata=False is being respected?

Contributor Author replied:
It should be possible. I'll work on it and update the PR accordingly.
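For reference, a minimal sketch (not part of this PR) of the join="right" semantics under discussion, built from plain ArviZ objects; the array shapes and the variable name "y" are purely illustrative:

```python
import arviz as az
import numpy as np

# Two InferenceData objects whose prior_predictive groups collide by name.
old = az.from_dict(prior_predictive={"y": np.zeros((1, 10, 5))})
new = az.from_dict(prior_predictive={"y": np.ones((1, 10, 5))})

# join="left" (the default) keeps the existing group on a name collision;
# join="right" discards it and takes the group from `new` instead.
old.extend(new, join="right")
assert float(old.prior_predictive["y"].mean()) == 1.0  # old group was replaced
```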

else:
self.idata = prior_pred

prior_predictive_samples = az.extract(prior_pred, "prior_predictive", combined=combined)

@@ -641,7 +640,7 @@ def sample_posterior_predictive(self, X_pred, extend_idata, combined, **kwargs):
with self.model: # sample with new input data
post_pred = pm.sample_posterior_predictive(self.idata, **kwargs)
if extend_idata:
self.idata.extend(post_pred)
self.idata.extend(post_pred, join="right")

posterior_predictive_samples = az.extract(
post_pred, "posterior_predictive", combined=combined
39 changes: 33 additions & 6 deletions pymc_experimental/tests/test_linearmodel.py
@@ -36,6 +36,15 @@
sklearn_available = False


@pytest.fixture(scope="module")
def toy_actual_params():
return {
"intercept": 3,
"slope": 5,
"obs_error": 0.5,
}


@pytest.fixture(scope="module")
def toy_X():
x = np.linspace(start=0, stop=1, num=100)
@@ -44,23 +53,24 @@ def toy_X():


@pytest.fixture(scope="module")
def toy_y(toy_X):
y = 5 * toy_X["input"] + 3
y = y + np.random.normal(0, 1, size=len(toy_X))
def toy_y(toy_X, toy_actual_params):
y = toy_actual_params["slope"] * toy_X["input"] + toy_actual_params["intercept"]
rng = np.random.default_rng(427)
y = y + rng.normal(0, toy_actual_params["obs_error"], size=len(toy_X))
y = pd.Series(y, name="output")
return y


@pytest.fixture(scope="module")
def fitted_linear_model_instance(toy_X, toy_y):
sampler_config = {
"draws": 500,
"tune": 300,
"draws": 50,
"tune": 30,
"chains": 2,
"target_accept": 0.95,
}
model = LinearModel(sampler_config=sampler_config)
model.fit(toy_X, toy_y)
model.fit(toy_X, toy_y, random_seed=312)
return model


@@ -101,6 +111,23 @@ def test_fit(fitted_linear_model_instance):
assert isinstance(post_pred, xr.DataArray)


def test_parameter_fit(toy_X, toy_y, toy_actual_params):
"""Check that the fit model recovered the data-generating parameters."""
# Fit the model with a sufficient number of samples
sampler_config = {
"draws": 500,
"tune": 300,
"chains": 2,
"target_accept": 0.95,
}
model = LinearModel(sampler_config=sampler_config)
model.fit(toy_X, toy_y, random_seed=312)
fit_params = model.idata.posterior.mean()
np.testing.assert_allclose(fit_params["intercept"], toy_actual_params["intercept"], rtol=0.1)
np.testing.assert_allclose(fit_params["slope"], toy_actual_params["slope"], rtol=0.1)
np.testing.assert_allclose(fit_params["σ_model_fmc"], toy_actual_params["obs_error"], rtol=0.1)


def test_predict(fitted_linear_model_instance):
model = fitted_linear_model_instance
X_pred = pd.DataFrame({"input": np.random.uniform(low=0, high=1, size=100)})
57 changes: 52 additions & 5 deletions pymc_experimental/tests/test_model_builder.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import hashlib
import json
import sys
@@ -42,10 +43,13 @@ def toy_y(toy_X):


@pytest.fixture(scope="module")
def fitted_model_instance(toy_X, toy_y):
def fitted_model_instance_base(toy_X, toy_y):
"""Because fitting takes a relatively long time, this is intended to
be used only once and then have copies returned to tests that use a fitted
model instance. Tests should use `fitted_model_instance` instead of this."""
sampler_config = {
"draws": 100,
"tune": 100,
"draws": 20,
"tune": 10,
"chains": 2,
"target_accept": 0.95,
}
@@ -61,6 +65,14 @@ def fitted_model_instance(toy_X, toy_y):
return model


@pytest.fixture
def fitted_model_instance(fitted_model_instance_base):
"""Get a fitted model instance. The instance is copied after being fit,
so tests using this fixture can modify the model object without affecting
other tests."""
return copy.deepcopy(fitted_model_instance_base)
ricardoV94 (Member) commented on Nov 11, 2023:
copy doesn't really work for objects that have PyMC models: see pymc-devs/pymc#6985

The approach is not too bad though. What I suggest is to create the idata once and then in this fixture recreate the model and glue-in a copy of the idata. I did something like that with a helper method in this PR: pymc-labs/pymc-marketing@44985a8

Check the _build_with_idata method and how that's used by thin_fit_result. Something similar could be used for a ModelBuilder.copy(), but for now you can just reimplement the logic in this fixture if you want.

Contributor Author replied:
I came up with a workaround for copying the model without using copy.deepcopy.

I also noticed that there's a test marked for skipping on win32 due to lack of permissions for temp files, but the marked test doesn't use a temp file. There is a different test that does use a temp file. I thought maybe the annotation got onto the wrong test, so I made a commit to fix that possible issue. If that's wrong or you want to handle it as its own issue, no problem, I'll take that commit back out.
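A rough sketch of what such a fixture could look like following the rebuild-and-reattach-idata suggestion above; this is an assumption for illustration, not the actual commit. It reuses the toy_X / toy_y fixtures and the test_ModelBuilder class from this module, and extra bookkeeping (e.g. fit flags or idata attrs) may also be needed:

```python
import copy

import pytest


@pytest.fixture
def fitted_model_instance(fitted_model_instance_base, toy_X, toy_y):
    base = fitted_model_instance_base
    # Rebuild a fresh PyMC model instead of deep-copying the fitted one...
    model = test_ModelBuilder(
        model_config=base.model_config, sampler_config=base.sampler_config
    )
    model.build_model(toy_X, toy_y)
    # ...and glue in a copy of the already-computed InferenceData.
    model.idata = copy.deepcopy(base.idata)
    return model
```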



class test_ModelBuilder(ModelBuilder):
def __init__(self, model_config=None, sampler_config=None, test_parameter=None):
self.test_parameter = test_parameter
@@ -131,8 +143,8 @@ def _generate_and_preprocess_model_data(
@staticmethod
def get_default_sampler_config() -> Dict:
return {
"draws": 1_000,
"tune": 1_000,
"draws": 10,
"tune": 10,
"chains": 3,
"target_accept": 0.95,
}
@@ -220,6 +232,41 @@ def test_sample_posterior_predictive(fitted_model_instance, combined):
assert np.issubdtype(pred[fitted_model_instance.output_var].dtype, np.floating)


@pytest.mark.parametrize("group", ["prior_predictive", "posterior_predictive"])
@pytest.mark.parametrize("extend_idata", [True, False])
def test_sample_xxx_extend_idata_param(fitted_model_instance, group, extend_idata):
output_var = fitted_model_instance.output_var
idata_prev = fitted_model_instance.idata[group][output_var]

# Since coordinates are provided, the dimension must match
n_pred = 100 # Must match toy_X
x_pred = np.random.uniform(0, 1, n_pred)

prediction_data = pd.DataFrame({"input": x_pred})
if group == "prior_predictive":
prediction_method = fitted_model_instance.sample_prior_predictive
else: # group == "posterior_predictive":
prediction_method = fitted_model_instance.sample_posterior_predictive

pred = prediction_method(prediction_data["input"], combined=False, extend_idata=extend_idata)

pred_unstacked = pred[output_var].values
idata_now = fitted_model_instance.idata[group][output_var].values

if extend_idata:
# After sampling, data in the model should be the same as the predictions
np.testing.assert_array_equal(idata_now, pred_unstacked)
# Data in the model should NOT be the same as before
if idata_now.shape == idata_prev.values.shape:
assert np.sum(np.abs(idata_now - idata_prev.values) < 1e-5) <= 2
else:
# After sampling, data in the model should be the same as it was before
np.testing.assert_array_equal(idata_now, idata_prev.values)
# Data in the model should NOT be the same as the predictions
if idata_now.shape == pred_unstacked.shape:
assert np.sum(np.abs(idata_now - pred_unstacked) < 1e-5) <= 2


def test_model_config_formatting():
model_config = {
"a": {