Skip to content

Commit a362ff0

Browse files
pdb5627ricardoV94
andauthored
Set data when building Linearmodel (#249)
* Create test of fitted parameter values * Set data when creating model * Clean up sample_prior_predictive Since model_builder must create model object and set data, data only needs to be set if model object already exists and no need to check for model object before sampling. * Apply suggestions from code review Co-authored-by: Ricardo Vieira <28983449+ricardoV94@users.noreply.github.com> * Add join=right to extend calls * Update test_parameter_fit based on comments from PR review Use np.testing.assert_allclose instead of pytest.approx Make fitted_model use fewer samples for fitting and do a separate model and fit for checking parameter recovery. * Add tests for extend_idata parameters * Update based on code review comments Make copies of `fitted_model_instance` to keep tests from interfering with each other. Combine `test_sample_prior_extend_idata_param` and `test_sample_posterior_extend_idata_param` to reduce code repetition. * Work around `deepcopy` not working for model objects * Mark correct test for skipping on win32 --------- Co-authored-by: Ricardo Vieira <28983449+ricardoV94@users.noreply.github.com>
1 parent 6d8f569 commit a362ff0

File tree

4 files changed

+119
-33
lines changed

4 files changed

+119
-33
lines changed

pymc_experimental/linearmodel.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,6 @@ def build_model(self, X: pd.DataFrame, y: pd.Series):
6767
"""
6868
cfg = self.model_config
6969

70-
# The model is built with placeholder data.
71-
# Actual data will be set by _data_setter when fitting or evaluating the model.
7270
# Data array size can change but number of dimensions must stay the same.
7371
with pm.Model() as self.model:
7472
x = pm.MutableData("x", np.zeros((1,)), dims="observation")
@@ -94,6 +92,8 @@ def build_model(self, X: pd.DataFrame, y: pd.Series):
9492
dims="observation",
9593
)
9694

95+
self._data_setter(X, y)
96+
9797
def _data_setter(self, X: pd.DataFrame, y: Optional[Union[pd.DataFrame, pd.Series]] = None):
9898
with self.model:
9999
pm.set_data({"x": X.squeeze()})

pymc_experimental/model_builder.py

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -299,8 +299,8 @@ def sample_model(self, **kwargs):
299299
with self.model:
300300
sampler_args = {**self.sampler_config, **kwargs}
301301
idata = pm.sample(**sampler_args)
302-
idata.extend(pm.sample_prior_predictive())
303-
idata.extend(pm.sample_posterior_predictive(idata))
302+
idata.extend(pm.sample_prior_predictive(), join="right")
303+
idata.extend(pm.sample_posterior_predictive(idata), join="right")
304304

305305
idata = self.set_idata_attrs(idata)
306306
return idata
@@ -595,23 +595,22 @@ def sample_prior_predictive(
595595
Prior predictive samples for each input X_pred
596596
"""
597597
if y_pred is None:
598-
y_pred = np.zeros(len(X_pred))
598+
y_pred = pd.Series(np.zeros(len(X_pred)), name=self.output_var)
599599
if samples is None:
600600
samples = self.sampler_config.get("draws", 500)
601601

602602
if self.model is None:
603603
self.build_model(X_pred, y_pred)
604-
605-
self._data_setter(X_pred, y_pred)
606-
if self.model is not None:
607-
with self.model: # sample with new input data
608-
prior_pred: az.InferenceData = pm.sample_prior_predictive(samples, **kwargs)
609-
self.set_idata_attrs(prior_pred)
610-
if extend_idata:
611-
if self.idata is not None:
612-
self.idata.extend(prior_pred)
613-
else:
614-
self.idata = prior_pred
604+
else:
605+
self._data_setter(X_pred, y_pred)
606+
with self.model: # sample with new input data
607+
prior_pred: az.InferenceData = pm.sample_prior_predictive(samples, **kwargs)
608+
self.set_idata_attrs(prior_pred)
609+
if extend_idata:
610+
if self.idata is not None:
611+
self.idata.extend(prior_pred, join="right")
612+
else:
613+
self.idata = prior_pred
615614

616615
prior_predictive_samples = az.extract(prior_pred, "prior_predictive", combined=combined)
617616

@@ -641,7 +640,7 @@ def sample_posterior_predictive(self, X_pred, extend_idata, combined, **kwargs):
641640
with self.model: # sample with new input data
642641
post_pred = pm.sample_posterior_predictive(self.idata, **kwargs)
643642
if extend_idata:
644-
self.idata.extend(post_pred)
643+
self.idata.extend(post_pred, join="right")
645644

646645
posterior_predictive_samples = az.extract(
647646
post_pred, "posterior_predictive", combined=combined

pymc_experimental/tests/test_linearmodel.py

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,15 @@
3636
sklearn_available = False
3737

3838

39+
@pytest.fixture(scope="module")
40+
def toy_actual_params():
41+
return {
42+
"intercept": 3,
43+
"slope": 5,
44+
"obs_error": 0.5,
45+
}
46+
47+
3948
@pytest.fixture(scope="module")
4049
def toy_X():
4150
x = np.linspace(start=0, stop=1, num=100)
@@ -44,23 +53,24 @@ def toy_X():
4453

4554

4655
@pytest.fixture(scope="module")
47-
def toy_y(toy_X):
48-
y = 5 * toy_X["input"] + 3
49-
y = y + np.random.normal(0, 1, size=len(toy_X))
56+
def toy_y(toy_X, toy_actual_params):
57+
y = toy_actual_params["slope"] * toy_X["input"] + toy_actual_params["intercept"]
58+
rng = np.random.default_rng(427)
59+
y = y + rng.normal(0, toy_actual_params["obs_error"], size=len(toy_X))
5060
y = pd.Series(y, name="output")
5161
return y
5262

5363

5464
@pytest.fixture(scope="module")
5565
def fitted_linear_model_instance(toy_X, toy_y):
5666
sampler_config = {
57-
"draws": 500,
58-
"tune": 300,
67+
"draws": 50,
68+
"tune": 30,
5969
"chains": 2,
6070
"target_accept": 0.95,
6171
}
6272
model = LinearModel(sampler_config=sampler_config)
63-
model.fit(toy_X, toy_y)
73+
model.fit(toy_X, toy_y, random_seed=312)
6474
return model
6575

6676

@@ -101,6 +111,23 @@ def test_fit(fitted_linear_model_instance):
101111
assert isinstance(post_pred, xr.DataArray)
102112

103113

114+
def test_parameter_fit(toy_X, toy_y, toy_actual_params):
115+
"""Check that the fit model recovered the data-generating parameters."""
116+
# Fit the model with a sufficient number of samples
117+
sampler_config = {
118+
"draws": 500,
119+
"tune": 300,
120+
"chains": 2,
121+
"target_accept": 0.95,
122+
}
123+
model = LinearModel(sampler_config=sampler_config)
124+
model.fit(toy_X, toy_y, random_seed=312)
125+
fit_params = model.idata.posterior.mean()
126+
np.testing.assert_allclose(fit_params["intercept"], toy_actual_params["intercept"], rtol=0.1)
127+
np.testing.assert_allclose(fit_params["slope"], toy_actual_params["slope"], rtol=0.1)
128+
np.testing.assert_allclose(fit_params["σ_model_fmc"], toy_actual_params["obs_error"], rtol=0.1)
129+
130+
104131
def test_predict(fitted_linear_model_instance):
105132
model = fitted_linear_model_instance
106133
X_pred = pd.DataFrame({"input": np.random.uniform(low=0, high=1, size=100)})

pymc_experimental/tests/test_model_builder.py

Lines changed: 70 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,13 @@ def toy_y(toy_X):
4141
return y
4242

4343

44-
@pytest.fixture(scope="module")
45-
def fitted_model_instance(toy_X, toy_y):
44+
def get_unfitted_model_instance(X, y):
45+
"""Creates an unfitted model instance to which idata can be copied in
46+
and then used as a fitted model instance. That way a fitted model
47+
can be used multiple times without having to run `fit` multiple times."""
4648
sampler_config = {
47-
"draws": 100,
48-
"tune": 100,
49+
"draws": 20,
50+
"tune": 10,
4951
"chains": 2,
5052
"target_accept": 0.95,
5153
}
@@ -57,7 +59,30 @@ def fitted_model_instance(toy_X, toy_y):
5759
model = test_ModelBuilder(
5860
model_config=model_config, sampler_config=sampler_config, test_parameter="test_paramter"
5961
)
60-
model.fit(toy_X)
62+
# Do the things that `model.fit` does except sample to create idata.
63+
model._generate_and_preprocess_model_data(X, y.values.flatten())
64+
model.build_model(X, y)
65+
return model
66+
67+
68+
@pytest.fixture(scope="module")
69+
def fitted_model_instance_base(toy_X, toy_y):
70+
"""Because fitting takes a relatively long time, this is intended to
71+
be used only once and then have new instances created and fit data patched in
72+
for tests that use a fitted model instance. Tests should use
73+
`fitted_model_instance` instead of this."""
74+
model = get_unfitted_model_instance(toy_X, toy_y)
75+
model.fit(toy_X, toy_y)
76+
return model
77+
78+
79+
@pytest.fixture
80+
def fitted_model_instance(toy_X, toy_y, fitted_model_instance_base):
81+
"""Get a fitted model instance. A new instance is created and fit data is
82+
patched in, so tests using this fixture can modify the model object without
83+
affecting other tests."""
84+
model = get_unfitted_model_instance(toy_X, toy_y)
85+
model.idata = fitted_model_instance_base.idata.copy()
6186
return model
6287

6388

@@ -131,8 +156,8 @@ def _generate_and_preprocess_model_data(
131156
@staticmethod
132157
def get_default_sampler_config() -> Dict:
133158
return {
134-
"draws": 1_000,
135-
"tune": 1_000,
159+
"draws": 10,
160+
"tune": 10,
136161
"chains": 3,
137162
"target_accept": 0.95,
138163
}
@@ -142,6 +167,9 @@ def test_save_input_params(fitted_model_instance):
142167
assert fitted_model_instance.idata.attrs["test_paramter"] == '"test_paramter"'
143168

144169

170+
@pytest.mark.skipif(
171+
sys.platform == "win32", reason="Permissions for temp files not granted on windows CI."
172+
)
145173
def test_save_load(fitted_model_instance):
146174
temp = tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False)
147175
fitted_model_instance.save(temp.name)
@@ -193,9 +221,6 @@ def test_fit_no_y(toy_X):
193221
assert "posterior" in model_builder.idata.groups()
194222

195223

196-
@pytest.mark.skipif(
197-
sys.platform == "win32", reason="Permissions for temp files not granted on windows CI."
198-
)
199224
def test_predict(fitted_model_instance):
200225
x_pred = np.random.uniform(low=0, high=1, size=100)
201226
prediction_data = pd.DataFrame({"input": x_pred})
@@ -220,6 +245,41 @@ def test_sample_posterior_predictive(fitted_model_instance, combined):
220245
assert np.issubdtype(pred[fitted_model_instance.output_var].dtype, np.floating)
221246

222247

248+
@pytest.mark.parametrize("group", ["prior_predictive", "posterior_predictive"])
249+
@pytest.mark.parametrize("extend_idata", [True, False])
250+
def test_sample_xxx_extend_idata_param(fitted_model_instance, group, extend_idata):
251+
output_var = fitted_model_instance.output_var
252+
idata_prev = fitted_model_instance.idata[group][output_var]
253+
254+
# Since coordinates are provided, the dimension must match
255+
n_pred = 100 # Must match toy_x
256+
x_pred = np.random.uniform(0, 1, n_pred)
257+
258+
prediction_data = pd.DataFrame({"input": x_pred})
259+
if group == "prior_predictive":
260+
prediction_method = fitted_model_instance.sample_prior_predictive
261+
else: # group == "posterior_predictive":
262+
prediction_method = fitted_model_instance.sample_posterior_predictive
263+
264+
pred = prediction_method(prediction_data["input"], combined=False, extend_idata=extend_idata)
265+
266+
pred_unstacked = pred[output_var].values
267+
idata_now = fitted_model_instance.idata[group][output_var].values
268+
269+
if extend_idata:
270+
# After sampling, data in the model should be the same as the predictions
271+
np.testing.assert_array_equal(idata_now, pred_unstacked)
272+
# Data in the model should NOT be the same as before
273+
if idata_now.shape == idata_prev.values.shape:
274+
assert np.sum(np.abs(idata_now - idata_prev.values) < 1e-5) <= 2
275+
else:
276+
# After sampling, data in the model should be the same as it was before
277+
np.testing.assert_array_equal(idata_now, idata_prev.values)
278+
# Data in the model should NOT be the same as the predictions
279+
if idata_now.shape == pred_unstacked.shape:
280+
assert np.sum(np.abs(idata_now - pred_unstacked) < 1e-5) <= 2
281+
282+
223283
def test_model_config_formatting():
224284
model_config = {
225285
"a": {

0 commit comments

Comments
 (0)