Set data when building Linearmodel (#249)

pdb5627 · ricardoV94 · web-flow · commit a362ff03e8ae · 2023-11-13T12:00:57.000+01:00
* Create test of fitted parameter values

* Set data when creating model

* Clean up sample_prior_predictive

Since model_builder must create model object and set data, data only
needs to be set if model object already exists and no need to check for
model object before sampling.

* Apply suggestions from code review

Co-authored-by: Ricardo Vieira &lt;28983449+ricardoV94@users.noreply.github.com&gt;

* Add join=right to extend calls

* Update test_parameter_fit based on comments from PR review

Use np.testing.assert_allclose instead of pytest.approx

Make fitted_model use fewer samples for fitting and do a
separate model and fit for checking parameter recovery.

* Add tests for extend_idata parameters

* Update based on code review comments

Make copies of `fitted_model_instance` to keep tests from interfering
with each other.

Combine `test_sample_prior_extend_idata_param` and
`test_sample_posterior_extend_idata_param` to reduce code repetition.

* Work around `deepcopy` not working for model objects

* Mark correct test for skipping on win32

---------

Co-authored-by: Ricardo Vieira &lt;28983449+ricardoV94@users.noreply.github.com&gt;
diff --git a/pymc_experimental/linearmodel.py b/pymc_experimental/linearmodel.py
@@ -67,8 +67,6 @@ def build_model(self, X: pd.DataFrame, y: pd.Series):
         """
         cfg = self.model_config
 
-        # The model is built with placeholder data.
-        # Actual data will be set by _data_setter when fitting or evaluating the model.
         # Data array size can change but number of dimensions must stay the same.
         with pm.Model() as self.model:
             x = pm.MutableData("x", np.zeros((1,)), dims="observation")
@@ -94,6 +92,8 @@ def build_model(self, X: pd.DataFrame, y: pd.Series):
                 dims="observation",
             )
 
+        self._data_setter(X, y)
+
     def _data_setter(self, X: pd.DataFrame, y: Optional[Union[pd.DataFrame, pd.Series]] = None):
         with self.model:
             pm.set_data({"x": X.squeeze()})
diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py
@@ -299,8 +299,8 @@ def sample_model(self, **kwargs):
         with self.model:
             sampler_args = {**self.sampler_config, **kwargs}
             idata = pm.sample(**sampler_args)
-            idata.extend(pm.sample_prior_predictive())
-            idata.extend(pm.sample_posterior_predictive(idata))
+            idata.extend(pm.sample_prior_predictive(), join="right")
+            idata.extend(pm.sample_posterior_predictive(idata), join="right")
 
         idata = self.set_idata_attrs(idata)
         return idata
@@ -595,23 +595,22 @@ def sample_prior_predictive(
             Prior predictive samples for each input X_pred
         """
         if y_pred is None:
-            y_pred = np.zeros(len(X_pred))
+            y_pred = pd.Series(np.zeros(len(X_pred)), name=self.output_var)
         if samples is None:
             samples = self.sampler_config.get("draws", 500)
 
         if self.model is None:
             self.build_model(X_pred, y_pred)
-
-        self._data_setter(X_pred, y_pred)
-        if self.model is not None:
-            with self.model:  # sample with new input data
-                prior_pred: az.InferenceData = pm.sample_prior_predictive(samples, **kwargs)
-                self.set_idata_attrs(prior_pred)
-                if extend_idata:
-                    if self.idata is not None:
-                        self.idata.extend(prior_pred)
-                    else:
-                        self.idata = prior_pred
+        else:
+            self._data_setter(X_pred, y_pred)
+        with self.model:  # sample with new input data
+            prior_pred: az.InferenceData = pm.sample_prior_predictive(samples, **kwargs)
+            self.set_idata_attrs(prior_pred)
+            if extend_idata:
+                if self.idata is not None:
+                    self.idata.extend(prior_pred, join="right")
+                else:
+                    self.idata = prior_pred
 
         prior_predictive_samples = az.extract(prior_pred, "prior_predictive", combined=combined)
 
@@ -641,7 +640,7 @@ def sample_posterior_predictive(self, X_pred, extend_idata, combined, **kwargs):
         with self.model:  # sample with new input data
             post_pred = pm.sample_posterior_predictive(self.idata, **kwargs)
             if extend_idata:
-                self.idata.extend(post_pred)
+                self.idata.extend(post_pred, join="right")
 
         posterior_predictive_samples = az.extract(
             post_pred, "posterior_predictive", combined=combined
diff --git a/pymc_experimental/tests/test_linearmodel.py b/pymc_experimental/tests/test_linearmodel.py
@@ -36,6 +36,15 @@
     sklearn_available = False
 
 
+@pytest.fixture(scope="module")
+def toy_actual_params():
+    return {
+        "intercept": 3,
+        "slope": 5,
+        "obs_error": 0.5,
+    }
+
+
 @pytest.fixture(scope="module")
 def toy_X():
     x = np.linspace(start=0, stop=1, num=100)
@@ -44,23 +53,24 @@ def toy_X():
 
 
 @pytest.fixture(scope="module")
-def toy_y(toy_X):
-    y = 5 * toy_X["input"] + 3
-    y = y + np.random.normal(0, 1, size=len(toy_X))
+def toy_y(toy_X, toy_actual_params):
+    y = toy_actual_params["slope"] * toy_X["input"] + toy_actual_params["intercept"]
+    rng = np.random.default_rng(427)
+    y = y + rng.normal(0, toy_actual_params["obs_error"], size=len(toy_X))
     y = pd.Series(y, name="output")
     return y
 
 
 @pytest.fixture(scope="module")
 def fitted_linear_model_instance(toy_X, toy_y):
     sampler_config = {
-        "draws": 500,
-        "tune": 300,
+        "draws": 50,
+        "tune": 30,
         "chains": 2,
         "target_accept": 0.95,
     }
     model = LinearModel(sampler_config=sampler_config)
-    model.fit(toy_X, toy_y)
+    model.fit(toy_X, toy_y, random_seed=312)
     return model
 
 
@@ -101,6 +111,23 @@ def test_fit(fitted_linear_model_instance):
     assert isinstance(post_pred, xr.DataArray)
 
 
+def test_parameter_fit(toy_X, toy_y, toy_actual_params):
+    """Check that the fit model recovered the data-generating parameters."""
+    # Fit the model with a sufficient number of samples
+    sampler_config = {
+        "draws": 500,
+        "tune": 300,
+        "chains": 2,
+        "target_accept": 0.95,
+    }
+    model = LinearModel(sampler_config=sampler_config)
+    model.fit(toy_X, toy_y, random_seed=312)
+    fit_params = model.idata.posterior.mean()
+    np.testing.assert_allclose(fit_params["intercept"], toy_actual_params["intercept"], rtol=0.1)
+    np.testing.assert_allclose(fit_params["slope"], toy_actual_params["slope"], rtol=0.1)
+    np.testing.assert_allclose(fit_params["σ_model_fmc"], toy_actual_params["obs_error"], rtol=0.1)
+
+
 def test_predict(fitted_linear_model_instance):
     model = fitted_linear_model_instance
     X_pred = pd.DataFrame({"input": np.random.uniform(low=0, high=1, size=100)})
diff --git a/pymc_experimental/tests/test_model_builder.py b/pymc_experimental/tests/test_model_builder.py
@@ -41,11 +41,13 @@ def toy_y(toy_X):
     return y
 
 
-@pytest.fixture(scope="module")
-def fitted_model_instance(toy_X, toy_y):
+def get_unfitted_model_instance(X, y):
+    """Creates an unfitted model instance to which idata can be copied in
+    and then used as a fitted model instance. That way a fitted model
+    can be used multiple times without having to run `fit` multiple times."""
     sampler_config = {
-        "draws": 100,
-        "tune": 100,
+        "draws": 20,
+        "tune": 10,
         "chains": 2,
         "target_accept": 0.95,
     }
@@ -57,7 +59,30 @@ def fitted_model_instance(toy_X, toy_y):
     model = test_ModelBuilder(
         model_config=model_config, sampler_config=sampler_config, test_parameter="test_paramter"
     )
-    model.fit(toy_X)
+    # Do the things that `model.fit` does except sample to create idata.
+    model._generate_and_preprocess_model_data(X, y.values.flatten())
+    model.build_model(X, y)
+    return model
+
+
+@pytest.fixture(scope="module")
+def fitted_model_instance_base(toy_X, toy_y):
+    """Because fitting takes a relatively long time, this is intended to
+    be used only once and then have new instances created and fit data patched in
+    for tests that use a fitted model instance. Tests should use
+    `fitted_model_instance` instead of this."""
+    model = get_unfitted_model_instance(toy_X, toy_y)
+    model.fit(toy_X, toy_y)
+    return model
+
+
+@pytest.fixture
+def fitted_model_instance(toy_X, toy_y, fitted_model_instance_base):
+    """Get a fitted model instance. A new instance is created and fit data is
+    patched in, so tests using this fixture can modify the model object without
+    affecting other tests."""
+    model = get_unfitted_model_instance(toy_X, toy_y)
+    model.idata = fitted_model_instance_base.idata.copy()
     return model
 
 
@@ -131,8 +156,8 @@ def _generate_and_preprocess_model_data(
     @staticmethod
     def get_default_sampler_config() -> Dict:
         return {
-            "draws": 1_000,
-            "tune": 1_000,
+            "draws": 10,
+            "tune": 10,
             "chains": 3,
             "target_accept": 0.95,
         }
@@ -142,6 +167,9 @@ def test_save_input_params(fitted_model_instance):
     assert fitted_model_instance.idata.attrs["test_paramter"] == '"test_paramter"'
 
 
+@pytest.mark.skipif(
+    sys.platform == "win32", reason="Permissions for temp files not granted on windows CI."
+)
 def test_save_load(fitted_model_instance):
     temp = tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False)
     fitted_model_instance.save(temp.name)
@@ -193,9 +221,6 @@ def test_fit_no_y(toy_X):
     assert "posterior" in model_builder.idata.groups()
 
 
-@pytest.mark.skipif(
-    sys.platform == "win32", reason="Permissions for temp files not granted on windows CI."
-)
 def test_predict(fitted_model_instance):
     x_pred = np.random.uniform(low=0, high=1, size=100)
     prediction_data = pd.DataFrame({"input": x_pred})
@@ -220,6 +245,41 @@ def test_sample_posterior_predictive(fitted_model_instance, combined):
     assert np.issubdtype(pred[fitted_model_instance.output_var].dtype, np.floating)
 
 
+@pytest.mark.parametrize("group", ["prior_predictive", "posterior_predictive"])
+@pytest.mark.parametrize("extend_idata", [True, False])
+def test_sample_xxx_extend_idata_param(fitted_model_instance, group, extend_idata):
+    output_var = fitted_model_instance.output_var
+    idata_prev = fitted_model_instance.idata[group][output_var]
+
+    # Since coordinates are provided, the dimension must match
+    n_pred = 100  # Must match toy_x
+    x_pred = np.random.uniform(0, 1, n_pred)
+
+    prediction_data = pd.DataFrame({"input": x_pred})
+    if group == "prior_predictive":
+        prediction_method = fitted_model_instance.sample_prior_predictive
+    else:  # group == "posterior_predictive":
+        prediction_method = fitted_model_instance.sample_posterior_predictive
+
+    pred = prediction_method(prediction_data["input"], combined=False, extend_idata=extend_idata)
+
+    pred_unstacked = pred[output_var].values
+    idata_now = fitted_model_instance.idata[group][output_var].values
+
+    if extend_idata:
+        # After sampling, data in the model should be the same as the predictions
+        np.testing.assert_array_equal(idata_now, pred_unstacked)
+        # Data in the model should NOT be the same as before
+        if idata_now.shape == idata_prev.values.shape:
+            assert np.sum(np.abs(idata_now - idata_prev.values) < 1e-5) <= 2
+    else:
+        # After sampling, data in the model should be the same as it was before
+        np.testing.assert_array_equal(idata_now, idata_prev.values)
+        # Data in the model should NOT be the same as the predictions
+        if idata_now.shape == pred_unstacked.shape:
+            assert np.sum(np.abs(idata_now - pred_unstacked) < 1e-5) <= 2
+
+
 def test_model_config_formatting():
     model_config = {
         "a": {