
Commit ad636bd

Merge branch 'main' into s-bias-bounds

2 parents: ebdc682 + 4ced5e3

14 files changed: +163 −190 lines

.coverage

68 KB
Binary file not shown.

doubleml/double_ml.py

Lines changed: 8 additions & 5 deletions

@@ -1109,7 +1109,7 @@ def evaluate_learners(self, learners=None, metric=_rmse):
         where ``n`` specifies the number of observations. Remark that some models like IRM are
         not able to provide all values for ``y_true`` for all learners and might contain
         some ``nan`` values in the target vector.
-        Default is the euclidean distance.
+        Default is the root-mean-square error.
 
         Returns
         -------
@@ -1130,10 +1130,13 @@ def evaluate_learners(self, learners=None, metric=_rmse):
         >>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd')
         >>> dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, ml_g, ml_m)
         >>> dml_irm_obj.fit()
-        >>> dml_irm_obj.evaluate_learners(metric=mean_absolute_error)
-        {'ml_g0': array([[1.13318973]]),
-        'ml_g1': array([[0.91659939]]),
-        'ml_m': array([[0.36350912]])}
+        >>> def mae(y_true, y_pred):
+        >>>     subset = np.logical_not(np.isnan(y_true))
+        >>>     return mean_absolute_error(y_true[subset], y_pred[subset])
+        >>> dml_irm_obj.evaluate_learners(metric=mae)
+        {'ml_g0': array([[0.85974356]]),
+        'ml_g1': array([[0.85280376]]),
+        'ml_m': array([[0.35365143]])}
         """
         # if no learners are provided try to evaluate all learners
         if learners is None:
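The updated docstring example masks ``nan`` targets before scoring. A self-contained sketch of that pattern; the seed, data-generating call, and learners below are illustrative stand-ins, not taken from this commit:

import numpy as np
import doubleml as dml
from doubleml.datasets import make_irm_data
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# IRM fits each outcome learner on one treatment arm only, so y_true
# contains nan entries that the metric has to mask before scoring.
def mae(y_true, y_pred):
    subset = np.logical_not(np.isnan(y_true))
    return mean_absolute_error(y_true[subset], y_pred[subset])

np.random.seed(3141)
data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type='DataFrame')
obj_dml_data = dml.DoubleMLData(data, 'y', 'd')
dml_irm_obj = dml.DoubleMLIRM(obj_dml_data,
                              ml_g=RandomForestRegressor(),
                              ml_m=RandomForestClassifier())
dml_irm_obj.fit()
print(dml_irm_obj.evaluate_learners(metric=mae))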

doubleml/double_ml_pliv.py

Lines changed: 0 additions & 46 deletions

@@ -283,17 +283,6 @@ def _check_data(self, obj_dml_data):
                              'use DoubleMLPLR instead of DoubleMLPLIV.')
         return
 
-    # To be removed in version 0.6.0
-    def set_ml_nuisance_params(self, learner, treat_var, params):
-        if isinstance(self.score, str) & (self.score == 'partialling out') & (learner == 'ml_g'):
-            warnings.warn(("Learner ml_g was renamed to ml_l. "
-                           "Please adapt the argument learner accordingly. "
-                           "The provided parameters are set for ml_l. "
-                           "The redirection will be removed in a future version."),
-                          DeprecationWarning, stacklevel=2)
-            learner = 'ml_l'
-        super(DoubleMLPLIV, self).set_ml_nuisance_params(learner, treat_var, params)
-
     def _nuisance_est(self, smpls, n_jobs_cv, return_models=False):
         if self.partialX & (not self.partialZ):
             psi_elements, preds = self._nuisance_est_partial_x(smpls, n_jobs_cv, return_models)
@@ -525,41 +514,6 @@ def _nuisance_est_partial_xz(self, smpls, n_jobs_cv, return_models=False):
 
         return psi_elements, preds
 
-    # To be removed in version 0.6.0
-    def tune(self,
-             param_grids,
-             tune_on_folds=False,
-             scoring_methods=None,  # if None the estimator's score method is used
-             n_folds_tune=5,
-             search_mode='grid_search',
-             n_iter_randomized_search=100,
-             n_jobs_cv=None,
-             set_as_params=True,
-             return_tune_res=False):
-
-        if isinstance(self.score, str) and (self.score == 'partialling out') and (param_grids is not None) and \
-                ('ml_g' in param_grids) and ('ml_l' not in param_grids):
-            warnings.warn(("Learner ml_g was renamed to ml_l. "
-                           "Please adapt the key of param_grids accordingly. "
-                           "The provided param_grids for ml_g are set for ml_l. "
-                           "The redirection will be removed in a future version."),
-                          DeprecationWarning, stacklevel=2)
-            param_grids['ml_l'] = param_grids.pop('ml_g')
-
-        if isinstance(self.score, str) and (self.score == 'partialling out') and (scoring_methods is not None) and \
-                ('ml_g' in scoring_methods) and ('ml_l' not in scoring_methods):
-            warnings.warn(("Learner ml_g was renamed to ml_l. "
-                           "Please adapt the key of scoring_methods accordingly. "
-                           "The provided scoring_methods for ml_g are set for ml_l. "
-                           "The redirection will be removed in a future version."),
-                          DeprecationWarning, stacklevel=2)
-            scoring_methods['ml_l'] = scoring_methods.pop('ml_g')
-
-        tune_res = super(DoubleMLPLIV, self).tune(param_grids, tune_on_folds, scoring_methods, n_folds_tune,
-                                                  search_mode, n_iter_randomized_search, n_jobs_cv, set_as_params,
-                                                  return_tune_res)
-        return tune_res
-
     def _nuisance_tuning_partial_x(self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv,
                                    search_mode, n_iter_randomized_search):
         x, y = check_X_y(self._dml_data.x, self._dml_data.y,
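With the redirection gone (here and in doubleml/double_ml_plr.py below), tuning grids and nuisance parameters must address the learner by its current name; an 'ml_g' key is no longer silently moved to 'ml_l'. A minimal sketch of the post-removal usage under the 'partialling out' score, with illustrative grid values:

import numpy as np
import doubleml as dml
from doubleml.datasets import make_pliv_CHS2015
from sklearn.ensemble import RandomForestRegressor

np.random.seed(3141)
dml_data = make_pliv_CHS2015(n_obs=200, dim_x=20, alpha=0.5, dim_z=1)
dml_pliv_obj = dml.DoubleMLPLIV(dml_data,
                                ml_l=RandomForestRegressor(),
                                ml_m=RandomForestRegressor(),
                                ml_r=RandomForestRegressor())
# Grids are keyed by the current learner names ml_l, ml_m, ml_r.
param_grids = {'ml_l': {'n_estimators': [5, 10]},
               'ml_m': {'n_estimators': [5, 10]},
               'ml_r': {'n_estimators': [5, 10]}}
dml_pliv_obj.tune(param_grids)
dml_pliv_obj.set_ml_nuisance_params('ml_l', 'd', {'n_estimators': 10})
dml_pliv_obj.fit()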

doubleml/double_ml_plr.py

Lines changed: 0 additions & 45 deletions

@@ -166,17 +166,6 @@ def _check_data(self, obj_dml_data):
                              'To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR.')
         return
 
-    # To be removed in version 0.6.0
-    def set_ml_nuisance_params(self, learner, treat_var, params):
-        if isinstance(self.score, str) & (self.score == 'partialling out') & (learner == 'ml_g'):
-            warnings.warn(("Learner ml_g was renamed to ml_l. "
-                           "Please adapt the argument learner accordingly. "
-                           "The provided parameters are set for ml_l. "
-                           "The redirection will be removed in a future version."),
-                          DeprecationWarning, stacklevel=2)
-            learner = 'ml_l'
-        super(DoubleMLPLR, self).set_ml_nuisance_params(learner, treat_var, params)
-
     def _nuisance_est(self, smpls, n_jobs_cv, return_models=False):
         x, y = check_X_y(self._dml_data.x, self._dml_data.y,
                          force_all_finite=False)
@@ -255,40 +244,6 @@ def _score_elements(self, y, d, l_hat, m_hat, g_hat, smpls):
 
         return psi_a, psi_b
 
-    # To be removed in version 0.6.0
-    def tune(self,
-             param_grids,
-             tune_on_folds=False,
-             scoring_methods=None,  # if None the estimator's score method is used
-             n_folds_tune=5,
-             search_mode='grid_search',
-             n_iter_randomized_search=100,
-             n_jobs_cv=None,
-             set_as_params=True,
-             return_tune_res=False):
-
-        if isinstance(self.score, str) and (self.score == 'partialling out') and (param_grids is not None) and \
-                ('ml_g' in param_grids) and ('ml_l' not in param_grids):
-            warnings.warn(("Learner ml_g was renamed to ml_l. "
-                           "Please adapt the key of param_grids accordingly. "
-                           "The provided param_grids for ml_g are set for ml_l. "
-                           "The redirection will be removed in a future version."),
-                          DeprecationWarning, stacklevel=2)
-            param_grids['ml_l'] = param_grids.pop('ml_g')
-
-        if isinstance(self.score, str) and (self.score == 'partialling out') and (scoring_methods is not None) and \
-                ('ml_g' in scoring_methods) and ('ml_l' not in scoring_methods):
-            warnings.warn(("Learner ml_g was renamed to ml_l. "
-                           "Please adapt the key of scoring_methods accordingly. "
-                           "The provided scoring_methods for ml_g are set for ml_l. "
-                           "The redirection will be removed in a future version."),
-                          DeprecationWarning, stacklevel=2)
-            scoring_methods['ml_l'] = scoring_methods.pop('ml_g')
-
-        tune_res = super(DoubleMLPLR, self).tune(param_grids, tune_on_folds, scoring_methods, n_folds_tune, search_mode,
-                                                 n_iter_randomized_search, n_jobs_cv, set_as_params, return_tune_res)
-        return tune_res
-
     def _sensitivity_element_est(self, preds):
         # set elments for readability
         y = self._dml_data.y

doubleml/tests/_utils_lpq_manual.py

Lines changed: 7 additions & 6 deletions

@@ -11,6 +11,7 @@ def fit_lpq(y, x, d, z, quantile,
             learner_g, learner_m, all_smpls, treatment, dml_procedure, n_rep=1,
             trimming_rule='truncate',
             trimming_threshold=1e-2,
+            kde=_default_kde,
             normalize_ipw=True, m_z_params=None,
             m_d_z0_params=None, m_d_z1_params=None,
             g_du_z0_params=None, g_du_z1_params=None):
@@ -37,10 +38,10 @@ def fit_lpq(y, x, d, z, quantile,
                             g_du_z1_params=g_du_z1_params)
         if dml_procedure == 'dml1':
             lpqs[i_rep], ses[i_rep] = lpq_dml1(y, d, z, m_z_hat, g_du_z0_hat, g_du_z1_hat, comp_prob_hat,
-                                               treatment, quantile, ipw_vec, coef_bounds, smpls)
+                                               treatment, quantile, ipw_vec, coef_bounds, smpls, kde)
         else:
             lpqs[i_rep], ses[i_rep] = lpq_dml2(y, d, z, m_z_hat, g_du_z0_hat, g_du_z1_hat, comp_prob_hat,
-                                               treatment, quantile, ipw_vec, coef_bounds)
+                                               treatment, quantile, ipw_vec, coef_bounds, kde)
 
     lpq = np.median(lpqs)
     se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(lpqs - lpq, 2)) / n_obs)
@@ -200,7 +201,7 @@ def ipw_score(theta):
     return m_z_hat, g_du_z0_hat, g_du_z1_hat, comp_prob_hat, ipw_vec, coef_bounds
 
 
-def lpq_dml1(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds, smpls):
+def lpq_dml1(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds, smpls, kde):
    thetas = np.zeros(len(smpls))
    n_obs = len(y)
    ipw_est = ipw_vec.mean()
@@ -211,17 +212,17 @@ def lpq_dml1(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw
 
     theta_hat = np.mean(thetas)
 
-    se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs))
+    se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs, kde))
 
     return theta_hat, se
 
 
-def lpq_dml2(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds):
+def lpq_dml2(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds, kde):
     n_obs = len(y)
     ipw_est = ipw_vec.mean()
     theta_hat = lpq_est(m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, ipw_est, coef_bounds)
 
-    se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs))
+    se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs, kde))
 
     return theta_hat, se
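The kde hook threaded through these helpers defaults to _default_kde. A sketch of a drop-in replacement, assuming (as in the DoubleML quantile models) that the callable receives residuals u and weights and returns the estimated density at zero:

import numpy as np
from scipy.stats import gaussian_kde

# Hypothetical alternative kde; assumes the expected signature is
# kde(u, weights) -> weighted density estimate of u evaluated at 0.
def custom_kde(u, weights):
    dens = gaussian_kde(u, bw_method='silverman', weights=weights)
    return dens.evaluate(0)

# Illustrative standalone check; in the test utility above it would be
# passed as fit_lpq(..., kde=custom_kde).
u = np.random.normal(size=500)
print(custom_kde(u, np.ones(500)))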

doubleml/tests/test_blp.py

Lines changed: 8 additions & 2 deletions

@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+import copy
 
 import doubleml as dml
 
@@ -26,7 +27,10 @@ def dml_blp_fixture(ci_joint, ci_level):
     random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 3)))
     random_signal = np.random.normal(0, 1, size=(n, ))
 
-    blp = dml.DoubleMLBLP(random_signal, random_basis).fit()
+    blp = dml.DoubleMLBLP(random_signal, random_basis)
+
+    blp_obj = copy.copy(blp)
+    blp.fit()
     blp_manual = fit_blp(random_signal, random_basis)
 
     np.random.seed(42)
@@ -47,7 +51,8 @@ def dml_blp_fixture(ci_joint, ci_level):
            'ci_1': ci_1,
            'ci_2': ci_2,
            'ci_manual': ci_manual,
-           'blp_model': blp}
+           'blp_model': blp,
+           'unfitted_blp_model': blp_obj}
 
     return res_dict
 
@@ -91,6 +96,7 @@ def test_dml_blp_ci_2(dml_blp_fixture):
 def test_dml_blp_return_types(dml_blp_fixture):
     assert isinstance(dml_blp_fixture['blp_model'].__str__(), str)
     assert isinstance(dml_blp_fixture['blp_model'].summary, pd.DataFrame)
+    assert isinstance(dml_blp_fixture['unfitted_blp_model'].summary, pd.DataFrame)
 
 
 @pytest.mark.ci

doubleml/tests/test_cvar_tune.py

Lines changed: 0 additions & 5 deletions

@@ -3,8 +3,6 @@
 import math
 
 from sklearn.base import clone
-
-from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 
 import doubleml as dml
@@ -58,9 +56,6 @@ def tune_on_folds(request):
 def get_par_grid(learner):
     if learner.__class__ in [RandomForestRegressor, RandomForestClassifier]:
         par_grid = {'n_estimators': [5, 10, 15, 20]}
-    else:
-        assert learner.__class__ in [LogisticRegression]
-        par_grid = {'C': np.logspace(-4, 2, 10)}
     return par_grid
 
 

doubleml/tests/test_dml_data.py

Lines changed: 29 additions & 1 deletion

@@ -5,9 +5,27 @@
 from doubleml import DoubleMLData, DoubleMLPLR, DoubleMLClusterData, DoubleMLDIDCS
 from doubleml.datasets import make_plr_CCDDHNR2018, _make_pliv_data, make_pliv_CHS2015,\
     make_pliv_multiway_cluster_CKMS2021, make_did_SZ2020
+from doubleml.double_ml_data import DoubleMLBaseData
+
 from sklearn.linear_model import Lasso, LogisticRegression
 
 
+class DummyDataClass(DoubleMLBaseData):
+    def __init__(self, data):
+        DoubleMLBaseData.__init__(self, data)
+
+    @property
+    def n_coefs(self):
+        return 1
+
+
+@pytest.mark.ci
+def test_doubleml_basedata():
+    dummy_dml_data = DummyDataClass(pd.DataFrame(np.zeros((100, 10))))
+    assert dummy_dml_data.d_cols[0] == 'theta'
+    assert dummy_dml_data.n_treat == 1
+    assert dummy_dml_data.n_coefs == 1
+
 @pytest.fixture(scope="module")
 def dml_data_fixture(generate_data1):
     data = generate_data1
@@ -157,12 +175,22 @@ def test_dml_data_no_instr_no_time():
 
 
 @pytest.mark.ci
-def test_dml_cluster_summary_with_time():
+def test_dml_summary_with_time():
     dml_data_did_cs = make_did_SZ2020(n_obs=200, cross_sectional_data=True)
     dml_did_cs = DoubleMLDIDCS(dml_data_did_cs, Lasso(), LogisticRegression())
     assert isinstance(dml_did_cs.__str__(), str)
     assert isinstance(dml_did_cs.summary, pd.DataFrame)
 
+    dml_data = make_plr_CCDDHNR2018(n_obs=100)
+    df = dml_data.data.copy().iloc[:, :11]
+    df.columns = [f'X{i + 1}' for i in np.arange(8)] + ['y', 'd1', 'd2']
+    print(df)
+    dml_data = DoubleMLClusterData(df, 'y', ['d1', 'd2'],
+                                   cluster_cols=[f'X{i + 1}' for i in [5, 6]],
+                                   x_cols=[f'X{i + 1}' for i in np.arange(5)],
+                                   t_col='X8')
+    assert isinstance(dml_data._data_summary_str(), str)
+
 
 @pytest.mark.ci
 def test_x_cols_setter_defaults():
