Draft pull request: 38 commits
df03887 Logistic regression implementation WIP (jer2ig, Jan 13, 2025)
f5521f1 First WIP of implementation (jer2ig, Jan 27, 2025)
bfa756c Working implementation. Started on test set-up. (jer2ig, Feb 21, 2025)
d729d0a Changed data type of arrays (jer2ig, Feb 27, 2025)
8fe7ca6 Fix variable name (jer2ig, Feb 27, 2025)
18bac23 Moved into plm folder, started testing setup (jer2ig, Aug 27, 2025)
c6e600d Fixed bug in score computation (jer2ig, Aug 27, 2025)
6f556e0 Reverted from ensure_all_finite to force_all_finite (jer2ig, Aug 27, 2025)
3a332bf Fixes to instrument score (jer2ig, Aug 28, 2025)
b41a773 Added option for exception on convergence failure (jer2ig, Sep 3, 2025)
c434667 Added unbalanced dataset option, bug fixes (jer2ig, Sep 29, 2025)
443d82d Added binary treatment dataset, fixed bug for model check (jer2ig, Oct 7, 2025)
774c74d Adjusted dataset balancing (jer2ig, Oct 7, 2025)
9695820 Renamed Logistic to LPLR (jer2ig, Oct 27, 2025)
dbfea73 Clean-up of branch (jer2ig, Oct 27, 2025)
29114ce Ruff checks and formatting (jer2ig, Oct 27, 2025)
5d2d1ed Unit tests work and bug fix in lplr (jer2ig, Oct 28, 2025)
2c626a0 Cleanup (jer2ig, Oct 28, 2025)
9819436 Tests updated (jer2ig, Nov 6, 2025)
5a7e279 Pre-commit checks (jer2ig, Nov 6, 2025)
fc03cc6 Pre-commit checks on all files (jer2ig, Nov 6, 2025)
5dae651 Changed function signature, test (jer2ig, Nov 7, 2025)
13fca2f Argument fix (jer2ig, Nov 7, 2025)
ff4c75b Updated tests for improved coverage (jer2ig, Nov 7, 2025)
8a181cd Unused var removed (jer2ig, Nov 7, 2025)
f2ecea7 Fixed resampling (jer2ig, Nov 7, 2025)
a9a2959 External predictions (jer2ig, Nov 8, 2025)
cd6055b Bugfix and addtl text (jer2ig, Nov 8, 2025)
4a8be08 Change to ext predictions (jer2ig, Nov 10, 2025)
0472f1c Change to targets data type (jer2ig, Nov 10, 2025)
2fc1f53 DoubleResamplin integrated into mixin, small changes (jer2ig, Nov 10, 2025)
ecfe2c7 Added attribute to sample mixin (jer2ig, Nov 10, 2025)
a9c0deb Smpls inner access adjusted (jer2ig, Nov 10, 2025)
6abff49 Docstring, complexity reduction (jer2ig, Nov 11, 2025)
0f08e37 Weights updated, seed corrected (jer2ig, Nov 11, 2025)
430f4a6 Fix (jer2ig, Nov 11, 2025)
5b92395 Renaming (jer2ig, Nov 11, 2025)
042aa26 Doctest (jer2ig, Nov 11, 2025)
2 changes: 2 additions & 0 deletions doubleml/__init__.py
@@ -13,6 +13,7 @@
from .irm.pq import DoubleMLPQ
from .irm.qte import DoubleMLQTE
from .irm.ssm import DoubleMLSSM
+from .plm.lplr import DoubleMLLPLR
from .plm.pliv import DoubleMLPLIV
from .plm.plr import DoubleMLPLR
from .utils.blp import DoubleMLBLP
@@ -42,6 +43,7 @@
"DoubleMLBLP",
"DoubleMLPolicyTree",
"DoubleMLSSM",
"DoubleMLLPLR",
]

__version__ = importlib.metadata.version("doubleml")
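With this change, the new LPLR estimator class is importable from the package root:

from doubleml import DoubleMLLPLR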
19 changes: 16 additions & 3 deletions doubleml/double_ml_score_mixins.py
@@ -86,6 +86,7 @@ class NonLinearScoreMixin:
_score_type = "nonlinear"
_coef_start_val = np.nan
_coef_bounds = None
+_error_on_convergence_failure = False

@property
@abstractmethod
@@ -149,12 +150,16 @@ def score_deriv(theta):
theta_hat = root_res.root
if not root_res.converged:
score_val = score(theta_hat)
-warnings.warn(
+msg = (
"Could not find a root of the score function.\n "
f"Flag: {root_res.flag}.\n"
f"Score value found is {score_val} "
f"for parameter theta equal to {theta_hat}."
)
+if self._error_on_convergence_failure:
+    raise ValueError(msg)
+else:
+    warnings.warn(msg)
else:
signs_different, bracket_guess = _get_bracket_guess(score, self._coef_start_val, self._coef_bounds)

@@ -186,12 +191,16 @@ def score_squared(theta):
score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds]
)
theta_hat = theta_hat_array.item()
-warnings.warn(
+msg = (
"Could not find a root of the score function.\n "
f"Minimum score value found is {score_val} "
f"for parameter theta equal to {theta_hat}.\n "
"No theta found such that the score function evaluates to a negative value."
)
+if self._error_on_convergence_failure:
+    raise ValueError(msg)
+else:
+    warnings.warn(msg)
else:

def neg_score(theta):
@@ -202,11 +211,15 @@ def neg_score(theta):
neg_score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds]
)
theta_hat = theta_hat_array.item()
-warnings.warn(
+msg = (
"Could not find a root of the score function. "
f"Maximum score value found is {-1 * neg_score_val} "
f"for parameter theta equal to {theta_hat}. "
"No theta found such that the score function evaluates to a positive value."
)
+if self._error_on_convergence_failure:
+    raise ValueError(msg)
+else:
+    warnings.warn(msg)

return theta_hat
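All three hunks apply the same pattern: the failure message is built once, and the new _error_on_convergence_failure class attribute decides whether it is raised or merely warned. A standalone sketch of that behavior (function name illustrative, not part of the diff):

import warnings

def report_convergence_failure(msg, error_on_failure=False):
    # Mirrors the new mixin logic: raise when the flag is set, warn otherwise.
    if error_on_failure:
        raise ValueError(msg)
    warnings.warn(msg)

Subclasses opt in to hard failures by overriding the class attribute, i.e. setting _error_on_convergence_failure = True.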
6 changes: 2 additions & 4 deletions doubleml/plm/__init__.py
@@ -2,10 +2,8 @@
The :mod:`doubleml.plm` module implements double machine learning estimates based on partially linear models.
"""

+from .lplr import DoubleMLLPLR
from .pliv import DoubleMLPLIV
from .plr import DoubleMLPLR

-__all__ = [
-"DoubleMLPLR",
-"DoubleMLPLIV",
-]
+__all__ = ["DoubleMLPLR", "DoubleMLPLIV", "DoubleMLLPLR"]
2 changes: 2 additions & 0 deletions doubleml/plm/datasets/__init__.py
@@ -4,6 +4,7 @@

from ._make_pliv_data import _make_pliv_data
from .dgp_confounded_plr_data import make_confounded_plr_data
+from .dgp_lplr_LZZ2020 import make_lplr_LZZ2020
from .dgp_pliv_CHS2015 import make_pliv_CHS2015
from .dgp_pliv_multiway_cluster_CKMS2021 import make_pliv_multiway_cluster_CKMS2021
from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018
@@ -15,5 +16,6 @@
"make_confounded_plr_data",
"make_pliv_CHS2015",
"make_pliv_multiway_cluster_CKMS2021",
"make_lplr_LZZ2020",
"_make_pliv_data",
]
152 changes: 152 additions & 0 deletions doubleml/plm/datasets/dgp_lplr_LZZ2020.py
@@ -0,0 +1,152 @@
import numpy as np
import pandas as pd
from scipy.special import expit

from doubleml.data import DoubleMLData
from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias

_array_alias = _get_array_alias()
_data_frame_alias = _get_data_frame_alias()
_dml_data_alias = _get_dml_data_alias()


def make_lplr_LZZ2020(
n_obs=500, dim_x=20, alpha=0.5, return_type="DoubleMLData", balanced_r0=True, treatment="continuous", **kwargs
):
r"""
Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021),
designed for use in double/debiased machine learning applications.

The data generating process is defined as follows:

- Covariates :math:`x_i \sim \mathcal{N}(0, \Sigma)`, with :math:`\Sigma_{kk} = 1` and :math:`\Sigma_{kj} = 0.2`
for :math:`k \neq j`; each covariate is subsequently clipped to :math:`[-2, 2]`.
- Treatment :math:`d_i = a_0(x_i)` in the default continuous case (see the ``treatment`` parameter for binary variants).
- Outcome probability :math:`p_i = \sigma(\alpha d_i + r_0(x_i))`, where :math:`\sigma(\cdot)` is the logistic function.
- Outcome :math:`y_i \sim \text{Bernoulli}(p_i)`.

The nuisance functions (with the balanced specification of :math:`r_0`; see ``balanced_r0``) are defined as:

.. math::
\begin{aligned}
a_0(x_i) &= \frac{2}{1 + \exp(x_{i,1})} - \frac{2}{1 + \exp(x_{i,2})} + \sin(x_{i,3}) + \cos(x_{i,4}) \\
&\quad + 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2\, x_{i,7} x_{i,8}
- 0.2\, x_{i,9} x_{i,10} \\
r_0(x_i) &= 0.1\, x_{i,1} x_{i,2} x_{i,3} + 0.1\, x_{i,4} x_{i,5} + 0.1\, x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\
&\quad + 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\
&\quad + 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0)
\end{aligned}

Parameters
----------
n_obs : int
Number of observations to simulate.
dim_x : int
Number of covariates.
alpha : float
Value of the causal parameter.
return_type : str
Determines the return format. One of:

- 'DoubleMLData' or DoubleMLData: returns a ``DoubleMLData`` object.
- 'DataFrame', 'pd.DataFrame' or pd.DataFrame: returns a ``pandas.DataFrame``.
- 'array', 'np.ndarray', 'np.array' or np.ndarray: returns tuple of numpy arrays (x, y, d, p).
balanced_r0 : bool, default True
If True, uses the "balanced" r_0 specification shown above (smaller-magnitude terms, roughly balanced outcome
classes). If False, uses an "unbalanced" r_0 specification whose larger positive terms shift the outcome
probability upward, yielding a larger share of Y = 1.
treatment : {'continuous', 'binary', 'binary_unbalanced'}, default 'continuous'
Determines how the treatment d is generated from a_0(x):

- 'continuous': d = a_0(x) (continuous treatment).
- 'binary': d ~ Bernoulli(sigmoid(a_0(x) - mean(a_0(x)))).
- 'binary_unbalanced': d ~ Bernoulli(sigmoid(a_0(x))).

**kwargs
Optional keyword arguments (currently unused in this implementation).

Returns
-------
Union[DoubleMLData, pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]
The generated data in the specified format.

References
----------
Liu, Molei, Yi Zhang, and Doudou Zhou. 2021.
"Double/Debiased Machine Learning for Logistic Partially Linear Model."
The Econometrics Journal 24 (3): 559–88. https://doi.org/10.1093/ectj/utab019.

"""

if balanced_r0:

def r_0(X):
return (
0.1 * X[:, 0] * X[:, 1] * X[:, 2]
+ 0.1 * X[:, 3] * X[:, 4]
+ 0.1 * X[:, 5] ** 3
+ -0.5 * np.sin(X[:, 6]) ** 2
+ 0.5 * np.cos(X[:, 7])
+ 1 / (1 + X[:, 8] ** 2)
+ -1 / (1 + np.exp(X[:, 9]))
+ 0.25 * np.where(X[:, 10] > 0, 1, 0)
+ -0.25 * np.where(X[:, 12] > 0, 1, 0)
)

else:

def r_0(X):
return (
0.1 * X[:, 0] * X[:, 1] * X[:, 2]
+ 0.1 * X[:, 3] * X[:, 4]
+ 0.1 * X[:, 5] ** 3
+ -0.5 * np.sin(X[:, 6]) ** 2
+ 0.5 * np.cos(X[:, 7])
+ 4 / (1 + X[:, 8] ** 2)
+ -1 / (1 + np.exp(X[:, 9]))
+ 1.5 * np.where(X[:, 10] > 0, 1, 0)
+ -0.25 * np.where(X[:, 12] > 0, 1, 0)
)

def a_0(X):
return (
2 / (1 + np.exp(X[:, 0]))
+ -2 / (1 + np.exp(X[:, 1]))
+ 1 * np.sin(X[:, 2])
+ 1 * np.cos(X[:, 3])
+ 0.5 * np.where(X[:, 4] > 0, 1, 0)
+ -0.5 * np.where(X[:, 5] > 0, 1, 0)
+ 0.2 * X[:, 6] * X[:, 7]
+ -0.2 * X[:, 8] * X[:, 9]
)

sigma = np.full((dim_x, dim_x), 0.2)
np.fill_diagonal(sigma, 1)

x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=n_obs)
np.clip(x, -2, 2, out=x)

if treatment == "continuous":
d = a_0(x)
elif treatment == "binary":
d_cont = a_0(x)
d = np.random.binomial(1, expit(d_cont - d_cont.mean()))
elif treatment == "binary_unbalanced":
d_cont = a_0(x)
d = np.random.binomial(1, expit(d_cont))
else:
raise ValueError("Invalid treatment type.")

p = expit(alpha * d + r_0(x))

y = np.random.binomial(1, p)

if return_type in _array_alias:
return x, y, d, p
elif return_type in _data_frame_alias + _dml_data_alias:
x_cols = [f"X{i + 1}" for i in np.arange(dim_x)]
data = pd.DataFrame(np.column_stack((x, y, d, p)), columns=x_cols + ["y", "d", "p"])
if return_type in _data_frame_alias:
return data
else:
return DoubleMLData(data, "y", "d", x_cols)
else:
raise ValueError("Invalid return_type.")