Draft pull request: 38 commits
df03887 Logistic regression implementation WIP (jer2ig, Jan 13, 2025)
f5521f1 First WIP of implementation (jer2ig, Jan 27, 2025)
bfa756c Working implementation. Started on test set-up. (jer2ig, Feb 21, 2025)
d729d0a Changed data type of arrays (jer2ig, Feb 27, 2025)
8fe7ca6 Fix variable name (jer2ig, Feb 27, 2025)
18bac23 Moved into plm folder, started testing setup (jer2ig, Aug 27, 2025)
c6e600d Fixed bug in score computation (jer2ig, Aug 27, 2025)
6f556e0 Reverted from ensure_all_finite to force_all_finite (jer2ig, Aug 27, 2025)
3a332bf Fixes to instrument score (jer2ig, Aug 28, 2025)
b41a773 Added option for exception on convergence failure (jer2ig, Sep 3, 2025)
c434667 Added unbalanced dataset option, bug fixes (jer2ig, Sep 29, 2025)
443d82d Added binary treatment dataset, fixed bug for model check (jer2ig, Oct 7, 2025)
774c74d Adjusted dataset balancing (jer2ig, Oct 7, 2025)
9695820 Renamed Logistic to LPLR (jer2ig, Oct 27, 2025)
dbfea73 Clean-up of branch (jer2ig, Oct 27, 2025)
29114ce Ruff checks and formatting (jer2ig, Oct 27, 2025)
5d2d1ed Unit tests work and bug fix in lplr (jer2ig, Oct 28, 2025)
2c626a0 Cleanup (jer2ig, Oct 28, 2025)
9819436 Tests updated (jer2ig, Nov 6, 2025)
5a7e279 Pre-commit checks (jer2ig, Nov 6, 2025)
fc03cc6 Pre-commit checks on all files (jer2ig, Nov 6, 2025)
5dae651 Changed function signature, test (jer2ig, Nov 7, 2025)
13fca2f Argument fix (jer2ig, Nov 7, 2025)
ff4c75b Updated tests for improved coverage (jer2ig, Nov 7, 2025)
8a181cd Unused var removed (jer2ig, Nov 7, 2025)
f2ecea7 Fixed resampling (jer2ig, Nov 7, 2025)
a9a2959 External predictions (jer2ig, Nov 8, 2025)
cd6055b Bugfix and addtl text (jer2ig, Nov 8, 2025)
4a8be08 Change to ext predictions (jer2ig, Nov 10, 2025)
0472f1c Change to targets data type (jer2ig, Nov 10, 2025)
2fc1f53 DoubleResamplin integrated into mixin, small changes (jer2ig, Nov 10, 2025)
ecfe2c7 Added attribute to sample mixin (jer2ig, Nov 10, 2025)
a9c0deb Smpls inner access adjusted (jer2ig, Nov 10, 2025)
6abff49 Docstring, complexity reduction (jer2ig, Nov 11, 2025)
0f08e37 Weights updated, seed corrected (jer2ig, Nov 11, 2025)
430f4a6 Fix (jer2ig, Nov 11, 2025)
5b92395 Renaming (jer2ig, Nov 11, 2025)
042aa26 Doctest (jer2ig, Nov 11, 2025)
2 changes: 2 additions & 0 deletions doubleml/__init__.py
@@ -13,6 +13,7 @@
from .irm.pq import DoubleMLPQ
from .irm.qte import DoubleMLQTE
from .irm.ssm import DoubleMLSSM
+from .plm.lplr import DoubleMLLPLR
from .plm.pliv import DoubleMLPLIV
from .plm.plr import DoubleMLPLR
from .utils.blp import DoubleMLBLP
@@ -42,6 +43,7 @@
"DoubleMLBLP",
"DoubleMLPolicyTree",
"DoubleMLSSM",
"DoubleMLLPLR",
]

__version__ = importlib.metadata.version("doubleml")
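With this change, the new LPLR estimator class is importable from the package root:

from doubleml import DoubleMLLPLR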
19 changes: 16 additions & 3 deletions doubleml/double_ml_score_mixins.py
@@ -86,6 +86,7 @@ class NonLinearScoreMixin:
_score_type = "nonlinear"
_coef_start_val = np.nan
_coef_bounds = None
+_error_on_convergence_failure = False

@property
@abstractmethod
@@ -149,12 +150,16 @@ def score_deriv(theta):
theta_hat = root_res.root
if not root_res.converged:
score_val = score(theta_hat)
-warnings.warn(
+msg = (
"Could not find a root of the score function.\n "
f"Flag: {root_res.flag}.\n"
f"Score value found is {score_val} "
f"for parameter theta equal to {theta_hat}."
)
+if self._error_on_convergence_failure:
+    raise ValueError(msg)
+else:
+    warnings.warn(msg)
else:
signs_different, bracket_guess = _get_bracket_guess(score, self._coef_start_val, self._coef_bounds)

@@ -186,12 +191,16 @@ def score_squared(theta):
score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds]
)
theta_hat = theta_hat_array.item()
-warnings.warn(
+msg = (
"Could not find a root of the score function.\n "
f"Minimum score value found is {score_val} "
f"for parameter theta equal to {theta_hat}.\n "
"No theta found such that the score function evaluates to a negative value."
)
+if self._error_on_convergence_failure:
+    raise ValueError(msg)
+else:
+    warnings.warn(msg)
else:

def neg_score(theta):
@@ -202,11 +211,15 @@ def neg_score(theta):
neg_score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds]
)
theta_hat = theta_hat_array.item()
-warnings.warn(
+msg = (
"Could not find a root of the score function. "
f"Maximum score value found is {-1 * neg_score_val} "
f"for parameter theta equal to {theta_hat}. "
"No theta found such that the score function evaluates to a positive value."
)
+if self._error_on_convergence_failure:
+    raise ValueError(msg)
+else:
+    warnings.warn(msg)

return theta_hat
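All three hunks apply the same pattern: the failure message is built once, and the new _error_on_convergence_failure class attribute decides whether it is raised or merely warned. A standalone sketch of that behavior (function name illustrative, not part of the diff):

import warnings

def report_convergence_failure(msg, error_on_failure=False):
    # Mirrors the new mixin logic: raise when the flag is set, warn otherwise.
    if error_on_failure:
        raise ValueError(msg)
    warnings.warn(msg)

Subclasses opt in to hard failures by overriding the class attribute, i.e. setting _error_on_convergence_failure = True.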
6 changes: 2 additions & 4 deletions doubleml/plm/__init__.py
@@ -2,10 +2,8 @@
The :mod:`doubleml.plm` module implements double machine learning estimates based on partially linear models.
"""

+from .lplr import DoubleMLLPLR
from .pliv import DoubleMLPLIV
from .plr import DoubleMLPLR

-__all__ = [
-"DoubleMLPLR",
-"DoubleMLPLIV",
-]
+__all__ = ["DoubleMLPLR", "DoubleMLPLIV", "DoubleMLLPLR"]
2 changes: 2 additions & 0 deletions doubleml/plm/datasets/__init__.py
@@ -4,6 +4,7 @@

from ._make_pliv_data import _make_pliv_data
from .dgp_confounded_plr_data import make_confounded_plr_data
+from .dgp_lplr_LZZ2020 import make_lplr_LZZ2020
from .dgp_pliv_CHS2015 import make_pliv_CHS2015
from .dgp_pliv_multiway_cluster_CKMS2021 import make_pliv_multiway_cluster_CKMS2021
from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018
@@ -15,5 +16,6 @@
"make_confounded_plr_data",
"make_pliv_CHS2015",
"make_pliv_multiway_cluster_CKMS2021",
"make_lplr_LZZ2020",
"_make_pliv_data",
]
152 changes: 152 additions & 0 deletions doubleml/plm/datasets/dgp_lplr_LZZ2020.py
@@ -0,0 +1,152 @@
import numpy as np
import pandas as pd
from scipy.special import expit

from doubleml.data import DoubleMLData
from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias

_array_alias = _get_array_alias()
_data_frame_alias = _get_data_frame_alias()
_dml_data_alias = _get_dml_data_alias()


def make_lplr_LZZ2020(
n_obs=500, dim_x=20, alpha=0.5, return_type="DoubleMLData", balanced_r0=True, treatment="continuous", **kwargs
):
r"""
Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021),
designed for use in double/debiased machine learning applications.

The data generating process is defined as follows:

- Covariates :math:`x_i \sim \mathcal{N}(0, \Sigma)`, with :math:`\Sigma_{kk} = 1` and :math:`\Sigma_{kj} = 0.2`
for :math:`k \neq j`; each covariate is subsequently clipped to :math:`[-2, 2]`.
- Treatment :math:`d_i = a_0(x_i)` in the default continuous case (see the ``treatment`` parameter for binary variants).
- Outcome probability :math:`p_i = \sigma(\alpha d_i + r_0(x_i))`, where :math:`\sigma(\cdot)` is the logistic function.
- Outcome :math:`y_i \sim \text{Bernoulli}(p_i)`.

The nuisance functions (with the balanced specification of :math:`r_0`; see ``balanced_r0``) are defined as:

.. math::
\begin{aligned}
a_0(x_i) &= \frac{2}{1 + \exp(x_{i,1})} - \frac{2}{1 + \exp(x_{i,2})} + \sin(x_{i,3}) + \cos(x_{i,4}) \\
&\quad + 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2\, x_{i,7} x_{i,8}
- 0.2\, x_{i,9} x_{i,10} \\
r_0(x_i) &= 0.1\, x_{i,1} x_{i,2} x_{i,3} + 0.1\, x_{i,4} x_{i,5} + 0.1\, x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\
&\quad + 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\
&\quad + 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0)
\end{aligned}

Parameters
----------
n_obs : int
Number of observations to simulate.
dim_x : int
Number of covariates.
alpha : float
Value of the causal parameter.
return_type : str
Determines the return format. One of:

- 'DoubleMLData' or DoubleMLData: returns a ``DoubleMLData`` object.
- 'DataFrame', 'pd.DataFrame' or pd.DataFrame: returns a ``pandas.DataFrame``.
- 'array', 'np.ndarray', 'np.array' or np.ndarray: returns tuple of numpy arrays (x, y, d, p).
balanced_r0 : bool, default True
If True, uses the "balanced" r_0 specification shown above (smaller-magnitude terms, roughly balanced outcome
classes). If False, uses an "unbalanced" r_0 specification whose larger positive terms shift the outcome
probability upward, yielding a larger share of Y = 1.
treatment : {'continuous', 'binary', 'binary_unbalanced'}, default 'continuous'
Determines how the treatment d is generated from a_0(x):

- 'continuous': d = a_0(x) (continuous treatment).
- 'binary': d ~ Bernoulli(sigmoid(a_0(x) - mean(a_0(x)))).
- 'binary_unbalanced': d ~ Bernoulli(sigmoid(a_0(x))).

**kwargs
Optional keyword arguments (currently unused in this implementation).

Returns
-------
Union[DoubleMLData, pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]
The generated data in the specified format.

References
----------
Liu, Molei, Yi Zhang, and Doudou Zhou. 2021.
"Double/Debiased Machine Learning for Logistic Partially Linear Model."
The Econometrics Journal 24 (3): 559–88. https://doi.org/10.1093/ectj/utab019.

"""

if balanced_r0:

def r_0(X):
return (
0.1 * X[:, 0] * X[:, 1] * X[:, 2]
+ 0.1 * X[:, 3] * X[:, 4]
+ 0.1 * X[:, 5] ** 3
+ -0.5 * np.sin(X[:, 6]) ** 2
+ 0.5 * np.cos(X[:, 7])
+ 1 / (1 + X[:, 8] ** 2)
+ -1 / (1 + np.exp(X[:, 9]))
+ 0.25 * np.where(X[:, 10] > 0, 1, 0)
+ -0.25 * np.where(X[:, 12] > 0, 1, 0)
)

else:

def r_0(X):
return (
0.1 * X[:, 0] * X[:, 1] * X[:, 2]
+ 0.1 * X[:, 3] * X[:, 4]
+ 0.1 * X[:, 5] ** 3
+ -0.5 * np.sin(X[:, 6]) ** 2
+ 0.5 * np.cos(X[:, 7])
+ 4 / (1 + X[:, 8] ** 2)
+ -1 / (1 + np.exp(X[:, 9]))
+ 1.5 * np.where(X[:, 10] > 0, 1, 0)
+ -0.25 * np.where(X[:, 12] > 0, 1, 0)
)

def a_0(X):
return (
2 / (1 + np.exp(X[:, 0]))
+ -2 / (1 + np.exp(X[:, 1]))
+ 1 * np.sin(X[:, 2])
+ 1 * np.cos(X[:, 3])
+ 0.5 * np.where(X[:, 4] > 0, 1, 0)
+ -0.5 * np.where(X[:, 5] > 0, 1, 0)
+ 0.2 * X[:, 6] * X[:, 7]
+ -0.2 * X[:, 8] * X[:, 9]
)

sigma = np.full((dim_x, dim_x), 0.2)
np.fill_diagonal(sigma, 1)

x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=n_obs)
np.clip(x, -2, 2, out=x)

if treatment == "continuous":
d = a_0(x)
elif treatment == "binary":
d_cont = a_0(x)
d = np.random.binomial(1, expit(d_cont - d_cont.mean()))
elif treatment == "binary_unbalanced":
d_cont = a_0(x)
d = np.random.binomial(1, expit(d_cont))
else:
raise ValueError("Invalid treatment type.")

p = expit(alpha * d + r_0(x))

y = np.random.binomial(1, p)

if return_type in _array_alias:
return x, y, d, p
elif return_type in _data_frame_alias + _dml_data_alias:
x_cols = [f"X{i + 1}" for i in np.arange(dim_x)]
data = pd.DataFrame(np.column_stack((x, y, d, p)), columns=x_cols + ["y", "d", "p"])
if return_type in _data_frame_alias:
return data
else:
return DoubleMLData(data, "y", "d", x_cols)
else:
raise ValueError("Invalid return_type.")