Skip to content

Commit 369aeee

Browse files
committed
Merge branch 'main' into 0.10.X
2 parents 80a30db + 5b55aa9 commit 369aeee

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

54 files changed

+3410
-378
lines changed

.github/workflows/pytest.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ jobs:
2828
- {os: 'ubuntu-latest', python-version: '3.10'}
2929
- {os: 'ubuntu-latest', python-version: '3.11'}
3030
- {os: 'ubuntu-latest', python-version: '3.12'}
31+
- {os: 'ubuntu-latest', python-version: '3.13'}
3132

3233
steps:
3334
- uses: actions/checkout@v4

doubleml/data/panel_data.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ def __init__(
106106
force_all_x_finite=force_all_x_finite,
107107
force_all_d_finite=False,
108108
)
109+
# reset index to ensure a simple RangeIndex
110+
self.data.reset_index(drop=True, inplace=True)
109111
if self.n_treat != 1:
110112
raise ValueError("Only one treatment column is allowed for panel data.")
111113

@@ -139,7 +141,7 @@ def _data_summary_str(self):
139141
f"Id variable: {self.id_col}\n"
140142
)
141143

142-
data_summary += f"No. Observations: {self.n_obs}\n"
144+
data_summary += f"No. Unique Ids: {self.n_ids}\n" f"No. Observations: {self.n_obs}\n"
143145
return data_summary
144146

145147
@classmethod
@@ -213,9 +215,9 @@ def id_var_unique(self):
213215
return self._id_var_unique
214216

215217
@property
216-
def n_obs(self):
218+
def n_ids(self):
217219
"""
218-
The number of observations. For panel data, the number of unique values for id_col.
220+
The number of unique values for id_col.
219221
"""
220222
return len(self._id_var_unique)
221223

doubleml/data/tests/test_panel_data.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def test_id_col_setter():
5656
dml_data.id_col = "id_new"
5757
assert np.array_equal(dml_data.id_var, id_comp)
5858
assert dml_data._id_var_unique == np.unique(id_comp)
59-
assert dml_data.n_obs == 1
59+
assert dml_data.n_ids == 1
6060

6161
msg = "Invalid id variable id_col. a13 is no data column."
6262
with pytest.raises(ValueError, match=msg):
@@ -169,7 +169,8 @@ def test_panel_data_properties():
169169

170170
assert np.array_equal(dml_data.id_var, df["id"].values)
171171
assert np.array_equal(dml_data.id_var_unique, np.unique(df["id"].values))
172-
assert dml_data.n_obs == len(np.unique(df["id"].values))
172+
assert dml_data.n_obs == df.shape[0]
173+
assert dml_data.n_ids == len(np.unique(df["id"].values))
173174
assert dml_data.g_col == "d"
174175
assert np.array_equal(dml_data.g_values, np.sort(np.unique(df["d"].values)))
175176
assert dml_data.n_groups == len(np.unique(df["d"].values))
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,58 @@
1+
import pandas as pd
2+
13
valid_datetime_units = {"Y", "M", "D", "h", "m", "s", "ms", "us", "ns"}
24

5+
# Units that can be used with pd.Timedelta (unambiguous)
6+
timedelta_compatible_units = {"D", "h", "m", "s", "ms", "us", "ns"}
7+
8+
# Units that require period arithmetic (ambiguous)
9+
period_only_units = {"Y", "M"}
10+
311

412
def _is_valid_datetime_unit(unit):
513
if unit not in valid_datetime_units:
614
raise ValueError("Invalid datetime unit.")
715
else:
816
return unit
17+
18+
19+
def _is_timedelta_compatible(unit):
20+
"""Check if a datetime unit can be used with pd.Timedelta."""
21+
return unit in timedelta_compatible_units
22+
23+
24+
def _subtract_periods_safe(datetime_values, reference_datetime, periods, unit):
25+
"""
26+
Safely subtract periods from datetime values, handling both timedelta-compatible
27+
and period-only units.
28+
29+
Parameters
30+
----------
31+
datetime_values : pandas.Series or numpy.array
32+
Array of datetime values to compare
33+
reference_datetime : datetime-like
34+
Reference datetime to subtract periods from
35+
periods : int
36+
Number of periods to subtract
37+
unit : str
38+
Datetime unit
39+
40+
Returns
41+
-------
42+
numpy.array
43+
Boolean array indicating which datetime_values are >= (reference_datetime - periods)
44+
"""
45+
if periods == 0:
46+
# No anticipation periods, so no datetime arithmetic needed
47+
return datetime_values >= reference_datetime
48+
49+
if _is_timedelta_compatible(unit):
50+
# Use Timedelta for unambiguous units
51+
period_offset = pd.Timedelta(periods, unit=unit)
52+
return datetime_values >= (reference_datetime - period_offset)
53+
else:
54+
# Use Period arithmetic for ambiguous units like 'M' and 'Y'
55+
ref_period = pd.Period(reference_datetime, freq=unit)
56+
ref_minus_periods = ref_period - periods
57+
datetime_periods = pd.PeriodIndex(datetime_values, freq=unit)
58+
return datetime_periods >= ref_minus_periods

doubleml/did/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@
66
from .did_aggregation import DoubleMLDIDAggregation
77
from .did_binary import DoubleMLDIDBinary
88
from .did_cs import DoubleMLDIDCS
9+
from .did_cs_binary import DoubleMLDIDCSBinary
910
from .did_multi import DoubleMLDIDMulti
1011

1112
__all__ = [
1213
"DoubleMLDIDAggregation",
1314
"DoubleMLDID",
1415
"DoubleMLDIDCS",
1516
"DoubleMLDIDBinary",
17+
"DoubleMLDIDCSBinary",
1618
"DoubleMLDIDMulti",
1719
]

doubleml/did/datasets/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33
"""
44

55
from .dgp_did_CS2021 import make_did_CS2021
6+
from .dgp_did_cs_CS2021 import make_did_cs_CS2021
67
from .dgp_did_SZ2020 import make_did_SZ2020
78

89
__all__ = [
910
"make_did_SZ2020",
1011
"make_did_CS2021",
12+
"make_did_cs_CS2021",
1113
]
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
import numpy as np
2+
3+
from doubleml.did.datasets.dgp_did_CS2021 import make_did_CS2021
4+
5+
# Based on https://doi.org/10.1016/j.jeconom.2020.12.001 (see Appendix SC)
6+
# and https://d2cml-ai.github.io/csdid/examples/csdid_basic.html#Examples-with-simulated-data
7+
# Cross-sectional version of the data generating process (DGP) for Callaway and Sant'Anna (2021)
8+
9+
10+
def make_did_cs_CS2021(n_obs=1000, dgp_type=1, include_never_treated=True, lambda_t=0.5, time_type="datetime", **kwargs):
11+
"""
12+
Generate synthetic repeated cross-sectional data for difference-in-differences analysis based on
13+
Callaway and Sant'Anna (2021).
14+
15+
This function creates repeated cross-sectional data with heterogeneous treatment effects across time periods and groups.
16+
The data includes pre-treatment periods, multiple treatment groups that receive treatment at different times,
17+
and optionally a never-treated group that serves as a control. The true average treatment effect on the
18+
treated (ATT) has a heterogeneous structure dependent on covariates and exposure time.
19+
20+
The data generating process offers six variations (``dgp_type`` 1-6) that differ in how the regression features
21+
and propensity score features are derived:
22+
23+
- DGP 1: Outcome and propensity score are linear (in Z)
24+
- DGP 2: Outcome is linear, propensity score is nonlinear
25+
- DGP 3: Outcome is nonlinear, propensity score is linear
26+
- DGP 4: Outcome and propensity score are nonlinear
27+
- DGP 5: Outcome is linear, propensity score is constant (experimental setting)
28+
- DGP 6: Outcome is nonlinear, propensity score is constant (experimental setting)
29+
30+
Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries
31+
:math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix.
32+
33+
Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`,
34+
where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`,
35+
:math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`.
36+
37+
For a feature vector :math:`W=(W_1, W_2, W_3, W_4)^T` (either X or Z based on ``dgp_type``), the core functions are:
38+
39+
1. Time-varying outcome regression function for each time period :math:`t`:
40+
41+
.. math::
42+
43+
f_{reg,t}(W) = 210 + \\frac{t}{T} \\cdot (27.4 \\cdot W_1 + 13.7 \\cdot W_2 + 13.7 \\cdot W_3 + 13.7 \\cdot W_4)
44+
45+
2. Group-specific propensity function for each treatment group :math:`g`:
46+
47+
.. math::
48+
49+
f_{ps,g}(W) = \\xi \\cdot \\left(1-\\frac{g}{G}\\right) \\cdot
50+
(-W_1 + 0.5 \\cdot W_2 - 0.25 \\cdot W_3 - 0.2\\cdot W_4)
51+
52+
where :math:`T` is the number of time periods, :math:`G` is the number of treatment groups, and :math:`\\xi` is a
53+
scale parameter (default: 0.9).
54+
55+
The panel data model is defined with the following components:
56+
57+
1. Time effects: :math:`\\delta_t = t` for time period :math:`t`
58+
59+
2. Individual effects: :math:`\\eta_i \\sim \\mathcal{N}(g_i, 1)` where :math:`g_i` is unit :math:`i`'s treatment group
60+
61+
3. Treatment effects: For a unit in treatment group :math:`g`, the effect in period :math:`t` is:
62+
63+
.. math::
64+
65+
\\theta_{i,t,g} = \\max(t - t_g + 1, 0) + 0.1 \\cdot X_{i,1} \\cdot \\max(t - t_g + 1, 0)
66+
67+
where :math:`t_g` is the first treatment period for group :math:`g`, :math:`X_{i,1}` is the first covariate for unit
68+
:math:`i`, and :math:`\\max(t - t_g + 1, 0)` represents the exposure time (0 for pre-treatment periods).
69+
70+
4. Potential outcomes for unit :math:`i` in period :math:`t`:
71+
72+
.. math::
73+
74+
Y_{i,t}(0) &= f_{reg,t}(W_{reg}) + \\delta_t + \\eta_i + \\varepsilon_{i,0,t}
75+
76+
Y_{i,t}(1) &= Y_{i,t}(0) + \\theta_{i,t,g} + (\\varepsilon_{i,1,t} - \\varepsilon_{i,0,t})
77+
78+
where :math:`\\varepsilon_{i,0,t}, \\varepsilon_{i,1,t} \\sim \\mathcal{N}(0, 1)`.
79+
80+
5. Observed outcomes:
81+
82+
.. math::
83+
84+
Y_{i,t} = Y_{i,t}(1) \\cdot 1\\{t \\geq t_g\\} + Y_{i,t}(0) \\cdot 1\\{t < t_g\\}
85+
86+
6. Treatment assignment:
87+
88+
For non-experimental settings (DGP 1-4), the probability of being in treatment group :math:`g` is:
89+
90+
.. math::
91+
92+
P(G_i = g) = \\frac{\\exp(f_{ps,g}(W_{ps}))}{\\sum_{g'} \\exp(f_{ps,g'}(W_{ps}))}
93+
94+
For experimental settings (DGP 5-6), each treatment group (including never-treated) has equal probability:
95+
96+
.. math::
97+
98+
P(G_i = g) = \\frac{1}{G} \\text{ for all } g
99+
100+
7. Steps 1-6 generate panel data. To obtain repeated cross-sectional data, the number of generated individuals is increased
101+
to `n_obs/lambda_t`, where `lambda_t` denotes the probability to observe a unit at each time period (time constant).
102+
for each
103+
104+
105+
The variables :math:`W_{reg}` and :math:`W_{ps}` are selected based on the DGP type:
106+
107+
.. math::
108+
109+
DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z
110+
111+
DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X
112+
113+
DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z
114+
115+
DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X
116+
117+
DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0
118+
119+
DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0
120+
121+
where settings 5-6 correspond to experimental designs with equal probability across treatment groups.
122+
123+
124+
Parameters
125+
----------
126+
n_obs : int, default=1000
127+
The number of observations to simulate.
128+
129+
dgp_type : int, default=1
130+
The data generating process to be used (1-6).
131+
132+
include_never_treated : bool, default=True
133+
Whether to include units that are never treated.
134+
135+
lambda_t : float, default=0.5
136+
Probability of observing a unit at each time period. Note that internally `n_obs/lambda_t` individuals are
137+
generated of which only a fraction `lambda_t` is observed at each time period (see Step 7 in the DGP description).
138+
139+
time_type : str, default="datetime"
140+
Type of time variable. Either "datetime" or "float".
141+
142+
**kwargs
143+
Additional keyword arguments. Accepts the following parameters:
144+
145+
`c` (float, default=0.0):
146+
Parameter for correlation structure in X.
147+
148+
`dim_x` (int, default=4):
149+
Dimension of feature vectors.
150+
151+
`xi` (float, default=0.9):
152+
Scale parameter for the propensity score function.
153+
154+
`n_periods` (int, default=5):
155+
Number of time periods.
156+
157+
`anticipation_periods` (int, default=0):
158+
Number of periods before treatment where anticipation effects occur.
159+
160+
`n_pre_treat_periods` (int, default=2):
161+
Number of pre-treatment periods.
162+
163+
`start_date` (str, default="2025-01"):
164+
Start date for datetime time variables.
165+
166+
Returns
167+
-------
168+
pandas.DataFrame
169+
DataFrame containing the simulated panel data.
170+
171+
References
172+
----------
173+
Callaway, B. and Sant’Anna, P. H. (2021),
174+
Difference-in-Differences with multiple time periods. Journal of Econometrics, 225(2), 200-230.
175+
doi:`10.1016/j.jeconom.2020.12.001 <https://doi.org/10.1016/j.jeconom.2020.12.001>`_.
176+
"""
177+
178+
n_obs_panel = int(np.ceil(n_obs / lambda_t))
179+
df_panel = make_did_CS2021(
180+
n_obs=n_obs_panel,
181+
dgp_type=dgp_type,
182+
include_never_treated=include_never_treated,
183+
time_type=time_type,
184+
**kwargs,
185+
)
186+
187+
# for each time period, randomly select units to observe
188+
observed_units = np.random.binomial(1, lambda_t, size=(len(df_panel.index)))
189+
df_repeated_cs = df_panel[observed_units == 1].copy()
190+
191+
return df_repeated_cs

doubleml/did/did.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class DoubleMLDID(LinearScoreMixin, DoubleML):
3737
Default is ``5``.
3838
3939
n_rep : int
40-
Number of repetitons for the sample splitting.
40+
Number of repetitions for the sample splitting.
4141
Default is ``1``.
4242
4343
score : str
@@ -47,7 +47,7 @@ class DoubleMLDID(LinearScoreMixin, DoubleML):
4747
Default is ``'observational'``.
4848
4949
in_sample_normalization : bool
50-
Indicates whether to use a sligthly different normalization from Sant'Anna and Zhao (2020).
50+
Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020).
5151
Default is ``True``.
5252
5353
trimming_rule : str

0 commit comments

Comments
 (0)