|
| 1 | +import numpy as np |
| 2 | + |
| 3 | +from doubleml.did.datasets.dgp_did_CS2021 import make_did_CS2021 |
| 4 | + |
| 5 | +# Based on https://doi.org/10.1016/j.jeconom.2020.12.001 (see Appendix SC) |
| 6 | +# and https://d2cml-ai.github.io/csdid/examples/csdid_basic.html#Examples-with-simulated-data |
| 7 | +# Cross-sectional version of the data generating process (DGP) for Callaway and Sant'Anna (2021) |
| 8 | + |
| 9 | + |
| 10 | +def make_did_cs_CS2021(n_obs=1000, dgp_type=1, include_never_treated=True, lambda_t=0.5, time_type="datetime", **kwargs): |
| 11 | + """ |
| 12 | + Generate synthetic repeated cross-sectional data for difference-in-differences analysis based on |
| 13 | + Callaway and Sant'Anna (2021). |
| 14 | +
|
| 15 | + This function creates repeated cross-sectional data with heterogeneous treatment effects across time periods and groups. |
| 16 | + The data includes pre-treatment periods, multiple treatment groups that receive treatment at different times, |
| 17 | + and optionally a never-treated group that serves as a control. The true average treatment effect on the |
| 18 | + treated (ATT) has a heterogeneous structure dependent on covariates and exposure time. |
| 19 | +
|
| 20 | + The data generating process offers six variations (``dgp_type`` 1-6) that differ in how the regression features |
| 21 | + and propensity score features are derived: |
| 22 | +
|
| 23 | + - DGP 1: Outcome and propensity score are linear (in Z) |
| 24 | + - DGP 2: Outcome is linear, propensity score is nonlinear |
| 25 | + - DGP 3: Outcome is nonlinear, propensity score is linear |
| 26 | + - DGP 4: Outcome and propensity score are nonlinear |
| 27 | + - DGP 5: Outcome is linear, propensity score is constant (experimental setting) |
| 28 | + - DGP 6: Outcome is nonlinear, propensity score is constant (experimental setting) |
| 29 | +
|
| 30 | + Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries |
| 31 | + :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. |
| 32 | +
|
| 33 | + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, |
| 34 | + where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`, |
| 35 | + :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`. |
| 36 | +
|
| 37 | + For a feature vector :math:`W=(W_1, W_2, W_3, W_4)^T` (either X or Z based on ``dgp_type``), the core functions are: |
| 38 | +
|
| 39 | + 1. Time-varying outcome regression function for each time period :math:`t`: |
| 40 | +
|
| 41 | + .. math:: |
| 42 | +
|
| 43 | + f_{reg,t}(W) = 210 + \\frac{t}{T} \\cdot (27.4 \\cdot W_1 + 13.7 \\cdot W_2 + 13.7 \\cdot W_3 + 13.7 \\cdot W_4) |
| 44 | +
|
| 45 | + 2. Group-specific propensity function for each treatment group :math:`g`: |
| 46 | +
|
| 47 | + .. math:: |
| 48 | +
|
| 49 | + f_{ps,g}(W) = \\xi \\cdot \\left(1-\\frac{g}{G}\\right) \\cdot |
| 50 | + (-W_1 + 0.5 \\cdot W_2 - 0.25 \\cdot W_3 - 0.2\\cdot W_4) |
| 51 | +
|
| 52 | + where :math:`T` is the number of time periods, :math:`G` is the number of treatment groups, and :math:`\\xi` is a |
| 53 | + scale parameter (default: 0.9). |
| 54 | +
|
| 55 | + The panel data model is defined with the following components: |
| 56 | +
|
| 57 | + 1. Time effects: :math:`\\delta_t = t` for time period :math:`t` |
| 58 | +
|
| 59 | + 2. Individual effects: :math:`\\eta_i \\sim \\mathcal{N}(g_i, 1)` where :math:`g_i` is unit :math:`i`'s treatment group |
| 60 | +
|
| 61 | + 3. Treatment effects: For a unit in treatment group :math:`g`, the effect in period :math:`t` is: |
| 62 | +
|
| 63 | + .. math:: |
| 64 | +
|
| 65 | + \\theta_{i,t,g} = \\max(t - t_g + 1, 0) + 0.1 \\cdot X_{i,1} \\cdot \\max(t - t_g + 1, 0) |
| 66 | +
|
| 67 | + where :math:`t_g` is the first treatment period for group :math:`g`, :math:`X_{i,1}` is the first covariate for unit |
| 68 | + :math:`i`, and :math:`\\max(t - t_g + 1, 0)` represents the exposure time (0 for pre-treatment periods). |
| 69 | +
|
| 70 | + 4. Potential outcomes for unit :math:`i` in period :math:`t`: |
| 71 | +
|
| 72 | + .. math:: |
| 73 | +
|
| 74 | + Y_{i,t}(0) &= f_{reg,t}(W_{reg}) + \\delta_t + \\eta_i + \\varepsilon_{i,0,t} |
| 75 | +
|
| 76 | + Y_{i,t}(1) &= Y_{i,t}(0) + \\theta_{i,t,g} + (\\varepsilon_{i,1,t} - \\varepsilon_{i,0,t}) |
| 77 | +
|
| 78 | + where :math:`\\varepsilon_{i,0,t}, \\varepsilon_{i,1,t} \\sim \\mathcal{N}(0, 1)`. |
| 79 | +
|
| 80 | + 5. Observed outcomes: |
| 81 | +
|
| 82 | + .. math:: |
| 83 | +
|
| 84 | + Y_{i,t} = Y_{i,t}(1) \\cdot 1\\{t \\geq t_g\\} + Y_{i,t}(0) \\cdot 1\\{t < t_g\\} |
| 85 | +
|
| 86 | + 6. Treatment assignment: |
| 87 | +
|
| 88 | + For non-experimental settings (DGP 1-4), the probability of being in treatment group :math:`g` is: |
| 89 | +
|
| 90 | + .. math:: |
| 91 | +
|
| 92 | + P(G_i = g) = \\frac{\\exp(f_{ps,g}(W_{ps}))}{\\sum_{g'} \\exp(f_{ps,g'}(W_{ps}))} |
| 93 | +
|
| 94 | + For experimental settings (DGP 5-6), each treatment group (including never-treated) has equal probability: |
| 95 | +
|
| 96 | + .. math:: |
| 97 | +
|
| 98 | + P(G_i = g) = \\frac{1}{G} \\text{ for all } g |
| 99 | +
|
| 100 | + 7. Steps 1-6 generate panel data. To obtain repeated cross-sectional data, the number of generated individuals is increased |
| 101 | + to `n_obs/lambda_t`, where `lambda_t` denotes the probability to observe a unit at each time period (time constant). |
| 102 | + for each |
| 103 | +
|
| 104 | +
|
| 105 | + The variables :math:`W_{reg}` and :math:`W_{ps}` are selected based on the DGP type: |
| 106 | +
|
| 107 | + .. math:: |
| 108 | +
|
| 109 | + DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z |
| 110 | +
|
| 111 | + DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X |
| 112 | +
|
| 113 | + DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z |
| 114 | +
|
| 115 | + DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X |
| 116 | +
|
| 117 | + DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0 |
| 118 | +
|
| 119 | + DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0 |
| 120 | +
|
| 121 | + where settings 5-6 correspond to experimental designs with equal probability across treatment groups. |
| 122 | +
|
| 123 | +
|
| 124 | + Parameters |
| 125 | + ---------- |
| 126 | + n_obs : int, default=1000 |
| 127 | + The number of observations to simulate. |
| 128 | +
|
| 129 | + dgp_type : int, default=1 |
| 130 | + The data generating process to be used (1-6). |
| 131 | +
|
| 132 | + include_never_treated : bool, default=True |
| 133 | + Whether to include units that are never treated. |
| 134 | +
|
| 135 | + lambda_t : float, default=0.5 |
| 136 | + Probability of observing a unit at each time period. Note that internally `n_obs/lambda_t` individuals are |
| 137 | + generated of which only a fraction `lambda_t` is observed at each time period (see Step 7 in the DGP description). |
| 138 | +
|
| 139 | + time_type : str, default="datetime" |
| 140 | + Type of time variable. Either "datetime" or "float". |
| 141 | +
|
| 142 | + **kwargs |
| 143 | + Additional keyword arguments. Accepts the following parameters: |
| 144 | +
|
| 145 | + `c` (float, default=0.0): |
| 146 | + Parameter for correlation structure in X. |
| 147 | +
|
| 148 | + `dim_x` (int, default=4): |
| 149 | + Dimension of feature vectors. |
| 150 | +
|
| 151 | + `xi` (float, default=0.9): |
| 152 | + Scale parameter for the propensity score function. |
| 153 | +
|
| 154 | + `n_periods` (int, default=5): |
| 155 | + Number of time periods. |
| 156 | +
|
| 157 | + `anticipation_periods` (int, default=0): |
| 158 | + Number of periods before treatment where anticipation effects occur. |
| 159 | +
|
| 160 | + `n_pre_treat_periods` (int, default=2): |
| 161 | + Number of pre-treatment periods. |
| 162 | +
|
| 163 | + `start_date` (str, default="2025-01"): |
| 164 | + Start date for datetime time variables. |
| 165 | +
|
| 166 | + Returns |
| 167 | + ------- |
| 168 | + pandas.DataFrame |
| 169 | + DataFrame containing the simulated panel data. |
| 170 | +
|
| 171 | + References |
| 172 | + ---------- |
| 173 | + Callaway, B. and Sant’Anna, P. H. (2021), |
| 174 | + Difference-in-Differences with multiple time periods. Journal of Econometrics, 225(2), 200-230. |
| 175 | + doi:`10.1016/j.jeconom.2020.12.001 <https://doi.org/10.1016/j.jeconom.2020.12.001>`_. |
| 176 | + """ |
| 177 | + |
| 178 | + n_obs_panel = int(np.ceil(n_obs / lambda_t)) |
| 179 | + df_panel = make_did_CS2021( |
| 180 | + n_obs=n_obs_panel, |
| 181 | + dgp_type=dgp_type, |
| 182 | + include_never_treated=include_never_treated, |
| 183 | + time_type=time_type, |
| 184 | + **kwargs, |
| 185 | + ) |
| 186 | + |
| 187 | + # for each time period, randomly select units to observe |
| 188 | + observed_units = np.random.binomial(1, lambda_t, size=(len(df_panel.index))) |
| 189 | + df_repeated_cs = df_panel[observed_units == 1].copy() |
| 190 | + |
| 191 | + return df_repeated_cs |
0 commit comments