Skip to content

Commit d65edf8

Browse files
Refactor Data Generators #306
1 parent 9df0711 commit d65edf8

18 files changed

+1796
-1620
lines changed

doubleml/datasets.py

Lines changed: 0 additions & 1620 deletions
This file was deleted.

doubleml/datasets/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
"""
2+
The :mod:`doubleml.datasets` module implements data generating processes for double machine learning simulations and provides access to real datasets.
3+
"""
4+
5+
# Import fetch functions
6+
from .fetch_401K import fetch_401K
7+
from .fetch_bonus import fetch_bonus
8+
9+
10+
__all__ = [
11+
"fetch_401K",
12+
"fetch_bonus",
13+
]

doubleml/datasets/fetch_401K.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
"""
2+
Data set on financial wealth and 401(k) plan participation.
3+
"""
4+
5+
import pandas as pd
6+
from doubleml import DoubleMLData
7+
8+
9+
def _get_array_alias():
    """Return a new list with the aliases ``'array'``, ``'np.array'`` and ``'np.ndarray'``."""
    # NOTE(review): presumably the ``return_type`` spellings for array output;
    # nothing in this module returns arrays for them -- confirm the intent.
    array_aliases = ["array", "np.array", "np.ndarray"]
    return array_aliases
11+
12+
13+
def _get_data_frame_alias():
    """
    Return the ``return_type`` values that request a ``pd.DataFrame``.

    Besides the string spellings, the ``pd.DataFrame`` class itself is listed, so that
    ``return_type=pd.DataFrame`` passes a membership test against the returned list.

    Returns
    -------
    list
        A new list on every call: the three string aliases followed by the class object.
    """
    return ["DataFrame", "pd.DataFrame", "pandas.DataFrame", pd.DataFrame]
15+
16+
17+
def _get_dml_data_alias():
    """
    Return the ``return_type`` values that request a ``DoubleMLData`` object.

    Besides the string spelling, the ``DoubleMLData`` class itself is listed, so that
    ``return_type=DoubleMLData`` passes a membership test against the returned list.

    Returns
    -------
    list
        A new list on every call: the string alias followed by the class object.
    """
    return ["DoubleMLData", DoubleMLData]
19+
20+
21+
def fetch_401K(return_type="DoubleMLData", polynomial_features=False):
    """
    Data set on financial wealth and 401(k) plan participation.

    The Stata file is downloaded on every call; nothing is cached.

    Parameters
    ----------
    return_type :
        If ``'DoubleMLData'`` (or any other alias listed by ``_get_dml_data_alias``), returns a
        ``DoubleMLData`` object.

        If ``'DataFrame'``, ``'pd.DataFrame'`` or ``'pandas.DataFrame'`` (or any other alias listed by
        ``_get_data_frame_alias``), returns a ``pd.DataFrame``.
    polynomial_features :
        If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)).
        Not implemented for this data set yet.

    Returns
    -------
    DoubleMLData or pd.DataFrame
        For a data-frame alias, the table exactly as read from the Stata file. Otherwise a
        ``DoubleMLData`` object built from that table with outcome ``net_tfa``, treatment ``e401`` and
        covariates ``age``, ``inc``, ``educ``, ``fsize``, ``marr``, ``twoearn``, ``db``, ``pira``, ``hown``.

    Raises
    ------
    NotImplementedError
        If ``polynomial_features`` is truthy.
    ValueError
        If ``return_type`` is not a recognised alias.

    References
    ----------
    Abadie, A. (2003), Semiparametric instrumental variable estimation of treatment response models. Journal of
    Econometrics, 113(2): 231-263.

    Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018),
    Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68.
    doi:`10.1111/ectj.12097 <https://doi.org/10.1111/ectj.12097>`_.
    """
    # Validate both arguments before touching the network, so that a bad call fails
    # immediately. The order of the two checks is kept: polynomial_features wins.
    if polynomial_features:
        raise NotImplementedError("polynomial_features is not implemented yet for fetch_401K.")

    as_data_frame = return_type in _get_data_frame_alias()
    if not as_data_frame and return_type not in _get_dml_data_alias():
        raise ValueError("Invalid return_type.")

    # Official GitHub host, same repository that fetch_bonus downloads from.
    url = "https://github.com/VC2015/DMLonGitHub/raw/master/sipp1991.dta"
    data = pd.read_stata(url)

    if as_data_frame:
        return data

    y_col = "net_tfa"
    d_cols = ["e401"]
    x_cols = ["age", "inc", "educ", "fsize", "marr", "twoearn", "db", "pira", "hown"]
    return DoubleMLData(data, y_col, d_cols, x_cols)

doubleml/datasets/fetch_bonus.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
"""
2+
Data set on the Pennsylvania Reemployment Bonus experiment.
3+
"""
4+
5+
import numpy as np
6+
import pandas as pd
7+
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
8+
from doubleml import DoubleMLData
9+
10+
11+
def _get_array_alias():
    """Return a new list with the aliases ``'array'``, ``'np.array'`` and ``'np.ndarray'``."""
    # NOTE(review): presumably the ``return_type`` spellings for array output;
    # nothing in this module returns arrays for them -- confirm the intent.
    array_aliases = ["array", "np.array", "np.ndarray"]
    return array_aliases
13+
14+
15+
def _get_data_frame_alias():
    """
    Return the ``return_type`` values that request a ``pd.DataFrame``.

    Besides the string spellings, the ``pd.DataFrame`` class itself is listed, so that
    ``return_type=pd.DataFrame`` passes a membership test against the returned list.

    Returns
    -------
    list
        A new list on every call: the three string aliases followed by the class object.
    """
    return ["DataFrame", "pd.DataFrame", "pandas.DataFrame", pd.DataFrame]
17+
18+
19+
def _get_dml_data_alias():
    """
    Return the ``return_type`` values that request a ``DoubleMLData`` object.

    Besides the string spelling, the ``DoubleMLData`` class itself is listed, so that
    ``return_type=DoubleMLData`` passes a membership test against the returned list.

    Returns
    -------
    list
        A new list on every call: the string alias followed by the class object.
    """
    return ["DoubleMLData", DoubleMLData]
21+
22+
23+
def fetch_bonus(return_type="DoubleMLData", polynomial_features=False):
    """
    Data set on the Pennsylvania Reemployment Bonus experiment.

    The raw file is downloaded on every call; nothing is cached. Rows are restricted to ``tg`` equal to
    0 or 4 (4 is recoded to 1), ``inuidur1`` is log-transformed and ``dep`` is dummy-encoded into
    ``dep1`` and ``dep2``.

    Parameters
    ----------
    return_type :
        If ``'DoubleMLData'`` (or any other alias listed by ``_get_dml_data_alias``), returns a
        ``DoubleMLData`` object.

        If ``'DataFrame'``, ``'pd.DataFrame'`` or ``'pandas.DataFrame'`` (or any other alias listed by
        ``_get_data_frame_alias``), returns a ``pd.DataFrame``.
    polynomial_features :
        If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)).

    Returns
    -------
    DoubleMLData or pd.DataFrame
        For a data-frame alias, the preprocessed table. With ``polynomial_features`` it holds only the
        outcome, the treatment and the degree-2 polynomial covariates. Otherwise a ``DoubleMLData``
        object built from that table with outcome ``inuidur1`` and treatment ``tg``.

    Raises
    ------
    ValueError
        If ``return_type`` is not a recognised alias. Raised before any data is downloaded.

    References
    ----------
    Bilias Y. (2000), Sequential Testing of Duration Data: The Case of Pennsylvania 'Reemployment Bonus' Experiment.
    Journal of Applied Econometrics, 15(6): 575-594.

    Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018),
    Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68.
    doi:`10.1111/ectj.12097 <https://doi.org/10.1111/ectj.12097>`_.
    """
    # Reject an unsupported return_type up front, so that a bad argument does not
    # cost a download and the whole preprocessing pipeline.
    as_data_frame = return_type in _get_data_frame_alias()
    if not as_data_frame and return_type not in _get_dml_data_alias():
        raise ValueError("Invalid return_type.")

    url = "https://raw.githubusercontent.com/VC2015/DMLonGitHub/master/penn_jae.dat"
    raw_data = pd.read_csv(url, sep=r"\s+")

    # Keep only rows with tg in {0, 4}. reset_index() is called without drop=True on
    # purpose: the previous row labels stay as a column, so the returned frame has the
    # same columns as before, and the fresh 0..n-1 index lines up with the
    # positionally built frames used below.
    keep = raw_data["tg"].isin([0, 4])
    data = raw_data.loc[keep].reset_index()
    data["tg"] = data["tg"].replace(4, 1)  # make the treatment indicator binary
    data["inuidur1"] = np.log(data["inuidur1"])

    # variable dep as factor (dummy encoding)
    # NOTE(review): taking columns 0 and 1 assumes ``dep`` has at least three
    # distinct values (the first category is dropped) -- confirm against the data.
    dummy_enc = OneHotEncoder(drop="first", categories="auto").fit(data.loc[:, ["dep"]])
    xx = dummy_enc.transform(data.loc[:, ["dep"]]).toarray()
    data["dep1"] = xx[:, 0]
    data["dep2"] = xx[:, 1]

    y_col = "inuidur1"
    d_cols = ["tg"]
    x_cols = [
        "female",
        "black",
        "othrace",
        "dep1",
        "dep2",
        "q2",
        "q3",
        "q4",
        "q5",
        "q6",
        "agelt35",
        "agegt54",
        "durable",
        "lusd",
        "husd",
    ]

    if polynomial_features:
        poly = PolynomialFeatures(2, include_bias=False)
        data_transf = poly.fit_transform(data[x_cols])
        # From here on x_cols names the generated polynomial features.
        x_cols = list(poly.get_feature_names_out(x_cols))

        data_transf = pd.DataFrame(data_transf, columns=x_cols)
        # Only outcome, treatment and the transformed covariates are kept.
        data = pd.concat((data[[y_col] + d_cols], data_transf), axis=1, sort=False)

    if as_data_frame:
        return data
    return DoubleMLData(data, y_col, d_cols, x_cols)

doubleml/irm/datasets/__init__.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""
2+
The :mod:`doubleml.irm.datasets` module implements data generating processes for interactive regression models.
3+
"""
4+
5+
from .dgp_confounded_irm_data import make_confounded_irm_data
6+
from .dgp_heterogeneous_data import make_heterogeneous_data
7+
from .dgp_iivm_data import make_iivm_data
8+
from .dgp_irm_data import make_irm_data
9+
from .dgp_irm_data_discrete_treatments import make_irm_data_discrete_treatments
10+
from .dgp_ssm_data import make_ssm_data
11+
12+
13+
__all__ = [
14+
"make_confounded_irm_data",
15+
"make_heterogeneous_data",
16+
"make_iivm_data",
17+
"make_irm_data",
18+
"make_irm_data_discrete_treatments",
19+
"make_ssm_data",
20+
]

0 commit comments

Comments
 (0)