-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathresidual.py
114 lines (95 loc) · 3.8 KB
/
residual.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""KSample testing for equality of (conditional) distributions.
If the distributions are marginal distributions being compared, then
the test is a standard two-sample test, where the KS statistic, or
Mann-Whitney U statistic, is used to test for equality of distributions.
If the distributions are conditional distributions being compared, then
the test is a conditional two-sample test, where the KS statistic, or
Mann-Whitney U statistic, is used to test for equality of the
residual distributions, where the residuals are computed by regressing
the target variable, Y, on the conditioning variable, X.
The test statistic is described fully in :footcite:`peters2016causal`
and :footcite:`shah2018goodness`.
"""
import numpy as np
from scipy.stats import kstest
from .pvalue_result import PValueResult
def ksample(Y, Z):
    """Two-sample Kolmogorov-Smirnov test for equality of distributions.

    Parameters
    ----------
    Y : ndarray, shape (n_samples,)
        Target or outcome features.
    Z : list or ndarray, shape (n_samples,)
        Zeros and ones indicating which samples belong to which group.

    Returns
    -------
    result : PValueResult
        Holds the KS statistic (``statistic``) and its p-value (``pvalue``).
    """
    # Coerce Z so boolean masking works for plain Python lists too:
    # with a list, ``Z == 1`` evaluates to a scalar ``False`` rather
    # than an element-wise mask, silently selecting the wrong samples.
    Z = np.asarray(Z)
    stat, pval = kstest(Y[Z == 1], Y[Z == 0])
    return PValueResult(pvalue=pval, statistic=stat)
def condksample(Y, Z, X, residual_test="ks", target_predictor=None, combine_pvalues=True):
    r"""Conditional two-sample test via regression residuals.

    Regresses ``Y`` on ``X``, then tests whether the residual
    distributions of the two groups indicated by ``Z`` are equal.

    Parameters
    ----------
    Y : ndarray, shape (n_samples,)
        Target or outcome features.
    Z : list or ndarray, shape (n_samples,)
        Zeros and ones indicating which samples belong to which group.
    X : ndarray, shape (n_samples, n_features)
        Features to condition on.
    residual_test : {"whitney_levene", "ks"}, default="ks"
        Test of the residuals between the groups.
    target_predictor : sklearn.base.BaseEstimator, default=None
        Method to predict the target given the covariates. If None,
        uses a degree-3 spline regression with the number of knots
        selected by grid search, as described in
        :footcite:`peters2016causal`.
    combine_pvalues : bool, default=True
        Only used for ``residual_test="whitney_levene"``. If True,
        returns the minimum of the two Bonferroni-corrected p-values;
        otherwise returns a tuple of both corrected p-values.

    Returns
    -------
    result : PValueResult
        ``statistic`` is the :math:`R^2` score of the regression fit,
        ``pvalue`` is the residual-test p-value, and
        ``additional_info`` contains the fitted ``target_predictor``.

    Raises
    ------
    ValueError
        If ``residual_test`` is not one of the supported options.
    """
    from sklearn.metrics import r2_score

    if target_predictor is None:
        from sklearn.linear_model import LinearRegression
        from sklearn.model_selection import GridSearchCV
        from sklearn.pipeline import Pipeline
        from sklearn.preprocessing import SplineTransformer

        pipe = Pipeline(
            steps=[
                ("spline", SplineTransformer(n_knots=4, degree=3)),
                ("linear", LinearRegression()),
            ]
        )
        param_grid = {
            "spline__n_knots": [3, 5, 7, 9],
        }
        target_predictor = GridSearchCV(
            pipe, param_grid, n_jobs=-2, refit=True, scoring="neg_mean_squared_error"
        )
    target_predictor.fit(X, Y)

    Y_pred = target_predictor.predict(X)
    residuals = Y - Y_pred
    r2 = r2_score(Y, Y_pred)

    # Build each boolean group mask once. Using ``~group1`` instead of
    # ``np.asarray(1 - Z, dtype=bool)`` also keeps list-valued Z working
    # (``1 - Z`` raises TypeError for a plain list).
    group1 = np.asarray(Z, dtype=bool)
    group0 = ~group1

    if residual_test == "whitney_levene":
        from scipy.stats import levene, mannwhitneyu

        # Mann-Whitney U tests location; Levene tests scale.
        _, mean_pval = mannwhitneyu(residuals[group1], residuals[group0])
        _, var_pval = levene(residuals[group1], residuals[group0])
        # Bonferroni correction for running two tests.
        if combine_pvalues:
            pval = min(mean_pval * 2, var_pval * 2, 1)
        else:
            pval = (min(mean_pval * 2, 1), min(var_pval * 2, 1))
    elif residual_test == "ks":
        # kstest is already imported at module level; no local import needed.
        _, pval = kstest(residuals[group1], residuals[group0])
    else:
        raise ValueError(f"Test {residual_test} not a valid option.")

    return PValueResult(
        statistic=r2, pvalue=pval, additional_info={"target_predictor": target_predictor}
    )