diff --git a/.travis.yml b/.travis.yml index 61a6372e..08a3e390 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ before_install: install: - conda create -n testenv --yes pip python=$PYTHON - source activate testenv - - conda install --yes --quiet numpy=1.11 scipy=0.17 scikit-learn=0.18 pytest pytest-cov + - conda install --yes --quiet nomkl numpy=1.11 scipy=0.17 scikit-learn=0.18 pytest pytest-cov - pip install pandas - pip install -q flake8 - pip install coverage codecov diff --git a/doc/cheatsheet.rst b/doc/cheatsheet.rst index 4baaad87..b890a42b 100644 --- a/doc/cheatsheet.rst +++ b/doc/cheatsheet.rst @@ -6,6 +6,7 @@ This is a simple cheatsheet with the gradients and Hessians of the penalized log likelihood loss to use as updates in the Newton coordinate descent algorithm for GLMs. + Poisson: `softplus` ------------------- @@ -20,14 +21,15 @@ Poisson: `softplus` .. math:: - \mathcal{L} = \sum_i y_i \log(\mu_i) - \sum_i \mu_i + \mathcal{L} = \sum_i w_i \left[ y_i \log(\mu_i) - \mu_i \right] **L2-penalized loss function** .. math:: - J = \frac{1}{n}\sum_i \left\{ \log( 1 + \exp( \beta_0 + \sum_j \beta_j x_{ij} ) ) \right\} \\ - - \frac{1}{n}\sum_i \left\{ y_i \log( \log( 1 + \exp(\beta_0 + \sum_j \beta_j x_{ij} ) ) ) \right\} \\ + J = \frac{1}{n}\sum_i w_i \\ + \left\{ \log( 1 + \exp( \beta_0 + \sum_j \beta_j x_{ij} ) ) \\ + - y_i \log( \log( 1 + \exp(\beta_0 + \sum_j \beta_j x_{ij} ) ) ) \right\} \\ + \lambda (1-\alpha) \frac{1}{2} \sum_j \beta_j^2 **Gradient** @@ -36,8 +38,8 @@ Poisson: `softplus` \mu(z_i) &= \log(1 + \exp(z_i)) \\ \sigma(z_i) &= \frac{1}{1 + \exp(-z_i)} \\ - \frac{\partial J}{\partial \beta_0} &= \frac{1}{n}\sum_i \sigma(z_i) - \frac{1}{n}\sum_i y_i \frac{\sigma(z_i)}{\mu(z_i)} \\ - \frac{\partial J}{\partial \beta_j} &= \frac{1}{n}\sum_i \sigma(z_i) x_{ij} - \frac{1}{n}\sum_i \sigma(z_i) y_i \frac{\sigma(z_i)}{\mu(z_i)}x_{ij} + \lambda (1 - \alpha) \beta_j + \frac{\partial J}{\partial \beta_0} &= \frac{1}{n}\sum_i w_i \sigma(z_i) - \frac{1}{n}\sum_i w_i y_i \frac{\sigma(z_i)}{\mu(z_i)} \\ + \frac{\partial J}{\partial \beta_j} &= \frac{1}{n}\sum_i w_i \sigma(z_i) x_{ij} - \frac{1}{n}\sum_i w_i y_i \frac{\sigma(z_i)}{\mu(z_i)}x_{ij} + \lambda (1 - \alpha) \beta_j **Hessian** @@ -45,10 +47,10 @@ Poisson: `softplus` \mu(z_i) &= \log(1 + \exp(z_i)) \\ \sigma(z_i) &= \frac{1}{1 + \exp(-z_i)} \\ - \frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_i \sigma(z_i) (1 - \sigma(z_i)) - - \frac{1}{n}\sum_i y_i \left\{ \frac{\sigma(z_i) (1 - \sigma(z_i))}{\mu(z_i)} - \frac{\sigma(z_i)}{\mu(z_i)^2} \right\} \\ - \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i \sigma(z_i) (1 - \sigma(z_i)) x_{ij}^2 - - \frac{1}{n}\sum_i y_i \left\{ \frac{\sigma(z_i) (1 - \sigma(z_i))}{\mu(z_i)} - \frac{\sigma(z_i)}{\mu(z_i)^2} \right\} x_{ij}^2 + \frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_i w_i \sigma(z_i) (1 - \sigma(z_i)) + - \frac{1}{n}\sum_i w_i y_i \left\{ \frac{\sigma(z_i) (1 - \sigma(z_i))}{\mu(z_i)} - \frac{\sigma(z_i)}{\mu(z_i)^2} \right\} \\ + \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i w_i \sigma(z_i) (1 - \sigma(z_i)) x_{ij}^2 + - \frac{1}{n}\sum_i w_i y_i \left\{ \frac{\sigma(z_i) (1 - \sigma(z_i))}{\mu(z_i)} - \frac{\sigma(z_i)}{\mu(z_i)^2} \right\} x_{ij}^2 + \lambda (1 - \alpha) Poisson (linearized): `poisson` @@ -70,7 +72,7 @@ Poisson (linearized): `poisson` .. 
math:: - \mathcal{L} = \sum_i y_i \log(\mu_i) - \sum_i \mu_i + \mathcal{L} = \sum_i w_i \left[ y_i \log(\mu_i) - \mu_i \right] **L2-penalized loss function** @@ -89,10 +91,10 @@ Poisson (linearized): `poisson` \exp(\eta)z_i + (1-\eta)\exp(\eta), & z_i > \eta \end{cases} \\ - \frac{\partial J}{\partial \beta_0} &= \frac{1}{n}\sum_{i; z_i \leq \eta} (\mu_i - y_i) - + \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta) (1 - y_i/\mu_i) \\ - \frac{\partial J}{\partial \beta_j} &= \frac{1}{n}\sum_{i; z_i \leq \eta} (\mu_i - y_i) x_{ij} - + \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta) (1 - y_i/\mu_i) x_{ij} + \frac{\partial J}{\partial \beta_0} &= \frac{1}{n}\sum_{i; z_i \leq \eta} w_i (\mu_i - y_i) + + \frac{1}{n}\sum_{i; z_i > \eta} w_i \exp(\eta) (1 - y_i/\mu_i) \\ + \frac{\partial J}{\partial \beta_j} &= \frac{1}{n}\sum_{i; z_i \leq \eta} w_i (\mu_i - y_i) x_{ij} + + \frac{1}{n}\sum_{i; z_i > \eta} w_i \exp(\eta) (1 - y_i/\mu_i) x_{ij} **Hessian** @@ -105,10 +107,10 @@ Poisson (linearized): `poisson` \exp(\eta)z_i + (1-\eta)\exp(\eta), & z_i > \eta \end{cases} \\ - \frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_{i; z_i \leq \eta} \mu_i - + \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta)^2 \frac{y_i}{\mu_i^2} \\ - \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_{i; z_i \leq \eta} \mu_i x_{ij}^2 - + \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta)^2 \frac{y_i}{\mu_i^2} x_{ij}^2 + \frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_{i; z_i \leq \eta} w_i \mu_i + + \frac{1}{n}\sum_{i; z_i > \eta} w_i \exp(\eta)^2 \frac{y_i}{\mu_i^2} \\ + \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_{i; z_i \leq \eta} w_i \mu_i x_{ij}^2 + + \frac{1}{n}\sum_{i; z_i > \eta} w_i \exp(\eta)^2 \frac{y_i}{\mu_i^2} x_{ij}^2 + \lambda (1 - \alpha) Gaussian: `gaussian` @@ -125,13 +127,13 @@ Gaussian: `gaussian` .. math:: - \mathcal{L} = -\frac{1}{2} \sum_i (y_i - \mu_i)^2 \\ + \mathcal{L} = -\frac{1}{2} \sum_i w_i (y_i - \mu_i)^2 \\ **L2-penalized loss function** .. math:: - J = \frac{1}{2n}\sum_i (y_i - (\beta_0 + \sum_j \beta_j x_{ij}))^2 + + J = \frac{1}{2n}\sum_i w_i (y_i - \mu_i)^2 + \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\\ **Gradient** @@ -139,8 +141,8 @@ Gaussian: `gaussian` .. math:: \mu(z_i) &= z_i \\ - \frac{\partial J}{\partial \beta_0} &= -\frac{1}{n}\sum_i (y_i - \mu_i) \\ - \frac{\partial J}{\partial \beta_j} &= -\frac{1}{n}\sum_i (y_i - \mu_i) x_{ij} + \frac{\partial J}{\partial \beta_0} &= -\frac{1}{n}\sum_i w_i (y_i - \mu_i) \\ + \frac{\partial J}{\partial \beta_j} &= -\frac{1}{n}\sum_i w_i (y_i - \mu_i) x_{ij} + \lambda (1 - \alpha) \beta_j **Hessian** @@ -148,7 +150,7 @@ Gaussian: `gaussian` .. math:: \frac{\partial^2 J}{\partial \beta_0^2} &= 1 \\ - \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i x_{ij}^2 + \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i w_i x_{ij}^2 + \lambda (1 - \alpha) Logistic: `binomial` @@ -165,13 +167,13 @@ Logistic: `binomial` .. math:: - \mathcal{L} = \sum_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} \\ + \mathcal{L} = \sum_i w_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} \\ **L2-penalized loss function** .. math:: - J = -\frac{1}{n}\sum_i \left\{ y_i \log(\mu_i) + + J = -\frac{1}{n}\sum_i w_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} + \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\\ @@ -181,16 +183,16 @@ Logistic: `binomial` .. 
math:: \mu(z_i) &= \frac{1}{1 + \exp(-z_i)} \\ - \frac{\partial J}{\partial \beta_0} &= -\frac{1}{n}\sum_i (y_i - \mu_i) \\ - \frac{\partial J}{\partial \beta_j} &= -\frac{1}{n}\sum_i (y_i - \mu_i) x_{ij} + \frac{\partial J}{\partial \beta_0} &= -\frac{1}{n}\sum_i w_i (y_i - \mu_i) \\ + \frac{\partial J}{\partial \beta_j} &= -\frac{1}{n}\sum_i w_i (y_i - \mu_i) x_{ij} + \lambda (1 - \alpha) \beta_j **Hessian** .. math:: - \frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_i \mu_i (1 - \mu_i) \\ - \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i \mu_i (1 - \mu_i) x_{ij}^2 + \frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_i w_i \mu_i (1 - \mu_i) \\ + \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i w_i \mu_i (1 - \mu_i) x_{ij}^2 + \lambda (1 - \alpha) Logistic: `probit` @@ -209,13 +211,13 @@ where :math:`\Phi(z_i)` is the standard normal cumulative distribution function. .. math:: - \mathcal{L} = \sum_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} \\ + \mathcal{L} = \sum_i w_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} \\ **L2-penalized loss function** .. math:: - J = J = -\frac{1}{n}\sum_i \left\{ y_i \log(\mu_i) + + J = J = -\frac{1}{n}\sum_i w_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} + \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\\ @@ -233,9 +235,9 @@ where :math:`\Phi(z_i)` and :math:`\phi(z_i)` are the standard normal cdf and pd .. math:: \frac{\partial J}{\partial \beta_0} &= - -\frac{1}{n}\sum_i \Bigg\{y_i \frac{\mu'(z_i)}{\mu(z_i)} - (1 - y_i)\frac{\mu'(z_i)}{1 - \mu(z_i)}\Bigg\} \\ + -\frac{1}{n}\sum_i w_i \Bigg\{y_i \frac{\mu'(z_i)}{\mu(z_i)} - (1 - y_i)\frac{\mu'(z_i)}{1 - \mu(z_i)}\Bigg\} \\ \frac{\partial J}{\partial \beta_j} &= - -\frac{1}{n}\sum_i \Bigg\{y_i \frac{\mu'(z_i)}{\mu(z_i)} - (1 - y_i)\frac{\mu'(z_i)}{1 - \mu(z_i)}\Bigg\} x_{ij} + -\frac{1}{n}\sum_i w_i \Bigg\{y_i \frac{\mu'(z_i)}{\mu(z_i)} - (1 - y_i)\frac{\mu'(z_i)}{1 - \mu(z_i)}\Bigg\} x_{ij} + \lambda (1 - \alpha) \beta_j @@ -244,10 +246,10 @@ where :math:`\Phi(z_i)` and :math:`\phi(z_i)` are the standard normal cdf and pd .. math:: \frac{\partial^2 J}{\partial \beta_0^2} &= - \frac{1}{n}\sum_i \mu'(z_i) \Bigg\{y_i \frac{z_i\mu(z_i) + \mu'(z_i)}{\mu^2(z_i)} + + \frac{1}{n}\sum_i w_i \mu'(z_i) \Bigg\{y_i \frac{z_i\mu(z_i) + \mu'(z_i)}{\mu^2(z_i)} + (1 - y_i)\frac{-z_i(1 - \mu(z_i)) + \mu'(z_i)}{(1 - \mu(z_i))^2} \Bigg\} \\ \frac{\partial^2 J}{\partial \beta_j^2} &= - \frac{1}{n}\sum_i \mu'(z_i) \Bigg\{y_i \frac{z_i\mu(z_i) + \mu'(z_i)}{\mu^2(z_i)} + + \frac{1}{n}\sum_i w_i \mu'(z_i) \Bigg\{y_i \frac{z_i\mu(z_i) + \mu'(z_i)}{\mu^2(z_i)} + (1 - y_i)\frac{-z_i(1 - \mu(z_i)) + \mu'(z_i)}{(1 - \mu(z_i))^2} \Bigg\} x_{ij}^2 + \lambda (1 - \alpha) @@ -265,7 +267,7 @@ Gamma .. math:: - \mathcal{L} = \sum_{i} \nu\Bigg\{\frac{-y_i}{\mu_i} - log(\mu_i)\Bigg\} + \mathcal{L} = \sum_{i} \nu w_i \Bigg\{\frac{-y_i}{\mu_i} - log(\mu_i)\Bigg\} where :math:`\nu` is the shape parameter. It is exponential for :math:`\nu = 1` and normal for :math:`\nu = \infty`. @@ -274,16 +276,16 @@ and normal for :math:`\nu = \infty`. .. math:: - J = -\frac{1}{n}\sum_{i} \nu\Bigg\{\frac{-y_i}{\mu_i} - log(\mu_i)\Bigg\} + J = -\frac{1}{n}\sum_{i} \nu w_i \Bigg\{\frac{-y_i}{\mu_i} - log(\mu_i)\Bigg\} + \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\\ **Gradient** .. 
math:: - \frac{\partial J}{\partial \beta_0} &= \frac{1}{n} \sum_{i} \nu\Bigg\{\frac{y_i}{\mu_i^2} + \frac{\partial J}{\partial \beta_0} &= \frac{1}{n} \sum_{i} w_i \nu\Bigg\{\frac{y_i}{\mu_i^2} - \frac{1}{\mu_i}\Bigg\}{\mu_i'} \\ - \frac{\partial J}{\partial \beta_j} &= \frac{1}{n} \sum_{i} \nu\Bigg\{\frac{y_i}{\mu_i^2} + \frac{\partial J}{\partial \beta_j} &= \frac{1}{n} \sum_{i} w_i \nu\Bigg\{\frac{y_i}{\mu_i^2} - \frac{1}{\mu_i}\Bigg\}{\mu_i'}x_{ij} + \lambda (1 - \alpha) \beta_j where :math:`\mu_i' = \frac{1}{1 + \exp(-z_i)}`. diff --git a/pyglmnet/datasets.py b/pyglmnet/datasets.py index 073c00ea..51f8f44c 100644 --- a/pyglmnet/datasets.py +++ b/pyglmnet/datasets.py @@ -7,7 +7,7 @@ import tempfile import itertools import numpy as np -from scipy.misc import comb +from scipy.special import comb from .externals.six.moves import urllib diff --git a/pyglmnet/metrics.py b/pyglmnet/metrics.py index ee22fb6f..426d2b4a 100644 --- a/pyglmnet/metrics.py +++ b/pyglmnet/metrics.py @@ -4,7 +4,7 @@ from .pyglmnet import _logL -def deviance(y, yhat, distr): +def deviance(y, yhat, sample_weight, distr): """Deviance metrics. Parameters @@ -15,6 +15,9 @@ def deviance(y, yhat, distr): yhat : array Predicted labels of shape (n_samples, ) + sample_weight : array + Sample weights of shape (n_samples, ) + distr: str distribution @@ -24,16 +27,16 @@ def deviance(y, yhat, distr): Deviance of the predicted labels. """ if distr in ['softplus', 'poisson']: - LS = _logL(distr, y, y) + LS = _logL(distr, y, y, w=sample_weight) else: LS = 0 - L1 = _logL(distr, y, yhat) + L1 = _logL(distr, y, yhat, w=sample_weight) score = -2 * (L1 - LS) return score -def pseudo_R2(X, y, yhat, ynull_, distr): +def pseudo_R2(X, y, yhat, ynull_, sample_weight, distr): """Pseudo-R2 metric. Parameters @@ -47,6 +50,9 @@ def pseudo_R2(X, y, yhat, ynull_, distr): ynull_ : float Mean of the target labels (null model prediction) + sample_weight : array + Sample weights of shape (n_samples, ) + distr: str distribution @@ -56,12 +62,12 @@ def pseudo_R2(X, y, yhat, ynull_, distr): Pseudo-R2 score. """ if distr in ['softplus', 'poisson']: - LS = _logL(distr, y, y) + LS = _logL(distr, y, y, w=sample_weight) else: LS = 0 - L0 = _logL(distr, y, ynull_) - L1 = _logL(distr, y, yhat) + L0 = _logL(distr, y, ynull_, w=sample_weight) + L1 = _logL(distr, y, yhat, w=sample_weight) if distr in ['softplus', 'poisson']: score = (1 - (LS - L1) / (LS - L0)) @@ -70,7 +76,7 @@ def pseudo_R2(X, y, yhat, ynull_, distr): return score -def accuracy(y, yhat): +def accuracy(y, yhat, sample_weight): """Accuracy as ratio of correct predictions. Parameters @@ -81,9 +87,12 @@ def accuracy(y, yhat): yhat : array Predicted labels of shape (n_samples, ) + sample_weight : array + Sample weights of shape (n_samples, ) + Returns ------- accuracy : float Accuracy score. 
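The weighted metrics above all reduce to simple weighted sums. A minimal standalone sketch (hypothetical arrays, not part of the patch) of the weighted accuracy returned just below::

    import numpy as np

    y = np.array([1, 0, 1, 1])
    yhat = np.array([1, 0, 0, 1])
    sample_weight = np.array([1.0, 1.0, 2.0, 1.0])

    # weighted fraction of correct predictions: 3 / 5 = 0.6
    # (the single misclassified sample carries twice the weight)
    acc = float(np.dot(sample_weight, (y == yhat))) / np.sum(sample_weight)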
""" - return float(np.sum(y == yhat)) / yhat.shape[0] + return float(np.dot(sample_weight, (y == yhat))) / np.sum(sample_weight) diff --git a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index b50178ce..ecfac5c9 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -111,33 +111,35 @@ def _grad_mu(distr, z, eta): return grad_mu -def _logL(distr, y, y_hat, z=None): +def _logL(distr, y, y_hat, w, z=None): """The log likelihood.""" if distr in ['softplus', 'poisson']: eps = np.spacing(1) - logL = np.sum(y * np.log(y_hat + eps) - y_hat) + logL = np.sum(w * (y * np.log(y_hat + eps) - y_hat)) elif distr == 'gaussian': - logL = -0.5 * np.sum((y - y_hat)**2) + logL = -0.5 * np.sum(w * ((y - y_hat) ** 2)) elif distr == 'binomial': # prevents underflow if z is not None: - logL = np.sum(y * z - np.log(1 + np.exp(z))) + logL = np.sum(w * (y * z - np.log(1 + np.exp(z)))) # for scoring else: - logL = np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) + logL = np.sum(w * (y * np.log(y_hat) + + (1 - y) * np.log(1 - y_hat))) elif distr == 'probit': if z is not None: pdfz, cdfz = norm.pdf(z), norm.cdf(z) - logL = np.sum(y * _probit_g1(z, pdfz, cdfz) + - (1 - y) * _probit_g2(z, pdfz, cdfz)) + logL = np.sum(w * (y * _probit_g1(z, pdfz, cdfz) + + (1 - y) * _probit_g2(z, pdfz, cdfz))) else: - logL = np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) + logL = np.sum(w * (y * np.log(y_hat) + + (1 - y) * np.log(1 - y_hat))) elif distr == 'gamma': # see # https://www.statistics.ma.tum.de/fileadmin/w00bdb/www/czado/lec8.pdf nu = 1. # shape parameter, exponential for now - logL = np.sum(nu * (-y / y_hat - np.log(y_hat))) + logL = np.sum(w * (nu * (-y / y_hat - np.log(y_hat)))) return logL @@ -183,29 +185,29 @@ def _L1penalty(beta, group=None): return L1penalty -def _loss(distr, alpha, Tau, reg_lambda, X, y, eta, group, beta): +def _loss(distr, alpha, Tau, reg_lambda, X, y, w, eta, group, beta): """Define the objective function for elastic net.""" n_samples = X.shape[0] z = beta[0] + np.dot(X, beta[1:]) y_hat = _mu(distr, z, eta) - L = 1. / n_samples * _logL(distr, y, y_hat, z) + L = 1. / n_samples * _logL(distr, y, y_hat, w=w, z=z) P = _penalty(alpha, beta[1:], Tau, group) J = -L + reg_lambda * P return J -def _L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, group, beta): +def _L2loss(distr, alpha, Tau, reg_lambda, X, y, w, eta, group, beta): """Define the objective function for elastic net.""" n_samples = X.shape[0] z = beta[0] + np.dot(X, beta[1:]) y_hat = _mu(distr, z, eta) - L = 1. / n_samples * _logL(distr, y, y_hat, z) + L = 1. 
/ n_samples * _logL(distr, y, y_hat, w=w, z=z) P = 0.5 * (1 - alpha) * _L2penalty(beta[1:], Tau) J = -L + reg_lambda * P return J -def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, beta): +def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, w, eta, beta): """The gradient.""" n_samples = np.float(X.shape[0]) @@ -218,29 +220,29 @@ def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, beta): grad_mu = _grad_mu(distr, z, eta) if distr in ['poisson', 'softplus']: - grad_beta0 = np.sum(grad_mu) - np.sum(y * grad_mu / mu) - grad_beta = ((np.dot(grad_mu.T, X) - - np.dot((y * grad_mu / mu).T, X)).T) + grad_beta0 = np.sum(w * grad_mu) - np.sum(w * y * grad_mu / mu) + grad_beta = ((np.dot(w * grad_mu.T, X) - + np.dot((w * y * grad_mu / mu).T, X)).T) elif distr == 'gaussian': - grad_beta0 = np.sum((mu - y) * grad_mu) - grad_beta = np.dot((mu - y).T, X * grad_mu[:, None]).T + grad_beta0 = np.sum(w * (mu - y) * grad_mu) + grad_beta = np.dot((w * (mu - y)).T, X * grad_mu[:, None]).T elif distr == 'binomial': - grad_beta0 = np.sum(mu - y) - grad_beta = np.dot((mu - y).T, X).T + grad_beta0 = np.sum(w * (mu - y)) + grad_beta = np.dot((w * (mu - y)).T, X).T elif distr == 'probit': grad_logl = (y * _probit_g3(z, grad_mu, mu) - (1 - y) * _probit_g4(z, grad_mu, mu)) - grad_beta0 = -np.sum(grad_logl) - grad_beta = -np.dot(grad_logl.T, X).T + grad_beta0 = -np.sum(w * grad_logl) + grad_beta = -np.dot((w * grad_logl).T, X).T elif distr == 'gamma': nu = 1. grad_logl = (y / mu ** 2 - 1 / mu) * grad_mu - grad_beta0 = -nu * np.sum(grad_logl) - grad_beta = -nu * np.dot(grad_logl.T, X).T + grad_beta0 = -nu * np.sum(w * grad_logl) + grad_beta = -nu * np.dot((w * grad_logl).T, X).T grad_beta0 *= 1. / n_samples grad_beta *= 1. / n_samples @@ -252,7 +254,7 @@ def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, beta): return g -def _gradhess_logloss_1d(distr, xk, y, z, eta): +def _gradhess_logloss_1d(distr, xk, y, w, z, eta): """ Compute gradient (1st derivative) and Hessian (2nd derivative) @@ -279,41 +281,42 @@ def _gradhess_logloss_1d(distr, xk, y, z, eta): if distr == 'softplus': mu = _mu(distr, z, eta) s = expit(z) - gk = np.sum(s * xk) - np.sum(y * s / mu * xk) + gk = np.sum(w * s * xk) - np.sum(w * y * s / mu * xk) grad_s = s * (1 - s) grad_s_by_mu = grad_s / mu - s / (mu ** 2) - hk = np.sum(grad_s * xk ** 2) - np.sum(y * grad_s_by_mu * xk ** 2) + hk = np.sum(w * grad_s * xk ** 2) - \ + np.sum(w * y * grad_s_by_mu * xk ** 2) elif distr == 'poisson': mu = _mu(distr, z, eta) s = expit(z) - gk = np.sum((mu[z <= eta] - y[z <= eta]) * + gk = np.sum(w[z <= eta] * (mu[z <= eta] - y[z <= eta]) * xk[z <= eta]) + \ np.exp(eta) * \ - np.sum((1 - y[z > eta] / mu[z > eta]) * + np.sum(w[z > eta] * (1 - y[z > eta] / mu[z > eta]) * xk[z > eta]) - hk = np.sum(mu[z <= eta] * xk[z <= eta] ** 2) + \ + hk = np.sum(w[z <= eta] * mu[z <= eta] * xk[z <= eta] ** 2) + \ np.exp(eta) ** 2 * \ - np.sum(y[z > eta] / (mu[z > eta] ** 2) * + np.sum(w[z > eta] * y[z > eta] / (mu[z > eta] ** 2) * (xk[z > eta] ** 2)) elif distr == 'gaussian': - gk = np.sum((z - y) * xk) - hk = np.sum(xk * xk) + gk = np.sum(w * (z - y) * xk) + hk = np.sum(w * xk * xk) elif distr == 'binomial': mu = _mu(distr, z, eta) - gk = np.sum((mu - y) * xk) - hk = np.sum(mu * (1.0 - mu) * xk * xk) + gk = np.sum(w * (mu - y) * xk) + hk = np.sum(w * mu * (1.0 - mu) * xk * xk) elif distr == 'probit': pdfz = norm.pdf(z) cdfz = norm.cdf(z) - gk = -np.sum((y * _probit_g3(z, pdfz, cdfz) - - (1 - y) * _probit_g4(z, pdfz, cdfz)) * xk) - hk = np.sum((y * _probit_g5(z, 
pdfz, cdfz) + - (1 - y) * _probit_g6(z, pdfz, cdfz)) * (xk * xk)) + gk = -np.sum(w * (y * _probit_g3(z, pdfz, cdfz) - + (1 - y) * _probit_g4(z, pdfz, cdfz)) * xk) + hk = np.sum(w * (y * _probit_g5(z, pdfz, cdfz) + + (1 - y) * _probit_g6(z, pdfz, cdfz)) * (xk * xk)) elif distr == 'gamma': raise NotImplementedError('cdfast is not implemented for Gamma ' @@ -478,6 +481,8 @@ def __init__(self, distr='poisson', alpha=0.5, self.solver = solver self.learning_rate = learning_rate self.max_iter = max_iter + self.n_iter = 0 + self.converged = False self.beta0_ = None self.beta_ = None self.ynull_ = None @@ -595,7 +600,7 @@ def _prox(self, beta, thresh): return result - def _cdfast(self, X, y, z, ActiveSet, beta, rl): + def _cdfast(self, X, y, w, ActiveSet, beta, rl): """ Perform one cycle of Newton updates for all coordinates. @@ -607,9 +612,10 @@ def _cdfast(self, X, y, z, ActiveSet, beta, rl): y : array Labels to the data n_samples x 1 - z: array + w : array + Weights to the data n_samples x 1 - beta[0] + X * beta[1:] + ActiveSet: array n_features + 1 x 1 Active set storing which betas are non-zero @@ -624,9 +630,6 @@ def _cdfast(self, X, y, z, ActiveSet, beta, rl): beta: array (n_features + 1) x 1 Updated parameters - z: array - beta[0] + X * beta[1:] - (n_features + 1) x 1 """ n_samples = X.shape[0] n_features = X.shape[1] @@ -641,7 +644,9 @@ def _cdfast(self, X, y, z, ActiveSet, beta, rl): xk = np.ones((n_samples, )) # Calculate grad and hess of log likelihood term - gk, hk = _gradhess_logloss_1d(self.distr, xk, y, z, self.eta) + z = beta[0] + np.dot(X, beta[1:]) + gk, hk = \ + _gradhess_logloss_1d(self.distr, xk, y, w, z, self.eta) # Add grad and hess of regularization term if self.Tau is None: @@ -656,10 +661,10 @@ def _cdfast(self, X, y, z, ActiveSet, beta, rl): # Update parameters, z update = 1. / hk * gk - beta[k], z = beta[k] - update, z - update * xk - return beta, z + beta[k] = beta[k] - update + return beta - def fit(self, X, y): + def fit(self, X, y, sample_weight=None): """The fit function. 
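As context for the weighted coordinate-descent changes above, a minimal sketch (hypothetical data, Gaussian branch only) of the per-coordinate gradient, Hessian, and Newton step that `_gradhess_logloss_1d` and `_cdfast` now compute with sample weights::

    import numpy as np

    rng = np.random.RandomState(0)
    n_samples = 50
    xk = rng.randn(n_samples)       # one column (coordinate k) of X
    y = rng.randn(n_samples)
    w = np.ones(n_samples)          # sample weights (already mean-normalized)
    z = np.zeros(n_samples)         # current linear predictor beta0 + X beta

    # weighted 1-D gradient and Hessian, mirroring the Gaussian branch above
    gk = np.sum(w * (z - y) * xk)
    hk = np.sum(w * xk * xk)

    # one Newton step for coordinate k, as applied in _cdfast:
    # beta[k] = beta[k] - update
    update = 1. / hk * gk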
Parameters @@ -694,6 +699,13 @@ def fit(self, X, y): n_features = X.shape[1] + # normalize weights so that X.shape[0] = sum(sample_weight) + # as assumed by weighted sum calculations in loss functions + if sample_weight is None: + sample_weight = np.ones_like(y) + else: + sample_weight /= np.mean(sample_weight) + # Initialize parameters beta = np.zeros((n_features + 1,)) if self.beta0_ is None and self.beta_ is None: @@ -713,31 +725,43 @@ def fit(self, X, y): if self.solver == 'cdfast': ActiveSet = np.ones(n_features + 1) # init active set - z = beta[0] + np.dot(X, beta[1:]) # cache z # Iterative updates for t in range(0, self.max_iter): + logger.info("t: %i" % t) + convergence_metric = float('inf') if self.solver == 'batch-gradient': grad = _grad_L2loss(self.distr, alpha, self.Tau, - reg_lambda, X, y, self.eta, + reg_lambda, X, y, sample_weight, self.eta, beta) # Converged if the norm(gradient) < tol - if (t > 1) and (np.linalg.norm(grad) < tol): - msg = ('\tConverged in {0:d} iterations'.format(t)) - logger.info(msg) - break + convergence_metric = np.linalg.norm(grad) + logger.debug("convergence_metric: %f" % convergence_metric) + if (t > 1) and (convergence_metric < tol): + self.converged = True + self.n_iter += t + msg = ('\tConverged in {0:d} iterations' + .format(self.n_iter)) + logger.info(msg) + break beta = beta - self.learning_rate * grad elif self.solver == 'cdfast': beta_old = deepcopy(beta) - beta, z = \ - self._cdfast(X, y, z, ActiveSet, beta, reg_lambda) + + beta = self._cdfast(X, y, sample_weight, + ActiveSet, beta, reg_lambda) # Converged if the norm(update) < tol - if (t > 1) and (np.linalg.norm(beta - beta_old) < tol): - msg = ('\tConverged in {0:d} iterations'.format(t)) - logger.info(msg) - break + convergence_metric = np.linalg.norm(beta - beta_old) + logger.debug("convergence_metric: %f" % convergence_metric) + if (t > 1) and (convergence_metric < tol): + self.converged = True + self.n_iter += t + msg = ('\tConverged in {0:d} iterations' + .format(self.n_iter)) + logger.info(msg) + break # Apply proximal operator beta[1:] = self._prox(beta[1:], reg_lambda * alpha) @@ -749,11 +773,21 @@ def fit(self, X, y): # Compute and save loss if callbacks are requested if callable(self.callback): self.callback(beta) + else: + # Warn if it hit max_iter without converging + self.converged = False + self.n_iter += t + 1 + msg = ('\t' + 'Failed to converge after {0:d} iterations.' + ' Last convergence metric was {1:f}' + ' and convergence threshold {2:f}.' + ).format(self.n_iter, convergence_metric, tol) + logger.warning(msg) # Update the estimated variables self.beta0_ = beta[0] self.beta_ = beta[1:] - self.ynull_ = np.mean(y) + self.ynull_ = np.sum(sample_weight * y) / np.sum(sample_weight) return self def predict(self, X): @@ -813,7 +847,7 @@ def predict_proba(self, X): yhat = np.asarray(yhat) return yhat - def fit_predict(self, X, y): + def fit_predict(self, X, y, sample_weight=None): """Fit the model and predict on the same data. Parameters @@ -828,9 +862,9 @@ def fit_predict(self, X, y): yhat : array The predicted targets of shape (n_samples,). """ - return self.fit(X, y).predict(X) + return self.fit(X, y, sample_weight).predict(X) - def score(self, X, y): + def score(self, X, y, sample_weight=None): """Score the model. 
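A small numerical illustration (hypothetical values) of the weight handling used when fitting and scoring: weights are rescaled so their mean is 1, which keeps `sum(sample_weight)` equal to `n_samples` as the `1/n` factors in the losses assume, and the null prediction `ynull_` becomes a weighted mean::

    import numpy as np

    y = np.array([2.0, 0.0, 4.0])
    sample_weight = np.array([2.0, 1.0, 3.0])

    # rescale so mean(sample_weight) == 1, i.e. sum(sample_weight) == len(y)
    sample_weight = sample_weight / np.mean(sample_weight)

    # weighted null model prediction, stored as ynull_
    ynull_ = np.sum(sample_weight * y) / np.sum(sample_weight)
    # ynull_ is about 2.67, the weighted mean of y (normalization cancels)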
Parameters @@ -865,6 +899,10 @@ def score(self, X, y): 'or multinomial distributions') y = y.ravel() + if sample_weight is None: + sample_weight = np.ones_like(y) + else: + sample_weight /= np.mean(sample_weight) if self.distr == 'binomial' and self.score_metric != 'accuracy': yhat = self.predict_proba(X) @@ -873,11 +911,12 @@ def score(self, X, y): # Check whether we have a list of estimators or a single estimator if self.score_metric == 'deviance': - return metrics.deviance(y, yhat, self.distr) + return metrics.deviance(y, yhat, sample_weight, self.distr) elif self.score_metric == 'pseudo_R2': - return metrics.pseudo_R2(X, y, yhat, self.ynull_, self.distr) + return metrics.pseudo_R2(X, y, yhat, self.ynull_, + sample_weight, self.distr) if self.score_metric == 'accuracy': - return metrics.accuracy(y, yhat) + return metrics.accuracy(y, yhat, sample_weight) class GLMCV(object): @@ -1052,7 +1091,7 @@ def copy(self): """ return deepcopy(self) - def fit(self, X, y): + def fit(self, X, y, sample_weight=None): """The fit function. Parameters ---------- @@ -1069,7 +1108,12 @@ def fit(self, X, y): """ logger.info('Looping through the regularization path') glms, scores = list(), list() - self.ynull_ = np.mean(y) + if sample_weight is None: + sample_weight = np.ones_like(y) + else: + sample_weight /= np.mean(sample_weight) + + self.ynull_ = np.sum(sample_weight * y) / np.sum(sample_weight) if not type(int): raise ValueError('cv must be int. We do not support scikit-learn ' @@ -1104,15 +1148,16 @@ def fit(self, X, y): else: glm.beta0_, glm.beta_ = glms[-1].beta0_, glms[-1].beta_ - glm.fit(X[train], y[train]) - scores_fold.append(glm.score(X[val], y[val])) + glm.fit(X[train], y[train], sample_weight[train]) + scores_fold.append( + glm.score(X[val], y[val], sample_weight[val])) scores.append(np.mean(scores_fold)) if idx == 0: glm.beta0_, glm.beta_ = self.beta0_, self.beta_ else: glm.beta0_, glm.beta_ = glms[-1].beta0_, glms[-1].beta_ - glm.fit(X, y) + glm.fit(X, y, sample_weight) glms.append(glm) # Update the estimated variables if self.score_metric == 'deviance': @@ -1162,7 +1207,7 @@ def predict_proba(self, X): """ return self.glm_.predict_proba(X) - def fit_predict(self, X, y): + def fit_predict(self, X, y, sample_weight=None): """Fit the model and predict on the same data. Parameters @@ -1177,10 +1222,10 @@ def fit_predict(self, X, y): The predicted targets of shape based on the model with optimal reg_lambda (n_samples,) """ - self.fit(X, y) + self.fit(X, y, sample_weight) return self.glm_.predict(X) - def score(self, X, y): + def score(self, X, y, sample_weight=None): """Score the model. Parameters @@ -1197,4 +1242,4 @@ def score(self, X, y): score: float The score metric for the optimal reg_lambda """ - return self.glm_.score(X, y) + return self.glm_.score(X, y, sample_weight) diff --git a/setup.cfg b/setup.cfg index b5903cc1..40e9a84a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,7 +7,11 @@ ignore-files = (benchmarks.py|datasets.py) [flake8] exclude = __init__.py,six.py,funcsigs.py -ignore = E241 +ignore = + # W504: Line break occurred after a binary operator + W504, + # W605: invalid escape sequence 'x'. 
Conflicts with Sphinx math in docs + W605 [metadata] description-file = "README.md" diff --git a/setup.py b/setup.py index aa136326..2b3ff498 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,5 @@ 'Operating System :: MacOS', ], platforms='any', - packages=[ - 'pyglmnet' - ], + packages=setuptools.find_packages(), ) diff --git a/tests/test_pyglmnet.py b/tests/test_pyglmnet.py index 20d5202c..6f08444b 100644 --- a/tests/test_pyglmnet.py +++ b/tests/test_pyglmnet.py @@ -1,4 +1,5 @@ from functools import partial +import itertools import numpy as np from numpy.testing import assert_allclose, assert_array_equal @@ -15,6 +16,29 @@ _gradhess_logloss_1d, _loss, datasets) +# TODO move this into a better place in line after branch testing is done +def test_sample_weight_cv(): + """Simple sample weight check.""" + # XXX: don't use scikit-learn for tests. + n_samples = 100 + X, y = make_regression(n_samples=n_samples) + w = np.random.lognormal(0.0, 1.0, n_samples) + w /= w.sum() + cv = KFold(n_splits=5) + + glm_normal = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1) + # check that cv and rest of sklearn interface works + cv_scores = cross_val_score(glm_normal, X, y, + fit_params={'sample_weight': w}, cv=cv) + assert(len(cv_scores) == 5) + + param_grid = [{'alpha': np.linspace(0.01, 0.99, 2)}, + {'reg_lambda': np.logspace(np.log(0.5), np.log(0.01), + 10, base=np.exp(1))}] + glmcv = GridSearchCV(glm_normal, param_grid, cv=cv) + glmcv.fit(X, y) + + def test_gradients(): """Test gradient accuracy.""" # data @@ -22,6 +46,7 @@ def test_gradients(): n_samples, n_features = 1000, 100 X = np.random.normal(0.0, 1.0, [n_samples, n_features]) X = scaler.fit_transform(X) + w = np.ones(n_samples) density = 0.1 beta_ = np.zeros(n_features + 1) @@ -35,9 +60,9 @@ def test_gradients(): y = simulate_glm(glm.distr, beta_[0], beta_[1:], X) func = partial(_L2loss, distr, glm.alpha, - glm.Tau, reg_lambda, X, y, glm.eta, glm.group) + glm.Tau, reg_lambda, X, y, w, glm.eta, glm.group) grad = partial(_grad_L2loss, distr, glm.alpha, glm.Tau, - reg_lambda, X, y, + reg_lambda, X, y, w, glm.eta) approx_grad = approx_fprime(beta_, func, 1.5e-8) analytical_grad = grad(beta_) @@ -164,61 +189,94 @@ def test_glmnet(): distrs = ['softplus', 'gaussian', 'poisson', 'binomial', 'probit'] solvers = ['batch-gradient', 'cdfast'] + reg_lambdas = [0.0, 0.1] score_metric = 'pseudo_R2' learning_rate = 2e-1 random_state = 0 for distr in distrs: - betas_ = list() - for solver in solvers: - - np.random.seed(random_state) - - X_train = np.random.normal(0.0, 1.0, [n_samples, n_features]) - y_train = simulate_glm(distr, beta0, beta, X_train, - sample=False) - - alpha = 0. - reg_lambda = 0. - loss_trace = list() - - def callback(beta): - Tau = None - eta = 2.0 - group = None - - loss_trace.append( - _loss(distr, alpha, Tau, reg_lambda, - X_train, y_train, eta, group, beta)) - - glm = GLM(distr, learning_rate=learning_rate, - reg_lambda=reg_lambda, tol=1e-3, max_iter=5000, - alpha=alpha, solver=solver, score_metric=score_metric, - random_state=random_state, callback=callback) - assert(repr(glm)) - - glm.fit(X_train, y_train) - - # verify loss decreases - assert(np.all(np.diff(loss_trace) <= 1e-7)) - - # verify loss at convergence = loss when beta=beta_ - l_true = _loss(distr, 0., np.eye(beta.shape[0]), 0., - X_train, y_train, 2.0, None, - np.concatenate(([beta0], beta))) - assert_allclose(loss_trace[-1], l_true, rtol=1e-4, atol=1e-5) - # beta=beta_ when reg_lambda = 0. 
- assert_allclose(beta, glm.beta_, rtol=0.05, atol=1e-2) - betas_.append(glm.beta_) - - y_pred = glm.predict(X_train) - assert(y_pred.shape[0] == X_train.shape[0]) - - # compare all solvers pairwise to make sure they're close - for i, first_beta in enumerate(betas_[:-1]): - for second_beta in betas_[i + 1:]: - assert_allclose(first_beta, second_beta, rtol=0.05, atol=1e-2) + for reg_lambda in reg_lambdas: + betas_ = list() + for solver in solvers: + + np.random.seed(random_state) + + X_train = np.random.normal(0.0, 1.0, [n_samples, n_features]) + y_train = simulate_glm(distr, beta0, beta, X_train, + sample=False) + w_train = np.ones_like(y_train) + alpha = 0.5 + loss_trace = list() + + def callback(beta): + Tau = None + eta = 2.0 + group = None + + loss_trace.append( + _loss(distr, alpha, Tau, reg_lambda, + X_train, y_train, w_train, eta, group, beta)) + + glm = GLM(distr, learning_rate=learning_rate, + reg_lambda=reg_lambda, tol=1e-3, max_iter=5000, + alpha=alpha, solver=solver, + score_metric=score_metric, + random_state=random_state, callback=callback) + assert(repr(glm)) + + glm.fit(X_train, y_train) + + # verify that loss doesn't increase more than tolerance + # for too many consecutive iterations + loss_increased = (np.diff(loss_trace) > glm.tol) + loss_increase_runlengths = [sum(grp) for + val, grp in + itertools.groupby(loss_increased) + if val] + loss_increase_runlengths = np.array(loss_increase_runlengths) + assert np.all(loss_increase_runlengths <= 5), \ + ('Loss increased for {maxlen} consecutive iterations' + ' on distr={d} solver={s} with' + ' reg_lambda={rl}' + ).format(d=distr, s=solver, + rl=reg_lambda, + maxlen=np.max(loss_increase_runlengths)) + + if reg_lambda == 0.0: + # check that the true model can be recreated + # almost perfectly when no regularization is applied + # verify loss at convergence = loss when beta=beta_ + l_true = _loss(distr, alpha, np.eye(beta.shape[0]), + reg_lambda, X_train, y_train, w_train, + 2.0, None, np.concatenate(([beta0], beta))) + assert_allclose(loss_trace[-1], l_true, + rtol=1e-4, atol=1e-5, + err_msg=('Final loss trace value different' + ' from true loss ' + ' on distr={d} solver={s}' + ' with reg_lambda={rl}' + ).format(d=distr, s=solver, + rl=reg_lambda)) + # beta=beta_ when reg_lambda = 0. 
+                    assert_allclose(beta, glm.beta_, rtol=0.05, atol=1e-2,
+                                    err_msg=('Fitted beta too different'
+                                             ' from true beta'
+                                             ' in distr={} solver={}'
+                                             ).format(distr, solver))
+                betas_.append(glm.beta_)
+
+                y_pred = glm.predict(X_train)
+                assert y_pred.shape[0] == X_train.shape[0], \
+                    ('Fitted values have wrong number of rows'
+                     ' on distr={d} solver={s} with reg_lambda={rl}'
+                     ).format(d=distr, s=solver, rl=reg_lambda)
+
+            # compare all solvers pairwise to make sure they're close
+            for i, first_beta in enumerate(betas_[:-1]):
+                for second_beta in betas_[i + 1:]:
+                    assert_allclose(first_beta, second_beta,
+                                    rtol=0.05, atol=1e-2)
 
     # test fit_predict
     glm_poisson = GLM(distr='softplus')
@@ -317,7 +375,8 @@ def test_cdfast():
     z = beta_[0] + np.dot(X, beta_[1:])
     k = 1
     xk = X[:, k - 1]
-    gk, hk = _gradhess_logloss_1d(glm.distr, xk, y, z, glm.eta)
+    w = np.ones_like(y)
+    gk, hk = _gradhess_logloss_1d(glm.distr, xk, y, w, z, glm.eta)
 
     # test grad and hess
     if distr != 'multinomial':
@@ -335,10 +394,9 @@
 
     # test cdfast
     ActiveSet = np.ones(n_features + 1)
-    beta_ret, z_ret = glm._cdfast(X, y, z,
-                                  ActiveSet, beta_, glm.reg_lambda)
+    beta_ret = glm._cdfast(X, y, w,
+                           ActiveSet, beta_, glm.reg_lambda)
     assert(beta_ret.shape == beta_.shape)
-    assert(z_ret.shape == z.shape)
 
 
 def test_fetch_datasets():
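For reference, the consecutive-loss-increase check added to `test_glmnet` above can be exercised on its own; a minimal sketch with a hypothetical loss trace::

    import itertools
    import numpy as np

    # toy loss trace: two consecutive upticks above tolerance, then decreasing
    loss_trace = np.array([1.0, 0.8, 0.81, 0.82, 0.5, 0.4])
    tol = 1e-3

    loss_increased = np.diff(loss_trace) > tol
    runlengths = [sum(grp) for val, grp in itertools.groupby(loss_increased)
                  if val]
    # runlengths == [2]: the longest run of consecutive increases is 2,
    # well under the limit of 5 asserted in the test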