diff --git a/.travis.yml b/.travis.yml index 61a6372e..08a3e390 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ before_install: install: - conda create -n testenv --yes pip python=$PYTHON - source activate testenv - - conda install --yes --quiet numpy=1.11 scipy=0.17 scikit-learn=0.18 pytest pytest-cov + - conda install --yes --quiet nomkl numpy=1.11 scipy=0.17 scikit-learn=0.18 pytest pytest-cov - pip install pandas - pip install -q flake8 - pip install coverage codecov diff --git a/doc/cheatsheet.rst b/doc/cheatsheet.rst index 4baaad87..b890a42b 100644 --- a/doc/cheatsheet.rst +++ b/doc/cheatsheet.rst @@ -6,6 +6,7 @@ This is a simple cheatsheet with the gradients and Hessians of the penalized log likelihood loss to use as updates in the Newton coordinate descent algorithm for GLMs. + Poisson: `softplus` ------------------- @@ -20,14 +21,15 @@ Poisson: `softplus` .. math:: - \mathcal{L} = \sum_i y_i \log(\mu_i) - \sum_i \mu_i + \mathcal{L} = \sum_i w_i \left[ y_i \log(\mu_i) - \mu_i \right] **L2-penalized loss function** .. math:: - J = \frac{1}{n}\sum_i \left\{ \log( 1 + \exp( \beta_0 + \sum_j \beta_j x_{ij} ) ) \right\} \\ - - \frac{1}{n}\sum_i \left\{ y_i \log( \log( 1 + \exp(\beta_0 + \sum_j \beta_j x_{ij} ) ) ) \right\} \\ + J = \frac{1}{n}\sum_i w_i \\ + \left\{ \log( 1 + \exp( \beta_0 + \sum_j \beta_j x_{ij} ) ) \\ + - y_i \log( \log( 1 + \exp(\beta_0 + \sum_j \beta_j x_{ij} ) ) ) \right\} \\ + \lambda (1-\alpha) \frac{1}{2} \sum_j \beta_j^2 **Gradient** @@ -36,8 +38,8 @@ Poisson: `softplus` \mu(z_i) &= \log(1 + \exp(z_i)) \\ \sigma(z_i) &= \frac{1}{1 + \exp(-z_i)} \\ - \frac{\partial J}{\partial \beta_0} &= \frac{1}{n}\sum_i \sigma(z_i) - \frac{1}{n}\sum_i y_i \frac{\sigma(z_i)}{\mu(z_i)} \\ - \frac{\partial J}{\partial \beta_j} &= \frac{1}{n}\sum_i \sigma(z_i) x_{ij} - \frac{1}{n}\sum_i \sigma(z_i) y_i \frac{\sigma(z_i)}{\mu(z_i)}x_{ij} + \lambda (1 - \alpha) \beta_j + \frac{\partial J}{\partial \beta_0} &= \frac{1}{n}\sum_i w_i \sigma(z_i) - \frac{1}{n}\sum_i w_i y_i \frac{\sigma(z_i)}{\mu(z_i)} \\ + \frac{\partial J}{\partial \beta_j} &= \frac{1}{n}\sum_i w_i \sigma(z_i) x_{ij} - \frac{1}{n}\sum_i w_i y_i \frac{\sigma(z_i)}{\mu(z_i)}x_{ij} + \lambda (1 - \alpha) \beta_j **Hessian** @@ -45,10 +47,10 @@ Poisson: `softplus` \mu(z_i) &= \log(1 + \exp(z_i)) \\ \sigma(z_i) &= \frac{1}{1 + \exp(-z_i)} \\ - \frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_i \sigma(z_i) (1 - \sigma(z_i)) - - \frac{1}{n}\sum_i y_i \left\{ \frac{\sigma(z_i) (1 - \sigma(z_i))}{\mu(z_i)} - \frac{\sigma(z_i)}{\mu(z_i)^2} \right\} \\ - \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i \sigma(z_i) (1 - \sigma(z_i)) x_{ij}^2 - - \frac{1}{n}\sum_i y_i \left\{ \frac{\sigma(z_i) (1 - \sigma(z_i))}{\mu(z_i)} - \frac{\sigma(z_i)}{\mu(z_i)^2} \right\} x_{ij}^2 + \frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_i w_i \sigma(z_i) (1 - \sigma(z_i)) + - \frac{1}{n}\sum_i w_i y_i \left\{ \frac{\sigma(z_i) (1 - \sigma(z_i))}{\mu(z_i)} - \frac{\sigma(z_i)}{\mu(z_i)^2} \right\} \\ + \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i w_i \sigma(z_i) (1 - \sigma(z_i)) x_{ij}^2 + - \frac{1}{n}\sum_i w_i y_i \left\{ \frac{\sigma(z_i) (1 - \sigma(z_i))}{\mu(z_i)} - \frac{\sigma(z_i)}{\mu(z_i)^2} \right\} x_{ij}^2 + \lambda (1 - \alpha) Poisson (linearized): `poisson` @@ -70,7 +72,7 @@ Poisson (linearized): `poisson` .. 
math:: - \mathcal{L} = \sum_i y_i \log(\mu_i) - \sum_i \mu_i + \mathcal{L} = \sum_i w_i \left[ y_i \log(\mu_i) - \mu_i \right] **L2-penalized loss function** @@ -89,10 +91,10 @@ Poisson (linearized): `poisson` \exp(\eta)z_i + (1-\eta)\exp(\eta), & z_i > \eta \end{cases} \\ - \frac{\partial J}{\partial \beta_0} &= \frac{1}{n}\sum_{i; z_i \leq \eta} (\mu_i - y_i) - + \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta) (1 - y_i/\mu_i) \\ - \frac{\partial J}{\partial \beta_j} &= \frac{1}{n}\sum_{i; z_i \leq \eta} (\mu_i - y_i) x_{ij} - + \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta) (1 - y_i/\mu_i) x_{ij} + \frac{\partial J}{\partial \beta_0} &= \frac{1}{n}\sum_{i; z_i \leq \eta} w_i (\mu_i - y_i) + + \frac{1}{n}\sum_{i; z_i > \eta} w_i \exp(\eta) (1 - y_i/\mu_i) \\ + \frac{\partial J}{\partial \beta_j} &= \frac{1}{n}\sum_{i; z_i \leq \eta} w_i (\mu_i - y_i) x_{ij} + + \frac{1}{n}\sum_{i; z_i > \eta} w_i \exp(\eta) (1 - y_i/\mu_i) x_{ij} **Hessian** @@ -105,10 +107,10 @@ Poisson (linearized): `poisson` \exp(\eta)z_i + (1-\eta)\exp(\eta), & z_i > \eta \end{cases} \\ - \frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_{i; z_i \leq \eta} \mu_i - + \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta)^2 \frac{y_i}{\mu_i^2} \\ - \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_{i; z_i \leq \eta} \mu_i x_{ij}^2 - + \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta)^2 \frac{y_i}{\mu_i^2} x_{ij}^2 + \frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_{i; z_i \leq \eta} w_i \mu_i + + \frac{1}{n}\sum_{i; z_i > \eta} w_i \exp(\eta)^2 \frac{y_i}{\mu_i^2} \\ + \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_{i; z_i \leq \eta} w_i \mu_i x_{ij}^2 + + \frac{1}{n}\sum_{i; z_i > \eta} w_i \exp(\eta)^2 \frac{y_i}{\mu_i^2} x_{ij}^2 + \lambda (1 - \alpha) Gaussian: `gaussian` @@ -125,13 +127,13 @@ Gaussian: `gaussian` .. math:: - \mathcal{L} = -\frac{1}{2} \sum_i (y_i - \mu_i)^2 \\ + \mathcal{L} = -\frac{1}{2} \sum_i w_i (y_i - \mu_i)^2 \\ **L2-penalized loss function** .. math:: - J = \frac{1}{2n}\sum_i (y_i - (\beta_0 + \sum_j \beta_j x_{ij}))^2 + + J = \frac{1}{2n}\sum_i w_i (y_i - \mu_i)^2 + \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\\ **Gradient** @@ -139,8 +141,8 @@ Gaussian: `gaussian` .. math:: \mu(z_i) &= z_i \\ - \frac{\partial J}{\partial \beta_0} &= -\frac{1}{n}\sum_i (y_i - \mu_i) \\ - \frac{\partial J}{\partial \beta_j} &= -\frac{1}{n}\sum_i (y_i - \mu_i) x_{ij} + \frac{\partial J}{\partial \beta_0} &= -\frac{1}{n}\sum_i w_i (y_i - \mu_i) \\ + \frac{\partial J}{\partial \beta_j} &= -\frac{1}{n}\sum_i w_i (y_i - \mu_i) x_{ij} + \lambda (1 - \alpha) \beta_j **Hessian** @@ -148,7 +150,7 @@ Gaussian: `gaussian` .. math:: \frac{\partial^2 J}{\partial \beta_0^2} &= 1 \\ - \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i x_{ij}^2 + \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i w_i x_{ij}^2 + \lambda (1 - \alpha) Logistic: `binomial` @@ -165,13 +167,13 @@ Logistic: `binomial` .. math:: - \mathcal{L} = \sum_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} \\ + \mathcal{L} = \sum_i w_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} \\ **L2-penalized loss function** .. math:: - J = -\frac{1}{n}\sum_i \left\{ y_i \log(\mu_i) + + J = -\frac{1}{n}\sum_i w_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} + \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\\ @@ -181,16 +183,16 @@ Logistic: `binomial` .. 
math:: \mu(z_i) &= \frac{1}{1 + \exp(-z_i)} \\ - \frac{\partial J}{\partial \beta_0} &= -\frac{1}{n}\sum_i (y_i - \mu_i) \\ - \frac{\partial J}{\partial \beta_j} &= -\frac{1}{n}\sum_i (y_i - \mu_i) x_{ij} + \frac{\partial J}{\partial \beta_0} &= -\frac{1}{n}\sum_i w_i (y_i - \mu_i) \\ + \frac{\partial J}{\partial \beta_j} &= -\frac{1}{n}\sum_i w_i (y_i - \mu_i) x_{ij} + \lambda (1 - \alpha) \beta_j **Hessian** .. math:: - \frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_i \mu_i (1 - \mu_i) \\ - \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i \mu_i (1 - \mu_i) x_{ij}^2 + \frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_i w_i \mu_i (1 - \mu_i) \\ + \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i w_i \mu_i (1 - \mu_i) x_{ij}^2 + \lambda (1 - \alpha) Logistic: `probit` @@ -209,13 +211,13 @@ where :math:`\Phi(z_i)` is the standard normal cumulative distribution function. .. math:: - \mathcal{L} = \sum_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} \\ + \mathcal{L} = \sum_i w_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} \\ **L2-penalized loss function** .. math:: - J = J = -\frac{1}{n}\sum_i \left\{ y_i \log(\mu_i) + + J = J = -\frac{1}{n}\sum_i w_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} + \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\\ @@ -233,9 +235,9 @@ where :math:`\Phi(z_i)` and :math:`\phi(z_i)` are the standard normal cdf and pd .. math:: \frac{\partial J}{\partial \beta_0} &= - -\frac{1}{n}\sum_i \Bigg\{y_i \frac{\mu'(z_i)}{\mu(z_i)} - (1 - y_i)\frac{\mu'(z_i)}{1 - \mu(z_i)}\Bigg\} \\ + -\frac{1}{n}\sum_i w_i \Bigg\{y_i \frac{\mu'(z_i)}{\mu(z_i)} - (1 - y_i)\frac{\mu'(z_i)}{1 - \mu(z_i)}\Bigg\} \\ \frac{\partial J}{\partial \beta_j} &= - -\frac{1}{n}\sum_i \Bigg\{y_i \frac{\mu'(z_i)}{\mu(z_i)} - (1 - y_i)\frac{\mu'(z_i)}{1 - \mu(z_i)}\Bigg\} x_{ij} + -\frac{1}{n}\sum_i w_i \Bigg\{y_i \frac{\mu'(z_i)}{\mu(z_i)} - (1 - y_i)\frac{\mu'(z_i)}{1 - \mu(z_i)}\Bigg\} x_{ij} + \lambda (1 - \alpha) \beta_j @@ -244,10 +246,10 @@ where :math:`\Phi(z_i)` and :math:`\phi(z_i)` are the standard normal cdf and pd .. math:: \frac{\partial^2 J}{\partial \beta_0^2} &= - \frac{1}{n}\sum_i \mu'(z_i) \Bigg\{y_i \frac{z_i\mu(z_i) + \mu'(z_i)}{\mu^2(z_i)} + + \frac{1}{n}\sum_i w_i \mu'(z_i) \Bigg\{y_i \frac{z_i\mu(z_i) + \mu'(z_i)}{\mu^2(z_i)} + (1 - y_i)\frac{-z_i(1 - \mu(z_i)) + \mu'(z_i)}{(1 - \mu(z_i))^2} \Bigg\} \\ \frac{\partial^2 J}{\partial \beta_j^2} &= - \frac{1}{n}\sum_i \mu'(z_i) \Bigg\{y_i \frac{z_i\mu(z_i) + \mu'(z_i)}{\mu^2(z_i)} + + \frac{1}{n}\sum_i w_i \mu'(z_i) \Bigg\{y_i \frac{z_i\mu(z_i) + \mu'(z_i)}{\mu^2(z_i)} + (1 - y_i)\frac{-z_i(1 - \mu(z_i)) + \mu'(z_i)}{(1 - \mu(z_i))^2} \Bigg\} x_{ij}^2 + \lambda (1 - \alpha) @@ -265,7 +267,7 @@ Gamma .. math:: - \mathcal{L} = \sum_{i} \nu\Bigg\{\frac{-y_i}{\mu_i} - log(\mu_i)\Bigg\} + \mathcal{L} = \sum_{i} \nu w_i \Bigg\{\frac{-y_i}{\mu_i} - log(\mu_i)\Bigg\} where :math:`\nu` is the shape parameter. It is exponential for :math:`\nu = 1` and normal for :math:`\nu = \infty`. @@ -274,16 +276,16 @@ and normal for :math:`\nu = \infty`. .. math:: - J = -\frac{1}{n}\sum_{i} \nu\Bigg\{\frac{-y_i}{\mu_i} - log(\mu_i)\Bigg\} + J = -\frac{1}{n}\sum_{i} \nu w_i \Bigg\{\frac{-y_i}{\mu_i} - log(\mu_i)\Bigg\} + \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\\ **Gradient** .. 
math:: - \frac{\partial J}{\partial \beta_0} &= \frac{1}{n} \sum_{i} \nu\Bigg\{\frac{y_i}{\mu_i^2} + \frac{\partial J}{\partial \beta_0} &= \frac{1}{n} \sum_{i} w_i \nu\Bigg\{\frac{y_i}{\mu_i^2} - \frac{1}{\mu_i}\Bigg\}{\mu_i'} \\ - \frac{\partial J}{\partial \beta_j} &= \frac{1}{n} \sum_{i} \nu\Bigg\{\frac{y_i}{\mu_i^2} + \frac{\partial J}{\partial \beta_j} &= \frac{1}{n} \sum_{i} w_i \nu\Bigg\{\frac{y_i}{\mu_i^2} - \frac{1}{\mu_i}\Bigg\}{\mu_i'}x_{ij} + \lambda (1 - \alpha) \beta_j where :math:`\mu_i' = \frac{1}{1 + \exp(-z_i)}`. diff --git a/pyglmnet/datasets.py b/pyglmnet/datasets.py index 073c00ea..51f8f44c 100644 --- a/pyglmnet/datasets.py +++ b/pyglmnet/datasets.py @@ -7,7 +7,7 @@ import tempfile import itertools import numpy as np -from scipy.misc import comb +from scipy.special import comb from .externals.six.moves import urllib diff --git a/pyglmnet/metrics.py b/pyglmnet/metrics.py index ee22fb6f..426d2b4a 100644 --- a/pyglmnet/metrics.py +++ b/pyglmnet/metrics.py @@ -4,7 +4,7 @@ from .pyglmnet import _logL -def deviance(y, yhat, distr): +def deviance(y, yhat, sample_weight, distr): """Deviance metrics. Parameters @@ -15,6 +15,9 @@ def deviance(y, yhat, distr): yhat : array Predicted labels of shape (n_samples, ) + sample_weight : array + Sample weights of shape (n_samples, ) + distr: str distribution @@ -24,16 +27,16 @@ def deviance(y, yhat, distr): Deviance of the predicted labels. """ if distr in ['softplus', 'poisson']: - LS = _logL(distr, y, y) + LS = _logL(distr, y, y, w=sample_weight) else: LS = 0 - L1 = _logL(distr, y, yhat) + L1 = _logL(distr, y, yhat, w=sample_weight) score = -2 * (L1 - LS) return score -def pseudo_R2(X, y, yhat, ynull_, distr): +def pseudo_R2(X, y, yhat, ynull_, sample_weight, distr): """Pseudo-R2 metric. Parameters @@ -47,6 +50,9 @@ def pseudo_R2(X, y, yhat, ynull_, distr): ynull_ : float Mean of the target labels (null model prediction) + sample_weight : array + Sample weights of shape (n_samples, ) + distr: str distribution @@ -56,12 +62,12 @@ def pseudo_R2(X, y, yhat, ynull_, distr): Pseudo-R2 score. """ if distr in ['softplus', 'poisson']: - LS = _logL(distr, y, y) + LS = _logL(distr, y, y, w=sample_weight) else: LS = 0 - L0 = _logL(distr, y, ynull_) - L1 = _logL(distr, y, yhat) + L0 = _logL(distr, y, ynull_, w=sample_weight) + L1 = _logL(distr, y, yhat, w=sample_weight) if distr in ['softplus', 'poisson']: score = (1 - (LS - L1) / (LS - L0)) @@ -70,7 +76,7 @@ def pseudo_R2(X, y, yhat, ynull_, distr): return score -def accuracy(y, yhat): +def accuracy(y, yhat, sample_weight): """Accuracy as ratio of correct predictions. Parameters @@ -81,9 +87,12 @@ def accuracy(y, yhat): yhat : array Predicted labels of shape (n_samples, ) + sample_weight : array + Sample weights of shape (n_samples, ) + Returns ------- accuracy : float Accuracy score. 
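The weighted metrics above all reduce to simple weighted sums. A minimal standalone sketch (hypothetical arrays, not part of the patch) of the weighted accuracy returned just below::

    import numpy as np

    y = np.array([1, 0, 1, 1])
    yhat = np.array([1, 0, 0, 1])
    sample_weight = np.array([1.0, 1.0, 2.0, 1.0])

    # weighted fraction of correct predictions: 3 / 5 = 0.6
    # (the single misclassified sample carries twice the weight)
    acc = float(np.dot(sample_weight, (y == yhat))) / np.sum(sample_weight)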
""" - return float(np.sum(y == yhat)) / yhat.shape[0] + return float(np.dot(sample_weight, (y == yhat))) / np.sum(sample_weight) diff --git a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index b50178ce..ecfac5c9 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -111,33 +111,35 @@ def _grad_mu(distr, z, eta): return grad_mu -def _logL(distr, y, y_hat, z=None): +def _logL(distr, y, y_hat, w, z=None): """The log likelihood.""" if distr in ['softplus', 'poisson']: eps = np.spacing(1) - logL = np.sum(y * np.log(y_hat + eps) - y_hat) + logL = np.sum(w * (y * np.log(y_hat + eps) - y_hat)) elif distr == 'gaussian': - logL = -0.5 * np.sum((y - y_hat)**2) + logL = -0.5 * np.sum(w * ((y - y_hat) ** 2)) elif distr == 'binomial': # prevents underflow if z is not None: - logL = np.sum(y * z - np.log(1 + np.exp(z))) + logL = np.sum(w * (y * z - np.log(1 + np.exp(z)))) # for scoring else: - logL = np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) + logL = np.sum(w * (y * np.log(y_hat) + + (1 - y) * np.log(1 - y_hat))) elif distr == 'probit': if z is not None: pdfz, cdfz = norm.pdf(z), norm.cdf(z) - logL = np.sum(y * _probit_g1(z, pdfz, cdfz) + - (1 - y) * _probit_g2(z, pdfz, cdfz)) + logL = np.sum(w * (y * _probit_g1(z, pdfz, cdfz) + + (1 - y) * _probit_g2(z, pdfz, cdfz))) else: - logL = np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) + logL = np.sum(w * (y * np.log(y_hat) + + (1 - y) * np.log(1 - y_hat))) elif distr == 'gamma': # see # https://www.statistics.ma.tum.de/fileadmin/w00bdb/www/czado/lec8.pdf nu = 1. # shape parameter, exponential for now - logL = np.sum(nu * (-y / y_hat - np.log(y_hat))) + logL = np.sum(w * (nu * (-y / y_hat - np.log(y_hat)))) return logL @@ -183,29 +185,29 @@ def _L1penalty(beta, group=None): return L1penalty -def _loss(distr, alpha, Tau, reg_lambda, X, y, eta, group, beta): +def _loss(distr, alpha, Tau, reg_lambda, X, y, w, eta, group, beta): """Define the objective function for elastic net.""" n_samples = X.shape[0] z = beta[0] + np.dot(X, beta[1:]) y_hat = _mu(distr, z, eta) - L = 1. / n_samples * _logL(distr, y, y_hat, z) + L = 1. / n_samples * _logL(distr, y, y_hat, w=w, z=z) P = _penalty(alpha, beta[1:], Tau, group) J = -L + reg_lambda * P return J -def _L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, group, beta): +def _L2loss(distr, alpha, Tau, reg_lambda, X, y, w, eta, group, beta): """Define the objective function for elastic net.""" n_samples = X.shape[0] z = beta[0] + np.dot(X, beta[1:]) y_hat = _mu(distr, z, eta) - L = 1. / n_samples * _logL(distr, y, y_hat, z) + L = 1. 
/ n_samples * _logL(distr, y, y_hat, w=w, z=z) P = 0.5 * (1 - alpha) * _L2penalty(beta[1:], Tau) J = -L + reg_lambda * P return J -def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, beta): +def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, w, eta, beta): """The gradient.""" n_samples = np.float(X.shape[0]) @@ -218,29 +220,29 @@ def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, beta): grad_mu = _grad_mu(distr, z, eta) if distr in ['poisson', 'softplus']: - grad_beta0 = np.sum(grad_mu) - np.sum(y * grad_mu / mu) - grad_beta = ((np.dot(grad_mu.T, X) - - np.dot((y * grad_mu / mu).T, X)).T) + grad_beta0 = np.sum(w * grad_mu) - np.sum(w * y * grad_mu / mu) + grad_beta = ((np.dot(w * grad_mu.T, X) - + np.dot((w * y * grad_mu / mu).T, X)).T) elif distr == 'gaussian': - grad_beta0 = np.sum((mu - y) * grad_mu) - grad_beta = np.dot((mu - y).T, X * grad_mu[:, None]).T + grad_beta0 = np.sum(w * (mu - y) * grad_mu) + grad_beta = np.dot((w * (mu - y)).T, X * grad_mu[:, None]).T elif distr == 'binomial': - grad_beta0 = np.sum(mu - y) - grad_beta = np.dot((mu - y).T, X).T + grad_beta0 = np.sum(w * (mu - y)) + grad_beta = np.dot((w * (mu - y)).T, X).T elif distr == 'probit': grad_logl = (y * _probit_g3(z, grad_mu, mu) - (1 - y) * _probit_g4(z, grad_mu, mu)) - grad_beta0 = -np.sum(grad_logl) - grad_beta = -np.dot(grad_logl.T, X).T + grad_beta0 = -np.sum(w * grad_logl) + grad_beta = -np.dot((w * grad_logl).T, X).T elif distr == 'gamma': nu = 1. grad_logl = (y / mu ** 2 - 1 / mu) * grad_mu - grad_beta0 = -nu * np.sum(grad_logl) - grad_beta = -nu * np.dot(grad_logl.T, X).T + grad_beta0 = -nu * np.sum(w * grad_logl) + grad_beta = -nu * np.dot((w * grad_logl).T, X).T grad_beta0 *= 1. / n_samples grad_beta *= 1. / n_samples @@ -252,7 +254,7 @@ def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, beta): return g -def _gradhess_logloss_1d(distr, xk, y, z, eta): +def _gradhess_logloss_1d(distr, xk, y, w, z, eta): """ Compute gradient (1st derivative) and Hessian (2nd derivative) @@ -279,41 +281,42 @@ def _gradhess_logloss_1d(distr, xk, y, z, eta): if distr == 'softplus': mu = _mu(distr, z, eta) s = expit(z) - gk = np.sum(s * xk) - np.sum(y * s / mu * xk) + gk = np.sum(w * s * xk) - np.sum(w * y * s / mu * xk) grad_s = s * (1 - s) grad_s_by_mu = grad_s / mu - s / (mu ** 2) - hk = np.sum(grad_s * xk ** 2) - np.sum(y * grad_s_by_mu * xk ** 2) + hk = np.sum(w * grad_s * xk ** 2) - \ + np.sum(w * y * grad_s_by_mu * xk ** 2) elif distr == 'poisson': mu = _mu(distr, z, eta) s = expit(z) - gk = np.sum((mu[z <= eta] - y[z <= eta]) * + gk = np.sum(w[z <= eta] * (mu[z <= eta] - y[z <= eta]) * xk[z <= eta]) + \ np.exp(eta) * \ - np.sum((1 - y[z > eta] / mu[z > eta]) * + np.sum(w[z > eta] * (1 - y[z > eta] / mu[z > eta]) * xk[z > eta]) - hk = np.sum(mu[z <= eta] * xk[z <= eta] ** 2) + \ + hk = np.sum(w[z <= eta] * mu[z <= eta] * xk[z <= eta] ** 2) + \ np.exp(eta) ** 2 * \ - np.sum(y[z > eta] / (mu[z > eta] ** 2) * + np.sum(w[z > eta] * y[z > eta] / (mu[z > eta] ** 2) * (xk[z > eta] ** 2)) elif distr == 'gaussian': - gk = np.sum((z - y) * xk) - hk = np.sum(xk * xk) + gk = np.sum(w * (z - y) * xk) + hk = np.sum(w * xk * xk) elif distr == 'binomial': mu = _mu(distr, z, eta) - gk = np.sum((mu - y) * xk) - hk = np.sum(mu * (1.0 - mu) * xk * xk) + gk = np.sum(w * (mu - y) * xk) + hk = np.sum(w * mu * (1.0 - mu) * xk * xk) elif distr == 'probit': pdfz = norm.pdf(z) cdfz = norm.cdf(z) - gk = -np.sum((y * _probit_g3(z, pdfz, cdfz) - - (1 - y) * _probit_g4(z, pdfz, cdfz)) * xk) - hk = np.sum((y * _probit_g5(z, 
pdfz, cdfz) + - (1 - y) * _probit_g6(z, pdfz, cdfz)) * (xk * xk)) + gk = -np.sum(w * (y * _probit_g3(z, pdfz, cdfz) - + (1 - y) * _probit_g4(z, pdfz, cdfz)) * xk) + hk = np.sum(w * (y * _probit_g5(z, pdfz, cdfz) + + (1 - y) * _probit_g6(z, pdfz, cdfz)) * (xk * xk)) elif distr == 'gamma': raise NotImplementedError('cdfast is not implemented for Gamma ' @@ -478,6 +481,8 @@ def __init__(self, distr='poisson', alpha=0.5, self.solver = solver self.learning_rate = learning_rate self.max_iter = max_iter + self.n_iter = 0 + self.converged = False self.beta0_ = None self.beta_ = None self.ynull_ = None @@ -595,7 +600,7 @@ def _prox(self, beta, thresh): return result - def _cdfast(self, X, y, z, ActiveSet, beta, rl): + def _cdfast(self, X, y, w, ActiveSet, beta, rl): """ Perform one cycle of Newton updates for all coordinates. @@ -607,9 +612,10 @@ def _cdfast(self, X, y, z, ActiveSet, beta, rl): y : array Labels to the data n_samples x 1 - z: array + w : array + Weights to the data n_samples x 1 - beta[0] + X * beta[1:] + ActiveSet: array n_features + 1 x 1 Active set storing which betas are non-zero @@ -624,9 +630,6 @@ def _cdfast(self, X, y, z, ActiveSet, beta, rl): beta: array (n_features + 1) x 1 Updated parameters - z: array - beta[0] + X * beta[1:] - (n_features + 1) x 1 """ n_samples = X.shape[0] n_features = X.shape[1] @@ -641,7 +644,9 @@ def _cdfast(self, X, y, z, ActiveSet, beta, rl): xk = np.ones((n_samples, )) # Calculate grad and hess of log likelihood term - gk, hk = _gradhess_logloss_1d(self.distr, xk, y, z, self.eta) + z = beta[0] + np.dot(X, beta[1:]) + gk, hk = \ + _gradhess_logloss_1d(self.distr, xk, y, w, z, self.eta) # Add grad and hess of regularization term if self.Tau is None: @@ -656,10 +661,10 @@ def _cdfast(self, X, y, z, ActiveSet, beta, rl): # Update parameters, z update = 1. / hk * gk - beta[k], z = beta[k] - update, z - update * xk - return beta, z + beta[k] = beta[k] - update + return beta - def fit(self, X, y): + def fit(self, X, y, sample_weight=None): """The fit function. 
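As context for the weighted coordinate-descent changes above, a minimal sketch (hypothetical data, Gaussian branch only) of the per-coordinate gradient, Hessian, and Newton step that `_gradhess_logloss_1d` and `_cdfast` now compute with sample weights::

    import numpy as np

    rng = np.random.RandomState(0)
    n_samples = 50
    xk = rng.randn(n_samples)       # one column (coordinate k) of X
    y = rng.randn(n_samples)
    w = np.ones(n_samples)          # sample weights (already mean-normalized)
    z = np.zeros(n_samples)         # current linear predictor beta0 + X beta

    # weighted 1-D gradient and Hessian, mirroring the Gaussian branch above
    gk = np.sum(w * (z - y) * xk)
    hk = np.sum(w * xk * xk)

    # one Newton step for coordinate k, as applied in _cdfast:
    # beta[k] = beta[k] - update
    update = 1. / hk * gk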
Parameters @@ -694,6 +699,13 @@ def fit(self, X, y): n_features = X.shape[1] + # normalize weights so that X.shape[0] = sum(sample_weight) + # as assumed by weighted sum calculations in loss functions + if sample_weight is None: + sample_weight = np.ones_like(y) + else: + sample_weight /= np.mean(sample_weight) + # Initialize parameters beta = np.zeros((n_features + 1,)) if self.beta0_ is None and self.beta_ is None: @@ -713,31 +725,43 @@ def fit(self, X, y): if self.solver == 'cdfast': ActiveSet = np.ones(n_features + 1) # init active set - z = beta[0] + np.dot(X, beta[1:]) # cache z # Iterative updates for t in range(0, self.max_iter): + logger.info("t: %i" % t) + convergence_metric = float('inf') if self.solver == 'batch-gradient': grad = _grad_L2loss(self.distr, alpha, self.Tau, - reg_lambda, X, y, self.eta, + reg_lambda, X, y, sample_weight, self.eta, beta) # Converged if the norm(gradient) < tol - if (t > 1) and (np.linalg.norm(grad) < tol): - msg = ('\tConverged in {0:d} iterations'.format(t)) - logger.info(msg) - break + convergence_metric = np.linalg.norm(grad) + logger.debug("convergence_metric: %f" % convergence_metric) + if (t > 1) and (convergence_metric < tol): + self.converged = True + self.n_iter += t + msg = ('\tConverged in {0:d} iterations' + .format(self.n_iter)) + logger.info(msg) + break beta = beta - self.learning_rate * grad elif self.solver == 'cdfast': beta_old = deepcopy(beta) - beta, z = \ - self._cdfast(X, y, z, ActiveSet, beta, reg_lambda) + + beta = self._cdfast(X, y, sample_weight, + ActiveSet, beta, reg_lambda) # Converged if the norm(update) < tol - if (t > 1) and (np.linalg.norm(beta - beta_old) < tol): - msg = ('\tConverged in {0:d} iterations'.format(t)) - logger.info(msg) - break + convergence_metric = np.linalg.norm(beta - beta_old) + logger.debug("convergence_metric: %f" % convergence_metric) + if (t > 1) and (convergence_metric < tol): + self.converged = True + self.n_iter += t + msg = ('\tConverged in {0:d} iterations' + .format(self.n_iter)) + logger.info(msg) + break # Apply proximal operator beta[1:] = self._prox(beta[1:], reg_lambda * alpha) @@ -749,11 +773,21 @@ def fit(self, X, y): # Compute and save loss if callbacks are requested if callable(self.callback): self.callback(beta) + else: + # Warn if it hit max_iter without converging + self.converged = False + self.n_iter += t + 1 + msg = ('\t' + 'Failed to converge after {0:d} iterations.' + ' Last convergence metric was {1:f}' + ' and convergence threshold {2:f}.' + ).format(self.n_iter, convergence_metric, tol) + logger.warning(msg) # Update the estimated variables self.beta0_ = beta[0] self.beta_ = beta[1:] - self.ynull_ = np.mean(y) + self.ynull_ = np.sum(sample_weight * y) / np.sum(sample_weight) return self def predict(self, X): @@ -813,7 +847,7 @@ def predict_proba(self, X): yhat = np.asarray(yhat) return yhat - def fit_predict(self, X, y): + def fit_predict(self, X, y, sample_weight=None): """Fit the model and predict on the same data. Parameters @@ -828,9 +862,9 @@ def fit_predict(self, X, y): yhat : array The predicted targets of shape (n_samples,). """ - return self.fit(X, y).predict(X) + return self.fit(X, y, sample_weight).predict(X) - def score(self, X, y): + def score(self, X, y, sample_weight=None): """Score the model. 
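A small numerical illustration (hypothetical values) of the weight handling used when fitting and scoring: weights are rescaled so their mean is 1, which keeps `sum(sample_weight)` equal to `n_samples` as the `1/n` factors in the losses assume, and the null prediction `ynull_` becomes a weighted mean::

    import numpy as np

    y = np.array([2.0, 0.0, 4.0])
    sample_weight = np.array([2.0, 1.0, 3.0])

    # rescale so mean(sample_weight) == 1, i.e. sum(sample_weight) == len(y)
    sample_weight = sample_weight / np.mean(sample_weight)

    # weighted null model prediction, stored as ynull_
    ynull_ = np.sum(sample_weight * y) / np.sum(sample_weight)
    # ynull_ is about 2.67, the weighted mean of y (normalization cancels)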
Parameters @@ -865,6 +899,10 @@ def score(self, X, y): 'or multinomial distributions') y = y.ravel() + if sample_weight is None: + sample_weight = np.ones_like(y) + else: + sample_weight /= np.mean(sample_weight) if self.distr == 'binomial' and self.score_metric != 'accuracy': yhat = self.predict_proba(X) @@ -873,11 +911,12 @@ def score(self, X, y): # Check whether we have a list of estimators or a single estimator if self.score_metric == 'deviance': - return metrics.deviance(y, yhat, self.distr) + return metrics.deviance(y, yhat, sample_weight, self.distr) elif self.score_metric == 'pseudo_R2': - return metrics.pseudo_R2(X, y, yhat, self.ynull_, self.distr) + return metrics.pseudo_R2(X, y, yhat, self.ynull_, + sample_weight, self.distr) if self.score_metric == 'accuracy': - return metrics.accuracy(y, yhat) + return metrics.accuracy(y, yhat, sample_weight) class GLMCV(object): @@ -1052,7 +1091,7 @@ def copy(self): """ return deepcopy(self) - def fit(self, X, y): + def fit(self, X, y, sample_weight=None): """The fit function. Parameters ---------- @@ -1069,7 +1108,12 @@ def fit(self, X, y): """ logger.info('Looping through the regularization path') glms, scores = list(), list() - self.ynull_ = np.mean(y) + if sample_weight is None: + sample_weight = np.ones_like(y) + else: + sample_weight /= np.mean(sample_weight) + + self.ynull_ = np.sum(sample_weight * y) / np.sum(sample_weight) if not type(int): raise ValueError('cv must be int. We do not support scikit-learn ' @@ -1104,15 +1148,16 @@ def fit(self, X, y): else: glm.beta0_, glm.beta_ = glms[-1].beta0_, glms[-1].beta_ - glm.fit(X[train], y[train]) - scores_fold.append(glm.score(X[val], y[val])) + glm.fit(X[train], y[train], sample_weight[train]) + scores_fold.append( + glm.score(X[val], y[val], sample_weight[val])) scores.append(np.mean(scores_fold)) if idx == 0: glm.beta0_, glm.beta_ = self.beta0_, self.beta_ else: glm.beta0_, glm.beta_ = glms[-1].beta0_, glms[-1].beta_ - glm.fit(X, y) + glm.fit(X, y, sample_weight) glms.append(glm) # Update the estimated variables if self.score_metric == 'deviance': @@ -1162,7 +1207,7 @@ def predict_proba(self, X): """ return self.glm_.predict_proba(X) - def fit_predict(self, X, y): + def fit_predict(self, X, y, sample_weight=None): """Fit the model and predict on the same data. Parameters @@ -1177,10 +1222,10 @@ def fit_predict(self, X, y): The predicted targets of shape based on the model with optimal reg_lambda (n_samples,) """ - self.fit(X, y) + self.fit(X, y, sample_weight) return self.glm_.predict(X) - def score(self, X, y): + def score(self, X, y, sample_weight=None): """Score the model. Parameters @@ -1197,4 +1242,4 @@ def score(self, X, y): score: float The score metric for the optimal reg_lambda """ - return self.glm_.score(X, y) + return self.glm_.score(X, y, sample_weight) diff --git a/setup.cfg b/setup.cfg index b5903cc1..40e9a84a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,7 +7,11 @@ ignore-files = (benchmarks.py|datasets.py) [flake8] exclude = __init__.py,six.py,funcsigs.py -ignore = E241 +ignore = + # W504: Line break occurred after a binary operator + W504, + # W605: invalid escape sequence 'x'. 
Conflicts with Sphinx math in docs + W605 [metadata] description-file = "README.md" diff --git a/setup.py b/setup.py index aa136326..2b3ff498 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,5 @@ 'Operating System :: MacOS', ], platforms='any', - packages=[ - 'pyglmnet' - ], + packages=setuptools.find_packages(), ) diff --git a/tests/test_pyglmnet.py b/tests/test_pyglmnet.py index 20d5202c..6f08444b 100644 --- a/tests/test_pyglmnet.py +++ b/tests/test_pyglmnet.py @@ -1,4 +1,5 @@ from functools import partial +import itertools import numpy as np from numpy.testing import assert_allclose, assert_array_equal @@ -15,6 +16,29 @@ _gradhess_logloss_1d, _loss, datasets) +# TODO move this into a better place in line after branch testing is done +def test_sample_weight_cv(): + """Simple sample weight check.""" + # XXX: don't use scikit-learn for tests. + n_samples = 100 + X, y = make_regression(n_samples=n_samples) + w = np.random.lognormal(0.0, 1.0, n_samples) + w /= w.sum() + cv = KFold(n_splits=5) + + glm_normal = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1) + # check that cv and rest of sklearn interface works + cv_scores = cross_val_score(glm_normal, X, y, + fit_params={'sample_weight': w}, cv=cv) + assert(len(cv_scores) == 5) + + param_grid = [{'alpha': np.linspace(0.01, 0.99, 2)}, + {'reg_lambda': np.logspace(np.log(0.5), np.log(0.01), + 10, base=np.exp(1))}] + glmcv = GridSearchCV(glm_normal, param_grid, cv=cv) + glmcv.fit(X, y) + + def test_gradients(): """Test gradient accuracy.""" # data @@ -22,6 +46,7 @@ def test_gradients(): n_samples, n_features = 1000, 100 X = np.random.normal(0.0, 1.0, [n_samples, n_features]) X = scaler.fit_transform(X) + w = np.ones(n_samples) density = 0.1 beta_ = np.zeros(n_features + 1) @@ -35,9 +60,9 @@ def test_gradients(): y = simulate_glm(glm.distr, beta_[0], beta_[1:], X) func = partial(_L2loss, distr, glm.alpha, - glm.Tau, reg_lambda, X, y, glm.eta, glm.group) + glm.Tau, reg_lambda, X, y, w, glm.eta, glm.group) grad = partial(_grad_L2loss, distr, glm.alpha, glm.Tau, - reg_lambda, X, y, + reg_lambda, X, y, w, glm.eta) approx_grad = approx_fprime(beta_, func, 1.5e-8) analytical_grad = grad(beta_) @@ -164,61 +189,94 @@ def test_glmnet(): distrs = ['softplus', 'gaussian', 'poisson', 'binomial', 'probit'] solvers = ['batch-gradient', 'cdfast'] + reg_lambdas = [0.0, 0.1] score_metric = 'pseudo_R2' learning_rate = 2e-1 random_state = 0 for distr in distrs: - betas_ = list() - for solver in solvers: - - np.random.seed(random_state) - - X_train = np.random.normal(0.0, 1.0, [n_samples, n_features]) - y_train = simulate_glm(distr, beta0, beta, X_train, - sample=False) - - alpha = 0. - reg_lambda = 0. - loss_trace = list() - - def callback(beta): - Tau = None - eta = 2.0 - group = None - - loss_trace.append( - _loss(distr, alpha, Tau, reg_lambda, - X_train, y_train, eta, group, beta)) - - glm = GLM(distr, learning_rate=learning_rate, - reg_lambda=reg_lambda, tol=1e-3, max_iter=5000, - alpha=alpha, solver=solver, score_metric=score_metric, - random_state=random_state, callback=callback) - assert(repr(glm)) - - glm.fit(X_train, y_train) - - # verify loss decreases - assert(np.all(np.diff(loss_trace) <= 1e-7)) - - # verify loss at convergence = loss when beta=beta_ - l_true = _loss(distr, 0., np.eye(beta.shape[0]), 0., - X_train, y_train, 2.0, None, - np.concatenate(([beta0], beta))) - assert_allclose(loss_trace[-1], l_true, rtol=1e-4, atol=1e-5) - # beta=beta_ when reg_lambda = 0. 
- assert_allclose(beta, glm.beta_, rtol=0.05, atol=1e-2) - betas_.append(glm.beta_) - - y_pred = glm.predict(X_train) - assert(y_pred.shape[0] == X_train.shape[0]) - - # compare all solvers pairwise to make sure they're close - for i, first_beta in enumerate(betas_[:-1]): - for second_beta in betas_[i + 1:]: - assert_allclose(first_beta, second_beta, rtol=0.05, atol=1e-2) + for reg_lambda in reg_lambdas: + betas_ = list() + for solver in solvers: + + np.random.seed(random_state) + + X_train = np.random.normal(0.0, 1.0, [n_samples, n_features]) + y_train = simulate_glm(distr, beta0, beta, X_train, + sample=False) + w_train = np.ones_like(y_train) + alpha = 0.5 + loss_trace = list() + + def callback(beta): + Tau = None + eta = 2.0 + group = None + + loss_trace.append( + _loss(distr, alpha, Tau, reg_lambda, + X_train, y_train, w_train, eta, group, beta)) + + glm = GLM(distr, learning_rate=learning_rate, + reg_lambda=reg_lambda, tol=1e-3, max_iter=5000, + alpha=alpha, solver=solver, + score_metric=score_metric, + random_state=random_state, callback=callback) + assert(repr(glm)) + + glm.fit(X_train, y_train) + + # verify that loss doesn't increase more than tolerance + # for too many consecutive iterations + loss_increased = (np.diff(loss_trace) > glm.tol) + loss_increase_runlengths = [sum(grp) for + val, grp in + itertools.groupby(loss_increased) + if val] + loss_increase_runlengths = np.array(loss_increase_runlengths) + assert np.all(loss_increase_runlengths <= 5), \ + ('Loss increased for {maxlen} consecutive iterations' + ' on distr={d} solver={s} with' + ' reg_lambda={rl}' + ).format(d=distr, s=solver, + rl=reg_lambda, + maxlen=np.max(loss_increase_runlengths)) + + if reg_lambda == 0.0: + # check that the true model can be recreated + # almost perfectly when no regularization is applied + # verify loss at convergence = loss when beta=beta_ + l_true = _loss(distr, alpha, np.eye(beta.shape[0]), + reg_lambda, X_train, y_train, w_train, + 2.0, None, np.concatenate(([beta0], beta))) + assert_allclose(loss_trace[-1], l_true, + rtol=1e-4, atol=1e-5, + err_msg=('Final loss trace value different' + ' from true loss ' + ' on distr={d} solver={s}' + ' with reg_lambda={rl}' + ).format(d=distr, s=solver, + rl=reg_lambda)) + # beta=beta_ when reg_lambda = 0. 
+                    assert_allclose(beta, glm.beta_, rtol=0.05, atol=1e-2,
+                                    err_msg=('Fitted beta too different'
+                                             ' from true beta'
+                                             ' in distr={} solver={}'
+                                             ).format(distr, solver))
+                betas_.append(glm.beta_)
+
+                y_pred = glm.predict(X_train)
+                assert y_pred.shape[0] == X_train.shape[0], \
+                    ('Fitted values have wrong number of rows'
+                     ' on distr={d} solver={s} with reg_lambda={rl}'
+                     ).format(d=distr, s=solver, rl=reg_lambda)
+
+            # compare all solvers pairwise to make sure they're close
+            for i, first_beta in enumerate(betas_[:-1]):
+                for second_beta in betas_[i + 1:]:
+                    assert_allclose(first_beta, second_beta,
+                                    rtol=0.05, atol=1e-2)
 
     # test fit_predict
     glm_poisson = GLM(distr='softplus')
@@ -317,7 +375,8 @@ def test_cdfast():
     z = beta_[0] + np.dot(X, beta_[1:])
     k = 1
     xk = X[:, k - 1]
-    gk, hk = _gradhess_logloss_1d(glm.distr, xk, y, z, glm.eta)
+    w = np.ones_like(y)
+    gk, hk = _gradhess_logloss_1d(glm.distr, xk, y, w, z, glm.eta)
 
     # test grad and hess
     if distr != 'multinomial':
@@ -335,10 +394,9 @@
 
     # test cdfast
     ActiveSet = np.ones(n_features + 1)
-    beta_ret, z_ret = glm._cdfast(X, y, z,
-                                  ActiveSet, beta_, glm.reg_lambda)
+    beta_ret = glm._cdfast(X, y, w,
+                           ActiveSet, beta_, glm.reg_lambda)
     assert(beta_ret.shape == beta_.shape)
-    assert(z_ret.shape == z.shape)
 
 
 def test_fetch_datasets():
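For reference, the consecutive-loss-increase check added to `test_glmnet` above can be exercised on its own; a minimal sketch with a hypothetical loss trace::

    import itertools
    import numpy as np

    # toy loss trace: two consecutive upticks above tolerance, then decreasing
    loss_trace = np.array([1.0, 0.8, 0.81, 0.82, 0.5, 0.4])
    tol = 1e-3

    loss_increased = np.diff(loss_trace) > tol
    runlengths = [sum(grp) for val, grp in itertools.groupby(loss_increased)
                  if val]
    # runlengths == [2]: the longest run of consecutive increases is 2,
    # well under the limit of 5 asserted in the test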