Refactor LinearBoostClassifier to remove sparse data support #12

Merged
244 changes: 237 additions & 7 deletions src/linearboost/linear_boost.py
@@ -15,6 +15,7 @@

import sys
import warnings
from abc import abstractmethod
from numbers import Integral, Real

if sys.version_info >= (3, 11):
@@ -40,7 +41,7 @@
from sklearn.utils.multiclass import check_classification_targets, type_of_target
from sklearn.utils.validation import check_is_fitted

from ._utils import SKLEARN_V1_6_OR_LATER, check_X_y
from ._utils import SKLEARN_V1_6_OR_LATER, check_X_y, validate_data
from .sefr import SEFR

__all__ = ["LinearBoostClassifier"]
@@ -63,7 +64,201 @@
}


class LinearBoostClassifier(AdaBoostClassifier):
class _DenseAdaBoostClassifier(AdaBoostClassifier):
if SKLEARN_V1_6_OR_LATER:

def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.input_tags.sparse = False
return tags

def _check_X(self, X):
# Only called to validate X in non-fit methods, therefore reset=False
return validate_data(
self,
X,
accept_sparse=False,
ensure_2d=True,
allow_nd=True,
dtype=None,
reset=False,
)

@abstractmethod
def _boost(self, iboost, X, y, sample_weight, random_state):
"""Implement a single boost.

Warning: This method needs to be overridden by subclasses.

Parameters
----------
iboost : int
The index of the current boost iteration.

X : array-like of shape (n_samples, n_features)
The training input samples.

y : array-like of shape (n_samples,)
The target values (class labels).

sample_weight : array-like of shape (n_samples,)
The current sample weights.

random_state : RandomState
The current random number generator.

Returns
-------
sample_weight : array-like of shape (n_samples,) or None
The reweighted sample weights.
If None then boosting has terminated early.

estimator_weight : float
The weight for the current boost.
If None then boosting has terminated early.

error : float
The classification error for the current boost.
If None then boosting has terminated early.
"""
pass

def staged_score(self, X, y, sample_weight=None):
"""Return staged scores for X, y.

This generator method yields the ensemble score after each iteration of
boosting and therefore allows monitoring, such as to determine the
score on a test set after each boost.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples.

y : array-like of shape (n_samples,)
Labels for X.

sample_weight : array-like of shape (n_samples,), default=None
Sample weights.

Yields
------
z : float
"""
return super().staged_score(X, y, sample_weight)

def staged_predict(self, X):
"""Return staged predictions for X.

The predicted class of an input sample is computed as the weighted mean
prediction of the classifiers in the ensemble.

This generator method yields the ensemble prediction after each
iteration of boosting and therefore allows monitoring, such as to
determine the prediction on a test set after each boost.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples.

Yields
------
y : generator of ndarray of shape (n_samples,)
The predicted classes.
"""
return super().staged_predict(X)

def staged_decision_function(self, X):
"""Compute decision function of ``X`` for each boosting iteration.

This method allows monitoring (i.e., determining the error on a test
set) after each boosting iteration.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples.

Yields
------
score : generator of ndarray of shape (n_samples, k)
The decision function of the input samples. The order of
outputs is the same as that of the :term:`classes_` attribute.
Binary classification is a special case with ``k == 1``,
otherwise ``k == n_classes``. For binary classification,
values closer to -1 or 1 mean more like the first or second
class in ``classes_``, respectively.
"""
return super().staged_decision_function(X)

def predict_proba(self, X):
"""Predict class probabilities for X.

The predicted class probabilities of an input sample are computed as
the weighted mean predicted class probabilities of the classifiers
in the ensemble.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples.

Returns
-------
p : ndarray of shape (n_samples, n_classes)
The class probabilities of the input samples. The order of
outputs is the same as that of the :term:`classes_` attribute.
"""
return super().predict_proba(X)

def staged_predict_proba(self, X):
"""Predict class probabilities for X.

The predicted class probabilities of an input sample are computed as
the weighted mean predicted class probabilities of the classifiers
in the ensemble.

This generator method yields the ensemble predicted class probabilities
after each iteration of boosting and therefore allows monitoring, such
as to determine the predicted class probabilities on a test set after
each boost.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples.

Yields
------
p : generator of ndarray of shape (n_samples, n_classes)
The class probabilities of the input samples. The order of
outputs is the same as that of the :term:`classes_` attribute.
"""
return super().staged_predict_proba(X)

def predict_log_proba(self, X):
"""Predict class log-probabilities for X.

The predicted class log-probabilities of an input sample are computed as
the weighted mean predicted class log-probabilities of the classifiers
in the ensemble.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples.

Returns
-------
p : ndarray of shape (n_samples, n_classes)
The class log-probabilities of the input samples. The order of
outputs is the same as that of the :term:`classes_` attribute.
"""
return super().predict_log_proba(X)


class LinearBoostClassifier(_DenseAdaBoostClassifier):
"""A LinearBoost classifier.

A LinearBoost classifier is a meta-estimator based on AdaBoost and SEFR.
@@ -221,7 +416,6 @@ def __init__(

def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.input_tags.sparse = False
tags.target_tags.required = True
tags.classifier_tags.multi_class = False
tags.classifier_tags.poor_score = True
@@ -268,6 +462,25 @@ def _check_X_y(self, X, y) -> tuple[np.ndarray, np.ndarray]:
return X, y

def fit(self, X, y, sample_weight=None) -> Self:
"""Build a LinearBoost classifier from the training set (X, y).

Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples.

y : array-like of shape (n_samples,)
The target values.

sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, the sample weights are initialized to
1 / n_samples.

Returns
-------
self : object
Fitted estimator.
"""
if self.algorithm not in {"SAMME", "SAMME.R"}:
raise ValueError("algorithm must be 'SAMME' or 'SAMME.R'")

@@ -322,7 +535,8 @@ def fit(self, X, y, sample_weight=None) -> Self:
)
return super().fit(X_transformed, y, sample_weight)

def _samme_proba(self, estimator, n_classes, X):
@staticmethod
def _samme_proba(estimator, n_classes, X):
"""Calculate algorithm 4, step 2, equation c) of Zhu et al [1].

References
@@ -401,6 +615,23 @@ def _boost(self, iboost, X, y, sample_weight, random_state):
return sample_weight, estimator_weight, estimator_error

def decision_function(self, X):
"""Compute the decision function of ``X``.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples.

Returns
-------
score : ndarray of shape (n_samples, k)
The decision function of the input samples. The order of
outputs is the same as that of the :term:`classes_` attribute.
Binary classification is a special case with ``k == 1``,
otherwise ``k == n_classes``. For binary classification,
values closer to -1 or 1 mean more like the first or second
class in ``classes_``, respectively.
"""
check_is_fitted(self)
X_transformed = self.scaler_.transform(X)

@@ -431,9 +662,8 @@ def predict(self, X):

Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The training input samples. Sparse matrix can be CSC, CSR, COO,
DOK, or LIL. COO, DOK, and LIL are converted to CSR.
X : array-like of shape (n_samples, n_features)
The input samples.

Returns
-------
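
Note on `_samme_proba` (its body is collapsed in this diff): it computes the per-estimator SAMME.R contribution from algorithm 4, step 2, equation c) of Zhu et al. [1], which for class-probability estimates $p_k(x)$ and $K$ classes is

$$
h_k(x) = (K - 1)\left(\log p_k(x) - \frac{1}{K}\sum_{k'=1}^{K}\log p_{k'}(x)\right), \qquad k = 1, \dots, K.
$$

For reviewers, here is a minimal sketch of the behavior this PR targets: dense input trains as before, while sparse input is rejected by validation (via `accept_sparse=False` and the disabled sparse input tag). The import path and the exact error type/message are assumptions; scikit-learn's dense-only check typically raises a `TypeError`, but this may vary across versions.

```python
# Sketch (not part of this PR): dense input fits normally; sparse input
# is now rejected by scikit-learn's input validation.
import numpy as np
from scipy.sparse import csr_matrix
from linearboost import LinearBoostClassifier  # import path assumed

X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]])
y = np.array([0, 0, 1, 1])

clf = LinearBoostClassifier(n_estimators=5).fit(X, y)  # dense: OK
print(clf.predict(X))

try:
    clf.fit(csr_matrix(X), y)  # sparse: expected to raise
except (TypeError, ValueError) as exc:
    print(f"sparse input rejected: {exc}")
```
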
34 changes: 9 additions & 25 deletions tests/test_linearboost.py
@@ -393,7 +393,9 @@ def test_invalid_algorithm_error():
y = np.array([0, 1])

clf = LinearBoostClassifier(algorithm="INVALID")
with pytest.raises(ValueError, match="algorithm must be 'SAMME' or 'SAMME.R'"):
msg1 = "algorithm must be 'SAMME' or 'SAMME.R'"
msg2 = r"The 'algorithm' parameter of LinearBoostClassifier must be a str among \{('SAMME', 'SAMME\.R'|'SAMME\.R', 'SAMME')\}"
with pytest.raises(ValueError, match=rf"({msg1}|{msg2})"):
clf.fit(X, y)


@@ -403,7 +405,9 @@ def test_invalid_scaler_error():
y = np.array([0, 1])

clf = LinearBoostClassifier(scaler="invalid_scaler")
with pytest.raises(ValueError, match="Invalid scaler provided"):
msg1 = "Invalid scaler provided"
msg2 = r"The 'scaler' parameter of LinearBoostClassifier must be a str among .*\. Got 'invalid_scaler' instead\."
with pytest.raises(ValueError, match=rf"({msg1}|{msg2})"):
clf.fit(X, y)


@@ -413,7 +417,9 @@ def test_invalid_class_weight_error():
y = np.array([0, 1])

clf = LinearBoostClassifier(class_weight="invalid_weight")
with pytest.raises(ValueError, match='Valid preset for class_weight is "balanced"'):
msg1 = 'Valid preset for class_weight is "balanced"'
msg2 = r"The 'class_weight' parameter of LinearBoostClassifier must be a str among \{'balanced'\}, an instance of 'dict', an instance of 'list' or None"
with pytest.raises(ValueError, match=rf"({msg1}|{msg2})"):
clf.fit(X, y)


@@ -691,28 +697,6 @@ def test_breast_cancer_dataset():
assert score > 0.5 # Should be better than random guessing


def test_memory_efficiency():
"""Test that LinearBoostClassifier doesn't consume excessive memory."""
X, y = make_classification(
n_samples=200,
n_features=10,
n_redundant=0,
random_state=42,
n_clusters_per_class=1,
)

clf = LinearBoostClassifier(n_estimators=10)
clf.fit(X, y)

# Check that the model doesn't store the training data
assert not hasattr(clf, "X_")
assert not hasattr(clf, "y_")

# Check that estimators are SEFR instances (lightweight)
for estimator in clf.estimators_:
assert estimator.__class__.__name__ == "SEFR"


def test_different_class_labels():
"""Test with different types of class labels."""
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
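
Note on the dual message patterns introduced above: recent scikit-learn releases validate constructor parameters declaratively (via `_parameter_constraints`) and raise `InvalidParameterError`, which subclasses `ValueError`, while the manual checks raise a plain `ValueError` with the hand-written message; matching either pattern under a single `pytest.raises(ValueError, ...)` keeps the tests version-agnostic. A minimal sketch — note the private import path is an assumption and may move between scikit-learn versions:

```python
# Sketch (not part of this PR): InvalidParameterError subclasses ValueError,
# so one pytest.raises(ValueError, match=...) covers both message formats.
from sklearn.utils._param_validation import InvalidParameterError

assert issubclass(InvalidParameterError, ValueError)
```
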
22 changes: 0 additions & 22 deletions tests/test_sefr.py
@@ -457,28 +457,6 @@ def test_breast_cancer_dataset():
assert score > 0.5 # Should be better than random guessing


def test_memory_efficiency():
"""Test that SEFR doesn't consume excessive memory."""
# This is a basic test - in practice you might want more sophisticated memory profiling
X, y = make_classification(
n_samples=1000,
n_features=20,
n_redundant=0,
random_state=42,
n_clusters_per_class=1,
)

sefr = SEFR()
sefr.fit(X, y)

# Check that the model doesn't store the training data
assert not hasattr(sefr, "X_")
assert not hasattr(sefr, "y_")

# Check that coefficients are reasonably sized
assert sefr.coef_.nbytes < 1000 # Should be small for 20 features


def test_different_class_labels():
"""Test with different types of class labels."""
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])