
Commit 2275f4b

Merge pull request #12 from msamsami/refactor-linearboost-remove-sparse-support
Refactor `LinearBoostClassifier` to remove sparse data support
2 parents a635126 + bebfd03 · commit 2275f4b

File tree: 3 files changed (+246 / -54 lines)


src/linearboost/linear_boost.py

Lines changed: 237 additions & 7 deletions
@@ -15,6 +15,7 @@

 import sys
 import warnings
+from abc import abstractmethod
 from numbers import Integral, Real

 if sys.version_info >= (3, 11):
@@ -40,7 +41,7 @@
 from sklearn.utils.multiclass import check_classification_targets, type_of_target
 from sklearn.utils.validation import check_is_fitted

-from ._utils import SKLEARN_V1_6_OR_LATER, check_X_y
+from ._utils import SKLEARN_V1_6_OR_LATER, check_X_y, validate_data
 from .sefr import SEFR

 __all__ = ["LinearBoostClassifier"]
@@ -63,7 +64,201 @@
 }


-class LinearBoostClassifier(AdaBoostClassifier):
+class _DenseAdaBoostClassifier(AdaBoostClassifier):
+    if SKLEARN_V1_6_OR_LATER:
+
+        def __sklearn_tags__(self):
+            tags = super().__sklearn_tags__()
+            tags.input_tags.sparse = False
+            return tags
+
+    def _check_X(self, X):
+        # Only called to validate X in non-fit methods, therefore reset=False
+        return validate_data(
+            self,
+            X,
+            accept_sparse=False,
+            ensure_2d=True,
+            allow_nd=True,
+            dtype=None,
+            reset=False,
+        )
+
+    @abstractmethod
+    def _boost(self, iboost, X, y, sample_weight, random_state):
+        """Implement a single boost.
+
+        Warning: This method needs to be overridden by subclasses.
+
+        Parameters
+        ----------
+        iboost : int
+            The index of the current boost iteration.
+
+        X : {array-like} of shape (n_samples, n_features)
+            The training input samples.
+
+        y : array-like of shape (n_samples,)
+            The target values (class labels).
+
+        sample_weight : array-like of shape (n_samples,)
+            The current sample weights.
+
+        random_state : RandomState
+            The current random number generator.
+
+        Returns
+        -------
+        sample_weight : array-like of shape (n_samples,) or None
+            The reweighted sample weights.
+            If None then boosting has terminated early.
+
+        estimator_weight : float
+            The weight for the current boost.
+            If None then boosting has terminated early.
+
+        error : float
+            The classification error for the current boost.
+            If None then boosting has terminated early.
+        """
+        pass
+
+    def staged_score(self, X, y, sample_weight=None):
+        """Return staged scores for X, y.
+
+        This generator method yields the ensemble score after each iteration of
+        boosting and therefore allows monitoring, such as to determine the
+        score on a test set after each boost.
+
+        Parameters
+        ----------
+        X : {array-like} of shape (n_samples, n_features)
+            The training input samples.
+
+        y : array-like of shape (n_samples,)
+            Labels for X.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights.
+
+        Yields
+        ------
+        z : float
+        """
+        return super().staged_score(X, y, sample_weight)
+
+    def staged_predict(self, X):
+        """Return staged predictions for X.
+
+        The predicted class of an input sample is computed as the weighted mean
+        prediction of the classifiers in the ensemble.
+
+        This generator method yields the ensemble prediction after each
+        iteration of boosting and therefore allows monitoring, such as to
+        determine the prediction on a test set after each boost.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input samples.
+
+        Yields
+        ------
+        y : generator of ndarray of shape (n_samples,)
+            The predicted classes.
+        """
+        return super().staged_predict(X)
+
+    def staged_decision_function(self, X):
+        """Compute decision function of ``X`` for each boosting iteration.
+
+        This method allows monitoring (i.e. determining the error on a test
+        set) after each boosting iteration.
+
+        Parameters
+        ----------
+        X : {array-like} of shape (n_samples, n_features)
+            The training input samples.
+
+        Yields
+        ------
+        score : generator of ndarray of shape (n_samples, k)
+            The decision function of the input samples. The order of
+            outputs is the same as that of the :term:`classes_` attribute.
+            Binary classification is a special case with ``k == 1``,
+            otherwise ``k == n_classes``. For binary classification,
+            values closer to -1 or 1 mean more like the first or second
+            class in ``classes_``, respectively.
+        """
+        return super().staged_decision_function(X)
+
+    def predict_proba(self, X):
+        """Predict class probabilities for X.
+
+        The predicted class probabilities of an input sample are computed as
+        the weighted mean predicted class probabilities of the classifiers
+        in the ensemble.
+
+        Parameters
+        ----------
+        X : {array-like} of shape (n_samples, n_features)
+            The training input samples.
+
+        Returns
+        -------
+        p : ndarray of shape (n_samples, n_classes)
+            The class probabilities of the input samples. The order of
+            outputs is the same as that of the :term:`classes_` attribute.
+        """
+        return super().predict_proba(X)
+
+    def staged_predict_proba(self, X):
+        """Predict class probabilities for X.
+
+        The predicted class probabilities of an input sample are computed as
+        the weighted mean predicted class probabilities of the classifiers
+        in the ensemble.
+
+        This generator method yields the ensemble predicted class probabilities
+        after each iteration of boosting and therefore allows monitoring, such
+        as to determine the predicted class probabilities on a test set after
+        each boost.
+
+        Parameters
+        ----------
+        X : {array-like} of shape (n_samples, n_features)
+            The training input samples.
+
+        Yields
+        ------
+        p : generator of ndarray of shape (n_samples,)
+            The class probabilities of the input samples. The order of
+            outputs is the same as that of the :term:`classes_` attribute.
+        """
+        return super().staged_predict_proba(X)
+
+    def predict_log_proba(self, X):
+        """Predict class log-probabilities for X.
+
+        The predicted class log-probabilities of an input sample are computed
+        as the weighted mean predicted class log-probabilities of the
+        classifiers in the ensemble.
+
+        Parameters
+        ----------
+        X : {array-like} of shape (n_samples, n_features)
+            The training input samples.
+
+        Returns
+        -------
+        p : ndarray of shape (n_samples, n_classes)
+            The class probabilities of the input samples. The order of
+            outputs is the same as that of the :term:`classes_` attribute.
+        """
+        return super().predict_log_proba(X)
+
+
+class LinearBoostClassifier(_DenseAdaBoostClassifier):
     """A LinearBoost classifier.

     A LinearBoost classifier is a meta-estimator based on AdaBoost and SEFR.
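
The practical effect of the new _DenseAdaBoostClassifier base is that the estimator now advertises itself as dense-only through scikit-learn's tags API. A minimal sketch of how this surfaces, assuming scikit-learn >= 1.6 (where the public sklearn.utils.get_tags helper exists) and the package's linearboost.linear_boost import path:

# A minimal sketch, assuming scikit-learn >= 1.6 and the
# linearboost.linear_boost import path.
from sklearn.utils import get_tags

from linearboost.linear_boost import LinearBoostClassifier

# The base class pins the sparse input tag to False, so estimator checks
# and meta-estimators know not to feed this classifier sparse matrices.
tags = get_tags(LinearBoostClassifier())
assert tags.input_tags.sparse is False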
@@ -221,7 +416,6 @@ def __init__(

     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
-        tags.input_tags.sparse = False
         tags.target_tags.required = True
         tags.classifier_tags.multi_class = False
         tags.classifier_tags.poor_score = True
@@ -268,6 +462,25 @@ def _check_X_y(self, X, y) -> tuple[np.ndarray, np.ndarray]:
         return X, y

     def fit(self, X, y, sample_weight=None) -> Self:
+        """Build a LinearBoost classifier from the training set (X, y).
+
+        Parameters
+        ----------
+        X : {array-like} of shape (n_samples, n_features)
+            The training input samples.
+
+        y : array-like of shape (n_samples,)
+            The target values.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights. If None, the sample weights are initialized to
+            1 / n_samples.
+
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
         if self.algorithm not in {"SAMME", "SAMME.R"}:
             raise ValueError("algorithm must be 'SAMME' or 'SAMME.R'")

@@ -322,7 +535,8 @@ def fit(self, X, y, sample_weight=None) -> Self:
             )
         return super().fit(X_transformed, y, sample_weight)

-    def _samme_proba(self, estimator, n_classes, X):
+    @staticmethod
+    def _samme_proba(estimator, n_classes, X):
         """Calculate algorithm 4, step 2, equation c) of Zhu et al [1].

         References
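
For context, equation c) of Zhu et al.'s SAMME.R (algorithm 4, step 2) maps each estimator's class probabilities to symmetrized log-probability scores, which needs no instance state and is why the method can become a @staticmethod. A sketch of that computation under the standard formula (the method body itself is not shown in this diff, so treat this as an illustration, not the exact implementation):

import numpy as np

def samme_r_scores(proba: np.ndarray, n_classes: int) -> np.ndarray:
    """Sketch of Zhu et al., algorithm 4, step 2, equation c).

    Maps predict_proba output of shape (n_samples, n_classes) to the
    symmetric log-probability scores used by SAMME.R; clipping avoids
    taking log(0).
    """
    proba = np.clip(proba, np.finfo(proba.dtype).eps, None)
    log_proba = np.log(proba)
    # (K - 1) * (log p_k - mean over k of log p_k), i.e. h_k(x) in Zhu et al.
    return (n_classes - 1) * (log_proba - log_proba.mean(axis=1, keepdims=True))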
@@ -401,6 +615,23 @@ def _boost(self, iboost, X, y, sample_weight, random_state):
         return sample_weight, estimator_weight, estimator_error

     def decision_function(self, X):
+        """Compute the decision function of ``X``.
+
+        Parameters
+        ----------
+        X : {array-like} of shape (n_samples, n_features)
+            The training input samples.
+
+        Returns
+        -------
+        score : ndarray of shape (n_samples, k)
+            The decision function of the input samples. The order of
+            outputs is the same as that of the :term:`classes_` attribute.
+            Binary classification is a special case with ``k == 1``,
+            otherwise ``k == n_classes``. For binary classification,
+            values closer to -1 or 1 mean more like the first or second
+            class in ``classes_``, respectively.
+        """
         check_is_fitted(self)
         X_transformed = self.scaler_.transform(X)

@@ -431,9 +662,8 @@ def predict(self, X):

         Parameters
         ----------
-        X : {array-like, sparse matrix} of shape (n_samples, n_features)
-            The training input samples. Sparse matrix can be CSC, CSR, COO,
-            DOK, or LIL. COO, DOK, and LIL are converted to CSR.
+        X : {array-like} of shape (n_samples, n_features)
+            The training input samples.

         Returns
         -------
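
Taken together, the changes to this file mean dense inputs keep working as before while sparse matrices are rejected during validation. A usage sketch under two assumptions: the linearboost.linear_boost import path, and scikit-learn's usual behavior of raising TypeError for sparse input when accept_sparse=False:

import numpy as np
from scipy.sparse import csr_matrix

from linearboost.linear_boost import LinearBoostClassifier

X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]])
y = np.array([0, 0, 1, 1])

# Dense input keeps working exactly as before.
clf = LinearBoostClassifier(n_estimators=5).fit(X, y)
print(clf.predict(X))

# Sparse input should now be rejected at validation time; scikit-learn's
# check_array raises TypeError when accept_sparse=False.
try:
    clf.fit(csr_matrix(X), y)
except TypeError as exc:
    print(f"Sparse input rejected: {exc}")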

tests/test_linearboost.py

Lines changed: 9 additions & 25 deletions
@@ -393,7 +393,9 @@ def test_invalid_algorithm_error():
     y = np.array([0, 1])

     clf = LinearBoostClassifier(algorithm="INVALID")
-    with pytest.raises(ValueError, match="algorithm must be 'SAMME' or 'SAMME.R'"):
+    msg1 = "algorithm must be 'SAMME' or 'SAMME.R'"
+    msg2 = r"The 'algorithm' parameter of LinearBoostClassifier must be a str among \{('SAMME', 'SAMME\.R'|'SAMME\.R', 'SAMME')\}"
+    with pytest.raises(ValueError, match=rf"({msg1}|{msg2})"):
         clf.fit(X, y)

@@ -403,7 +405,9 @@ def test_invalid_scaler_error():
     y = np.array([0, 1])

     clf = LinearBoostClassifier(scaler="invalid_scaler")
-    with pytest.raises(ValueError, match="Invalid scaler provided"):
+    msg1 = "Invalid scaler provided"
+    msg2 = r"The 'scaler' parameter of LinearBoostClassifier must be a str among .*\. Got 'invalid_scaler' instead\."
+    with pytest.raises(ValueError, match=rf"({msg1}|{msg2})"):
         clf.fit(X, y)

@@ -413,7 +417,9 @@ def test_invalid_class_weight_error():
     y = np.array([0, 1])

     clf = LinearBoostClassifier(class_weight="invalid_weight")
-    with pytest.raises(ValueError, match='Valid preset for class_weight is "balanced"'):
+    msg1 = 'Valid preset for class_weight is "balanced"'
+    msg2 = r"The 'class_weight' parameter of LinearBoostClassifier must be a str among \{'balanced'\}, an instance of 'dict', an instance of 'list' or None"
+    with pytest.raises(ValueError, match=rf"({msg1}|{msg2})"):
         clf.fit(X, y)

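The widened match patterns in these three tests accommodate two error sources: the classifier's own ValueError, and the InvalidParameterError (a ValueError subclass) that newer scikit-learn versions raise from parameter-constraint validation before fit gets to run. A small sketch of why the alternation works; the example message here is illustrative:

import re

# Either the library's own message...
msg1 = "Invalid scaler provided"
# ...or scikit-learn's constraint-validation message may be raised,
# depending on the installed version.
msg2 = (
    r"The 'scaler' parameter of LinearBoostClassifier must be a str among "
    r".*\. Got 'invalid_scaler' instead\."
)
pattern = rf"({msg1}|{msg2})"

# pytest.raises(..., match=pattern) uses re.search, so the alternation
# matches whichever message the installed version produces.
assert re.search(pattern, "Invalid scaler provided")
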
@@ -691,28 +697,6 @@ def test_breast_cancer_dataset():
     assert score > 0.5  # Should be better than random guessing


-def test_memory_efficiency():
-    """Test that LinearBoostClassifier doesn't consume excessive memory."""
-    X, y = make_classification(
-        n_samples=200,
-        n_features=10,
-        n_redundant=0,
-        random_state=42,
-        n_clusters_per_class=1,
-    )
-
-    clf = LinearBoostClassifier(n_estimators=10)
-    clf.fit(X, y)
-
-    # Check that the model doesn't store the training data
-    assert not hasattr(clf, "X_")
-    assert not hasattr(clf, "y_")
-
-    # Check that estimators are SEFR instances (lightweight)
-    for estimator in clf.estimators_:
-        assert estimator.__class__.__name__ == "SEFR"
-
-
 def test_different_class_labels():
     """Test with different types of class labels."""
     X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])

tests/test_sefr.py

Lines changed: 0 additions & 22 deletions
@@ -457,28 +457,6 @@ def test_breast_cancer_dataset():
     assert score > 0.5  # Should be better than random guessing


-def test_memory_efficiency():
-    """Test that SEFR doesn't consume excessive memory."""
-    # This is a basic test - in practice you might want more sophisticated memory profiling
-    X, y = make_classification(
-        n_samples=1000,
-        n_features=20,
-        n_redundant=0,
-        random_state=42,
-        n_clusters_per_class=1,
-    )
-
-    sefr = SEFR()
-    sefr.fit(X, y)
-
-    # Check that the model doesn't store the training data
-    assert not hasattr(sefr, "X_")
-    assert not hasattr(sefr, "y_")
-
-    # Check that coefficients are reasonably sized
-    assert sefr.coef_.nbytes < 1000  # Should be small for 20 features
-
-
 def test_different_class_labels():
     """Test with different types of class labels."""
     X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
