+ # This file is part of the LinearBoost project.
+ #
+ # Portions of this file are derived from scikit-learn
+ # Copyright (c) 2007–2024, scikit-learn developers (version 1.5)
+ # Licensed under the BSD 3-Clause License
+ # See https://github.com/scikit-learn/scikit-learn/blob/main/COPYING for details.
+ #
+ # Additional code and modifications:
+ # - Hamidreza Keshavarz (hamid9@outlook.com) — machine learning logic, design, and new algorithms
+ # - Mehdi Samsami (mehdisamsami@live.com) — software refactoring, compatibility with scikit-learn framework, and packaging
+ #
+ # The combined work is licensed under the MIT License.
+
from __future__ import annotations

import sys
      StandardScaler,
  )
  from sklearn.utils import compute_sample_weight
- from sklearn.utils._param_validation import Hidden, Interval, StrOptions
+ from sklearn.utils._param_validation import Interval, StrOptions
  from sklearn.utils.multiclass import check_classification_targets, type_of_target
  from sklearn.utils.validation import check_is_fitted

@@ -73,18 +86,10 @@ class LinearBoostClassifier(AdaBoostClassifier):
      algorithm : {'SAMME', 'SAMME.R'}, default='SAMME'
          If 'SAMME' then use the SAMME discrete boosting algorithm.
          If 'SAMME.R' then use the SAMME.R real boosting algorithm
-         (only available in scikit-learn < 1.6).
+         (implementation ported from scikit-learn 1.5).
          The SAMME.R algorithm typically converges faster than SAMME,
          achieving a lower test error with fewer boosting iterations.

-         .. deprecated:: scikit-learn 1.4
-             `"SAMME.R"` is deprecated and will be removed in scikit-learn 1.6.
-             `"SAMME"` will become the default.
-
-         .. deprecated:: scikit-learn 1.6
-             `algorithm` is deprecated and will be removed in scikit-learn 1.8.
-             This estimator only implements the 'SAMME' algorithm in scikit-learn >= 1.6.
-
      scaler : str, default='minmax'
          Specifies the scaler to apply to the data. Options include:
@@ -188,9 +193,7 @@ class LinearBoostClassifier(AdaBoostClassifier):
      _parameter_constraints: dict = {
          "n_estimators": [Interval(Integral, 1, None, closed="left")],
          "learning_rate": [Interval(Real, 0, None, closed="neither")],
-         "algorithm": [StrOptions({"SAMME"}), Hidden(StrOptions({"deprecated"}))]
-         if SKLEARN_V1_6_OR_LATER
-         else [StrOptions({"SAMME", "SAMME.R"})],
+         "algorithm": [StrOptions({"SAMME", "SAMME.R"})],
          "scaler": [StrOptions({s for s in _scalers})],
          "class_weight": [
              StrOptions({"balanced_subsample", "balanced"}),
@@ -206,18 +209,15 @@ def __init__(
          n_estimators=200,
          *,
          learning_rate=1.0,
-         algorithm="SAMME",
+         algorithm="SAMME.R",
          scaler="minmax",
          class_weight=None,
          loss_function=None,
      ):
          super().__init__(
-             estimator=SEFR(),
-             n_estimators=n_estimators,
-             learning_rate=learning_rate,
-             algorithm=algorithm,
+             estimator=SEFR(), n_estimators=n_estimators, learning_rate=learning_rate
          )
-
+         self.algorithm = algorithm
          self.scaler = scaler
          self.class_weight = class_weight
          self.loss_function = loss_function
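Note: the `algorithm` argument is no longer forwarded to `AdaBoostClassifier.__init__`, since scikit-learn deprecates that parameter in 1.6 and removes it in 1.8 (see the docstring block deleted above); it is instead kept as a plain instance attribute. A minimal sketch of the same pattern, separate from this class (`MyBoost` is illustrative only):

```python
from sklearn.ensemble import AdaBoostClassifier

class MyBoost(AdaBoostClassifier):
    def __init__(self, n_estimators=50, algorithm="SAMME.R"):
        # Do not forward `algorithm`: scikit-learn 1.6 deprecates it, 1.8 removes it.
        super().__init__(n_estimators=n_estimators)
        # Store it under the same name so get_params()/set_params() still see it.
        self.algorithm = algorithm
```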
@@ -241,7 +241,11 @@ def _more_tags(self) -> dict[str, bool]:
              "check_sample_weight_equivalence_on_dense_data": (
                  "In LinearBoostClassifier, setting a sample's weight to 0 can produce a different "
                  "result than omitting the sample. Such samples intentionally still affect the data scaling process."
-             )
+             ),
+             "check_sample_weights_invariance": (
+                 "In LinearBoostClassifier, a zero sample_weight is not equivalent to removing the sample, "
+                 "as samples with zero weight intentionally still affect the data scaling process."
+             ),
          },
      }
@@ -269,9 +273,8 @@ def _check_X_y(self, X, y) -> tuple[np.ndarray, np.ndarray]:
          return X, y

      def fit(self, X, y, sample_weight=None) -> Self:
-         X, y = self._check_X_y(X, y)
-         self.classes_ = np.unique(y)
-         self.n_classes_ = self.classes_.shape[0]
+         if self.algorithm not in {"SAMME", "SAMME.R"}:
+             raise ValueError("algorithm must be 'SAMME' or 'SAMME.R'")

          if self.scaler not in _scalers:
              raise ValueError('Invalid scaler provided; got "%s".' % self.scaler)
@@ -283,6 +286,25 @@ def fit(self, X, y, sample_weight=None) -> Self:
              clone(_scalers[self.scaler]), clone(_scalers["minmax"])
          )
          X_transformed = self.scaler_.fit_transform(X)
+         y = np.asarray(y)
+
+         if sample_weight is not None:
+             sample_weight = np.asarray(sample_weight)
+             if sample_weight.shape[0] != X_transformed.shape[0]:
+                 raise ValueError(
+                     f"sample_weight.shape == {sample_weight.shape} is incompatible with X.shape == {X_transformed.shape}"
+                 )
+             nonzero_mask = (
+                 sample_weight.sum(axis=1) != 0
+                 if sample_weight.ndim > 1
+                 else sample_weight != 0
+             )
+             X_transformed = X_transformed[nonzero_mask]
+             y = y[nonzero_mask]
+             sample_weight = sample_weight[nonzero_mask]
+         X_transformed, y = self._check_X_y(X_transformed, y)
+         self.classes_ = np.unique(y)
+         self.n_classes_ = self.classes_.shape[0]

          if self.class_weight is not None:
              valid_presets = ("balanced", "balanced_subsample")
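Note: the added block above scales X first and only afterwards drops zero-weight rows, so those samples still shape the fitted scaler (this is exactly what the xfail tags earlier in this diff document). A standalone sketch of the masking behavior with toy arrays (values are illustrative):

```python
import numpy as np

X = np.array([[0.0], [5.0], [10.0]])
y = np.array([0, 1, 1])
sample_weight = np.array([1.0, 0.0, 2.0])

# Zero-weight rows are removed from the boosting data...
mask = sample_weight != 0
X_kept, y_kept, w_kept = X[mask], y[mask], sample_weight[mask]

# ...but the dropped row already influenced scaling: a min-max scaler
# fit on the full X maps 5.0 to 0.5, not to 0.0.
print(X_kept.ravel(), y_kept, w_kept)
```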
@@ -307,50 +329,131 @@ def fit(self, X, y, sample_weight=None) -> Self:
              warnings.filterwarnings(
                  "ignore",
                  category=FutureWarning,
-                 message=".*parameter 'algorithm' is deprecated.*",
+                 message=".*parameter 'algorithm' may change in the future.*",
              )
          return super().fit(X_transformed, y, sample_weight)

+     def _samme_proba(self, estimator, n_classes, X):
+         """Calculate algorithm 4, step 2, equation c) of Zhu et al [1].
+
+         References
+         ----------
+         .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.
+
+         """
+         proba = estimator.predict_proba(X)
+
+         # Displace zero probabilities so the log is defined.
+         # Also fix negative elements which may occur with
+         # negative sample weights.
+         np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba)
+         log_proba = np.log(proba)
+
+         return (n_classes - 1) * (
+             log_proba - (1.0 / n_classes) * log_proba.sum(axis=1)[:, np.newaxis]
+         )
+
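For reference, the quantity returned by `_samme_proba` is the symmetric log-probability vote from Zhu et al.'s Algorithm 4, with K the number of classes and p_k(x) the estimator's predicted probability for class k:

```latex
h_k(x) = (K - 1)\left( \log p_k(x) - \frac{1}{K} \sum_{j=1}^{K} \log p_j(x) \right)
```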
      def _boost(self, iboost, X, y, sample_weight, random_state):
          estimator = self._make_estimator(random_state=random_state)
          estimator.fit(X, y, sample_weight=sample_weight)

-         y_pred = estimator.predict(X)
-         missclassified = y_pred != y
+         if self.algorithm == "SAMME.R":
+             y_pred = estimator.predict(X)

-         if self.loss_function:
-             estimator_error = self.loss_function(y, y_pred, sample_weight)
-         else:
+             incorrect = y_pred != y
              estimator_error = np.mean(
-                 np.average(missclassified, weights=sample_weight, axis=0)
+                 np.average(incorrect, weights=sample_weight, axis=0)
              )

-         if estimator_error <= 0:
-             return sample_weight, 1.0, 0.0
+             if estimator_error <= 0:
+                 return sample_weight, 1.0, 0.0
+             elif estimator_error >= 0.5:
+                 if len(self.estimators_) > 1:
+                     self.estimators_.pop(-1)
+                 return None, None, None

-         if estimator_error >= 0.5:
-             self.estimators_.pop(-1)
-             if len(self.estimators_) == 0:
-                 raise ValueError(
-                     "BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit."
+             # Compute SEFR-specific weight update
+             estimator_weight = self.learning_rate * np.log(
+                 (1 - estimator_error) / estimator_error
+             )
+
+             if iboost < self.n_estimators - 1:
+                 sample_weight = np.exp(
+                     np.log(sample_weight)
+                     + estimator_weight * incorrect * (sample_weight > 0)
                  )
-             return None, None, None

-         estimator_weight = (
-             self.learning_rate
-             * 0.5
-             * np.log((1.0 - estimator_error) / max(estimator_error, 1e-10))
-         )
+             return sample_weight, estimator_weight, estimator_error
+
+         else:  # standard SAMME
+             y_pred = estimator.predict(X)
+             incorrect = y_pred != y
+             estimator_error = np.mean(np.average(incorrect, weights=sample_weight))
+
+             if estimator_error <= 0:
+                 return sample_weight, 1.0, 0.0
+             if estimator_error >= 0.5:
+                 self.estimators_.pop(-1)
+                 if len(self.estimators_) == 0:
+                     raise ValueError(
+                         "BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble cannot be fit."
+                     )
+                 return None, None, None
+
+             estimator_weight = self.learning_rate * np.log(
+                 (1.0 - estimator_error) / max(estimator_error, 1e-10)
+             )

-         sample_weight *= np.exp(
-             estimator_weight
-             * missclassified
-             * ((sample_weight > 0) | (estimator_weight < 0))
-         )
+             sample_weight *= np.exp(estimator_weight * incorrect)
+
+             # Normalize sample weights
+             sample_weight /= np.sum(sample_weight)

-         return sample_weight, estimator_weight, estimator_error
+             return sample_weight, estimator_weight, estimator_error
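As a quick numeric check of the `log((1 - err) / err)` update used in both branches above (the error value and learning rate here are illustrative, not from this diff):

```python
import numpy as np

learning_rate = 1.0
estimator_error = 0.2  # hypothetical weighted error for one boosting round

estimator_weight = learning_rate * np.log(
    (1.0 - estimator_error) / max(estimator_error, 1e-10)
)
# Prints ~1.3863: misclassified samples are reweighted by e**1.3863 ~ 4x.
print(round(estimator_weight, 4))
```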

      def decision_function(self, X):
          check_is_fitted(self)
          X_transformed = self.scaler_.transform(X)
-         return super().decision_function(X_transformed)
+
+         if self.algorithm == "SAMME.R":
+             # Proper SAMME.R decision function
+             classes = self.classes_
+             n_classes = len(classes)
+
+             pred = sum(
+                 self._samme_proba(estimator, n_classes, X_transformed)
+                 for estimator in self.estimators_
+             )
+             pred /= self.estimator_weights_.sum()
+             if n_classes == 2:
+                 pred[:, 0] *= -1
+                 return pred.sum(axis=1)
+             return pred
+
+         else:
+             # Standard SAMME decision function from AdaBoostClassifier (discrete)
+             return super().decision_function(X_transformed)
+
+     def predict(self, X):
+         """Predict classes for X.
+
+         The predicted class of an input sample is computed as the weighted mean
+         prediction of the classifiers in the ensemble.
+
+         Parameters
+         ----------
+         X : {array-like, sparse matrix} of shape (n_samples, n_features)
+             The input samples. Sparse matrix can be CSC, CSR, COO,
+             DOK, or LIL. COO, DOK, and LIL are converted to CSR.
+
+         Returns
+         -------
+         y : ndarray of shape (n_samples,)
+             The predicted classes.
+         """
+         pred = self.decision_function(X)
+
+         if self.n_classes_ == 2:
+             return self.classes_.take(pred > 0, axis=0)
+
+         return self.classes_.take(np.argmax(pred, axis=1), axis=0)
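Taken together, a minimal smoke test of the patched estimator might look like the sketch below. The import path and parameter values are assumptions for illustration, not part of this diff:

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

from linearboost import LinearBoostClassifier  # hypothetical import path

X, y = make_classification(n_samples=300, n_features=8, random_state=0)

# Exercise both boosting variants handled by this patch.
for algorithm in ("SAMME", "SAMME.R"):
    clf = LinearBoostClassifier(n_estimators=100, algorithm=algorithm)
    scores = cross_val_score(clf, X, y, cv=5)
    print(algorithm, scores.mean())
```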