#20 - fixed scipy.sparse support in expected_error, fixed issues from code review

Boyan Hristov · Boyan Hristov · commit 143067c9b8ff · 2020-10-16T21:01:04.000+02:00
diff --git a/modAL/expected_error.py b/modAL/expected_error.py
@@ -10,7 +10,7 @@
 from sklearn.exceptions import NotFittedError
 
 from modAL.models import ActiveLearner
-from modAL.utils.data import modALinput, data_vstack, enumerate_data, drop_rows
+from modAL.utils.data import modALinput, data_vstack, enumerate_data, drop_rows, data_shape, add_row
 from modAL.utils.selection import multi_argmax, shuffled_argmax
 from modAL.uncertainty import _proba_uncertainty, _proba_entropy
 
@@ -38,14 +38,13 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str =
 
 
     Returns:
-        The indices of the instances from X chosen to be labelled;
-        the instances from X chosen to be labelled.
+        The indices of the instances from X chosen to be labelled.
     """
 
     assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'
     assert loss in ['binary', 'log'], 'loss must be \'binary\' or \'log\''
 
-    expected_error = np.zeros(shape=(len(X), ))
+    expected_error = np.zeros(shape=(data_shape(X)[0],))
     possible_labels = np.unique(learner.y_training)
 
     try:
@@ -62,7 +61,7 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str =
             X_reduced = drop_rows(X, x_idx)
             # estimate the expected error
             for y_idx, y in enumerate(possible_labels):
-                X_new = data_vstack((learner.X_training, [x]))
+                X_new = add_row(learner.X_training, x)
                 y_new = data_vstack((learner.y_training, np.array(y).reshape(1,)))
 
                 cloned_estimator.fit(X_new, y_new)
diff --git a/modAL/utils/data.py b/modAL/utils/data.py
@@ -1,12 +1,11 @@
 from typing import Union, List, Sequence
-from itertools import chain
 
 import numpy as np
 import pandas as pd
 import scipy.sparse as sp
 
 
-modALinput = Union[list, np.ndarray, sp.csr_matrix, pd.DataFrame]
+modALinput = Union[sp.csr_matrix, pd.DataFrame, np.ndarray, list]
 
 
 def data_vstack(blocks: Sequence[modALinput]) -> modALinput:
@@ -19,60 +18,137 @@ def data_vstack(blocks: Sequence[modALinput]) -> modALinput:
     Returns:
         New sequence of vertically stacked elements.
     """
-    if isinstance(blocks[0], np.ndarray):
-        return np.concatenate(blocks)
-    elif isinstance(blocks[0], list):
-        return list(chain(blocks))
-    elif sp.issparse(blocks[0]):
+    if any([sp.issparse(b) for b in blocks]):
         return sp.vstack(blocks)
     elif isinstance(blocks[0], pd.DataFrame):
-        return blocks[0].append(blocks[1])
-    else:
-        try:
-            return np.concatenate(blocks)
-        except:
-            raise TypeError('%s datatype is not supported' % type(blocks[0]))
+        return blocks[0].append(blocks[1:])
+    elif isinstance(blocks[0], np.ndarray):
+        return np.concatenate(blocks)
+    elif isinstance(blocks[0], list):
+        return np.concatenate(blocks).tolist()
+
+    raise TypeError('%s datatype is not supported' % type(blocks[0]))
 
 
 def data_hstack(blocks: Sequence[modALinput]) -> modALinput:
     """
-    Stack horizontally both sparse and dense arrays
+    Stack horizontally sparse/dense arrays, lists and pandas data frames; raises TypeError otherwise.
 
     Args:
         blocks: Sequence of modALinput objects.
 
     Returns:
         New sequence of horizontally stacked elements.
     """
-    # use sparse representation if any of the blocks do
     if any([sp.issparse(b) for b in blocks]):
         return sp.hstack(blocks)
-
-    try:
+    elif isinstance(blocks[0], pd.DataFrame):
+        return pd.concat(blocks, axis=1)
+    elif isinstance(blocks[0], np.ndarray):
         return np.hstack(blocks)
-    except:
-        raise TypeError('%s datatype is not supported' % type(blocks[0]))
+    elif isinstance(blocks[0], list):
+        return np.hstack(blocks).tolist()
+
+    raise TypeError('%s datatype is not supported' % type(blocks[0]))
+
+
+def add_row(X:modALinput, row: modALinput):
+    """
+    Returns X' = [X; row], i.e. row stacked vertically below X.
+
+    numpy arrays and lists are stacked with np.vstack (lists are converted
+    back with .tolist()); every other input type is delegated to
+    data_vstack([X, row]).
+    """
+    if isinstance(X, np.ndarray):
+        return np.vstack((X, row))
+    elif isinstance(X, list):
+        return np.vstack((X, row)).tolist()
+
+    # data_vstack readily supports stacking of matrix as first argument
+    # and row as second for the other data types
+    return data_vstack([X, row])
 
 
 def retrieve_rows(X: modALinput,
                   I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]:
     """
     Returns the rows I from the data set X
+
+    For a single index, the result is as follows:
+    * 1xM matrix in case of scipy sparse NxM matrix X
+    * pandas series in case of a pandas data frame
+    * row in case of list or numpy format
     """
-    if isinstance(X, pd.DataFrame):
+    if sp.issparse(X):
+        # Out of the sparse matrix formats (sp.csc_matrix, sp.csr_matrix, sp.bsr_matrix,
+        # sp.lil_matrix, sp.dok_matrix, sp.coo_matrix, sp.dia_matrix), only sp.bsr_matrix, sp.coo_matrix
+        # and sp.dia_matrix don't support indexing and need to be converted to a sparse format
+        # that does support indexing. It seems conversion to CSR is currently most efficient.
+
+        try:
+            return X[I]
+        except Exception:
+            sp_format = X.getformat()
+            return X.tocsr()[I].asformat(sp_format)
+    elif isinstance(X, pd.DataFrame):
         return X.iloc[I]
+    elif isinstance(X, np.ndarray):
+        return X[I]
+    elif isinstance(X, list):
+        return np.array(X)[I].tolist()
+
+    raise TypeError('%s datatype is not supported' % type(X))
 
-    return X[I]
 
 def drop_rows(X: modALinput,
               I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]:
-    if isinstance(X, pd.DataFrame):
+    """
+    Returns X without the row(s) at index/indices I (NOTE(review): DataFrames use X.drop(I), retrieve_rows uses iloc - confirm)
+    """
+    if sp.issparse(X):
+        mask = np.ones(X.shape[0], dtype=bool)
+        mask[I] = False
+        return retrieve_rows(X, mask)
+    elif isinstance(X, pd.DataFrame):
         return X.drop(I, axis=0)
+    elif isinstance(X, np.ndarray):
+        return np.delete(X, I, axis=0)
+    elif isinstance(X, list):
+        return np.delete(X, I, axis=0).tolist()
+
+    raise TypeError('%s datatype is not supported' % type(X))
 
-    return np.delete(X, I, axis=0)
 
 def enumerate_data(X: modALinput):
-    if isinstance(X, pd.DataFrame):
+    """
+    for i, x in enumerate_data(X):
+
+    Depending on the data type of X, returns:
+
+    * A 1xM matrix in case of scipy sparse NxM matrix X
+    * pandas series in case of a pandas data frame X
+    * row in case of list or numpy format
+    """
+    if sp.issparse(X):
+        return enumerate(X.tocsr())
+    elif isinstance(X, pd.DataFrame):
         return X.iterrows()
+    elif isinstance(X, np.ndarray) or isinstance(X, list):
+        # numpy arrays and lists can readily be enumerated
+        return enumerate(X)
+
+    raise TypeError('%s datatype is not supported' % type(X))
+
+
+def data_shape(X: modALinput):
+    """
+    Returns the shape of the data set X
+    """
+    if sp.issparse(X) or isinstance(X, pd.DataFrame) or isinstance(X, np.ndarray):
+        # scipy.sparse, pandas and numpy all support .shape
+        return X.shape
+    elif isinstance(X, list):
+        return np.array(X).shape
 
-    return enumerate(X)
+    raise TypeError('%s datatype is not supported' % type(X))
diff --git a/tests/core_tests.py b/tests/core_tests.py
@@ -457,21 +457,24 @@ def test_max_std_sampling(self):
 class TestEER(unittest.TestCase):
     def test_eer(self):
         for n_pool, n_features, n_classes in product(range(5, 10), range(1, 5), range(2, 5)):
-            X_training, y_training = np.random.rand(10, n_features), np.random.randint(0, n_classes, size=10)
-            X_pool, y_pool = np.random.rand(n_pool, n_features), np.random.randint(0, n_classes+1, size=n_pool)
-
-            learner = modAL.models.ActiveLearner(RandomForestClassifier(n_estimators=2),
-                                                 X_training=X_training, y_training=y_training)
-
-            modAL.expected_error.expected_error_reduction(learner, X_pool)
-            modAL.expected_error.expected_error_reduction(learner, X_pool, random_tie_break=True)
-            modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1)
-            modAL.expected_error.expected_error_reduction(learner, X_pool, loss='binary')
-            modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1, loss='log')
-            self.assertRaises(AssertionError, modAL.expected_error.expected_error_reduction,
-                              learner, X_pool, p_subsample=1.5)
-            self.assertRaises(AssertionError, modAL.expected_error.expected_error_reduction,
-                              learner, X_pool, loss=42)
+            X_training_, y_training = np.random.rand(10, n_features).tolist(), np.random.randint(0, n_classes, size=10)
+            X_pool_, y_pool = np.random.rand(n_pool, n_features).tolist(), np.random.randint(0, n_classes+1, size=n_pool)
+
+            for data_type in (sp.csr_matrix, pd.DataFrame, np.array, list):
+                X_training, X_pool = data_type(X_training_), data_type(X_pool_)
+
+                learner = modAL.models.ActiveLearner(RandomForestClassifier(n_estimators=2),
+                                                     X_training=X_training, y_training=y_training)
+
+                modAL.expected_error.expected_error_reduction(learner, X_pool)
+                modAL.expected_error.expected_error_reduction(learner, X_pool, random_tie_break=True)
+                modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1)
+                modAL.expected_error.expected_error_reduction(learner, X_pool, loss='binary')
+                modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1, loss='log')
+                self.assertRaises(AssertionError, modAL.expected_error.expected_error_reduction,
+                                  learner, X_pool, p_subsample=1.5)
+                self.assertRaises(AssertionError, modAL.expected_error.expected_error_reduction,
+                                  learner, X_pool, loss=42)
 
 
 class TestUncertainties(unittest.TestCase):