|
4 | 4 | __all__ = ["BaseAnomalyDetector"]
|
5 | 5 |
|
6 | 6 | from abc import abstractmethod
|
7 |
| -from typing import final |
8 | 7 |
|
9 | 8 | import numpy as np
|
10 |
| -import pandas as pd |
11 | 9 |
|
12 |
| -from aeon.base import BaseSeriesEstimator |
13 |
| -from aeon.base._base_series import VALID_SERIES_INPUT_TYPES |
| 10 | +from aeon.base import BaseAeonEstimator |
14 | 11 |
|
15 | 12 |
|
16 |
| -class BaseAnomalyDetector(BaseSeriesEstimator): |
17 |
| - """Base class for anomaly detection algorithms. |
18 |
| -
|
19 |
| - Anomaly detection algorithms are used to identify anomalous subsequences in time |
20 |
| - series data. These algorithms take a series of length m and return a boolean, int or |
21 |
| - float array of length m, where each element indicates whether the corresponding |
22 |
| - subsequence is anomalous or its anomaly score. |
23 |
| -
|
24 |
| - Input and internal data format (where m is the number of time points and d is the |
25 |
| - number of channels): |
26 |
| - Univariate series (default): |
27 |
| - np.ndarray, shape ``(m,)``, ``(m, 1)`` or ``(1, m)`` depending on axis. |
28 |
| - This is converted to a 2D np.ndarray internally. |
29 |
| - pd.DataFrame, shape ``(m, 1)`` or ``(1, m)`` depending on axis. |
30 |
| - pd.Series, shape ``(m,)``. |
31 |
| - Multivariate series: |
32 |
| - np.ndarray array, shape ``(m, d)`` or ``(d, m)`` depending on axis. |
33 |
| - pd.DataFrame ``(m, d)`` or ``(d, m)`` depending on axis. |
34 |
| -
|
35 |
| - Output data format (one of the following): |
36 |
| - Anomaly scores (default): |
37 |
| - np.ndarray, shape ``(m,)`` of type float. For each point of the input time |
38 |
| - series, the anomaly score is a float value indicating the degree of |
39 |
| - anomalousness. The higher the score, the more anomalous the point. |
40 |
| - Binary classification: |
41 |
| - np.ndarray, shape ``(m,)`` of type bool or int. For each point of the input |
42 |
| - time series, the output is a boolean or integer value indicating whether the |
43 |
| - point is anomalous (``True``/``1``) or not (``False``/``0``). |
44 |
| -
|
45 |
| - Detector learning types: |
46 |
| - Unsupervised (default): |
47 |
| - Unsupervised detectors do not require any training data and can directly be |
48 |
| - used on the target time series. Their tags are set to ``fit_is_empty=True`` |
49 |
| - and ``requires_y=False``. You would usually call the ``fit_predict`` method |
50 |
| - on these detectors. |
51 |
| - Semi-supervised: |
52 |
| - Semi-supervised detectors require a training step on a time series without |
53 |
| - anomalies (normal behaving time series). The target value ``y`` would |
54 |
| - consist of only zeros. Thus, these algorithms have logic in the ``fit`` |
55 |
| - method, but do not require the target values. Their tags are set to |
56 |
| - ``fit_is_empty=False`` and ``requires_y=False``. You would usually first |
57 |
| - call the ``fit`` method on the training data and then the ``predict`` |
58 |
| - method for your target time series. |
59 |
| - Supervised: |
60 |
| - Supervised detectors require a training step on a time series with known |
61 |
| - anomalies (anomalies should be present and must be annotated). The detector |
62 |
| - implements the ``fit`` method, and the target value ``y`` consists of zeros |
63 |
| - and ones. Their tags are, thus, set to ``fit_is_empty=False`` and |
64 |
| - ``requires_y=True``. You would usually first call the ``fit`` method on the |
65 |
| - training data and then the ``predict`` method for your target time series. |
66 |
| -
|
67 |
| - Parameters |
68 |
| - ---------- |
69 |
| - axis : int |
70 |
| - The time point axis of the input series if it is 2D. If ``axis==0``, it is |
71 |
| - assumed each column is a time series and each row is a time point. i.e. the |
72 |
| - shape of the data is ``(n_timepoints, n_channels)``. ``axis==1`` indicates |
73 |
| - the time series are in rows, i.e. the shape of the data is |
74 |
| - ``(n_channels, n_timepoints)``. |
75 |
| - Setting this class variable will convert the input data to the chosen axis. |
76 |
| - """ |
| 13 | +class BaseAnomalyDetector(BaseAeonEstimator): |
| 14 | + """Anomaly detection base class.""" |
77 | 15 |
|
78 | 16 | _tags = {
|
79 |
| - "X_inner_type": "np.ndarray", # One of VALID_SERIES_INNER_TYPES |
80 | 17 | "fit_is_empty": True,
|
81 | 18 | "requires_y": False,
|
82 | 19 | "learning_type:unsupervised": False,
|
83 | 20 | "learning_type:semi_supervised": False,
|
84 | 21 | "learning_type:supervised": False,
|
85 | 22 | }
|
86 | 23 |
|
87 |
| - def __init__(self, axis): |
88 |
| - super().__init__(axis=axis) |
| 24 | + def __init__(self): |
| 25 | + super().__init__() |
89 | 26 |
|
90 |
| - @final |
91 |
| - def fit(self, X, y=None, axis=1): |
92 |
| - """Fit time series anomaly detector to X. |
| 27 | + @abstractmethod |
| 28 | + def fit(self, X, y=None): |
| 29 | + """Fit anomaly detector to X, optionally to y. |
| 30 | +
|
| 31 | + State change: |
| 32 | + Changes state to "fitted". |
93 | 33 |
|
94 |
| - If the tag ``fit_is_empty`` is true, this just sets the ``is_fitted`` tag to |
95 |
| - true. Otherwise, it checks ``self`` can handle ``X``, formats ``X`` into |
96 |
| - the structure required by ``self`` then passes ``X`` (and possibly ``y``) to |
97 |
| - ``_fit``. |
| 34 | + Writes to self: |
| 35 | + is_fitted : flag is set to True. |
98 | 36 |
|
99 | 37 | Parameters
|
100 | 38 | ----------
|
101 |
| - X : one of aeon.base._base_series.VALID_SERIES_INPUT_TYPES |
102 |
| - The time series to fit the model to. |
103 |
| - A valid aeon time series data structure. See |
104 |
| - aeon.base._base_series.VALID_SERIES_INPUT_TYPES for aeon supported types. |
105 |
| - y : one of aeon.base._base_series.VALID_SERIES_INPUT_TYPES, default=None |
106 |
| - The target values for the time series. |
107 |
| - A valid aeon time series data structure. See |
108 |
| - aeon.base._base_series.VALID_SERIES_INPUT_TYPES for aeon supported types. |
109 |
| - axis : int |
110 |
| - The time point axis of the input series if it is 2D. If ``axis==0``, it is |
111 |
| - assumed each column is a time series and each row is a time point. i.e. the |
112 |
| - shape of the data is ``(n_timepoints, n_channels)``. ``axis==1`` indicates |
113 |
| - the time series are in rows, i.e. the shape of the data is |
114 |
| - ``(n_channels, n_timepoints)``. |
| 39 | + X : Series or Collection, any supported type |
| 40 | + Data to fit anomaly detector to, of python type as follows: |
| 41 | + Series: 2D np.ndarray shape (n_channels, n_timepoints) |
| 42 | + Collection: 3D np.ndarray shape (n_cases, n_channels, n_timepoints) |
| 43 | + or list of 2D np.ndarray, case i has shape (n_channels, n_timepoints_i) |
| 44 | + y : Series, default=None |
| 45 | + Additional data, e.g., labels for anomaly detector. |
115 | 46 |
|
116 | 47 | Returns
|
117 | 48 | -------
|
118 | 49 | BaseAnomalyDetector
|
119 | 50 | The fitted estimator, reference to self.
|
120 | 51 | """
|
121 |
| - if self.get_tag("fit_is_empty"): |
122 |
| - self.is_fitted = True |
123 |
| - return self |
124 |
| - |
125 |
| - if self.get_tag("requires_y"): |
126 |
| - if y is None: |
127 |
| - raise ValueError("Tag requires_y is true, but fit called with y=None") |
128 |
| - |
129 |
| - # reset estimator at the start of fit |
130 |
| - self.reset() |
| 52 | + ... |
131 | 53 |
|
132 |
| - X = self._preprocess_series(X, axis, True) |
133 |
| - if y is not None: |
134 |
| - y = self._check_y(y) |
135 |
| - |
136 |
| - self._fit(X=X, y=y) |
137 |
| - |
138 |
| - # this should happen last |
139 |
| - self.is_fitted = True |
140 |
| - return self |
141 |
| - |
142 |
| - @final |
143 |
| - def predict(self, X, axis=1) -> np.ndarray: |
| 54 | + @abstractmethod |
| 55 | + def predict(self, X) -> np.ndarray: |
144 | 56 | """Find anomalies in X.
|
145 | 57 |
|
146 | 58 | Parameters
|
147 | 59 | ----------
|
148 |
| - X : one of aeon.base._base_series.VALID_SERIES_INPUT_TYPES |
149 |
| - The time series to fit the model to. |
150 |
| - A valid aeon time series data structure. See |
151 |
| - aeon.base._base_series.VALID_SERIES_INPUT_TYPES for aeon supported types. |
152 |
| - axis : int, default=1 |
153 |
| - The time point axis of the input series if it is 2D. If ``axis==0``, it is |
154 |
| - assumed each column is a time series and each row is a time point. i.e. the |
155 |
| - shape of the data is ``(n_timepoints, n_channels)``. ``axis==1`` indicates |
156 |
| - the time series are in rows, i.e. the shape of the data is |
157 |
| - ``(n_channels, n_timepoints)``. |
| 60 | + X : Series or Collection, any supported type |
| 61 | + Data to find anomalies in, of python type as follows: |
| 62 | + Series: 2D np.ndarray shape (n_channels, n_timepoints) |
| 63 | + Collection: 3D np.ndarray shape (n_cases, n_channels, n_timepoints) |
| 64 | + or list of 2D np.ndarray, case i has shape (n_channels, n_timepoints_i) |
158 | 65 |
|
159 | 66 | Returns
|
160 | 67 | -------
|
161 | 68 | np.ndarray
|
162 | 69 | A boolean, int or float array of length len(X), where each element indicates
|
163 |
| - whether the corresponding subsequence is anomalous or its anomaly score. |
| 70 | + whether the corresponding subsequence/case is anomalous or its anomaly |
| 71 | + score. |
164 | 72 | """
|
165 |
| - fit_empty = self.get_tag("fit_is_empty") |
166 |
| - if not fit_empty: |
167 |
| - self._check_is_fitted() |
| 73 | + ... |
168 | 74 |
|
169 |
| - X = self._preprocess_series(X, axis, False) |
170 |
| - |
171 |
| - return self._predict(X) |
172 |
| - |
173 |
| - @final |
174 |
| - def fit_predict(self, X, y=None, axis=1) -> np.ndarray: |
| 75 | + @abstractmethod |
| 76 | + def fit_predict(self, X, y=None) -> np.ndarray: |
175 | 77 | """Fit time series anomaly detector and find anomalies for X.
|
176 | 78 |
|
177 | 79 | Parameters
|
178 | 80 | ----------
|
179 |
| - X : one of aeon.base._base_series.VALID_SERIES_INPUT_TYPES |
180 |
| - The time series to fit the model to. |
181 |
| - A valid aeon time series data structure. See |
182 |
| - aeon.base._base_series.VALID_INPUT_TYPES for aeon supported types. |
183 |
| - y : one of aeon.base._base_series.VALID_SERIES_INPUT_TYPES, default=None |
184 |
| - The target values for the time series. |
185 |
| - A valid aeon time series data structure. See |
186 |
| - aeon.base._base_series.VALID_SERIES_INPUT_TYPES for aeon supported types. |
187 |
| - axis : int, default=1 |
188 |
| - The time point axis of the input series if it is 2D. If ``axis==0``, it is |
189 |
| - assumed each column is a time series and each row is a time point. i.e. the |
190 |
| - shape of the data is ``(n_timepoints, n_channels)``. ``axis==1`` indicates |
191 |
| - the time series are in rows, i.e. the shape of the data is |
192 |
| - ``(n_channels, n_timepoints)``. |
| 81 | + X : Series or Collection, any supported type |
| 82 | + Data to fit to and detect anomalies in, of python type as follows: |
| 83 | + Series: 2D np.ndarray shape (n_channels, n_timepoints) |
| 84 | + Collection: 3D np.ndarray shape (n_cases, n_channels, n_timepoints) |
| 85 | + or list of 2D np.ndarray, case i has shape (n_channels, n_timepoints_i) |
193 | 86 |
|
194 | 87 | Returns
|
195 | 88 | -------
|
196 | 89 | np.ndarray
|
197 | 90 | A boolean, int or float array of length len(X), where each element indicates
|
198 |
| - whether the corresponding subsequence is anomalous or its anomaly score. |
| 91 | + whether the corresponding subsequence/case is anomalous or its anomaly |
| 92 | + score. |
199 | 93 | """
|
200 |
| - if self.get_tag("requires_y"): |
201 |
| - if y is None: |
202 |
| - raise ValueError("Tag requires_y is true, but fit called with y=None") |
203 |
| - |
204 |
| - # reset estimator at the start of fit |
205 |
| - self.reset() |
206 |
| - |
207 |
| - X = self._preprocess_series(X, axis, True) |
208 |
| - |
209 |
| - if self.get_tag("fit_is_empty"): |
210 |
| - self.is_fitted = True |
211 |
| - return self._predict(X) |
212 |
| - |
213 |
| - if y is not None: |
214 |
| - y = self._check_y(y) |
215 |
| - |
216 |
| - pred = self._fit_predict(X, y) |
217 |
| - |
218 |
| - # this should happen last |
219 |
| - self.is_fitted = True |
220 |
| - return pred |
221 |
| - |
222 |
| - def _fit(self, X, y): |
223 |
| - return self |
224 |
| - |
225 |
| - @abstractmethod |
226 |
| - def _predict(self, X) -> np.ndarray: ... |
227 |
| - |
228 |
| - def _fit_predict(self, X, y): |
229 |
| - self._fit(X, y) |
230 |
| - return self._predict(X) |
231 |
| - |
232 |
| - def _check_y(self, y: VALID_SERIES_INPUT_TYPES) -> np.ndarray: |
233 |
| - # Remind user if y is not required for this estimator on failure |
234 |
| - req_msg = ( |
235 |
| - f"{self.__class__.__name__} does not require a y input." |
236 |
| - if self.get_tag("requires_y") |
237 |
| - else "" |
238 |
| - ) |
239 |
| - new_y = y |
240 |
| - |
241 |
| - # must be a valid input type, see VALID_SERIES_INPUT_TYPES in |
242 |
| - # BaseSeriesEstimator |
243 |
| - if isinstance(y, np.ndarray): |
244 |
| - # check valid shape |
245 |
| - if y.ndim > 1: |
246 |
| - raise ValueError( |
247 |
| - "Error in input type for y: y input as np.ndarray should be 1D." |
248 |
| - + req_msg |
249 |
| - ) |
250 |
| - |
251 |
| - # check valid dtype |
252 |
| - fail = False |
253 |
| - if issubclass(y.dtype.type, np.integer): |
254 |
| - new_y = y.astype(bool) |
255 |
| - fail = not np.array_equal(y, new_y) |
256 |
| - elif not issubclass(y.dtype.type, np.bool_): |
257 |
| - fail = True |
258 |
| - |
259 |
| - if fail: |
260 |
| - raise ValueError( |
261 |
| - "Error in input type for y: y input type must be an integer array " |
262 |
| - "containing 0 and 1 or a boolean array." + req_msg |
263 |
| - ) |
264 |
| - elif isinstance(y, pd.Series): |
265 |
| - # check series is of boolean dtype |
266 |
| - if not pd.api.types.is_bool_dtype(y): |
267 |
| - raise ValueError( |
268 |
| - "Error in input type for y: y input as pd.Series must have a " |
269 |
| - "boolean dtype." + req_msg |
270 |
| - ) |
271 |
| - |
272 |
| - new_y = y.values |
273 |
| - elif isinstance(y, pd.DataFrame): |
274 |
| - # only accept size 1 dataframe |
275 |
| - if y.shape[1] > 1: |
276 |
| - raise ValueError( |
277 |
| - "Error in input type for y: y input as pd.DataFrame should have a " |
278 |
| - "single column series." |
279 |
| - ) |
280 |
| - |
281 |
| - # check column is of boolean dtype |
282 |
| - if not all(pd.api.types.is_bool_dtype(y[col]) for col in y.columns): |
283 |
| - raise ValueError( |
284 |
| - "Error in input type for y: y input as pd.DataFrame must have a " |
285 |
| - "boolean dtype." + req_msg |
286 |
| - ) |
287 |
| - |
288 |
| - new_y = y.squeeze().values |
289 |
| - else: |
290 |
| - raise ValueError( |
291 |
| - f"Error in input type for y: it should be one of " |
292 |
| - f"{VALID_SERIES_INPUT_TYPES}, saw {type(y)}" |
293 |
| - ) |
294 |
| - |
295 |
| - new_y = new_y.astype(bool) |
296 |
| - return new_y |
| 94 | + ... |
0 commit comments