Skip to content

Feature/EnhancePreprocessing #99

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
May 2, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
ef5ad94
update pipeline test to have standard scaler transformer
AHReccese Apr 30, 2024
09b72d6
`preprocessing` Transporter
AHReccese Apr 30, 2024
d14ab4f
add StandardScaler
AHReccese Apr 30, 2024
b31eeb7
enhance import
AHReccese Apr 30, 2024
69c4d96
add preprocessing dict
AHReccese Apr 30, 2024
c418a46
add preprocessingTransporter to transporting functions(serialize, des…
AHReccese Apr 30, 2024
4134c0e
update docstring
AHReccese Apr 30, 2024
2614931
remove trailing whitespaces
AHReccese Apr 30, 2024
45f13e4
raise concrete Exception
AHReccese Apr 30, 2024
1ff77c3
`CHANGELOG.md` updated
AHReccese Apr 30, 2024
d054716
init preprocessing parameters
AHReccese Apr 30, 2024
ab694e6
remove concrete preprocessing transporters & add abstract preprocessi…
AHReccese Apr 30, 2024
633df65
Preprocessing Aggregator Transporter
AHReccese Apr 30, 2024
9091bd7
remove concrete preprocessing transporters & add abstract preprocessi…
AHReccese Apr 30, 2024
d25c45e
handle old scikit versions
AHReccese Apr 30, 2024
ea25aa4
refactor numpy import
AHReccese Apr 30, 2024
44e5954
generalize `OneHotEncoder` Transporting
AHReccese Apr 30, 2024
a1799a1
add numpy type itself transporting (for `OneHotEncoder`)
AHReccese Apr 30, 2024
c8ff89c
remove concrete preprocessing transporters
AHReccese Apr 30, 2024
ccf2d98
enhance according to codacy feedback
AHReccese Apr 30, 2024
7a8cfdf
`CHANGELOG.md` updated
AHReccese May 1, 2024
4fea959
`SUPPORTED_MODELS.md` updated
AHReccese May 2, 2024
02e6039
remove comments
AHReccese May 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## [Unreleased]
### Added
- `StandardScaler` Transformer in `pymilo_param.py`
- `PreprocessingTransporter` Transporter
- ndarray shape config in `GeneralDataStructure` Transporter
- `util.py` in chains
- `BinMapperTransporter` Transporter
- `BunchTransporter` Transporter
- `GeneratorTransporter` Transporter
- `LabelEncoderTransporter` Transporter
- `OneHotEncoderTransporter` Transporter
- `TreePredictorTransporter` Transporter
- `AdaboostClassifier` model
- `AdaboostRegressor` model
Expand All @@ -37,6 +37,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Ensemble chain
- `SECURITY.md`
### Changed
- `Pipeline` test updated
- `LabelBinarizer`, `LabelEncoder` and `OneHotEncoder` got embedded in `PreprocessingTransporter`
- Preprocessing support added to Ensemble chain
- Preprocessing params initialized in `pymilo_param`
- `util.py` in utils updated
- `test_pymilo.py` updated
- `pymilo_func.py` updated
Expand Down
5 changes: 5 additions & 0 deletions SUPPORTED_MODELS.md
Original file line number Diff line number Diff line change
Expand Up @@ -630,4 +630,9 @@
<td><b>LabelEncoder</b></td>
<td>>=0.8</td>
</tr>
<tr align="center">
<td>3</td>
<td><b>StandardScaler</b></td>
<td>>=0.8</td>
</tr>
</table>
2 changes: 2 additions & 0 deletions pymilo/chains/clustering_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ..transporters.general_data_structure_transporter import GeneralDataStructureTransporter
from ..transporters.function_transporter import FunctionTransporter
from ..transporters.cfnode_transporter import CFNodeTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..utils.util import get_sklearn_type

Expand All @@ -15,6 +16,7 @@

bisecting_kmeans_support = SKLEARN_CLUSTERING_TABLE["BisectingKMeans"] != NOT_SUPPORTED
CLUSTERING_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
"FunctionTransporter": FunctionTransporter(),
"CFNodeTransporter": CFNodeTransporter(),
Expand Down
2 changes: 2 additions & 0 deletions pymilo/chains/decision_tree_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ..transporters.general_data_structure_transporter import GeneralDataStructureTransporter
from ..transporters.tree_transporter import TreeTransporter
from ..transporters.randomstate_transporter import RandomStateTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..utils.util import get_sklearn_type

Expand All @@ -16,6 +17,7 @@


DECISION_TREE_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
"RandomStateTransporter": RandomStateTransporter(),
"TreeTransporter": TreeTransporter(),
Expand Down
28 changes: 18 additions & 10 deletions pymilo/chains/ensemble_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@
from ..transporters.general_data_structure_transporter import GeneralDataStructureTransporter
from ..transporters.randomstate_transporter import RandomStateTransporter
from ..transporters.lossfunction_transporter import LossFunctionTransporter
from ..transporters.onehotencoder_transporter import OneHotEncoderTransporter
from ..transporters.bunch_transporter import BunchTransporter
from ..transporters.labelencoder_transporter import LabelEncoderTransporter
from ..transporters.generator_transporter import GeneratorTransporter
from ..transporters.treepredictor_transporter import TreePredictorTransporter
from ..transporters.binmapper_transporter import BinMapperTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..pymilo_param import SKLEARN_ENSEMBLE_TABLE

Expand All @@ -27,14 +26,15 @@
import copy

ENSEMBLE_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
"TreePredictorTransporter": TreePredictorTransporter(),
"BinMapperTransporter": BinMapperTransporter(),
"GeneratorTransporter": GeneratorTransporter(),
"RandomStateTransporter": RandomStateTransporter(),
"LossFunctionTransporter": LossFunctionTransporter(),
"OneHotEncoderTransporter": OneHotEncoderTransporter(),
"LabelEncoderTransporter": LabelEncoderTransporter(),
# "OneHotEncoderTransporter": OneHotEncoderTransporter(),
# "LabelEncoderTransporter": LabelEncoderTransporter(),
"BunchTransporter": BunchTransporter(),
}

Expand Down Expand Up @@ -166,14 +166,18 @@ def serialize_ensemble(ensemble_object):
for key, value in ensemble_object.__dict__.items():
if isinstance(value, list):
has_inner_tuple_with_ml_model = False
pt = PreprocessingTransporter()
for idx, item in enumerate(value):
if isinstance(item, tuple):
listed_tuple = list(item)
for inner_idx, inner_item in enumerate(listed_tuple):
has_inner_model, result = serialize_possible_ml_model(inner_item)
if has_inner_model:
has_inner_tuple_with_ml_model = True
listed_tuple[inner_idx] = result
if pt.is_preprocessing_module(inner_item):
listed_tuple[inner_idx] = pt.serialize_pre_module(inner_item)
else:
has_inner_model, result = serialize_possible_ml_model(inner_item)
if has_inner_model:
has_inner_tuple_with_ml_model = True
listed_tuple[inner_idx] = result
value[idx] = listed_tuple
else:
value[idx] = serialize_possible_ml_model(item)[1]
Expand Down Expand Up @@ -325,12 +329,16 @@ def deserialize_ensemble(ensemble, is_inner_model=False):
value) and value["pymiloed-data-structure"] == "list of (str, estimator) tuples":
listed_tuples = value["pymiloed-data"]
list_of_tuples = []
pt = PreprocessingTransporter()
for listed_tuple in listed_tuples:
name, serialized_ml_model = listed_tuple
name, serialized_model = listed_tuple
retrieved_model = pt.deserialize_pre_module(serialized_model) if pt.is_preprocessing_module(
serialized_model) else deserialize_possible_ml_model(serialized_model)[1]
list_of_tuples.append(
(name, deserialize_possible_ml_model(serialized_ml_model)[1])
(name, retrieved_model)
)
data[key] = list_of_tuples

elif GeneralDataStructureTransporter().is_deserialized_ndarray(value):
has_inner_model, result = deserialize_models_in_ndarray(value)
if has_inner_model:
Expand Down
9 changes: 5 additions & 4 deletions pymilo/chains/linear_model_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from ..transporters.general_data_structure_transporter import GeneralDataStructureTransporter
from ..transporters.baseloss_transporter import BaseLossTransporter
from ..transporters.lossfunction_transporter import LossFunctionTransporter
from ..transporters.labelbinarizer_transporter import LabelBinarizerTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..pymilo_param import SKLEARN_LINEAR_MODEL_TABLE
from ..utils.util import get_sklearn_type, is_iterable
Expand All @@ -16,10 +16,11 @@


LINEAR_MODEL_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
"BaseLossTransporter": BaseLossTransporter(),
"LossFunctionTransporter": LossFunctionTransporter(),
"LabelBinarizerTransporter": LabelBinarizerTransporter()}
}


def is_linear_model(model):
Expand Down Expand Up @@ -101,9 +102,9 @@ def serialize_linear_model(linear_model_object):
for key in linear_model_object.__dict__:
if is_linear_model(linear_model_object.__dict__[key]):
linear_model_object.__dict__[key] = {
"pymilo-inner-model-data": transport_linear_model(linear_model_object.__dict__[key], Command.SERIALIZE),
"pymilo-inner-model-data": transport_linear_model(linear_model_object.__dict__[key], Command.SERIALIZE, True),
"pymilo-inner-model-type": get_sklearn_type(linear_model_object.__dict__[key]),
"by-pass": True
"pymilo-by-pass": True
}
# now serializing non-linear model fields
for transporter in LINEAR_MODEL_CHAIN:
Expand Down
2 changes: 2 additions & 0 deletions pymilo/chains/naive_bayes_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from ..transporters.transporter import Command

from ..transporters.general_data_structure_transporter import GeneralDataStructureTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..pymilo_param import SKLEARN_NAIVE_BAYES_TABLE
from ..exceptions.serialize_exception import PymiloSerializationException, SerilaizatoinErrorTypes
Expand All @@ -13,6 +14,7 @@
from traceback import format_exc

NAIVE_BAYES_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
}

Expand Down
2 changes: 2 additions & 0 deletions pymilo/chains/neighbours_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from ..transporters.general_data_structure_transporter import GeneralDataStructureTransporter
from ..transporters.neighbors_tree_transporter import NeighborsTreeTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..pymilo_param import SKLEARN_NEIGHBORS_TABLE
from ..exceptions.serialize_exception import PymiloSerializationException, SerilaizatoinErrorTypes
Expand All @@ -14,6 +15,7 @@
from traceback import format_exc

NEIGHBORS_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
"NeighborsTreeTransporter": NeighborsTreeTransporter(),
}
Expand Down
4 changes: 2 additions & 2 deletions pymilo/chains/neural_network_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from ..transporters.randomstate_transporter import RandomStateTransporter
from ..transporters.sgdoptimizer_transporter import SGDOptimizerTransporter
from ..transporters.adamoptimizer_transporter import AdamOptimizerTransporter
from ..transporters.labelbinarizer_transporter import LabelBinarizerTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..pymilo_param import SKLEARN_NEURAL_NETWORK_TABLE

Expand All @@ -19,11 +19,11 @@


NEURAL_NETWORK_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
"RandomStateTransporter": RandomStateTransporter(),
"SGDOptimizer": SGDOptimizerTransporter(),
"AdamOptimizerTransporter": AdamOptimizerTransporter(),
"LabelBinarizerTransporter": LabelBinarizerTransporter(),
}


Expand Down
2 changes: 2 additions & 0 deletions pymilo/chains/svm_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from ..transporters.general_data_structure_transporter import GeneralDataStructureTransporter
from ..transporters.randomstate_transporter import RandomStateTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..pymilo_param import SKLEARN_SVM_TABLE
from ..exceptions.serialize_exception import PymiloSerializationException, SerilaizatoinErrorTypes
Expand All @@ -14,6 +15,7 @@
from traceback import format_exc

SVM_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
"RandomStateTransporter": RandomStateTransporter(),
}
Expand Down
40 changes: 20 additions & 20 deletions pymilo/pymilo_param.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,6 @@
# -*- coding: utf-8 -*-
"""Parameters and constants."""
from numpy import uint8
from numpy import intc
from numpy import inf
from numpy import float64
from numpy import int32
from numpy import int64
from numpy import uint64
from sklearn.preprocessing import LabelBinarizer

import numpy as np
import sklearn.linear_model as linear_model
import sklearn.neural_network as neural_network
import sklearn.tree as tree
Expand All @@ -20,7 +12,7 @@
import sklearn.dummy as dummy
import sklearn.ensemble as ensemble
import sklearn.pipeline as pipeline

import sklearn.preprocessing as preprocessing

quantile_regressor_support = False
try:
Expand Down Expand Up @@ -205,24 +197,32 @@
"Pipeline": pipeline.Pipeline,
}

# Name -> class lookup for scikit-learn preprocessing modules: each key is the
# class name as a string and each value is the matching class object taken from
# `sklearn.preprocessing` (imported in this module as `preprocessing`).
SKLEARN_PREPROCESSING_TABLE = {
"StandardScaler": preprocessing.StandardScaler,
"OneHotEncoder": preprocessing.OneHotEncoder,
"LabelBinarizer": preprocessing.LabelBinarizer,
"LabelEncoder": preprocessing.LabelEncoder,
}

KEYS_NEED_PREPROCESSING_BEFORE_DESERIALIZATION = {
"_label_binarizer": LabelBinarizer, # in Ridge Classifier
"active_": int32, # in Lasso Lars
"n_nonzero_coefs_": int64, # in OMP-CV
"_label_binarizer": preprocessing.LabelBinarizer, # in Ridge Classifier
"active_": np.int32, # in Lasso Lars
"n_nonzero_coefs_": np.int64, # in OMP-CV
"scores_": dict, # in Logistic Regression CV,
"_base_loss": {}, # BaseLoss in Logistic Regression,
"loss_function_": {}, # LossFunction in SGD Classifier,
"estimator_": {}, # LinearRegression model inside RANSAC
}

NUMPY_TYPE_DICT = {
"numpy.intc": intc,
"numpy.int32": int32,
"numpy.int64": int64,
"numpy.float64": float64,
"numpy.infinity": lambda _: inf,
"numpy.uint8": uint8,
"numpy.uint64": uint64,
"numpy.intc": np.intc,
"numpy.int32": np.int32,
"numpy.int64": np.int64,
"numpy.float64": np.float64,
"numpy.infinity": lambda _: np.inf,
"numpy.uint8": np.uint8,
"numpy.uint64": np.uint64,
"numpy.dtype": np.dtype,
}

EXPORTED_MODELS_PATH = {
Expand Down
17 changes: 13 additions & 4 deletions pymilo/transporters/general_data_structure_transporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,16 @@ def serialize(self, data, key, model_type):
:type model_type: str
:return: pymilo serialized output of data[key]
"""
if isinstance(data[key], type):
raw_type = str(data[key])
raw_type = "numpy" + str(raw_type).split("numpy")[-1][:-2]
if raw_type in NUMPY_TYPE_DICT.keys():
data[key] = {
"np-type": "numpy.dtype",
"value": raw_type
}
# 1. Handling numpy infinity, ransac
if isinstance(data[key], np.float64):
elif isinstance(data[key], np.float64):
if np.inf == data[key]:
data[key] = {
"np-type": "numpy.infinity",
Expand Down Expand Up @@ -209,7 +217,7 @@ def get_deserialized_dict(self, content):
return self.deep_deserialize_ndarray(content)

if check_str_in_iterable("np-type", content) and check_str_in_iterable("value", content):
return NUMPY_TYPE_DICT[content["np-type"]](content["value"])
return self.get_deserialized_regular_primary_types(content)

for key in content:

Expand Down Expand Up @@ -271,6 +279,8 @@ def get_deserialized_regular_primary_types(self, content):
:return: the associated np.int32|np.int64|np.inf
"""
if "np-type" in content:
if content["np-type"] == "numpy.dtype":
return NUMPY_TYPE_DICT[content["np-type"]](NUMPY_TYPE_DICT[content['value']])
return NUMPY_TYPE_DICT[content["np-type"]](content['value'])

def is_numpy_primary_type(self, content):
Expand Down Expand Up @@ -359,8 +369,7 @@ def deserialize_primitive_type(self, primitive):
if is_primitive(primitive):
return primitive
elif check_str_in_iterable("np-type", primitive):
return NUMPY_TYPE_DICT[primitive["np-type"]
](primitive['value'])
return self.get_deserialized_regular_primary_types(primitive)
else:
return primitive

Expand Down
Loading