Skip to content

Commit 735ca67

Browse files
Changed target name/series ID divider and added ability to return series ID column with predictions (#4357)
* Try extra debug
* Updated release notes
* Join with separator symbol
* Added infer to the end
* Updated test to infer type
* Add series ID indexing test

Co-authored-by: machineFL <49695056+machineFL@users.noreply.github.com>
1 parent 8ffa04f commit 735ca67

File tree

9 files changed

+157
-18
lines changed

9 files changed

+157
-18
lines changed

docs/source/release_notes.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@ Release Notes
22
-------------
33
**Future Releases**
44
* Enhancements
5+
* Changed target name/series ID divider and added ability to return series ID column with predictions :pr:`4357`
56
* Fixes
67
* Changes
78
* Pinned networkx version below 3.2 for Python version compatibility :pr:`4351`

evalml/pipelines/multiseries_regression_pipeline.py

Lines changed: 36 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
TimeSeriesRegressionPipeline,
77
)
88
from evalml.problem_types import ProblemTypes
9+
from evalml.utils import infer_feature_types
910

1011

1112
class MultiseriesRegressionPipeline(TimeSeriesRegressionPipeline):
@@ -91,6 +92,7 @@ def predict_in_sample(
9192
y_train,
9293
objective=None,
9394
calculating_residuals=False,
95+
include_series_id=False,
9496
):
9597
"""Predict on future data where the target is known, e.g. cross validation.
9698
@@ -102,6 +104,7 @@ def predict_in_sample(
102104
objective (ObjectiveBase, str, None): Objective used to threshold predicted probabilities, optional.
103105
calculating_residuals (bool): Whether we're calling predict_in_sample to calculate the residuals. This means
104106
the X and y arguments are not future data, but actually the train data.
107+
include_series_id (bool): If true, include the series ID value in the prediction results
105108
106109
Returns:
107110
pd.Series: Estimated labels.
@@ -125,6 +128,33 @@ def predict_in_sample(
125128
self.time_index,
126129
self.input_target_name,
127130
)
131+
132+
# Order series columns to be same as expected input feature names
133+
# and filter to only include features in `X_unstacked`.
134+
input_features = list(self.input_feature_names.values())[0]
135+
X_unstacked = X_unstacked[
136+
[feature for feature in input_features if feature in X_unstacked.columns]
137+
]
138+
X_train_unstacked = X_train_unstacked[
139+
[
140+
feature
141+
for feature in input_features
142+
if feature in X_train_unstacked.columns
143+
]
144+
]
145+
y_overlapping_features = [
146+
feature
147+
for feature in y_train_unstacked.columns
148+
if feature in y_unstacked.columns
149+
]
150+
y_unstacked = y_unstacked[y_overlapping_features]
151+
y_train_unstacked = y_train_unstacked[y_overlapping_features]
152+
153+
X_train_unstacked = infer_feature_types(X_train_unstacked)
154+
y_train_unstacked = infer_feature_types(y_train_unstacked)
155+
X_unstacked = infer_feature_types(X_unstacked)
156+
y_unstacked = infer_feature_types(y_unstacked)
157+
128158
unstacked_predictions = super().predict_in_sample(
129159
X_unstacked,
130160
y_unstacked,
@@ -133,10 +163,15 @@ def predict_in_sample(
133163
objective,
134164
calculating_residuals,
135165
)
136-
stacked_predictions = stack_data(unstacked_predictions)
166+
stacked_predictions = stack_data(
167+
unstacked_predictions,
168+
include_series_id=include_series_id,
169+
series_id_name=self.series_id,
170+
)
137171

138172
# Index will start at the unstacked index, so we need to reset it to the original index
139173
stacked_predictions.index = X.index
174+
stacked_predictions = infer_feature_types(stacked_predictions)
140175
return stacked_predictions
141176

142177
def get_forecast_period(self, X):

evalml/pipelines/time_series_regression_pipeline.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -225,7 +225,11 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
225225
return return_intervals
226226

227227
if self.problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION:
228-
from evalml.pipelines.utils import stack_data, unstack_multiseries
228+
from evalml.pipelines.utils import (
229+
MULTISERIES_SEPARATOR_SYMBOL,
230+
stack_data,
231+
unstack_multiseries,
232+
)
229233

230234
X, y = unstack_multiseries(
231235
X,
@@ -268,7 +272,9 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
268272
# `pred_intervals` are in {series_id: {coverage_label: bound_value}} form
269273
for series_id, series_intervals in pred_intervals.items():
270274
series_id_target_name = (
271-
self.input_target_name + "_" + str(series_id)
275+
self.input_target_name
276+
+ MULTISERIES_SEPARATOR_SYMBOL
277+
+ str(series_id)
272278
)
273279
series_id_prediction_intervals = _get_series_intervals(
274280
series_intervals,

evalml/pipelines/utils.py

Lines changed: 17 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,7 @@
7676
from evalml.utils.gen_utils import contains_all_ts_parameters
7777

7878
DECOMPOSER_PERIOD_CAP = 1000
79+
MULTISERIES_SEPARATOR_SYMBOL = "|"
7980

8081

8182
def _get_label_encoder(X, y, problem_type, estimator_class, sampler_name=None):
@@ -1418,7 +1419,7 @@ def unstack_multiseries(
14181419
for column_name in full_dataset.columns.drop([time_index, series_id]):
14191420
new_column = single_series[column_name]
14201421
new_column.index = new_time_index
1421-
new_column.name = f"{column_name}_{s_id}"
1422+
new_column.name = f"{column_name}{MULTISERIES_SEPARATOR_SYMBOL}{s_id}"
14221423

14231424
if column_name == target_name:
14241425
y_unstacked_cols.append(new_column)
@@ -1435,11 +1436,15 @@ def unstack_multiseries(
14351436
# Reset the axes now that they've been unstacked, keep time info in X
14361437
X_unstacked = X_unstacked.reset_index()
14371438
y_unstacked = y_unstacked.reset_index(drop=True)
1438-
14391439
return X_unstacked, y_unstacked
14401440

14411441

1442-
def stack_data(data, include_series_id=False, series_id_name=None, starting_index=None):
1442+
def stack_data(
1443+
data,
1444+
include_series_id=False,
1445+
series_id_name=None,
1446+
starting_index=None,
1447+
):
14431448
"""Stacks the given DataFrame back into a single Series, or a DataFrame if include_series_id is True.
14441449
14451450
Should only be used for data that is expected to be a single series. To stack multiple unstacked columns,
@@ -1464,7 +1469,9 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde
14641469

14651470
# Extract the original column name
14661471
series_id_with_name = stacked_series.index.droplevel()
1467-
stacked_series.name = "_".join(series_id_with_name[0].split("_")[:-1])
1472+
stacked_series.name = MULTISERIES_SEPARATOR_SYMBOL.join(
1473+
series_id_with_name[0].split(MULTISERIES_SEPARATOR_SYMBOL)[:-1],
1474+
)
14681475

14691476
# If the index is the time index, keep it
14701477
if not data.index.is_numeric() and starting_index is None:
@@ -1481,11 +1488,14 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde
14811488
# Pull out the series id information, if requested
14821489
if include_series_id:
14831490
series_id_col = pd.Series(
1484-
series_id_with_name.map(lambda col_name: col_name.split("_")[-1]),
1491+
series_id_with_name.map(
1492+
lambda col_name: col_name.split(MULTISERIES_SEPARATOR_SYMBOL)[-1],
1493+
),
14851494
name=series_id_name or "series_id",
14861495
index=stacked_series.index,
14871496
)
14881497
stacked_series = pd.concat([series_id_col, stacked_series], axis=1)
1498+
14891499
return stacked_series
14901500

14911501

@@ -1511,8 +1521,8 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values
15111521
for col in X.columns:
15121522
if col == time_index:
15131523
continue
1514-
separated_name = col.split("_")
1515-
original_columns.add("_".join(separated_name[:-1]))
1524+
separated_name = col.split(MULTISERIES_SEPARATOR_SYMBOL)
1525+
original_columns.add(MULTISERIES_SEPARATOR_SYMBOL.join(separated_name[:-1]))
15161526
series_ids.add(separated_name[-1])
15171527

15181528
if len(series_ids) == 0:

evalml/tests/component_tests/test_time_series_featurizer.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
)
1616

1717
from evalml.pipelines import TimeSeriesFeaturizer
18+
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL
1819

1920
ROLLING_TRANSFORM_METHOD_NAME = "_compute_rolling_transforms"
2021
DELAYED_FEATURES_METHOD_NAME = "_compute_delays"
@@ -991,7 +992,9 @@ def test_featurizer_y_dataframe(multiseries_ts_data_unstacked):
991992

992993
assert featurizer.statistically_significant_lags == [6]
993994

994-
expected_y_cols = [f"target_{i}_delay_6" for i in range(y.shape[1])]
995+
expected_y_cols = [
996+
f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}_delay_6" for i in range(y.shape[1])
997+
]
995998
X_t = featurizer.transform(X, y)
996999
for expected_y_col in expected_y_cols:
9971000
assert expected_y_col in X_t.columns

evalml/tests/component_tests/test_time_series_imputer.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
)
1212

1313
from evalml.pipelines.components import TimeSeriesImputer
14+
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL
1415

1516

1617
def test_invalid_strategy_parameters():
@@ -745,7 +746,12 @@ def test_time_series_imputer_multiseries(
745746
_, y_imputed = imputer.transform(X, y)
746747
assert isinstance(y_imputed, pd.DataFrame)
747748

748-
y_expected = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)})
749+
y_expected = pd.DataFrame(
750+
{
751+
f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
752+
for i in range(5)
753+
},
754+
)
749755
assert_frame_equal(y_imputed, y_expected, check_dtype=False)
750756

751757

@@ -777,7 +783,10 @@ def test_time_series_imputer_multiseries_some_columns_all_nan(
777783
_, y_imputed = imputer.transform(X, y)
778784

779785
y_expected = pd.DataFrame(
780-
{f"target_{i}": range(i, 100, 5) for i in range(num_nan_cols, 5)},
786+
{
787+
f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
788+
for i in range(num_nan_cols, 5)
789+
},
781790
)
782791
assert_frame_equal(y_imputed, y_expected, check_dtype=False)
783792

evalml/tests/conftest.py

Lines changed: 18 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1094,12 +1094,27 @@ def multiseries_ts_data_stacked():
10941094

10951095
@pytest.fixture
10961096
def multiseries_ts_data_unstacked():
1097-
feature_a = pd.DataFrame({f"feature_a_{i}": range(i, 100, 5) for i in range(5)})
1097+
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL
1098+
1099+
feature_a = pd.DataFrame(
1100+
{
1101+
f"feature_a{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
1102+
for i in range(5)
1103+
},
1104+
)
10981105
feature_b = pd.DataFrame(
1099-
{f"feature_b_{i}": range(99 - i, -1, -5) for i in range(5)},
1106+
{
1107+
f"feature_b{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(99 - i, -1, -5)
1108+
for i in range(5)
1109+
},
11001110
)
11011111
X = pd.concat([feature_a, feature_b], axis=1)
1102-
y = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)})
1112+
y = pd.DataFrame(
1113+
{
1114+
f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
1115+
for i in range(5)
1116+
},
1117+
)
11031118

11041119
X["date"] = pd.date_range(start="1/1/2018", periods=20)
11051120
return X, y

evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py

Lines changed: 59 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
from evalml.pipelines import MultiseriesRegressionPipeline
99
from evalml.pipelines.utils import unstack_multiseries
1010
from evalml.preprocessing import split_multiseries_data
11+
from evalml.utils import infer_feature_types
1112

1213

1314
@pytest.fixture(scope="module")
@@ -90,7 +91,9 @@ def test_multiseries_pipeline_fit(
9091
assert pipeline.frequency is not None
9192

9293

94+
@pytest.mark.parametrize("include_series_id", [True, False])
9395
def test_multiseries_pipeline_predict_in_sample(
96+
include_series_id,
9497
multiseries_ts_data_stacked,
9598
component_graph,
9699
pipeline_parameters,
@@ -111,14 +114,69 @@ def test_multiseries_pipeline_predict_in_sample(
111114
y_holdout,
112115
X_train=X_train,
113116
y_train=y_train,
117+
include_series_id=include_series_id,
114118
)
115119
expected = pd.Series(
116120
range(55, 65),
117121
index=range(90, 100),
118122
name="target",
119123
dtype="float64",
120124
)
121-
pd.testing.assert_series_equal(y_pred, expected)
125+
if include_series_id:
126+
expected = pd.concat([X_holdout["series_id"], expected], axis=1)
127+
expected = infer_feature_types(expected)
128+
pd.testing.assert_frame_equal(y_pred, expected)
129+
else:
130+
pd.testing.assert_series_equal(y_pred, expected)
131+
132+
133+
@pytest.mark.parametrize("include_series_id", [True, False])
134+
def test_multiseries_pipeline_predict_in_sample_series_out_of_order(
135+
include_series_id,
136+
multiseries_ts_data_stacked,
137+
component_graph,
138+
pipeline_parameters,
139+
):
140+
X, y = multiseries_ts_data_stacked
141+
X_train, X_holdout, y_train, y_holdout = split_multiseries_data(
142+
X,
143+
y,
144+
"series_id",
145+
"date",
146+
)
147+
148+
# Reorder rows but keep ordered by date
149+
# Store ordered series ID values to compare to output later
150+
X_holdout_series_id = X_holdout["series_id"]
151+
X_index = X_holdout.index
152+
X_holdout = X_holdout.sample(frac=1).sort_values(by="date")
153+
y_holdout = y_holdout.reindex(X_holdout.index)
154+
155+
X_holdout.index = X_index
156+
y_holdout.index = X_index
157+
158+
pipeline = MultiseriesRegressionPipeline(component_graph, pipeline_parameters)
159+
pipeline.fit(X_train, y_train)
160+
161+
y_pred = pipeline.predict_in_sample(
162+
X_holdout,
163+
y_holdout,
164+
X_train=X_train,
165+
y_train=y_train,
166+
include_series_id=include_series_id,
167+
)
168+
expected = pd.Series(
169+
range(55, 65),
170+
index=range(90, 100),
171+
name="target",
172+
dtype="float64",
173+
)
174+
if include_series_id:
175+
expected = pd.concat([X_holdout_series_id, expected], axis=1)
176+
expected = infer_feature_types(expected)
177+
pd.testing.assert_frame_equal(y_pred, expected)
178+
else:
179+
pd.testing.assert_series_equal(y_pred, expected)
122180

123181

124182
@pytest.mark.parametrize("forecast_horizon", [1, 7])

evalml/tests/pipeline_tests/test_pipeline_utils.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@
4343
handle_component_class,
4444
)
4545
from evalml.pipelines.utils import (
46+
MULTISERIES_SEPARATOR_SYMBOL,
4647
_get_pipeline_base_class,
4748
_get_preprocessing_components,
4849
_make_pipeline_from_multiple_graphs,
@@ -1404,7 +1405,8 @@ def test_unstack_multiseries(
14041405
X_unstacked, y_unstacked = multiseries_ts_data_unstacked
14051406
y.name = target_name
14061407
y_unstacked.columns = [
1407-
f"{target_name}_{i}" for i in range(len(y_unstacked.columns))
1408+
f"{target_name}{MULTISERIES_SEPARATOR_SYMBOL}{i}"
1409+
for i in range(len(y_unstacked.columns))
14081410
]
14091411

14101412
X_unstacked_transformed, y_unstacked_transformed = unstack_multiseries(

0 commit comments

Comments (0)