2 changes: 1 addition & 1 deletion data/augmentations/__init__.py
@@ -10,4 +10,4 @@
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
74 changes: 49 additions & 25 deletions data/data_utils.py
@@ -18,6 +18,7 @@
import json
import os
from pathlib import Path

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)
from pathlib import Path
@@ -30,14 +31,20 @@
from gluonts.transform import InstanceSampler
from pandas.tseries.frequencies import to_offset

from data.read_new_dataset import get_ett_dataset, create_train_dataset_without_last_k_timesteps, TrainDatasets, MetaData
from data.read_new_dataset import (
get_ett_dataset,
create_train_dataset_without_last_k_timesteps,
TrainDatasets,
MetaData,
)


class CombinedDatasetIterator:
def __init__(self, datasets, seed, weights):
self._datasets = [iter(el) for el in datasets]
self._weights = weights
self._rng = random.Random(seed)

def __next__(self):
(dataset,) = self._rng.choices(self._datasets, weights=self._weights, k=1)
return next(dataset)
@@ -105,15 +112,13 @@ def _count_timesteps(
f"Too large difference between both timestamps ({left} and {right}) for _count_timesteps()."
)


from pathlib import Path
from gluonts.dataset.common import ListDataset
from gluonts.dataset.repository.datasets import get_dataset

def create_train_dataset_last_k_percentage(
raw_train_dataset,
freq,
k=100
):

def create_train_dataset_last_k_percentage(raw_train_dataset, freq, k=100):
# Get training data
train_data = []
for i, series in enumerate(raw_train_dataset):
@@ -127,6 +132,7 @@ def create_train_dataset_last_k_percentage(

return train_data


def create_train_and_val_datasets_with_dates(
name,
dataset_path,
@@ -137,7 +143,7 @@ def create_train_and_val_datasets_with_dates(
val_start_date=None,
train_start_date=None,
freq=None,
last_k_percentage=None
last_k_percentage=None,
):
"""
Train Start date is assumed to be the start of the series if not provided
@@ -148,12 +154,19 @@
if name in ("ett_h1", "ett_h2", "ett_m1", "ett_m2"):
path = os.path.join(dataset_path, "ett_datasets")
raw_dataset = get_ett_dataset(name, path)
elif name in ("cpu_limit_minute", "cpu_usage_minute", \
"function_delay_minute", "instances_minute", \
"memory_limit_minute", "memory_usage_minute", \
"platform_delay_minute", "requests_minute"):
elif name in (
"cpu_limit_minute",
"cpu_usage_minute",
"function_delay_minute",
"instances_minute",
"memory_limit_minute",
"memory_usage_minute",
"platform_delay_minute",
"requests_minute",
):
path = os.path.join(dataset_path, "huawei/" + name + ".json")
with open(path, "r") as f: data = json.load(f)
with open(path, "r") as f:
data = json.load(f)
metadata = MetaData(**data["metadata"])
train_data = [x for x in data["train"] if type(x["target"][0]) != str]
test_data = [x for x in data["test"] if type(x["target"][0]) != str]
@@ -167,8 +180,12 @@ def create_train_and_val_datasets_with_dates(
metadata = MetaData(**data["metadata"])
train_test_data = [x for x in data["data"] if type(x["target"][0]) != str]
full_dataset = ListDataset(train_test_data, freq=metadata.freq)
train_ds = create_train_dataset_without_last_k_timesteps(full_dataset, freq=metadata.freq, k=24)
raw_dataset = TrainDatasets(metadata=metadata, train=train_ds, test=full_dataset)
train_ds = create_train_dataset_without_last_k_timesteps(
full_dataset, freq=metadata.freq, k=24
)
raw_dataset = TrainDatasets(
metadata=metadata, train=train_ds, test=full_dataset
)
else:
raw_dataset = get_dataset(name, path=Path(dataset_path))

@@ -257,9 +274,7 @@ def create_train_and_val_datasets_with_dates(
)


def create_test_dataset(
name, dataset_path, history_length, freq=None, data_id=None
):
def create_test_dataset(name, dataset_path, history_length, freq=None, data_id=None):
"""
For now, only window per series is used.
make_evaluation_predictions automatically only predicts for the last "prediction_length" timesteps
@@ -270,12 +285,19 @@ def create_test_dataset(
if name in ("ett_h1", "ett_h2", "ett_m1", "ett_m2"):
path = os.path.join(dataset_path, "ett_datasets")
dataset = get_ett_dataset(name, path)
elif name in ("cpu_limit_minute", "cpu_usage_minute", \
"function_delay_minute", "instances_minute", \
"memory_limit_minute", "memory_usage_minute", \
"platform_delay_minute", "requests_minute"):
elif name in (
"cpu_limit_minute",
"cpu_usage_minute",
"function_delay_minute",
"instances_minute",
"memory_limit_minute",
"memory_usage_minute",
"platform_delay_minute",
"requests_minute",
):
path = os.path.join(dataset_path, "huawei/" + name + ".json")
with open(path, "r") as f: data = json.load(f)
with open(path, "r") as f:
data = json.load(f)
metadata = MetaData(**data["metadata"])
train_data = [x for x in data["train"] if type(x["target"][0]) != str]
test_data = [x for x in data["test"] if type(x["target"][0]) != str]
@@ -289,7 +311,9 @@ def create_test_dataset(
metadata = MetaData(**data["metadata"])
train_test_data = [x for x in data["data"] if type(x["target"][0]) != str]
full_dataset = ListDataset(train_test_data, freq=metadata.freq)
train_ds = create_train_dataset_without_last_k_timesteps(full_dataset, freq=metadata.freq, k=24)
train_ds = create_train_dataset_without_last_k_timesteps(
full_dataset, freq=metadata.freq, k=24
)
dataset = TrainDatasets(metadata=metadata, train=train_ds, test=full_dataset)
else:
dataset = get_dataset(name, path=Path(dataset_path))
@@ -317,4 +341,4 @@ def create_test_dataset(
series_copy["data_id"] = data_id
data.append(series_copy)
total_points += len(data[-1]["target"])
return ListDataset(data, freq=freq), prediction_length, total_points
return ListDataset(data, freq=freq), prediction_length, total_points
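
For reference, a minimal usage sketch of the reformatted create_test_dataset above; the dataset name, local path, and history length are illustrative assumptions rather than values taken from this PR:

from data.data_utils import create_test_dataset

# Returns a GluonTS ListDataset for evaluation, plus the dataset's
# prediction length and the total number of target points.
test_ds, prediction_length, total_points = create_test_dataset(
    name="ett_h1",                 # assumed dataset name
    dataset_path="data/datasets",  # assumed local data root
    history_length=512,            # assumed context length
)
print(prediction_length, total_points)
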
30 changes: 29 additions & 1 deletion data/dataset_list.py
@@ -12,4 +12,32 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ALL_DATASETS = ["australian_electricity_demand", "electricity_hourly", "london_smart_meters_without_missing", "solar_10_minutes", "wind_farms_without_missing", "pedestrian_counts", "uber_tlc_hourly", "traffic", "kdd_cup_2018_without_missing", "saugeenday", "sunspot_without_missing", "exchange_rate", "cpu_limit_minute", "cpu_usage_minute", "function_delay_minute", "instances_minute", "memory_limit_minute", "memory_usage_minute", "platform_delay_minute", "requests_minute", "ett_h1", "ett_h2", "ett_m1", "ett_m2", "beijing_pm25", "AirQualityUCI", "beijing_multisite"]
ALL_DATASETS = [
"australian_electricity_demand",
"electricity_hourly",
"london_smart_meters_without_missing",
"solar_10_minutes",
"wind_farms_without_missing",
"pedestrian_counts",
"uber_tlc_hourly",
"traffic",
"kdd_cup_2018_without_missing",
"saugeenday",
"sunspot_without_missing",
"exchange_rate",
"cpu_limit_minute",
"cpu_usage_minute",
"function_delay_minute",
"instances_minute",
"memory_limit_minute",
"memory_usage_minute",
"platform_delay_minute",
"requests_minute",
"ett_h1",
"ett_h2",
"ett_m1",
"ett_m2",
"beijing_pm25",
"AirQualityUCI",
"beijing_multisite",
]
52 changes: 32 additions & 20 deletions data/read_new_dataset.py
@@ -13,6 +13,7 @@
# limitations under the License.

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

@@ -22,52 +23,61 @@
from gluonts.dataset.repository.datasets import get_dataset
import os

def create_train_dataset_without_last_k_timesteps(
raw_train_dataset,
freq,
k=0
):

def create_train_dataset_without_last_k_timesteps(raw_train_dataset, freq, k=0):
train_data = []
for i, series in enumerate(raw_train_dataset):
s_train = series.copy()
s_train["target"] = s_train["target"][:len(s_train["target"])-k]
s_train["target"] = s_train["target"][: len(s_train["target"]) - k]
train_data.append(s_train)
train_data = ListDataset(train_data, freq=freq)
return train_data


def load_jsonl_gzip_file(file_path):
with gzip.open(file_path, 'rt') as f:
with gzip.open(file_path, "rt") as f:
return [json.loads(line) for line in f]


def get_ett_dataset(dataset_name, path):
dataset_path = Path(path) / dataset_name
metadata_path = dataset_path / 'metadata.json'
with open(metadata_path, 'r') as f:
metadata_path = dataset_path / "metadata.json"
with open(metadata_path, "r") as f:
metadata_dict = json.load(f)
metadata = MetaData(**metadata_dict)
# Load train and test datasets
train_data_path = dataset_path / 'train' / 'data.json.gz'
test_data_path = dataset_path / 'test' / 'data.json.gz'
train_data_path = dataset_path / "train" / "data.json.gz"
test_data_path = dataset_path / "test" / "data.json.gz"
# test dataset
test_data = load_jsonl_gzip_file(test_data_path)
# Create GluonTS ListDatasets
test_ds = ListDataset(test_data, freq=metadata.freq)
train_ds = create_train_dataset_without_last_k_timesteps(test_ds, freq=metadata.freq, k=24)
train_ds = create_train_dataset_without_last_k_timesteps(
test_ds, freq=metadata.freq, k=24
)
return TrainDatasets(metadata=metadata, train=train_ds, test=test_ds)


if __name__ == "__main__":
dataset_name = "ett_h1"

if dataset_name in ("ett_h1", "ett_h2", "ett_m1", "ett_m2"):
path = "data/datasets/ett_datasets"
ds = get_ett_dataset(dataset_name, path)

if dataset_name in ("cpu_limit_minute", "cpu_usage_minute", \
"function_delay_minute", "instances_minute", \
"memory_limit_minute", "memory_usage_minute", \
"platform_delay_minute", "requests_minute"):

if dataset_name in (
"cpu_limit_minute",
"cpu_usage_minute",
"function_delay_minute",
"instances_minute",
"memory_limit_minute",
"memory_usage_minute",
"platform_delay_minute",
"requests_minute",
):
path = "data/datasets/huawei/" + dataset_name + ".json"
with open(path, "r") as f: data = json.load(f)
with open(path, "r") as f:
data = json.load(f)
metadata = MetaData(**data["metadata"])
train_data = [x for x in data["train"] if type(x["target"][0]) != str]
test_data = [x for x in data["test"] if type(x["target"][0]) != str]
@@ -82,5 +92,7 @@ def get_ett_dataset(dataset_name, path):
metadata = MetaData(**data["metadata"])
train_test_data = [x for x in data["data"] if type(x["target"][0]) != str]
full_dataset = ListDataset(train_test_data, freq=metadata.freq)
train_ds = create_train_dataset_without_last_k_timesteps(test_ds, freq=metadata.freq, k=24)
ds = TrainDatasets(metadata=metadata, train=train_ds, test=full_dataset)
train_ds = create_train_dataset_without_last_k_timesteps(
test_ds, freq=metadata.freq, k=24
)
ds = TrainDatasets(metadata=metadata, train=train_ds, test=full_dataset)
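
A similarly hedged sketch of the ETT loading path defined above; the dataset name and directory mirror the __main__ example, and the 24-timestep cut matches the k=24 call inside get_ett_dataset:

from data.read_new_dataset import get_ett_dataset

# Loads metadata and the gzipped JSONL test series, then rebuilds the train
# split by dropping the last 24 timesteps from every test series.
ds = get_ett_dataset("ett_h1", "data/datasets/ett_datasets")
print(ds.metadata.freq)
print(sum(1 for _ in ds.train), sum(1 for _ in ds.test))
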