
Commit 85fd3dd

don't include files list in info and remove old stuff
1 parent 42754a9 commit 85fd3dd

14 files changed: +51 lines added, -441 lines removed

src/datasets/builder.py

Lines changed: 8 additions & 101 deletions
@@ -24,7 +24,7 @@
 import shutil
 import time
 import urllib
-from collections.abc import Iterator, Mapping
+from collections.abc import Iterator
 from dataclasses import dataclass
 from functools import partial
 from pathlib import Path
@@ -56,7 +56,7 @@
     rename,
 )
 from .fingerprint import Hasher
-from .info import DatasetInfo, PostProcessedInfo
+from .info import DatasetInfo
 from .iterable_dataset import ArrowExamplesIterable, ExamplesIterable, IterableDataset
 from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH, camelcase_to_snakecase
 from .splits import Split, SplitDict, SplitGenerator, SplitInfo
@@ -66,7 +66,7 @@
 from .utils import tqdm as hf_tqdm
 from .utils._filelock import FileLock
 from .utils.file_utils import is_remote_url
-from .utils.info_utils import VerificationMode, get_size_checksum_dict, verify_checksums, verify_splits
+from .utils.info_utils import VerificationMode, verify_checksums, verify_splits
 from .utils.py_utils import (
     classproperty,
     convert_file_size_to_int,
@@ -406,7 +406,7 @@ def __init__(
         self.dl_manager = None

         # Set to True by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json independently of verification_mode value.
-        self._record_infos = False
+        self._record_checksums = False

         # Set in `.download_and_prepare` once the format of the generated dataset is known
         self._file_format = None
@@ -805,7 +805,7 @@ def download_and_prepare(
                 download_config=download_config,
                 data_dir=self.config.data_dir,
                 base_path=base_path,
-                record_checksums=(self._record_infos or verification_mode == VerificationMode.ALL_CHECKS),
+                record_checksums=self._record_checksums,
             )

         is_local = not is_remote_filesystem(self._fs)
@@ -826,7 +826,6 @@ def download_and_prepare(
                 # We need to update the info in case some splits were added in the meantime
                 # for example when calling load_dataset from multiple workers.
                 self.info = self._load_info()
-                self.download_post_processing_resources(dl_manager)
                 return

             logger.info(f"Generating dataset {self.dataset_name} ({self._output_dir})")
@@ -835,7 +834,7 @@ def download_and_prepare(
                     self.info.size_in_bytes or 0, directory=Path(self._output_dir).parent
                 ):
                     raise OSError(
-                        f"Not enough disk space. Needed: {size_str(self.info.size_in_bytes or 0)} (download: {size_str(self.info.download_size or 0)}, generated: {size_str(self.info.dataset_size or 0)}, post-processed: {size_str(self.info.post_processing_size or 0)})"
+                        f"Not enough disk space. Needed: {size_str(self.info.size_in_bytes or 0)} (download: {size_str(self.info.download_size or 0)}, generated: {size_str(self.info.dataset_size or 0)}"
                    )

            @contextlib.contextmanager
@@ -864,7 +863,6 @@ def incomplete_dir(dirname):
                logger.info(
                    f"Downloading and preparing dataset {self.dataset_name}/{self.config.name} "
                    f"(download: {size_str(self.info.download_size)}, generated: {size_str(self.info.dataset_size)}, "
-                    f"post-processed: {size_str(self.info.post_processing_size)}, "
                    f"total: {size_str(self.info.size_in_bytes)}) to {self._output_dir}..."
                )
            else:
@@ -889,15 +887,13 @@ def incomplete_dir(dirname):
            )
            # Sync info
            self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
-            self.info.download_checksums = dl_manager.get_recorded_sizes_checksums()
+            if dl_manager.record_checksums:
+                self.info.download_checksums = dl_manager.get_recorded_sizes_checksums()
            if self.info.download_size is not None:
                self.info.size_in_bytes = self.info.dataset_size + self.info.download_size
            # Save info
            self._save_info()

-            # Download post processing resources
-            self.download_post_processing_resources(dl_manager)
-
            logger.info(
                f"Dataset {self.dataset_name} downloaded and prepared to {self._output_dir}. "
                f"Subsequent calls will reuse this data."
@@ -956,22 +952,6 @@ def _download_and_prepare(self, dl_manager, verification_mode, **prepare_split_k
         self.info.splits = split_dict
         self.info.download_size = dl_manager.downloaded_size

-    def download_post_processing_resources(self, dl_manager):
-        for split in self.info.splits or []:
-            for resource_name, resource_file_name in self._post_processing_resources(split).items():
-                if not not is_remote_filesystem(self._fs):
-                    raise NotImplementedError(f"Post processing is not supported on filesystem {self._fs}")
-                if os.sep in resource_file_name:
-                    raise ValueError(f"Resources shouldn't be in a sub-directory: {resource_file_name}")
-                resource_path = os.path.join(self._output_dir, resource_file_name)
-                if not os.path.exists(resource_path):
-                    downloaded_resource_path = self._download_post_processing_resources(
-                        split, resource_name, dl_manager
-                    )
-                    if downloaded_resource_path:
-                        logger.info(f"Downloaded post-processing resource {resource_name} as {resource_file_name}")
-                        shutil.move(downloaded_resource_path, resource_path)
-
     def _load_info(self) -> DatasetInfo:
         return DatasetInfo.from_directory(self._output_dir, storage_options=self._fs.storage_options)

@@ -992,18 +972,13 @@ def _make_split_generators_kwargs(self, prepare_split_kwargs):
     def as_dataset(
         self,
         split: Optional[Union[str, Split, list[str], list[Split]]] = None,
-        run_post_process=True,
-        verification_mode: Optional[Union[VerificationMode, str]] = None,
         in_memory=False,
     ) -> Union[Dataset, DatasetDict]:
         """Return a Dataset for the specified split.

         Args:
             split (`datasets.Split`):
                 Which subset of the data to return.
-            run_post_process (`bool`, defaults to `True`):
-                Whether to run post-processing dataset transforms and/or add
-                indexes.
             verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
                 Verification mode determining the checks to run on the
                 downloaded/processed dataset information (checksums/size/splits/...).
@@ -1046,14 +1021,10 @@ def as_dataset(
         if split is None:
             split = {s: s for s in self.info.splits}

-        verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)
-
         # Create a dataset for each of the given splits
         datasets = map_nested(
             partial(
                 self._build_single_dataset,
-                run_post_process=run_post_process,
-                verification_mode=verification_mode,
                 in_memory=in_memory,
             ),
             split,
@@ -1067,8 +1038,6 @@
     def _build_single_dataset(
         self,
         split: Union[str, ReadInstruction, Split],
-        run_post_process: bool,
-        verification_mode: VerificationMode,
         in_memory: bool = False,
     ):
         """as_dataset for a single split."""
@@ -1083,54 +1052,6 @@ def _build_single_dataset(
             split=split,
             in_memory=in_memory,
         )
-        if run_post_process:
-            for resource_file_name in self._post_processing_resources(split).values():
-                if os.sep in resource_file_name:
-                    raise ValueError(f"Resources shouldn't be in a sub-directory: {resource_file_name}")
-            resources_paths = {
-                resource_name: os.path.join(self._output_dir, resource_file_name)
-                for resource_name, resource_file_name in self._post_processing_resources(split).items()
-            }
-            post_processed = self._post_process(ds, resources_paths)
-            if post_processed is not None:
-                ds = post_processed
-                recorded_checksums = {}
-                record_checksums = False
-                for resource_name, resource_path in resources_paths.items():
-                    size_checksum = get_size_checksum_dict(resource_path)
-                    recorded_checksums[resource_name] = size_checksum
-                if verification_mode == VerificationMode.ALL_CHECKS and record_checksums:
-                    if self.info.post_processed is None or self.info.post_processed.resources_checksums is None:
-                        expected_checksums = None
-                    else:
-                        expected_checksums = self.info.post_processed.resources_checksums.get(split)
-                    verify_checksums(expected_checksums, recorded_checksums, "post processing resources")
-                if self.info.post_processed is None:
-                    self.info.post_processed = PostProcessedInfo()
-                if self.info.post_processed.resources_checksums is None:
-                    self.info.post_processed.resources_checksums = {}
-                self.info.post_processed.resources_checksums[str(split)] = recorded_checksums
-                self.info.post_processing_size = sum(
-                    checksums_dict["num_bytes"]
-                    for split_checksums_dicts in self.info.post_processed.resources_checksums.values()
-                    for checksums_dict in split_checksums_dicts.values()
-                )
-                if self.info.dataset_size is not None and self.info.download_size is not None:
-                    self.info.size_in_bytes = (
-                        self.info.dataset_size + self.info.download_size + self.info.post_processing_size
-                    )
-                self._save_info()
-                ds._info.post_processed = self.info.post_processed
-                ds._info.post_processing_size = self.info.post_processing_size
-                ds._info.size_in_bytes = self.info.size_in_bytes
-                if self.info.post_processed.features is not None:
-                    if self.info.post_processed.features.type != ds.features.type:
-                        raise ValueError(
-                            f"Post-processed features info don't match the dataset:\nGot\n{self.info.post_processed.features}\nbut expected something like\n{ds.features}"
-                        )
-                    else:
-                        ds.info.features = self.info.post_processed.features
-
         return ds

     def _as_dataset(self, split: Union[ReadInstruction, Split] = Split.TRAIN, in_memory: bool = False) -> Dataset:
@@ -1216,20 +1137,6 @@ def _as_streaming_dataset_single(
             ex_iterable, info=self.info, split=splits_generator.name, token_per_repo_id=token_per_repo_id
         )

-    def _post_process(self, dataset: Dataset, resources_paths: Mapping[str, str]) -> Optional[Dataset]:
-        """Run dataset transforms or add indexes"""
-        return None
-
-    def _post_processing_resources(self, split: str) -> dict[str, str]:
-        """Mapping resource_name -> resource_file_name"""
-        return {}
-
-    def _download_post_processing_resources(
-        self, split: str, resource_name: str, dl_manager: DownloadManager
-    ) -> Optional[str]:
-        """Download the resource using the download manager and return the downloaded path."""
-        return None
-
     @abc.abstractmethod
     def _split_generators(self, dl_manager: Union[DownloadManager, StreamingDownloadManager]):
         """Specify feature dictionary generators and dataset splits.

src/datasets/commands/test.py

Lines changed: 1 addition & 1 deletion
@@ -144,7 +144,7 @@ def get_builders() -> Generator[DatasetBuilder, None, None]:

         for j, builder in enumerate(get_builders()):
             print(f"Testing builder '{builder.config.name}' ({j + 1}/{n_builders})")
-            builder._record_infos = os.path.exists(
+            builder._record_checksums = os.path.exists(
                 os.path.join(builder.get_imported_module_dir(), datasets.config.DATASETDICT_INFOS_FILENAME)
             )  # record checksums only if we need to update a (deprecated) dataset_infos.json
             builder.download_and_prepare(
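
Outside the CLI, the same switch can be flipped by hand. A hedged sketch of forcing checksum recording on a builder (`_record_checksums` is the private attribute introduced by this commit; the data file is a placeholder):

```python
from datasets import load_dataset_builder

builder = load_dataset_builder("csv", data_files={"train": "data.csv"})  # placeholder file
builder._record_checksums = True  # forwarded to the DownloadManager in download_and_prepare()
builder.download_and_prepare()
print(builder.info.download_checksums)  # e.g. {path_or_url: {"num_bytes": ..., "checksum": ...}}
```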

src/datasets/download/download_manager.py

Lines changed: 2 additions & 2 deletions
@@ -77,7 +77,7 @@ def __init__(
         data_dir: Optional[str] = None,
         download_config: Optional[DownloadConfig] = None,
         base_path: Optional[str] = None,
-        record_checksums=True,
+        record_checksums=False,
     ):
         """Download manager constructor.

@@ -93,7 +93,7 @@ def __init__(
             base_path (`str`):
                 base path that is used when relative paths are used to
                 download files. This can be a remote url.
-            record_checksums (`bool`, defaults to `True`):
+            record_checksums (`bool`, defaults to `False`):
                 Whether to record the checksums of the downloaded files. If None, the value is inferred from the builder.
         """
         self._dataset_name = dataset_name
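
Since the default flipped from `True` to `False`, callers that want checksums must now opt in explicitly. A small sketch using the constructor arguments shown above (the URL is a placeholder):

```python
from datasets.download import DownloadManager

dl_manager = DownloadManager(dataset_name="demo", record_checksums=True)
local_path = dl_manager.download("https://example.com/data.csv")  # placeholder URL
print(dl_manager.downloaded_size)
print(dl_manager.get_recorded_sizes_checksums())
```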

src/datasets/info.py

Lines changed: 13 additions & 3 deletions
@@ -109,7 +109,7 @@ class DatasetInfo:
         features ([`Features`], *optional*):
             The features used to specify the dataset's column types.
         post_processed (`PostProcessedInfo`, *optional*):
-            Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.
+            Deprecated. Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.
         supervised_keys (`SupervisedKeysData`, *optional*):
             Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS).
         builder_name (`str`, *optional*):
@@ -125,7 +125,7 @@
         download_size (`int`, *optional*):
             The size of the files to download to generate the dataset, in bytes.
         post_processing_size (`int`, *optional*):
-            Size of the dataset in bytes after post-processing, if any.
+            Deprecated. Size of the dataset in bytes after post-processing, if any.
         dataset_size (`int`, *optional*):
             The combined size in bytes of the Arrow tables for all splits.
         size_in_bytes (`int`, *optional*):
@@ -140,7 +140,7 @@
     homepage: str = dataclasses.field(default_factory=str)
     license: str = dataclasses.field(default_factory=str)
     features: Optional[Features] = None
-    post_processed: Optional[PostProcessedInfo] = None
+    post_processed: Optional[PostProcessedInfo] = None  # kept for backward compat
     supervised_keys: Optional[SupervisedKeysData] = None

     # Set later by the builder
@@ -320,6 +320,16 @@ def _from_yaml_dict(cls, yaml_data: dict) -> "DatasetInfo":
         field_names = {f.name for f in dataclasses.fields(cls)}
         return cls(**{k: v for k, v in yaml_data.items() if k in field_names})

+    def __repr__(self):
+        return (
+            self.__class__.__qualname__
+            + "("
+            + ", ".join(
+                [f"{f.name}={repr(getattr(self, f.name))}" for f in dataclasses.fields(self) if getattr(self, f.name)]
+            )
+            + ")"
+        )
+

 class DatasetInfosDict(dict[str, DatasetInfo]):
     def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:
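
The new `__repr__` prints only the fields that are actually set, which keeps the repr of a mostly-empty `DatasetInfo` short. For example (output shown approximately):

```python
from datasets import DatasetInfo

info = DatasetInfo(description="A tiny demo dataset", license="mit")
print(info)  # DatasetInfo(description='A tiny demo dataset', license='mit')
```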

src/datasets/io/csv.py

Lines changed: 1 addition & 3 deletions
@@ -60,9 +60,7 @@ def read(self):
             base_path=base_path,
             num_proc=self.num_proc,
         )
-        dataset = self.builder.as_dataset(
-            split=self.split, verification_mode=verification_mode, in_memory=self.keep_in_memory
-        )
+        dataset = self.builder.as_dataset(split=self.split, in_memory=self.keep_in_memory)
         return dataset
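
From the user's side nothing changes for the packaged readers: verification still happens inside `download_and_prepare()`, so reading a CSV works as before (the file name is a placeholder):

```python
from datasets import load_dataset

ds = load_dataset("csv", data_files={"train": "data.csv"}, split="train", keep_in_memory=True)
```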

src/datasets/io/generator.py

Lines changed: 1 addition & 3 deletions
@@ -56,9 +56,7 @@ def read(self):
             base_path=base_path,
             num_proc=self.num_proc,
         )
-        dataset = self.builder.as_dataset(
-            split=self.builder.config.split, verification_mode=verification_mode, in_memory=self.keep_in_memory
-        )
+        dataset = self.builder.as_dataset(split=self.builder.config.split, in_memory=self.keep_in_memory)
         if self.fingerprint:
             dataset._fingerprint = self.fingerprint
         return dataset
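
The generator reader above is what backs `Dataset.from_generator`; a minimal usage sketch:

```python
from datasets import Dataset


def gen():
    for i in range(3):
        yield {"id": i, "text": f"example {i}"}


ds = Dataset.from_generator(gen)
print(ds[0])
```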

src/datasets/io/json.py

Lines changed: 1 addition & 3 deletions
@@ -63,9 +63,7 @@ def read(self):
             base_path=base_path,
             num_proc=self.num_proc,
         )
-        dataset = self.builder.as_dataset(
-            split=self.split, verification_mode=verification_mode, in_memory=self.keep_in_memory
-        )
+        dataset = self.builder.as_dataset(split=self.split, in_memory=self.keep_in_memory)
         return dataset

src/datasets/io/parquet.py

Lines changed: 1 addition & 3 deletions
@@ -66,9 +66,7 @@ def read(self):
             base_path=base_path,
             num_proc=self.num_proc,
         )
-        dataset = self.builder.as_dataset(
-            split=self.split, verification_mode=verification_mode, in_memory=self.keep_in_memory
-        )
+        dataset = self.builder.as_dataset(split=self.split, in_memory=self.keep_in_memory)
         return dataset

src/datasets/io/sql.py

Lines changed: 1 addition & 3 deletions
@@ -47,9 +47,7 @@ def read(self):
         )

         # Build dataset for splits
-        dataset = self.builder.as_dataset(
-            split="train", verification_mode=verification_mode, in_memory=self.keep_in_memory
-        )
+        dataset = self.builder.as_dataset(split="train", in_memory=self.keep_in_memory)
         return dataset
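
The SQL reader always builds a single "train" split, which is what `Dataset.from_sql` returns. A sketch (connection string and table name are placeholders; requires `sqlalchemy`):

```python
from datasets import Dataset

ds = Dataset.from_sql("SELECT * FROM my_table", con="sqlite:///my.db")
```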

src/datasets/io/text.py

Lines changed: 1 addition & 3 deletions
@@ -54,7 +54,5 @@ def read(self):
             base_path=base_path,
             num_proc=self.num_proc,
         )
-        dataset = self.builder.as_dataset(
-            split=self.split, verification_mode=verification_mode, in_memory=self.keep_in_memory
-        )
+        dataset = self.builder.as_dataset(split=self.split, in_memory=self.keep_in_memory)
         return dataset
