2424import shutil
2525import time
2626import urllib
27- from collections .abc import Iterator , Mapping
27+ from collections .abc import Iterator
2828from dataclasses import dataclass
2929from functools import partial
3030from pathlib import Path
5656 rename ,
5757)
5858from .fingerprint import Hasher
59- from .info import DatasetInfo , PostProcessedInfo
59+ from .info import DatasetInfo
6060from .iterable_dataset import ArrowExamplesIterable , ExamplesIterable , IterableDataset
6161from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH , camelcase_to_snakecase
6262from .splits import Split , SplitDict , SplitGenerator , SplitInfo
6666from .utils import tqdm as hf_tqdm
6767from .utils ._filelock import FileLock
6868from .utils .file_utils import is_remote_url
69- from .utils .info_utils import VerificationMode , get_size_checksum_dict , verify_checksums , verify_splits
69+ from .utils .info_utils import VerificationMode , verify_checksums , verify_splits
7070from .utils .py_utils import (
7171 classproperty ,
7272 convert_file_size_to_int ,
@@ -406,7 +406,7 @@ def __init__(
406406 self .dl_manager = None
407407
408408 # Set to True by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json independently of verification_mode value.
409- self ._record_infos = False
409+ self ._record_checksums = False
410410
411411 # Set in `.download_and_prepare` once the format of the generated dataset is known
412412 self ._file_format = None
@@ -805,7 +805,7 @@ def download_and_prepare(
805805 download_config = download_config ,
806806 data_dir = self .config .data_dir ,
807807 base_path = base_path ,
808- record_checksums = ( self ._record_infos or verification_mode == VerificationMode . ALL_CHECKS ) ,
808+ record_checksums = self ._record_checksums ,
809809 )
810810
811811 is_local = not is_remote_filesystem (self ._fs )
@@ -826,7 +826,6 @@ def download_and_prepare(
826826 # We need to update the info in case some splits were added in the meantime
827827 # for example when calling load_dataset from multiple workers.
828828 self .info = self ._load_info ()
829- self .download_post_processing_resources (dl_manager )
830829 return
831830
832831 logger .info (f"Generating dataset { self .dataset_name } ({ self ._output_dir } )" )
@@ -835,7 +834,7 @@ def download_and_prepare(
835834 self .info .size_in_bytes or 0 , directory = Path (self ._output_dir ).parent
836835 ):
837836 raise OSError (
838- f"Not enough disk space. Needed: { size_str (self .info .size_in_bytes or 0 )} (download: { size_str (self .info .download_size or 0 )} , generated: { size_str (self .info .dataset_size or 0 )} , post-processed: { size_str ( self . info . post_processing_size or 0 ) } ) "
837+ f"Not enough disk space. Needed: { size_str (self .info .size_in_bytes or 0 )} (download: { size_str (self .info .download_size or 0 )} , generated: { size_str (self .info .dataset_size or 0 )} "
839838 )
840839
841840 @contextlib .contextmanager
@@ -864,7 +863,6 @@ def incomplete_dir(dirname):
864863 logger .info (
865864 f"Downloading and preparing dataset { self .dataset_name } /{ self .config .name } "
866865 f"(download: { size_str (self .info .download_size )} , generated: { size_str (self .info .dataset_size )} , "
867- f"post-processed: { size_str (self .info .post_processing_size )} , "
868866 f"total: { size_str (self .info .size_in_bytes )} ) to { self ._output_dir } ..."
869867 )
870868 else :
@@ -889,15 +887,13 @@ def incomplete_dir(dirname):
889887 )
890888 # Sync info
891889 self .info .dataset_size = sum (split .num_bytes for split in self .info .splits .values ())
892- self .info .download_checksums = dl_manager .get_recorded_sizes_checksums ()
890+ if dl_manager .record_checksums :
891+ self .info .download_checksums = dl_manager .get_recorded_sizes_checksums ()
893892 if self .info .download_size is not None :
894893 self .info .size_in_bytes = self .info .dataset_size + self .info .download_size
895894 # Save info
896895 self ._save_info ()
897896
898- # Download post processing resources
899- self .download_post_processing_resources (dl_manager )
900-
901897 logger .info (
902898 f"Dataset { self .dataset_name } downloaded and prepared to { self ._output_dir } . "
903899 f"Subsequent calls will reuse this data."
@@ -956,22 +952,6 @@ def _download_and_prepare(self, dl_manager, verification_mode, **prepare_split_k
956952 self .info .splits = split_dict
957953 self .info .download_size = dl_manager .downloaded_size
958954
959- def download_post_processing_resources (self , dl_manager ):
960- for split in self .info .splits or []:
961- for resource_name , resource_file_name in self ._post_processing_resources (split ).items ():
962- if not not is_remote_filesystem (self ._fs ):
963- raise NotImplementedError (f"Post processing is not supported on filesystem { self ._fs } " )
964- if os .sep in resource_file_name :
965- raise ValueError (f"Resources shouldn't be in a sub-directory: { resource_file_name } " )
966- resource_path = os .path .join (self ._output_dir , resource_file_name )
967- if not os .path .exists (resource_path ):
968- downloaded_resource_path = self ._download_post_processing_resources (
969- split , resource_name , dl_manager
970- )
971- if downloaded_resource_path :
972- logger .info (f"Downloaded post-processing resource { resource_name } as { resource_file_name } " )
973- shutil .move (downloaded_resource_path , resource_path )
974-
975955 def _load_info (self ) -> DatasetInfo :
976956 return DatasetInfo .from_directory (self ._output_dir , storage_options = self ._fs .storage_options )
977957
@@ -992,18 +972,13 @@ def _make_split_generators_kwargs(self, prepare_split_kwargs):
992972 def as_dataset (
993973 self ,
994974 split : Optional [Union [str , Split , list [str ], list [Split ]]] = None ,
995- run_post_process = True ,
996- verification_mode : Optional [Union [VerificationMode , str ]] = None ,
997975 in_memory = False ,
998976 ) -> Union [Dataset , DatasetDict ]:
999977 """Return a Dataset for the specified split.
1000978
1001979 Args:
1002980 split (`datasets.Split`):
1003981 Which subset of the data to return.
1004- run_post_process (`bool`, defaults to `True`):
1005- Whether to run post-processing dataset transforms and/or add
1006- indexes.
1007982 verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
1008983 Verification mode determining the checks to run on the
1009984 downloaded/processed dataset information (checksums/size/splits/...).
@@ -1046,14 +1021,10 @@ def as_dataset(
10461021 if split is None :
10471022 split = {s : s for s in self .info .splits }
10481023
1049- verification_mode = VerificationMode (verification_mode or VerificationMode .BASIC_CHECKS )
1050-
10511024 # Create a dataset for each of the given splits
10521025 datasets = map_nested (
10531026 partial (
10541027 self ._build_single_dataset ,
1055- run_post_process = run_post_process ,
1056- verification_mode = verification_mode ,
10571028 in_memory = in_memory ,
10581029 ),
10591030 split ,
@@ -1067,8 +1038,6 @@ def as_dataset(
10671038 def _build_single_dataset (
10681039 self ,
10691040 split : Union [str , ReadInstruction , Split ],
1070- run_post_process : bool ,
1071- verification_mode : VerificationMode ,
10721041 in_memory : bool = False ,
10731042 ):
10741043 """as_dataset for a single split."""
@@ -1083,54 +1052,6 @@ def _build_single_dataset(
10831052 split = split ,
10841053 in_memory = in_memory ,
10851054 )
1086- if run_post_process :
1087- for resource_file_name in self ._post_processing_resources (split ).values ():
1088- if os .sep in resource_file_name :
1089- raise ValueError (f"Resources shouldn't be in a sub-directory: { resource_file_name } " )
1090- resources_paths = {
1091- resource_name : os .path .join (self ._output_dir , resource_file_name )
1092- for resource_name , resource_file_name in self ._post_processing_resources (split ).items ()
1093- }
1094- post_processed = self ._post_process (ds , resources_paths )
1095- if post_processed is not None :
1096- ds = post_processed
1097- recorded_checksums = {}
1098- record_checksums = False
1099- for resource_name , resource_path in resources_paths .items ():
1100- size_checksum = get_size_checksum_dict (resource_path )
1101- recorded_checksums [resource_name ] = size_checksum
1102- if verification_mode == VerificationMode .ALL_CHECKS and record_checksums :
1103- if self .info .post_processed is None or self .info .post_processed .resources_checksums is None :
1104- expected_checksums = None
1105- else :
1106- expected_checksums = self .info .post_processed .resources_checksums .get (split )
1107- verify_checksums (expected_checksums , recorded_checksums , "post processing resources" )
1108- if self .info .post_processed is None :
1109- self .info .post_processed = PostProcessedInfo ()
1110- if self .info .post_processed .resources_checksums is None :
1111- self .info .post_processed .resources_checksums = {}
1112- self .info .post_processed .resources_checksums [str (split )] = recorded_checksums
1113- self .info .post_processing_size = sum (
1114- checksums_dict ["num_bytes" ]
1115- for split_checksums_dicts in self .info .post_processed .resources_checksums .values ()
1116- for checksums_dict in split_checksums_dicts .values ()
1117- )
1118- if self .info .dataset_size is not None and self .info .download_size is not None :
1119- self .info .size_in_bytes = (
1120- self .info .dataset_size + self .info .download_size + self .info .post_processing_size
1121- )
1122- self ._save_info ()
1123- ds ._info .post_processed = self .info .post_processed
1124- ds ._info .post_processing_size = self .info .post_processing_size
1125- ds ._info .size_in_bytes = self .info .size_in_bytes
1126- if self .info .post_processed .features is not None :
1127- if self .info .post_processed .features .type != ds .features .type :
1128- raise ValueError (
1129- f"Post-processed features info don't match the dataset:\n Got\n { self .info .post_processed .features } \n but expected something like\n { ds .features } "
1130- )
1131- else :
1132- ds .info .features = self .info .post_processed .features
1133-
11341055 return ds
11351056
11361057 def _as_dataset (self , split : Union [ReadInstruction , Split ] = Split .TRAIN , in_memory : bool = False ) -> Dataset :
@@ -1216,20 +1137,6 @@ def _as_streaming_dataset_single(
12161137 ex_iterable , info = self .info , split = splits_generator .name , token_per_repo_id = token_per_repo_id
12171138 )
12181139
1219- def _post_process (self , dataset : Dataset , resources_paths : Mapping [str , str ]) -> Optional [Dataset ]:
1220- """Run dataset transforms or add indexes"""
1221- return None
1222-
1223- def _post_processing_resources (self , split : str ) -> dict [str , str ]:
1224- """Mapping resource_name -> resource_file_name"""
1225- return {}
1226-
1227- def _download_post_processing_resources (
1228- self , split : str , resource_name : str , dl_manager : DownloadManager
1229- ) -> Optional [str ]:
1230- """Download the resource using the download manager and return the downloaded path."""
1231- return None
1232-
12331140 @abc .abstractmethod
12341141 def _split_generators (self , dl_manager : Union [DownloadManager , StreamingDownloadManager ]):
12351142 """Specify feature dictionary generators and dataset splits.