Commit 28b5e24

Backward compat sequence instance (#7643)
backward compat sequence instance
1 parent cefda74 commit 28b5e24

File tree

4 files changed (+18, -45 lines):

    src/datasets/builder.py
    src/datasets/config.py
    src/datasets/features/features.py
    src/datasets/info.py

src/datasets/builder.py

Lines changed: 3 additions & 34 deletions

@@ -56,7 +56,7 @@
     rename,
 )
 from .fingerprint import Hasher
-from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
+from .info import DatasetInfo, PostProcessedInfo
 from .iterable_dataset import ArrowExamplesIterable, ExamplesIterable, IterableDataset
 from .keyhash import DuplicatedKeysError
 from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH, camelcase_to_snakecase

@@ -349,9 +349,7 @@ def __init__(
         # prepare info: DatasetInfo are a standardized dataclass across all datasets
         # Prefill datasetinfo
         if info is None:
-            # TODO FOR PACKAGED MODULES IT IMPORTS DATA FROM src/packaged_modules which doesn't make sense
-            info = self.get_exported_dataset_info()
-            info.update(self._info())
+            info = self._info()
         info.builder_name = self.name
         info.dataset_name = self.dataset_name
         info.config_name = self.config.name

@@ -391,7 +389,7 @@ def __init__(
         if os.path.exists(self._cache_dir):  # check if data exist
             if len(os.listdir(self._cache_dir)) > 0:
                 if os.path.exists(os.path.join(self._cache_dir, config.DATASET_INFO_FILENAME)):
-                    logger.info("Overwrite dataset info from restored data version if exists.")
+                    logger.debug("Overwrite dataset info from restored data version if exists.")
                     self.info = DatasetInfo.from_directory(self._cache_dir)
                 else:  # dir exists but no data, remove the empty dir as data aren't available anymore
                     logger.warning(

@@ -503,35 +501,6 @@ def update_hash_with_config_parameters(hash: str, config_parameters: dict) -> str:
         if os.path.isdir(legacy_cache_dir):
             return legacy_relative_data_dir

-    @classmethod
-    def get_all_exported_dataset_infos(cls) -> DatasetInfosDict:
-        """Empty dict if doesn't exist
-
-        Example:
-
-        ```py
-        >>> from datasets import load_dataset_builder
-        >>> ds_builder = load_dataset_builder('vivos')
-        >>> ds_builder.get_all_exported_dataset_infos()
-        {'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value('string'), 'path': Value('string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value('string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
-        ```
-        """
-        return DatasetInfosDict.from_directory(cls.get_imported_module_dir())
-
-    def get_exported_dataset_info(self) -> DatasetInfo:
-        """Empty `DatasetInfo` if doesn't exist
-
-        Example:
-
-        ```py
-        >>> from datasets import load_dataset_builder
-        >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
-        >>> ds_builder.get_exported_dataset_info()
-        DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value('string'), 'path': Value('string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value('string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)
-        ```
-        """
-        return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo())
-
     def _create_builder_config(
         self, config_name=None, custom_features=None, **config_kwargs
     ) -> tuple[BuilderConfig, str]:
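
With `get_exported_dataset_info()` and `get_all_exported_dataset_infos()` removed, a builder's metadata is now prefilled from `self._info()` alone. Below is a minimal sketch of reading that metadata through the builder's `info` attribute; the dataset id is reused from the removed docstring, and treating `.info` as the replacement accessor is an assumption, not something this diff states.

```py
from datasets import load_dataset_builder

# The constructor above prefills `info` from `self._info()` and then sets
# builder_name / dataset_name / config_name on it.
ds_builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
print(ds_builder.info.features)     # feature schema declared by _info()
print(ds_builder.info.config_name)  # filled in during __init__
```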

src/datasets/config.py

Lines changed: 3 additions & 3 deletions

@@ -51,7 +51,7 @@
 if TORCH_AVAILABLE:
     try:
         TORCH_VERSION = version.parse(importlib.metadata.version("torch"))
-        logger.info(f"PyTorch version {TORCH_VERSION} available.")
+        logger.debug(f"PyTorch version {TORCH_VERSION} available.")
     except importlib.metadata.PackageNotFoundError:
         pass
 else:

@@ -63,7 +63,7 @@
 if POLARS_AVAILABLE:
     try:
         POLARS_VERSION = version.parse(importlib.metadata.version("polars"))
-        logger.info(f"Polars version {POLARS_VERSION} available.")
+        logger.debug(f"Polars version {POLARS_VERSION} available.")
     except importlib.metadata.PackageNotFoundError:
         pass

@@ -74,7 +74,7 @@
 if DUCKDB_AVAILABLE:
     try:
         DUCKDB_VERSION = version.parse(importlib.metadata.version("duckdb"))
-        logger.info(f"Duckdb version {DUCKDB_VERSION} available.")
+        logger.debug(f"Duckdb version {DUCKDB_VERSION} available.")
     except importlib.metadata.PackageNotFoundError:
         pass

src/datasets/features/features.py

Lines changed: 10 additions & 6 deletions

@@ -1160,7 +1160,7 @@ def _load_names_from_file(names_filepath):
         return [name.strip() for name in f.read().split("\n") if name.strip()]  # Filter empty names


-def Sequence(feature, length=-1):
+class Sequence:
     """
     A `Sequence` is a utility that automatically converts internal dictionary feature into a dictionary of
     lists. This behavior is implemented to have a compatibility layer with the TensorFlow Datasets library but may be

@@ -1179,14 +1179,18 @@ def Sequence(feature, length=-1):
     which are converted to `dict` of lists of sub-features for compatibility with TFDS.

     """
-    if isinstance(feature, dict):
-        return {key: List(value, length=length) for key, value in feature.items()}
-    else:
-        return List(feature, length=length)
+
+    def __new__(cls, feature=None, length=-1, **kwargs):
+        # useful to still get isinstance(Sequence(Value("int64")), Sequence)
+        if isinstance(feature, dict):
+            out = {key: List(value, length=length, **kwargs) for key, value in feature.items()}
+        else:
+            out = super().__new__(List)
+        return out


 @dataclass(repr=False)
-class List:
+class List(Sequence):
     """Feature type for large list data composed of child feature data type.

     It is backed by `pyarrow.ListType`, which uses 32-bit offsets or a fixed length.
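
A short sketch of the behavior the `__new__` override above is meant to preserve: `Sequence(...)` still builds a `List` (or a dict of `List` values for dict features), and `isinstance(..., Sequence)` keeps working because `List` now subclasses `Sequence`. `List` is imported from the module patched in this diff; the top-level `Sequence`/`Value` imports are the usual public API.

```py
from datasets import Sequence, Value
from datasets.features.features import List  # defined in the file patched above

seq = Sequence(Value("int64"))
print(type(seq) is List)          # True: __new__ hands back a List instance
print(isinstance(seq, Sequence))  # True: backward compat via List(Sequence)

# A dict feature still expands into a dict of List, as the old factory did
nested = Sequence({"token": Value("string")})
print(type(nested["token"]) is List)  # True
```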

src/datasets/info.py

Lines changed: 2 additions & 2 deletions

@@ -271,7 +271,7 @@ def from_directory(cls, dataset_info_dir: str, storage_options: Optional[dict] = None):
         """
         fs: fsspec.AbstractFileSystem
         fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
-        logger.info(f"Loading Dataset info from {dataset_info_dir}")
+        logger.debug(f"Loading Dataset info from {dataset_info_dir}")
         if not dataset_info_dir:
             raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.")
         with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "r", encoding="utf-8") as f:

@@ -352,7 +352,7 @@ def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False):

     @classmethod
     def from_directory(cls, dataset_infos_dir) -> "DatasetInfosDict":
-        logger.info(f"Loading Dataset Infos from {dataset_infos_dir}")
+        logger.debug(f"Loading Dataset Infos from {dataset_infos_dir}")
         # Load the info from the YAML part of README.md
         if os.path.exists(os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)):
             dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / config.REPOCARD_FILENAME).data
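
Several messages in this commit are demoted from `logger.info` to `logger.debug` (in config.py, builder.py, and info.py). A small sketch, assuming the library's standard verbosity helpers are unchanged by this commit, of how to surface them again when debugging:

```py
import datasets
from datasets.utils.logging import set_verbosity_debug

# With DEBUG verbosity, the demoted load-time messages ("Loading Dataset Infos
# from ...", "Overwrite dataset info from restored data version if exists.")
# are emitted again. The import-time messages in config.py would need the
# verbosity set before `import datasets`, e.g. via the DATASETS_VERBOSITY
# environment variable.
set_verbosity_debug()
ds_builder = datasets.load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
```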
