Commit 28b5e24

Backward compat sequence instance (#7643)
backward compat sequence instance
1 parent cefda74 commit 28b5e24

File tree

4 files changed (+18, -45 lines):

    src/datasets/builder.py
    src/datasets/config.py
    src/datasets/features/features.py
    src/datasets/info.py

src/datasets/builder.py

Lines changed: 3 additions & 34 deletions

@@ -56,7 +56,7 @@
     rename,
 )
 from .fingerprint import Hasher
-from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
+from .info import DatasetInfo, PostProcessedInfo
 from .iterable_dataset import ArrowExamplesIterable, ExamplesIterable, IterableDataset
 from .keyhash import DuplicatedKeysError
 from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH, camelcase_to_snakecase

@@ -349,9 +349,7 @@ def __init__(
         # prepare info: DatasetInfo are a standardized dataclass across all datasets
         # Prefill datasetinfo
         if info is None:
-            # TODO FOR PACKAGED MODULES IT IMPORTS DATA FROM src/packaged_modules which doesn't make sense
-            info = self.get_exported_dataset_info()
-            info.update(self._info())
+            info = self._info()
         info.builder_name = self.name
         info.dataset_name = self.dataset_name
         info.config_name = self.config.name

@@ -391,7 +389,7 @@ def __init__(
         if os.path.exists(self._cache_dir):  # check if data exist
             if len(os.listdir(self._cache_dir)) > 0:
                 if os.path.exists(os.path.join(self._cache_dir, config.DATASET_INFO_FILENAME)):
-                    logger.info("Overwrite dataset info from restored data version if exists.")
+                    logger.debug("Overwrite dataset info from restored data version if exists.")
                     self.info = DatasetInfo.from_directory(self._cache_dir)
                 else:  # dir exists but no data, remove the empty dir as data aren't available anymore
                     logger.warning(

@@ -503,35 +501,6 @@ def update_hash_with_config_parameters(hash: str, config_parameters: dict) -> str:
         if os.path.isdir(legacy_cache_dir):
             return legacy_relative_data_dir

-    @classmethod
-    def get_all_exported_dataset_infos(cls) -> DatasetInfosDict:
-        """Empty dict if doesn't exist
-
-        Example:
-
-        ```py
-        >>> from datasets import load_dataset_builder
-        >>> ds_builder = load_dataset_builder('vivos')
-        >>> ds_builder.get_all_exported_dataset_infos()
-        {'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value('string'), 'path': Value('string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value('string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
-        ```
-        """
-        return DatasetInfosDict.from_directory(cls.get_imported_module_dir())
-
-    def get_exported_dataset_info(self) -> DatasetInfo:
-        """Empty `DatasetInfo` if doesn't exist
-
-        Example:
-
-        ```py
-        >>> from datasets import load_dataset_builder
-        >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
-        >>> ds_builder.get_exported_dataset_info()
-        DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value('string'), 'path': Value('string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value('string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)
-        ```
-        """
-        return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo())
-
     def _create_builder_config(
         self, config_name=None, custom_features=None, **config_kwargs
     ) -> tuple[BuilderConfig, str]:
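
With `get_exported_dataset_info()` and `get_all_exported_dataset_infos()` removed, a builder's metadata is now prefilled from `self._info()` alone. Below is a minimal sketch of reading that metadata through the builder's `info` attribute; the dataset id is reused from the removed docstring, and treating `.info` as the replacement accessor is an assumption, not something this diff states.

```py
from datasets import load_dataset_builder

# The constructor above prefills `info` from `self._info()` and then sets
# builder_name / dataset_name / config_name on it.
ds_builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
print(ds_builder.info.features)     # feature schema declared by _info()
print(ds_builder.info.config_name)  # filled in during __init__
```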

src/datasets/config.py

Lines changed: 3 additions & 3 deletions

@@ -51,7 +51,7 @@
 if TORCH_AVAILABLE:
     try:
         TORCH_VERSION = version.parse(importlib.metadata.version("torch"))
-        logger.info(f"PyTorch version {TORCH_VERSION} available.")
+        logger.debug(f"PyTorch version {TORCH_VERSION} available.")
     except importlib.metadata.PackageNotFoundError:
         pass
 else:

@@ -63,7 +63,7 @@
 if POLARS_AVAILABLE:
     try:
         POLARS_VERSION = version.parse(importlib.metadata.version("polars"))
-        logger.info(f"Polars version {POLARS_VERSION} available.")
+        logger.debug(f"Polars version {POLARS_VERSION} available.")
     except importlib.metadata.PackageNotFoundError:
         pass

@@ -74,7 +74,7 @@
 if DUCKDB_AVAILABLE:
     try:
         DUCKDB_VERSION = version.parse(importlib.metadata.version("duckdb"))
-        logger.info(f"Duckdb version {DUCKDB_VERSION} available.")
+        logger.debug(f"Duckdb version {DUCKDB_VERSION} available.")
     except importlib.metadata.PackageNotFoundError:
         pass

src/datasets/features/features.py

Lines changed: 10 additions & 6 deletions

@@ -1160,7 +1160,7 @@ def _load_names_from_file(names_filepath):
         return [name.strip() for name in f.read().split("\n") if name.strip()]  # Filter empty names


-def Sequence(feature, length=-1):
+class Sequence:
     """
     A `Sequence` is a utility that automatically converts internal dictionary feature into a dictionary of
     lists. This behavior is implemented to have a compatibility layer with the TensorFlow Datasets library but may be

@@ -1179,14 +1179,18 @@ def Sequence(feature, length=-1):
     which are converted to `dict` of lists of sub-features for compatibility with TFDS.

     """
-    if isinstance(feature, dict):
-        return {key: List(value, length=length) for key, value in feature.items()}
-    else:
-        return List(feature, length=length)
+
+    def __new__(cls, feature=None, length=-1, **kwargs):
+        # useful to still get isinstance(Sequence(Value("int64")), Sequence)
+        if isinstance(feature, dict):
+            out = {key: List(value, length=length, **kwargs) for key, value in feature.items()}
+        else:
+            out = super().__new__(List)
+        return out


 @dataclass(repr=False)
-class List:
+class List(Sequence):
     """Feature type for large list data composed of child feature data type.

     It is backed by `pyarrow.ListType`, which uses 32-bit offsets or a fixed length.
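
A short sketch of the behavior the `__new__` override above is meant to preserve: `Sequence(...)` still builds a `List` (or a dict of `List` values for dict features), and `isinstance(..., Sequence)` keeps working because `List` now subclasses `Sequence`. `List` is imported from the module patched in this diff; the top-level `Sequence`/`Value` imports are the usual public API.

```py
from datasets import Sequence, Value
from datasets.features.features import List  # defined in the file patched above

seq = Sequence(Value("int64"))
print(type(seq) is List)          # True: __new__ hands back a List instance
print(isinstance(seq, Sequence))  # True: backward compat via List(Sequence)

# A dict feature still expands into a dict of List, as the old factory did
nested = Sequence({"token": Value("string")})
print(type(nested["token"]) is List)  # True
```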

src/datasets/info.py

Lines changed: 2 additions & 2 deletions

@@ -271,7 +271,7 @@ def from_directory(cls, dataset_info_dir: str, storage_options: Optional[dict] = None):
         """
         fs: fsspec.AbstractFileSystem
         fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
-        logger.info(f"Loading Dataset info from {dataset_info_dir}")
+        logger.debug(f"Loading Dataset info from {dataset_info_dir}")
         if not dataset_info_dir:
             raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.")
         with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "r", encoding="utf-8") as f:

@@ -352,7 +352,7 @@ def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False):

     @classmethod
     def from_directory(cls, dataset_infos_dir) -> "DatasetInfosDict":
-        logger.info(f"Loading Dataset Infos from {dataset_infos_dir}")
+        logger.debug(f"Loading Dataset Infos from {dataset_infos_dir}")
         # Load the info from the YAML part of README.md
         if os.path.exists(os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)):
             dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / config.REPOCARD_FILENAME).data
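
Several messages in this commit are demoted from `logger.info` to `logger.debug` (in config.py, builder.py, and info.py). A small sketch, assuming the library's standard verbosity helpers are unchanged by this commit, of how to surface them again when debugging:

```py
import datasets
from datasets.utils.logging import set_verbosity_debug

# With DEBUG verbosity, the demoted load-time messages ("Loading Dataset Infos
# from ...", "Overwrite dataset info from restored data version if exists.")
# are emitted again. The import-time messages in config.py would need the
# verbosity set before `import datasets`, e.g. via the DATASETS_VERBOSITY
# environment variable.
set_verbosity_debug()
ds_builder = datasets.load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
```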
