Skip to content

Commit 6923eda

Browse files
authored
update docs and docstrings (#7641)
1 parent da59d68 commit 6923eda

File tree

12 files changed

+94
-94
lines changed

12 files changed

+94
-94
lines changed

docs/source/about_dataset_features.mdx

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@ Let's have a look at the features of the MRPC dataset from the GLUE benchmark:
1010
>>> from datasets import load_dataset
1111
>>> dataset = load_dataset('nyu-mll/glue', 'mrpc', split='train')
1212
>>> dataset.features
13-
{'idx': Value(dtype='int32'),
13+
{'idx': Value('int32'),
1414
'label': ClassLabel(names=['not_equivalent', 'equivalent']),
15-
'sentence1': Value(dtype='string'),
16-
'sentence2': Value(dtype='string'),
15+
'sentence1': Value('string'),
16+
'sentence2': Value('string'),
1717
}
1818
```
1919

@@ -38,12 +38,12 @@ If your data type contains a list of objects, then you want to use the [`List`]
3838
>>> from datasets import load_dataset
3939
>>> dataset = load_dataset('rajpurkar/squad', split='train')
4040
>>> dataset.features
41-
{'id': Value(dtype='string'),
42-
'title': Value(dtype='string'),
43-
'context': Value(dtype='string'),
44-
'question': Value(dtype='string'),
45-
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
46-
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
41+
{'id': Value('string'),
42+
'title': Value('string'),
43+
'context': Value('string'),
44+
'question': Value('string'),
45+
'answers': {'text': List(Value('string')),
46+
'answer_start': List(Value('int32'))}}
4747
```
4848

4949
The `answers` field is constructed using the dict of features and contains two subfields, `text` and `answer_start`, which are lists of `string` and `int32`, respectively.

docs/source/load_hub.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ Movie Review Dataset. This is a dataset containing 5,331 positive and 5,331 n
2121
# Inspect dataset features
2222
>>> ds_builder.info.features
2323
{'label': ClassLabel(names=['neg', 'pos']),
24-
'text': Value(dtype='string')}
24+
'text': Value('string')}
2525
```
2626

2727
If you're happy with the dataset, then load it with [`load_dataset`]:

docs/source/loading.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,6 @@ Now when you look at your dataset features, you can see it uses the custom label
417417

418418
```py
419419
>>> dataset['train'].features
420-
{'text': Value(dtype='string'),
420+
{'text': Value('string'),
421421
'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'])}
422422
```

docs/source/process.mdx

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -223,21 +223,21 @@ The [`~Dataset.cast`] function transforms the feature type of one or more column
223223

224224
```py
225225
>>> dataset.features
226-
{'sentence1': Value(dtype='string'),
227-
'sentence2': Value(dtype='string'),
226+
{'sentence1': Value('string'),
227+
'sentence2': Value('string'),
228228
'label': ClassLabel(names=['not_equivalent', 'equivalent']),
229-
'idx': Value(dtype='int32')}
229+
'idx': Value('int32')}
230230

231231
>>> from datasets import ClassLabel, Value
232232
>>> new_features = dataset.features.copy()
233233
>>> new_features["label"] = ClassLabel(names=["negative", "positive"])
234234
>>> new_features["idx"] = Value("int64")
235235
>>> dataset = dataset.cast(new_features)
236236
>>> dataset.features
237-
{'sentence1': Value(dtype='string'),
238-
'sentence2': Value(dtype='string'),
237+
{'sentence1': Value('string'),
238+
'sentence2': Value('string'),
239239
'label': ClassLabel(names=['negative', 'positive']),
240-
'idx': Value(dtype='int64')}
240+
'idx': Value('int64')}
241241
```
242242

243243
<Tip>
@@ -265,12 +265,12 @@ Sometimes a column can be a nested structure of several types. Take a look at th
265265
>>> from datasets import load_dataset
266266
>>> dataset = load_dataset("rajpurkar/squad", split="train")
267267
>>> dataset.features
268-
{'id': Value(dtype='string'),
269-
'title': Value(dtype='string'),
270-
'context': Value(dtype='string'),
271-
'question': Value(dtype='string'),
272-
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
273-
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
268+
{'id': Value('string'),
269+
'title': Value('string'),
270+
'context': Value('string'),
271+
'question': Value('string'),
272+
'answers': {'text': List(Value('string')),
273+
'answer_start': List(Value('int32'))}}
274274
```
275275

276276
The `answers` field contains two subfields: `text` and `answer_start`. Use the [`~Dataset.flatten`] function to extract the subfields into their own separate columns:

docs/source/stream.mdx

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -241,21 +241,21 @@ When you need to remove one or more columns, give [`IterableDataset.remove_colum
241241
>>> from datasets import load_dataset
242242
>>> dataset = load_dataset('nyu-mll/glue', 'mrpc', split='train', streaming=True)
243243
>>> dataset.features
244-
{'sentence1': Value(dtype='string'),
245-
'sentence2': Value(dtype='string'),
244+
{'sentence1': Value('string'),
245+
'sentence2': Value('string'),
246246
'label': ClassLabel(names=['not_equivalent', 'equivalent']),
247-
'idx': Value(dtype='int32')}
247+
'idx': Value('int32')}
248248

249249
>>> from datasets import ClassLabel, Value
250250
>>> new_features = dataset.features.copy()
251251
>>> new_features["label"] = ClassLabel(names=['negative', 'positive'])
252252
>>> new_features["idx"] = Value('int64')
253253
>>> dataset = dataset.cast(new_features)
254254
>>> dataset.features
255-
{'sentence1': Value(dtype='string'),
256-
'sentence2': Value(dtype='string'),
255+
{'sentence1': Value('string'),
256+
'sentence2': Value('string'),
257257
'label': ClassLabel(names=['negative', 'positive']),
258-
'idx': Value(dtype='int64')}
258+
'idx': Value('int64')}
259259
```
260260

261261
<Tip>

src/datasets/arrow_dataset.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1957,14 +1957,14 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data
19571957
>>> from datasets import load_dataset
19581958
>>> ds = load_dataset("boolq", split="validation")
19591959
>>> ds.features
1960-
{'answer': Value(dtype='bool'),
1961-
'passage': Value(dtype='string'),
1962-
'question': Value(dtype='string')}
1960+
{'answer': Value('bool'),
1961+
'passage': Value('string'),
1962+
'question': Value('string')}
19631963
>>> ds = ds.class_encode_column('answer')
19641964
>>> ds.features
19651965
{'answer': ClassLabel(num_classes=2, names=['False', 'True']),
1966-
'passage': Value(dtype='string'),
1967-
'question': Value(dtype='string')}
1966+
'passage': Value('string'),
1967+
'question': Value('string')}
19681968
```
19691969
"""
19701970
# Sanity checks
@@ -2035,12 +2035,12 @@ def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Datas
20352035
>>> from datasets import load_dataset
20362036
>>> ds = load_dataset("rajpurkar/squad", split="train")
20372037
>>> ds.features
2038-
{'id': Value(dtype='string'),
2039-
'title': Value(dtype='string'),
2040-
'context': Value(dtype='string'),
2041-
'question': Value(dtype='string'),
2042-
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
2043-
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
2038+
{'id': Value('string'),
2039+
'title': Value('string'),
2040+
'context': Value('string'),
2041+
'question': Value('string'),
2042+
'answers': {'text': List(Value('string')),
2043+
'answer_start': List(Value('int32'))}}
20442044
>>> ds.flatten()
20452045
Dataset({
20462046
features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
@@ -2109,14 +2109,14 @@ def cast(
21092109
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
21102110
>>> ds.features
21112111
{'label': ClassLabel(names=['neg', 'pos']),
2112-
'text': Value(dtype='string')}
2112+
'text': Value('string')}
21132113
>>> new_features = ds.features.copy()
21142114
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
21152115
>>> new_features['text'] = Value('large_string')
21162116
>>> ds = ds.cast(new_features)
21172117
>>> ds.features
21182118
{'label': ClassLabel(names=['bad', 'good']),
2119-
'text': Value(dtype='large_string')}
2119+
'text': Value('large_string')}
21202120
```
21212121
"""
21222122
if sorted(features) != sorted(self._data.column_names):
@@ -2168,11 +2168,11 @@ def cast_column(self, column: str, feature: FeatureType, new_fingerprint: Option
21682168
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
21692169
>>> ds.features
21702170
{'label': ClassLabel(names=['neg', 'pos']),
2171-
'text': Value(dtype='string')}
2171+
'text': Value('string')}
21722172
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
21732173
>>> ds.features
21742174
{'label': ClassLabel(names=['bad', 'good']),
2175-
'text': Value(dtype='string')}
2175+
'text': Value('string')}
21762176
```
21772177
"""
21782178
feature = _fix_for_backward_compatible_features(feature)

src/datasets/builder.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -513,7 +513,7 @@ def get_all_exported_dataset_infos(cls) -> DatasetInfosDict:
513513
>>> from datasets import load_dataset_builder
514514
>>> ds_builder = load_dataset_builder('vivos')
515515
>>> ds_builder.get_all_exported_dataset_infos()
516-
{'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string'), 'path': Value(dtype='string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
516+
{'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value('string'), 'path': Value('string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value('string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
517517
```
518518
"""
519519
return DatasetInfosDict.from_directory(cls.get_imported_module_dir())
@@ -527,7 +527,7 @@ def get_exported_dataset_info(self) -> DatasetInfo:
527527
>>> from datasets import load_dataset_builder
528528
>>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
529529
>>> ds_builder.get_exported_dataset_info()
530-
DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string'), 'path': Value(dtype='string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)
530+
DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value('string'), 'path': Value('string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value('string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)
531531
```
532532
"""
533533
return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo())

src/datasets/dataset_dict.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -201,12 +201,12 @@ def flatten(self, max_depth=16) -> "DatasetDict":
201201
>>> from datasets import load_dataset
202202
>>> ds = load_dataset("rajpurkar/squad")
203203
>>> ds["train"].features
204-
{'id': Value(dtype='string'),
205-
'title': Value(dtype='string'),
206-
'context': Value(dtype='string'),
207-
'question': Value(dtype='string'),
208-
'answers.text': List(feature=Value(dtype='string'), length=-1),
209-
'answers.answer_start': List(feature=Value(dtype='int32'), length=-1)}
204+
{'id': Value('string'),
205+
'title': Value('string'),
206+
'context': Value('string'),
207+
'question': Value('string'),
208+
'answers.text': List(Value('string')),
209+
'answers.answer_start': List(Value('int32'))}
210210
>>> ds.flatten()
211211
DatasetDict({
212212
train: Dataset({
@@ -290,14 +290,14 @@ def cast(self, features: Features) -> "DatasetDict":
290290
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
291291
>>> ds["train"].features
292292
{'label': ClassLabel(names=['neg', 'pos']),
293-
'text': Value(dtype='string')}
293+
'text': Value('string')}
294294
>>> new_features = ds["train"].features.copy()
295295
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
296296
>>> new_features['text'] = Value('large_string')
297297
>>> ds = ds.cast(new_features)
298298
>>> ds["train"].features
299299
{'label': ClassLabel(names=['bad', 'good']),
300-
'text': Value(dtype='large_string')}
300+
'text': Value('large_string')}
301301
```
302302
"""
303303
self._check_values_type()
@@ -322,11 +322,11 @@ def cast_column(self, column: str, feature) -> "DatasetDict":
322322
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
323323
>>> ds["train"].features
324324
{'label': ClassLabel(names=['neg', 'pos']),
325-
'text': Value(dtype='string')}
325+
'text': Value('string')}
326326
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
327327
>>> ds["train"].features
328328
{'label': ClassLabel(names=['bad', 'good']),
329-
'text': Value(dtype='string')}
329+
'text': Value('string')}
330330
```
331331
"""
332332
self._check_values_type()
@@ -513,14 +513,14 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data
513513
>>> from datasets import load_dataset
514514
>>> ds = load_dataset("boolq")
515515
>>> ds["train"].features
516-
{'answer': Value(dtype='bool'),
517-
'passage': Value(dtype='string'),
518-
'question': Value(dtype='string')}
516+
{'answer': Value('bool'),
517+
'passage': Value('string'),
518+
'question': Value('string')}
519519
>>> ds = ds.class_encode_column("answer")
520520
>>> ds["train"].features
521521
{'answer': ClassLabel(num_classes=2, names=['False', 'True']),
522-
'passage': Value(dtype='string'),
523-
'question': Value(dtype='string')}
522+
'passage': Value('string'),
523+
'question': Value('string')}
524524
```
525525
"""
526526
self._check_values_type()
@@ -2381,11 +2381,11 @@ def cast_column(self, column: str, feature: FeatureType) -> "IterableDatasetDict
23812381
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
23822382
>>> ds["train"].features
23832383
{'label': ClassLabel(names=['neg', 'pos']),
2384-
'text': Value(dtype='string')}
2384+
'text': Value('string')}
23852385
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
23862386
>>> ds["train"].features
23872387
{'label': ClassLabel(names=['bad', 'good']),
2388-
'text': Value(dtype='string')}
2388+
'text': Value('string')}
23892389
```
23902390
"""
23912391
return IterableDatasetDict(
@@ -2417,14 +2417,14 @@ def cast(
24172417
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
24182418
>>> ds["train"].features
24192419
{'label': ClassLabel(names=['neg', 'pos']),
2420-
'text': Value(dtype='string')}
2420+
'text': Value('string')}
24212421
>>> new_features = ds["train"].features.copy()
24222422
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
24232423
>>> new_features['text'] = Value('large_string')
24242424
>>> ds = ds.cast(new_features)
24252425
>>> ds["train"].features
24262426
{'label': ClassLabel(names=['bad', 'good']),
2427-
'text': Value(dtype='large_string')}
2427+
'text': Value('large_string')}
24282428
```
24292429
"""
24302430
return IterableDatasetDict({k: dataset.cast(features=features) for k, dataset in self.items()})

0 commit comments

Comments
 (0)