Skip to content

Commit 7af7ace

Browse files
Fix misleading add_column() usage example in docstring (#7648)
* Fix misleading add_column() usage example in docstring

  This PR fixes the usage example in the Dataset.add_column() docstring, which previously implied that add_column() modifies the dataset in-place.

  Why: The method returns a new dataset with the additional column, and users must assign the result to a variable to preserve the change.

  Fixes #7611

* Fix misleading docstring examples for select_columns, select, filter, shard, and flatten

  Fix misleading docstring examples for select_columns, select, filter, shard, and flatten

  - Updated usage examples to show correct behavior (methods return new datasets)
  - Added inline comments to clarify that methods do not modify in-place
  - Fixes follow-up from issue #7611 and @lhoestq’s review on PR #7648

* Apply suggestions from code review

---------

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
1 parent 9dfa288 commit 7af7ace

File tree

1 file changed

+13
-6
lines changed

1 file changed

+13
-6
lines changed

src/datasets/arrow_dataset.py

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2041,7 +2041,8 @@ def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Datas
20412041
'question': Value('string'),
20422042
'answers': {'text': List(Value('string')),
20432043
'answer_start': List(Value('int32'))}}
2044-
>>> ds.flatten()
2044+
>>> ds = ds.flatten()
2045+
>>> ds
20452046
Dataset({
20462047
features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
20472048
num_rows: 87599
@@ -2399,7 +2400,8 @@ def select_columns(self, column_names: Union[str, list[str]], new_fingerprint: O
23992400
```py
24002401
>>> from datasets import load_dataset
24012402
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
2402-
>>> ds.select_columns(['text'])
2403+
>>> ds = ds.select_columns(['text'])
2404+
>>> ds
24032405
Dataset({
24042406
features: ['text'],
24052407
num_rows: 1066
@@ -3869,12 +3871,14 @@ def filter(
38693871
```py
38703872
>>> from datasets import load_dataset
38713873
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
3872-
>>> ds.filter(lambda x: x["label"] == 1)
3874+
>>> ds = ds.filter(lambda x: x["label"] == 1)
3875+
>>> ds
38733876
Dataset({
38743877
features: ['text', 'label'],
38753878
num_rows: 533
38763879
})
38773880
```
3881+
38783882
"""
38793883
if len(self.list_indexes()) > 0:
38803884
raise DatasetTransformationNotAllowedError(
@@ -4041,7 +4045,8 @@ def select(
40414045
```py
40424046
>>> from datasets import load_dataset
40434047
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
4044-
>>> ds.select(range(4))
4048+
>>> ds = ds.select(range(4))
4049+
>>> ds
40454050
Dataset({
40464051
features: ['text', 'label'],
40474052
num_rows: 4
@@ -4936,7 +4941,8 @@ def shard(
49364941
features: ['text', 'label'],
49374942
num_rows: 1066
49384943
})
4939-
>>> ds.shard(num_shards=2, index=0)
4944+
>>> ds = ds.shard(num_shards=2, index=0)
4945+
>>> ds
49404946
Dataset({
49414947
features: ['text', 'label'],
49424948
num_rows: 533
@@ -6005,7 +6011,8 @@ def add_column(
60056011
>>> from datasets import load_dataset
60066012
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
60076013
>>> more_text = ds["text"]
6008-
>>> ds.add_column(name="text_2", column=more_text)
6014+
>>> ds = ds.add_column(name="text_2", column=more_text)
6015+
>>> ds
60096016
Dataset({
60106017
features: ['text', 'label', 'text_2'],
60116018
num_rows: 1066

0 commit comments

Comments
 (0)