Skip to content

Commit ed089da

Browse files
TensorFlow Datasets Team and copybara-github
authored and committed
Add BLiMP dataset.
PiperOrigin-RevId: 301144697
1 parent 669a730 commit ed089da

File tree

5 files changed

+318
-0
lines changed

5 files changed

+318
-0
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{"sentence_good": "James is pleasant to flee from.", "sentence_bad": "James is apt to flee from.", "field": "syntax_semantics", "linguistics_term": "control_raising", "UID": "tough_vs_raising_1", "simple_LM_method": true, "one_prefix_method": false, "two_prefix_method": false, "lexically_identical": false, "pairID": "0"}
2+
{"sentence_good": "Samuel's lawyer was easy to reference.", "sentence_bad": "Samuel's lawyer was certain to reference.", "field": "syntax_semantics", "linguistics_term": "control_raising", "UID": "tough_vs_raising_1", "simple_LM_method": true, "one_prefix_method": false, "two_prefix_method": false, "lexically_identical": false, "pairID": "1"}
3+
{"sentence_good": "Benjamin's tutor was easy to boast about.", "sentence_bad": "Benjamin's tutor was certain to boast about.", "field": "syntax_semantics", "linguistics_term": "control_raising", "UID": "tough_vs_raising_1", "simple_LM_method": true, "one_prefix_method": false, "two_prefix_method": false, "lexically_identical": false, "pairID": "2"}
4+
{"sentence_good": "Laura isn't interesting to cure.", "sentence_bad": "Laura isn't soon to cure.", "field": "syntax_semantics", "linguistics_term": "control_raising", "UID": "tough_vs_raising_1", "simple_LM_method": true, "one_prefix_method": false, "two_prefix_method": false, "lexically_identical": false, "pairID": "3"}
5+
{"sentence_good": "Some jacket wasn't ready to wear.", "sentence_bad": "Some jacket wasn't bound to wear.", "field": "syntax_semantics", "linguistics_term": "control_raising", "UID": "tough_vs_raising_1", "simple_LM_method": true, "one_prefix_method": false, "two_prefix_method": false, "lexically_identical": false, "pairID": "4"}

tensorflow_datasets/text/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
# Lint as: python3
1717
"""Text datasets."""
1818

19+
from tensorflow_datasets.text.blimp import Blimp
1920
from tensorflow_datasets.text.c4 import C4
2021
from tensorflow_datasets.text.cfq import CFQ
2122
from tensorflow_datasets.text.civil_comments import CivilComments

tensorflow_datasets/text/blimp.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Lint as: python3
17+
"""BLiMP dataset with minimal pairs of grammatical phenomena in English."""
18+
19+
from __future__ import absolute_import
20+
from __future__ import division
21+
from __future__ import print_function
22+
23+
import json
24+
import os
25+
import tensorflow.compat.v2 as tf
26+
import tensorflow_datasets.public_api as tfds
27+
28+
_CITATION = """
29+
@article{warstadt2019blimp,
30+
title={BLiMP: A Benchmark of Linguistic Minimal Pairs for English},
31+
author={Warstadt, Alex and Parrish, Alicia and Liu, Haokun and Mohananey, Anhad and Peng, Wei, and Wang, Sheng-Fu and Bowman, Samuel R},
32+
journal={arXiv preprint arXiv:1912.00582},
33+
year={2019}
34+
}
35+
"""
36+
37+
_DESCRIPTION = """
38+
BLiMP is a challenge set for evaluating what language models (LMs) know about
39+
major grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each
40+
containing 1000 minimal pairs isolating specific contrasts in syntax,
41+
morphology, or semantics. The data is automatically generated according to
42+
expert-crafted grammars.
43+
"""
44+
45+
_PROJECT_URL = 'https://github.com/alexwarstadt/blimp/tree/master/'
46+
_DOWNLOAD_URL = 'https://raw.githubusercontent.com/alexwarstadt/blimp/master/'
47+
48+
49+
class BlimpConfig(tfds.core.BuilderConfig):
  """BuilderConfig for a single BLiMP linguistic paradigm."""

  @tfds.core.disallow_positional_args
  def __init__(self, paradigm_uid, **kwargs):
    """Constructs a BlimpConfig.

    The paradigm UID doubles as the config name, and the shared dataset
    description is extended with a sentence naming the paradigm.

    Args:
      paradigm_uid: string, UID of the linguistic paradigm
      **kwargs: keyword arguments forwarded to super.
    """
    uid = paradigm_uid
    super(BlimpConfig, self).__init__(
        name=uid,
        description=_DESCRIPTION +
        ('This configuration includes the paradigm {}.').format(uid),
        version=tfds.core.Version('0.1.0'),
        **kwargs)
70+
71+
72+
class Blimp(tfds.core.GeneratorBasedBuilder):
  """Minimal grammatical and ungrammatical pairs of 67 linguistic paradigms."""

  # UIDs of every paradigm shipped in the upstream repository; one builder
  # config (and one downloadable .jsonl file) per entry.
  all_paradigms = [
      'adjunct_island',
      'anaphor_gender_agreement',
      'anaphor_number_agreement',
      'animate_subject_passive',
      'animate_subject_trans',
      'causative',
      'complex_NP_island',
      'coordinate_structure_constraint_complex_left_branch',
      'coordinate_structure_constraint_object_extraction',
      'determiner_noun_agreement_1',
      'determiner_noun_agreement_2',
      'determiner_noun_agreement_irregular_1',
      'determiner_noun_agreement_irregular_2',
      'determiner_noun_agreement_with_adj_2',
      'determiner_noun_agreement_with_adj_irregular_1',
      'determiner_noun_agreement_with_adj_irregular_2',
      'determiner_noun_agreement_with_adjective_1',
      'distractor_agreement_relational_noun',
      'distractor_agreement_relative_clause',
      'drop_argument',
      'ellipsis_n_bar_1',
      'ellipsis_n_bar_2',
      'existential_there_object_raising',
      'existential_there_quantifiers_1',
      'existential_there_quantifiers_2',
      'existential_there_subject_raising',
      'expletive_it_object_raising',
      'inchoative',
      'intransitive',
      'irregular_past_participle_adjectives',
      'irregular_past_participle_verbs',
      'irregular_plural_subject_verb_agreement_1',
      'irregular_plural_subject_verb_agreement_2',
      'left_branch_island_echo_question',
      'left_branch_island_simple_question',
      'matrix_question_npi_licensor_present',
      'npi_present_1',
      'npi_present_2',
      'only_npi_licensor_present',
      'only_npi_scope',
      'passive_1',
      'passive_2',
      'principle_A_c_command',
      'principle_A_case_1',
      'principle_A_case_2',
      'principle_A_domain_1',
      'principle_A_domain_2',
      'principle_A_domain_3',
      'principle_A_reconstruction',
      'regular_plural_subject_verb_agreement_1',
      'regular_plural_subject_verb_agreement_2',
      'sentential_negation_npi_licensor_present',
      'sentential_negation_npi_scope',
      'sentential_subject_island',
      'superlative_quantifiers_1',
      'superlative_quantifiers_2',
      'tough_vs_raising_1',
      'tough_vs_raising_2',
      'transitive',
      'wh_island',
      'wh_questions_object_gap',
      'wh_questions_subject_gap',
      'wh_questions_subject_gap_long_distance',
      'wh_vs_that_no_gap',
      'wh_vs_that_no_gap_long_distance',
      'wh_vs_that_with_gap',
      'wh_vs_that_with_gap_long_distance',
  ]

  BUILDER_CONFIGS = [
      BlimpConfig(paradigm_uid=paradigm) for paradigm in all_paradigms
  ]

  def _info(self):
    """Returns the dataset metadata (features, homepage, citation)."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'sentence_good': tfds.features.Text(),
            'sentence_bad': tfds.features.Text(),
            'field': tfds.features.Text(),
            'linguistics_term': tfds.features.Text(),
            'UID': tfds.features.Text(),
            'simple_LM_method': tf.bool,
            'one_prefix_method': tf.bool,
            'two_prefix_method': tf.bool,
            'lexically_identical': tf.bool,
            'pair_id': tf.int32,
        }),
        # There is no (input, target) structure; both sentences are inputs.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage=_PROJECT_URL,
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Returns SplitGenerators.

    Each builder config downloads exactly one .jsonl file, named after the
    paradigm UID, and exposes it as a single TRAIN split.

    Args:
      dl_manager: a tfds.download.DownloadManager.
    """
    cfg = self.builder_config
    # Build the raw-file URL with explicit '/' separators. os.path.join is
    # wrong for URLs: it would emit backslashes on Windows.
    download_urls = {
        cfg.name: '{}data/{}.jsonl'.format(_DOWNLOAD_URL, cfg.name)
    }

    downloaded_files = dl_manager.download_and_extract(download_urls)

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={'filepath': downloaded_files[cfg.name]})
    ]

  def _generate_examples(self, filepath):
    """Yields (key, example) pairs from one paradigm's JSONL file.

    Args:
      filepath: path to the downloaded .jsonl file; one JSON object per line.
    """
    with tf.io.gfile.GFile(filepath, 'rb') as f:
      for line in f:
        line_dict = json.loads(line)
        # Keys must be unique within the split: UID is constant per file, so
        # append the per-file pairID.
        id_ = line_dict['UID'] + '_' + line_dict['pairID']
        feats = {
            'sentence_good': line_dict['sentence_good'],
            'sentence_bad': line_dict['sentence_bad'],
            'field': line_dict['field'],
            'linguistics_term': line_dict['linguistics_term'],
            'UID': line_dict['UID'],
            'simple_LM_method': line_dict['simple_LM_method'],
            'one_prefix_method': line_dict['one_prefix_method'],
            'two_prefix_method': line_dict['two_prefix_method'],
            'lexically_identical': line_dict['lexically_identical'],
            # pairID is a JSON *string* (e.g. "0") in the source files; cast
            # it so it fits the tf.int32 'pair_id' feature declared in _info.
            'pair_id': int(line_dict['pairID']),
        }
        yield id_, feats
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Tests for the BLiMP dataset builder (tensorflow_datasets.text.blimp)."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow_datasets import testing
from tensorflow_datasets.text import blimp
24+
25+
26+
class BlimpTest(testing.DatasetBuilderTestCase):
  """Exercises the Blimp builder against the checked-in fake data."""

  DATASET_CLASS = blimp.Blimp

  # Restrict the run to a single paradigm config; the fake data directory
  # only ships a file for this one.
  BUILDER_CONFIG_NAMES_TO_TEST = ["tough_vs_raising_1"]

  # Map the download key to the fake .jsonl file bundled with the test.
  DL_EXTRACT_RESULT = {
      "tough_vs_raising_1": "tough_vs_raising_1.jsonl",
  }

  SPLITS = {
      "train": 5,  # Number of fake train examples.
  }
37+
38+
39+
# Allow running this file directly as a test binary.
if __name__ == "__main__":
  testing.test_main()
41+

0 commit comments

Comments
 (0)