|
| 1 | +# coding=utf-8 |
| 2 | +# Copyright 2020 The TensorFlow Datasets Authors. |
| 3 | +# |
| 4 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +# you may not use this file except in compliance with the License. |
| 6 | +# You may obtain a copy of the License at |
| 7 | +# |
| 8 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +# |
| 10 | +# Unless required by applicable law or agreed to in writing, software |
| 11 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +# See the License for the specific language governing permissions and |
| 14 | +# limitations under the License. |
| 15 | + |
| 16 | +# Lint as: python3 |
| 17 | +"""BLiMP dataset with minimal pairs of grammatical phenomena in English.""" |
| 18 | + |
| 19 | +from __future__ import absolute_import |
| 20 | +from __future__ import division |
| 21 | +from __future__ import print_function |
| 22 | + |
| 23 | +import json |
| 24 | +import os |
| 25 | +import tensorflow.compat.v2 as tf |
| 26 | +import tensorflow_datasets.public_api as tfds |
| 27 | + |
| 28 | +_CITATION = """ |
| 29 | +@article{warstadt2019blimp, |
| 30 | + title={BLiMP: A Benchmark of Linguistic Minimal Pairs for English}, |
| 31 | + author={Warstadt, Alex and Parrish, Alicia and Liu, Haokun and Mohananey, Anhad and Peng, Wei, and Wang, Sheng-Fu and Bowman, Samuel R}, |
| 32 | + journal={arXiv preprint arXiv:1912.00582}, |
| 33 | + year={2019} |
| 34 | +} |
| 35 | +""" |
| 36 | + |
| 37 | +_DESCRIPTION = """ |
| 38 | +BLiMP is a challenge set for evaluating what language models (LMs) know about |
| 39 | +major grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each |
| 40 | +containing 1000 minimal pairs isolating specific contrasts in syntax, |
| 41 | +morphology, or semantics. The data is automatically generated according to |
| 42 | +expert-crafted grammars. |
| 43 | +""" |
| 44 | + |
| 45 | +_PROJECT_URL = 'https://github.com/alexwarstadt/blimp/tree/master/' |
| 46 | +_DOWNLOAD_URL = 'https://raw.githubusercontent.com/alexwarstadt/blimp/master/' |
| 47 | + |
| 48 | + |
class BlimpConfig(tfds.core.BuilderConfig):
  """BuilderConfig selecting a single BLiMP linguistic paradigm."""

  @tfds.core.disallow_positional_args
  def __init__(self, paradigm_uid, **kwargs):
    """Constructs a BlimpConfig.

    Args:
      paradigm_uid: string, UID of the linguistic paradigm
      **kwargs: keyword arguments forwarded to super.
    """
    # The paradigm UID doubles as the config name; the per-config description
    # is the shared dataset description plus a paradigm-specific sentence.
    super(BlimpConfig, self).__init__(
        name=paradigm_uid,
        description='{}This configuration includes the paradigm {}.'.format(
            _DESCRIPTION, paradigm_uid),
        version=tfds.core.Version('0.1.0'),
        **kwargs)
| 70 | + |
| 71 | + |
class Blimp(tfds.core.GeneratorBasedBuilder):
  """Minimal grammatical and ungrammatical pairs of 67 linguistic paradigms."""

  # Every paradigm published in the BLiMP repository; each becomes one
  # BuilderConfig whose name matches the paradigm's .jsonl file.
  all_paradigms = [
      'adjunct_island',
      'anaphor_gender_agreement',
      'anaphor_number_agreement',
      'animate_subject_passive',
      'animate_subject_trans',
      'causative',
      'complex_NP_island',
      'coordinate_structure_constraint_complex_left_branch',
      'coordinate_structure_constraint_object_extraction',
      'determiner_noun_agreement_1',
      'determiner_noun_agreement_2',
      'determiner_noun_agreement_irregular_1',
      'determiner_noun_agreement_irregular_2',
      'determiner_noun_agreement_with_adj_2',
      'determiner_noun_agreement_with_adj_irregular_1',
      'determiner_noun_agreement_with_adj_irregular_2',
      'determiner_noun_agreement_with_adjective_1',
      'distractor_agreement_relational_noun',
      'distractor_agreement_relative_clause',
      'drop_argument',
      'ellipsis_n_bar_1',
      'ellipsis_n_bar_2',
      'existential_there_object_raising',
      'existential_there_quantifiers_1',
      'existential_there_quantifiers_2',
      'existential_there_subject_raising',
      'expletive_it_object_raising',
      'inchoative',
      'intransitive',
      'irregular_past_participle_adjectives',
      'irregular_past_participle_verbs',
      'irregular_plural_subject_verb_agreement_1',
      'irregular_plural_subject_verb_agreement_2',
      'left_branch_island_echo_question',
      'left_branch_island_simple_question',
      'matrix_question_npi_licensor_present',
      'npi_present_1',
      'npi_present_2',
      'only_npi_licensor_present',
      'only_npi_scope',
      'passive_1',
      'passive_2',
      'principle_A_c_command',
      'principle_A_case_1',
      'principle_A_case_2',
      'principle_A_domain_1',
      'principle_A_domain_2',
      'principle_A_domain_3',
      'principle_A_reconstruction',
      'regular_plural_subject_verb_agreement_1',
      'regular_plural_subject_verb_agreement_2',
      'sentential_negation_npi_licensor_present',
      'sentential_negation_npi_scope',
      'sentential_subject_island',
      'superlative_quantifiers_1',
      'superlative_quantifiers_2',
      'tough_vs_raising_1',
      'tough_vs_raising_2',
      'transitive',
      'wh_island',
      'wh_questions_object_gap',
      'wh_questions_subject_gap',
      'wh_questions_subject_gap_long_distance',
      'wh_vs_that_no_gap',
      'wh_vs_that_no_gap_long_distance',
      'wh_vs_that_with_gap',
      'wh_vs_that_with_gap_long_distance',
  ]

  BUILDER_CONFIGS = [
      BlimpConfig(paradigm_uid=paradigm) for paradigm in all_paradigms
  ]

  def _info(self):
    """Returns the DatasetInfo describing features, homepage and citation."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'sentence_good': tfds.features.Text(),
            'sentence_bad': tfds.features.Text(),
            'field': tfds.features.Text(),
            'linguistics_term': tfds.features.Text(),
            'UID': tfds.features.Text(),
            'simple_LM_method': tf.bool,
            'one_prefix_method': tf.bool,
            'two_prefix_method': tf.bool,
            'lexically_identical': tf.bool,
            'pair_id': tf.int32,
        }),
        # Minimal pairs are for LM evaluation; there is no (input, label)
        # supervised structure.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage=_PROJECT_URL,
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Downloads the paradigm's jsonl file and returns SplitGenerators.

    Args:
      dl_manager: `tfds.download.DownloadManager` used to fetch the file.

    Returns:
      A single TRAIN split generator (BLiMP provides no other splits).
    """
    cfg = self.builder_config
    # Join URL components with explicit '/' instead of os.path.join:
    # os.path.join uses backslashes on Windows, which would corrupt the URL.
    download_urls = {
        cfg.name: '{}data/{}.jsonl'.format(_DOWNLOAD_URL, cfg.name)
    }

    downloaded_files = dl_manager.download_and_extract(download_urls)

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={'filepath': downloaded_files[cfg.name]})
    ]

  def _generate_examples(self, filepath):
    """Yields (key, example) pairs from one paradigm's jsonl file.

    Args:
      filepath: path to the downloaded .jsonl file; one JSON object per line.

    Yields:
      Tuples of (unique key 'UID_pairID', feature dict matching _info()).
    """
    with tf.io.gfile.GFile(filepath, 'rb') as f:
      for line in f:
        line_dict = json.loads(line)
        # NOTE(review): assumes 'pairID' is a string in the source jsonl
        # (string concatenation below) while the 'pair_id' feature is int32
        # — TODO confirm against the published data files.
        id_ = line_dict['UID'] + '_' + line_dict['pairID']
        feats = {
            'sentence_good': line_dict['sentence_good'],
            'sentence_bad': line_dict['sentence_bad'],
            'field': line_dict['field'],
            'linguistics_term': line_dict['linguistics_term'],
            'UID': line_dict['UID'],
            'simple_LM_method': line_dict['simple_LM_method'],
            'one_prefix_method': line_dict['one_prefix_method'],
            'two_prefix_method': line_dict['two_prefix_method'],
            'lexically_identical': line_dict['lexically_identical'],
            'pair_id': line_dict['pairID'],
        }
        yield id_, feats
0 commit comments