|
69 | 69 | 4 ambulances ambulance NOUN NNS Number=Plur 3 obj 3:obj SpaceAfter=No
|
70 | 70 | """
|
71 | 71 |
|
| 72 | +BLANKS_DATA = """ |
| 73 | +# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0018 |
| 74 | +# text = Guerrillas killed an engineer, Asi Ali, from Tikrit. |
| 75 | +1 Guerrillas _ NOUN NNS Number=Plur 2 nsubj 2:nsubj _ |
| 76 | +2 killed _ VERB VBD Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin 0 root 0:root _ |
| 77 | +3 an a DET DT Definite=Ind|PronType=Art 4 det 4:det _ |
| 78 | +4 engineer _ NOUN NN Number=Sing 2 obj 2:obj SpaceAfter=No |
| 79 | +
|
| 80 | +""".lstrip() |
| 81 | + |
72 | 82 |
|
73 | 83 | def test_load_document():
|
74 | 84 | train_doc = CoNLL.conll2doc(input_str=TRAIN_DATA)
|
75 |
| - data = DataLoader.load_doc(train_doc, caseless=False, evaluation=True) |
| 85 | + data = DataLoader.load_doc(train_doc, caseless=False, skip_blank_lemmas=False, evaluation=True) |
76 | 86 | assert len(data) == 33 # meticulously counted by hand
|
77 | 87 | assert all(len(x) == 3 for x in data)
|
78 | 88 |
|
79 |
| - data = DataLoader.load_doc(train_doc, caseless=False, evaluation=False) |
| 89 | + data = DataLoader.load_doc(train_doc, caseless=False, skip_blank_lemmas=False, evaluation=False) |
80 | 90 | assert len(data) == 33
|
81 | 91 | assert all(len(x) == 3 for x in data)
|
82 | 92 |
|
83 | 93 | def test_load_goeswith():
|
84 | 94 | raw_data = TRAIN_DATA + GOESWITH_DATA
|
85 | 95 | train_doc = CoNLL.conll2doc(input_str=raw_data)
|
86 |
| - data = DataLoader.load_doc(train_doc, caseless=False, evaluation=True) |
| 96 | + data = DataLoader.load_doc(train_doc, caseless=False, skip_blank_lemmas=False, evaluation=True) |
87 | 97 | assert len(data) == 36 # will be the same as in test_load_document with three additional words
|
88 | 98 | assert all(len(x) == 3 for x in data)
|
89 | 99 |
|
90 |
| - data = DataLoader.load_doc(train_doc, caseless=False, evaluation=False) |
| 100 | + data = DataLoader.load_doc(train_doc, caseless=False, skip_blank_lemmas=False, evaluation=False) |
91 | 101 | assert len(data) == 33 # will be the same as in test_load_document, but with the trailing 3 GOESWITH removed
|
92 | 102 | assert all(len(x) == 3 for x in data)
|
93 | 103 |
|
94 | 104 | def test_correct_form():
|
95 | 105 | raw_data = TRAIN_DATA + CORRECT_FORM_DATA
|
96 | 106 | train_doc = CoNLL.conll2doc(input_str=raw_data)
|
97 |
| - data = DataLoader.load_doc(train_doc, caseless=False, evaluation=True) |
| 107 | + data = DataLoader.load_doc(train_doc, caseless=False, skip_blank_lemmas=False, evaluation=True) |
98 | 108 | assert len(data) == 37
|
99 | 109 | # the 'targeting' correction should not be applied if evaluation=True
|
100 | 110 | # when evaluation=False, then the CorrectForms will be applied
|
101 | 111 | assert not any(x[0] == 'targeting' for x in data)
|
102 | 112 |
|
103 |
| - data = DataLoader.load_doc(train_doc, caseless=False, evaluation=False) |
| 113 | + data = DataLoader.load_doc(train_doc, caseless=False, skip_blank_lemmas=False, evaluation=False) |
104 | 114 | assert len(data) == 38 # the same, but with an extra row so the model learns both 'targetting' and 'targeting'
|
105 | 115 | assert any(x[0] == 'targeting' for x in data)
|
106 | 116 | assert any(x[0] == 'targetting' for x in data)
|
| 117 | + |
| 118 | +def test_load_blank(): |
| 119 | + raw_data = TRAIN_DATA + BLANKS_DATA |
| 120 | + train_doc = CoNLL.conll2doc(input_str=raw_data) |
| 121 | + data = DataLoader.load_doc(train_doc, caseless=False, skip_blank_lemmas=False, evaluation=False) |
| 122 | + assert len(data) == 37 # will be the same as in test_load_document with FOUR additional words |
| 123 | + assert all(len(x) == 3 for x in data) |
| 124 | + |
| 125 | + data = DataLoader.load_doc(train_doc, caseless=False, skip_blank_lemmas=True, evaluation=False) |
| 126 | + assert len(data) == 34 # same as in test_load_document plus one extra word; the other three words have blank lemmas and are skipped |
| 127 | + assert all(len(x) == 3 for x in data) |
| 128 | + |
0 commit comments