5
5
import edu .stanford .nlp .util .*;
6
6
7
7
import static org .junit .Assert .assertEquals ;
8
+ import static org .junit .Assert .assertFalse ;
9
+ import static org .junit .Assert .assertNotNull ;
10
+ import static org .junit .Assert .assertTrue ;
8
11
9
12
import java .io .*;
13
+ import java .util .ArrayList ;
14
+ import java .util .List ;
10
15
import java .util .Properties ;
11
16
12
17
import org .junit .Before ;
@@ -24,53 +29,278 @@ public class CoNLLUReaderITest {
24
29
public Annotation goldDocument;   // NOTE(review): not assigned anywhere in this chunk — looks like a leftover from the removed pipeline-based setUp(); confirm before deleting
public Annotation readInDocument; // the document produced by CoNLLUReader in the test below

// Expected raw text of each sentence in the example CoNLL-U document
static final String[] EXPECTED_SENTENCE_TEXT = {
  "Pero la existencia de dos recién nacidos en la misma caja sólo podía deberse a un descuido de fábrica.",
  "De allí las rebajas."
};
// Expected full document text: the sentences joined with the line separator, plus a trailing separator
static final String EXPECTED_TEXT = String.join(System.lineSeparator(), EXPECTED_SENTENCE_TEXT) + System.lineSeparator();

// Expected surface form of each word, per sentence.
// Note that the multi-word token "deberse" appears as its parts "deber" + "se".
static final String[][] EXPECTED_WORD_TEXT = {
  {"Pero", "la", "existencia", "de", "dos", "recién", "nacidos", "en", "la", "misma", "caja", "sólo", "podía", "deber", "se", "a", "un", "descuido", "de", "fábrica", "."},
  {"De", "allí", "las", "rebajas", "."},
};

// Expected lemma of each word, per sentence
static final String[][] EXPECTED_LEMMA_TEXT = {
  {"pero", "el", "existencia", "de", "dos", "recién", "nacido", "en", "el", "mismo", "caja", "sólo", "poder", "deber", "él", "a", "uno", "descuido", "de", "fábrica", "."},
  {"de", "allí", "el", "rebaja", "."},
};

// Expected universal POS (the CoNLL-U UPOS column) of each word, per sentence
static final String[][] EXPECTED_CPOS = {
  {"CCONJ", "DET", "NOUN", "ADP", "NUM", "ADV", "ADJ", "ADP", "DET", "DET", "NOUN", "ADV", "AUX", "VERB", "PRON", "ADP", "DET", "NOUN", "ADP", "NOUN", "PUNCT"},
  {"ADP", "ADV", "DET", "NOUN", "PUNCT"},
};

// Expected morphological features (the CoNLL-U FEATS column) of each word, per sentence;
// null marks a word with no features
static final String[][] EXPECTED_FEATS = {
  {
    null,
    "Definite=Def|Gender=Fem|Number=Sing|PronType=Art",
    "Gender=Fem|Number=Sing",
    null,
    "Number=Plur|NumForm=Word|NumType=Card",
    null,
    "Gender=Masc|Number=Plur|VerbForm=Part",
    null,
    "Definite=Def|Gender=Fem|Number=Sing|PronType=Art",
    "Gender=Fem|Number=Sing|PronType=Dem",
    "Gender=Fem|Number=Sing",
    null,
    "Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
    "VerbForm=Inf",
    "Case=Acc|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes",
    null,
    "Definite=Ind|Gender=Masc|Number=Sing|PronType=Art",
    "Gender=Masc|Number=Sing",
    null,
    "Gender=Fem|Number=Sing",
    "PunctType=Peri",
  },
  {
    null,
    null,
    "Definite=Def|Gender=Fem|Number=Plur|PronType=Art",
    "Gender=Fem|Number=Plur",
    "PunctType=Peri",
  }
};

// Expected dependency relation from each word to its head, per sentence
static final String[][] EXPECTED_RELNS = {
  { "advmod", "det", "nsubj", "case", "nummod", "advmod", "amod", "case", "det", "det", "nmod", "advmod", "aux", "root",
    "expl:pv", "case", "det", "obl:arg", "case", "nmod", "punct" },
  { "case", "advmod", "det", "root", "punct" },
};
// Expected 1-based head index of each word, per sentence; 0 marks the root
static final int[][] EXPECTED_HEADS = {
  { 14, 3, 14, 7, 7, 7, 3, 11, 11, 9, 3, 14, 14, 0, 14, 18, 18, 14, 20, 18, 14 },
  { 2, 4, 4, 0, 4 },
};
34
95
35
96
@ Test
36
97
public void testReadingInCoNLLUFile () throws ClassNotFoundException , IOException {
37
- goldDocument = pipeline .process (exampleDocument );
38
98
readInDocument = new CoNLLUReader (new Properties ()).readCoNLLUFile (examplePath ).get (0 );
39
- // make some changes for sake of comparison
40
- // remove AfterAnnotation from read in
41
- // remove ParentAnnotation from gold
42
- for (CoreLabel token : goldDocument .get (CoreAnnotations .TokensAnnotation .class )) {
43
- token .remove (CoreAnnotations .ParentAnnotation .class );
99
+
100
+ assertTrue (readInDocument .containsKey (CoreAnnotations .TextAnnotation .class ));
101
+ assertTrue (readInDocument .containsKey (CoreAnnotations .TokensAnnotation .class ));
102
+ assertTrue (readInDocument .containsKey (CoreAnnotations .SentencesAnnotation .class ));
103
+ assertEquals (3 , readInDocument .keySet ().size ());
104
+
105
+ // Compare text of the document and its sentences
106
+ assertEquals (EXPECTED_TEXT , readInDocument .get (CoreAnnotations .TextAnnotation .class ));
107
+ List <CoreMap > sentences = readInDocument .get (CoreAnnotations .SentencesAnnotation .class );
108
+ assertEquals (EXPECTED_SENTENCE_TEXT .length , sentences .size ());
109
+ for (int i = 0 ; i < EXPECTED_SENTENCE_TEXT .length ; ++i ) {
110
+ assertEquals (EXPECTED_SENTENCE_TEXT [i ], sentences .get (i ).get (CoreAnnotations .TextAnnotation .class ));
44
111
}
45
- for (CoreLabel token : readInDocument .get (CoreAnnotations .TokensAnnotation .class )) {
46
- token .remove (CoreAnnotations .CoNLLUFeats .class );
112
+
113
+ // Compare sentence ids
114
+ // Check number of keys on each sentence
115
+ for (int i = 0 ; i < sentences .size (); ++i ) {
116
+ assertEquals (Integer .valueOf (i ), sentences .get (i ).get (CoreAnnotations .SentenceIndexAnnotation .class ));
117
+ assertEquals (4 , sentences .get (i ).keySet ().size ());
118
+ }
119
+
120
+ // Check the document tokens and the sentence tokens lists are the same
121
+ // The composite list on the document level should just be the sentence tokens gathered into one list
122
+ List <CoreMap > allTokens = new ArrayList <>();
123
+ for (int i = 0 ; i < sentences .size (); ++i ) {
124
+ allTokens .addAll (sentences .get (i ).get (CoreAnnotations .TokensAnnotation .class ));
47
125
}
48
- // compare gold vs. read in
49
- // compare document text
50
- assertEquals (goldDocument .get (CoreAnnotations .TextAnnotation .class ),
51
- readInDocument .get (CoreAnnotations .TextAnnotation .class ));
52
- // compare tokens lists
53
- AnnotationComparator .compareTokensLists (goldDocument , readInDocument );
54
- assertEquals (goldDocument .get (CoreAnnotations .TokensAnnotation .class ),
55
- readInDocument .get (CoreAnnotations .TokensAnnotation .class ));
56
- // compare sentences
57
- for (int i = 0 ; i < goldDocument .get (CoreAnnotations .SentencesAnnotation .class ).size (); i ++) {
58
- CoreMap goldSentence = goldDocument .get (CoreAnnotations .SentencesAnnotation .class ).get (i );
59
- CoreMap readInSentence = readInDocument .get (CoreAnnotations .SentencesAnnotation .class ).get (i );
60
- // compare sentence text
61
- assertEquals (goldSentence .get (CoreAnnotations .TextAnnotation .class ),
62
- readInSentence .get (CoreAnnotations .TextAnnotation .class ));
63
- // compare token lists
64
- assertEquals (goldSentence .get (CoreAnnotations .TokensAnnotation .class ),
65
- readInSentence .get (CoreAnnotations .TokensAnnotation .class ));
66
- // compare semantic graphs
67
- SemanticGraph goldGraph =
68
- goldDocument .get (CoreAnnotations .SentencesAnnotation .class ).get (i ).get (
69
- SemanticGraphCoreAnnotations .BasicDependenciesAnnotation .class );
70
- SemanticGraph readInGraph =
71
- goldDocument .get (CoreAnnotations .SentencesAnnotation .class ).get (i ).get (
72
- SemanticGraphCoreAnnotations .BasicDependenciesAnnotation .class );
73
- assertEquals (goldGraph .toList (), readInGraph .toList ());
126
+ assertEquals (readInDocument .get (CoreAnnotations .TokensAnnotation .class ), allTokens );
127
+
128
+ // Check the text on each of the words
129
+ // Check the lemmas
130
+ // Check indices and a couple other annotations we expect to be here
131
+ for (int i = 0 ; i < sentences .size (); ++i ) {
132
+ CoreMap sentence = sentences .get (i );
133
+ List <CoreLabel > tokens = sentence .get (CoreAnnotations .TokensAnnotation .class );
134
+ assertEquals (EXPECTED_WORD_TEXT [i ].length , tokens .size ());
135
+ assertEquals (EXPECTED_LEMMA_TEXT [i ].length , tokens .size ());
136
+ assertEquals (EXPECTED_CPOS [i ].length , tokens .size ());
137
+ for (int j = 0 ; j < tokens .size (); ++j ) {
138
+ CoreLabel token = tokens .get (j );
139
+ assertEquals (EXPECTED_WORD_TEXT [i ][j ], token .value ());
140
+ assertEquals (EXPECTED_WORD_TEXT [i ][j ], token .word ());
141
+ assertEquals (EXPECTED_WORD_TEXT [i ][j ], token .get (CoreAnnotations .OriginalTextAnnotation .class ));
142
+
143
+ assertEquals (EXPECTED_LEMMA_TEXT [i ][j ], token .lemma ());
144
+ assertEquals (EXPECTED_CPOS [i ][j ], token .tag ());
145
+
146
+ assertEquals (Integer .valueOf (i ), token .get (CoreAnnotations .SentenceIndexAnnotation .class ));
147
+ assertEquals (Integer .valueOf (j +1 ), token .get (CoreAnnotations .IndexAnnotation .class ));
148
+
149
+ // all tokens should have a False isNewline
150
+ assertFalse (token .get (CoreAnnotations .IsNewlineAnnotation .class ));
151
+ }
152
+ }
153
+
154
+ // Check the MWT features
155
+ for (int i = 0 ; i < sentences .size (); ++i ) {
156
+ CoreMap sentence = sentences .get (i );
157
+ List <CoreLabel > tokens = sentence .get (CoreAnnotations .TokensAnnotation .class );
158
+ for (int j = 0 ; j < tokens .size (); ++j ) {
159
+ CoreLabel token = tokens .get (j );
160
+ // words 14-15 (indexed one lower here) are the only MWT in this document
161
+ // otherwise, all fields should be false
162
+ if (i == 0 && j == 13 ) {
163
+ assertTrue (token .get (CoreAnnotations .IsMultiWordTokenAnnotation .class ));
164
+ assertTrue (token .get (CoreAnnotations .IsFirstWordOfMWTAnnotation .class ));
165
+ assertEquals ("deberse" , token .get (CoreAnnotations .MWTTokenTextAnnotation .class ));
166
+ } else if (i == 0 && j == 14 ) {
167
+ assertTrue (token .get (CoreAnnotations .IsMultiWordTokenAnnotation .class ));
168
+ assertFalse (token .get (CoreAnnotations .IsFirstWordOfMWTAnnotation .class ));
169
+ assertEquals ("deberse" , token .get (CoreAnnotations .MWTTokenTextAnnotation .class ));
170
+ } else {
171
+ assertFalse (token .get (CoreAnnotations .IsMultiWordTokenAnnotation .class ));
172
+ assertFalse (token .get (CoreAnnotations .IsFirstWordOfMWTAnnotation .class ));
173
+ assertFalse (token .containsKey (CoreAnnotations .MWTTokenTextAnnotation .class ));
174
+ }
175
+ }
176
+ }
177
+
178
+ // Check the Before & After features
179
+ // TODO: May need to reconsider the end of sentence treatment
180
+ for (int i = 0 ; i < sentences .size (); ++i ) {
181
+ CoreMap sentence = sentences .get (i );
182
+ List <CoreLabel > tokens = sentence .get (CoreAnnotations .TokensAnnotation .class );
183
+ for (int j = 0 ; j < tokens .size (); ++j ) {
184
+ CoreLabel token = tokens .get (j );
185
+ if (j == tokens .size () - 1 ) {
186
+ assertEquals ("\n " , token .after ());
187
+ } else if (j == tokens .size () - 2 ) {
188
+ assertEquals ("" , token .after ());
189
+ } else {
190
+ // TODO: after() should be "" for an MWT
191
+ // it just doesn't get marked on the CoNLLU
192
+ assertEquals (" " , token .after ());
193
+ }
194
+
195
+ if (i == 0 && j == 0 ) {
196
+ assertEquals ("" , token .before ());
197
+ } else if (j == 0 ) {
198
+ assertEquals ("\n " , token .before ());
199
+ } else if (j == tokens .size () - 1 ) {
200
+ assertEquals ("" , token .before ());
201
+ } else {
202
+ assertEquals (" " , token .before ());
203
+ }
204
+ }
205
+ }
206
+
207
+ // Check that these fields are set
208
+ // Perhaps not checking the values of the offsets, though
209
+ int tokenCount = 0 ;
210
+ for (int i = 0 ; i < sentences .size (); ++i ) {
211
+ CoreMap sentence = sentences .get (i );
212
+ List <CoreLabel > tokens = sentence .get (CoreAnnotations .TokensAnnotation .class );
213
+ for (int j = 0 ; j < tokens .size (); ++j ) {
214
+ CoreLabel token = tokens .get (j );
215
+ assertTrue (token .containsKey (CoreAnnotations .CharacterOffsetBeginAnnotation .class ));
216
+ assertTrue (token .containsKey (CoreAnnotations .CharacterOffsetEndAnnotation .class ));
217
+ assertEquals (Integer .valueOf (tokenCount ), token .get (CoreAnnotations .TokenBeginAnnotation .class ));
218
+ assertEquals (Integer .valueOf (tokenCount +1 ), token .get (CoreAnnotations .TokenEndAnnotation .class ));
219
+ ++tokenCount ;
220
+ }
221
+ }
222
+
223
+ // check the features and that there are no fields currently unaccounted for
224
+ for (int i = 0 ; i < sentences .size (); ++i ) {
225
+ CoreMap sentence = sentences .get (i );
226
+ List <CoreLabel > tokens = sentence .get (CoreAnnotations .TokensAnnotation .class );
227
+ assertEquals (EXPECTED_FEATS [i ].length , tokens .size ());
228
+ for (int j = 0 ; j < tokens .size (); ++j ) {
229
+ CoreLabel token = tokens .get (j );
230
+
231
+ String expected = EXPECTED_FEATS [i ][j ];
232
+ int expectedKeys = 16 ;
233
+
234
+ if (expected == null ) {
235
+ assertFalse (token .containsKey (CoreAnnotations .CoNLLUFeats .class ));
236
+ } else {
237
+ expectedKeys += 1 ;
238
+ String feats = token .get (CoreAnnotations .CoNLLUFeats .class ).toString ();
239
+ assertEquals (expected , feats );
240
+ }
241
+
242
+ // the MWT token specifically gets one more field, the MWT text
243
+ if (i == 0 && (j == 13 || j == 14 )) {
244
+ expectedKeys += 1 ;
245
+ }
246
+ assertEquals (expectedKeys , token .keySet ().size ());
247
+
248
+ // The known fields should be the ones checked above:
249
+ // CoreAnnotations.TextAnnotation
250
+ // CoreAnnotations.ValueAnnotation
251
+ // CoreAnnotations.OriginalTextAnnotation
252
+ // CoreAnnotations.IsNewlineAnnotation
253
+ // CoreAnnotations.LemmaAnnotation
254
+ // CoreAnnotations.PartOfSpeechAnnotation
255
+ // CoreAnnotations.IndexAnnotation
256
+ // CoreAnnotations.AfterAnnotation
257
+ // CoreAnnotations.BeforeAnnotation
258
+ // CoreAnnotations.IsMultiWordTokenAnnotation
259
+ // CoreAnnotations.IsFirstWordOfMWTAnnotation
260
+ // CoreAnnotations.CharacterOffsetBeginAnnotation
261
+ // CoreAnnotations.CharacterOffsetEndAnnotation
262
+ // CoreAnnotations.TokenBeginAnnotation
263
+ // CoreAnnotations.TokenEndAnnotation
264
+ // CoreAnnotations.SentenceIndexAnnotation
265
+ // and sometimes
266
+ // CoreAnnotations.CoNLLUFeats
267
+ // CoreAnnotations.MWTTokenTextAnnotation
268
+ //
269
+ // TODO: make it always add a Feats, even if it's not present?
270
+ }
271
+ }
272
+
273
+ // compare the SemanticGraph
274
+ for (int i = 0 ; i < sentences .size (); ++i ) {
275
+ CoreMap sentence = sentences .get (i );
276
+ SemanticGraph graph = sentence .get (SemanticGraphCoreAnnotations .BasicDependenciesAnnotation .class );
277
+ assertNotNull (graph );
278
+
279
+ List <IndexedWord > vertices = graph .vertexListSorted ();
280
+ assertEquals (EXPECTED_WORD_TEXT [i ].length , vertices .size ());
281
+ assertEquals (EXPECTED_RELNS [i ].length , vertices .size ());
282
+ assertEquals (EXPECTED_HEADS [i ].length , vertices .size ());
283
+ for (int j = 0 ; j < vertices .size (); ++j ) {
284
+ IndexedWord vertex = vertices .get (j );
285
+ assertEquals (EXPECTED_WORD_TEXT [i ][j ], vertex .value ());
286
+
287
+ // each word should be properly indexed with the sentIndex and position in the sentence
288
+ assertEquals (i , vertex .sentIndex ());
289
+ // j+1 because the arrows are laid out with 0 as root, words with a 1-based index
290
+ assertEquals (j +1 , vertex .index ());
291
+
292
+ if (EXPECTED_HEADS [i ][j ] == 0 ) {
293
+ assertTrue (graph .isRoot (vertex ));
294
+ continue ;
295
+ }
296
+
297
+ // If not a root, then the word should have exactly one parent
298
+ // The HEAD and RELNS arrays specify the expected parent and relation of the edge
299
+ List <SemanticGraphEdge > edges = graph .getIncomingEdgesSorted (vertex );
300
+ assertEquals (1 , edges .size ());
301
+ assertEquals (EXPECTED_HEADS [i ][j ], edges .get (0 ).getGovernor ().index ());
302
+ assertEquals (EXPECTED_RELNS [i ][j ], edges .get (0 ).getRelation ().toString ());
303
+ }
74
304
}
75
305
}
76
306
}
0 commit comments