|
96 | 96 | 3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 aux:pass _ start_char=13|end_char=16|ner=O
|
97 | 97 | 4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=17|end_char=21|ner=O
|
98 | 98 | 5 in in ADP IN _ 6 case _ start_char=22|end_char=24|ner=O
|
99 |
| -6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ start_char=25|end_char=31|ner=S-GPE|SpaceAfter=No |
100 |
| -7 . . PUNCT . _ 4 punct _ start_char=31|end_char=32|ner=O|SpacesAfter=\\s\\s |
| 99 | +6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ SpaceAfter=No|start_char=25|end_char=31|ner=S-GPE |
| 100 | +7 . . PUNCT . _ 4 punct _ SpacesAfter=\\s\\s|start_char=31|end_char=32|ner=O |
101 | 101 |
|
102 | 102 | # text = He was elected president in 2008.
|
103 | 103 | # sent_id = 1
|
|
108 | 108 | 3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=41|end_char=48|ner=O
|
109 | 109 | 4 president president NOUN NN Number=Sing 3 xcomp _ start_char=49|end_char=58|ner=O
|
110 | 110 | 5 in in ADP IN _ 6 case _ start_char=59|end_char=61|ner=O
|
111 |
| -6 2008 2008 NUM CD NumForm=Digit|NumType=Card 3 obl _ start_char=62|end_char=66|ner=S-DATE|SpaceAfter=No |
112 |
| -7 . . PUNCT . _ 3 punct _ start_char=66|end_char=67|ner=O|SpacesAfter=\\s\\s |
| 111 | +6 2008 2008 NUM CD NumForm=Digit|NumType=Card 3 obl _ SpaceAfter=No|start_char=62|end_char=66|ner=S-DATE |
| 112 | +7 . . PUNCT . _ 3 punct _ SpacesAfter=\\s\\s|start_char=66|end_char=67|ner=O |
113 | 113 |
|
114 | 114 | # text = Obama attended Harvard.
|
115 | 115 | # sent_id = 2
|
116 | 116 | # constituency = (ROOT (S (NP (NNP Obama)) (VP (VBD attended) (NP (NNP Harvard))) (. .)))
|
117 | 117 | # sentiment = 1
|
118 | 118 | 1 Obama Obama PROPN NNP Number=Sing 2 nsubj _ start_char=69|end_char=74|ner=S-PERSON
|
119 | 119 | 2 attended attend VERB VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 0 root _ start_char=75|end_char=83|ner=O
|
120 |
| -3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ start_char=84|end_char=91|ner=S-ORG|SpaceAfter=No |
121 |
| -4 . . PUNCT . _ 2 punct _ start_char=91|end_char=92|ner=O|SpaceAfter=No |
| 120 | +3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ SpaceAfter=No|start_char=84|end_char=91|ner=S-ORG |
| 121 | +4 . . PUNCT . _ 2 punct _ SpaceAfter=No|start_char=91|end_char=92|ner=O |
122 | 122 | """.strip()
|
123 | 123 |
|
124 | 124 | EN_DOC_CONLLU_GOLD_MULTIDOC = """
|
|
131 | 131 | 3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 aux:pass _ start_char=13|end_char=16|ner=O
|
132 | 132 | 4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=17|end_char=21|ner=O
|
133 | 133 | 5 in in ADP IN _ 6 case _ start_char=22|end_char=24|ner=O
|
134 |
| -6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ start_char=25|end_char=31|ner=S-GPE|SpaceAfter=No |
135 |
| -7 . . PUNCT . _ 4 punct _ start_char=31|end_char=32|ner=O|SpaceAfter=No |
| 134 | +6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ SpaceAfter=No|start_char=25|end_char=31|ner=S-GPE |
| 135 | +7 . . PUNCT . _ 4 punct _ SpaceAfter=No|start_char=31|end_char=32|ner=O |
136 | 136 |
|
137 | 137 | # text = He was elected president in 2008.
|
138 | 138 | # sent_id = 1
|
|
143 | 143 | 3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=7|end_char=14|ner=O
|
144 | 144 | 4 president president NOUN NN Number=Sing 3 xcomp _ start_char=15|end_char=24|ner=O
|
145 | 145 | 5 in in ADP IN _ 6 case _ start_char=25|end_char=27|ner=O
|
146 |
| -6 2008 2008 NUM CD NumForm=Digit|NumType=Card 3 obl _ start_char=28|end_char=32|ner=S-DATE|SpaceAfter=No |
147 |
| -7 . . PUNCT . _ 3 punct _ start_char=32|end_char=33|ner=O|SpaceAfter=No |
| 146 | +6 2008 2008 NUM CD NumForm=Digit|NumType=Card 3 obl _ SpaceAfter=No|start_char=28|end_char=32|ner=S-DATE |
| 147 | +7 . . PUNCT . _ 3 punct _ SpaceAfter=No|start_char=32|end_char=33|ner=O |
148 | 148 |
|
149 | 149 | # text = Obama attended Harvard.
|
150 | 150 | # sent_id = 2
|
151 | 151 | # constituency = (ROOT (S (NP (NNP Obama)) (VP (VBD attended) (NP (NNP Harvard))) (. .)))
|
152 | 152 | # sentiment = 1
|
153 | 153 | 1 Obama Obama PROPN NNP Number=Sing 2 nsubj _ start_char=0|end_char=5|ner=S-PERSON
|
154 | 154 | 2 attended attend VERB VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 0 root _ start_char=6|end_char=14|ner=O
|
155 |
| -3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ start_char=15|end_char=22|ner=S-ORG|SpaceAfter=No |
156 |
| -4 . . PUNCT . _ 2 punct _ start_char=22|end_char=23|ner=O|SpaceAfter=No |
| 155 | +3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ SpaceAfter=No|start_char=15|end_char=22|ner=S-ORG |
| 156 | +4 . . PUNCT . _ 2 punct _ SpaceAfter=No|start_char=22|end_char=23|ner=O |
| 157 | +""".strip() |
| 158 | + |
| 159 | +PRETOKENIZED_TEXT = "Jennifer has lovely blue antennae ." |
| 160 | + |
| 161 | +PRETOKENIZED_PIECES = [PRETOKENIZED_TEXT.split()] |
| 162 | + |
| 163 | +EXPECTED_TOKENIZED_ONLY_CONLLU = """ |
| 164 | +# text = Jennifer has lovely blue antennae . |
| 165 | +# sent_id = 0 |
| 166 | +1 Jennifer _ _ _ _ 0 _ _ start_char=0|end_char=8 |
| 167 | +2 has _ _ _ _ 1 _ _ start_char=9|end_char=12 |
| 168 | +3 lovely _ _ _ _ 2 _ _ start_char=13|end_char=19 |
| 169 | +4 blue _ _ _ _ 3 _ _ start_char=20|end_char=24 |
| 170 | +5 antennae _ _ _ _ 4 _ _ start_char=25|end_char=33 |
| 171 | +6 . _ _ _ _ 5 _ _ SpaceAfter=No|start_char=34|end_char=35 |
| 172 | +""".strip() |
| 173 | + |
| 174 | +EXPECTED_PRETOKENIZED_CONLLU = """ |
| 175 | +# text = Jennifer has lovely blue antennae . |
| 176 | +# sent_id = 0 |
| 177 | +# constituency = (ROOT (S (NP (NNP Jennifer)) (VP (VBZ has) (NP (JJ lovely) (JJ blue) (NNS antennae))) (. .))) |
| 178 | +# sentiment = 2 |
| 179 | +1 Jennifer Jennifer PROPN NNP Number=Sing 2 nsubj _ start_char=0|end_char=8|ner=S-PERSON |
| 180 | +2 has have VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ start_char=9|end_char=12|ner=O |
| 181 | +3 lovely lovely ADJ JJ Degree=Pos 5 amod _ start_char=13|end_char=19|ner=O |
| 182 | +4 blue blue ADJ JJ Degree=Pos 5 amod _ start_char=20|end_char=24|ner=O |
| 183 | +5 antennae antenna NOUN NNS Number=Plur 2 obj _ start_char=25|end_char=33|ner=O |
| 184 | +6 . . PUNCT . _ 2 punct _ SpaceAfter=No|start_char=34|end_char=35|ner=O |
157 | 185 | """.strip()
|
158 | 186 |
|
159 | 187 | class TestEnglishPipeline:
|
160 | 188 | @pytest.fixture(scope="class")
|
161 | 189 | def pipeline(self):
|
162 | 190 | return stanza.Pipeline(dir=TEST_MODELS_DIR, download_method=None)
|
163 | 191 |
|
| 192 | + @pytest.fixture(scope="class") |
| 193 | + def pretokenized_pipeline(self): |
| 194 | + return stanza.Pipeline(dir=TEST_MODELS_DIR, tokenize_pretokenized=True, download_method=None) |
| 195 | + |
| 196 | + @pytest.fixture(scope="class") |
| 197 | + def tokenizer_pipeline(self): |
| 198 | + return stanza.Pipeline(dir=TEST_MODELS_DIR, processors="tokenize", download_method=None) |
| 199 | + |
164 | 200 | @pytest.fixture(scope="class")
|
165 | 201 | def processed_doc(self, pipeline):
|
166 | 202 | """ Document created by running full English pipeline on a few sentences """
|
@@ -207,6 +243,20 @@ def test_empty_bulk_process(self, pipeline):
|
207 | 243 | processed = pipeline.bulk_process([])
|
208 | 244 | assert processed == []
|
209 | 245 |
|
| 246 | + def test_pretokenized(self, pretokenized_pipeline, tokenizer_pipeline): |
| 247 | + doc = pretokenized_pipeline(PRETOKENIZED_PIECES) |
| 248 | + conllu = "{:C}".format(doc).strip() |
| 249 | + assert conllu == EXPECTED_PRETOKENIZED_CONLLU |
| 250 | + |
| 251 | + doc = tokenizer_pipeline(PRETOKENIZED_TEXT) |
| 252 | + conllu = "{:C}".format(doc).strip() |
| 253 | + assert conllu == EXPECTED_TOKENIZED_ONLY_CONLLU |
| 254 | + |
| 255 | + # putting a doc with tokens into the pipeline should also work |
| 256 | + reparsed = pretokenized_pipeline(doc) |
| 257 | + conllu = "{:C}".format(reparsed).strip() |
| 258 | + assert conllu == EXPECTED_PRETOKENIZED_CONLLU |
| 259 | + |
210 | 260 | def test_stream(self, pipeline):
|
211 | 261 | """ Test the streaming interface to the Pipeline """
|
212 | 262 | # Test all of the documents in one batch
|
|
0 commit comments