Commit 377f8ed

Put MISC, START_CHAR, END_CHAR, NER in a canonical order despite potentially being added in different orders in the token / word maps.

Many tests are updated because SpaceAfter etc. should now be at the start of the MISC column.

1 parent f4acbc0

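The gist of the change: the MISC column used to be assembled by iterating over the token's key/value map, so the relative order of SpaceAfter, start_char, end_char, and ner depended on which processor happened to add its key first. The new code emits the freeform MISC text first, then the remaining keys from a fixed list. A minimal, self-contained sketch of the idea (not the actual stanza code; the real constants and dict_to_conll_text live in stanza/models/common/doc.py):

# Sketch only: stand-in constants; stanza defines these in doc.py.
START_CHAR, END_CHAR, NER, MISC = "start_char", "end_char", "ner", "misc"

def misc_column(token_dict):
    misc = []
    # freeform MISC text (e.g. SpaceAfter=No) always comes first
    if token_dict.get(MISC):
        misc.append(token_dict[MISC])
    # remaining entries follow in one fixed, canonical order
    for key in [START_CHAR, END_CHAR, NER]:
        if key in token_dict:
            misc.append("{}={}".format(key, token_dict[key]))
    return "|".join(misc) if misc else "_"

# the same annotations added in different orders now render identically
a = {NER: "O", START_CHAR: 31, END_CHAR: 32, MISC: "SpaceAfter=No"}
b = {MISC: "SpaceAfter=No", START_CHAR: 31, NER: "O", END_CHAR: 32}
assert misc_column(a) == misc_column(b) == "SpaceAfter=No|start_char=31|end_char=32|ner=O"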
3 files changed, +107 -52 lines

stanza/models/common/doc.py

Lines changed: 33 additions & 28 deletions
@@ -982,36 +982,41 @@ def init_from_misc(unit):
 
 def dict_to_conll_text(token_dict, id_connector="-"):
     token_conll = ['_' for i in range(FIELD_NUM)]
+
     misc = []
-    for key in token_dict:
-        if key == START_CHAR or key == END_CHAR:
-            misc.append("{}={}".format(key, token_dict[key]))
-        elif key == NER:
-            # TODO: potentially need to escape =|\ in the NER
+    if token_dict.get(MISC):
+        # avoid appending a blank misc entry.
+        # otherwise the resulting misc field in the conll doc will wind up being blank text
+        # TODO: potentially need to escape =|\ in the MISC as well
+        misc.append(token_dict[MISC])
+
+    # for other items meant to be in the MISC field,
+    # we try to operate on those columns in a deterministic order
+    # so that the output doesn't change based on the order of keys
+    # in the token_dict
+    for key in [START_CHAR, END_CHAR, NER]:
+        if key in token_dict:
             misc.append("{}={}".format(key, token_dict[key]))
-        elif key == COREF_CHAINS:
-            chains = token_dict[key]
-            if len(chains) > 0:
-                misc_chains = []
-                for chain in chains:
-                    if chain.is_start and chain.is_end:
-                        coref_position = "unit-"
-                    elif chain.is_start:
-                        coref_position = "start-"
-                    elif chain.is_end:
-                        coref_position = "end-"
-                    else:
-                        coref_position = "middle-"
-                    is_representative = "repr-" if chain.is_representative else ""
-                    misc_chains.append("%s%sid%d" % (coref_position, is_representative, chain.chain.index))
-                misc.append("{}={}".format(key, ",".join(misc_chains)))
-        elif key == MISC:
-            # avoid appending a blank misc entry.
-            # otherwise the resulting misc field in the conll doc will wind up being blank text
-            # TODO: potentially need to escape =|\ in the MISC as well
-            if token_dict[key]:
-                misc.append(token_dict[key])
-        elif key == ID:
+
+    if COREF_CHAINS in token_dict:
+        chains = token_dict[COREF_CHAINS]
+        if len(chains) > 0:
+            misc_chains = []
+            for chain in chains:
+                if chain.is_start and chain.is_end:
+                    coref_position = "unit-"
+                elif chain.is_start:
+                    coref_position = "start-"
+                elif chain.is_end:
+                    coref_position = "end-"
+                else:
+                    coref_position = "middle-"
+                is_representative = "repr-" if chain.is_representative else ""
+                misc_chains.append("%s%sid%d" % (coref_position, is_representative, chain.chain.index))
+            misc.append("{}={}".format(COREF_CHAINS, ",".join(misc_chains)))
+
+    for key in token_dict.keys():
+        if key == ID:
             token_conll[FIELD_TO_IDX[key]] = id_connector.join([str(x) for x in token_dict[key]]) if isinstance(token_dict[key], tuple) else str(token_dict[key])
         elif key == FEATS:
             feats = token_dict[key]

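The coref_chains handling is carried over unchanged apart from its position in the output: each mention still renders as a position prefix, an optional repr- marker, and the chain id. A self-contained sketch of that encoding, using hypothetical toy types in place of the attachment objects stanza's coreference processor produces (the real ones expose the is_start, is_end, is_representative, and chain.index fields read above):

from collections import namedtuple

# Toy stand-ins; the real objects come from stanza's coref processor.
Chain = namedtuple("Chain", "index")
Attachment = namedtuple("Attachment", "is_start is_end is_representative chain")

def encode_attachment(att):
    # a one-word mention is "unit-"; otherwise mark where in the span we are
    if att.is_start and att.is_end:
        position = "unit-"
    elif att.is_start:
        position = "start-"
    elif att.is_end:
        position = "end-"
    else:
        position = "middle-"
    representative = "repr-" if att.is_representative else ""
    return "%s%sid%d" % (position, representative, att.chain.index)

# a one-word representative mention of chain 3 renders as unit-repr-id3
assert encode_attachment(Attachment(True, True, True, Chain(3))) == "unit-repr-id3"
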
stanza/tests/pipeline/test_english_pipeline.py

Lines changed: 62 additions & 12 deletions
@@ -96,8 +96,8 @@
 3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 aux:pass _ start_char=13|end_char=16|ner=O
 4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=17|end_char=21|ner=O
 5 in in ADP IN _ 6 case _ start_char=22|end_char=24|ner=O
-6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ start_char=25|end_char=31|ner=S-GPE|SpaceAfter=No
-7 . . PUNCT . _ 4 punct _ start_char=31|end_char=32|ner=O|SpacesAfter=\\s\\s
+6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ SpaceAfter=No|start_char=25|end_char=31|ner=S-GPE
+7 . . PUNCT . _ 4 punct _ SpacesAfter=\\s\\s|start_char=31|end_char=32|ner=O
 
 # text = He was elected president in 2008.
 # sent_id = 1
@@ -108,17 +108,17 @@
 3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=41|end_char=48|ner=O
 4 president president NOUN NN Number=Sing 3 xcomp _ start_char=49|end_char=58|ner=O
 5 in in ADP IN _ 6 case _ start_char=59|end_char=61|ner=O
-6 2008 2008 NUM CD NumForm=Digit|NumType=Card 3 obl _ start_char=62|end_char=66|ner=S-DATE|SpaceAfter=No
-7 . . PUNCT . _ 3 punct _ start_char=66|end_char=67|ner=O|SpacesAfter=\\s\\s
+6 2008 2008 NUM CD NumForm=Digit|NumType=Card 3 obl _ SpaceAfter=No|start_char=62|end_char=66|ner=S-DATE
+7 . . PUNCT . _ 3 punct _ SpacesAfter=\\s\\s|start_char=66|end_char=67|ner=O
 
 # text = Obama attended Harvard.
 # sent_id = 2
 # constituency = (ROOT (S (NP (NNP Obama)) (VP (VBD attended) (NP (NNP Harvard))) (. .)))
 # sentiment = 1
 1 Obama Obama PROPN NNP Number=Sing 2 nsubj _ start_char=69|end_char=74|ner=S-PERSON
 2 attended attend VERB VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 0 root _ start_char=75|end_char=83|ner=O
-3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ start_char=84|end_char=91|ner=S-ORG|SpaceAfter=No
-4 . . PUNCT . _ 2 punct _ start_char=91|end_char=92|ner=O|SpaceAfter=No
+3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ SpaceAfter=No|start_char=84|end_char=91|ner=S-ORG
+4 . . PUNCT . _ 2 punct _ SpaceAfter=No|start_char=91|end_char=92|ner=O
 """.strip()
 
 EN_DOC_CONLLU_GOLD_MULTIDOC = """
@@ -131,8 +131,8 @@
 3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 aux:pass _ start_char=13|end_char=16|ner=O
 4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=17|end_char=21|ner=O
 5 in in ADP IN _ 6 case _ start_char=22|end_char=24|ner=O
-6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ start_char=25|end_char=31|ner=S-GPE|SpaceAfter=No
-7 . . PUNCT . _ 4 punct _ start_char=31|end_char=32|ner=O|SpaceAfter=No
+6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ SpaceAfter=No|start_char=25|end_char=31|ner=S-GPE
+7 . . PUNCT . _ 4 punct _ SpaceAfter=No|start_char=31|end_char=32|ner=O
 
 # text = He was elected president in 2008.
 # sent_id = 1
@@ -143,24 +143,60 @@
 3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=7|end_char=14|ner=O
 4 president president NOUN NN Number=Sing 3 xcomp _ start_char=15|end_char=24|ner=O
 5 in in ADP IN _ 6 case _ start_char=25|end_char=27|ner=O
-6 2008 2008 NUM CD NumForm=Digit|NumType=Card 3 obl _ start_char=28|end_char=32|ner=S-DATE|SpaceAfter=No
-7 . . PUNCT . _ 3 punct _ start_char=32|end_char=33|ner=O|SpaceAfter=No
+6 2008 2008 NUM CD NumForm=Digit|NumType=Card 3 obl _ SpaceAfter=No|start_char=28|end_char=32|ner=S-DATE
+7 . . PUNCT . _ 3 punct _ SpaceAfter=No|start_char=32|end_char=33|ner=O
 
 # text = Obama attended Harvard.
 # sent_id = 2
 # constituency = (ROOT (S (NP (NNP Obama)) (VP (VBD attended) (NP (NNP Harvard))) (. .)))
 # sentiment = 1
 1 Obama Obama PROPN NNP Number=Sing 2 nsubj _ start_char=0|end_char=5|ner=S-PERSON
 2 attended attend VERB VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 0 root _ start_char=6|end_char=14|ner=O
-3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ start_char=15|end_char=22|ner=S-ORG|SpaceAfter=No
-4 . . PUNCT . _ 2 punct _ start_char=22|end_char=23|ner=O|SpaceAfter=No
+3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ SpaceAfter=No|start_char=15|end_char=22|ner=S-ORG
+4 . . PUNCT . _ 2 punct _ SpaceAfter=No|start_char=22|end_char=23|ner=O
+""".strip()
+
+PRETOKENIZED_TEXT = "Jennifer has lovely blue antennae ."
+
+PRETOKENIZED_PIECES = [PRETOKENIZED_TEXT.split()]
+
+EXPECTED_TOKENIZED_ONLY_CONLLU = """
+# text = Jennifer has lovely blue antennae .
+# sent_id = 0
+1 Jennifer _ _ _ _ 0 _ _ start_char=0|end_char=8
+2 has _ _ _ _ 1 _ _ start_char=9|end_char=12
+3 lovely _ _ _ _ 2 _ _ start_char=13|end_char=19
+4 blue _ _ _ _ 3 _ _ start_char=20|end_char=24
+5 antennae _ _ _ _ 4 _ _ start_char=25|end_char=33
+6 . _ _ _ _ 5 _ _ SpaceAfter=No|start_char=34|end_char=35
+""".strip()
+
+EXPECTED_PRETOKENIZED_CONLLU = """
+# text = Jennifer has lovely blue antennae .
+# sent_id = 0
+# constituency = (ROOT (S (NP (NNP Jennifer)) (VP (VBZ has) (NP (JJ lovely) (JJ blue) (NNS antennae))) (. .)))
+# sentiment = 2
+1 Jennifer Jennifer PROPN NNP Number=Sing 2 nsubj _ start_char=0|end_char=8|ner=S-PERSON
+2 has have VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ start_char=9|end_char=12|ner=O
+3 lovely lovely ADJ JJ Degree=Pos 5 amod _ start_char=13|end_char=19|ner=O
+4 blue blue ADJ JJ Degree=Pos 5 amod _ start_char=20|end_char=24|ner=O
+5 antennae antenna NOUN NNS Number=Plur 2 obj _ start_char=25|end_char=33|ner=O
+6 . . PUNCT . _ 2 punct _ SpaceAfter=No|start_char=34|end_char=35|ner=O
 """.strip()
 
 class TestEnglishPipeline:
     @pytest.fixture(scope="class")
     def pipeline(self):
         return stanza.Pipeline(dir=TEST_MODELS_DIR, download_method=None)
 
+    @pytest.fixture(scope="class")
+    def pretokenized_pipeline(self):
+        return stanza.Pipeline(dir=TEST_MODELS_DIR, tokenize_pretokenized=True, download_method=None)
+
+    @pytest.fixture(scope="class")
+    def tokenizer_pipeline(self):
+        return stanza.Pipeline(dir=TEST_MODELS_DIR, processors="tokenize", download_method=None)
+
     @pytest.fixture(scope="class")
     def processed_doc(self, pipeline):
         """ Document created by running full English pipeline on a few sentences """
@@ -207,6 +243,20 @@ def test_empty_bulk_process(self, pipeline):
         processed = pipeline.bulk_process([])
         assert processed == []
 
+    def test_pretokenized(self, pretokenized_pipeline, tokenizer_pipeline):
+        doc = pretokenized_pipeline(PRETOKENIZED_PIECES)
+        conllu = "{:C}".format(doc).strip()
+        assert conllu == EXPECTED_PRETOKENIZED_CONLLU
+
+        doc = tokenizer_pipeline(PRETOKENIZED_TEXT)
+        conllu = "{:C}".format(doc).strip()
+        assert conllu == EXPECTED_TOKENIZED_ONLY_CONLLU
+
+        # putting a doc with tokens into the pipeline should also work
+        reparsed = pretokenized_pipeline(doc)
+        conllu = "{:C}".format(reparsed).strip()
+        assert conllu == EXPECTED_PRETOKENIZED_CONLLU
+
     def test_stream(self, pipeline):
         """ Test the streaming interface to the Pipeline """
         # Test all of the documents in one batch

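Outside the test harness, the new pretokenized path looks roughly like this (a sketch, assuming the English models are available; "{:C}" is the same CoNLL-U format spec the tests rely on):

import stanza

# tokenize_pretokenized=True makes the pipeline accept a list of
# sentences, each a list of word strings, instead of raw text
nlp = stanza.Pipeline("en", tokenize_pretokenized=True)
doc = nlp([["Jennifer", "has", "lovely", "blue", "antennae", "."]])

# render as CoNLL-U; after this commit, MISC entries such as
# SpaceAfter=No sort ahead of start_char/end_char/ner
print("{:C}".format(doc))
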
stanza/tests/tokenization/test_spaces.py

Lines changed: 12 additions & 12 deletions
@@ -8,21 +8,21 @@
 EXPECTED_NO_MWT = """
 # text = Jennifer has nice antennae.
 # sent_id = 0
-1 Jennifer _ _ _ _ 0 _ _ start_char=2|end_char=10|SpacesBefore=\\s\\s
+1 Jennifer _ _ _ _ 0 _ _ SpacesBefore=\\s\\s|start_char=2|end_char=10
 2 has _ _ _ _ 1 _ _ start_char=11|end_char=14
 3 nice _ _ _ _ 2 _ _ start_char=15|end_char=19
-4 antennae _ _ _ _ 3 _ _ start_char=20|end_char=28|SpaceAfter=No
-5 . _ _ _ _ 4 _ _ start_char=28|end_char=29|SpacesAfter=\\s\\s
+4 antennae _ _ _ _ 3 _ _ SpaceAfter=No|start_char=20|end_char=28
+5 . _ _ _ _ 4 _ _ SpacesAfter=\\s\\s|start_char=28|end_char=29
 
 # text = Not very nice person, though.
 # sent_id = 1
 1 Not _ _ _ _ 0 _ _ start_char=31|end_char=34
 2 very _ _ _ _ 1 _ _ start_char=35|end_char=39
 3 nice _ _ _ _ 2 _ _ start_char=40|end_char=44
-4 person _ _ _ _ 3 _ _ start_char=45|end_char=51|SpaceAfter=No
+4 person _ _ _ _ 3 _ _ SpaceAfter=No|start_char=45|end_char=51
 5 , _ _ _ _ 4 _ _ start_char=51|end_char=52
-6 though _ _ _ _ 5 _ _ start_char=53|end_char=59|SpaceAfter=No
-7 . _ _ _ _ 6 _ _ start_char=59|end_char=60|SpacesAfter=\\s\\s
+6 though _ _ _ _ 5 _ _ SpaceAfter=No|start_char=53|end_char=59
+7 . _ _ _ _ 6 _ _ SpacesAfter=\\s\\s|start_char=59|end_char=60
 """.strip()
 
 def test_spaces_no_mwt():
@@ -38,18 +38,18 @@ def test_spaces_no_mwt():
 EXPECTED_MWT = """
 # text = She's not a nice person.
 # sent_id = 0
-1-2 She's _ _ _ _ _ _ _ start_char=2|end_char=7|SpacesBefore=\\s\\s
+1-2 She's _ _ _ _ _ _ _ SpacesBefore=\\s\\s|start_char=2|end_char=7
 1 She _ _ _ _ 0 _ _ start_char=2|end_char=5
 2 's _ _ _ _ 1 _ _ start_char=5|end_char=7
 3 not _ _ _ _ 2 _ _ start_char=8|end_char=11
 4 a _ _ _ _ 3 _ _ start_char=12|end_char=13
 5 nice _ _ _ _ 4 _ _ start_char=14|end_char=18
-6 person _ _ _ _ 5 _ _ start_char=19|end_char=25|SpaceAfter=No
-7 . _ _ _ _ 6 _ _ start_char=25|end_char=26|SpacesAfter=\\s\\s
+6 person _ _ _ _ 5 _ _ SpaceAfter=No|start_char=19|end_char=25
+7 . _ _ _ _ 6 _ _ SpacesAfter=\\s\\s|start_char=25|end_char=26
 
 # text = However, the best antennae on the Cerritos are Jennifer's.
 # sent_id = 1
-1 However _ _ _ _ 0 _ _ start_char=28|end_char=35|SpaceAfter=No
+1 However _ _ _ _ 0 _ _ SpaceAfter=No|start_char=28|end_char=35
 2 , _ _ _ _ 1 _ _ start_char=35|end_char=36
 3 the _ _ _ _ 2 _ _ start_char=37|end_char=40
 4 best _ _ _ _ 3 _ _ start_char=41|end_char=45
@@ -58,10 +58,10 @@ def test_spaces_no_mwt():
 7 the _ _ _ _ 6 _ _ start_char=58|end_char=61
 8 Cerritos _ _ _ _ 7 _ _ start_char=62|end_char=70
 9 are _ _ _ _ 8 _ _ start_char=71|end_char=74
-10-11 Jennifer's _ _ _ _ _ _ _ start_char=75|end_char=85|SpaceAfter=No
+10-11 Jennifer's _ _ _ _ _ _ _ SpaceAfter=No|start_char=75|end_char=85
 10 Jennifer _ _ _ _ 9 _ _ start_char=75|end_char=83
 11 's _ _ _ _ 10 _ _ start_char=83|end_char=85
-12 . _ _ _ _ 11 _ _ start_char=85|end_char=86|SpacesAfter=\\s\\s
+12 . _ _ _ _ 11 _ _ SpacesAfter=\\s\\s|start_char=85|end_char=86
 """.strip()
 
 def test_spaces_mwt():

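These expectations exercise whitespace bookkeeping: runs of spaces around a token are kept in MISC as SpacesBefore= / SpacesAfter= with each space escaped as \s, and after this commit those entries print ahead of start_char and end_char. A sketch of reproducing the first expectation (assuming the English tokenize model is available):

import stanza

# tokenize-only pipeline over text with doubled spaces, as in the test
nlp = stanza.Pipeline("en", processors="tokenize")
doc = nlp("  Jennifer has nice antennae.  Not very nice person, though.  ")

# expected first token line, with SpacesBefore now leading the column:
#   1 Jennifer _ _ _ _ 0 _ _ SpacesBefore=\s\s|start_char=2|end_char=10
print("{:C}".format(doc))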