Commit 07c3fe1

Check if a Document is already chopped into sentences before trying to whitespace tokenize it when doing a bulk_process with a pretokenized TokenizeProcessor

1 parent 377f8ed commit 07c3fe1

File tree

2 files changed: +38 -2 lines changed

stanza/pipeline/tokenize_processor.py

+13 -2 lines changed
@@ -2,6 +2,7 @@
 Processor for performing tokenization
 """
 
+import copy
 import io
 import logging
 
@@ -128,8 +129,18 @@ def bulk_process(self, docs):
         if self.config.get('pretokenized'):
             res = []
             for document in docs:
-                raw_text, document = self.process_pre_tokenized_text(document.text)
-                res.append(doc.Document(document, raw_text))
+                if len(document.sentences) > 0:
+                    # perhaps this is a document already tokenized,
+                    # being sent back in for more analysis / reparsing / etc?
+                    # in that case, no need to try to tokenize it
+                    # based on whitespace tokenizing the document text
+                    # (which, interestingly, may not even exist depending on
+                    # how the document was created)
+                    # by making a whole deepcopy, the original Document is unchanged
+                    res.append(copy.deepcopy(document))
+                else:
+                    raw_text, document = self.process_pre_tokenized_text(document.text)
+                    res.append(doc.Document(document, raw_text))
             return res
 
         combined_text = '\n\n'.join([thisdoc.text for thisdoc in docs])
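Worth noting: for already-tokenized Documents the new branch returns copy.deepcopy(document) rather than processing the input in place, so the caller's original Document is left untouched by downstream processors; the trade-off is one full copy per Document.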

stanza/tests/pipeline/test_english_pipeline.py

+25 lines changed
@@ -257,6 +257,31 @@ def test_pretokenized(self, pretokenized_pipeline, tokenizer_pipeline):
         conllu = "{:C}".format(reparsed).strip()
         assert conllu == EXPECTED_PRETOKENIZED_CONLLU
 
+    def test_bulk_pretokenized(self, pretokenized_pipeline, tokenizer_pipeline):
+        doc = tokenizer_pipeline(PRETOKENIZED_TEXT)
+        conllu = "{:C}".format(doc).strip()
+        assert conllu == EXPECTED_TOKENIZED_ONLY_CONLLU
+
+        docs = pretokenized_pipeline([doc, doc])
+        assert len(docs) == 2
+        for doc in docs:
+            conllu = "{:C}".format(doc).strip()
+            assert conllu == EXPECTED_PRETOKENIZED_CONLLU
+
+    def test_conll2doc_pretokenized(self, pretokenized_pipeline):
+        doc = CoNLL.conll2doc(input_str=EXPECTED_TOKENIZED_ONLY_CONLLU)
+        # this was a bug in version 1.10.1, sent to us by a user:
+        # the pretokenized tokenize_processor would try to whitespace tokenize
+        # a document even if the document already had sentences & words & stuff
+        # not only would that be wrong if the text wouldn't whitespace tokenize
+        # into the words (such as with punctuation and SpaceAfter=No),
+        # it wouldn't even work in the case of conll2doc,
+        # since the document.text wasn't set
+        docs = pretokenized_pipeline([doc, doc])
+        assert len(docs) == 2
+        for doc in docs:
+            conllu = "{:C}".format(doc).strip()
+            assert conllu == EXPECTED_PRETOKENIZED_CONLLU
+
     def test_stream(self, pipeline):
         """ Test the streaming interface to the Pipeline """
         # Test all of the documents in one batch
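The pretokenized_pipeline and tokenizer_pipeline arguments are pytest fixtures defined elsewhere in this test module, and "{:C}".format(doc) renders a Document as CoNLL-U, which is what the assertions compare. A plausible sketch of the fixtures (the exact processors and fixture scope are assumptions, not part of this diff):

import pytest
import stanza

@pytest.fixture(scope="module")
def tokenizer_pipeline():
    # tokenize only: the Document gets sentences and words, but no
    # tags or dependencies, matching EXPECTED_TOKENIZED_ONLY_CONLLU
    return stanza.Pipeline('en', processors='tokenize')

@pytest.fixture(scope="module")
def pretokenized_pipeline():
    # full pipeline that trusts whatever tokenization it is handed
    return stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse',
                           tokenize_pretokenized=True)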
