Commit 07c3fe1

Check if a Document is already chopped into sentences before trying to whitespace tokenize it when doing a bulk_process with a pretokenized TokenizeProcessor

1 parent 377f8ed commit 07c3fe1

File tree

2 files changed: +38 -2 lines changed

stanza/pipeline/tokenize_processor.py

+13 -2 lines changed
@@ -2,6 +2,7 @@
 Processor for performing tokenization
 """
 
+import copy
 import io
 import logging
 
@@ -128,8 +129,18 @@ def bulk_process(self, docs):
         if self.config.get('pretokenized'):
             res = []
             for document in docs:
-                raw_text, document = self.process_pre_tokenized_text(document.text)
-                res.append(doc.Document(document, raw_text))
+                if len(document.sentences) > 0:
+                    # perhaps this is a document already tokenized,
+                    # being sent back in for more analysis / reparsing / etc?
+                    # in that case, no need to try to tokenize it
+                    # based on whitespace tokenizing the document text
+                    # (which, interestingly, may not even exist depending on
+                    # how the document was created)
+                    # by making a whole deepcopy, the original Document is unchanged
+                    res.append(copy.deepcopy(document))
+                else:
+                    raw_text, document = self.process_pre_tokenized_text(document.text)
+                    res.append(doc.Document(document, raw_text))
             return res
 
         combined_text = '\n\n'.join([thisdoc.text for thisdoc in docs])
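Worth noting: for already-tokenized Documents the new branch returns copy.deepcopy(document) rather than processing the input in place, so the caller's original Document is left untouched by downstream processors; the trade-off is one full copy per Document.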

stanza/tests/pipeline/test_english_pipeline.py

+25 lines changed
@@ -257,6 +257,31 @@ def test_pretokenized(self, pretokenized_pipeline, tokenizer_pipeline):
         conllu = "{:C}".format(reparsed).strip()
         assert conllu == EXPECTED_PRETOKENIZED_CONLLU
 
+    def test_bulk_pretokenized(self, pretokenized_pipeline, tokenizer_pipeline):
+        doc = tokenizer_pipeline(PRETOKENIZED_TEXT)
+        conllu = "{:C}".format(doc).strip()
+        assert conllu == EXPECTED_TOKENIZED_ONLY_CONLLU
+
+        docs = pretokenized_pipeline([doc, doc])
+        assert len(docs) == 2
+        for doc in docs:
+            conllu = "{:C}".format(doc).strip()
+            assert conllu == EXPECTED_PRETOKENIZED_CONLLU
+
+    def test_conll2doc_pretokenized(self, pretokenized_pipeline):
+        doc = CoNLL.conll2doc(input_str=EXPECTED_TOKENIZED_ONLY_CONLLU)
+        # this was a bug in version 1.10.1, sent to us by a user:
+        # the pretokenized tokenize_processor would try to whitespace tokenize
+        # a document even if the document already had sentences & words & stuff
+        # not only would that be wrong if the text wouldn't whitespace tokenize
+        # into the words (such as with punctuation and SpaceAfter=No),
+        # it wouldn't even work in the case of conll2doc,
+        # since the document.text wasn't set
+        docs = pretokenized_pipeline([doc, doc])
+        assert len(docs) == 2
+        for doc in docs:
+            conllu = "{:C}".format(doc).strip()
+            assert conllu == EXPECTED_PRETOKENIZED_CONLLU
+
     def test_stream(self, pipeline):
         """ Test the streaming interface to the Pipeline """
         # Test all of the documents in one batch
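The pretokenized_pipeline and tokenizer_pipeline arguments are pytest fixtures defined elsewhere in this test module, and "{:C}".format(doc) renders a Document as CoNLL-U, which is what the assertions compare. A plausible sketch of the fixtures (the exact processors and fixture scope are assumptions, not part of this diff):

import pytest
import stanza

@pytest.fixture(scope="module")
def tokenizer_pipeline():
    # tokenize only: the Document gets sentences and words, but no
    # tags or dependencies, matching EXPECTED_TOKENIZED_ONLY_CONLLU
    return stanza.Pipeline('en', processors='tokenize')

@pytest.fixture(scope="module")
def pretokenized_pipeline():
    # full pipeline that trusts whatever tokenization it is handed
    return stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse',
                           tokenize_pretokenized=True)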
