
Commit d399781

Seems that a full list of PAD isn't needed; need to use the longest sentence to get the batch length.
1 parent aef7260 commit d399781

2 files changed, +3 -2 lines changed

stanza/models/tokenization/data.py

Lines changed: 1 addition & 1 deletion
@@ -431,7 +431,7 @@ def collate(self, samples):
             units[i, :len(u_)] = torch.from_numpy(u_)
             labels[i, :len(l_)] = torch.from_numpy(l_)
             features[i, :len(f_), :] = torch.from_numpy(f_)
-            raw_units.append(r_ + ['<PAD>'] * (pad_len - len(r_)))
+            raw_units.append(r_ + ['<PAD>'])

         return units, labels, features, raw_units
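
A minimal sketch of what this change means for the raw units, using a hypothetical toy batch (the variable names below mirror the diff but are assumptions, not the surrounding stanza code):

    # Toy batch of character lists, sorted ascending by length; names are illustrative.
    batch = [list("cat"), list("horse"), list("giraffe")]
    pad_len = max(len(r_) for r_ in batch)

    # Before: every raw sentence was right-padded with '<PAD>' to the batch maximum.
    old_raw_units = [r_ + ['<PAD>'] * (pad_len - len(r_)) for r_ in batch]

    # After: a single '<PAD>' sentinel is appended instead of a full pad run.
    new_raw_units = [r_ + ['<PAD>'] for r_ in batch]

    print(old_raw_units[0])  # ['c', 'a', 't', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
    print(new_raw_units[0])  # ['c', 'a', 't', '<PAD>']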

stanza/models/tokenization/utils.py

Lines changed: 2 additions & 1 deletion
@@ -258,7 +258,8 @@ def predict(trainer, data_generator, batch_size, max_seqlen, use_regex_tokens, n
     dataloader = TorchDataLoader(sorted_data, batch_size=batch_size, collate_fn=sorted_data.collate, num_workers=num_workers)
     for batch_idx, batch in enumerate(dataloader):
         num_sentences = len(batch[3])
-        N = len(batch[3][0])
+        # being sorted by length, we need to use -1 as the longest sentence
+        N = len(batch[3][-1])
         for paragraph in batch[3]:
             all_raw.append(list(paragraph))
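
A minimal sketch of why the index moves from 0 to -1, assuming (as the new comment says) the batch is sorted ascending by length so the longest sentence comes last; the toy list below stands in for batch[3] and is not the actual stanza data structure:

    # Hypothetical raw units after collate, sorted ascending by length.
    raw_units = [list("cat") + ['<PAD>'],
                 list("horse") + ['<PAD>'],
                 list("giraffe") + ['<PAD>']]

    old_N = len(raw_units[0])    # 4 -- the shortest sentence, too small for the batch
    new_N = len(raw_units[-1])   # 8 -- the longest sentence gives the batch length
    print(old_N, new_N)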
