Skip to content

Commit 1000e74

Browse files
authored
Merge pull request #152 from afiaka87/truncate_captions
Add --truncate_captions beneath token length arg.
2 parents d56e23a + 049c52a commit 1000e74

File tree

3 files changed

+10
-4
lines changed

3 files changed

+10
-4
lines changed

dalle_pytorch/dalle_pytorch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ def __init__(
302302
ff_dropout = 0,
303303
sparse_attn = False,
304304
attn_types = None,
305-
loss_img_weight = 7
305+
loss_img_weight = 7,
306306
):
307307
super().__init__()
308308
assert isinstance(vae, (DiscreteVAE, OpenAIDiscreteVAE, VQGanVAE1024)), 'vae must be an instance of DiscreteVAE'

dalle_pytorch/simple_tokenizer.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def decode(self, tokens):
122122

123123
tokenizer = SimpleTokenizer()
124124

125-
def tokenize(texts, context_length = 256, add_start_and_end = False):
125+
def tokenize(texts, context_length = 256, add_start_and_end = False, truncate_text=False):
126126
if isinstance(texts, str):
127127
texts = [texts]
128128

@@ -133,7 +133,10 @@ def tokenize(texts, context_length = 256, add_start_and_end = False):
133133

134134
for i, tokens in enumerate(all_tokens):
135135
if len(tokens) > context_length:
136-
raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
136+
if truncate_text:
137+
tokens = tokens[:context_length]
138+
else:
139+
raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
137140
result[i, :len(tokens)] = torch.tensor(tokens)
138141

139142
return result

train_dalle.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@
3838
parser.add_argument('--image_text_folder', type = str, required = True,
3939
help='path to your folder of images and text for learning the DALL-E')
4040

41+
parser.add_argument('--truncate_captions', dest='truncate_captions',
42+
help='Captions passed in which exceed the max token length will be truncated if this is set.')
43+
4144
parser.add_argument('--taming', dest='taming', action='store_true')
4245

4346
parser = deepspeed_utils.wrap_arg_parser(parser)
@@ -197,7 +200,7 @@ def __getitem__(self, ind):
197200
descriptions = list(filter(lambda t: len(t) > 0, descriptions))
198201
description = choice(descriptions)
199202

200-
tokenized_text = tokenize(description, self.text_len).squeeze(0)
203+
tokenized_text = tokenize(description, self.text_len, truncate_text=args.truncate_captions).squeeze(0)
201204
mask = tokenized_text != 0
202205

203206
image_tensor = self.image_tranform(image)

0 commit comments

Comments (0)