Skip to content

Commit 95d85e3

Browse files
committed
first make sure prompting is seamless with text first
1 parent d8523f1 commit 95d85e3

File tree

3 files changed

+50
-4
lines changed

3 files changed

+50
-4
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "transfusion-pytorch"
3-
version = "0.5.5"
3+
version = "0.5.6"
44
description = "Transfusion in Pytorch"
55
authors = [
66
{ name = "Phil Wang", email = "lucidrains@gmail.com" }

train_mnist.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import torchvision.transforms as T
1414
from torchvision.utils import save_image
1515

16-
from transfusion_pytorch import Transfusion, print_modality_sample
16+
from transfusion_pytorch.transfusion import Transfusion, print_modality_sample
1717

1818
rmtree('./results', ignore_errors = True)
1919
results_folder = Path('./results')
@@ -24,6 +24,7 @@
2424
IMAGE_AFTER_TEXT = True
2525
NUM_TRAIN_STEPS = 20_000
2626
SAMPLE_EVERY = 500
27+
USE_PROMPT = True
2728
CHANNEL_FIRST = True
2829

2930
# functions
@@ -127,7 +128,24 @@ def collate_fn(data):
127128
# eval
128129

129130
if divisible_by(step, SAMPLE_EVERY):
130-
one_multimodal_sample = ema_model.sample(max_length = 384)
131+
132+
if not USE_PROMPT:
133+
# sampling from start to finish
134+
135+
one_multimodal_sample = ema_model.sample(max_length = 384)
136+
else:
137+
# sampling using prompt
138+
# which differs depending on which comes first, text or images
139+
140+
if IMAGE_AFTER_TEXT:
141+
142+
maybe_label = torch.randint(0, 10, ()).cuda()
143+
one_multimodal_sample = ema_model.sample(prompt = maybe_label, max_length = 384)
144+
145+
else:
146+
raise NotImplementedError
147+
148+
        # make sure the overall order of modalities in the sample looks correct
131149

132150
print_modality_sample(one_multimodal_sample)
133151

transfusion_pytorch/transfusion.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,28 @@ def default_modality_length_to_time_fn(num_modalities: Int['b']) -> Float['b m']
195195

196196
# pretty print
197197

198+
def concat_contiguous_text(
199+
modality_sample: ModalitySample
200+
) -> ModalitySample:
201+
    """ within a modality sample, any two adjacent tensors of int / long dtype are concatenated together, so that every run of text becomes a single tensor — text is always followed by a modality, and a modality by text """
202+
203+
output = []
204+
curr_modality = None
205+
206+
for modality in modality_sample:
207+
if (
208+
len(output) > 0 and
209+
output[-1].dtype == modality.dtype and
210+
modality.dtype in (torch.int, torch.long)
211+
):
212+
packed_text, _ = pack((output[-1], modality), '*')
213+
output[-1] = packed_text
214+
215+
else:
216+
output.append(modality)
217+
218+
return output
219+
198220
def print_modality_sample(
199221
modality_sample: ModalitySample
200222
):
@@ -1394,12 +1416,18 @@ def sample(
13941416
device = self.device
13951417

13961418
init_text_seq = tensor([self.sos_id], device = device)
1419+
1420+
# just take care of prompt being zero dimensions
1421+
1422+
prompt = tree_map_tensor(prompt, lambda t: rearrange(t, '-> 1') if t.ndim == 0 else t)
1423+
13971424
modality_sample = [init_text_seq, *default(prompt, [])]
13981425

13991426
# take care of moving to device
14001427

14011428
modality_sample = tree_map_tensor(modality_sample, lambda t: t.to(device))
1402-
modality_sample = tree_map_tensor(modality_sample, lambda t: rearrange(t, '-> 1') if t.ndim == 0 else t)
1429+
1430+
modality_sample = concat_contiguous_text(modality_sample)
14031431

14041432
*_, last_modality_sample = modality_sample
14051433
assert last_modality_sample.dtype in (torch.int, torch.long), 'prompt must be text tokens'

0 commit comments

Comments
 (0)