
Commit 1688ff8

address zero-dimensional modality, for #39
1 parent 3d90489 commit 1688ff8

File tree

3 files changed: +37 −4 lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [project]
 name = "transfusion-pytorch"
-version = "0.10.2"
+version = "0.10.3"
 description = "Transfusion in Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }

tests/test_transfusion.py

Lines changed: 30 additions & 1 deletion

@@ -87,7 +87,7 @@ def test_auto_modality_transform(
         num_text_tokens = text_tokens,
         dim_latent = 384,
         channel_first_latent = True,
-        modality_default_shape = (32,),
+        modality_default_shape = (2, 2),
         transformer = dict(
             dim = 64,
             depth = 2,
@@ -385,3 +385,32 @@ def test_apply_fn_modality_type():

     assert (modalities[0][0][-1] == 1).all()
     assert (modalities[2][0][-1] == 2).all()
+
+
+def test_zero_dimensional():
+
+    model = Transfusion(
+        num_text_tokens = 256,
+        dim_latent = 384,
+        modality_default_shape = (),
+        transformer = dict(
+            dim = 512,
+            depth = 8,
+            num_residual_streams = 1
+        )
+    )
+
+    # any torch.long is text, torch.float is modalities
+
+    text_and_embeds = [
+        [randint(0, 256, (16,)), randn(384), randint(0, 256, (8,)), randn(384)],
+        [randint(0, 256, (16,)), randn(384), randint(0, 256, (5,)), randn(384), randint(0, 256, (9,))]
+    ]
+
+    loss = model(text_and_embeds)
+
+    loss.backward()
+
+    # after much training
+
+    one_multimodal_sample = model.sample()
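
The new test exercises a zero-dimensional modality: with `modality_default_shape = ()`, each modality instance is a lone latent vector rather than a sequence or grid of latents. A minimal sketch of the shape contract implied by the tests above (the variable names below are illustrative, not part of the commit):

from torch import randn

dim_latent = 384

# zero-dimensional modality: a single latent vector, no positional axes
zero_dim = randn(dim_latent)        # shape (384,)

# one-dimensional modality with channel-last latents (channel_first_latent = False)
one_dim = randn(32, dim_latent)     # shape (32, 384)

# two-dimensional modality with channel-first latents (channel_first_latent = True),
# matching the (2, 2) default shape now used in test_auto_modality_transform
two_dim = randn(dim_latent, 2, 2)   # shape (384, 2, 2)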

transfusion_pytorch/transfusion.py

Lines changed: 6 additions & 2 deletions

@@ -2224,6 +2224,7 @@ def forward(
         batch_num_modalities = 0

         for ind, modality in enumerate(batch_modalities):
+
            if is_tensor(modality) and modality.dtype == torch.float:
                modality = (0, modality)

@@ -2345,7 +2346,9 @@ def inner(pred_flow):
            assert 0 <= modality_type < self.num_modalities, f'received a modality index that is out of range. only {self.num_modalities} modalities specified'

            channel_dim = 0 if mod.channel_first_latent else -1
+
            assert mod.dim_latent == modality_tensor.shape[channel_dim], f'mismatch for modality latent dimension - expected {mod.dim_latent} but received {modality_tensor.shape[-1]} - modality shape is {tuple(modality_tensor.shape)}, perhaps you need to set `channel_first_latent` to the correct value'
+            assert mod.num_dim == (len(modality_tensor.shape) - 1), f'mismatch for modality number of dimensions - expected {mod.num_dim} but received {len(modality_tensor.shape) - 1} {modality_tensor.shape}'

            # auto ward against scalars (lone start end tokens)

@@ -2355,7 +2358,7 @@ def inner(pred_flow):
            # handle text

            if is_text:
-                assert modality_tensor.ndim == 1
+                assert modality_tensor.ndim == 1 and modality_tensor.dtype in (torch.int, torch.long)
                text_length = modality_tensor.shape[0]

                batch_text.append(modality_tensor)
@@ -2420,7 +2423,7 @@ def inner(pred_flow):
            # start by just storing the token length of the modality

            modality_shape_str = join([*map(str, modality_shape_tuple)], ',')
-            modality_meta_info = self.char_tokenizer(modality_shape_str, device = device)
+            modality_meta_info = self.char_tokenizer(modality_shape_str, device = device).long()

            precede_modality_tokens = len(modality_meta_info) + 2
            succeed_modality_tokens = 1
@@ -2450,6 +2453,7 @@ def inner(pred_flow):
            modality_tensor = F.pad(modality_tensor, (0, 0, precede_modality_tokens, succeed_modality_tokens))

            batch_modality_tokens.append(modality_tensor)
+
            batch_text.append(text_tensor)

            # handle axial positional embedding
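
Taken together, the hunks above let the zero-dimensional case pass validation: the new `num_dim` assertion accepts a tensor whose only axis is the latent dimension, and the shape metadata tokenized ahead of the modality collapses to an empty string. A standalone sketch of those checks for a channel-last, zero-dimensional modality (illustrative only, not the library's internal code):

import torch

dim_latent = 384
num_dim = 0                                  # zero positional axes
channel_first_latent = False

modality_tensor = torch.randn(dim_latent)    # shape (384,)

channel_dim = 0 if channel_first_latent else -1
assert dim_latent == modality_tensor.shape[channel_dim]
assert num_dim == (len(modality_tensor.shape) - 1)   # 0 == 1 - 1

# the shape string stored before the modality tokens is empty for ()
modality_shape_tuple = ()                    # no positional axes to record
modality_shape_str = ','.join(map(str, modality_shape_tuple))
assert modality_shape_str == ''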
