complete batched preprocessing of modality tensors of the same shape for each modality type, if they need initial encoding (latent flow matching)

lucidrains · lucidrains · commit 22a9c500d246 · 2025-02-01T14:15:44.000Z
diff --git a/README.md b/README.md
@@ -155,6 +155,20 @@ loss.backward()
 sampled = model.generate_text_only(text[:, :1], 1024)
 ```
 
+## Examples
+
+To run any of the examples `train_{example_name}.py` in the project root, first install the dependencies like so
+
+```bash
+$ pip install .[examples]
+```
+
+If you run into an error with `safetensors`, also run the following
+
+```bash
+$ pip install -U diffusers transformers accelerate scipy ftfy safetensors
+```
+
 ## Citations
 
 ```bibtex
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "transfusion-pytorch"
-version = "0.9.5"
+version = "0.10.0"
 description = "Transfusion in Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
diff --git a/transfusion_pytorch/transfusion.py b/transfusion_pytorch/transfusion.py
@@ -113,6 +113,7 @@ class ModalityInfo(NamedTuple):
     eom_id: int
     to_shape_fn: Callable | None
     channel_first_latent: bool
+    modality_type: int
 
 # helper functions
 
@@ -1489,7 +1490,8 @@ def get_modality_info(
             som_id = som_id,
             eom_id = eom_id,
             to_shape_fn = to_shape_fn,
-            channel_first_latent = channel_first_latent
+            channel_first_latent = channel_first_latent,
+            modality_type = modality_type
         )
 
     def get_all_modality_info(self) -> list[ModalityInfo]:
@@ -2264,10 +2266,24 @@ def forward(
 
         text = []
 
-        flows = defaultdict(list) # store flows for loss
+        # auto move all tensors to device of model
+
+        modalities = tree_map_tensor(modalities, lambda t: t.to(device))
+
+        # for all modalities, batch process same shaped modalities of the same type
+
+        if not is_decoding:
+            for mod in self.get_all_modality_info():
+                encode_fn = default(mod.encoder, nn.Identity())
+
+                with torch.no_grad():
+                    encode_fn.eval()
+                    modalities = apply_fn_modality_type(encode_fn, modalities, modality_type = mod.modality_type)
 
         # for parsing out the predicted flow from flattened sequence of tokens coming out of transformer
 
+        flows = defaultdict(list) # store flows for loss
+
         get_pred_flows: GetPredFlows = defaultdict(list) # functions for parsing modalities from Float['b n d'] for model back to latents or pixel space
 
         def model_to_pred_flow(batch_index, start_index, modality_length, unpack_fn):
@@ -2322,22 +2338,13 @@ def inner(pred_flow):
                 if is_text:
                     modality_tensor = modality
                 else:
-                    modality_type, modality_tensor = modality
+                    modality_type, modality_tensor, *_ = modality
 
                 # auto move modality tensor to correct device
 
-                modality_tensor = modality_tensor.to(device)
-
                 mod = self.get_modality_info(modality_type)
 
                 if is_modality:
-                    if not is_decoding:
-
-                        if exists(mod.encoder):
-                            with torch.no_grad():
-                                mod.encoder.eval()
-                                modality_tensor = self.maybe_add_temp_batch_dim(mod.encoder)(modality_tensor).detach()
-
                     assert 0 <= modality_type < self.num_modalities, f'received a modality index that is out of range. only {self.num_modalities} modalities specified'
 
                     channel_dim = 0 if mod.channel_first_latent else -1