@@ -36,7 +36,7 @@ def test_transfusion(
         dim_latent = (384, 192), # specify multiple latent dimensions
         modality_default_shape = ((32,), (64,)),
         transformer = dict(
-            dim = 512,
+            dim = 64,
             depth = 2,
             use_flex_attn = use_flex_attn
         )
@@ -80,7 +80,7 @@ def test_auto_modality_transform(
         channel_first_latent = True,
         modality_default_shape = (32,),
         transformer = dict(
-            dim = 512,
+            dim = 64,
             depth = 2,
             use_flex_attn = use_flex_attn
         )
@@ -117,7 +117,7 @@ def test_text(
         channel_first_latent = True,
         modality_default_shape = (32,),
         transformer = dict(
-            dim = 512,
+            dim = 64,
             depth = 2,
             use_flex_attn = use_flex_attn
         )
@@ -141,7 +141,7 @@ def test_modality_only(
         channel_first_latent = channel_first,
         modality_default_shape = (32,),
         transformer = dict(
-            dim = 512,
+            dim = 64,
             depth = 2,
             use_flex_attn = False
         )
@@ -173,8 +173,8 @@ def test_text_image_end_to_end(
         modality_encoder = mock_vae_encoder,
         modality_decoder = mock_vae_decoder,
         transformer = dict(
-            dim = 512,
-            depth = 8
+            dim = 64,
+            depth = 2
         )
     )
 
@@ -196,24 +196,26 @@ def test_text_image_end_to_end(
 
     # allow researchers to experiment with different time distributions across multiple modalities in a sample
 
-    def modality_length_to_times(modality_length):
-        has_modality = modality_length > 0
-        return torch.where(has_modality, torch.ones_like(modality_length), 0.)
+    def num_modalities_to_times(num_modalities):
+        batch = num_modalities.shape[0]
+        device = num_modalities.device
+        total_modalities = num_modalities.amax().item()
+        return torch.ones((batch, total_modalities), device = device)
 
-    time_fn = modality_length_to_times if custom_time_fn else None
+    time_fn = num_modalities_to_times if custom_time_fn else None
 
     # forward
 
     loss = model(
         text_and_images,
-        modality_length_to_times_fn = time_fn
+        num_modalities_to_times_fn = time_fn
     )
 
     loss.backward()
 
     # after much training
 
-    one_multimodal_sample = model.sample()
+    one_multimodal_sample = model.sample(max_length = 128)
 
 def test_velocity_consistency():
     mock_encoder = nn.Conv2d(3, 384, 3, padding = 1)
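
The test above pins every modality's time to 1. Reading the new function, the implied contract is: the callback receives a (batch,) integer tensor of per-sample modality counts and returns a (batch, max_modalities) float tensor of flow times. A minimal sketch of a hypothetical alternative under that contract (random_modality_times is not part of the library; it just draws an independent uniform time per modality slot):

    import torch

    def random_modality_times(num_modalities):
        # num_modalities: (batch,) tensor counting the modalities in each sample
        batch = num_modalities.shape[0]
        total_modalities = num_modalities.amax().item()
        # one independent uniform time in [0, 1) per modality slot
        return torch.rand((batch, total_modalities), device = num_modalities.device)

    # usage mirrors the test: loss = model(text_and_images, num_modalities_to_times_fn = random_modality_times)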
@@ -228,7 +230,7 @@ def test_velocity_consistency():
         modality_encoder = mock_encoder,
         modality_decoder = mock_decoder,
         transformer = dict(
-            dim = 512,
+            dim = 64,
             depth = 1
         )
     )
@@ -251,14 +253,9 @@ def test_velocity_consistency():
         ]
     ]
 
-    def modality_length_to_times(modality_length):
-        has_modality = modality_length > 0
-        return torch.where(has_modality, torch.ones_like(modality_length), 0.)
-
     loss, breakdown = model(
         text_and_images,
         velocity_consistency_ema_model = ema_model,
-        modality_length_to_times_fn = modality_length_to_times,
         return_breakdown = True
     )
 
@@ -275,7 +272,7 @@ def test_axial_pos_emb():
         add_pos_emb = True,
         modality_num_dim = (2, 1),
         transformer = dict(
-            dim = 512,
+            dim = 64,
             depth = 8
         )
     )
@@ -295,7 +292,7 @@ def test_axial_pos_emb():
 
     # after much training
 
-    one_multimodal_sample = model.sample()
+    one_multimodal_sample = model.sample(max_length = 128)
 
 # unet related
 
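
Sampling in these tests now passes an explicit max_length bound rather than relying on a default. A usage sketch, assuming a trained model as above (the 128 cap is taken from the tests; whether it counts tokens or modality positions is not shown in this diff):

    # bound generation explicitly when sampling a multimodal sequence
    one_multimodal_sample = model.sample(max_length = 128)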