Commit afdd146

rely on all zeros in the feature dimension being treated as padding to be masked out, removing the need to keep track of a mask. also always project the text encodings, so that in the case of multiple text models the projection acts like a model type embedding.
1 parent dbc2fc7 commit afdd146
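For context, the convention this commit adopts can be shown with a minimal PyTorch sketch (shapes and variable names below are illustrative, not part of the library): padding positions carry all-zero feature vectors, so the boolean mask can be recomputed on demand with `(text_embeds != 0).any(dim = -1)` instead of being passed around separately.

```python
import torch

# hypothetical shapes: (batch, sequence, feature)
text_embeds = torch.randn(2, 4, 8)
text_embeds[:, 2:] = 0.                      # pretend the last two positions are padding

# the mask is recoverable whenever needed: True wherever any feature is non-zero
mask = (text_embeds != 0).any(dim = -1)      # shape (2, 4)

# after any projection, re-zero the padded positions so the convention still holds
text_embeds = text_embeds.masked_fill(~mask[..., None], 0.)
```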

File tree

3 files changed: +20 -14 lines changed


README.md

Lines changed: 6 additions & 0 deletions
@@ -48,6 +48,12 @@ first_conditioned = first_condition_fn(first_hidden)
 second_conditioned = second_condition_fn(second_hidden)
 ```

+If you wish to use cross attention based conditioning (each hidden feature in your network can attend to individual subword tokens), just import the `AttentionTextConditioner` instead. Rest is the same
+
+```python
+from classifier_free_guidance_pytorch import AttentionTextConditioner
+```
+
 ## Magic Decorator (wip)

 This is a work in progress to make it as easy as possible to text condition your network.
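As a companion to the README addition above, here is a hedged sketch of how the swap might look, assuming `AttentionTextConditioner` mirrors the `TextConditioner` interface shown earlier in the README; the constructor keywords used here (`model_types`, `hidden_dims`, `cond_drop_prob`) are assumptions for illustration, not confirmed by this diff.

```python
from classifier_free_guidance_pytorch import AttentionTextConditioner

# assumed to mirror the TextConditioner interface from the README example
text_conditioner = AttentionTextConditioner(
    model_types = 't5',          # which pretrained text encoder(s) to use (assumed keyword)
    hidden_dims = (256, 512),    # one entry per hidden tensor to be conditioned (assumed keyword)
    cond_drop_prob = 0.2         # chance of dropping the text condition, for classifier free guidance
)

# one conditioning function per hidden, as in the surrounding README context
first_condition_fn, second_condition_fn = text_conditioner(['a dog chasing after a ball'])
```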

classifier_free_guidance_pytorch/classifier_free_guidance_pytorch.py

Lines changed: 13 additions & 13 deletions
@@ -271,7 +271,7 @@ def forward(
         hiddens,
         mask = None
     ):
-        return self.attn(hiddens, condition, mask = mask)
+        return self.attn(hiddens, condition, mask = mask) + hiddens

 # film text conditioning

@@ -440,7 +440,7 @@ def __init__(
         dim_latent = default(dim_latent, max([model.dim_latent for model in text_models]))

         for model in text_models:
-            self.to_latent_dims.append(nn.Linear(model.dim_latent, dim_latent) if model.dim_latent != dim_latent else nn.Identity())
+            self.to_latent_dims.append(nn.Linear(model.dim_latent, dim_latent))

         self.conditioners = nn.ModuleList([])

@@ -465,25 +465,25 @@ def embed_texts(self, texts: List[str]):
         device = self.device

         text_embeds = []
-        masks = []

         for text_model, to_latent in zip(self.text_models, self.to_latent_dims):
             text_embed = text_model.embed_text(texts, return_text_encodings = True)

             text_embed = text_embed.to(device)

             mask = (text_embed != 0).any(dim = -1)
-            mask = mask.to(device)

-            text_embeds.append(to_latent(text_embed))
-            masks.append(mask)
+            text_embed = to_latent(text_embed)
+            text_embed = text_embed.masked_fill(~mask[..., None], 0.)

-        return torch.cat(text_embeds, dim = -2), torch.cat(masks, dim = -1)
+            text_embeds.append(text_embed)
+
+        return torch.cat(text_embeds, dim = -2)

     def forward(
         self,
         texts: Optional[List[str]] = None,
-        text_embeds: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        text_embeds: Optional[torch.Tensor] = None,
         cond_drop_prob = None,
         repeat_batch = 1, # for robotic transformer edge case
     ) -> Tuple[Callable, ...]:
@@ -497,14 +497,14 @@ def forward(

         if exists(texts):
             batch = len(texts)
-        elif exists(text_embeds):
-            batch = text_embeds[0].shape[0]

-        if exists(text_embeds):
-            text_embeds, mask = text_embeds
+        elif exists(text_embeds):
+            batch = text_embeds.shape[0]

         if not exists(text_embeds):
-            text_embeds, mask = self.embed_texts(texts)
+            text_embeds = self.embed_texts(texts)
+
+        mask = (text_embeds != 0).any(dim = -1)

         if cond_drop_prob > 0.:
             prob_keep_mask = prob_mask_like((batch, 1), 1. - cond_drop_prob, device = self.device)
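The second half of the commit message (always project the text encodings) shows up in the `__init__` hunk above: the `nn.Identity()` shortcut is dropped, so every text model gets its own learned `nn.Linear`, even when its latent dimension already matches. A hedged sketch of why that behaves like a model type embedding (module and variable names here are illustrative, not the library's internals):

```python
import torch
from torch import nn

dim_latent = 512
model_dim_latents = [512, 768]   # e.g. two text encoders, one already at dim_latent

# before: models whose dim_latent matched dim_latent passed through nn.Identity()
# after: each model always gets its own projection, so even identically sized
# encoders are mapped through distinct learned transforms - effectively tagging
# each encoding with which text model produced it
to_latent_dims = nn.ModuleList([
    nn.Linear(d, dim_latent) for d in model_dim_latents
])

encodings = [torch.randn(2, 6, d) for d in model_dim_latents]
projected = [proj(enc) for proj, enc in zip(to_latent_dims, encodings)]

# concatenated along the sequence dimension, as embed_texts does above;
# the mask is rederived later from the all-zeros padding convention
text_embeds = torch.cat(projected, dim = -2)
mask = (text_embeds != 0).any(dim = -1)
```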

setup.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
   name = 'classifier-free-guidance-pytorch',
   packages = find_packages(exclude=[]),
   include_package_data = True,
-  version = '0.1.0',
+  version = '0.1.2',
   license='MIT',
   description = 'Classifier Free Guidance - Pytorch',
   author = 'Phil Wang',
