@@ -30,7 +30,6 @@
 from transformers.testing_utils import (
     cleanup,
     require_flash_attn,
-    require_read_token,
     require_torch,
     require_torch_gpu,
     slow,
@@ -664,3 +663,257 @@ def test_generation_beyond_sliding_window_with_generation_config(self):
         model.generation_config.transformers_version = "4.49.0"
         with self.assertRaises(RuntimeError):  # errors out because it is not using hybrid cache
             out = model.generate(**inputs, generation_config=generation_config)
+
+    def test_export_text_only_with_hybrid_cache(self):
+        from transformers import HybridCache
+
+        class Gemma3ExportableModule(torch.nn.Module):
+            """
+            A wrapper module designed to make Gemma3 models exportable with `torch.export`,
+            specifically for use with HybridCache to support interleaved global and local attention.
+
+            This wrapper ensures that the exported model is compatible with further lowering
+            and execution in frameworks like ExecuTorch.
+            """
+
+            def __init__(self, model: PreTrainedModel, max_batch_size: int = 1, max_seq_len: int = 4096):
+                """
+                Initializes the wrapper module with the Gemma3 model.
+
+                Args:
+                    model (`PreTrainedModel`): The Gemma3 model to wrap.
+                    max_batch_size (`int`): Maximum batch size for the cache.
+                    max_seq_len (`int`): Maximum sequence length for the cache.
+
+                Raises:
+                    AssertionError: If the model doesn't have the expected configuration for HybridCache.
+                """
+                super().__init__()
+                self.model = model
+                self.config = model.config
+
+                # Verify the model is configured for HybridCache
+                assert self.config.cache_implementation == "hybrid", "Model must use 'hybrid' cache implementation"
+
+                # Verify sliding window configuration for local attention
+                assert hasattr(self.config, "sliding_window"), "Model config must have sliding_window attribute"
+                assert hasattr(self.config, "sliding_window_pattern"), (
+                    "Model config must have sliding_window_pattern attribute"
+                )
+
+                # Initialize the HybridCache
+                self.cache = HybridCache(
+                    config=self.config,
+                    max_batch_size=max_batch_size,
+                    max_cache_len=max_seq_len,
+                    device=model.device,
+                    dtype=model.dtype,
+                )
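+                # Note: HybridCache allocates static-shaped key/value buffers up front
+                # (max_cache_len for global-attention layers, sliding_window for local ones),
+                # so the cache shapes stay fixed, which is what makes the graph exportable.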
+
+                # Register buffers for tracking state
+                self.register_buffer("last_position", torch.tensor([-1], dtype=torch.long))
+
+                # Store the sliding window pattern for reference
+                self.sliding_window = self.config.sliding_window
+                self.sliding_window_pattern = self.config.sliding_window_pattern
+
+                # Determine which layers use global vs local attention
+                # In Gemma3, every `sliding_window_pattern`-th layer (by default every 6th) uses global attention
+                self.global_attention_layers = [
+                    i
+                    for i in range(self.config.num_hidden_layers)
+                    if i % self.sliding_window_pattern == (self.sliding_window_pattern - 1)
+                ]
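+                # Note: this list is informational only; it is not read by forward(), since
+                # HybridCache decides per layer whether the sliding-window or the global cache is used.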
+
+            def forward(
+                self,
+                input_ids: torch.Tensor,
+                cache_position: torch.Tensor,
+            ) -> torch.Tensor:
+                """
+                Forward pass of the module, compatible with torch.export.
+
+                Args:
+                    input_ids (`torch.Tensor`): Tensor representing current input token id(s).
+                    cache_position (`torch.Tensor`): Tensor representing current position(s) in the cache.
+
+                Returns:
+                    `torch.Tensor`: Logits output from the model.
+                """
+                batch_size, seq_len = input_ids.shape
+
+                # Update the last_position for tracking
+                self.last_position = cache_position[-1].unsqueeze(0)
+
+                # Generate position_ids from cache_position
+                position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)
+
+                # Create attention mask (always ones for token-by-token generation)
+                attention_mask = torch.ones((batch_size, seq_len), dtype=torch.long, device=input_ids.device)
+
+                # Forward pass with the model
+                outputs = self.model(
+                    input_ids=input_ids,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_values=self.cache,
+                    use_cache=True,
+                    cache_position=cache_position,
+                )
+
+                # Return only the logits to simplify the export
+                return outputs.logits
+
+            @staticmethod
+            def generate(
+                exported_model: torch.export.ExportedProgram,
+                tokenizer,
+                prompt: str,
+                max_new_tokens: int = 20,
+                do_sample: bool = False,
+                temperature: float = 1.0,
+                top_k: int = 50,
+                top_p: float = 1.0,
+                device: str = "cpu",
+            ) -> str:
+                """
+                Generate text using an exported Gemma3 model.
+
+                Args:
+                    exported_model (`torch.export.ExportedProgram`): The exported model to generate with.
+                    tokenizer: The tokenizer to use.
+                    prompt (`str`): The input prompt.
+                    max_new_tokens (`int`): Maximum number of new tokens to generate.
+                    do_sample (`bool`): Whether to use sampling or greedy decoding.
+                    temperature (`float`): The temperature for sampling.
+                    top_k (`int`): The number of highest-probability tokens to keep for top-k sampling.
+                    top_p (`float`): The cumulative probability threshold for nucleus sampling.
+                    device (`str`): The device to use.
+
+                Returns:
+                    `str`: The generated text.
+                """
+                # Get the module from the exported program
+                exported_module = exported_model.module()
+
+                # Tokenize the prompt
+                input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
+
+                # Initialize with the prompt
+                generated_ids = input_ids.clone()
+
+                # Process the prompt tokens first
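+                # (one at a time: the wrapper is exported further below with a fixed (1, 1) input
+                # shape, so prefill has to feed the prompt token by token)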
+                curr_position = 0
+                for i in range(input_ids.shape[1]):
+                    # Process one token at a time
+                    curr_input_ids = input_ids[:, i : i + 1]
+                    curr_cache_position = torch.tensor([curr_position], dtype=torch.long, device=device)
+
+                    # Forward pass
+                    _ = exported_module(curr_input_ids, curr_cache_position)
+                    curr_position += 1
+
+                # Generate new tokens
+                for _ in range(max_new_tokens):
+                    # Get the last token as input
+                    curr_input_ids = generated_ids[:, -1:]
+                    curr_cache_position = torch.tensor([curr_position], dtype=torch.long, device=device)
+
+                    # Forward pass to get next token logits
+                    outputs = exported_module(curr_input_ids, curr_cache_position)
+
+                    # Get the next token ID
+                    if do_sample:
+                        # Keep only the logits for the last position so that the filtering and
+                        # sampling below operate on a 2D (batch, vocab) tensor
+                        logits = outputs[:, -1, :]
+
+                        # Apply temperature
+                        if temperature > 0:
+                            logits = logits / temperature
+
+                        # Apply top-k filtering
+                        if top_k > 0:
+                            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+                            logits[indices_to_remove] = float("-inf")
+
+                        # Apply top-p (nucleus) filtering
+                        if top_p < 1.0:
+                            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+                            cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+
+                            # Remove tokens with cumulative probability above the threshold
+                            sorted_indices_to_remove = cumulative_probs > top_p
+                            # Shift the indices to the right to also keep the first token above the threshold
+                            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+                            sorted_indices_to_remove[..., 0] = 0
+
+                            # Scatter the sorted mask back to the original indexing
+                            indices_to_remove = sorted_indices_to_remove.scatter(
+                                -1, sorted_indices, sorted_indices_to_remove
+                            )
+                            logits[indices_to_remove] = float("-inf")
+
+                        # Sample from the filtered distribution
+                        probs = torch.softmax(logits, dim=-1)
+                        next_token_id = torch.multinomial(probs, num_samples=1)
+                    else:
+                        # Greedy decoding
+                        next_token_id = outputs.argmax(dim=-1, keepdim=True)
+
+                    # Ensure next_token_id has the right shape before concatenation
+                    if next_token_id.dim() > 2:
+                        next_token_id = next_token_id.squeeze(-1)
+
+                    # Append to the generated sequence
+                    generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)
+                    curr_position += 1
+
+                    # Stop if we generate an EOS token
+                    if next_token_id.item() == tokenizer.eos_token_id:
+                        break
+
+                # Decode the generated text
+                return tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+
+        model_id = "google/gemma-3-1b-it"
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        model = AutoModelForCausalLM.from_pretrained(model_id)
+        model.eval()
+        print(f"Model config: {model.config}")
+
+        # Create a wrapper for export with static batch size
+        wrapper = Gemma3ExportableModule(model)
+
+        # Prepare example inputs
+        example_input_ids = torch.tensor([[1]], dtype=torch.long)
+        example_cache_position = torch.tensor([0], dtype=torch.long)
+
+        # Export the model with static shapes
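+        # (strict=False selects non-strict export, i.e. tracing with the Python interpreter rather
+        # than TorchDynamo; presumably needed here because the wrapper mutates the cache inside forward)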
+        exported_program = torch.export.export(
+            wrapper,
+            (example_input_ids, example_cache_position),
+            strict=False,
+        )
+        print(f"Exported program: {exported_program}")
+
+        # Test generation with the exported model
+        # prompt = "What is the capital of France?"
+        prompt = "Write a poem about Machine Learning."
+        max_new_tokens_to_generate = 100
+        # Generate text with the exported model
+        export_generated_text = Gemma3ExportableModule.generate(
+            exported_program, tokenizer, prompt, max_new_tokens=max_new_tokens_to_generate
+        )
+        print(f"Export generated text: '{export_generated_text}'")
+
+        input_text = tokenizer(prompt, return_tensors="pt")
+        with torch.no_grad():
+            eager_outputs = model.generate(
+                **input_text,
+                max_new_tokens=max_new_tokens_to_generate,
+                do_sample=False,  # Use greedy decoding to match the exported model
+            )
+
+        eager_generated_text = tokenizer.decode(eager_outputs[0], skip_special_tokens=True)
+        print(f"Eager generated text: '{eager_generated_text}'")
+
+        self.assertEqual(export_generated_text, eager_generated_text)