Commit 0306e36

reset attention_factor to old behaviour
1 parent e0a7c80 commit 0306e36

2 files changed: +7 −4 lines changed

fast_llm/layers/transformer/rotary/config.py

Lines changed: 3 additions & 3 deletions

@@ -127,9 +127,9 @@ class YarnRotaryConfig(DefaultRotaryConfig):
     original_context_length: int = Field(default=8192, hint=FieldHint.feature)
 
     def _validate(self) -> None:
-        if self.attention_factor is None:
-            # with self._set_implicit_default():
-            self.attention_factor = 0.1 * math.log(self.scale_factor) + 1.0
+        # if self.attention_factor is None:
+        #     # with self._set_implicit_default():
+        #     self.attention_factor = 0.1 * math.log(self.scale_factor) + 1.0
         super()._validate()
 
     def _get_configurable_class(self) -> "type[YarnRotary]":

fast_llm/layers/transformer/rotary/rotary.py

Lines changed: 4 additions & 1 deletion

@@ -181,7 +181,10 @@ class YarnRotary[ConfigType: YarnRotaryConfig](DefaultRotary[YarnRotaryConfig]):
     """
 
     def _get_frequencies(self, sequence_length: int, kv_channels: int, device="cuda") -> torch.Tensor:
-        return super()._get_frequencies(sequence_length, kv_channels, device) * self._config.attention_factor
+        attention_factor = self._config.attention_factor
+        if attention_factor is None:
+            attention_factor = 0.1 * math.log(self._config.scale_factor) + 1.0
+        return super()._get_frequencies(sequence_length, kv_channels, device) * attention_factor
 
     def _get_angle_scales(self, kv_channels: int, device="cuda") -> torch.Tensor:
         scales = super()._get_angle_scales(kv_channels, device)
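
The net effect of the commit is that the YaRN attention-factor default (0.1 * ln(scale_factor) + 1.0) is no longer written back into the config during validation; it is resolved lazily in _get_frequencies whenever attention_factor is left unset. A minimal standalone sketch of that fallback, assuming a hypothetical scale_factor of 16.0 and a helper name (resolve_attention_factor) that is not part of the repository:

import math

def resolve_attention_factor(attention_factor: float | None, scale_factor: float) -> float:
    # YaRN default when no explicit factor is configured: 0.1 * ln(scale_factor) + 1.0.
    if attention_factor is None:
        return 0.1 * math.log(scale_factor) + 1.0
    return attention_factor

# Example (hypothetical values): with scale_factor=16.0 and no override,
# the factor is 0.1 * ln(16) + 1.0 ≈ 1.277; an explicit value passes through unchanged.
print(resolve_attention_factor(None, 16.0))
print(resolve_attention_factor(1.0, 16.0))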
