@@ -37,7 +37,67 @@ def apply_llama3_scaling(config: RotaryConfig, frequencies: torch.Tensor) -> tor
                config.high_frequency_factor - config.low_frequency_factor
            )
            new_frequencies.append((1 - smooth) * frequency / config.scale_factor + smooth * frequency)
-    return torch.tensor(new_frequencies, dtype=frequencies.dtype, device=frequencies.device)
+    # Llama3 scaling only rescales frequencies; return a neutral attention factor of 1.0 so all
+    # scaling types share the same return contract.
+    return torch.tensor(new_frequencies, dtype=frequencies.dtype, device=frequencies.device), 1.0
+
+
+def apply_yarn_scaling(
+    config: RotaryConfig, frequencies: torch.Tensor, kv_channels: int, sequence_length: int
+) -> tuple[torch.Tensor, float]:
+    """
+    YaRN scaling:
+    https://github.com/huggingface/transformers/blob/006d9249ec0270ff6c4d3840979d23fe94bdc763/src/transformers/modeling_rope_utils.py#L163
+    [original paper](https://arxiv.org/abs/2309.00071)
+    Returns the scaled inverse frequencies together with the attention scaling factor.
+    """
+    base = config.theta
+    partial_rotary_factor = 1.0
+    dim = int(kv_channels * partial_rotary_factor)
+    max_position_embeddings = sequence_length
+    factor = config.scale_factor
+
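+    # YaRN also rescales the attention logits (the "temperature" t in the paper); when no explicit
+    # factor is configured, fall back to the paper's recommended default sqrt(1/t) = 0.1 * ln(s) + 1.0,
+    # where s is the context scale factor.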
+    attention_factor = config.attention_factor
+    if attention_factor is None:
+        attention_factor = 0.1 * math.log(factor) + 1.0
+
+    # Compute the inverse frequencies
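+    # Derivation note: rotary dimension pair d has inverse frequency base**(-2d / dim), so it completes
+    # max_position_embeddings / (2 * pi * base**(2d / dim)) full rotations over the context; solving that
+    # expression for d at a target rotation count gives the closed form below.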
+    def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
+        """Inverse dimension formula to find the dimension based on the number of rotations"""
+        return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
+
+    def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
+        """Find dimension range bounds based on rotations"""
+        low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
+        high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
+        return max(low, 0), min(high, dim - 1)
+
+    def linear_ramp_factor(min, max, dim):
+        if min == max:
+            max += 0.001  # Prevent singularity
+
+        linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
+        ramp_func = torch.clamp(linear_func, 0, 1)
+        return ramp_func
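+    # The ramp is 0 for dimensions below `min`, 1 above `max`, and linear in between; it becomes the
+    # per-dimension blend weight between interpolated and extrapolated frequencies below.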
+
+
+    # Note on variable naming: "interpolation" comes from the original technique, where we interpolate
+    # the position IDs to expand the possible context length. In other words, interpolation = apply
+    # the scaling factor.
+    # pos_freqs = base ** (torch.arange(0, dim, 2).float().to(frequencies.device) / dim)
+    # inv_freq_extrapolation = 1.0 / pos_freqs
+    # inv_freq_interpolation = 1.0 / (factor * pos_freqs)
+
+    inv_freq_extrapolation = frequencies
+    inv_freq_interpolation = frequencies / factor
+
+    # TODO: max_position_embeddings or original_context_length?
+    # see https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/modeling_deepseek.py#L304
+    low, high = find_correction_range(config.beta_fast, config.beta_slow, dim, base, config.original_context_length)
+
+    # Get n-dimensional rotational scaling corrected for extrapolation
+    inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(frequencies.device)
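+    # This factor is 1 for high-frequency dimensions (below `low`), which keep their original
+    # frequencies (extrapolation), and 0 for low-frequency dimensions (above `high`), which are
+    # fully interpolated, i.e. divided by the scale factor.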
+    inv_freq = (
+        inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
+        + inv_freq_extrapolation * inv_freq_extrapolation_factor
+    )
+
+    return inv_freq, attention_factor
+

def get_rotary_frequencies(
@@ -56,13 +116,19 @@ def get_rotary_frequencies(
    frequencies = config.theta ** -torch.arange(0, 1, 2 / kv_channels, device=device, dtype=torch.float64)
    # Apply scaling
    if config.type == RotaryEmbeddingType.llama3:
-        frequencies = apply_llama3_scaling(config, frequencies)
+        frequencies, attention_scaling = apply_llama3_scaling(config, frequencies)
+    elif config.type == RotaryEmbeddingType.yarn:
+        frequencies, attention_scaling = apply_yarn_scaling(config, frequencies, kv_channels, sequence_length)
+    else:
+        attention_scaling = 1.0
    angles = torch.outer(positions, frequencies)
    frequencies = torch.polar(torch.ones_like(angles), angles)[None, :, None, :].to(torch.complex64)
    if not config.complex_format:
        frequencies = convert_rotary_complex_to_real(
            torch.view_as_real(frequencies).flatten(-2), kv_channels, 3
        ).contiguous()
+    # Advanced RoPE types like YaRN apply a post-processing scaling factor, equivalent to scaling attention.
+    frequencies = frequencies * attention_scaling
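+    # Multiplying the rotation table (complex or real format) scales its cos/sin components, matching
+    # how Hugging Face applies `attention_scaling` to cos/sin in modeling_rope_utils.py.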
    return frequencies
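For reference, a minimal standalone sketch of the YaRN path added above, with hypothetical values standing in for the `RotaryConfig` fields (`theta`, `scale_factor`, `beta_fast`, `beta_slow`, `original_context_length`); this is not the repo's API, just the same arithmetic end to end:

```python
import math

import torch

# Hypothetical config values for illustration only.
theta, scale_factor, beta_fast, beta_slow = 10000.0, 4.0, 32.0, 1.0
kv_channels, original_context_length = 128, 8192

dim = kv_channels
# Inverse frequencies theta**(-2d / dim), as in get_rotary_frequencies.
frequencies = theta ** -torch.arange(0, 1, 2 / kv_channels, dtype=torch.float64)

def find_correction_dim(num_rotations: float) -> float:
    # Dimension at which a pair completes `num_rotations` turns over the original context.
    return (dim * math.log(original_context_length / (num_rotations * 2 * math.pi))) / (2 * math.log(theta))

low = max(math.floor(find_correction_dim(beta_fast)), 0)
high = min(math.ceil(find_correction_dim(beta_slow)), dim - 1)

# Blend weight: keep original frequencies below `low`, interpolate above `high`.
ramp = torch.clamp((torch.arange(dim // 2, dtype=torch.float32) - low) / (high - low), 0, 1)
extrapolation_factor = 1 - ramp
inv_freq = (frequencies / scale_factor) * (1 - extrapolation_factor) + frequencies * extrapolation_factor
attention_factor = 0.1 * math.log(scale_factor) + 1.0

# High-frequency dims are unchanged; low-frequency dims are divided by the scale factor.
print(inv_freq[:4], inv_freq[-4:], attention_factor)
```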