diff --git a/fast_llm/layers/block/config.py b/fast_llm/layers/block/config.py
index 680a122e..cad9133f 100644
--- a/fast_llm/layers/block/config.py
+++ b/fast_llm/layers/block/config.py
@@ -1,4 +1,6 @@
+import abc
 import enum
+import functools
 import typing

 from fast_llm.config import Field, FieldHint, check_field, config_class
@@ -9,7 +11,8 @@ from fast_llm.utils import Assert

 if typing.TYPE_CHECKING:
-    from fast_llm.layers.block.block import BlockLayer
+    from fast_llm.layers.block.block import Block, BlockLayer
+


 # TODO: Generalize these beyond language models? (Ex. vision)
@@ -156,6 +159,43 @@ class BlockConfig(BaseModelConfig):
         hint=FieldHint.architecture,
     )

+    block_sequence: "BlockSequenceConfig" = Field(init=False)
+
+    def _validate(self) -> None:
+        assert hasattr(self, "block_sequence")
+        Assert.incl(self, self.block_sequence.blocks.values())
+        self.mixer.block = self
+        self.mlp.block = self
+        super()._validate()
+
+    def setup_tensor_space(self, tensor_space: TensorSpace) -> None:
+        self.mlp.setup_tensor_space(tensor_space)
+        self.mixer.setup_tensor_space(tensor_space)
+
+        # Hidden dimension
+        tensor_space.add_tensor_dim(TensorDim(BlockDimNames.hidden, self.block_sequence.hidden_size))
+
+    @abc.abstractmethod
+    def get_block(self) -> "Block":
+        pass
+
+
+@config_class()
+class BlockSequenceConfig(BaseModelConfig):
+    _abstract = True
+
+    blocks: dict[str, BlockConfig] = Field()
+    block_pattern: tuple[str, ...] = Field(
+        default=None,
+        desc="The pattern of blocks (referred to by name) to use. The sequence is repeated until reaching `num_blocks`."
+        " Default: cycle over `blocks` in the order they are defined.",
+    )
+    default_block: str = Field(
+        default=None,
+        desc="The default block configuration to use when referring to the model."
+        " Used to set some defaults in the language model.",
+    )
+
     # TODO: Move these, not specific to a single block.
     num_blocks: int = Field(
         default=12,
@@ -174,15 +214,23 @@ class BlockConfig(BaseModelConfig):
         desc="Store the residuals for the transformer in full precision (`optimization_dtype`).",
         hint=FieldHint.stability,
     )
-    per_layer_lr_scale: list[float] | None = Field(
-        default=None,
-        desc="Custom learning rate scale for each layer.",
-        doc="May be used to freeze some layers by setting their scale to zero.",
-        hint=FieldHint.feature,
-    )
+
+    def _validate(self) -> None:
+        for block in self.blocks.values():
+            block.validate()
+        if self.block_pattern is None:
+            self.block_pattern = tuple(self.blocks)
+        if self.default_block is None:
+            self.default_block = self.block_pattern[0]
+        super()._validate()
+
+    def get_block_config(self, block_index: int) -> BlockConfig:
+        return self.blocks[self.block_pattern[block_index % len(self.block_pattern)]]

     def setup_tensor_space(self, tensor_space: TensorSpace) -> None:
-        super().setup_tensor_space(tensor_space)
+        for block in self.blocks.values():
+            block.setup_tensor_space(tensor_space)

-        # Hidden dimension
-        tensor_space.add_tensor_dim(TensorDim(BlockDimNames.hidden, self.hidden_size))
+    @functools.cached_property
+    def default_block_config(self) -> BlockConfig:
+        return self.blocks[self.default_block]
diff --git a/fast_llm/layers/block/mlp/config.py b/fast_llm/layers/block/mlp/config.py
index bde775a2..d3393910 100644
--- a/fast_llm/layers/block/mlp/config.py
+++ b/fast_llm/layers/block/mlp/config.py
@@ -181,7 +181,7 @@ def _validate(self) -> None:
                 self.activation_type = ActivationType.silu if self.gated else ActivationType.gelu
             # TODO: `hidden_size` not yet validated.
             if self.ffn_hidden_size is None:
-                self.ffn_hidden_size = 4 * self.block.hidden_size
+                self.ffn_hidden_size = 4 * self.block.block_sequence.hidden_size

         self.num_unshared_experts = self.num_experts - self.num_shared_experts
@@ -206,7 +206,7 @@ def layer_1_weight_initialization_method(self) -> Initializer:
         if self.layer_1_weight_initialization.has_initialization:
             return self.layer_1_weight_initialization.get_initializer()
         else:
-            return init_normal_(0, self.block.hidden_size**-0.5)
+            return init_normal_(0, self.block.block_sequence.hidden_size**-0.5)

     @functools.cached_property
     def layer_1_bias_initialization_method(self) -> Initializer:
@@ -220,7 +220,9 @@ def layer_2_weight_initialization_method(self) -> Initializer:
         if self.layer_2_weight_initialization.has_initialization:
             return self.layer_2_weight_initialization.get_initializer()
         else:
-            return init_normal_(0, self.block.hidden_size**-0.5 / max(2 * self.block.num_blocks, 1))
+            return init_normal_(
+                0, self.block.block_sequence.hidden_size**-0.5 / max(2 * self.block.block_sequence.num_blocks, 1)
+            )

     @functools.cached_property
     def layer_2_bias_initialization_method(self) -> Initializer:
diff --git a/fast_llm/layers/block/mlp/mlp.py b/fast_llm/layers/block/mlp/mlp.py
index 0716bf77..43813d86 100644
--- a/fast_llm/layers/block/mlp/mlp.py
+++ b/fast_llm/layers/block/mlp/mlp.py
@@ -6,7 +6,7 @@ from fast_llm.functional.config import TritonConfig
 from fast_llm.functional.triton.mlp import mlp_autograd, torch_mlp_activation, triton_mlp_activation_autograd
 from fast_llm.layers.block.block import BlockLayer
-from fast_llm.layers.block.config import BlockConfig, BlockDimNames
+from fast_llm.layers.block.config import BlockDimNames
 from fast_llm.layers.block.mlp.config import MLPConfig, MLPDimNames
 from fast_llm.layers.block.peft import TransformerSubLayerName
 from fast_llm.layers.common.linear import LinearBase
@@ -14,14 +14,18 @@


 class MLPBase[ConfigType: MLPConfig](BlockLayer[ConfigType]):
-    def __init__(self, config: BlockConfig, tensor_space: TensorSpace, block_index: int, name: str):
+    def __init__(self, config: ConfigType, tensor_space: TensorSpace, block_index: int, name: str):
         super().__init__(config, tensor_space, block_index, name)

         hidden_dim = self._tensor_space[BlockDimNames.hidden]
         self._intermediate_dim = self._tensor_space[MLPDimNames.composite_expert_mlp]
         self._activation_fn = triton_mlp_activation_autograd if TritonConfig.TRITON_ENABLED else torch_mlp_activation

-        layer_lr_scale = self._config.per_layer_lr_scale[block_index] if self._config.per_layer_lr_scale else None
+        layer_lr_scale = (
+            self._config.block.block_sequence.per_layer_lr_scale[self._block_index]
+            if self._config.block.block_sequence.per_layer_lr_scale
+            else None
+        )
         lr_scale = (
             tuple(self._config.mlp_lr_scale)
             if isinstance(self._config.mlp_lr_scale, list)
@@ -50,8 +54,8 @@ def __init__(self, config: BlockConfig, tensor_space: TensorSpace, block_index:
         )

         # PEFT.
-        self.layer_1 = self._config.peft.apply_linear(self.layer_1, TransformerSubLayerName.mlp_1)
-        self.layer_2 = self._config.peft.apply_linear(self.layer_2, TransformerSubLayerName.mlp_2)
+        self.layer_1 = self._config.block.peft.apply_linear(self.layer_1, TransformerSubLayerName.mlp_1)
+        self.layer_2 = self._config.block.peft.apply_linear(self.layer_2, TransformerSubLayerName.mlp_2)


 class MLP[ConfigType: MLPConfig](MLPBase[ConfigType]):
diff --git a/fast_llm/layers/language_model/config.py b/fast_llm/layers/language_model/config.py
index 943c64d0..6c9210f0 100644
--- a/fast_llm/layers/language_model/config.py
+++ b/fast_llm/layers/language_model/config.py
@@ -1,12 +1,11 @@
 import functools

 from fast_llm.config import Field, FieldHint, check_field, config_class, skip_valid_if_none
-from fast_llm.engine.base_model.config import BaseModelConfig
 from fast_llm.engine.config_utils.initialization import InitializationConfig, Initializer, init_normal_
 from fast_llm.engine.config_utils.tensor_space import TensorDim, TensorSpace
 from fast_llm.engine.distributed.config import DistributedDimNames
 from fast_llm.functional.config import CrossEntropyImpl, DistillationLossImpl
-from fast_llm.layers.block.config import BlockConfig, BlockDimNames, BlockKwargs
+from fast_llm.layers.block.config import BlockDimNames, BlockKwargs, BlockSequenceConfig
 from fast_llm.utils import Assert
@@ -45,10 +44,8 @@ class LanguageModelKwargs(BlockKwargs):


 @config_class()
-class LanguageModelBaseConfig(BaseModelConfig):
-    # TODO: block
-    transformer: BlockConfig = Field(
-        desc="Configuration for the transformer architecture.",
+class LanguageModelBaseConfig(BlockSequenceConfig):
+    decoder: BlockSequenceConfig = Field(
         hint=FieldHint.architecture,
     )
     vocab_size: int = Field(
@@ -57,6 +54,13 @@ class LanguageModelBaseConfig(BaseModelConfig):
         hint=FieldHint.architecture,
         valid=check_field(Assert.gt, 0),
     )
+    embedding_dropout: float = Field(
+        # TODO: backward compatibility?
+        default=0.0,
+        desc="Dropout applied to the embedding layer.",
+        hint=FieldHint.feature,
+        valid=check_field(Assert.geq, 0),
+    )
     absolute_position_embeddings: int | None = Field(
         # TODO: backward compatibility?
         default=None,
@@ -209,19 +213,14 @@ def _validate(self) -> None:
             Assert.eq(len(self.prediction_loss_coefficient), self.prediction_heads)
             for coeff in self.prediction_loss_coefficient:
                 Assert.geq(coeff, 0)
-        if self.transformer.per_layer_lr_scale is not None:
-            # -1 because the first prediction head's transformer layer is accounted for in num_layers
-            # +1 because the layer index starts at 1
-            Assert.eq(
-                len(self.transformer.per_layer_lr_scale), self.transformer.num_blocks + self.prediction_heads - 1 + 1
-            )
+
         if self.output_weight_initialization.has_initialization:
             assert self.use_absolute_position_embeddings
         if self.output_weight_initialization.has_initialization:
             assert not self.tie_word_embeddings

     def setup_tensor_space(self, tensor_space: TensorSpace) -> None:
-        self.transformer.setup_tensor_space(tensor_space)
+        super().setup_tensor_space(tensor_space)
         tensor = tensor_space.distributed_config.get_distributed_dim(DistributedDimNames.tensor)

         # Embedding dimensions
@@ -235,6 +234,7 @@ def setup_tensor_space(self, tensor_space: TensorSpace) -> None:

     @property
     def use_absolute_position_embeddings(self) -> int:
+        # TODO: Set through num embeddings instead.
         return self.absolute_position_embeddings is not None

     @functools.cached_property
@@ -242,18 +242,18 @@ def word_embedding_weight_initialization_method(self) -> Initializer:
         if self.word_embedding_weight_initialization.has_initialization:
             return self.word_embedding_weight_initialization.get_initializer()
         else:
-            return init_normal_(self.transformer.hidden_size**-0.5)
+            return init_normal_(self.hidden_size**-0.5)

     @functools.cached_property
     def position_embedding_weight_initialization_method(self) -> Initializer:
         if self.position_embedding_weight_initialization.has_initialization:
             return self.position_embedding_weight_initialization.get_initializer()
         else:
-            return init_normal_(self.transformer.hidden_size**-0.5)
+            return init_normal_(self.hidden_size**-0.5)

     @functools.cached_property
     def output_weight_initialization_method(self) -> Initializer:
         if self.output_weight_initialization.has_initialization:
             return self.output_weight_initialization.get_initializer()
         else:
-            return init_normal_(self.transformer.hidden_size**-0.5)
+            return init_normal_(self.hidden_size**-0.5)
diff --git a/fast_llm/layers/language_model/embedding.py b/fast_llm/layers/language_model/embedding.py
index d99144e4..b183220a 100644
--- a/fast_llm/layers/language_model/embedding.py
+++ b/fast_llm/layers/language_model/embedding.py
@@ -14,7 +14,7 @@
 WORD_EMBEDDINGS_WEIGHT = "word_embeddings_weight"


-class LanguageModelEmbedding[ConfigType: LanguageModelBaseConfig](Configurable[LanguageModelBaseConfig], Layer):
+class LanguageModelEmbedding[ConfigType: LanguageModelBaseConfig](Configurable[ConfigType], Layer):
     """
     A language model embedding layer.
     Consists of word embeddings (tensor-parallel or sequence-tensor-parallel),
@@ -44,6 +44,7 @@ def __init__(
         self._parallel_embeddings = (
             self._tensor_space.distributed_config.tensor_parallel > 1 and self._config.parallel_embeddings
         )
+
         hidden_dim = self._tensor_space[LanguageModelDimNames.hidden]
         vocab_dim = self._tensor_space[
             LanguageModelDimNames.vocab_tp if self._parallel_embeddings else LanguageModelDimNames.vocab
@@ -107,7 +108,7 @@ def _forward(self, input_: torch.Tensor, position_ids: torch.Tensor | None, mask
             if self._sequence_parallel
             else self._tensor_space.distributed.pp_generator
         ):
-            embeddings = torch.dropout(embeddings, self._config.transformer.hidden_dropout, self.training)
+            embeddings = torch.dropout(embeddings, self._config.embedding_dropout, self.training)
         return embeddings.to(dtype=self._residual_dtype)

     def forward(
diff --git a/fast_llm/layers/language_model/head.py b/fast_llm/layers/language_model/head.py
index fcd1fae2..5e88e801 100644
--- a/fast_llm/layers/language_model/head.py
+++ b/fast_llm/layers/language_model/head.py
@@ -39,14 +39,22 @@ class LanguageModelHead[ConfigType: LanguageModelBaseConfig](Configurable[Config

     config_class: typing.ClassVar[type[LanguageModelBaseConfig]] = LanguageModelBaseConfig

-    def __init__(self, config: ConfigType, tensor_space: TensorSpace, prediction_distance: int):
+    def __init__(
+        self,
+        config: ConfigType,
+        tensor_space: TensorSpace,
+        prediction_distance: int,
+    ):
         super().__init__(config)
+        # TODO: Avoid default_block_config?
         self._debug = DebugLayer(
             tensor_space,
             f"Language model head",
-            self._config.transformer.debug_transformer,
-            self._config.transformer.debug_transformer_memory,
+            self._config.default_block_config.debug_transformer,
+            self._config.default_block_config.debug_transformer_memory,
         )
+        self._tensor_space = tensor_space
+
         self._group_size = tensor_space.distributed_config.tensor_parallel
         self._sequence_parallel = tensor_space.distributed_config.sequence_tensor_parallel
         self._parallel_embeddings = (
@@ -67,7 +75,8 @@ def __init__(self, config: ConfigType, tensor_space: TensorSpace, prediction_dis
             else 1.0
         )
         self._loss_name = LanguageModelLossNames.multi_token_prediction_loss(prediction_distance)
-        self.final_norm = self._config.transformer.normalization.get_layer(hidden_dim)
+        # TODO: Avoid default_block_config?
+        self.final_norm = self._config.default_block_config.normalization.get_layer(hidden_dim)
         self._logits_scale_factor = self._config.logits_scale_factor
         self._language_model_loss_factor = self._config.language_model_loss_factor
         self._distillation_loss_factor = self._config.distillation_loss_factor
diff --git a/fast_llm/layers/ssm/discrete_mamba2.py b/fast_llm/layers/ssm/discrete_mamba2.py
index 61291f84..a471ed5a 100644
--- a/fast_llm/layers/ssm/discrete_mamba2.py
+++ b/fast_llm/layers/ssm/discrete_mamba2.py
@@ -46,13 +46,7 @@ def __init__(
         tensor_space: TensorSpace,
         block_config: BlockConfig,
     ):
-        super().__init__(
-            tensor_space,
-            block_index,
-            self._mixer_name,
-            debug_level=block_config.debug_transformer,
-            debug_memory=block_config.debug_transformer_memory,
-        )
+        super().__init__(tensor_space, block_index, debug_level=block_config.debug_transformer)
         self._config: SSMConfig = config
         layer_lr_scale = block_config.per_layer_lr_scale[block_index] if block_config.per_layer_lr_scale else None
         lr_scale = get_lr_scale(self._config.mamba_lr_scale, layer_lr_scale)
diff --git a/fast_llm/layers/ssm/mamba2.py b/fast_llm/layers/ssm/mamba2.py
index b6626e89..5c5d18df 100644
--- a/fast_llm/layers/ssm/mamba2.py
+++ b/fast_llm/layers/ssm/mamba2.py
@@ -57,13 +57,7 @@ def __init__(
         block_index: int,
         block_config: BlockConfig,
     ):
-        super().__init__(
-            tensor_space,
-            block_index,
-            self._mixer_name,
-            debug_level=block_config.debug_transformer,
-            debug_memory=block_config.debug_transformer_memory,
-        )
+        super().__init__(tensor_space, block_index, debug_level=block_config.debug_transformer)
         self._config: SSMConfig = config
         Assert.eq(self._config.activation_type, ActivationType.silu)
         layer_lr_scale: float | None = (
diff --git a/fast_llm/layers/ssm/mamba_layer.py b/fast_llm/layers/ssm/mamba_layer.py
index 0dcc29f0..2dd338ba 100644
--- a/fast_llm/layers/ssm/mamba_layer.py
+++ b/fast_llm/layers/ssm/mamba_layer.py
@@ -63,13 +63,7 @@ def __init__(
         tensor_space: TensorSpace,
         block_config: BlockConfig,
     ):
-        super().__init__(
-            tensor_space,
-            block_index,
-            self._mixer_name,
-            debug_level=block_config.debug_transformer,
-            debug_memory=block_config.debug_transformer_memory,
-        )
+        super().__init__(tensor_space, block_index, debug_level=block_config.debug_transformer)
         assert tensor_space.distributed_config.tensor_parallel == 1, "Tensor-parallel not supported for MambaLayer"
         self._config = config
         # TODO: It's not silu?
diff --git a/fast_llm/layers/transformer/attention.py b/fast_llm/layers/transformer/attention.py
index 0bea58d9..60ccfbbb 100644
--- a/fast_llm/layers/transformer/attention.py
+++ b/fast_llm/layers/transformer/attention.py
@@ -78,7 +78,7 @@ def __init__(self, config: ConfigType, tensor_space: TensorSpace, block_index: i
         self._local_head_groups = self._tensor_space[AttentionDimNames.head_groups].size
         self._local_heads_per_group = self._tensor_space[AttentionDimNames.group_heads].size
         self._local_heads = self._local_head_groups * self._local_heads_per_group
-        self._softmax_scale = self._kv_channels ** (-self._config.attention_softmax_scale_power)
+        self._softmax_scale: float = self._kv_channels ** (-self._config.attention_softmax_scale_power)

         hidden_dim = self._tensor_space[AttentionDimNames.hidden]
diff --git a/fast_llm/layers/transformer/config.py b/fast_llm/layers/transformer/config.py
index e8c319b0..93884de4 100644
--- a/fast_llm/layers/transformer/config.py
+++ b/fast_llm/layers/transformer/config.py
@@ -9,10 +9,13 @@
 from fast_llm.engine.config_utils.tensor_space import CompositeTensorDim, TensorDim, TensorSpace
 from fast_llm.engine.distributed.config import DistributedConfig, DistributedDimNames
 from fast_llm.functional.config import TritonConfig
-from fast_llm.layers.block.config import AddLinearBiasChoices, BlockConfig, BlockDimNames, BlockKwargs, MixerConfig
+from fast_llm.layers.block.config import AddLinearBiasChoices, BlockDimNames, BlockKwargs, MixerConfig
 from fast_llm.layers.transformer.rotary.config import RotaryConfig
 from fast_llm.utils import Assert, div

+if typing.TYPE_CHECKING:
+    from fast_llm.layers.transformer.attention import Attention
+
 logger = logging.getLogger(__name__)
@@ -47,9 +50,6 @@ class AttentionKwargs(BlockKwargs):
 class AttentionConfig(MixerConfig):
     _abstract = False

-    # Needed for backward compatibility. TODO: remove
-    module_name: typing.ClassVar[str] = "attn"
-
     # TODO: Review names
     rotary: RotaryConfig = Field(
         desc="Configuration for the rotary positional embeddings.",
@@ -106,6 +106,7 @@ class AttentionConfig(MixerConfig):
         " Under muP (if scaling number of heads instead of kv_channels): use 0.5.",
         valid=skip_valid_if_none(check_field(Assert.geq, 0)),
     )
+
     qkv_weight_initialization: InitializationConfig = Field(
         desc="Initialization configuration for the query, key and value layer weights."
         " Default: normal(std=hidden_size**-0.5)",
@@ -127,9 +128,9 @@ class AttentionConfig(MixerConfig):

     def _validate(self) -> None:
         with self._set_implicit_default():
-            # TODO: hidden_size not yet validated.
             if self.kv_channels is None:
-                self.kv_channels = div(self.block.hidden_size, self.num_attention_heads)
+                # TODO: hidden_size not yet validated.
+                self.kv_channels = div(self.block.block_sequence.hidden_size, self.num_attention_heads)

         super()._validate()
@@ -180,6 +181,9 @@ def setup_tensor_space(self, tensor_space: TensorSpace) -> None:
             CompositeTensorDim(AttentionDimNames.composite_dense, (head_groups, group_heads, kv_channels))
         )

+    def get_block(self) -> "Attention":
+        pass
+
     @functools.cached_property
     def add_qkv_bias(self) -> bool:
         if isinstance(self.block.add_linear_biases, bool):
@@ -197,7 +201,7 @@ def qkv_weight_initialization_method(self) -> Initializer:
         if self.qkv_weight_initialization.has_initialization:
             return self.qkv_weight_initialization.get_initializer()
         else:
-            return init_normal_(0, self.block.hidden_size**-0.5)
+            return init_normal_(0, self.block.block_sequence.hidden_size**-0.5)

     @functools.cached_property
     def qkv_bias_initialization_method(self) -> Initializer:
@@ -211,7 +215,9 @@ def dense_weight_initialization_method(self) -> Initializer:
         if self.dense_weight_initialization.has_initialization:
             return self.dense_weight_initialization.get_initializer()
         else:
-            return init_normal_(0, self.block.hidden_size**-0.5 / max(2 * self.block.num_blocks, 1))
+            return init_normal_(
+                0, self.block.block_sequence.hidden_size**-0.5 / max(2 * self.block.block_sequence.num_blocks, 1)
+            )

     @functools.cached_property
     def dense_bias_initialization_method(self) -> Initializer:
@@ -219,9 +225,3 @@ def dense_bias_initialization_method(self) -> Initializer:
             return self.dense_bias_initialization.get_initializer()
         else:
             return init_zeros_
-
-
-@config_class()
-# TODO: Remove
-class TransformerConfig(BlockConfig):
-    pass
diff --git a/fast_llm/models/gpt/conversion.py b/fast_llm/models/gpt/conversion.py
index 0ef970db..39821e87 100644
--- a/fast_llm/models/gpt/conversion.py
+++ b/fast_llm/models/gpt/conversion.py
@@ -241,13 +241,13 @@ def _create_transformer_layer_converters(
         converters += self._get_weight_and_bias_converters(
             f"{fast_llm_layer_name}.mlp.layer_1",
             (),
-            transformer_config.add_bias,
+            transformer_config.add_mlp_bias,
             cls=IgnoreExportWeightConverter,
         )
         converters += self._get_weight_and_bias_converters(
             f"{fast_llm_layer_name}.mlp.layer_2",
             (),
-            transformer_config.add_bias,
+            transformer_config.add_mlp_bias,
             cls=IgnoreExportWeightConverter,
         )
         converters += [IgnoreExportWeightConverter(f"{fast_llm_layer_name}.mlp.router.weight", ())]
@@ -344,12 +344,12 @@ def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[Weig
         transformer_config: TransformerConfig = self._model.config.base_model.transformer
         return [
             *self._get_weight_and_bias_converters(
-                f"{fast_llm_prefix}.mlp.layer_1", f"{hf_prefix}.mlp.c_fc", transformer_config.add_bias
+                f"{fast_llm_prefix}.mlp.layer_1", f"{hf_prefix}.mlp.c_fc", transformer_config.add_mlp_bias
             ),
             *self._get_weight_and_bias_converters(
                 f"{fast_llm_prefix}.mlp.layer_2",
                 f"{hf_prefix}.mlp.c_proj",
-                transformer_config.add_bias,
+                transformer_config.add_mlp_bias,
                 MLPLayer2Converter,
             ),
         ]
@@ -463,13 +463,13 @@ def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[Weig
             *self._get_weight_and_bias_converters(
                 f"{fast_llm_prefix}.mlp.layer_1",
                 (f"{hf_prefix}.mlp.gate_proj", f"{hf_prefix}.mlp.up_proj"),
-                transformer_config.add_bias,
+                transformer_config.add_mlp_bias,
                 SplitWeightConverter,
             ),
             *self._get_weight_and_bias_converters(
                 f"{fast_llm_prefix}.mlp.layer_2",
                 f"{hf_prefix}.mlp.down_proj",
-                transformer_config.add_bias,
+                transformer_config.add_mlp_bias,
                 MLPLayer2Converter,
             ),
         ]
@@ -531,13 +531,13 @@ def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[Weig
             *self._get_weight_and_bias_converters(
                 f"{fast_llm_prefix}.mlp.layer_1",
                 (f"{hf_prefix}.mlp.gate_proj", f"{hf_prefix}.mlp.up_proj"),
-                transformer_config.add_bias,
+                transformer_config.add_mlp_bias,
                 SplitWeightConverter,
             ),
             *self._get_weight_and_bias_converters(
                 f"{fast_llm_prefix}.mlp.layer_2",
                 f"{hf_prefix}.mlp.down_proj",
-                transformer_config.add_bias,
+                transformer_config.add_mlp_bias,
                 MLPLayer2Converter,
             ),
         ]
@@ -641,13 +641,13 @@ def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[Weig
             *self._get_weight_and_bias_converters(
                 f"{fast_llm_prefix}.mlp.layer_1",
                 (f"{hf_prefix}.mlp.gate_proj", f"{hf_prefix}.mlp.up_proj"),
-                transformer_config.add_bias,
+                transformer_config.add_mlp_bias,
                 SplitWeightConverter,
             ),
             *self._get_weight_and_bias_converters(
                 f"{fast_llm_prefix}.mlp.layer_2",
                 f"{hf_prefix}.mlp.down_proj",
-                transformer_config.add_bias,
+                transformer_config.add_mlp_bias,
                 MLPLayer2Converter,
             ),
         ]