1 | | -from typing import Any, Mapping, Optional, Set, Type, TypeVar |
| 1 | +from typing import Any, List, Mapping, Optional, Set, Type, TypeVar |
2 | 2 |
3 | 3 | import torch |
| 4 | +import torch.nn.functional as F |
4 | 5 | from torch import Tensor |
5 | | -from torch.nn import Linear |
| 6 | +from torch.nn import Embedding |
6 | 7 |
| 8 | +from ...layers.attention import AttentionMask |
| 9 | +from ...layers.cache import KeyValueCache |
7 | 10 | from ...quantization import Quantizable |
8 | 11 | from ..hf_hub import FromHFHub |
| 12 | +from ..output import CausalLMOutputWithCache |
9 | 13 | from ..transformer import TransformerCausalLM |
10 | 14 | from ._hf import convert_hf_config, convert_hf_state_dict |
11 | 15 | from .config import MPTConfig |
@@ -38,11 +42,45 @@ def __init__( |
38 | 42 | super().__init__() |
39 | 43 |
40 | 44 | self.decoder = MPTDecoder(config, device=device) |
41 | | - self.output_embeddings = Linear( |
42 | | - in_features=config.layer.feedforward.hidden_width, |
43 | | - out_features=config.embedding.n_pieces, |
44 | | - bias=False, |
45 | | - device=device, |
| 45 | + |
| 46 | + # Once we have proper support for tied weights, we will do something like: |
| 47 | + # |
| 48 | + # self.output_embeddings = Linear( |
| 49 | + # in_features=config.layer.feedforward.hidden_width, |
| 50 | + # out_features=config.embedding.n_pieces, |
| 51 | + # bias=False, |
| 52 | + # device=device, |
| 53 | + # ) |
| 54 | + # self.output_embeddings.weight = self.decoder.embeddings.piece_embeddings.weight |
| 55 | + # |
| 56 | + # For now we'll work around this by using the piece embeddings directly. |
| 57 | + |
| 58 | + def forward( |
| 59 | + self, |
| 60 | + piece_ids: Tensor, |
| 61 | + attention_mask: AttentionMask, |
| 62 | + cache: Optional[List[KeyValueCache]] = None, |
| 63 | + positions: Optional[Tensor] = None, |
| 64 | + store_cache: bool = False, |
| 65 | + ) -> CausalLMOutputWithCache[KeyValueCache]: |
| 66 | + # TODO: remove this forward method once we support weight tying. |
| 67 | + |
| 68 | + decoder_output = self.decoder( |
| 69 | + piece_ids, |
| 70 | + attention_mask, |
| 71 | + cache=cache, |
| 72 | + store_cache=store_cache, |
| 73 | + positions=positions, |
| 74 | + ) |
| 75 | + |
| 76 | + assert isinstance(self.decoder.embeddings.piece_embeddings, Embedding) |
| 77 | + output_embeddings = self.decoder.embeddings.piece_embeddings.weight |
| 78 | + |
| 79 | + logits = F.linear(decoder_output.last_hidden_layer_state, output_embeddings) |
| 80 | + return CausalLMOutputWithCache( |
| 81 | + all_outputs=decoder_output.all_outputs, |
| 82 | + cache=decoder_output.cache, |
| 83 | + logits=logits, |
46 | 84 | ) |
47 | 85 |
48 | 86 | @classmethod |
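
The change drops the separate `Linear` output layer and instead projects the decoder's hidden states against the piece embedding matrix, which is exactly what a tied output layer computes. Below is a minimal sketch of that equivalence in plain PyTorch; the sizes and variable names (`hidden_width`, `n_pieces`, `hidden`) are illustrative, not values taken from `MPTConfig`.

```python
import torch
import torch.nn.functional as F
from torch import nn

# Illustrative sizes; the real values come from the model config.
hidden_width, n_pieces = 16, 32

embedding = nn.Embedding(n_pieces, hidden_width)
hidden = torch.randn(2, 5, hidden_width)  # (batch, seq, hidden_width)

# Workaround used in forward(): reuse the embedding matrix as the output
# projection, producing logits over the vocabulary.
logits = F.linear(hidden, embedding.weight)  # (batch, seq, n_pieces)

# Equivalent tied Linear layer, i.e. what the commented-out code intends
# once weight tying is supported.
output = nn.Linear(hidden_width, n_pieces, bias=False)
output.weight = embedding.weight  # share the same Parameter
assert torch.allclose(logits, output(hidden))
```

Because both formulations use the same `Parameter`, gradients from the output projection flow into the embedding table, which is the point of weight tying.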