
Commit b4d822a

Force use_cache=True in config only (#497)
This reverts part of #496 and instead overrides `use_cache` in `LlamaConfig`s only, so the correct value is visible to HF `.generate()` as well.
1 parent abd5477 commit b4d822a

File tree: 3 files changed (+5 -2 lines)

src/petals/models/bloom/model.py

Lines changed: 2 additions & 1 deletion

@@ -43,7 +43,7 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         head_mask: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,  # Not used here but needed for HF Transformers compatibility
+        use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
@@ -63,6 +63,7 @@ def forward(
             attention_mask is None or (attention_mask == 1).all()
         ), f"Custom attention masks are not supported, {attention_mask=}"
         assert head_mask is None, f"Custom head masks are not supported, {head_mask=}"
+        assert use_cache is None or use_cache, f"{use_cache=} is not supported"
         assert not output_attentions, f"{output_attentions=} is not supported"
         assert not output_hidden_states, f"{output_hidden_states=} is not supported"
         assert return_dict is None or return_dict, f"{return_dict=} is not supported"

src/petals/models/llama/config.py

Lines changed: 1 addition & 0 deletions

@@ -43,4 +43,5 @@ def from_pretrained(
         result = super().from_pretrained(model_name_or_path, *args, dht_prefix=dht_prefix, **kwargs)
         config = result[0] if isinstance(result, tuple) else result
         config.pretraining_tp = 1  # This may give less accurate results but it doesn't matter if we use quantization
+        config.use_cache = True  # use_cache=False leads to identical results but is slower and not supported by Petals
         return result
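
Overriding the value at the config level is what makes it visible outside Petals' own forward pass: anything that consults the config afterwards, including HF Transformers' `.generate()`, now sees `use_cache=True`. Below is a minimal sketch of the same pattern in isolation; `ForcedCacheLlamaConfig` is an illustrative stand-in, not the actual Petals config class.

```python
# Minimal sketch of the pattern used in src/petals/models/llama/config.py.
# ForcedCacheLlamaConfig is a hypothetical stand-in for the real Petals config class.
from transformers import LlamaConfig


class ForcedCacheLlamaConfig(LlamaConfig):
    @classmethod
    def from_pretrained(cls, model_name_or_path, *args, **kwargs):
        result = super().from_pretrained(model_name_or_path, *args, **kwargs)
        # from_pretrained may return (config, unused_kwargs) when return_unused_kwargs=True
        config = result[0] if isinstance(result, tuple) else result
        config.use_cache = True  # use_cache=False is slower and not supported by Petals
        return result
```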

src/petals/models/llama/model.py

Lines changed: 2 additions & 1 deletion

@@ -43,7 +43,7 @@ def forward(
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[RemotePastKeyValues] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,  # Not used here but needed for HF Transformers compatibility
+        use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
@@ -65,6 +65,7 @@ def forward(
         assert (
             position_ids is None or (position_ids[:, 1:] - position_ids[:, :-1] == 1).all()
         ), f"Non-consecutive position_ids are not supported, {position_ids=}"
+        assert use_cache is None or use_cache, f"{use_cache=} is not supported"
         assert not output_attentions, f"{output_attentions=} is not supported"
         assert not output_hidden_states, f"{output_hidden_states=} is not supported"
         assert return_dict is None or return_dict, f"{return_dict=} is not supported"
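
The assertion added to both models' `forward()` accepts `use_cache=None` (defer to the config, which is now forced to True) or `use_cache=True`, and rejects an explicit `use_cache=False`. A standalone illustration of the guard, not the actual Petals method:

```python
# Standalone illustration of the use_cache guard added to forward().
from typing import Optional


def check_use_cache(use_cache: Optional[bool]) -> None:
    # None means "defer to config.use_cache", which Petals forces to True.
    assert use_cache is None or use_cache, f"{use_cache=} is not supported"


check_use_cache(None)  # OK
check_use_cache(True)  # OK
# check_use_cache(False) would raise AssertionError: use_cache=False is not supported
```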
