2 changes: 2 additions & 0 deletions vllm_ascend/envs.py
@@ -66,6 +66,8 @@
lambda: os.getenv("C_COMPILER", None),
"VLLM_VERSION":
lambda: os.getenv("VLLM_VERSION", None),
"VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH":
lambda: int(os.getenv("VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH", "8192")),
}

# end-env-vars-definition
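For context, a minimal sketch of how the new setting behaves when read (the 8192-token default comes from the entry above; the override value below is only an example):

import os

# With the variable unset, the lambda in envs.py falls back to 8192 tokens.
print(int(os.getenv("VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH", "8192")))

# An override set before vLLM-Ascend reads its envs changes the chunk length
# used by fused_experts; "4096" here is an arbitrary illustration.
os.environ["VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH"] = "4096"
print(int(os.getenv("VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH", "8192")))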
28 changes: 19 additions & 9 deletions vllm_ascend/quantization/w8a8_dynamic.py
@@ -27,6 +27,7 @@
from vllm_ascend.ops.fused_moe import select_experts

VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2
VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH: int = envs_ascend.VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH


def apply_mlp(hidden_states_wrapper: List[torch.Tensor],
@@ -636,15 +637,24 @@ def apply(
expert_map=expert_map,
moe_all_to_all_group_name=self.moe_all_to_all_group_name)
elif dp_size == 1:
return fused_experts(hidden_states=x,
w1=layer.w13_weight,
w1_scale=layer.w13_weight_scale,
w2=layer.w2_weight,
w2_scale=layer.w2_weight_scale,
topk_weights=topk_weights,
topk_ids=topk_ids,
top_k=top_k,
expert_map=expert_map)
x_list = x.split(VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH)
topk_weights_list = topk_weights.split(
VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH)
topk_ids_list = topk_ids.split(VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH)
final_hidden_states_list = []
for i in range(len(x_list)):
final_hidden_states = fused_experts(
hidden_states=x_list[i],
w1=layer.w13_weight,
w1_scale=layer.w13_weight_scale,
w2=layer.w2_weight,
w2_scale=layer.w2_weight_scale,
topk_weights=topk_weights_list[i],
topk_ids=topk_ids_list[i],
top_k=top_k,
expert_map=expert_map)
final_hidden_states_list.append(final_hidden_states)
return torch.concat(final_hidden_states_list)
else:
return fused_experts_with_all2all(hidden_states=x,
w1=layer.w13_weight,

Collaborator comment (on the fused_experts call above): Do you have any assessment of this PR's performance? It might bring a substantial performance regression, since this change can lead to repeated weight accesses from HBM.

Collaborator comment: Maybe the env var VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH could be changed to something like ENABLE_FUSED_EXPERTS_SEQ_SPLIT to keep backward compatibility.
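To illustrate the splitting scheme above in isolation (a shape-level sketch, not the vLLM-Ascend kernel path: dummy_fused_experts below is a hypothetical stand-in for the real fused_experts call), chunking along the token dimension and concatenating the partial outputs preserves the result shape:

import torch

SEQ_SPLIT_LENGTH = 8192  # mirrors the VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH default

def dummy_fused_experts(hidden_states, topk_weights, topk_ids):
    # Hypothetical stand-in: the real kernel dispatches tokens to experts;
    # here we only produce an output with the same shape as the input.
    return hidden_states * topk_weights.sum(dim=-1, keepdim=True)

def chunked_fused_experts(hidden_states, topk_weights, topk_ids,
                          split_length=SEQ_SPLIT_LENGTH):
    # Split every per-token tensor into chunks of at most split_length rows,
    # run the expert computation chunk by chunk, then stitch the pieces back.
    outputs = []
    for h, w, ids in zip(hidden_states.split(split_length),
                         topk_weights.split(split_length),
                         topk_ids.split(split_length)):
        outputs.append(dummy_fused_experts(h, w, ids))
    return torch.concat(outputs)

x = torch.randn(20000, 128)            # 20000 tokens -> chunks of 8192/8192/3616
weights = torch.rand(20000, 8)
ids = torch.randint(0, 64, (20000, 8))
assert chunked_fused_experts(x, weights, ids).shape == x.shape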
9 changes: 5 additions & 4 deletions vllm_ascend/worker/model_runner_v1.py
@@ -219,10 +219,11 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
device="cpu",
pin_memory=True)

self.inputs_embeds = torch.zeros(
(self.max_num_tokens, self.hidden_size),
dtype=self.dtype,
device=self.device)
if self.is_multimodal_model:
self.inputs_embeds = torch.zeros(
(self.max_num_tokens, self.hidden_size),
dtype=self.dtype,
device=self.device)

# OPTIMIZATION: Cache the tensors rather than creating them every step.
self.arange_np: npt.NDArray[np.int32] = np.arange(max(
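A back-of-the-envelope look at what skipping the preallocation saves for text-only models (the sizes below are made-up examples, not values taken from the diff):

import torch

max_num_tokens, hidden_size = 8192, 4096   # example sizes only
dtype = torch.bfloat16

# This buffer used to be allocated unconditionally; after the change it is
# created only when is_multimodal_model is True.
saved_bytes = max_num_tokens * hidden_size * torch.finfo(dtype).bits // 8
print(f"skipped preallocation: {saved_bytes / 2**20:.0f} MiB")  # 64 MiB here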