From 597488b2831fb7c302151150a4425dd27f4f7b29 Mon Sep 17 00:00:00 2001
From: wangli
Date: Wed, 16 Jul 2025 11:48:37 +0800
Subject: [PATCH 1/5] cherry pick #1749 from v0.9.1-dev

Signed-off-by: wangli
---
 vllm_ascend/quantization/quantizer.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/vllm_ascend/quantization/quantizer.py b/vllm_ascend/quantization/quantizer.py
index 8178d5e7f3..0252aa4173 100644
--- a/vllm_ascend/quantization/quantizer.py
+++ b/vllm_ascend/quantization/quantizer.py
@@ -46,14 +46,8 @@ def get_quantizer(cls,
         if quantization_algorithm in CUSTOMIZED_QUANTIZER_TYPE:
             return
 
-        try:
-            module = importlib.import_module("mindie_turbo")
-            MindIETurboQuantizer = module.MindIETurboQuantizer
-            return MindIETurboQuantizer.get_quantizer(quant_config, prefix,
-                                                      packed_modules_mapping)
-        except ImportError:
-            return VLLMAscendQuantizer.get_quantizer(quant_config, prefix,
-                                                     packed_modules_mapping)
+        return VLLMAscendQuantizer.get_quantizer(quant_config, prefix,
+                                                 packed_modules_mapping)
 
     def build_linear_method(self):
         raise NotImplementedError

From df475053e044b33712acc63f6ca5e4261621194d Mon Sep 17 00:00:00 2001
From: wangli
Date: Wed, 16 Jul 2025 11:56:02 +0800
Subject: [PATCH 2/5] cherry pick #1755 from v0.9.1-dev

Signed-off-by: wangli
---
 vllm_ascend/ops/fused_moe.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
index 61205ffea2..e235598af6 100644
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -1437,7 +1437,6 @@ def forward(self,
             final_hidden_states = e_hidden_states
             if num_tokens < padding_size:
                 final_hidden_states = final_hidden_states[:num_tokens]
-            dispose_tensor(e_hidden_states)
         elif self.dp_size > 1:
             if fused_moe_state == FusedMoEState.NaiveMulticast:
                 start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
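A note on patch 1: quantizer resolution is now hard-wired to VLLMAscendQuantizer, and the deleted branch was the usual optional-dependency probe for the external mindie_turbo package. Below is a self-contained sketch of that probe idiom, assuming nothing beyond the standard library; the helper name load_optional_backend is illustrative, not part of the codebase.

    import importlib
    from typing import Optional, Type


    def load_optional_backend(module_name: str, attr: str) -> Optional[Type]:
        # Probe for an optional accelerator package at runtime and fall
        # back to None (the caller's native path) when it is absent.
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            return None
        return getattr(module, attr, None)


    backend = load_optional_backend("mindie_turbo", "MindIETurboQuantizer")
    # After patch 1 the probe is gone, so import errors raised inside
    # mindie_turbo can no longer be silently swallowed by the fallback.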
From 0aaafc51644b86992f45a6c7573c77af3e460745 Mon Sep 17 00:00:00 2001
From: wangli
Date: Mon, 21 Jul 2025 15:36:26 +0800
Subject: [PATCH 3/5] cherry pick from #1705

Signed-off-by: wangli
---
 .../models/qwen2_5_vl_without_padding.py | 93 +++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/vllm_ascend/models/qwen2_5_vl_without_padding.py b/vllm_ascend/models/qwen2_5_vl_without_padding.py
index 47ddd4455a..42abc67512 100644
--- a/vllm_ascend/models/qwen2_5_vl_without_padding.py
+++ b/vllm_ascend/models/qwen2_5_vl_without_padding.py
@@ -202,6 +202,66 @@ def cal_cos_sin(self, rotary_pos_emb):
                                  self.hidden_size_per_attention_head)
         return cos_new, sin_new
 
+    def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            ).permute(0, 2, 1, 3).flatten()
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            ).permute(0, 2, 1, 3).flatten()
+            pos_ids.append(
+                torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb
+
+    def get_window_index(self, grid_thw):
+        window_index: list = []
+        cu_window_seqlens: list = [0]
+        window_index_id = 0
+        vit_merger_window_size = (self.window_size //
+                                  self.spatial_merge_size // self.patch_size)
+
+        for grid_t, grid_h, grid_w in grid_thw:
+            llm_grid_h = grid_h // self.spatial_merge_size
+            llm_grid_w = grid_w // self.spatial_merge_size
+            index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(
+                grid_t, llm_grid_h, llm_grid_w)
+            pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
+            pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
+            num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
+            num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
+            index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100)
+            index_padded = index_padded.reshape(grid_t, num_windows_h,
+                                                vit_merger_window_size,
+                                                num_windows_w,
+                                                vit_merger_window_size)
+            index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
+                grid_t, num_windows_h * num_windows_w, vit_merger_window_size,
+                vit_merger_window_size)
+            seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
+            index_padded = index_padded.reshape(-1)
+            index_new = index_padded[index_padded != -100]
+            window_index.append(index_new + window_index_id)
+            cu_seqlens_tmp = seqlens.cumsum(
+                0) * self.spatial_merge_unit + cu_window_seqlens[-1]
+            cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
+            window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
+        window_index = torch.cat(window_index, dim=0)
+        return window_index, cu_window_seqlens
+
     def forward(
         self,
         x: torch.Tensor,
@@ -253,6 +313,39 @@ def forward(
         x = x[reverse_indices, :]
         return x
 
+    def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]:
+
+        grid_thw = image_input["image_grid_thw"]
+        assert grid_thw.ndim == 2
+
+        if image_input["type"] == "image_embeds":
+            image_embeds = image_input["image_embeds"].type(self.visual.dtype)
+        else:
+            pixel_values = image_input["pixel_values"].type(self.visual.dtype)
+            image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
+
+        # Split concatenated embeddings for each image item.
+        merge_size = self.visual.spatial_merge_size
+        sizes = grid_thw.prod(-1) // merge_size // merge_size
+        return image_embeds.split(sizes.tolist())
+
+    def _process_video_input(self, video_input) -> tuple[torch.Tensor, ...]:
+
+        grid_thw = video_input["video_grid_thw"]
+        assert grid_thw.ndim == 2
+
+        if video_input["type"] == "video_embeds":
+            video_embeds = video_input["video_embeds"].type(self.visual.dtype)
+        else:
+            pixel_values_videos = video_input["pixel_values_videos"].type(
+                self.visual.dtype)
+            video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
+
+        # Split concatenated embeddings for each video item.
+        merge_size = self.visual.spatial_merge_size
+        sizes = grid_thw.prod(-1) // merge_size // merge_size
+        return video_embeds.split(sizes.tolist())
+
 
 @MULTIMODAL_REGISTRY.register_processor(
     Qwen2_5_VLMultiModalProcessor,
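A note on patch 3: get_window_index partitions the merged vision-token grid into attention windows. The following worked example traces that arithmetic under assumed Qwen2.5-VL vision settings (window 112 px, patch 14 px, spatial merge 2); these values are assumptions for illustration, not read from this diff.

    # Worked example of the windowing arithmetic in get_window_index.
    window_size = 112        # pixels per attention-window edge (assumed)
    patch_size = 14          # pixels per ViT patch edge (assumed)
    spatial_merge_size = 2   # patches merged per side (assumed)

    vit_merger_window_size = window_size // spatial_merge_size // patch_size
    print(vit_merger_window_size)  # 4 -> each window is 4x4 merged tokens

    # A 28x28-patch image yields a 14x14 merged-token grid; each side is
    # padded up to a multiple of the window size before index windowing.
    llm_grid_h = 28 // spatial_merge_size                                 # 14
    pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size  # 2
    num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size        # 4
    print(pad_h, num_windows_h)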
From f50c268d4edbdd7da26fcc601dc14b59e88e2526 Mon Sep 17 00:00:00 2001
From: wangli
Date: Mon, 21 Jul 2025 15:41:54 +0800
Subject: [PATCH 4/5] cherry pick 1759

Signed-off-by: wangli
---
 vllm_ascend/models/deepseek_dbo.py | 32 ------------------------------
 1 file changed, 32 deletions(-)

diff --git a/vllm_ascend/models/deepseek_dbo.py b/vllm_ascend/models/deepseek_dbo.py
index 13e5efac62..b4c30481fa 100644
--- a/vllm_ascend/models/deepseek_dbo.py
+++ b/vllm_ascend/models/deepseek_dbo.py
@@ -170,38 +170,6 @@ def __init__(
         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
 
-    def forward(
-            self,
-            hidden_states: torch.Tensor,
-            attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor:
-        forward_context = get_forward_context()
-        # when profile runs, force experts to load balanced tokens
-        # to avoid high memory consumption on a single rank.
-        enable_force_load_balance = forward_context.in_profile_run
-
-        is_prefill = forward_context.with_prefill
-
-        old_hidden_states = hidden_states.clone()
-
-        # router_logits: (num_tokens, n_experts)
-        router_logits, _ = self.gate(hidden_states)
-
-        hidden_states = self.experts(
-            hidden_states=hidden_states,
-            router_logits=router_logits,
-            is_prefill=is_prefill,
-            top_k=CustomDeepseekDBOMoE.top_k,
-            enable_force_load_balance=enable_force_load_balance,
-        ) * self.routed_scaling_factor
-
-        if self.n_shared_experts is not None:
-            shared_output = self.shared_experts(old_hidden_states)
-
-        if shared_output is not None:
-            hidden_states = hidden_states + shared_output
-
-        return hidden_states
-
     # ----------------------------------------- TBO-related --------------------------------------------
     def _forward_ms_op_shared_expert(
         self,

From c5b869450f33bc0e7090f672ada88922f755f263 Mon Sep 17 00:00:00 2001
From: wangli
Date: Mon, 28 Jul 2025 16:12:23 +0800
Subject: [PATCH 5/5] fix

Signed-off-by: wangli
---
 vllm_ascend/ops/fused_moe.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
index e235598af6..61205ffea2 100644
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -1437,6 +1437,7 @@ def forward(self,
             final_hidden_states = e_hidden_states
             if num_tokens < padding_size:
                 final_hidden_states = final_hidden_states[:num_tokens]
+            dispose_tensor(e_hidden_states)
         elif self.dp_size > 1:
             if fused_moe_state == FusedMoEState.NaiveMulticast:
                 start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
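A note on patch 5: it restores the dispose_tensor(e_hidden_states) call that patch 2 removed, releasing the padded MoE output right after the un-padding slice. Below is a minimal sketch of that eager-release idiom; the function body is an assumption about vllm_ascend.utils.dispose_tensor, not a verbatim copy.

    import torch


    def dispose_tensor(x: torch.Tensor) -> None:
        # Repoint x at an empty buffer so its reference to the old storage
        # is dropped immediately; the storage itself is freed once no
        # surviving view still points at it.
        x.set_(torch.empty((0, ), device=x.device, dtype=x.dtype))


    e_hidden_states = torch.randn(8, 16)       # stand-in for the padded output
    final_hidden_states = e_hidden_states[:4]  # keep only the real tokens
    dispose_tensor(e_hidden_states)            # drop the padded reference early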