| 
 | 1 | +// Licensed under the BSD 3-Clause License  (the "License");  | 
 | 2 | +// you may not use this file except in compliance with the License.  | 
 | 3 | +// You may obtain a copy of the License at  | 
 | 4 | +//  | 
 | 5 | +// Unless required by applicable law or agreed to in writing, software  | 
 | 6 | +// distributed under the License is distributed on an "AS IS" BASIS,  | 
 | 7 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
 | 8 | +// See the License for the specific language governing permissions and  | 
 | 9 | +// limitations under the License.  | 
 | 10 | +#include "defines.h"  | 
 | 11 | +#include "build_tree_tiling.h"  | 
 | 12 | +#include "tiling/platform/platform_ascendc.h"  | 
 | 13 | +#include "aclrtlaunch_build_tree_efficient.h"  | 
 | 14 | +#include "torch_helper.h"  | 
 | 15 | + | 
 | 16 | +namespace sglang {  | 
 | 17 | +namespace npu_kernel {  | 
// Alignment granularity in bytes: the host-side tiling buffer size is rounded
// up to a multiple of this value before allocation (see get_tiling).
constexpr uint32_t PADDING_BYTE = 32U;
 | 19 | + | 
 | 20 | +at::Tensor get_tiling(int32_t &block_dim, int32_t &workspace_size, int32_t batch_size, int32_t mask_size,  | 
 | 21 | +    int64_t topk, int64_t depth, int64_t draft_token_num, int64_t tree_mask_mode)  | 
 | 22 | +{  | 
 | 23 | +    auto ascendc_platform = platform_ascendc::PlatformAscendCManager::GetInstance();  | 
 | 24 | +    int32_t max_aiv_core = static_cast<int32_t>(ascendc_platform->GetCoreNumAiv());  | 
 | 25 | +    block_dim = std::min(max_aiv_core, batch_size);  | 
 | 26 | +    workspace_size = static_cast<int32_t>(ascendc_platform->GetLibApiWorkSpaceSize());  | 
 | 27 | + | 
 | 28 | +    // align to 32 bytes  | 
 | 29 | +    int32_t tiling_size = (sizeof(BuildTreeTilingData) + PADDING_BYTE - 1) / PADDING_BYTE * PADDING_BYTE;  | 
 | 30 | +    auto tiling_buffer = at::empty({tiling_size}, at::TensorOptions().dtype(at::kByte).device(at::kCPU));  | 
 | 31 | +      | 
 | 32 | +    BuildTreeTilingData *tiling_data = reinterpret_cast<BuildTreeTilingData *>(tiling_buffer.data_ptr());  | 
 | 33 | +    tiling_data->batch_size = batch_size;  | 
 | 34 | +    tiling_data->mask_size = mask_size;  | 
 | 35 | +    tiling_data->topk = topk;  | 
 | 36 | +    tiling_data->depth = depth;  | 
 | 37 | +    tiling_data->draft_token_num = draft_token_num;  | 
 | 38 | +    tiling_data->tree_mask_mode = tree_mask_mode;  | 
 | 39 | +      | 
 | 40 | +    auto num_big_core = batch_size % max_aiv_core;  | 
 | 41 | +    tiling_data->big_core_num = num_big_core == 0 ? block_dim : num_big_core;  | 
 | 42 | +    tiling_data->big_core_tile_num = (batch_size + num_big_core - 1) / num_big_core;  | 
 | 43 | +    tiling_data->small_core_tile_num = batch_size / num_big_core;  | 
 | 44 | + | 
 | 45 | +    auto tiling_tensor = TorchNpuHepler::CopyTensorHostToDevice(tiling_buffer);  | 
 | 46 | +    return tiling_tensor;  | 
 | 47 | +}  | 
 | 48 | + | 
 | 49 | +HOST_API void build_tree_efficient(const at::Tensor &parent_list,   | 
 | 50 | +    const at::Tensor &selected_index,  | 
 | 51 | +    const at::Tensor &verified_seq_len,   | 
 | 52 | +    const at::Tensor &tree_mask,   | 
 | 53 | +    const at::Tensor &positions,  | 
 | 54 | +    const at::Tensor &retrive_index,   | 
 | 55 | +    const at::Tensor &retrive_next_token,   | 
 | 56 | +    const at::Tensor &retrive_next_sibling,   | 
 | 57 | +    int64_t topk,   | 
 | 58 | +    int64_t depth,   | 
 | 59 | +    int64_t draft_token_num,   | 
 | 60 | +    int64_t tree_mask_mode)  | 
 | 61 | +{  | 
 | 62 | +    if (QLEN_ONLY_BITPACKING == tree_mask_mode) {  | 
 | 63 | +        throw std::runtime_error("Not implemented");  | 
 | 64 | +    }  | 
 | 65 | + | 
 | 66 | +    if (parent_list.options().dtype() != at::kLong   | 
 | 67 | +        || selected_index.options().dtype() != at::kLong  | 
 | 68 | +        || verified_seq_len.options().dtype() != at::kLong  | 
 | 69 | +        || tree_mask.options().dtype() != at::kBool   | 
 | 70 | +        || positions.options().dtype() != at::kLong  | 
 | 71 | +        || retrive_index.options().dtype() != at::kLong   | 
 | 72 | +        || retrive_next_token.options().dtype() != at::kLong  | 
 | 73 | +        || retrive_next_sibling.options().dtype() != at::kLong) {  | 
 | 74 | +        throw std::invalid_argument("Invaild input datetype. " \  | 
 | 75 | +            "Support combo: int64, int64, int64, bool, int64, int64, int64, int64");  | 
 | 76 | +    }  | 
 | 77 | +    int32_t block_dim;  | 
 | 78 | +    int32_t workspace_size;  | 
 | 79 | +    int32_t batch_size = parent_list.sizes()[0];  | 
 | 80 | +    int32_t mask_size = tree_mask.size(0);  | 
 | 81 | + | 
 | 82 | +    at::Tensor tiling_tensor = get_tiling(block_dim, workspace_size, batch_size, mask_size, topk, depth, draft_token_num,   | 
 | 83 | +        tree_mask_mode);  | 
 | 84 | + | 
 | 85 | +    auto workspace_tensor =   | 
 | 86 | +        at::empty({workspace_size}, at::TensorOptions().dtype(at::kByte).device(parent_list.options().device()));  | 
 | 87 | +    /* lauch the kernal function via torch */  | 
 | 88 | +    EXEC_KERNEL_CMD(build_tree_efficient, block_dim, parent_list, selected_index, verified_seq_len, tree_mask,   | 
 | 89 | +        positions, retrive_index, retrive_next_token, retrive_next_sibling, workspace_tensor, tiling_tensor);  | 
 | 90 | +}  | 
 | 91 | + | 
 | 92 | +}  | 
 | 93 | +}  | 
0 commit comments