From 53125f23171cf2c3e274b8425223fdde4d13ad39 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Fri, 10 Oct 2025 15:41:09 +0800 Subject: [PATCH 01/24] Create enumerate_test_intranode.sh --- scripts/enumerate_test_intranode.sh | 105 ++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 scripts/enumerate_test_intranode.sh diff --git a/scripts/enumerate_test_intranode.sh b/scripts/enumerate_test_intranode.sh new file mode 100644 index 00000000..a91b19fe --- /dev/null +++ b/scripts/enumerate_test_intranode.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +SKN_PWD="" + +# 默认值 +SKIP_BUILD=false + +TEMP=$(getopt -o sw:t:h --long skip-build -n "$0" -- "$@") +if [ $? != 0 ]; then + echo "Terminating..." >&2 + exit 1 +fi + +eval set -- "$TEMP" + +while true; do + case "$1" in + -s|--skip-build) + SKIP_BUILD=true + shift + ;; + --) + shift + break + ;; + *) + echo "Invalid option: $1" >&2 + show_help + exit 1 + ;; + esac +done + +# 切换目录 +cd "${SKN_PWD}" || { echo "Directory not found: ${SKN_PWD}"; exit 1; } + +# 条件构建 +if [ "$SKIP_BUILD" = false ]; then + echo ">>> Building package..." + bash build.sh -a deepep || { echo "Build failed!"; exit 1; } + pip uninstall -y deep-ep + pip install ./output/deep_ep-*.whl || { echo "Install failed!"; exit 1; } +else + echo ">>> Skipping build and install (--skip-build)" +fi + +# 进入测试目录 +cd ./tests/python/deepep || { echo "Test directory not found"; exit 1; } + +# 设置环境变量 +export HCCL_BUFFSIZE=4096 +# 设置 Ascend 环境 +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +#遍历test_intranode.py +# 设置参数范围 +NUM_PROCESSES_LIST_=(4 8 16) +NUM_TOKENS_LIST=(1024 2048 4096) +HIDDEN_LIST=(4096 7168) +NUM_TOPK_LIST=(4 8) +NUM_EXPERTS_LIST=(64 128 256) +ACTIVE_RANKS_LIST=("" "0,1" "0,2,3") +ENABLE_DIAGNOSE_LIST=("false" "true") + +SCRIPT="test_intranode.py" + +# 遍历所有组合 +for NUM_PROCESSES in "${NUM_PROCESSES_LIST[@]}"; do + for NUM_TOKENS in "${NUM_TOKENS_LIST[@]}"; do + for HIDDEN in "${HIDDEN_LIST[@]}"; do + for NUM_TOPK in "${NUM_TOPK_LIST[@]}"; do + for NUM_EXPERTS in "${NUM_EXPERTS_LIST[@]}"; do + for ACTIVE_RANKS in "${ACTIVE_RANKS_LIST[@]}"; do + for ENABLE_DIAGNOSE in "${ENABLE_DIAGNOSE_LIST[@]}"; do + + # 构建命令 + CMD="python3 $SCRIPT \ + --num-processes $NUM_PROCESSES \ + --num-tokens $NUM_TOKENS \ + --hidden $HIDDEN \ + --num-topk $NUM_TOPK \ + --num-experts $NUM_EXPERTS" + + # 添加可选参数 + if [ -n "$ACTIVE_RANKS" ]; then + CMD="$CMD --active-ranks \"$ACTIVE_RANKS\"" + fi + + if [ "$ENABLE_DIAGNOSE" == "true" ]; then + CMD="$CMD --enable-diagnose" + fi + + # 打印并执行命令 + echo "Running: $CMD" + eval $CMD + + echo "--------------------------------------------------" + + done + done + done + done + done + done +done From 38b7c4d02ce673747a1f2b5d74bf29b94a73f9ff Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Fri, 10 Oct 2025 15:42:00 +0800 Subject: [PATCH 02/24] Create enumerate_test_low_latency.sh --- scripts/enumerate_test_low_latency.sh | 103 ++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 scripts/enumerate_test_low_latency.sh diff --git a/scripts/enumerate_test_low_latency.sh b/scripts/enumerate_test_low_latency.sh new file mode 100644 index 00000000..9b6f6c32 --- /dev/null +++ b/scripts/enumerate_test_low_latency.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +SKN_PWD="" + +# 默认值 +SKIP_BUILD=false + +TEMP=$(getopt -o sw:t:h --long skip-build -n "$0" -- "$@") +if [ $? != 0 ]; then + echo "Terminating..." >&2 + exit 1 +fi + +eval set -- "$TEMP" + +while true; do + case "$1" in + -s|--skip-build) + SKIP_BUILD=true + shift + ;; + --) + shift + break + ;; + *) + echo "Invalid option: $1" >&2 + show_help + exit 1 + ;; + esac +done + +# 切换目录 +cd "${SKN_PWD}" || { echo "Directory not found: ${SKN_PWD}"; exit 1; } + +# 条件构建 +if [ "$SKIP_BUILD" = false ]; then + echo ">>> Building package..." + bash build.sh -a deepep || { echo "Build failed!"; exit 1; } + pip uninstall -y deep-ep + pip install ./output/deep_ep-*.whl || { echo "Install failed!"; exit 1; } +else + echo ">>> Skipping build and install (--skip-build)" +fi + +# 进入测试目录 +cd ./tests/python/deepep || { echo "Test directory not found"; exit 1; } + +# 设置环境变量 +export HCCL_BUFFSIZE=4096 +# 设置 Ascend 环境 +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +#遍历test_low_latency.py +# 设置参数范围 +NUM_PROCESSES_LIST=(4 8 16) +NUM_TOKENS_LIST=(128 256 512) +HIDDEN_LIST=(4096 7168) +NUM_TOPK_LIST=(4 8) +NUM_EXPERTS_LIST=(64 128 256) + +SCRIPT="test_low_latency.py" + +# 遍历所有组合 +for NUM_PROCESSES in "${NUM_PROCESSES_LIST[@]}"; do + for NUM_TOKENS in "${NUM_TOKENS_LIST[@]}"; do + for HIDDEN in "${HIDDEN_LIST[@]}"; do + for NUM_TOPK in "${NUM_TOPK_LIST[@]}"; do + for NUM_EXPERTS in "${NUM_EXPERTS_LIST[@]}"; do + for ACTIVE_RANKS in "${ACTIVE_RANKS_LIST[@]}"; do + for ENABLE_DIAGNOSE in "${ENABLE_DIAGNOSE_LIST[@]}"; do + + # 构建命令 + CMD="python3 $SCRIPT \ + --num-processes $NUM_PROCESSES \ + --num-tokens $NUM_TOKENS \ + --hidden $HIDDEN \ + --num-topk $NUM_TOPK \ + --num-experts $NUM_EXPERTS" + + # 添加可选参数 + if [ -n "$ACTIVE_RANKS" ]; then + CMD="$CMD --active-ranks \"$ACTIVE_RANKS\"" + fi + + if [ "$ENABLE_DIAGNOSE" == "true" ]; then + CMD="$CMD --enable-diagnose" + fi + + # 打印并执行命令 + echo "Running: $CMD" + eval $CMD + + echo "--------------------------------------------------" + + done + done + done + done + done + done +done From ec6249bea600f523651be13231c9c7a1b998fe52 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Tue, 14 Oct 2025 16:16:30 +0800 Subject: [PATCH 03/24] Update enumerate_test_intranode.sh --- scripts/enumerate_test_intranode.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/scripts/enumerate_test_intranode.sh b/scripts/enumerate_test_intranode.sh index a91b19fe..1678b82d 100644 --- a/scripts/enumerate_test_intranode.sh +++ b/scripts/enumerate_test_intranode.sh @@ -1,7 +1,5 @@ #!/bin/bash -SKN_PWD="" - # 默认值 SKIP_BUILD=false @@ -32,7 +30,7 @@ while true; do done # 切换目录 -cd "${SKN_PWD}" || { echo "Directory not found: ${SKN_PWD}"; exit 1; } +cd ${GITHUB_WORKSPACE} # 条件构建 if [ "$SKIP_BUILD" = false ]; then @@ -47,8 +45,6 @@ fi # 进入测试目录 cd ./tests/python/deepep || { echo "Test directory not found"; exit 1; } -# 设置环境变量 -export HCCL_BUFFSIZE=4096 # 设置 Ascend 环境 source /usr/local/Ascend/ascend-toolkit/set_env.sh From 65c76304d74c4ffef09bcaa637b4b898df7fd6b9 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Tue, 14 Oct 2025 16:20:03 +0800 Subject: [PATCH 04/24] Update enumerate_test_low_latency.sh --- scripts/enumerate_test_low_latency.sh | 53 ++------------------------- 1 file changed, 3 insertions(+), 50 deletions(-) diff --git a/scripts/enumerate_test_low_latency.sh b/scripts/enumerate_test_low_latency.sh index 9b6f6c32..97616512 100644 --- a/scripts/enumerate_test_low_latency.sh +++ b/scripts/enumerate_test_low_latency.sh @@ -1,56 +1,7 @@ #!/bin/bash -SKN_PWD="" - -# 默认值 -SKIP_BUILD=false - -TEMP=$(getopt -o sw:t:h --long skip-build -n "$0" -- "$@") -if [ $? != 0 ]; then - echo "Terminating..." >&2 - exit 1 -fi - -eval set -- "$TEMP" - -while true; do - case "$1" in - -s|--skip-build) - SKIP_BUILD=true - shift - ;; - --) - shift - break - ;; - *) - echo "Invalid option: $1" >&2 - show_help - exit 1 - ;; - esac -done - # 切换目录 -cd "${SKN_PWD}" || { echo "Directory not found: ${SKN_PWD}"; exit 1; } - -# 条件构建 -if [ "$SKIP_BUILD" = false ]; then - echo ">>> Building package..." - bash build.sh -a deepep || { echo "Build failed!"; exit 1; } - pip uninstall -y deep-ep - pip install ./output/deep_ep-*.whl || { echo "Install failed!"; exit 1; } -else - echo ">>> Skipping build and install (--skip-build)" -fi - -# 进入测试目录 -cd ./tests/python/deepep || { echo "Test directory not found"; exit 1; } - -# 设置环境变量 -export HCCL_BUFFSIZE=4096 -# 设置 Ascend 环境 -source /usr/local/Ascend/ascend-toolkit/set_env.sh +cd ${GITHUB_WORKSPACE}/tests/python/deepep #遍历test_low_latency.py # 设置参数范围 @@ -101,3 +52,5 @@ for NUM_PROCESSES in "${NUM_PROCESSES_LIST[@]}"; do done done done + +cd ./ From 40643be2269af418b6c9ac57d5772c3d1c1c1965 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Tue, 14 Oct 2025 16:20:34 +0800 Subject: [PATCH 05/24] Update enumerate_test_intranode.sh --- scripts/enumerate_test_intranode.sh | 49 ++--------------------------- 1 file changed, 3 insertions(+), 46 deletions(-) diff --git a/scripts/enumerate_test_intranode.sh b/scripts/enumerate_test_intranode.sh index 1678b82d..ea686f44 100644 --- a/scripts/enumerate_test_intranode.sh +++ b/scripts/enumerate_test_intranode.sh @@ -1,52 +1,7 @@ #!/bin/bash -# 默认值 -SKIP_BUILD=false - -TEMP=$(getopt -o sw:t:h --long skip-build -n "$0" -- "$@") -if [ $? != 0 ]; then - echo "Terminating..." >&2 - exit 1 -fi - -eval set -- "$TEMP" - -while true; do - case "$1" in - -s|--skip-build) - SKIP_BUILD=true - shift - ;; - --) - shift - break - ;; - *) - echo "Invalid option: $1" >&2 - show_help - exit 1 - ;; - esac -done - # 切换目录 -cd ${GITHUB_WORKSPACE} - -# 条件构建 -if [ "$SKIP_BUILD" = false ]; then - echo ">>> Building package..." - bash build.sh -a deepep || { echo "Build failed!"; exit 1; } - pip uninstall -y deep-ep - pip install ./output/deep_ep-*.whl || { echo "Install failed!"; exit 1; } -else - echo ">>> Skipping build and install (--skip-build)" -fi - -# 进入测试目录 -cd ./tests/python/deepep || { echo "Test directory not found"; exit 1; } - -# 设置 Ascend 环境 -source /usr/local/Ascend/ascend-toolkit/set_env.sh +cd ${GITHUB_WORKSPACE}/tests/python/deepep #遍历test_intranode.py # 设置参数范围 @@ -99,3 +54,5 @@ for NUM_PROCESSES in "${NUM_PROCESSES_LIST[@]}"; do done done done + +cd ./ From 18ea6faff837a4f8449b2e3a219fc44fc62ade96 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Tue, 14 Oct 2025 16:20:49 +0800 Subject: [PATCH 06/24] Update pr-test-npu.yml --- .github/workflows/pr-test-npu.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml index 7f9b1964..769eb616 100644 --- a/.github/workflows/pr-test-npu.yml +++ b/.github/workflows/pr-test-npu.yml @@ -50,7 +50,7 @@ jobs: - name: Prepare Deepep run: bash scripts/prepare_deepep_in_container.sh - + - name: Run test intranode timeout-minutes: 10 env: @@ -65,6 +65,20 @@ jobs: run: | python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py + - name: Run enumerate test intranode + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2239 + run: | + bash scripts/enumerate_test_intranode.sh + + - name: Run enumerate test low latency + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + run: | + bash scripts/enumerate_test_low_latency.sh + test-build-deepep: if: (github.repository == 'sgl-project/sgl-kernel-npu' || github.event_name == 'pull_request') && github.event.pull_request.draft == false From b95ebdc921f44f764adbe099abaf985769d22b2f Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Tue, 14 Oct 2025 16:31:10 +0800 Subject: [PATCH 07/24] Update pr-test-npu.yml --- .github/workflows/pr-test-npu.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml index 769eb616..ed2951ef 100644 --- a/.github/workflows/pr-test-npu.yml +++ b/.github/workflows/pr-test-npu.yml @@ -9,6 +9,7 @@ on: - "python/**" - "csrc/**" - "test/**" + - "scripts/**" - ".github/workflows/pr-test-npu.yml" workflow_dispatch: @@ -145,3 +146,4 @@ jobs: done echo "All jobs completed successfully" exit 0 + From 894a885ac17860055bcb8745b9a2df2384826234 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Tue, 14 Oct 2025 16:48:07 +0800 Subject: [PATCH 08/24] Update enumerate_test_low_latency.sh --- scripts/enumerate_test_low_latency.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/enumerate_test_low_latency.sh b/scripts/enumerate_test_low_latency.sh index 97616512..3afa6ba7 100644 --- a/scripts/enumerate_test_low_latency.sh +++ b/scripts/enumerate_test_low_latency.sh @@ -10,6 +10,8 @@ NUM_TOKENS_LIST=(128 256 512) HIDDEN_LIST=(4096 7168) NUM_TOPK_LIST=(4 8) NUM_EXPERTS_LIST=(64 128 256) +ACTIVE_RANKS_LIST=("" "0,1" "0,2,3") +ENABLE_DIAGNOSE_LIST=("false" "true") SCRIPT="test_low_latency.py" From 015f190715bccbeab4272d10cf4666dd0a10acba Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Tue, 14 Oct 2025 16:48:31 +0800 Subject: [PATCH 09/24] Update enumerate_test_intranode.sh --- scripts/enumerate_test_intranode.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/enumerate_test_intranode.sh b/scripts/enumerate_test_intranode.sh index ea686f44..8566a666 100644 --- a/scripts/enumerate_test_intranode.sh +++ b/scripts/enumerate_test_intranode.sh @@ -5,7 +5,7 @@ cd ${GITHUB_WORKSPACE}/tests/python/deepep #遍历test_intranode.py # 设置参数范围 -NUM_PROCESSES_LIST_=(4 8 16) +NUM_PROCESSES_LIST=(4 8 16) NUM_TOKENS_LIST=(1024 2048 4096) HIDDEN_LIST=(4096 7168) NUM_TOPK_LIST=(4 8) From 0152857bf45a8b24e47bc1b0f260d931939b11c1 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Wed, 15 Oct 2025 14:26:43 +0800 Subject: [PATCH 10/24] Update pr-test-npu.yml --- .github/workflows/pr-test-npu.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml index ed2951ef..08507b43 100644 --- a/.github/workflows/pr-test-npu.yml +++ b/.github/workflows/pr-test-npu.yml @@ -19,8 +19,6 @@ concurrency: jobs: test-all-build: - if: (github.repository == 'sgl-project/sgl-kernel-npu' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false runs-on: linux-aarch64-a3-16 container: image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11 @@ -147,3 +145,4 @@ jobs: echo "All jobs completed successfully" exit 0 + From 04db4cd15c2783fc10fb2004cca41b7d21044ec3 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Thu, 16 Oct 2025 09:58:07 +0800 Subject: [PATCH 11/24] Update pr-test-npu.yml --- .github/workflows/pr-test-npu.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml index 08507b43..1a1d5c07 100644 --- a/.github/workflows/pr-test-npu.yml +++ b/.github/workflows/pr-test-npu.yml @@ -65,14 +65,14 @@ jobs: python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py - name: Run enumerate test intranode - timeout-minutes: 10 + timeout-minutes: 20 env: HCCL_BUFFSIZE: 2239 run: | bash scripts/enumerate_test_intranode.sh - name: Run enumerate test low latency - timeout-minutes: 10 + timeout-minutes: 20 env: HCCL_BUFFSIZE: 1913 run: | @@ -146,3 +146,4 @@ jobs: exit 0 + From f5aba2b36e0c98734e381a886df8c0997ec24b35 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Thu, 16 Oct 2025 10:14:34 +0800 Subject: [PATCH 12/24] Update pr-test-npu.yml --- .github/workflows/pr-test-npu.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml index 1a1d5c07..ab50f458 100644 --- a/.github/workflows/pr-test-npu.yml +++ b/.github/workflows/pr-test-npu.yml @@ -65,14 +65,14 @@ jobs: python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py - name: Run enumerate test intranode - timeout-minutes: 20 + timeout-minutes: 360 env: HCCL_BUFFSIZE: 2239 run: | bash scripts/enumerate_test_intranode.sh - name: Run enumerate test low latency - timeout-minutes: 20 + timeout-minutes: 360 env: HCCL_BUFFSIZE: 1913 run: | @@ -147,3 +147,4 @@ jobs: + From a1667f891b0d6dab1303081891ced6644c1ead64 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Tue, 28 Oct 2025 17:04:22 +0800 Subject: [PATCH 13/24] Update CMakeLists.txt --- csrc/deepep/ops/op_kernel/CMakeLists.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/csrc/deepep/ops/op_kernel/CMakeLists.txt b/csrc/deepep/ops/op_kernel/CMakeLists.txt index c8221a5f..3d6cb057 100644 --- a/csrc/deepep/ops/op_kernel/CMakeLists.txt +++ b/csrc/deepep/ops/op_kernel/CMakeLists.txt @@ -3,6 +3,24 @@ if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx") add_ops_compile_options(ALL OPTIONS -g -O0 ) endif() +message(STATUS "ASCEND include path: ${ASCEND_CANN_PACKAGE_PATH}/opp/built-in/op_impl/ai_core/tbe/impl/ascendc/moe_distribute_dispatch") + +# copy moe_distribute_base head file +execute_process( + COMMAND ${CMAKE_COMMAND} -E copy + "${ASCEND_CANN_PACKAGE_PATH}/opp/built-in/op_impl/ai_core/tbe/impl/ascendc/moe_distribute_dispatch/moe_distribute_base.h" + "${CMAKE_CURRENT_BINARY_DIR}/../../op_kernel/moe_distribute_base.h" + RESULT_VARIABLE copy_result +) + +#check moe_distribute_base head file +if(copy_result EQUAL 0) + message(STATUS "Successfully copied moe_distribute_base.h") +else() + message(WARNING "Failed to copy moe_distribute_base.h") +endif() + + add_ops_compile_options(ALL OPTIONS -DASCENDC_DUMP=0 --cce-auto-sync=off) add_kernels_compile() From 55ab3a8a8ab525eb71c7e987df7cdc730a3854f4 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Thu, 30 Oct 2025 15:48:57 +0800 Subject: [PATCH 14/24] Update enumerate_test_intranode.sh --- scripts/enumerate_test_intranode.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/enumerate_test_intranode.sh b/scripts/enumerate_test_intranode.sh index 8566a666..c37a6db8 100644 --- a/scripts/enumerate_test_intranode.sh +++ b/scripts/enumerate_test_intranode.sh @@ -5,11 +5,11 @@ cd ${GITHUB_WORKSPACE}/tests/python/deepep #遍历test_intranode.py # 设置参数范围 -NUM_PROCESSES_LIST=(4 8 16) -NUM_TOKENS_LIST=(1024 2048 4096) +NUM_PROCESSES_LIST=(8 16) +NUM_TOKENS_LIST=(1 4096) HIDDEN_LIST=(4096 7168) -NUM_TOPK_LIST=(4 8) -NUM_EXPERTS_LIST=(64 128 256) +NUM_TOPK_LIST=(8 9) +NUM_EXPERTS_LIST=(64 256) ACTIVE_RANKS_LIST=("" "0,1" "0,2,3") ENABLE_DIAGNOSE_LIST=("false" "true") From 0def944474f503e236b99d3bc98f9e8807166c05 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Thu, 30 Oct 2025 15:50:57 +0800 Subject: [PATCH 15/24] Update enumerate_test_low_latency.sh --- scripts/enumerate_test_low_latency.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/enumerate_test_low_latency.sh b/scripts/enumerate_test_low_latency.sh index 3afa6ba7..e5dad220 100644 --- a/scripts/enumerate_test_low_latency.sh +++ b/scripts/enumerate_test_low_latency.sh @@ -5,11 +5,11 @@ cd ${GITHUB_WORKSPACE}/tests/python/deepep #遍历test_low_latency.py # 设置参数范围 -NUM_PROCESSES_LIST=(4 8 16) -NUM_TOKENS_LIST=(128 256 512) +NUM_PROCESSES_LIST=(8 16) +NUM_TOKENS_LIST=(128 512) HIDDEN_LIST=(4096 7168) -NUM_TOPK_LIST=(4 8) -NUM_EXPERTS_LIST=(64 128 256) +NUM_TOPK_LIST=(8 9) +NUM_EXPERTS_LIST=(64 256) ACTIVE_RANKS_LIST=("" "0,1" "0,2,3") ENABLE_DIAGNOSE_LIST=("false" "true") From bb0b9747613f1db290a65f6e7c89130eb7547ee8 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Tue, 4 Nov 2025 15:52:02 +0800 Subject: [PATCH 16/24] Create daily-build-test.yml --- .github/workflows/daily-build-test.yml | 160 +++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 .github/workflows/daily-build-test.yml diff --git a/.github/workflows/daily-build-test.yml b/.github/workflows/daily-build-test.yml new file mode 100644 index 00000000..4526d0ac --- /dev/null +++ b/.github/workflows/daily-build-test.yml @@ -0,0 +1,160 @@ +name: Daily Enumerate Tests (Ascend NPU) + +on: + schedule: + - cron: "0 4,16 * * *" # 每天 UTC 时间 4:00 和 16:00 各运行一次(即每12小时) + workflow_dispatch: # 保留手动触发能力 + +concurrency: + group: daily-enumerate-tests-${{ github.ref }} + cancel-in-progress: true + +jobs: + daily-enumerate-tests: + runs-on: linux-aarch64-a3-16 + container: + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + + steps: + - name: Clean git config + run: | + CONFIG_KEY='http.https://gh-proxy.test.osinfra.cn/.extraheader' + git config --global --unset "$CONFIG_KEY" || true + + - name: Clean workspace + run: | + sudo rm -rf --one-file-system "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.* 2>/dev/null || true + + - name: Checkout code + uses: actions/checkout@v4 + with: + clean: true + + - name: Install dependencies + run: | + # speed up by using infra cache services + CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local" + sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list + pip config set global.index-url http://${CACHING_URL}/pypi/simple + pip config set global.trusted-host ${CACHING_URL} + + bash scripts/npu_ci_install_dependency.sh + + - name: Prepare Deepep + run: bash scripts/prepare_deepep_in_container.sh + + - name: Run quick sanity tests + timeout-minutes: 20 + env: + HCCL_BUFFSIZE: 2239 + run: | + # 先运行快速测试确保基础功能正常 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py + + - name: Run enumerate test intranode (Daily) + timeout-minutes: 360 # 6小时超时 + env: + HCCL_BUFFSIZE: 2239 + TEST_ENV: daily-build + run: | + echo "Starting daily enumerate intranode test at $(date)" + bash scripts/enumerate_test_intranode.sh + echo "Completed daily enumerate intranode test at $(date)" + + - name: Run enumerate test low latency (Daily) + timeout-minutes: 360 # 6小时超时 + env: + HCCL_BUFFSIZE: 1913 + TEST_ENV: daily-build + run: | + echo "Starting daily enumerate low latency test at $(date)" + bash scripts/enumerate_test_low_latency.sh + echo "Completed daily enumerate low latency test at $(date)" + + - name: Generate daily test report + if: always() + run: | + echo "## Daily Enumerate Tests Report" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Execution Time**: $(date)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Test Results:" >> $GITHUB_STEP_SUMMARY + + # 检查测试结果文件(如果脚本生成的话) + if [ -f "test-results/enumerate-intranode-results.txt" ]; then + echo "- **Intranode Enumerate Test**: Completed" >> $GITHUB_STEP_SUMMARY + else + echo "- **Intranode Enumerate Test**: No result file found" >> $GITHUB_STEP_SUMMARY + fi + + if [ -f "test-results/enumerate-low-latency-results.txt" ]; then + echo "- **Low Latency Enumerate Test**: Completed" >> $GITHUB_STEP_SUMMARY + else + echo "- **Low Latency Enumerate Test**: No result file found" >> $GITHUB_STEP_SUMMARY + fi + + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Workflow Run**: [$GITHUB_RUN_ID](https://github.com/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID)" >> $GITHUB_STEP_SUMMARY + + - name: Upload test artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: daily-enumerate-results-${{ github.sha }}-${{ github.run_id }} + path: | + test-results/ + logs/ + *.log + retention-days: 30 # 保留30天,便于问题排查 + + # 可选:添加一个轻量级的验证任务,确保每日构建的基础环境正常 + daily-smoke-test: + runs-on: linux-aarch64-a3-16 + container: + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + timeout-minutes: 30 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Quick environment check + run: | + # 快速检查关键依赖和环境 + python3 --version + pip list | grep -i "torch\|npu" + echo "Basic environment check passed" + + - name: Verify test scripts exist + run: | + # 确认测试脚本存在 + ls -la scripts/enumerate_test_*.sh + ls -la tests/python/deepep/test_*.py + echo "All required test scripts are present" + + finish: + if: always() + needs: [daily-enumerate-tests, daily-smoke-test] + runs-on: ubuntu-latest + + steps: + - name: Check all dependent job statuses + run: | + results=(${{ join(needs.*.result, ' ') }}) + all_success=true + + for result in "${results[@]}"; do + if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then + echo "Job failed with result: $result" + all_success=false + fi + done + + if [ "$all_success" = true ]; then + echo "All daily enumerate tests completed successfully" + exit 0 + else + echo "Some daily tests failed" + exit 1 + fi From 211299835a3103b2010e587ffa2681f8d8ad0cb7 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Tue, 4 Nov 2025 15:52:28 +0800 Subject: [PATCH 17/24] Update pr-test-npu.yml --- .github/workflows/pr-test-npu.yml | 32 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml index ab50f458..6c5942a6 100644 --- a/.github/workflows/pr-test-npu.yml +++ b/.github/workflows/pr-test-npu.yml @@ -19,6 +19,8 @@ concurrency: jobs: test-all-build: + if: (github.repository == 'sgl-project/sgl-kernel-npu' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false runs-on: linux-aarch64-a3-16 container: image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11 @@ -49,7 +51,7 @@ jobs: - name: Prepare Deepep run: bash scripts/prepare_deepep_in_container.sh - + - name: Run test intranode timeout-minutes: 10 env: @@ -64,19 +66,13 @@ jobs: run: | python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py - - name: Run enumerate test intranode - timeout-minutes: 360 - env: - HCCL_BUFFSIZE: 2239 - run: | - bash scripts/enumerate_test_intranode.sh - - - name: Run enumerate test low latency - timeout-minutes: 360 + - name: Run test deepep eplb + timeout-minutes: 10 env: - HCCL_BUFFSIZE: 1913 + HCCL_BUFFSIZE: 2000 run: | - bash scripts/enumerate_test_low_latency.sh + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --minus1-flag True --small-bs-flag True test-build-deepep: if: (github.repository == 'sgl-project/sgl-kernel-npu' || github.event_name == 'pull_request') && @@ -126,6 +122,14 @@ jobs: run: | python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py + - name: Run test deepep eplb + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --minus1-flag True --small-bs-flag True + finish: if: always() needs: @@ -144,7 +148,3 @@ jobs: done echo "All jobs completed successfully" exit 0 - - - - From 08a612735f61567797c7d8709b716ce4b681dc53 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Tue, 4 Nov 2025 15:55:29 +0800 Subject: [PATCH 18/24] Update pr-test-npu.yml --- .github/workflows/pr-test-npu.yml | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml index 6c5942a6..b7d4eb1a 100644 --- a/.github/workflows/pr-test-npu.yml +++ b/.github/workflows/pr-test-npu.yml @@ -9,7 +9,6 @@ on: - "python/**" - "csrc/**" - "test/**" - - "scripts/**" - ".github/workflows/pr-test-npu.yml" workflow_dispatch: @@ -66,14 +65,6 @@ jobs: run: | python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py - - name: Run test deepep eplb - timeout-minutes: 10 - env: - HCCL_BUFFSIZE: 2000 - run: | - python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py - python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --minus1-flag True --small-bs-flag True - test-build-deepep: if: (github.repository == 'sgl-project/sgl-kernel-npu' || github.event_name == 'pull_request') && github.event.pull_request.draft == false @@ -122,14 +113,6 @@ jobs: run: | python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py - - name: Run test deepep eplb - timeout-minutes: 10 - env: - HCCL_BUFFSIZE: 2000 - run: | - python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py - python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --minus1-flag True --small-bs-flag True - finish: if: always() needs: @@ -148,3 +131,4 @@ jobs: done echo "All jobs completed successfully" exit 0 + From e8c8a68fbf68cc5a00774269a8ded63b4d961942 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Tue, 4 Nov 2025 15:56:02 +0800 Subject: [PATCH 19/24] Update CMakeLists.txt --- csrc/deepep/ops/op_kernel/CMakeLists.txt | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/csrc/deepep/ops/op_kernel/CMakeLists.txt b/csrc/deepep/ops/op_kernel/CMakeLists.txt index 3d6cb057..c8221a5f 100644 --- a/csrc/deepep/ops/op_kernel/CMakeLists.txt +++ b/csrc/deepep/ops/op_kernel/CMakeLists.txt @@ -3,24 +3,6 @@ if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx") add_ops_compile_options(ALL OPTIONS -g -O0 ) endif() -message(STATUS "ASCEND include path: ${ASCEND_CANN_PACKAGE_PATH}/opp/built-in/op_impl/ai_core/tbe/impl/ascendc/moe_distribute_dispatch") - -# copy moe_distribute_base head file -execute_process( - COMMAND ${CMAKE_COMMAND} -E copy - "${ASCEND_CANN_PACKAGE_PATH}/opp/built-in/op_impl/ai_core/tbe/impl/ascendc/moe_distribute_dispatch/moe_distribute_base.h" - "${CMAKE_CURRENT_BINARY_DIR}/../../op_kernel/moe_distribute_base.h" - RESULT_VARIABLE copy_result -) - -#check moe_distribute_base head file -if(copy_result EQUAL 0) - message(STATUS "Successfully copied moe_distribute_base.h") -else() - message(WARNING "Failed to copy moe_distribute_base.h") -endif() - - add_ops_compile_options(ALL OPTIONS -DASCENDC_DUMP=0 --cce-auto-sync=off) add_kernels_compile() From 0470cb14ffd97e911651d71c23ad4e9a58531fc8 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Tue, 4 Nov 2025 15:56:29 +0800 Subject: [PATCH 20/24] Update pr-test-npu.yml From 12bfe79ce94c738cda58a0ae52141257fb00a636 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Tue, 4 Nov 2025 15:56:48 +0800 Subject: [PATCH 21/24] Update pr-test-npu.yml --- .github/workflows/pr-test-npu.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml index b7d4eb1a..7f9b1964 100644 --- a/.github/workflows/pr-test-npu.yml +++ b/.github/workflows/pr-test-npu.yml @@ -131,4 +131,3 @@ jobs: done echo "All jobs completed successfully" exit 0 - From ae72436755fa16f99c07a3c8e3c43ffe088bad1f Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Wed, 5 Nov 2025 15:20:27 +0800 Subject: [PATCH 22/24] Update daily-build-test.yml --- .github/workflows/daily-build-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/daily-build-test.yml b/.github/workflows/daily-build-test.yml index 4526d0ac..11acd98d 100644 --- a/.github/workflows/daily-build-test.yml +++ b/.github/workflows/daily-build-test.yml @@ -2,7 +2,7 @@ name: Daily Enumerate Tests (Ascend NPU) on: schedule: - - cron: "0 4,16 * * *" # 每天 UTC 时间 4:00 和 16:00 各运行一次(即每12小时) + - cron: "0 16 * * *" # 每天 UTC 时间 16:00(下午4点)运行 workflow_dispatch: # 保留手动触发能力 concurrency: From d4ca2c8522ff628e0fda5e62e3fe3610e13d6acf Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Thu, 6 Nov 2025 10:40:21 +0800 Subject: [PATCH 23/24] Update daily-build-test.yml --- .github/workflows/daily-build-test.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/daily-build-test.yml b/.github/workflows/daily-build-test.yml index 11acd98d..f50d3d18 100644 --- a/.github/workflows/daily-build-test.yml +++ b/.github/workflows/daily-build-test.yml @@ -3,6 +3,12 @@ name: Daily Enumerate Tests (Ascend NPU) on: schedule: - cron: "0 16 * * *" # 每天 UTC 时间 16:00(下午4点)运行 + pull_request: # 添加 pull_request 触发器 + branches: [ main ] # 仅当拉取请求目标分支是 main 时触发 + paths: + - ".github/workflows/daily-build-test.yml" # 当前工作流文件更改时触发 + - "scripts/enumerate_test_intranode.sh" # 指定脚本文件更改时触发 + - "scripts/enumerate_test_low_latency.sh" # 指定脚本文件更改时触发 workflow_dispatch: # 保留手动触发能力 concurrency: From 237ca23e0e52eac07fd6c5cd91769f4222c73431 Mon Sep 17 00:00:00 2001 From: DubhepPan <845021026@qq.com> Date: Thu, 6 Nov 2025 19:08:05 +0800 Subject: [PATCH 24/24] Update daily-build-test.yml --- .github/workflows/daily-build-test.yml | 31 +++----------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/.github/workflows/daily-build-test.yml b/.github/workflows/daily-build-test.yml index f50d3d18..c76c7ac1 100644 --- a/.github/workflows/daily-build-test.yml +++ b/.github/workflows/daily-build-test.yml @@ -59,7 +59,7 @@ jobs: python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py - name: Run enumerate test intranode (Daily) - timeout-minutes: 360 # 6小时超时 + timeout-minutes: 480 # 8小时超时 env: HCCL_BUFFSIZE: 2239 TEST_ENV: daily-build @@ -69,7 +69,7 @@ jobs: echo "Completed daily enumerate intranode test at $(date)" - name: Run enumerate test low latency (Daily) - timeout-minutes: 360 # 6小时超时 + timeout-minutes: 480 # 8小时超时 env: HCCL_BUFFSIZE: 1913 TEST_ENV: daily-build @@ -114,34 +114,9 @@ jobs: *.log retention-days: 30 # 保留30天,便于问题排查 - # 可选:添加一个轻量级的验证任务,确保每日构建的基础环境正常 - daily-smoke-test: - runs-on: linux-aarch64-a3-16 - container: - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11 - timeout-minutes: 30 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Quick environment check - run: | - # 快速检查关键依赖和环境 - python3 --version - pip list | grep -i "torch\|npu" - echo "Basic environment check passed" - - - name: Verify test scripts exist - run: | - # 确认测试脚本存在 - ls -la scripts/enumerate_test_*.sh - ls -la tests/python/deepep/test_*.py - echo "All required test scripts are present" - finish: if: always() - needs: [daily-enumerate-tests, daily-smoke-test] + needs: [daily-enumerate-tests] runs-on: ubuntu-latest steps: