diff --git a/.github/workflows/daily-build-test.yml b/.github/workflows/daily-build-test.yml
new file mode 100644
index 00000000..c76c7ac1
--- /dev/null
+++ b/.github/workflows/daily-build-test.yml
@@ -0,0 +1,151 @@
+name: Daily Enumerate Tests (Ascend NPU)
+
+on:
+  schedule:
+    - cron: "0 16 * * *"  # Run every day at 16:00 UTC
+  pull_request:  # Also trigger on pull requests
+    branches: [ main ]  # Only when the pull request targets main
+    paths:
+      - ".github/workflows/daily-build-test.yml"  # This workflow file changed
+      - "scripts/enumerate_test_intranode.sh"  # Enumerate script changed
+      - "scripts/enumerate_test_low_latency.sh"  # Enumerate script changed
+  workflow_dispatch:  # Keep the manual trigger
+
+concurrency:
+  group: daily-enumerate-tests-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  daily-enumerate-tests:
+    runs-on: linux-aarch64-a3-16
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+
+    steps:
+      - name: Clean git config
+        run: |
+          CONFIG_KEY='http.https://gh-proxy.test.osinfra.cn/.extraheader'
+          git config --global --unset "$CONFIG_KEY" || true
+
+      - name: Clean workspace
+        run: |
+          # ${VAR:?} aborts rather than expanding to "/*" if the variable is empty
+          sudo rm -rf --one-file-system "${GITHUB_WORKSPACE:?}"/* "${GITHUB_WORKSPACE:?}"/.* 2>/dev/null || true
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          clean: true
+
+      - name: Install dependencies
+        run: |
+          # speed up by using infra cache services
+          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
+          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
+          pip config set global.index-url http://${CACHING_URL}/pypi/simple
+          pip config set global.trusted-host ${CACHING_URL}
+
+          bash scripts/npu_ci_install_dependency.sh
+
+      - name: Prepare Deepep
+        run: bash scripts/prepare_deepep_in_container.sh
+
+      - name: Run quick sanity tests
+        timeout-minutes: 20
+        env:
+          HCCL_BUFFSIZE: 2239
+        run: |
+          # Run the quick tests first to make sure basic functionality works
+          python3 "$GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py"
+          python3 "$GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py"
+
+      - name: Run enumerate test intranode (Daily)
+        id: enumerate_intranode
+        timeout-minutes: 480  # 8-hour timeout
+        # bash is invoked with -eo pipefail, so tee cannot mask a script failure
+        shell: bash
+        env:
+          HCCL_BUFFSIZE: 2239
+          TEST_ENV: daily-build
+        run: |
+          mkdir -p test-results
+          echo "Starting daily enumerate intranode test at $(date)"
+          bash scripts/enumerate_test_intranode.sh 2>&1 | tee test-results/enumerate-intranode-results.txt
+          echo "Completed daily enumerate intranode test at $(date)"
+
+      - name: Run enumerate test low latency (Daily)
+        # Still run when only the intranode enumeration failed
+        if: ${{ success() || steps.enumerate_intranode.conclusion == 'failure' }}
+        timeout-minutes: 480  # 8-hour timeout
+        # bash is invoked with -eo pipefail, so tee cannot mask a script failure
+        shell: bash
+        env:
+          HCCL_BUFFSIZE: 1913
+          TEST_ENV: daily-build
+        run: |
+          mkdir -p test-results
+          echo "Starting daily enumerate low latency test at $(date)"
+          bash scripts/enumerate_test_low_latency.sh 2>&1 | tee test-results/enumerate-low-latency-results.txt
+          echo "Completed daily enumerate low latency test at $(date)"
+
+      - name: Generate daily test report
+        if: always()
+        run: |
+          echo "## Daily Enumerate Tests Report" >> "$GITHUB_STEP_SUMMARY"
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          echo "**Execution Time**: $(date)" >> "$GITHUB_STEP_SUMMARY"
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          echo "### Test Results:" >> "$GITHUB_STEP_SUMMARY"
+
+          # Check for the result files written by the enumerate steps
+          if [ -f "test-results/enumerate-intranode-results.txt" ]; then
+            echo "- **Intranode Enumerate Test**: Completed" >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "- **Intranode Enumerate Test**: No result file found" >> "$GITHUB_STEP_SUMMARY"
+          fi
+
+          if [ -f "test-results/enumerate-low-latency-results.txt" ]; then
+            echo "- **Low Latency Enumerate Test**: Completed" >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "- **Low Latency Enumerate Test**: No result file found" >> "$GITHUB_STEP_SUMMARY"
+          fi
+
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          echo "**Workflow Run**: [$GITHUB_RUN_ID](https://github.com/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID)" >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload test artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: daily-enumerate-results-${{ github.sha }}-${{ github.run_id }}
+          path: |
+            test-results/
+            logs/
+            *.log
+          retention-days: 30  # Keep for 30 days to ease troubleshooting
+
+  finish:
+    if: always()
+    needs: [daily-enumerate-tests]
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check all dependent job statuses
+        run: |
+          results=(${{ join(needs.*.result, ' ') }})
+          all_success=true
+
+          for result in "${results[@]}"; do
+            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
+              echo "Job failed with result: $result"
+              all_success=false
+            fi
+          done
+
+          if [ "$all_success" = true ]; then
+            echo "All daily enumerate tests completed successfully"
+            exit 0
+          else
+            echo "Some daily tests failed"
+            exit 1
+          fi
diff --git a/scripts/enumerate_test_intranode.sh b/scripts/enumerate_test_intranode.sh
new file mode 100644
index 00000000..c37a6db8
--- /dev/null
+++ b/scripts/enumerate_test_intranode.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+#
+# Enumerate test_intranode.py over every combination of the parameter lists
+# below. Every combination is run even if an earlier one fails; the script
+# exits non-zero if any combination failed so the CI step reports the failure.
+
+# Switch to the test directory; abort if it is missing
+cd "${GITHUB_WORKSPACE}/tests/python/deepep" || exit 1
+
+# Parameter ranges to enumerate
+NUM_PROCESSES_LIST=(8 16)
+NUM_TOKENS_LIST=(1 4096)
+HIDDEN_LIST=(4096 7168)
+NUM_TOPK_LIST=(8 9)
+NUM_EXPERTS_LIST=(64 256)
+ACTIVE_RANKS_LIST=("" "0,1" "0,2,3")
+ENABLE_DIAGNOSE_LIST=("false" "true")
+
+SCRIPT="test_intranode.py"
+
+TOTAL=0
+FAILED_CMDS=()
+
+# Iterate over all combinations
+for NUM_PROCESSES in "${NUM_PROCESSES_LIST[@]}"; do
+  for NUM_TOKENS in "${NUM_TOKENS_LIST[@]}"; do
+    for HIDDEN in "${HIDDEN_LIST[@]}"; do
+      for NUM_TOPK in "${NUM_TOPK_LIST[@]}"; do
+        for NUM_EXPERTS in "${NUM_EXPERTS_LIST[@]}"; do
+          for ACTIVE_RANKS in "${ACTIVE_RANKS_LIST[@]}"; do
+            for ENABLE_DIAGNOSE in "${ENABLE_DIAGNOSE_LIST[@]}"; do
+
+              # Build the command as an array so it can run without eval
+              CMD=(python3 "$SCRIPT"
+                --num-processes "$NUM_PROCESSES"
+                --num-tokens "$NUM_TOKENS"
+                --hidden "$HIDDEN"
+                --num-topk "$NUM_TOPK"
+                --num-experts "$NUM_EXPERTS")
+
+              # Append the optional arguments
+              if [ -n "$ACTIVE_RANKS" ]; then
+                CMD+=(--active-ranks "$ACTIVE_RANKS")
+              fi
+
+              if [ "$ENABLE_DIAGNOSE" == "true" ]; then
+                CMD+=(--enable-diagnose)
+              fi
+
+              # Print and run the command, remembering any failure
+              echo "Running: ${CMD[*]}"
+              TOTAL=$((TOTAL + 1))
+              if ! "${CMD[@]}"; then
+                echo "FAILED: ${CMD[*]}"
+                FAILED_CMDS+=("${CMD[*]}")
+              fi
+
+              echo "--------------------------------------------------"
+
+            done
+          done
+        done
+      done
+    done
+  done
+done
+
+echo "Ran $TOTAL combinations, ${#FAILED_CMDS[@]} failed"
+if [ "${#FAILED_CMDS[@]}" -ne 0 ]; then
+  printf '  %s\n' "${FAILED_CMDS[@]}"
+  exit 1
+fi
diff --git a/scripts/enumerate_test_low_latency.sh b/scripts/enumerate_test_low_latency.sh
new file mode 100644
index 00000000..e5dad220
--- /dev/null
+++ b/scripts/enumerate_test_low_latency.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+#
+# Enumerate test_low_latency.py over every combination of the parameter lists
+# below. Every combination is run even if an earlier one fails; the script
+# exits non-zero if any combination failed so the CI step reports the failure.
+
+# Switch to the test directory; abort if it is missing
+cd "${GITHUB_WORKSPACE}/tests/python/deepep" || exit 1
+
+# Parameter ranges to enumerate
+NUM_PROCESSES_LIST=(8 16)
+NUM_TOKENS_LIST=(128 512)
+HIDDEN_LIST=(4096 7168)
+NUM_TOPK_LIST=(8 9)
+NUM_EXPERTS_LIST=(64 256)
+ACTIVE_RANKS_LIST=("" "0,1" "0,2,3")
+ENABLE_DIAGNOSE_LIST=("false" "true")
+
+SCRIPT="test_low_latency.py"
+
+TOTAL=0
+FAILED_CMDS=()
+
+# Iterate over all combinations
+for NUM_PROCESSES in "${NUM_PROCESSES_LIST[@]}"; do
+  for NUM_TOKENS in "${NUM_TOKENS_LIST[@]}"; do
+    for HIDDEN in "${HIDDEN_LIST[@]}"; do
+      for NUM_TOPK in "${NUM_TOPK_LIST[@]}"; do
+        for NUM_EXPERTS in "${NUM_EXPERTS_LIST[@]}"; do
+          for ACTIVE_RANKS in "${ACTIVE_RANKS_LIST[@]}"; do
+            for ENABLE_DIAGNOSE in "${ENABLE_DIAGNOSE_LIST[@]}"; do
+
+              # Build the command as an array so it can run without eval
+              CMD=(python3 "$SCRIPT"
+                --num-processes "$NUM_PROCESSES"
+                --num-tokens "$NUM_TOKENS"
+                --hidden "$HIDDEN"
+                --num-topk "$NUM_TOPK"
+                --num-experts "$NUM_EXPERTS")
+
+              # Append the optional arguments
+              if [ -n "$ACTIVE_RANKS" ]; then
+                CMD+=(--active-ranks "$ACTIVE_RANKS")
+              fi
+
+              if [ "$ENABLE_DIAGNOSE" == "true" ]; then
+                CMD+=(--enable-diagnose)
+              fi
+
+              # Print and run the command, remembering any failure
+              echo "Running: ${CMD[*]}"
+              TOTAL=$((TOTAL + 1))
+              if ! "${CMD[@]}"; then
+                echo "FAILED: ${CMD[*]}"
+                FAILED_CMDS+=("${CMD[*]}")
+              fi
+
+              echo "--------------------------------------------------"
+
+            done
+          done
+        done
+      done
+    done
+  done
+done
+
+echo "Ran $TOTAL combinations, ${#FAILED_CMDS[@]} failed"
+if [ "${#FAILED_CMDS[@]}" -ne 0 ]; then
+  printf '  %s\n' "${FAILED_CMDS[@]}"
+  exit 1
+fi