diff --git a/.github/workflows/distribute.yml b/.github/workflows/distribute.yml new file mode 100644 index 000000000000..ddb73d000b45 --- /dev/null +++ b/.github/workflows/distribute.yml @@ -0,0 +1,124 @@ +name: PaddleNLP-Distributed-CI + +on: + push: + branches: [develop, release/*] + pull_request: + +env: + PR_ID: ${{ github.event.pull_request.number }} + COMMIT_ID: ${{ github.event.pull_request.head.sha }} + work_dir: /workspace/PaddleNLP + PADDLENLP_ROOT: /workspace/PaddleNLP + TASK: paddlenlp-CI-${{ github.event.pull_request.number }}-Distributed + ci_scripts: /workspace/PaddleNLP/scripts/distribute + BRANCH: ${{ github.event.pull_request.base.ref }} + AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }} + CI_name: distribute-ci + no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn" + +defaults: + run: + shell: bash + +jobs: + distribute-ci: + name: distribute-test + runs-on: [self-hosted, Auto-Parallel] + steps: + - name: Download PaddleNLP + env: + work_dir: ${{ github.workspace }} + run: | + echo "Downloading PaddleNLP.tar.gz" + wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleNLP.tar --no-check-certificate + echo "Extracting PaddleNLP.tar.gz" + tar xf PaddleNLP.tar && rm -rf PaddleNLP.tar + # wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddlenlp/Bos.zip --no-check-certificate + # unzip -P "$(cat ${{ github.workspace }}/../../../bos_key)" Bos.zip + # mkdir paddlenlp && mv Bos/* ./paddlenlp/ + source ${{ github.workspace }}/../../../proxy + cd PaddleNLP + git config --global user.name "PaddleCI" + git config --global user.email "paddle_ci@example.com" + git pull + git submodule update --init --recursive --force + git fetch origin pull/${PR_ID}/head:test && git checkout test + git remote add upstream https://github.com/PaddlePaddle/PaddleNLP.git + git fetch upstream ${BRANCH} + git merge ${BRANCH} + git diff --numstat ${BRANCH} |awk 
'{print $NF}' + git log --pretty=oneline -10 + + - name: Check docker image and run container + env: + CACHE_DIR: /home/data/cfs/.cache + FLAGS_dynamic_static_unified_comm: "True" + python_version: "3.10" + paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + run: | + container_name=${TASK}-${core_index}-$(date +%Y%m%d-%H%M%S) + echo "container_name=${container_name}" >> ${{ github.env }} + docker_image=registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.8-cudnn8.6-trt8.5-gcc82 + nvidia-docker run -d -t --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \ + -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \ + -v ${{ github.workspace }}:/workspace \ + -v /home/FleetX_CI:/gpt_data \ + -v /home/Llm_gpt_CI:/llm_gpt_data \ + -v /home/Llama_CI:/llama_data \ + -v /home/.cache/pip:/home/.cache/pip \ + -e BRANCH \ + -e PR_ID \ + -e COMMIT_ID \ + -e work_dir \ + -e PADDLENLP_ROOT \ + -e ci_scripts \ + -e no_proxy \ + -e CI_name \ + -e paddle_whl \ + -e FLAGS_dynamic_static_unified_comm \ + -e python_version \ + -w /workspace --runtime=nvidia ${docker_image} + + - name: Test + env: + work_dir: ${{ github.workspace }} + run: | + docker exec -t ${{ env.container_name }} /bin/bash -c ' + ldconfig + pip config set global.cache-dir "/home/.cache/pip" + ln -sf $(which python${python_version}) /usr/bin/python + python -m pip install bce-python-sdk==0.8.74 + ' + docker exec -t ${{ env.container_name }} /bin/bash -c ' + ldconfig + set -e + timeout 80m bash PaddleNLP/scripts/distribute/run_ci.sh ${paddle_whl} + ' + + - name: Upload and display logs + if: always() + env: + home_path: ${{ github.workspace }}/.. + bos_file: ${{ github.workspace }}/../bos/BosClient.py + run: | + docker exec -t ${{ env.container_name }} /bin/bash -c ' + export AK=paddle + export SK=paddle + if [ ! 
-f "${{ env.bos_file }}" ]; then + wget -q --no-proxy -O ${{ env.home_path }}/bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate + mkdir ${{ env.home_path }}/bos + tar xf ${{ env.home_path }}/bos_new.tar.gz -C ${{ env.home_path }}/bos + fi + cd /workspace/case_logs + for FILE in /workspace/case_logs/*; do + file=$(basename "$FILE") + python ${{ env.bos_file }} $file paddle-github-action/PR/Auto-Parallel/${PR_ID}/${COMMIT_ID}/logs + echo "$file: https://paddle-github-action.bj.bcebos.com/PR/Auto-Parallel/${PR_ID}/${COMMIT_ID}/logs/$file" + done + ' + - name: Terminate and delete the container + if: always() + run: | + docker rm -f ${{ env.container_name }} + \ No newline at end of file diff --git a/.github/workflows/llm.yml b/.github/workflows/llm.yml new file mode 100644 index 000000000000..1ae31fb48881 --- /dev/null +++ b/.github/workflows/llm.yml @@ -0,0 +1,148 @@ +name: PaddleNLP LLM CI + +on: + push: + branches: [develop, release/*] + pull_request: + +env: + PR_ID: ${{ github.event.pull_request.number }} + COMMIT_ID: ${{ github.event.pull_request.head.sha }} + work_dir: /workspace/PaddleNLP + PADDLENLP_ROOT: /workspace/PaddleNLP + TASK: paddlenlp-CI-${{ github.event.pull_request.number }}-llm + ci_scripts: /workspace/PaddleNLP/scripts/regression + BRANCH: ${{ github.event.pull_request.base.ref }} + AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }} + CI_name: llm-ci + no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn" + HF_ENDPOINT: https://hf-mirror.com + STUDIO_GIT_HOST: http://git.prod.idc-to-cloud.aistudio.baidu-int.com + PPNLP_HOME: /ssd1/paddlenlp + NLP_DIR: /workspace/PaddleNLP + HF_DATASETS_CACHE: /ssd1/paddlenlp/huggingface/datasets + TRANSFORMERS_CACHE: /ssd1/paddlenlp/huggingface + +defaults: + run: + shell: bash + +jobs: + llm-ci: + name: llm-test + runs-on: [self-hosted, 4gpu] + steps: + - name: Download PaddleNLP 
+ env: + work_dir: ${{ github.workspace }} + run: | + echo "Downloading PaddleNLP.tar.gz" + wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleNLP.tar --no-check-certificate + echo "Extracting PaddleNLP.tar.gz" + tar xf PaddleNLP.tar && rm -rf PaddleNLP.tar + # wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddlenlp/Bos.zip --no-check-certificate + # unzip -P "$(cat ${{ github.workspace }}/../../../bos_key)" Bos.zip + # mkdir paddlenlp && mv Bos/* ./paddlenlp/ + source ${{ github.workspace }}/../../../proxy + cd PaddleNLP + git config --global user.name "PaddleCI" + git config --global user.email "paddle_ci@example.com" + git pull + git submodule update --init --recursive --force + git fetch origin pull/${PR_ID}/head:test && git checkout test + git remote add upstream https://github.com/PaddlePaddle/PaddleNLP.git + git fetch upstream ${BRANCH} + git merge ${BRANCH} + git diff --numstat ${BRANCH} |awk '{print $NF}' + git log --pretty=oneline -10 + + - name: Parse GPU ID from runner label + id: parse-gpu + run: | + echo "Runner labels: $RUNNER_LABELS" + + if [[ "$RUNNER_LABELS" == *"gpu-0-3"* ]]; then + echo "cudaid=0,1,2,3" >> ${{ github.env }} + elif [[ "$RUNNER_LABELS" == *"gpu-4-7"* ]]; then + echo "cudaid=4,5,6,7" >> ${{ github.env }} + else + echo "Unknown runner label, fallback to default CUDA" + echo "cudaid=0,1,2,3" >> ${{ github.env }} + fi + + - name: Check docker image and run container + env: + CACHE_DIR: /home/data/cfs/.cache + FLAGS_dynamic_static_unified_comm: "True" + python_version: "3.10" + paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + run: | + container_name=${TASK}-${core_index}-$(date +%Y%m%d-%H%M%S) + echo "container_name=${container_name}" >> ${{ github.env }} + 
docker_image=iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu18.04-gcc8.2-cuda11.8-cudnn8.6-nccl2.15.5-20250311 + nvidia-docker run -d -t --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \ + -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \ + -v ${{ github.workspace }}:/workspace \ + -v /home/.cache/pip:/home/.cache/pip \ + -v /ssd1/paddlenlp:/ssd1/paddlenlp \ + -e BRANCH \ + -e PR_ID \ + -e COMMIT_ID \ + -e work_dir \ + -e PADDLENLP_ROOT \ + -e ci_scripts \ + -e no_proxy \ + -e CI_name \ + -e paddle_whl \ + -e FLAGS_dynamic_static_unified_comm \ + -e python_version \ + -e CUDA_VISIBLE_DEVICES=${{ env.cudaid }} \ + -w /workspace --runtime=nvidia ${docker_image} + + - name: Test + env: + work_dir: ${{ github.workspace }} + run: | + docker exec -t ${{ env.container_name }} /bin/bash -c ' + ldconfig + ln -sf $(which python${python_version}) /usr/bin/python + python -m pip install bce-python-sdk==0.8.74 + pip config set global.cache-dir "/home/.cache/pip" + echo "[global] + timeout = 60 + index = https://pip.baidu-int.com/search/ + index-url = https://pip.baidu-int.com/simple/ + trusted-host = pip.baidu-int.com" > /etc/pip.conf + ' + docker exec -t ${{ env.container_name }} /bin/bash -c ' + ldconfig + set -e + timeout 2h bash PaddleNLP/scripts/regression/run_ci.sh ${python_version} ${paddle_whl} + ' + + - name: Upload and display logs + if: always() + env: + home_path: ${{ github.workspace }}/.. + bos_file: ${{ github.workspace }}/../bos/BosClient.py + run: | + docker exec -t ${{ env.container_name }} /bin/bash -c ' + export AK=paddle + export SK=paddle + if [ ! 
-f "${{ env.bos_file }}" ]; then + wget -q --no-proxy -O ${{ env.home_path }}/bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate + mkdir ${{ env.home_path }}/bos + tar xf ${{ env.home_path }}/bos_new.tar.gz -C ${{ env.home_path }}/bos + fi + cd /workspace/case_logs + for FILE in /workspace/case_logs/*; do + file=$(basename "$FILE") + python ${{ env.bos_file }} $file paddle-github-action/PR/Auto-Parallel/${PR_ID}/${COMMIT_ID}/logs + echo "$file: https://paddle-github-action.bj.bcebos.com/PR/Auto-Parallel/${PR_ID}/${COMMIT_ID}/logs/$file" + done + ' + - name: Terminate and delete the container + if: always() + run: | + docker rm -f ${{ env.container_name }} + diff --git a/.github/workflows/unit.yml b/.github/workflows/unit.yml new file mode 100644 index 000000000000..135a4586a086 --- /dev/null +++ b/.github/workflows/unit.yml @@ -0,0 +1,147 @@ +name: PaddleNLP Unit CI + +on: + push: + branches: [develop, release/*] + pull_request: + +env: + PR_ID: ${{ github.event.pull_request.number }} + COMMIT_ID: ${{ github.event.pull_request.head.sha }} + work_dir: /workspace/PaddleNLP + PADDLENLP_ROOT: /workspace/PaddleNLP + TASK: paddlenlp-CI-${{ github.event.pull_request.number }}-unit + ci_scripts: /workspace/PaddleNLP/scripts/unit_test + BRANCH: ${{ github.event.pull_request.base.ref }} + CI_name: unit-ci + no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn" + HF_ENDPOINT: https://hf-mirror.com + STUDIO_GIT_HOST: http://git.prod.idc-to-cloud.aistudio.baidu-int.com + PPNLP_HOME: /ssd1/paddlenlp + NLP_DIR: /workspace/PaddleNLP + HF_DATASETS_CACHE: /ssd1/paddlenlp/huggingface/datasets + TRANSFORMERS_CACHE: /ssd1/paddlenlp/huggingface + +defaults: + run: + shell: bash + +jobs: + unit-ci: + name: unit-test + runs-on: [self-hosted, 4gpu] + steps: + - name: Download PaddleNLP + env: + work_dir: ${{ github.workspace }} + run: | + echo "Downloading 
PaddleNLP.tar.gz" + wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleNLP.tar --no-check-certificate + echo "Extracting PaddleNLP.tar.gz" + tar xf PaddleNLP.tar && rm -rf PaddleNLP.tar + # wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddlenlp/Bos.zip --no-check-certificate + # unzip -P "$(cat ${{ github.workspace }}/../../../bos_key)" Bos.zip + # mkdir paddlenlp && mv Bos/* ./paddlenlp/ + source ${{ github.workspace }}/../../../proxy + cd PaddleNLP + git config --global user.name "PaddleCI" + git config --global user.email "paddle_ci@example.com" + git pull + git submodule update --init --recursive --force + git fetch origin pull/${PR_ID}/head:test && git checkout test + git remote add upstream https://github.com/PaddlePaddle/PaddleNLP.git + git fetch upstream ${BRANCH} + git merge ${BRANCH} + git diff --numstat ${BRANCH} |awk '{print $NF}' + git log --pretty=oneline -10 + + - name: Parse GPU ID from runner label + id: parse-gpu + run: | + echo "Runner labels: $RUNNER_LABELS" + + if [[ "$RUNNER_LABELS" == *"gpu-0-3"* ]]; then + echo "cudaid=0,1,2,3" >> ${{ github.env }} + elif [[ "$RUNNER_LABELS" == *"gpu-4-7"* ]]; then + echo "cudaid=4,5,6,7" >> ${{ github.env }} + else + echo "Unknown runner label, fallback to default CUDA" + echo "cudaid=0,1,2,3" >> ${{ github.env }} + fi + + - name: Check docker image and run container + env: + CACHE_DIR: /home/data/cfs/.cache + FLAGS_dynamic_static_unified_comm: "True" + python_version: "3.10" + paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + run: | + container_name=${TASK}-${core_index}-$(date +%Y%m%d-%H%M%S) + echo "container_name=${container_name}" >> ${{ github.env }} + docker_image=iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu18.04-gcc8.2-cuda11.8-cudnn8.6-nccl2.15.5-20250311 + nvidia-docker run -d -t --name ${container_name} 
--net=host -v /dev/shm:/dev/shm --shm-size=32G \ + -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \ + -v ${{ github.workspace }}:/workspace \ + -v /home/.cache/pip:/home/.cache/pip \ + -v /ssd1/paddlenlp:/ssd1/paddlenlp \ + -e BRANCH \ + -e PR_ID \ + -e COMMIT_ID \ + -e work_dir \ + -e PADDLENLP_ROOT \ + -e ci_scripts \ + -e no_proxy \ + -e CI_name \ + -e paddle_whl \ + -e FLAGS_dynamic_static_unified_comm \ + -e python_version \ + -e CUDA_VISIBLE_DEVICES=${{ env.cudaid }} \ + -w /workspace --runtime=nvidia ${docker_image} + + - name: Test + env: + work_dir: ${{ github.workspace }} + run: | + docker exec -t ${{ env.container_name }} /bin/bash -c ' + ldconfig + ln -sf $(which python${python_version}) /usr/bin/python + python -m pip install bce-python-sdk==0.8.74 + pip config set global.cache-dir "/home/.cache/pip" + echo "[global] + timeout = 60 + index = https://pip.baidu-int.com/search/ + index-url = https://pip.baidu-int.com/simple/ + trusted-host = pip.baidu-int.com" > /etc/pip.conf + ' + docker exec -t ${{ env.container_name }} /bin/bash -c ' + ldconfig + set -e + # rm -rf tests/utils/test_import_utils.py + timeout 50m bash PaddleNLP/scripts/unit_test/ci_unit.sh ${paddle_whl} + ' + + - name: Upload and display logs + if: always() + env: + home_path: ${{ github.workspace }}/.. + bos_file: ${{ github.workspace }}/../bos/BosClient.py + run: | + docker exec -t ${{ env.container_name }} /bin/bash -c ' + export AK=paddle + export SK=paddle + if [ ! 
-f "${{ env.bos_file }}" ]; then + wget -q --no-proxy -O ${{ env.home_path }}/bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate + mkdir ${{ env.home_path }}/bos + tar xf ${{ env.home_path }}/bos_new.tar.gz -C ${{ env.home_path }}/bos + fi + cd /workspace/case_logs + for FILE in /workspace/case_logs/*; do + file=$(basename "$FILE") + python ${{ env.bos_file }} $file paddle-github-action/PR/Auto-Parallel/${PR_ID}/${COMMIT_ID}/logs + echo "$file: https://paddle-github-action.bj.bcebos.com/PR/Auto-Parallel/${PR_ID}/${COMMIT_ID}/logs/$file" + done + ' + - name: Terminate and delete the container + if: always() + run: | + docker rm -f ${{ env.container_name }} diff --git a/scripts/distribute/run_ci.sh b/scripts/distribute/run_ci.sh index fce2fd8b1f80..aacb8d138d50 100644 --- a/scripts/distribute/run_ci.sh +++ b/scripts/distribute/run_ci.sh @@ -31,6 +31,7 @@ target_lists_for_gpt=( "llm/auto_parallel/gpt-3" "paddlenlp/transformers/gpt" "scripts/distribute" + ".github/workflows/distribute.yml" ) target_lists_for_llama=( @@ -38,6 +39,7 @@ target_lists_for_llama=( "paddlenlp/trainer/auto_trainer.py" "paddlenlp/transformers/llama" "scripts/distribute" + ".github/workflows/distribute.yml" ) target_lists_for_deepseek=( @@ -49,6 +51,7 @@ target_lists_for_deepseek=( "paddlenlp/transformers/moe_layer_auto.py" "paddlenlp/transformers/moe_gate_auto.py" "scripts/distribute" + ".github/workflows/distribute.yml" ) target_path_for_ci_scripts="scripts/distribute" diff --git a/scripts/regression/run_ci.sh b/scripts/regression/run_ci.sh index 71eba33f2eda..ec650bc857b7 100644 --- a/scripts/regression/run_ci.sh +++ b/scripts/regression/run_ci.sh @@ -38,6 +38,7 @@ target_lists_for_llm=( "tests/llm" "csrc" "scripts/regression" + ".github/workflows/llm.yml" ) all_P0case_dic=(["msra_ner"]=15 ["glue"]=2