30 changes: 30 additions & 0 deletions .github/actions/rerun-workflow/action.yml
@@ -0,0 +1,30 @@
name: 'Rerun Workflow'
description: 'Re-run GitHub Actions workflow for a given Pull Request'
inputs:
  GITHUB_TOKEN:
    description: 'GitHub token with repo scope'
    required: true
  OWNER:
    description: 'Repository owner'
    required: true
  REPO:
    description: 'Repository name'
    required: true
  PR_ID:
    description: 'Pull Request ID'
    required: true
  JOB_NAME:
    description: 'Job name to rerun'
    required: true

runs:
  using: 'composite'
  steps:
    - run: bash ./.github/actions/rerun-workflow/rerun.sh
      shell: bash
      env:
        GITHUB_TOKEN: ${{ inputs.GITHUB_TOKEN }}
        OWNER: ${{ inputs.OWNER }}
        REPO: ${{ inputs.REPO }}
        PR_ID: ${{ inputs.PR_ID }}
        JOB_NAME: ${{ inputs.JOB_NAME }}
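For context, a minimal sketch of how a workflow step might invoke this composite action; the checkout step, repository name, and JOB_NAME choice below are illustrative assumptions, not part of this diff:

      - uses: actions/checkout@v4                   # the action lives in the repo, so it must be checked out first
      - name: Rerun CI jobs for the pull request
        uses: ./.github/actions/rerun-workflow
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          OWNER: ${{ github.repository_owner }}
          REPO: PaddleNLP                            # hypothetical; use the actual repository name
          PR_ID: ${{ github.event.issue.number }}    # assumes a comment-style trigger on the PR
          JOB_NAME: all-failed                       # rerun.sh treats this value as "rerun every failed job"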
77 changes: 77 additions & 0 deletions .github/actions/rerun-workflow/rerun.sh
@@ -0,0 +1,77 @@
# Copyright (c) 2025 PaddleNLP Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -e

# Resolve the head commit of the pull request.
COMMIT_SHA=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
    "https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_ID" | jq -r '.head.sha')

echo "Commit SHA: $COMMIT_SHA"

# List all workflow runs triggered for that commit.
response=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
    "https://api.github.com/repos/$OWNER/$REPO/actions/runs?head_sha=$COMMIT_SHA&per_page=100")

echo "Response: $response"

run_ids=$(echo "$response" | jq -r '.workflow_runs[].id')

if [ -n "$run_ids" ]; then
    echo "Found run_ids for commit $COMMIT_SHA: $run_ids"

    for run_id in $run_ids; do
        if [ "$JOB_NAME" = "all-failed" ]; then
            echo "Rerunning all failed jobs for run_id: $run_id"

            rerun_response=$(curl -X POST -s -w "%{http_code}" -o /dev/null \
                -H "Accept: application/vnd.github.v3+json" \
                -H "Authorization: Bearer $GITHUB_TOKEN" \
                "https://api.github.com/repos/$OWNER/$REPO/actions/runs/$run_id/rerun-failed-jobs")
            if [ "$rerun_response" -eq 201 ]; then
                echo "Successfully requested rerun for all failed jobs in run_id: $run_id"
            else
                echo "Failed to request rerun for run_id: $run_id with status code $rerun_response"
            fi

        else
            jobs_response=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
                "https://api.github.com/repos/$OWNER/$REPO/actions/runs/$run_id/jobs")

            echo "Jobs Response for run_id $run_id: $jobs_response"

            # Select only the jobs whose name matches JOB_NAME exactly.
            # if [[ "$JOB_NAME" == *"bypass"* ]]; then
            block_jobs=$(echo "$jobs_response" | jq -r --arg job_name "$JOB_NAME" \
                '.jobs[] | select(.name == $job_name) | .id')
            # else
            #     block_jobs=$(echo "$jobs_response" | jq -r --arg job_name "$JOB_NAME" \
            #         '.jobs[] | select(.name == $job_name and .conclusion != "success") | .id')
            # fi

            if [ -n "$block_jobs" ]; then
                echo "Found block jobs for run_id $run_id: $block_jobs"

                for job_id in $block_jobs; do
                    echo "Rerunning job_id: $job_id"
                    curl -X POST -H "Accept: application/vnd.github.v3+json" \
                        -H "Authorization: token $GITHUB_TOKEN" \
                        "https://api.github.com/repos/$OWNER/$REPO/actions/jobs/$job_id/rerun"
                done
            else
                echo "No block jobs found for run_id $run_id with name $JOB_NAME."
            fi
        fi
    done
else
    echo "No matching workflow runs found for commit $COMMIT_SHA."
    exit 1
fi
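The branch on JOB_NAME above implies two calling modes: the literal value all-failed reruns every failed job in each run for the PR's head commit, while any other value reruns only jobs whose name matches exactly. A hedged sketch of a comment-triggered workflow that could supply that value; the workflow name, the /re-run command syntax, and the parsing step are assumptions, not part of this diff:

name: Rerun-by-comment              # hypothetical trigger workflow, for illustration only
on:
  issue_comment:
    types: [created]
jobs:
  rerun:
    # react only to "/re-run <job-name|all-failed>" comments on pull requests
    if: github.event.issue.pull_request && startsWith(github.event.comment.body, '/re-run')
    runs-on: ubuntu-latest
    permissions:
      actions: write                # needed to rerun workflow runs and jobs
    steps:
      - uses: actions/checkout@v4
      - name: Parse the requested job name from the comment
        id: parse
        env:
          COMMENT: ${{ github.event.comment.body }}
        run: echo "job=$(echo "$COMMENT" | awk '{print $2}')" >> "$GITHUB_OUTPUT"
      - uses: ./.github/actions/rerun-workflow
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          OWNER: ${{ github.repository_owner }}
          REPO: ${{ github.event.repository.name }}
          PR_ID: ${{ github.event.issue.number }}
          JOB_NAME: ${{ steps.parse.outputs.job }}   # "all-failed" or an exact job name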
175 changes: 175 additions & 0 deletions .github/workflows/distribute.yml
@@ -0,0 +1,175 @@
name: Distribute CI (V100)

on:
  pull_request:
    types: [opened, synchronize, reopened]
    branches: [develop]
  schedule:
    - cron: "2 0 * * *"
  workflow_call:
    inputs:
      run_downstream:
        required: true
        type: string
      image_name:
        required: true
        type: string

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
  cancel-in-progress: true

env:
  PR_ID: ${{ github.event.pull_request.number }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha }}
  TASK: paddlenlp-CI-${{ github.event.pull_request.number }}-Distribut-V100
  ci_scripts: /workspace/PaddleNLP/scripts/distribute
  BRANCH: ${{ github.event.pull_request.base.ref }}
  AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }}
  CI_name: distribute-ci
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"
  GITHUB_EVENT_NAME: ${{ github.event_name }}
  RUN_DOWNSTREAM: ${{ inputs.run_downstream }}

defaults:
  run:
    shell: bash

jobs:
  distribute-v100-ci:
    name: distribute-v100-ci
    runs-on:
      group: Auto-Parallel
    steps:
      - name: Determine Image Name
        env:
          IMAGE_NAME: ${{ inputs.image_name }}
        run: |
          if [[ -n "${IMAGE_NAME}" ]]; then
            echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITHUB_ENV"
          else
            echo "IMAGE_NAME=registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.8-cudnn8.6-trt8.5-gcc82" >> "$GITHUB_ENV"
          fi

      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          CACHE_DIR: /home/data/cfs/.cache
          FLAGS_dynamic_static_unified_comm: "True"
          python_version: "3.10"
          paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            nvidia-docker run -d -t --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
              -v $work_dir/../../..:$work_dir/../../.. \
              -v $work_dir:/workspace \
              -v /home/.cache/pip:/home/.cache/pip \
              -v /home/FleetX_CI:/fleetx_data \
              -v /home/Llm_gpt_CI:/llm_gpt_data \
              -v /home/Llama_CI:/llama_data \
              -e BRANCH \
              -e AGILE_COMPILE_BRANCH \
              -e PR_ID \
              -e COMMIT_ID \
              -e work_dir \
              -e ci_scripts \
              -e no_proxy \
              -e CI_name \
              -e paddle_whl \
              -e FLAGS_dynamic_static_unified_comm \
              -e python_version \
              -w /workspace --runtime=nvidia ${{ env.IMAGE_NAME }}
          fi

      - name: Download Code
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              rm -rf * .[^.]*
              echo "Downloading PaddleNLP.tar"
              wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleNLP.tar --no-check-certificate
              echo "Extracting PaddleNLP.tar"
              tar xf PaddleNLP.tar && rm -rf PaddleNLP.tar
              source $work_dir/../../../proxy
              cd PaddleNLP
              git config --global user.name "PaddleCI"
              git config --global user.email "paddle_ci@example.com"
              git pull
              git submodule update --init --recursive --force
              if [ -n "${PR_ID}" ]; then
                git fetch origin pull/${PR_ID}/head
                git checkout -b PR_${PR_ID} FETCH_HEAD
                git remote add upstream https://github.com/PaddlePaddle/PaddleFormers.git
                git fetch upstream ${BRANCH}
                git merge ${BRANCH} --no-edit
                git diff --numstat ${BRANCH} -- | awk "{print \$NF}"
              else
                echo "Not in a pull_request event. Skipping PR-specific operations."
              fi
              git log --pretty=oneline -10
            '
          fi

      - name: Test
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              ldconfig
              ln -sf $(which python${python_version}) /usr/bin/python
              pip config set global.cache-dir "/home/.cache/pip"
              source $work_dir/../../../proxy
              set -e
              cd /workspace/PaddleNLP && git config --global --add safe.directory $PWD
              timeout 80m bash scripts/distribute/run_ci.sh ${paddle_whl}
            '
          fi

      - name: Upload Logs
        if: always()
        env:
          home_path: ${{ github.workspace }}/../../..
          bos_file: ${{ github.workspace }}/../../../bos/BosClient.py
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              if [ ! -f "${{ env.bos_file }}" ]; then
                wget -q --no-proxy -O ${{ env.home_path }}/bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
                mkdir ${{ env.home_path }}/bos
                tar xf ${{ env.home_path }}/bos_new.tar.gz -C ${{ env.home_path }}/bos
              fi

              if [[ "${{ env.RUN_DOWNSTREAM }}" == "" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}"
              elif [[ "${{ env.RUN_DOWNSTREAM }}" == "true" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}/test_build"
              else
                bos_prefix="schedule/$(date +%Y%m%d)"
              fi

              cd /workspace/case_logs
              for FILE in /workspace/case_logs/*; do
                file=$(basename "$FILE")
                python ${{ env.bos_file }} $file paddle-github-action/PR/PaddleNLP/distribute/${bos_prefix}/logs
                echo "$file: https://paddle-github-action.bj.bcebos.com/PR/PaddleNLP/distribute/${bos_prefix}/logs/$file"
              done
              tar -czf products.tar.gz ./
              python ${{ env.bos_file }} products.tar.gz paddle-github-action/PR/PaddleNLP/distribute/${bos_prefix}/logs
              echo "products: https://paddle-github-action.bj.bcebos.com/PR/PaddleNLP/distribute/${bos_prefix}/logs/products.tar.gz"
            '
          fi

      - name: Terminate And Delete the Container
        if: always()
        run: |
          docker rm -f $container_name 2>/dev/null || true
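Because the workflow also exposes a workflow_call trigger with run_downstream and image_name inputs, it can be reused from another workflow. A minimal sketch, with the caller's name and trigger assumed for illustration only:

name: test-build                    # hypothetical caller workflow
on:
  pull_request:
jobs:
  distribute-v100:
    uses: ./.github/workflows/distribute.yml
    with:
      run_downstream: "true"
      image_name: registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.8-cudnn8.6-trt8.5-gcc82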