Skip to content

Commit 8205e3d

Browse files
authored
[CI]add rl to ci (#10553)
* fix scripts for docs * fix timeout * add grpo&rf++ * fix path * update yaml * fix * fix data * add install_external_ops * fix fused_ln * fix * update cmd * fix path * fix * fix path * fix codestyle * update class name * update name
1 parent 3570aaa commit 8205e3d

File tree

4 files changed

+248
-13
lines changed

4 files changed

+248
-13
lines changed

scripts/regression/run_ci.sh

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,14 @@ nlp_build (){
8888
python setup.py bdist_wheel
8989
python -m pip install --ignore-installed dist/p****.whl
9090
}
91+
# Build and install the external fused ops (e.g. fused_ln) shipped with the
# GPT-3 example, then smoke-test that the compiled extension imports.
install_external_ops(){
    # fix: terminate the ANSI color escape with [0m (original left the
    # terminal stuck in red with a bare trailing \033)
    echo -e "\033[31m ---- Install extern_ops \033[0m"
    export PYTHONPATH=${nlp_dir}:$PYTHONPATH
    cd ${nlp_dir}/slm/model_zoo/gpt-3/external_ops
    python setup.py install
    # Verify the extension actually built and is importable.
    python -c "import fused_ln"
    cd ${nlp_dir}
}
9199
####################################
92100
# get diff case
93101
cd ${nlp_dir}
@@ -99,10 +107,12 @@ for file_name in `git diff --numstat ${AGILE_COMPILE_BRANCH} |awk '{print $NF}'`
99107
dir3=${arr_file_name[2]}
100108
dir4=${arr_file_name[3]}
101109
file_item=$dir1/$dir2/$dir3/$dir4
110+
ext="${file_name##*.}"
102111
echo "file_name:"${file_name}, "dir1:"${dir1}, "dir2:"${dir2},"dir3:"${dir3},".xx:" ${file_name##*.}
112+
echo "ext: ${file_name##*.}"
103113
if [ ! -f ${file_name} ];then # 针对pr删掉文件
104114
continue
105-
elif [[ ${file_name##*.} == "md" ]] || [[ ${file_name##*.} == "rst" ]] || [[ ${dir1} == "docs" ]];then
115+
elif [[ "$ext" == "md" || "$ext" == "rst" || "$file_name" == docs/* ]]; then
106116
continue
107117
elif [[ "${AGILE_COMPILE_BRANCH}" == "refactor-training-loop" ]];then # 针对特定分支
108118
P0case_list[${#P0case_list[*]}]=gpt
@@ -196,6 +206,8 @@ if [[ ${#P0case_list[*]} -ne 0 ]];then
196206
else
197207
echo "install_paddlenlp_ops_pr done"
198208
fi
209+
# install fused_ln
210+
install_external_ops
199211
python -c "from paddlenlp import __version__; print('paddlenlp version:', __version__)" >> ${log_path}/commit_info.txt
200212
python -c "import paddlenlp; print('paddlenlp commit:',paddlenlp.version.commit)" >> ${log_path}/commit_info.txt
201213
python -m pip list >> ${log_path}/commit_info.txt

scripts/unit_test/ci_unit.sh

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -50,16 +50,14 @@ set_env() {
5050
export FLAGS_cudnn_deterministic=1
5151
export HF_ENDPOINT=https://hf-mirror.com
5252
export FLAGS_use_cuda_managed_memory=true
53-
export running_time=30m
53+
export running_time=40m
5454

5555
# for CE
5656
if [[ ${FLAGS_enable_CE} == "true" ]];then
5757
export CE_TEST_ENV=1
5858
export RUN_SLOW_TEST=1
5959
export PYTHONPATH=${nlp_dir}:${nlp_dir}/llm:${PYTHONPATH}
6060
export running_time=5h
61-
else
62-
continue
6361
fi
6462
}
6563

@@ -74,7 +72,7 @@ print_info() {
7472
cd ${PPNLP_HOME} && python upload.py ${PPNLP_HOME}/upload 'paddlenlp/PaddleNLP_CI/PaddleNLP-CI-Unittest-GPU'
7573
rm -rf upload/* && cd -
7674
if [ $1 -eq 124 ]; then
77-
echo "\033[32m [failed-timeout] Test case execution was terminated after exceeding the 30m limit."
75+
echo "\033[32m [failed-timeout] Test case execution was terminated after exceeding the ${running_time} min limit."
7876
fi
7977
else
8078
tail -n 1 ${log_path}/unittest.log
@@ -85,16 +83,12 @@ print_info() {
8583
get_diff_TO_case(){
8684
export FLAGS_enable_CI=false
8785
for file_name in `git diff --numstat ${AGILE_COMPILE_BRANCH} |awk '{print $NF}'`;do
88-
arr_file_name=(${file_name//// })
89-
dir1=${arr_file_name[0]}
90-
dir2=${arr_file_name[1]}
91-
dir3=${arr_file_name[2]}
92-
dir4=${arr_file_name[3]}
93-
file_item=$dir1/$dir2/$dir3/$dir4
94-
echo "file_name:"${file_name}, "dir1:"${dir1}, "dir2:"${dir2},"dir3:"${dir3},".xx:" ${file_name##*.}
86+
ext="${file_name##*.}"
87+
echo "file_name: ${file_name}, ext: ${file_name##*.}"
88+
9589
if [ ! -f ${file_name} ];then # 针对pr删掉文件
9690
continue
97-
elif [[ ${file_name##*.} == "md" ]] || [[ ${file_name##*.} == "rst" ]] || [[ ${dir1} == "docs" ]];then
91+
elif [[ "$ext" == "md" || "$ext" == "rst" || "$file_name" == docs/* ]]; then
9892
continue
9993
else
10094
FLAGS_enable_CI=true

tests/llm/test_grpo.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import os
18+
import signal
19+
import subprocess
20+
import sys
21+
import time
22+
import unittest
23+
24+
from parameterized import parameterized_class
25+
26+
from .testing_utils import LLMTest
27+
28+
29+
@parameterized_class(
    ["model_dir"],
    [["qwen"]],
)
class GRPOTest(LLMTest, unittest.TestCase):
    """CI smoke test: run 3 steps of GRPO training on Qwen2-1.5B.

    Starts a local reward server, launches ``run_rl.py`` through
    ``paddle.distributed.launch`` and asserts the run finishes cleanly.
    Heavy on I/O (downloads data, spawns subprocesses); intended for the
    GPU CI machines only.
    """

    # config_path is unused here but kept for LLMTest compatibility;
    # model_dir is injected by @parameterized_class.
    config_path: str = None
    model_dir: str = None

    def setUp(self) -> None:
        LLMTest.setUp(self)
        # Make run_rl.py and the model-specific helpers importable.
        sys.path.insert(0, "./llm/alignment/rl")
        sys.path.insert(0, self.model_dir)

    def tearDown(self) -> None:
        LLMTest.tearDown(self)

    def test_grpo(self):
        # Environment the RL run needs. fix: the original built this dict
        # but never passed it to any subprocess — now forwarded via env=.
        env_vars = {
            "PYTHONPATH": f"{os.path.abspath('./')}:{os.path.abspath('./llm')}:" + os.environ.get("PYTHONPATH", ""),
            "FLAGS_set_to_1d": "False",
            "NVIDIA_TF32_OVERRIDE": "0",
            "FLAGS_dataloader_use_file_descriptor": "False",
            "HF_DATASETS_DOWNLOAD_TIMEOUT": "1",
            "FLAGS_gemm_use_half_precision_compute_type": "False",
            "FLAGS_force_cublaslt_no_reduced_precision_reduction": "True",
            "FLAGS_mla_use_tensorcore": "0",
            "FLAGS_cascade_attention_max_partition_size": "2048",
        }
        case_env = os.environ.copy()
        case_env.update(env_vars)

        # Run from the RL example directory; cwd is restored in finally.
        repo_path = os.getcwd()
        rl_dir = os.path.join(repo_path, "llm/alignment/rl")
        os.chdir(rl_dir)

        reward_proc = None  # fix: pre-bind so finally never hits NameError
        try:
            # Download and unpack the dataset once per machine.
            if not os.path.exists("ppo-kk.tgz"):
                subprocess.run(
                    "wget -q https://paddlenlp.bj.bcebos.com/datasets/examples/ppo-kk.tgz && tar zxf ppo-kk.tgz",
                    shell=True,
                    check=True,
                )

            # Start the reward server in its own process group (os.setsid)
            # so the whole tree can be killed on teardown.
            reward_dir = os.path.join(os.getcwd(), "reward")
            reward_log = os.path.join(reward_dir, "reward_server.log")
            reward_server_script = os.path.join(reward_dir, "reward_server.py")
            with open(reward_log, "w") as log_file:
                reward_proc = subprocess.Popen(
                    [sys.executable, reward_server_script],
                    cwd=reward_dir,
                    stdout=log_file,
                    stderr=subprocess.STDOUT,
                    preexec_fn=os.setsid,
                )

            # Give the reward server time to come up.
            time.sleep(30)

            cmd = (
                "python -u -m paddle.distributed.launch "
                '--devices "$CUDA_VISIBLE_DEVICES" run_rl.py '
                "../../config/qwen/reinforce_plus_plus_argument.yaml "
                '--actor_model_name_or_path "Qwen/Qwen2-1.5B" '
                "--max_dec_len 128 "
                "--max_steps 3 "
                "--kl_coeff 0.000 "
                "--kl_loss_coeff 0.000 "
                "--use_fused_rms_norm true"
            )
            pro = subprocess.Popen(cmd, shell=True, env=case_env, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out, err = pro.communicate()  # also reaps the process; wait() was redundant
            print(out)
            # fix: original `pro.returncode == 0` was a bare expression with
            # no effect; assert so a failed run actually fails the test.
            assert pro.returncode == 0, f"run_rl.py exited with code {pro.returncode}"
            assert b"Error" not in out
            assert b"Error" not in err
        finally:
            # fix: restore cwd even on failure (was inside try in original),
            # then shut down the reward server's whole process group.
            os.chdir(repo_path)
            if reward_proc is not None and reward_proc.poll() is None:
                os.killpg(os.getpgid(reward_proc.pid), signal.SIGTERM)

tests/llm/test_reinforce_plus_plus.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import os
18+
import signal
19+
import subprocess
20+
import sys
21+
import time
22+
import unittest
23+
24+
from parameterized import parameterized_class
25+
26+
from .testing_utils import LLMTest
27+
28+
29+
@parameterized_class(
    ["model_dir"],
    [["qwen"]],
)
class ReinforcePlusPlusTest(LLMTest, unittest.TestCase):
    """CI smoke test: run 3 steps of REINFORCE++ training on Qwen2-1.5B.

    Starts a local reward server, launches ``run_rl.py`` with
    ``--rl_algorithm reinforce_plus_plus`` through
    ``paddle.distributed.launch`` and asserts the run finishes cleanly.
    Heavy on I/O (downloads data, spawns subprocesses); intended for the
    GPU CI machines only.
    """

    # config_path is unused here but kept for LLMTest compatibility;
    # model_dir is injected by @parameterized_class.
    config_path: str = None
    model_dir: str = None

    def setUp(self) -> None:
        LLMTest.setUp(self)
        # Make run_rl.py and the model-specific helpers importable.
        sys.path.insert(0, "./llm/alignment/rl")
        sys.path.insert(0, self.model_dir)

    def tearDown(self) -> None:
        LLMTest.tearDown(self)

    def test_reinforce_plus_plus(self):
        # Environment the RL run needs. fix: the original built this dict
        # but never passed it to any subprocess — now forwarded via env=.
        env_vars = {
            "PYTHONPATH": f"{os.path.abspath('./')}:{os.path.abspath('./llm')}:" + os.environ.get("PYTHONPATH", ""),
            "FLAGS_set_to_1d": "False",
            "NVIDIA_TF32_OVERRIDE": "0",
            "FLAGS_dataloader_use_file_descriptor": "False",
            "HF_DATASETS_DOWNLOAD_TIMEOUT": "1",
            "FLAGS_gemm_use_half_precision_compute_type": "False",
            "FLAGS_force_cublaslt_no_reduced_precision_reduction": "True",
            "FLAGS_mla_use_tensorcore": "0",
            "FLAGS_cascade_attention_max_partition_size": "2048",
        }
        case_env = os.environ.copy()
        case_env.update(env_vars)

        # Run from the RL example directory; cwd is restored in finally.
        repo_path = os.getcwd()
        rl_dir = os.path.join(repo_path, "llm/alignment/rl")
        os.chdir(rl_dir)

        reward_proc = None  # fix: pre-bind so finally never hits NameError
        try:
            # Download and unpack the dataset once per machine.
            if not os.path.exists("ppo-kk.tgz"):
                subprocess.run(
                    "wget -q https://paddlenlp.bj.bcebos.com/datasets/examples/ppo-kk.tgz && tar zxf ppo-kk.tgz",
                    shell=True,
                    check=True,
                )

            # Start the reward server in its own process group (os.setsid)
            # so the whole tree can be killed on teardown.
            reward_dir = os.path.join(os.getcwd(), "reward")
            reward_log = os.path.join(reward_dir, "reward_server.log")
            reward_server_script = os.path.join(reward_dir, "reward_server.py")
            with open(reward_log, "w") as log_file:
                reward_proc = subprocess.Popen(
                    [sys.executable, reward_server_script],
                    cwd=reward_dir,
                    stdout=log_file,
                    stderr=subprocess.STDOUT,
                    preexec_fn=os.setsid,
                )

            # Give the reward server time to come up.
            time.sleep(30)

            cmd = (
                "python -u -m paddle.distributed.launch "
                '--devices "$CUDA_VISIBLE_DEVICES" run_rl.py '
                "../../config/qwen/reinforce_plus_plus_argument.yaml "
                '--rl_algorithm "reinforce_plus_plus" '
                '--actor_model_name_or_path "Qwen/Qwen2-1.5B" '
                "--max_dec_len 128 "
                "--max_steps 3 "
                "--kl_coeff 0.000 "
                "--kl_loss_coeff 0.000 "
                "--use_fused_rms_norm true"
            )
            pro = subprocess.Popen(cmd, shell=True, env=case_env, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out, err = pro.communicate()  # also reaps the process; wait() was redundant
            print(out)
            # fix: original `pro.returncode == 0` was a bare expression with
            # no effect; assert so a failed run actually fails the test.
            assert pro.returncode == 0, f"run_rl.py exited with code {pro.returncode}"
            assert b"Error" not in out
            assert b"Error" not in err
        finally:
            # fix: restore cwd even on failure (was inside try in original),
            # then shut down the reward server's whole process group.
            os.chdir(repo_path)
            if reward_proc is not None and reward_proc.poll() is None:
                os.killpg(os.getpgid(reward_proc.pid), signal.SIGTERM)

0 commit comments

Comments
 (0)