@@ -43,32 +43,8 @@ def setUpClass(cls):
             ["trl", "vllm-serve", "--model", cls.model_id], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env
         )
 
-        # Initialize the clients using both initialization methods
-        cls.client = VLLMClient(connection_timeout=120)  # Default host and port
-        cls.client_base_url = VLLMClient(base_url="http://0.0.0.0:8000", connection_timeout=120)  # Using base_url
-
-    def test_initialization_methods(self):
-        """Test that both initialization methods work correctly."""
-        # Test generation with default client (host+port)
-        prompts = ["Test initialization 1"]
-        outputs_default = self.client.generate(prompts)
-        self.assertIsInstance(outputs_default, list)
-        self.assertEqual(len(outputs_default), len(prompts))
-
-        # Test generation with base_url client
-        outputs_base_url = self.client_base_url.generate(prompts)
-        self.assertIsInstance(outputs_base_url, list)
-        self.assertEqual(len(outputs_base_url), len(prompts))
-
-    def test_base_url_attribute(self):
-        """Test that both initialization methods set the base_url attribute correctly."""
-        # Both clients should have the same base_url
-        self.assertEqual(self.client.base_url, "http://0.0.0.0:8000")
-        self.assertEqual(self.client_base_url.base_url, "http://0.0.0.0:8000")
-
-        # Verify the client doesn't store host/port when base_url is provided
-        self.assertTrue(not hasattr(self.client_base_url, 'host') or self.client_base_url.host is None)
-        self.assertTrue(not hasattr(self.client_base_url, 'server_port') or self.client_base_url.server_port is None)
+        # Initialize the client
+        cls.client = VLLMClient(connection_timeout=120)
 
     def test_generate(self):
         prompts = ["Hello, AI!", "Tell me a joke"]
@@ -114,9 +90,84 @@ def test_reset_prefix_cache(self):
     def tearDownClass(cls):
         super().tearDownClass()
 
-        # Close the clients
+        # Close the client
+        cls.client.close_communicator()
+
+        # vLLM x pytest (or Popen) seems not to handle process termination well. To avoid zombie processes, we need to
+        # kill the server process and its children explicitly.
+        parent = psutil.Process(cls.server_process.pid)
+        children = parent.children(recursive=True)
+        for child in children:
+            child.send_signal(signal.SIGTERM)
+        cls.server_process.terminate()
+        cls.server_process.wait()
+
+
+@pytest.mark.slow
+@require_torch_multi_gpu
+class TestVLLMClientServerBaseURL(unittest.TestCase):
+    model_id = "Qwen/Qwen2.5-1.5B"
+
+    @classmethod
+    def setUpClass(cls):
+        # We want the server to run on GPU 1, so we set CUDA_VISIBLE_DEVICES to "1"
+        env = os.environ.copy()
+        env["CUDA_VISIBLE_DEVICES"] = "1"  # Restrict to GPU 1
+
+        # Start the server process
+        cls.server_process = subprocess.Popen(
+            ["trl", "vllm-serve", "--model", cls.model_id], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env
+        )
+
+        # Initialize the client with base_url
+        cls.client = VLLMClient(base_url="http://localhost:8000", connection_timeout=120)
+
+    def test_generate(self):
+        prompts = ["Hello, AI!", "Tell me a joke"]
+        outputs = self.client.generate(prompts)
+
+        # Check that the output is a list
+        self.assertIsInstance(outputs, list)
+
+        # Check that the number of generated sequences is equal to the number of prompts
+        self.assertEqual(len(outputs), len(prompts))
+
+        # Check that the generated sequences are lists of integers
+        for seq in outputs:
+            self.assertTrue(all(isinstance(tok, int) for tok in seq))
+
+    def test_generate_with_params(self):
+        prompts = ["Hello, AI!", "Tell me a joke"]
+        outputs = self.client.generate(prompts, n=2, repetition_penalty=0.9, temperature=0.8, max_tokens=32)
+
+        # Check that the output is a list
+        self.assertIsInstance(outputs, list)
+
+        # Check that the number of generated sequences is 2 times the number of prompts
+        self.assertEqual(len(outputs), 2 * len(prompts))
+
+        # Check that the generated sequences are lists of integers
+        for seq in outputs:
+            self.assertTrue(all(isinstance(tok, int) for tok in seq))
+
+        # Check that the length of the generated sequences is less than or equal to 32
+        for seq in outputs:
+            self.assertLessEqual(len(seq), 32)
+
+    def test_update_model_params(self):
+        model = AutoModelForCausalLM.from_pretrained(self.model_id, device_map="cuda")
+        self.client.update_model_params(model)
+
+    def test_reset_prefix_cache(self):
+        # Test resetting the prefix cache
+        self.client.reset_prefix_cache()
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+
+        # Close the client
         cls.client.close_communicator()
-        cls.client_base_url.close_communicator()
 
         # vLLM x pytest (or Popen) seems not to handle process termination well. To avoid zombie processes, we need to
         # kill the server process and its children explicitly.
@@ -147,32 +198,69 @@ def setUpClass(cls):
             env=env,
         )
 
-        # Initialize the clients using both initialization methods
-        cls.client = VLLMClient(connection_timeout=120)  # Default host and port
-        cls.client_base_url = VLLMClient(base_url="http://0.0.0.0:8000", connection_timeout=120)  # Using base_url
-
-    def test_initialization_methods(self):
-        """Test that both initialization methods work correctly with tensor parallelism enabled."""
-        # Test generation with default client (host+port)
-        prompts = ["Test TP initialization 1"]
-        outputs_default = self.client.generate(prompts)
-        self.assertIsInstance(outputs_default, list)
-        self.assertEqual(len(outputs_default), len(prompts))
-
-        # Test generation with base_url client
-        outputs_base_url = self.client_base_url.generate(prompts)
-        self.assertIsInstance(outputs_base_url, list)
-        self.assertEqual(len(outputs_base_url), len(prompts))
-
-    def test_base_url_attribute(self):
-        """Test that both initialization methods set the base_url attribute correctly."""
-        # Both clients should have the same base_url
-        self.assertEqual(self.client.base_url, "http://0.0.0.0:8000")
-        self.assertEqual(self.client_base_url.base_url, "http://0.0.0.0:8000")
-
-        # Verify the client doesn't store host/port when base_url is provided
-        self.assertTrue(not hasattr(self.client_base_url, 'host') or self.client_base_url.host is None)
-        self.assertTrue(not hasattr(self.client_base_url, 'server_port') or self.client_base_url.server_port is None)
+        # Initialize the client
+        cls.client = VLLMClient(connection_timeout=120)
+
+    def test_generate(self):
+        prompts = ["Hello, AI!", "Tell me a joke"]
+        outputs = self.client.generate(prompts)
+
+        # Check that the output is a list
+        self.assertIsInstance(outputs, list)
+
+        # Check that the number of generated sequences is equal to the number of prompts
+        self.assertEqual(len(outputs), len(prompts))
+
+        # Check that the generated sequences are lists of integers
+        for seq in outputs:
+            self.assertTrue(all(isinstance(tok, int) for tok in seq))
+
+    def test_update_model_params(self):
+        model = AutoModelForCausalLM.from_pretrained(self.model_id, device_map="cuda")
+        self.client.update_model_params(model)
+
+    def test_reset_prefix_cache(self):
+        # Test resetting the prefix cache
+        self.client.reset_prefix_cache()
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+
+        # Close the client
+        cls.client.close_communicator()
+
+        # vLLM x pytest (or Popen) seems not to handle process termination well. To avoid zombie processes, we need to
+        # kill the server process and its children explicitly.
+        parent = psutil.Process(cls.server_process.pid)
+        children = parent.children(recursive=True)
+        for child in children:
+            child.send_signal(signal.SIGTERM)
+        cls.server_process.terminate()
+        cls.server_process.wait()
+
+
+@pytest.mark.slow
+@require_3_gpus
+class TestVLLMClientServerTPBaseURL(unittest.TestCase):
+    model_id = "Qwen/Qwen2.5-1.5B"
+
+    @classmethod
+    def setUpClass(cls):
+        # We want the server to run on GPU 1 and 2, so we set CUDA_VISIBLE_DEVICES to "1,2"
+        env = os.environ.copy()
+        env["CUDA_VISIBLE_DEVICES"] = "1,2"  # Restrict to GPU 1 and 2
+
+        # Start the server process
+        cls.server_process = subprocess.Popen(
+            ["trl", "vllm-serve", "--model", cls.model_id, "--tensor_parallel_size", "2"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env,
+        )
+
+        # Initialize the client with base_url
+        cls.client = VLLMClient(base_url="http://localhost:8000", connection_timeout=120)
 
     def test_generate(self):
         prompts = ["Hello, AI!", "Tell me a joke"]
@@ -200,9 +288,8 @@ def test_reset_prefix_cache(self):
     def tearDownClass(cls):
         super().tearDownClass()
 
-        # Close the clients
+        # Close the client
         cls.client.close_communicator()
-        cls.client_base_url.close_communicator()
 
         # vLLM x pytest (or Popen) seems not to handle process termination well. To avoid zombie processes, we need to
        # kill the server process and its children explicitly.
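
For reference, a minimal usage sketch of the client API exercised by these tests. It assumes a server has already been started with "trl vllm-serve --model Qwen/Qwen2.5-1.5B" and that VLLMClient is importable from trl.extras.vllm_client (import path assumed, not shown in this diff):

from trl.extras.vllm_client import VLLMClient  # assumed import path

# Either connect to the default host/port, as the pre-existing test classes do...
client = VLLMClient(connection_timeout=120)
# ...or point the client at an explicit base URL, as the new *BaseURL test classes do:
# client = VLLMClient(base_url="http://localhost:8000", connection_timeout=120)

prompts = ["Hello, AI!", "Tell me a joke"]
outputs = client.generate(prompts, n=2, temperature=0.8, max_tokens=32)
# outputs is a list of token-id lists: 2 * len(prompts) sequences, each at most 32 tokens long

client.reset_prefix_cache()    # clear the server-side prefix cache
client.close_communicator()    # release the weight-sync communicator when done

The only difference between the new *BaseURL classes and the existing ones is the constructor call; the rest of the client API (generate, update_model_params, reset_prefix_cache, close_communicator) is exercised identically.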