dhakshin32
diff --git a/‎doc/source/data/working-with-llms.rst
Lines changed: 31 additions & 18 deletions b/‎doc/source/data/working-with-llms.rst
Lines changed: 31 additions & 18 deletions
diff --git a/‎python/ray/data/llm.py
Lines changed: 9 additions & 0 deletions b/‎python/ray/data/llm.py
Lines changed: 9 additions & 0 deletions
diff --git a/‎python/ray/llm/_internal/batch/processor/base.py
Lines changed: 23 additions & 0 deletions b/‎python/ray/llm/_internal/batch/processor/base.py
Lines changed: 23 additions & 0 deletions
diff --git a/‎python/ray/llm/_internal/batch/stages/base.py
Lines changed: 19 additions & 16 deletions b/‎python/ray/llm/_internal/batch/stages/base.py
Lines changed: 19 additions & 16 deletions
diff --git a/‎python/ray/llm/_internal/batch/stages/chat_template_stage.py
Lines changed: 11 additions & 6 deletions b/‎python/ray/llm/_internal/batch/stages/chat_template_stage.py
Lines changed: 11 additions & 6 deletions
diff --git a/‎python/ray/llm/_internal/batch/stages/http_request_stage.py
Lines changed: 10 additions & 5 deletions b/‎python/ray/llm/_internal/batch/stages/http_request_stage.py
Lines changed: 10 additions & 5 deletions
diff --git a/‎python/ray/llm/_internal/batch/stages/prepare_image_stage.py
Lines changed: 10 additions & 7 deletions b/‎python/ray/llm/_internal/batch/stages/prepare_image_stage.py
Lines changed: 10 additions & 7 deletions
@@ -77,32 +77,26 @@ Upon execution, the Processor object instantiates replicas of the vLLM engine (u
 
     {'answer': 'Snowflakes gently fall\nBlanketing the winter scene\nFrozen peaceful hush'}
 
-Some models may require a Hugging Face token to be specified. You can specify the token in the `runtime_env` argument.
+Each processor requires specific input columns. You can get more information by using the following API:
 
 .. testcode::
 
-    config = vLLMEngineProcessorConfig(
-        model_source="unsloth/Llama-3.1-8B-Instruct",
-        runtime_env={"env_vars": {"HF_TOKEN": "your_huggingface_token"}},
-        concurrency=1,
-        batch_size=64,
-    )
+    processor.log_input_column_names()
 
-If your model is hosted on AWS S3, you can specify the S3 path in the `model_source` argument, and specify `load_format="runai_streamer"` in the `engine_kwargs` argument.
+.. testoutput::
+    :options: +MOCK
 
-.. note::
-    Install vLLM with runai dependencies: `pip install -U "vllm[runai]==0.7.2"`
+    The first stage of the processor is ChatTemplateStage.
+    Required input columns:
+            messages: A list of messages in OpenAI chat format. See https://platform.openai.com/docs/api-reference/chat/create for details.
+
+Some models may require a Hugging Face token to be specified. You can specify the token in the `runtime_env` argument.
 
 .. testcode::
 
     config = vLLMEngineProcessorConfig(
-        model_source="s3://your-bucket/your-model/",  # Make sure adding the trailing slash!
-        engine_kwargs={"load_format": "runai_streamer"},
-        runtime_env={"env_vars": {
-            "AWS_ACCESS_KEY_ID": "your_access_key_id",
-            "AWS_SECRET_ACCESS_KEY": "your_secret_access_key",
-            "AWS_REGION": "your_region",
-        }},
+        model_source="unsloth/Llama-3.1-8B-Instruct",
+        runtime_env={"env_vars": {"HF_TOKEN": "your_huggingface_token"}},
         concurrency=1,
         batch_size=64,
     )
@@ -146,7 +140,10 @@ The underlying `Processor` object instantiates replicas of the vLLM engine and a
 configure parallel workers to handle model parallelism (for tensor parallelism and pipeline parallelism,
 if specified).
 
-To optimize model loading, you can configure the `load_format` to `runai_streamer` or `tensorizer`:
+To optimize model loading, you can configure the `load_format` to `runai_streamer` or `tensorizer`.
+
+.. note::
+    To use `runai_streamer`, install vLLM with runai dependencies: `pip install -U "vllm[runai]==0.7.2"`
 
 .. testcode::
 
@@ -157,6 +154,22 @@ To optimize model loading, you can configure the `load_format` to `runai_streame
         batch_size=64,
     )
 
+If your model is hosted on AWS S3, you can specify the S3 path in the `model_source` argument, and specify `load_format="runai_streamer"` in the `engine_kwargs` argument.
+
+.. testcode::
+
+    config = vLLMEngineProcessorConfig(
+        model_source="s3://your-bucket/your-model/",  # Make sure to add the trailing slash!
+        engine_kwargs={"load_format": "runai_streamer"},
+        runtime_env={"env_vars": {
+            "AWS_ACCESS_KEY_ID": "your_access_key_id",
+            "AWS_SECRET_ACCESS_KEY": "your_secret_access_key",
+            "AWS_REGION": "your_region",
+        }},
+        concurrency=1,
+        batch_size=64,
+    )
+
 To do multi-LoRA batch inference, you need to set LoRA related parameters in `engine_kwargs`. See :doc:`the vLLM with LoRA example</llm/examples/batch/vllm-with-lora>` for details.
 
 .. testcode::
 
@@ -147,6 +147,15 @@ class vLLMEngineProcessorConfig(_vLLMEngineProcessorConfig):
                 ),
             )
 
+            # The processor requires specific input columns, which depend on
+            # your processor config. You can use the following API to check
+            # the required input columns:
+            processor.log_input_column_names()
+            # Example log:
+            # The first stage of the processor is ChatTemplateStage.
+            # Required input columns:
+            #     messages: A list of messages in OpenAI chat format.
+
             ds = ray.data.range(300)
             ds = processor(ds)
             for row in ds.take_all():
 
@@ -1,3 +1,4 @@
+import logging
 from collections import OrderedDict
 from typing import Optional, List, Type, Callable, Dict
 
@@ -15,6 +16,9 @@
 from ray.llm._internal.common.base_pydantic import BaseModelExtended
 
 
+logger = logging.getLogger(__name__)
+
+
 class ProcessorConfig(BaseModelExtended):
     """The processor configuration."""
 
@@ -158,6 +162,25 @@ def get_stage_by_name(self, name: str) -> StatefulStage:
             return self.stages[name]
         raise ValueError(f"Stage {name} not found")
 
+    def log_input_column_names(self):
+        """Log (at INFO level) the first stage and its input column names.
+
+        The first stage is the one that consumes the input dataset. If the
+        dataset does not contain the required columns, you have to provide a
+        preprocess function to bridge the gap.
+
+        If the processor has no stages, a short notice is logged instead of
+        raising ``IndexError``.
+        """
+        # Take the first (name, stage) pair without materializing every item;
+        # the ``None`` default covers a processor that has no stages.
+        first = next(iter(self.stages.items()), None)
+        if first is None:
+            logger.info("The processor has no stages; no input columns to report.")
+            return
+
+        name, stage = first
+        # Each getter returns a mapping of column name -> description.
+        sections = (
+            ("Required", stage.get_required_input_keys()),
+            ("Optional", stage.get_optional_input_keys()),
+        )
+
+        message = f"The first stage of the processor is {name}."
+        for label, keys in sections:
+            # Skip a section entirely when the stage declares no such keys.
+            if keys:
+                message += f"\n{label} input columns:\n"
+                message += "\n".join(f"\t{k}: {v}" for k, v in keys.items())
+
+        logger.info(message)
+
 
 @DeveloperAPI
 class ProcessorBuilder:
 
@@ -1,6 +1,6 @@
 """The base class for all stages."""
 import logging
-from typing import Any, Dict, AsyncIterator, List, Callable, Type
+from typing import Any, Dict, AsyncIterator, List, Callable, Type, Optional
 
 import pyarrow
 from pydantic import BaseModel, Field
@@ -71,14 +71,18 @@ class StatefulStageUDF:
         __call__ method will take the data column as the input of the udf
         method, and encapsulate the output of the udf method into the data
         column for the next stage.
+        expected_input_keys: The expected input keys of the stage.
     """
 
     # The internal column name for the index of the row in the batch.
     # This is used to align the output of the UDF with the input batch.
     IDX_IN_BATCH_COLUMN: str = "__idx_in_batch"
 
-    def __init__(self, data_column: str):
+    def __init__(
+        self, data_column: str, expected_input_keys: Optional[List[str]] = None
+    ):
         self.data_column = data_column
+        self.expected_input_keys = set(expected_input_keys or [])
 
     async def __call__(self, batch: Dict[str, Any]) -> AsyncIterator[Dict[str, Any]]:
         """A stage UDF wrapper that processes the input and output columns
@@ -195,8 +199,6 @@ def validate_inputs(self, inputs: List[Dict[str, Any]]):
         Raises:
             ValueError: If the required keys are not found.
         """
-        expected_input_keys = set(self.expected_input_keys)
-
         for inp in inputs:
             input_keys = set(inp.keys())
 
@@ -206,26 +208,16 @@ def validate_inputs(self, inputs: List[Dict[str, Any]]):
                     "for internal use."
                 )
 
-            if not expected_input_keys:
+            if not self.expected_input_keys:
                 continue
 
-            missing_required = expected_input_keys - input_keys
+            missing_required = self.expected_input_keys - input_keys
             if missing_required:
                 raise ValueError(
                     f"Required input keys {missing_required} not found at the input of "
                     f"{self.__class__.__name__}. Input keys: {input_keys}"
                 )
 
-    @property
-    def expected_input_keys(self) -> List[str]:
-        """A list of required input keys. Missing required keys will raise
-        an exception.
-
-        Returns:
-            A list of required input keys.
-        """
-        return []
-
     async def udf(self, rows: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]]:
         raise NotImplementedError("StageUDF must implement the udf method")
 
@@ -247,6 +239,14 @@ class StatefulStage(BaseModel):
         description="The arguments of .map_batches(). Default {'concurrency': 1}.",
     )
 
+    def get_required_input_keys(self) -> Dict[str, str]:
+        """Return the input keys this stage requires, with descriptions.
+
+        The base stage requires nothing; subclasses override this method to
+        declare their own keys. The key names are forwarded to the stage UDF
+        as ``expected_input_keys`` by ``get_dataset_map_batches_kwargs``.
+
+        Returns:
+            A mapping from required input key to its description.
+        """
+        no_required_keys: Dict[str, str] = {}
+        return no_required_keys
+
+    def get_optional_input_keys(self) -> Dict[str, str]:
+        """Return the input keys this stage accepts but does not require.
+
+        The base stage declares none; subclasses may override this method.
+
+        Returns:
+            A mapping from optional input key to its description.
+        """
+        no_optional_keys: Dict[str, str] = {}
+        return no_optional_keys
+
     def get_dataset_map_batches_kwargs(
         self,
         batch_size: int,
@@ -280,6 +280,9 @@ def get_dataset_map_batches_kwargs(
             )
 
         kwargs["fn_constructor_kwargs"]["data_column"] = data_column
+        kwargs["fn_constructor_kwargs"]["expected_input_keys"] = list(
+            self.get_required_input_keys().keys()
+        )
         return kwargs
 
     class Config:
 
@@ -13,6 +13,7 @@ class ChatTemplateUDF(StatefulStageUDF):
     def __init__(
         self,
         data_column: str,
+        expected_input_keys: List[str],
         model: str,
         chat_template: Optional[str] = None,
     ):
@@ -21,14 +22,15 @@ def __init__(
 
         Args:
             data_column: The data column name.
+            expected_input_keys: The expected input keys of the stage.
             model: The model to use for the chat template.
             chat_template: The chat template in Jinja template format. This is
             usually not needed if the model checkpoint already contains the
             chat template.
         """
         from transformers import AutoProcessor
 
-        super().__init__(data_column)
+        super().__init__(data_column, expected_input_keys)
 
         # NOTE: We always use processor instead of tokenizer in this stage,
         # because tokenizers of VLM models may not have chat template attribute.
@@ -95,15 +97,18 @@ def _should_add_generation_prompt(self, conversation: List[Dict[str, Any]]) -> b
         """
         return conversation[-1]["role"] == "user"
 
-    @property
-    def expected_input_keys(self) -> List[str]:
-        """The expected input keys."""
-        return ["messages"]
-
 
 class ChatTemplateStage(StatefulStage):
     """
     A stage that applies chat template.
     """
 
     fn: Type[StatefulStageUDF] = ChatTemplateUDF
+
+    def get_required_input_keys(self) -> Dict[str, str]:
+        """Describe the input keys the chat template stage requires.
+
+        Returns:
+            A mapping with a single entry, ``messages``, and its description.
+        """
+        messages_description = (
+            "A list of messages in OpenAI chat format. "
+            "See https://platform.openai.com/docs/api-reference/chat/create "
+            "for details."
+        )
+        return {"messages": messages_description}
@@ -13,6 +13,7 @@ class HttpRequestUDF(StatefulStageUDF):
     def __init__(
         self,
         data_column: str,
+        expected_input_keys: List[str],
         url: str,
         additional_header: Optional[Dict[str, Any]] = None,
         qps: Optional[int] = None,
@@ -22,11 +23,12 @@ def __init__(
 
         Args:
             data_column: The data column name.
+            expected_input_keys: The expected input keys of the stage.
             url: The URL to send the HTTP request to.
             additional_header: The additional headers to send with the HTTP request.
             qps: The maximum number of requests per second.
         """
-        super().__init__(data_column)
+        super().__init__(data_column, expected_input_keys)
         self.url = url
         self.additional_header = additional_header or {}
         self.qps = qps
@@ -90,14 +92,17 @@ async def udf(self, batch: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]
                         "http_response": resp_json,
                     }
 
-    @property
-    def expected_input_keys(self) -> List[str]:
-        return ["payload"]
-
 
 class HttpRequestStage(StatefulStage):
     """
     A stage that sends HTTP requests.
     """
 
     fn: Type[StatefulStageUDF] = HttpRequestUDF
+
+    def get_required_input_keys(self) -> Dict[str, str]:
+        """Describe the input keys the HTTP request stage requires.
+
+        Returns:
+            A mapping with a single entry, ``payload``, and its description.
+        """
+        payload_description = (
+            "The payload to send to the HTTP request. "
+            "It should be in JSON format."
+        )
+        return {"payload": payload_description}
@@ -304,8 +304,8 @@ async def process(self, images: List[_ImageType]) -> List["Image.Image"]:
 
 
 class PrepareImageUDF(StatefulStageUDF):
-    def __init__(self, data_column: str):
-        super().__init__(data_column)
+    def __init__(self, data_column: str, expected_input_keys: List[str]):
+        super().__init__(data_column, expected_input_keys)
         self.Image = importlib.import_module("PIL.Image")
         self.image_processor = ImageProcessor()
 
@@ -365,13 +365,16 @@ async def udf(self, batch: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]
                 img_start_idx += num_images_in_req
             yield ret
 
-    @property
-    def expected_input_keys(self) -> List[str]:
-        """The expected input keys."""
-        return ["messages"]
-
 
 class PrepareImageStage(StatefulStage):
     """A stage to prepare images from OpenAI chat template messages."""
 
     fn: StatefulStageUDF = PrepareImageUDF
+
+    def get_required_input_keys(self) -> Dict[str, str]:
+        """Describe the input keys the image preparation stage requires.
+
+        Returns:
+            A mapping with a single entry, ``messages``, and its description.
+        """
+        messages_description = (
+            "A list of messages in OpenAI chat format. "
+            "See https://platform.openai.com/docs/api-reference/chat/create "
+            "for details."
+        )
+        return {"messages": messages_description}