
Commit 567500a

[Fix] Fix support Prithvi online inference using tensor

Signed-off-by: Michele Gazzetti <michele.gazzetti1@ibm.com>

1 parent: f6d7aad

File tree

4 files changed: 28 additions, 3 deletions


vllm/entrypoints/chat_utils.py

Lines changed: 16 additions & 0 deletions

@@ -140,6 +140,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
     ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
     CustomChatCompletionContentSimpleImageParam,
     ChatCompletionContentPartImageEmbedsParam,
+    ChatCompletionContentPartTensorsParam,
     CustomChatCompletionContentSimpleAudioParam,
     CustomChatCompletionContentSimpleVideoParam, str]
 
@@ -583,6 +584,8 @@ def _placeholder_str(self, modality: ModalityStr,
                 return self._cached_token_str(self._tokenizer,
                                               hf_config.video_token_index)
             raise TypeError(f"Unknown {modality} model type: {model_type}")
+        elif modality == "tensors":
+            return None
         else:
             raise TypeError(f"Unknown modality: {modality}")
 
@@ -641,6 +644,13 @@ def all_mm_data(self) -> Optional[MultiModalDataDict]:
                 raise ValueError(\
                     "Only one message can have {'type': 'image_embeds'}")
             mm_inputs["image"] = image_embeds_lst[0]
+
+        if "tensors" in items_by_modality:
+            tensors_lst = items_by_modality["tensors"]
+            if len(tensors_lst) > 1:
+                raise ValueError(\
+                    "Only one message can have {'type': 'tensors'}")
+            mm_inputs["tensors"] = tensors_lst[0]
         if "image" in items_by_modality:
             mm_inputs["image"] = items_by_modality["image"]  # A list of images
         if "audio" in items_by_modality:
@@ -674,6 +684,12 @@ async def all_mm_data(self) -> Optional[MultiModalDataDict]:
                 raise ValueError(
                     "Only one message can have {'type': 'image_embeds'}")
             mm_inputs["image"] = image_embeds_lst[0]
+        if "tensors" in items_by_modality:
+            tensors_lst = items_by_modality["tensors"]
+            if len(tensors_lst) > 1:
+                raise ValueError(\
+                    "Only one message can have {'type': 'tensors'}")
+            mm_inputs["tensors"] = tensors_lst[0]
         if "image" in items_by_modality:
             mm_inputs["image"] = items_by_modality["image"]  # A list of images
         if "audio" in items_by_modality:

vllm/entrypoints/openai/protocol.py

Lines changed: 1 addition & 1 deletion

@@ -1109,7 +1109,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
     model: Optional[str] = None
     messages: list[ChatCompletionMessageParam]
 
-    encoding_format: Literal["float", "base64", "tensors"] = "float"
+    encoding_format: Literal["float", "base64", "tensor"] = "float"
     dimensions: Optional[int] = None
     user: Optional[str] = None
     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
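
With the literal fixed to "tensor", a pooling request can select the new encoding. A minimal sketch, assuming vLLM's /pooling route and reusing the message dict from the sketch above; the model name is a placeholder:

    import requests

    body = {
        "model": "my-prithvi-model",  # placeholder model name
        "messages": [message],        # chat message from the previous sketch
        "encoding_format": "tensor",
    }
    response = requests.post("http://localhost:8000/pooling", json=body)
    print(response.json())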

vllm/entrypoints/openai/serving_pooling.py

Lines changed: 2 additions & 2 deletions

@@ -44,9 +44,9 @@ def _get_data(
         pt_float32 = output.data.to(dtype=torch.float32)
         pooling_bytes = np.array(pt_float32, dtype="float32").tobytes()
         return base64.b64encode(pooling_bytes).decode("utf-8")
-    elif encoding_format == "tensors":
+    elif encoding_format == "tensor":
         tensor_encoding_io = ImageEmbeddingMediaIO()
-        tensor_encoding_io.encode_base64(output.data)
+        return tensor_encoding_io.encode_base64(output.data)
 
     assert_never(encoding_format)
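
Before this fix, the tensor branch computed the encoding but fell through to assert_never because the return was missing (and the literal disagreed with the request schema). The returned payload is the raw numpy buffer from encode_base64, so the client must know dtype and shape out of band; a minimal decoding sketch, assuming float32:

    import base64

    import numpy as np

    def decode_raw_tensor(b64_data: str, dtype=np.float32) -> np.ndarray:
        # encode_base64 ships raw array bytes; dtype and shape are out of band
        return np.frombuffer(base64.b64decode(b64_data), dtype=dtype)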

vllm/multimodal/image.py

Lines changed: 9 additions & 0 deletions

@@ -95,3 +95,12 @@ def load_file(self, filepath: Path) -> torch.Tensor:
 
     def encode_base64(self, media: torch.Tensor) -> str:
         return base64.b64encode(media.numpy()).decode('utf-8')
+
+    # currently not used, but it makes it easy for users to reconstruct
+    # the result tensor without knowledge of the array shape
+    def encode_tensor(self, media: torch.Tensor) -> str:
+        buffer_tiff = BytesIO()
+        torch.save(media.data, buffer_tiff)
+        buffer_tiff.seek(0)
+        binary_data = buffer_tiff.read()
+        return base64.b64encode(binary_data).decode('utf-8')
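
As the in-diff comment says, encode_tensor wraps the tensor with torch.save, so the payload carries shape and dtype and a client can rebuild the tensor with torch.load alone. A round-trip sketch (decode_tensor is a hypothetical client helper, not part of this commit):

    import base64
    from io import BytesIO

    import torch

    def decode_tensor(b64_data: str) -> torch.Tensor:
        # torch.save embedded shape and dtype, so torch.load restores both
        return torch.load(BytesIO(base64.b64decode(b64_data)))

    original = torch.randn(2, 3)
    buffer = BytesIO()
    torch.save(original, buffer)
    encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
    assert torch.equal(decode_tensor(encoded), original)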
