From 6b8d132dbf0291742977b1f042c710660baee9ba Mon Sep 17 00:00:00 2001
From: Jucheng Hu <jucheng_hu_20@ucl.ac.uk>
Date: Wed, 11 Mar 2026 00:04:56 +0000
Subject: [PATCH 1/2] add LLaVA_HF wrapper loaded via transformers

---
 vlmeval/vlm/llava/llava_hf.py | 178 ++++++++++++++++++++++++++++++++++
 1 file changed, 178 insertions(+)
 create mode 100644 vlmeval/vlm/llava/llava_hf.py

diff --git a/vlmeval/vlm/llava/llava_hf.py b/vlmeval/vlm/llava/llava_hf.py
new file mode 100644
index 000000000..fd3664d64
--- /dev/null
+++ b/vlmeval/vlm/llava/llava_hf.py
@@ -0,0 +1,178 @@
+import torch
+from PIL import Image
+from abc import abstractproperty
+import sys
+import os.path as osp
+from ..base import BaseModel
+from ...smp import *
+from ...dataset import DATASET_TYPE, DATASET_MODALITY
+import copy
+import requests
+from transformers import AutoProcessor, LlavaForConditionalGeneration
+import logging
+
+class LLaVA_HF(BaseModel):
+    """LLaVA wrapper built on the Hugging Face `transformers` LLaVA classes."""
+
+    INSTALL_REQ = False
+    INTERLEAVE = True
+
+    def __init__(self, model_path="llava-hf/llava-1.5-7b-hf", **kwargs):
+        """Load the model and processor and assemble the generation kwargs.
+
+        Args:
+            model_path: Checkpoint name or path given to `from_pretrained`.
+            **kwargs: Overrides merged on top of the default generation kwargs.
+
+        Raises:
+            Exception: Re-raised unchanged when loading fails, after logging.
+        """
+        self.model_path = model_path
+        try:
+            self.model = LlavaForConditionalGeneration.from_pretrained(
+                model_path,
+                torch_dtype=torch.float16,
+                low_cpu_mem_usage=True,
+                device_map="cuda",
+            )
+            self.processor = AutoProcessor.from_pretrained(model_path)
+        except Exception:
+            logging.critical(f"Failed to load Hugging Face LLaVA model from {model_path}.")
+            raise
+
+        kwargs_default = dict(
+            do_sample=False,
+            temperature=0,
+            max_new_tokens=2048,
+            top_p=None,
+            num_beams=1,
+            use_cache=True,
+        )
+        kwargs_default.update(kwargs)
+
+        # Hugging Face's generation config doesn't accept temperature=0 with
+        # do_sample=False, so the sampling-only knobs are dropped in that case.
+        if not kwargs_default["do_sample"] and kwargs_default["temperature"] == 0:
+            kwargs_default.pop("temperature", None)
+            kwargs_default.pop("top_p", None)
+
+        self.kwargs = kwargs_default
+        warnings.warn(
+            f"Following kwargs received: {self.kwargs}, will use as generation config. "
+        )
+
+    def use_custom_prompt(self, dataset):
+        """Return True when `dataset` is a multiple-choice ("MCQ") dataset."""
+        assert dataset is not None
+        return DATASET_TYPE(dataset) == "MCQ"
+
+    def build_prompt(self, line, dataset=None):
+        """Build the image + text message for one multiple-choice record.
+
+        Args:
+            line: Record with a `question` field and optional `hint` and
+                single-uppercase-letter option fields.
+            dataset: Dataset name; must satisfy `use_custom_prompt`.
+
+        Returns:
+            A list of `dict(type=..., value=...)` items: one image item per
+            entry returned by `dump_image`, then one text item with the prompt.
+        """
+        assert self.use_custom_prompt(dataset)
+        assert dataset is None or isinstance(dataset, str)
+        tgt_path = self.dump_image(line, dataset)
+
+        question = line["question"]
+        if "hint" in line and not pd.isna(line["hint"]):
+            question = line["hint"] + "\n" + question
+
+        options = {
+            cand: line[cand]
+            for cand in string.ascii_uppercase
+            if cand in line and not pd.isna(line[cand])
+        }
+        for key, item in options.items():
+            question += f"\n{key}. {item}"
+        prompt = question
+
+        if options:
+            prompt += (
+                "\n请直接回答选项字母。"
+                if cn_string(prompt)
+                else "\nAnswer with the option's letter from the given choices directly."
+            )
+        else:
+            prompt += "\n请直接回答问题。" if cn_string(prompt) else "\nAnswer the question directly."
+
+        message = [dict(type="image", value=s) for s in tgt_path]
+        message.append(dict(type="text", value=prompt))
+        return message
+
+    @staticmethod
+    def _to_hf_content(items):
+        """Convert framework items into chat-template content and PIL images.
+
+        Items whose type is neither "text" nor "image" are skipped.
+        """
+        content, images = [], []
+        for item in items:
+            if item["type"] == "text":
+                content.append({"type": "text", "text": item["value"]})
+            elif item["type"] == "image":
+                content.append({"type": "image"})
+                images.append(Image.open(item["value"]).convert("RGB"))
+        return content, images
+
+    def _generate(self, conversation, images):
+        """Run generation on a conversation and return only the new text.
+
+        Args:
+            conversation: Chat-template messages (`role` / `content` dicts).
+            images: PIL images, in the order their placeholders were added.
+
+        Returns:
+            The decoded output with the prompt tokens removed, stripped.
+        """
+        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
+        inputs = self.processor(
+            images=images if images else None,
+            text=prompt,
+            return_tensors="pt",
+        ).to(self.model.device, torch.float16)
+        with torch.inference_mode():
+            output_ids = self.model.generate(**inputs, **self.kwargs)
+
+        # Slice the output to remove the input prompt tokens.
+        input_len = inputs["input_ids"].shape[1]
+        generated_ids = output_ids[0][input_len:]
+        return self.processor.decode(generated_ids, skip_special_tokens=True).strip()
+
+    def chat_inner(self, message, dataset=None):
+        """Answer a multi-turn conversation.
+
+        Args:
+            message: List of turns, each a dict with `role` and `content`.
+            dataset: Not used by this method.
+
+        Returns:
+            The generated reply as a string.
+        """
+        conversation, images = [], []
+        for utter in message:
+            content, turn_images = self._to_hf_content(utter["content"])
+            images.extend(turn_images)
+            conversation.append({"role": utter["role"], "content": content})
+        return self._generate(conversation, images)
+
+    def generate_inner(self, message, dataset=None):
+        """Answer a single-turn message.
+
+        Args:
+            message: List of `type` / `value` items forming one user turn.
+            dataset: Not used by this method.
+
+        Returns:
+            The generated reply as a string.
+        """
+        content, images = self._to_hf_content(message)
+        return self._generate([{"role": "user", "content": content}], images)

From 666414fef8a7b5cfa3498da416848f54999dec33 Mon Sep 17 00:00:00 2001
From: Jucheng Hu <jucheng_hu_20@ucl.ac.uk>
Date: Wed, 11 Mar 2026 00:06:33 +0000
Subject: [PATCH 2/2] [Model] add support for LLaVA-hf

---
 vlmeval/config.py             | 2 ++
 vlmeval/vlm/__init__.py       | 1 +
 vlmeval/vlm/llava/__init__.py | 3 ++-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/vlmeval/config.py b/vlmeval/config.py
index 675ad6340..da8f6d0ce 100644
--- a/vlmeval/config.py
+++ b/vlmeval/config.py
@@ -844,6 +844,8 @@
 llava_series = {
     "llava_v1.5_7b": partial(LLaVA, model_path="liuhaotian/llava-v1.5-7b"),
     "llava_v1.5_13b": partial(LLaVA, model_path="liuhaotian/llava-v1.5-13b"),
+    "llava_v1.5_7b_hf": partial(LLaVA_HF, model_path="llava-hf/llava-1.5-7b-hf"),
+    "llava_v1.5_13b_hf": partial(LLaVA_HF, model_path="llava-hf/llava-1.5-13b-hf"),
     "llava_v1_7b": partial(LLaVA, model_path=LLAVA_V1_7B_MODEL_PTH),
     "sharegpt4v_7b": partial(LLaVA, model_path="Lin-Chen/ShareGPT4V-7B"),
     "sharegpt4v_13b": partial(LLaVA, model_path="Lin-Chen/ShareGPT4V-13B"),
diff --git a/vlmeval/vlm/__init__.py b/vlmeval/vlm/__init__.py
index 5d99d23a4..73c0406d9 100644
--- a/vlmeval/vlm/__init__.py
+++ b/vlmeval/vlm/__init__.py
@@ -15,6 +15,7 @@
 from .kosmos import Kosmos2
 from .llava import (
     LLaVA,
+    LLaVA_HF,
     LLaVA_Next,
     LLaVA_XTuner,
     LLaVA_Next2,
diff --git a/vlmeval/vlm/llava/__init__.py b/vlmeval/vlm/llava/__init__.py
index 9ad9a644a..8239ed398 100644
--- a/vlmeval/vlm/llava/__init__.py
+++ b/vlmeval/vlm/llava/__init__.py
@@ -1,4 +1,5 @@
 from .llava import LLaVA, LLaVA_Next, LLaVA_Next2, LLaVA_OneVision, LLaVA_OneVision_HF
 from .llava_xtuner import LLaVA_XTuner
+from .llava_hf import LLaVA_HF
 
-__all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_Next2', 'LLaVA_OneVision', 'LLaVA_OneVision_HF']
+__all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_HF', 'LLaVA_Next2', 'LLaVA_OneVision', 'LLaVA_OneVision_HF']