From 6b8d132dbf0291742977b1f042c710660baee9ba Mon Sep 17 00:00:00 2001
From: Jucheng Hu
Date: Wed, 11 Mar 2026 00:04:56 +0000
Subject: [PATCH 1/2] add transformers loaded llava_hf

---
Review notes (ignored by git am):
- chat_inner and generate_inner shared an identical template/processor/
  generate/decode pipeline; it now lives in the private helpers
  _to_hf_content and _generate. Public method signatures are unchanged.
- Image files are opened in a with-block so the handle is closed as soon as
  the RGB copy has been made.
- Dropped the redundant function-local "import torch"; bare "raise" keeps the
  original traceback.
- The "index" blob ids below are carried over from the original submission;
  regenerate the series with git format-patch before 3-way applying.

 vlmeval/vlm/llava/llava_hf.py | 175 ++++++++++++++++++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 vlmeval/vlm/llava/llava_hf.py

diff --git a/vlmeval/vlm/llava/llava_hf.py b/vlmeval/vlm/llava/llava_hf.py
new file mode 100644
index 000000000..fd3664d64
--- /dev/null
+++ b/vlmeval/vlm/llava/llava_hf.py
@@ -0,0 +1,175 @@
+import torch
+from PIL import Image
+from abc import abstractproperty
+import sys
+import os.path as osp
+from ..base import BaseModel
+from ...smp import *
+from ...dataset import DATASET_TYPE, DATASET_MODALITY
+import copy
+import requests
+from transformers import AutoProcessor, LlavaForConditionalGeneration
+import logging
+
+
+class LLaVA_HF(BaseModel):
+    """LLaVA wrapper backed by the Hugging Face ``transformers`` implementation.
+
+    Loads ``LlavaForConditionalGeneration`` and its ``AutoProcessor`` from
+    ``model_path`` and answers single-turn (``generate_inner``) and multi-turn
+    (``chat_inner``) requests through the processor's chat template.
+    """
+
+    INSTALL_REQ = False
+    INTERLEAVE = True
+
+    def __init__(self, model_path="llava-hf/llava-1.5-7b-hf", **kwargs):
+        """Load the model and processor and build the generation config.
+
+        Args:
+            model_path: Hub id or local path of an HF-format LLaVA checkpoint.
+            **kwargs: Overrides for the default ``generate`` keyword arguments.
+        """
+        self.model_path = model_path
+
+        try:
+            self.model = LlavaForConditionalGeneration.from_pretrained(
+                model_path,
+                torch_dtype=torch.float16,
+                low_cpu_mem_usage=True,
+                device_map="cuda"
+            )
+            self.processor = AutoProcessor.from_pretrained(model_path)
+        except Exception:
+            logging.critical(f"Failed to load Hugging Face LLaVA model from {model_path}.")
+            raise
+
+        kwargs_default = dict(
+            do_sample=False,
+            temperature=0,
+            max_new_tokens=2048,
+            top_p=None,
+            num_beams=1,
+            use_cache=True,
+        )
+        kwargs_default.update(kwargs)
+
+        # Hugging Face's generation config doesn't accept temperature=0 with do_sample=False
+        if not kwargs_default["do_sample"] and kwargs_default["temperature"] == 0:
+            kwargs_default.pop("temperature", None)
+            kwargs_default.pop("top_p", None)
+
+        self.kwargs = kwargs_default
+        warnings.warn(
+            f"Following kwargs received: {self.kwargs}, will use as generation config. "
+        )
+
+    def use_custom_prompt(self, dataset):
+        """Return True when ``DATASET_TYPE(dataset)`` is ``"MCQ"``."""
+        assert dataset is not None
+        return DATASET_TYPE(dataset) == "MCQ"
+
+    def build_prompt(self, line, dataset=None):
+        """Build the image + text message list for one MCQ record.
+
+        The text is the optional hint, the question, the lettered options and a
+        closing instruction (Chinese when ``cn_string(prompt)`` is true).
+        """
+        assert self.use_custom_prompt(dataset)
+        assert dataset is None or isinstance(dataset, str)
+        tgt_path = self.dump_image(line, dataset)
+
+        question = line["question"]
+        hint = line["hint"] if ("hint" in line and not pd.isna(line["hint"])) else None
+        if hint is not None:
+            question = hint + "\n" + question
+
+        options = {
+            cand: line[cand]
+            for cand in string.ascii_uppercase
+            if cand in line and not pd.isna(line[cand])
+        }
+        for key, item in options.items():
+            question += f"\n{key}. {item}"
+        prompt = question
+
+        if options:
+            prompt += (
+                "\n请直接回答选项字母。"
+                if cn_string(prompt)
+                else "\nAnswer with the option's letter from the given choices directly."
+            )
+        else:
+            prompt += (
+                "\n请直接回答问题。"
+                if cn_string(prompt)
+                else "\nAnswer the question directly."
+            )
+
+        message = [dict(type="image", value=s) for s in tgt_path]
+        message.append(dict(type="text", value=prompt))
+        return message
+
+    @staticmethod
+    def _to_hf_content(items, images):
+        """Convert framework items to chat-template content, collecting images.
+
+        Decoded RGB images are appended to ``images`` in prompt order.
+        """
+        content_list = []
+        for item in items:
+            if item["type"] == "text":
+                content_list.append({"type": "text", "text": item["value"]})
+            elif item["type"] == "image":
+                content_list.append({"type": "image"})
+                # convert() returns a loaded copy, so the file can be closed.
+                with Image.open(item["value"]) as img:
+                    images.append(img.convert("RGB"))
+        return content_list
+
+    def _generate(self, conversation, images):
+        """Run the chat template, processor and ``generate``; return new text."""
+        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
+
+        inputs = self.processor(
+            images=images if images else None,
+            text=prompt,
+            return_tensors="pt"
+        ).to(self.model.device, torch.float16)
+
+        with torch.inference_mode():
+            output_ids = self.model.generate(
+                **inputs,
+                **self.kwargs
+            )
+
+        # Slice the output to remove the input prompt tokens
+        input_len = inputs["input_ids"].shape[1]
+        generated_ids = output_ids[0][input_len:]
+
+        return self.processor.decode(generated_ids, skip_special_tokens=True).strip()
+
+    def chat_inner(self, message, dataset=None):
+        """Answer a multi-turn conversation of ``role``/``content`` utterances."""
+        images = []
+        # Convert framework messages to HF Chat Template format
+        conversation = [
+            {
+                "role": utter["role"],
+                "content": self._to_hf_content(utter["content"], images)
+            }
+            for utter in message
+        ]
+        return self._generate(conversation, images)
+
+    def generate_inner(self, message, dataset=None):
+        """Answer a single-turn message given as a flat list of items."""
+        images = []
+        # Convert single-turn framework message to HF Chat Template format
+        conversation = [
+            {
+                "role": "user",
+                "content": self._to_hf_content(message, images)
+            }
+        ]
+        return self._generate(conversation, images)
From 666414fef8a7b5cfa3498da416848f54999dec33 Mon Sep 17 00:00:00 2001
From: Jucheng Hu
Date: Wed, 11 Mar 2026 00:06:33 +0000
Subject: [PATCH 2/2] [Model] add support for LLaVA-hf

---
Review notes (ignored by git am):
- The two new "*_hf" registry entries pointed at the original LLaVA class, so
  the LLaVA_HF wrapper added in patch 1/2 was never used; they now reference
  LLaVA_HF. This presumes config.py already sees names exported from
  vlmeval.vlm -- confirm against its import block.
- Added the missing space after the comma in the llava package __all__ list.

 vlmeval/config.py             | 2 ++
 vlmeval/vlm/__init__.py       | 1 +
 vlmeval/vlm/llava/__init__.py | 3 ++-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/vlmeval/config.py b/vlmeval/config.py
index 675ad6340..da8f6d0ce 100644
--- a/vlmeval/config.py
+++ b/vlmeval/config.py
@@ -844,6 +844,8 @@
 llava_series = {
     "llava_v1.5_7b": partial(LLaVA, model_path="liuhaotian/llava-v1.5-7b"),
     "llava_v1.5_13b": partial(LLaVA, model_path="liuhaotian/llava-v1.5-13b"),
+    "llava_v1.5_7b_hf": partial(LLaVA_HF, model_path="llava-hf/llava-1.5-7b-hf"),
+    "llava_v1.5_13b_hf": partial(LLaVA_HF, model_path="llava-hf/llava-1.5-13b-hf"),
     "llava_v1_7b": partial(LLaVA, model_path=LLAVA_V1_7B_MODEL_PTH),
     "sharegpt4v_7b": partial(LLaVA, model_path="Lin-Chen/ShareGPT4V-7B"),
     "sharegpt4v_13b": partial(LLaVA, model_path="Lin-Chen/ShareGPT4V-13B"),
diff --git a/vlmeval/vlm/__init__.py b/vlmeval/vlm/__init__.py
index 5d99d23a4..73c0406d9 100644
--- a/vlmeval/vlm/__init__.py
+++ b/vlmeval/vlm/__init__.py
@@ -15,6 +15,7 @@
 from .kosmos import Kosmos2
 from .llava import (
     LLaVA,
+    LLaVA_HF,
     LLaVA_Next,
     LLaVA_XTuner,
     LLaVA_Next2,
diff --git a/vlmeval/vlm/llava/__init__.py b/vlmeval/vlm/llava/__init__.py
index 9ad9a644a..8239ed398 100644
--- a/vlmeval/vlm/llava/__init__.py
+++ b/vlmeval/vlm/llava/__init__.py
@@ -1,4 +1,5 @@
 from .llava import LLaVA, LLaVA_Next, LLaVA_Next2, LLaVA_OneVision, LLaVA_OneVision_HF
 from .llava_xtuner import LLaVA_XTuner
+from .llava_hf import LLaVA_HF
 
-__all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_Next2', 'LLaVA_OneVision', 'LLaVA_OneVision_HF']
+__all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_HF', 'LLaVA_Next2', 'LLaVA_OneVision', 'LLaVA_OneVision_HF']