intel · wenhuach21 · Jul 21, 2025 · Jul 18, 2025 · Jul 18, 2025 · Jul 18, 2025
diff --git a/auto_round/__main__.py b/auto_round/__main__.py
@@ -58,10 +58,14 @@ def run_fast():
 
 def run_mllm():
     if "--eval" in sys.argv:
-        from auto_round.script.mllm import setup_lmeval_parser, eval
+        from auto_round.script.llm import setup_eval_parser, eval
         sys.argv.remove("--eval")
-        args = setup_lmeval_parser()
+        args = setup_eval_parser()
+        args.mllm = True
         eval(args)
+    elif "--vlmeval" in sys.argv:
+        sys.argv.remove("--vlmeval")
+        run_vlmeavl()
     elif "--lmms" in sys.argv:
         sys.argv.remove("--lmms")
         run_lmms()
@@ -76,6 +80,12 @@ def run_lmms():
     args = setup_lmms_parser()
     lmms_eval(args)
 
+def run_vlmeavl():
+    from auto_round.script.mllm import setup_lmeval_parser, vlmeval
+    args = setup_lmeval_parser()
+    vlmeval(args)
+
+
 def switch():
     if "--mllm" in sys.argv:
         sys.argv.remove("--mllm")

diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
@@ -210,6 +210,9 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.add_argument(
             "--model", "--model_name", "--model_name_or_path", default="facebook/opt-125m", help="model name or path")
+        self.add_argument(
+            "--mllm", default=False, help="whether to eval multi-modal model."
+        )
         self.add_argument(
             "--device",
             "--devices",
@@ -672,8 +675,15 @@ def eval(args):
         print("evaluation running time=%ds" % (time.time() - st))
     else:
         st = time.time()
+        if "auto" in str(batch_size) and args.mllm:
+            logger.warning("Batch size 'auto' is not yet supported for hf-multimodal models, reset to 16")
+            batch_size = 16
         res = simple_evaluate(
-            model="hf", model_args=model_args, tasks=tasks, device=device_str, batch_size=batch_size)
+            model="hf" if not args.mllm else "hf-multimodal",
+            model_args=model_args,
+            tasks=tasks,
+            device=device_str,
+            batch_size=batch_size)
         from lm_eval.utils import make_table  # pylint: disable=E0401
         print(make_table(res))
         print("evaluation running time=%ds" % (time.time() - st))

diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
@@ -474,7 +474,7 @@ def tune(args):
     clear_memory()
 
 
-def eval(args):
+def vlmeval(args):
     set_cuda_visible_devices(args.device)
     device_str, parallelism = get_device_and_parallelism(args.device)
     if parallelism:

diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py
@@ -23,9 +23,9 @@ def __iter__(self):
 class TestAutoRound(unittest.TestCase):
     @classmethod
     def setUpClass(self):
-        model_name = "Qwen/Qwen2.5-0.5B-Instruct"
-        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        self.model_name = "Qwen/Qwen2.5-0.5B-Instruct"
+        self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
         self.llm_dataloader = LLMDataLoader()
 
     @classmethod
@@ -56,6 +56,7 @@ def test_gguf_format(self):
         llm = Llama(f"saved/{gguf_file}", n_gpu_layers=-1)
         output = llm("There is a girl who likes adventure,", max_tokens=32)
         print(output)
+        shutil.rmtree("./saved", ignore_errors=True)
 
         save_dir = os.path.join(os.path.dirname(__file__), "saved")
         model_path = "Qwen/Qwen2.5-0.5B-Instruct"
@@ -67,10 +68,11 @@ def test_gguf_format(self):
         self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail")
 
         from llama_cpp import Llama
-        gguf_file = os.listdir("saved/Qwen2.5-0.5B-Instruct-w4g32")[0]
-        llm = Llama(f"saved/Qwen2.5-0.5B-Instruct-w4g32/{gguf_file}", n_gpu_layers=-1)
+        gguf_file = os.listdir("saved/Qwen2.5-0.5B-Instruct-gguf")[0]
+        llm = Llama(f"saved/Qwen2.5-0.5B-Instruct-gguf/{gguf_file}", n_gpu_layers=-1)
         output = llm("There is a girl who likes adventure,", max_tokens=32)
         print(output)
+        shutil.rmtree("./saved", ignore_errors=True)
 
     @require_gguf
     def test_q2_k_export(self):
@@ -111,7 +113,7 @@ def test_basic_usage(self):
         python_path = sys.executable
         res = os.system(
             f"cd ../.. && {python_path} -m auto_round --model {self.model_name} --eval_task_by_task"
-            f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0"
+            f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0 --eval_model_dtype bf16"
         )
         if res > 0 or res == -1:
             assert False, "cmd line test fail, please have a check"
@@ -141,7 +143,7 @@ def test_q4_0(self):
 
         from auto_round.eval.evaluation import simple_evaluate_user_model
         result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa")
-        self.assertGreater(result['results']['piqa']['acc,none'], 0.55)
+        self.assertAlmostEqual(result['results']['piqa']['acc,none'], 0.55, delta=0.01)
         shutil.rmtree("./saved", ignore_errors=True)
 
     @require_gguf
@@ -168,7 +170,7 @@ def test_q4_1(self):
 
         from auto_round.eval.evaluation import simple_evaluate_user_model
         result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa")
-        self.assertGreater(result['results']['piqa']['acc,none'], 0.55)
+        self.assertAlmostEqual(result['results']['piqa']['acc,none'], 0.55, delta=0.01)
         shutil.rmtree("./saved", ignore_errors=True)
 
     @require_gguf
@@ -244,7 +246,7 @@ def test_llama_4(self):
         model_name = "/dataset/Llama-4-Scout-17B-16E-Instruct/"
         from auto_round.mllm.autoround_mllm import AutoRoundMLLM
         from auto_round.utils import mllm_load_model
-        model, processor, tokenizer, image_processor = mllm_load_model(model_name)
+        model, processor, tokenizer, image_processor = mllm_load_model(model_name, use_auto_mapping=False)
         autoround = AutoRoundMLLM(
             model,
             tokenizer=tokenizer,