diff --git a/auto_round/__main__.py b/auto_round/__main__.py
index d1c5af73..1a8c885d 100644
--- a/auto_round/__main__.py
+++ b/auto_round/__main__.py
@@ -58,10 +58,14 @@ def run_fast():
 
 def run_mllm():
     if "--eval" in sys.argv:
-        from auto_round.script.mllm import setup_lmeval_parser, eval
+        from auto_round.script.llm import setup_eval_parser, eval
         sys.argv.remove("--eval")
-        args = setup_lmeval_parser()
+        args = setup_eval_parser()
+        args.mllm = True
         eval(args)
+    elif "--vlmeval" in sys.argv:
+        sys.argv.remove("--vlmeval")
+        run_vlmeval()
     elif "--lmms" in sys.argv:
         sys.argv.remove("--lmms")
         run_lmms()
@@ -76,6 +80,12 @@ def run_lmms():
     args = setup_lmms_parser()
     lmms_eval(args)
 
+def run_vlmeval():
+    from auto_round.script.mllm import setup_lmeval_parser, vlmeval
+    args = setup_lmeval_parser()
+    vlmeval(args)
+
+
 def switch():
     if "--mllm" in sys.argv:
         sys.argv.remove("--mllm")
diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
index a7fd648e..efa2579d 100644
--- a/auto_round/script/llm.py
+++ b/auto_round/script/llm.py
@@ -210,6 +210,9 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.add_argument(
             "--model", "--model_name", "--model_name_or_path", default="facebook/opt-125m", help="model name or path")
+        self.add_argument(
+            "--mllm", default=False, help="whether to evaluate a multi-modal model."
+        )
         self.add_argument(
             "--device",
             "--devices",
@@ -672,8 +675,15 @@ def eval(args):
         print("evaluation running time=%ds" % (time.time() - st))
     else:
         st = time.time()
+        if "auto" in str(batch_size) and args.mllm:
+            logger.warning("Batch size 'auto' is not yet supported for hf-multimodal models, reset to 16")
+            batch_size = 16
         res = simple_evaluate(
-            model="hf", model_args=model_args, tasks=tasks, device=device_str, batch_size=batch_size)
+            model="hf" if not args.mllm else "hf-multimodal",
+            model_args=model_args,
+            tasks=tasks,
+            device=device_str,
+            batch_size=batch_size)
         from lm_eval.utils import make_table  # pylint: disable=E0401
         print(make_table(res))
         print("evaluation running time=%ds" % (time.time() - st))
diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
index c25c6180..676a2bbf 100644
--- a/auto_round/script/mllm.py
+++ b/auto_round/script/mllm.py
@@ -474,7 +474,7 @@ def tune(args):
     clear_memory()
 
 
-def eval(args):
+def vlmeval(args):
     set_cuda_visible_devices(args.device)
     device_str, parallelism = get_device_and_parallelism(args.device)
     if parallelism:
diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py
index d6cd5039..ece1c8c1 100644
--- a/test/test_cuda/test_gguf.py
+++ b/test/test_cuda/test_gguf.py
@@ -23,9 +23,9 @@ def __iter__(self):
 class TestAutoRound(unittest.TestCase):
     @classmethod
     def setUpClass(self):
-        model_name = "Qwen/Qwen2.5-0.5B-Instruct"
-        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        self.model_name = "Qwen/Qwen2.5-0.5B-Instruct"
+        self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
         self.llm_dataloader = LLMDataLoader()
 
     @classmethod
@@ -56,6 +56,7 @@ def test_gguf_format(self):
         llm = Llama(f"saved/{gguf_file}", n_gpu_layers=-1)
         output = llm("There is a girl who likes adventure,", max_tokens=32)
         print(output)
+        shutil.rmtree("./saved", ignore_errors=True)
 
         save_dir = os.path.join(os.path.dirname(__file__), "saved")
         model_path = "Qwen/Qwen2.5-0.5B-Instruct"
@@ -67,10 +68,11 @@ def test_gguf_format(self):
         self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail")
 
         from llama_cpp import Llama
-        gguf_file = os.listdir("saved/Qwen2.5-0.5B-Instruct-w4g32")[0]
-        llm = Llama(f"saved/Qwen2.5-0.5B-Instruct-w4g32/{gguf_file}", n_gpu_layers=-1)
+        gguf_file = os.listdir("saved/Qwen2.5-0.5B-Instruct-gguf")[0]
+        llm = Llama(f"saved/Qwen2.5-0.5B-Instruct-gguf/{gguf_file}", n_gpu_layers=-1)
         output = llm("There is a girl who likes adventure,", max_tokens=32)
         print(output)
+        shutil.rmtree("./saved", ignore_errors=True)
 
     @require_gguf
     def test_q2_k_export(self):
@@ -111,7 +113,7 @@ def test_basic_usage(self):
         python_path = sys.executable
         res = os.system(
             f"cd ../.. && {python_path} -m auto_round --model {self.model_name} --eval_task_by_task"
-            f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0"
+            f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0 --eval_model_dtype bf16"
         )
         if res > 0 or res == -1:
             assert False, "cmd line test fail, please have a check"
@@ -141,7 +143,7 @@ def test_q4_0(self):
 
         from auto_round.eval.evaluation import simple_evaluate_user_model
         result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa")
-        self.assertGreater(result['results']['piqa']['acc,none'], 0.55)
+        self.assertAlmostEqual(result['results']['piqa']['acc,none'], 0.55, delta=0.01)
         shutil.rmtree("./saved", ignore_errors=True)
 
     @require_gguf
@@ -168,7 +170,7 @@ def test_q4_1(self):
 
         from auto_round.eval.evaluation import simple_evaluate_user_model
         result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa")
-        self.assertGreater(result['results']['piqa']['acc,none'], 0.55)
+        self.assertAlmostEqual(result['results']['piqa']['acc,none'], 0.55, delta=0.01)
         shutil.rmtree("./saved", ignore_errors=True)
 
     @require_gguf
@@ -244,7 +246,7 @@ def test_llama_4(self):
         model_name = "/dataset/Llama-4-Scout-17B-16E-Instruct/"
         from auto_round.mllm.autoround_mllm import AutoRoundMLLM
         from auto_round.utils import mllm_load_model
-        model, processor, tokenizer, image_processor = mllm_load_model(model_name)
+        model, processor, tokenizer, image_processor = mllm_load_model(model_name, use_auto_mapping=False)
         autoround = AutoRoundMLLM(
             model,
             tokenizer=tokenizer,