Supported sft for vlm (#499)

JingofXin · lwj-st · ChenJiahaoST · web-flow · commit ca8010b896a3 · 2025-05-29T15:38:57.000+08:00
Co-authored-by: lwj-st <liwenjian.vendor@sensetime.com>
Co-authored-by: Chenjiahao <chenjiahao@sensetime.com>
diff --git a/LazyLLM-Env b/LazyLLM-Env
@@ -1 +1 @@
-Subproject commit 70965dee229f09eadb4225f2a665484b5c214a5b
+Subproject commit 72557636190b689ee810a7121042154c5d5a8f88
diff --git a/lazyllm/cli/install.py b/lazyllm/cli/install.py
@@ -181,36 +181,6 @@ def install(commands):  # noqa C901
 
     extra_pkgs = set()
 
-    want_vllm = any(p.startswith("vllm") for p in pkgs)
-    want_llamafactory = any(p.startswith("lazyllm-llamafactory") for p in pkgs)
-
-    try:
-        importlib.metadata.version("vllm")
-        orig_vllm = True
-    except importlib.metadata.PackageNotFoundError:
-        orig_vllm = False
-
-    try:
-        importlib.metadata.version("lazyllm-llamafactory")
-        orig_llamafactory = True
-    except importlib.metadata.PackageNotFoundError:
-        orig_llamafactory = False
-
-    need_transformers = (
-        (want_vllm and orig_llamafactory)
-        or (want_llamafactory and orig_vllm)
-        or (want_vllm and want_llamafactory)
-    )
-
-    if need_transformers:
-        try:
-            tr_ver = importlib.metadata.version("transformers")
-        except importlib.metadata.PackageNotFoundError:
-            pass
-        else:
-            if tr_ver != "4.46.1":
-                extra_pkgs.add("transformers==4.46.1")
-
     for p in pkgs:
         if p.startswith("flash-attn"):
             try:
diff --git a/lazyllm/components/deploy/lmdeploy.py b/lazyllm/components/deploy/lmdeploy.py
@@ -58,7 +58,7 @@ def cmd(self, finetuned_model=None, base_model=None):
                             f"base_model({base_model}) will be used")
             finetuned_model = base_model
 
-        model_type = ModelManager.get_model_type(finetuned_model)
+        model_type = ModelManager.get_model_type(base_model or finetuned_model)
         if model_type == 'vlm':
             self.kw.pop("chat-template")
         else:
diff --git a/lazyllm/components/deploy/vllm.py b/lazyllm/components/deploy/vllm.py
@@ -45,7 +45,7 @@ def __init__(self, trust_remote_code=True, launcher=launchers.remote(ngpus=1), s
             'host': '0.0.0.0',
             'max-num-seqs': 256,
             'pipeline-parallel-size': 1,
-            'max-num-batched-tokens': 64000,
+            'max-num-batched-tokens': 128000,
         })
         self.trust_remote_code = trust_remote_code
         self.kw.check_and_update(kw)
diff --git a/lazyllm/components/finetune/llama_factory/model_mapping.py b/lazyllm/components/finetune/llama_factory/model_mapping.py
@@ -40,6 +40,7 @@ def match_longest_prefix(model_name):
     'index': 'index',
     'internlm': 'intern',
     'internlm2': 'intern2',
+    'internvl': 'intern_vl',
     'llama-2': 'llama2',
     'llama-3': 'llama3',
     'llama-3.2': 'mllama',
@@ -64,7 +65,9 @@ def match_longest_prefix(model_name):
     'pixtral': 'pixtral',
     'qwen': 'qwen',
     'codeqwen': 'qwen',
+    'qwen-vl': 'qwen2_vl',
     'qwen2-vl': 'qwen2_vl',
+    'qwen2.5-vl': 'qwen2_vl',
     'solar': 'solar',
     'telechat': 'telechat',
     'vicuna': 'vicuna',
diff --git a/lazyllm/components/finetune/llama_factory/sft.yaml b/lazyllm/components/finetune/llama_factory/sft.yaml
@@ -1,12 +1,12 @@
 ### ModelArguments
 model_name_or_path: internlm2-chat-7b
+trust_remote_code: true
 adapter_name_or_path: null
 adapter_folder: null
 cache_dir: null
 use_fast_tokenizer: true
 resize_vocab: false
 split_special_tokens: false
-new_special_tokens: null
 model_revision: main
 low_cpu_mem_usage: true
 quantization_bit: null
@@ -18,7 +18,6 @@ flash_attn: auto
 shift_attn: false
 mixture_of_depths: null
 use_unsloth: false
-# visual_inputs: false
 moe_aux_loss_coef: null
 disable_gradient_checkpointing: false
 upcast_layernorm: false
@@ -49,7 +48,6 @@ print_param_status: false
 template: null
 dataset: identity,alpaca_en_demo
 dataset_dir: lazyllm_temp_dir
-# split: train
 cutoff_len: 1024
 train_on_prompt: false
 streaming: false
@@ -172,7 +170,6 @@ gradient_checkpointing_kwargs: null
 include_inputs_for_metrics: false
 eval_do_concat_batches: true
 fp16_backend: auto
-evaluation_strategy: null
 push_to_hub_model_id: null
 push_to_hub_organization: null
 push_to_hub_token: null
@@ -185,8 +182,6 @@ ddp_timeout: 180000000
 torch_compile: false
 torch_compile_backend: null
 torch_compile_mode: null
-dispatch_batches: null
-split_batches: null
 include_tokens_per_second: false
 include_num_input_tokens_seen: false
 neftune_noise_alpha: null
@@ -254,7 +249,6 @@ stage: sft
 finetuning_type: lora
 use_llama_pro: false
 freeze_vision_tower: true
-train_mm_proj_only: false
 plot_loss: true
 
 ### GeneratingArguments
diff --git a/lazyllm/components/finetune/llamafactory.py b/lazyllm/components/finetune/llamafactory.py
@@ -116,6 +116,31 @@ def build_temp_dataset_info(self, datapaths):
             assert os.path.isfile(datapath)
             file_name, _ = os.path.splitext(os.path.basename(datapath))
             temp_dataset_dict[file_name] = {'file_name': datapath}
+            formatting = 'alpaca'
+            try:
+                with open(datapath, 'r', encoding='utf-8') as file:
+                    data = json.load(file)
+                if 'messages' in data[0]:
+                    formatting = 'sharegpt'
+                media_types = []
+                for media in ['images', 'videos', 'audios']:
+                    if media in data[0]:
+                        media_types.append(media)
+                if media_types:
+                    columns = {item: item for item in media_types}
+                    columns.update({"messages": "messages"})
+                    temp_dataset_dict[file_name].update({
+                        "tags": {
+                            "role_tag": "role",
+                            "content_tag": "content",
+                            "user_tag": "user",
+                            "assistant_tag": "assistant"
+                        },
+                        "columns": columns
+                    })
+            except Exception:
+                pass
+            temp_dataset_dict[file_name].update({'formatting': formatting})
         self.temp_dataset_info_path = os.path.join(self.temp_folder, 'dataset_info.json')
         with open(self.temp_dataset_info_path, 'w') as json_file:
             json.dump(temp_dataset_dict, json_file, indent=4)
@@ -149,7 +174,7 @@ def cmd(self, trainset, valset=None) -> str:
         random_value = random.randint(1000, 9999)
         self.log_file_path = f'{self.target_path}/train_log_{formatted_date}_{random_value}.log'
 
-        cmds = f'llamafactory-cli train {self.temp_yaml_file}'
+        cmds = f'export DISABLE_VERSION_CHECK=1 && llamafactory-cli train {self.temp_yaml_file}'
         cmds += f' 2>&1 | tee {self.log_file_path}'
         if self.temp_export_yaml_file:
             cmds += f' && llamafactory-cli export {self.temp_export_yaml_file}'
diff --git a/lazyllm/components/text_to_speech/base.py b/lazyllm/components/text_to_speech/base.py
@@ -6,9 +6,10 @@
 class TTSDeploy:
 
     def __new__(cls, name, **kwarg):
+        name = name.lower()
         if name == 'bark':
             return BarkDeploy(**kwarg)
-        elif name == 'ChatTTS':
+        elif name in ('chattts', 'chattts-new'):
             return ChatTTSDeploy(**kwarg)
         elif name.startswith('musicgen'):
             return MusicGenDeploy(**kwarg)
diff --git a/lazyllm/components/text_to_speech/chattts.py b/lazyllm/components/text_to_speech/chattts.py
@@ -57,7 +57,7 @@ def __call__(self, string):
                                   params_refine_text=params_refine_text,
                                   params_infer_code=params_infer_code,
                                 )
-        file_path = sounds_to_files(speech[0], self.save_path)
+        file_path = sounds_to_files(speech, self.save_path)
         return encode_query_with_filepaths(files=file_path)
 
     @classmethod
diff --git a/lazyllm/components/utils/downloader/model_mapping.py b/lazyllm/components/utils/downloader/model_mapping.py
@@ -182,6 +182,13 @@
         },
         "type": "tts"
     },
+    "chattts-new": {
+        "source": {
+            "huggingface": "2Noise/ChatTTS",
+            "modelscope": "AI-ModelScope/ChatTTS"
+        },
+        "type": "tts"
+    },
     "internvl-chat-v1-5": {
         "source": {
             "huggingface": "OpenGVLab/InternVL-Chat-V1-5",
@@ -224,6 +231,13 @@
         },
         "type": "vlm"
     },
+    "qwen2.5-vl-3b-instruct": {
+        "source": {
+            "huggingface": "Qwen/Qwen2.5-VL-3B-Instruct",
+            "modelscope": "Qwen/Qwen2.5-VL-3B-Instruct"
+        },
+        "type": "vlm"
+    },
     "musicgen-medium": {
         "source": {
             "huggingface": "facebook/musicgen-medium",
diff --git a/lazyllm/thirdparty/__init__.py b/lazyllm/thirdparty/__init__.py
@@ -7,7 +7,6 @@
     'huggingface_hub': 'huggingface-hub',
     'jwt': 'PyJWT',
     'rank_bm25': 'rank-bm25',
-    'collie': 'collie-lm',
     'faiss': 'faiss-cpu',
     'flash_attn': 'flash-attn',
     'sklearn': 'scikit-learn'
@@ -78,7 +77,7 @@ def __getattribute__(self, __name):
             raise ImportError(err_msg)
 
 modules = ['redis', 'huggingface_hub', 'jieba', 'modelscope', 'pandas', 'jwt', 'rank_bm25', 'redisvl', 'datasets',
-           'deepspeed', 'fire', 'numpy', 'peft', 'torch', 'transformers', 'collie', 'faiss', 'flash_attn', 'google',
+           'deepspeed', 'fire', 'numpy', 'peft', 'torch', 'transformers', 'faiss', 'flash_attn', 'google',
            'lightllm', 'vllm', 'ChatTTS', 'wandb', 'funasr', 'sklearn', 'torchvision', 'scipy', 'pymilvus',
            'sentence_transformers', 'gradio', 'chromadb', 'nltk', 'PIL', 'httpx', 'bm25s', 'kubernetes', 'pymongo',
            'rapidfuzz', 'FlagEmbedding', 'mcp', 'diffusers', 'pypdf', 'pptx', 'html2text', 'ebooklib', 'docx2txt',
diff --git a/pyproject.toml b/pyproject.toml
@@ -62,9 +62,9 @@ redisvl = { version = ">=0.1.3", optional = true }
 datasets = { version = ">=2.18.0", optional = true }
 deepspeed = { version = ">=0.12.3", optional = true }
 fire = { version = ">=0.6.0", optional = true }
-peft = { version = ">=0.3.0", optional = true }
+peft = {version = "==0.14.0", optional = true}
 torch = { version = ">=2.1.2", optional = true }
-transformers = { version = ">=4.41.1", optional = true }
+transformers = {version = "==4.51.3", optional = true}
 collie-lm = { version = ">=1.0.7", optional = true }
 faiss-cpu = { version = ">=1.8.0", optional = true }
 google = { version = ">=3.0.0", optional = true }
@@ -76,13 +76,12 @@ vllm = {version = "==0.7.3", optional = true}
 wandb = { version = ">=0.17.0", optional = true }
 chattts = {version = "^0.1.1", optional = true}
 funasr = {version = "^1.1.4", optional = true}
-lazyllm-lmdeploy = {version = "==0.7.1rc0", optional = true}
 timm = {version = "^1.0.8", optional = true}
 diffusers = {version = "^0.30.0", optional = true}
 sortedcontainers = {version = "^2.4.0", optional = true}
 flash-attn = {version = "^2.7.4.post1", optional = true}
 lightllm = {version = "^0.0.1", optional = true}
-lazyllm-llamafactory = {version = "==0.9.1rc0", optional = true}
+lazyllm-llamafactory = {version = "==0.9.3.dev0", optional = true}
 rotary-embedding-torch = {version = "^0.8.3", optional = true}
 infinity-emb = {version = "==0.0.70", optional = true}
 ctranslate2 = {version = "^4.0.0", optional = true}
@@ -94,6 +93,9 @@ flagembedding = {version = "^1.3.4", optional = true}
 mcp = {version = ">=1.5.0", optional = true}
 pytesseract = {version = "^0.3.13", optional = true}
 openai-whisper = {version = "*", optional = true}
+qwen-vl-utils = {version = "^0.0.11", optional = true}
+accelerate = {version = "==1.6.0", optional = true}
+lmdeploy = {version = "==0.8.0", optional = true}
 
 [tool.poetry.extras]
 standard = [
@@ -122,7 +124,7 @@ standard = [
     "wandb",
     "chattts",
     "funasr",
-    "lazyllm-lmdeploy",
+    "lmdeploy",
     "rotary-embedding-torch",
     "infinity-emb",
     "ctranslate2",
@@ -157,7 +159,7 @@ full = [
     "wandb",
     "chattts",
     "funasr",
-    "lazyllm-lmdeploy",
+    "lmdeploy",
     "timm",
     "diffusers",
     "redis",
@@ -175,9 +177,14 @@ full = [
     "ctranslate2",
     "optimum",
     "typer",
+    "pymongo",
+    "pymysql",
     "flagembedding",
     "mcp",
-    "pytesseract"
+    "pytesseract",
+    "openai-whisper",
+    "qwen-vl-utils",
+    "accelerate"
 ]
 alpaca-lora = [
     "appdirs",
@@ -208,7 +215,10 @@ llama-factory = [
     "tensorboard",
     "tensorboard-data-server",
     "torch",
-    "transformers"
+    "transformers",
+    "accelerate",
+    "qwen-vl-utils",
+    "lmdeploy"
 ]
 finetune-all = [
     "appdirs",
@@ -246,7 +256,7 @@ vllm = [
 lmdeploy = [
     "huggingface-hub",
     "modelscope",
-    "lazyllm-lmdeploy"
+    "lmdeploy"
 ]
 lightllm = [
     "huggingface-hub",
@@ -263,7 +273,7 @@ deploy-all = [
     "modelscope",
     "vllm",
     "sentence-transformers",
-    "lazyllm-lmdeploy",
+    "lmdeploy",
     "infinity-emb"  
 ]
 multimodal = [
diff --git a/requirements.full.txt b/requirements.full.txt
@@ -49,9 +49,9 @@ redisvl>=0.1.3
 datasets>=2.18.0
 deepspeed>=0.12.3
 fire>=0.6.0
-peft>=0.3.0
+peft==0.14.0
 torch>=2.1.2
-transformers>=4.41.1
+transformers==4.51.3
 collie-lm>=1.0.7
 faiss-cpu>=1.8.0
 google>=3.0.0
@@ -63,13 +63,12 @@ vllm==0.7.3
 wandb>=0.17.0
 chattts
 funasr
-lazyllm-lmdeploy==0.7.1rc0
 timm
 diffusers
 sortedcontainers
 flash-attn
 lightllm
-lazyllm-llamafactory==0.9.1rc0
+lazyllm-llamafactory==0.9.3.dev0
 rotary-embedding-torch
 infinity-emb==0.0.70
 ctranslate2
@@ -81,3 +80,6 @@ flagembedding
 mcp>=1.5.0
 pytesseract
 openai-whisper
+qwen-vl-utils
+accelerate==1.6.0
+lmdeploy==0.8.0
diff --git a/tests/advanced_tests/standard_test/test_deploy.py b/tests/advanced_tests/standard_test/test_deploy.py
@@ -140,7 +140,7 @@ def test_musicgen(self):
         assert len(res['files']) == 1
 
     def test_chattts(self):
-        m = lazyllm.TrainableModule('ChatTTS')
+        m = lazyllm.TrainableModule('ChatTTS-new')
         m.update_server()
         r = m('你好啊，很高兴认识你。')
         res = decode_query_with_filepaths(r)
diff --git a/tests/advanced_tests/standard_test/test_engine.py b/tests/advanced_tests/standard_test/test_engine.py
@@ -238,7 +238,7 @@ def test_engine_infer_server_tts(self):
         token = '123'
         engine = LightEngine()
         engine.launch_localllm_infer_service()
-        jobid, _ = engine.deploy_model(token, 'ChatTTS')
+        jobid, _ = engine.deploy_model(token, 'ChatTTS-new')
         engine.infer_client.wait_ready(token, jobid)
         r = engine.get_infra_handle(token, jobid)
         assert isinstance(r, lazyllm.TrainableModule) and r._impl._get_deploy_tasks.flag
diff --git a/tests/advanced_tests/standard_test/test_finetune.py b/tests/advanced_tests/standard_test/test_finetune.py

Original file line number	Diff line number	Diff line change
`@@ -57,7 +57,7 @@ def __call__(self, string):`
`57`	`57`	`params_refine_text=params_refine_text,`
`58`	`58`	`params_infer_code=params_infer_code,`
`59`	`59`	`)`
`60`		`- file_path = sounds_to_files(speech[0], self.save_path)`
	`60`	`+ file_path = sounds_to_files(speech, self.save_path)`
`61`	`61`	`return encode_query_with_filepaths(files=file_path)`
`62`	`62`
`63`	`63`	`@classmethod`