Commit 9fa47ad

Merge remote-tracking branch 'origin/main' into feature/ouro
2 parents 879a348 + d3dc2e3

49 files changed (+5135, −820 lines)

.github/workflows/pull_request.yml

Lines changed: 3 additions & 1 deletion
@@ -38,4 +38,6 @@ jobs:
       - name: Run tests
         shell: bash -l {0}
         run: |
-          python -m xmlrunner discover -v tests -o test-results/
+          curl -o test_data.zip -L https://github.com/ml-explore/mlx-lm/releases/download/test_data/test_data.zip
+          unzip test_data.zip
+          HF_HOME="." python -m xmlrunner discover -v tests -o test-results/

.github/workflows/release.yml

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ permissions:
 jobs:
 
   build_release:
-    if: github.repository == 'ml-explore/mlx'
+    if: github.repository == 'ml-explore/mlx-lm'
     runs-on: ubuntu-22.04
     permissions:
       id-token: write

README.md

Lines changed: 4 additions & 4 deletions
@@ -71,7 +71,7 @@ prompt = "Write a story about Einstein"
 
 messages = [{"role": "user", "content": prompt}]
 prompt = tokenizer.apply_chat_template(
-    messages, add_generation_prompt=True
+    messages, add_generation_prompt=True,
 )
 
 text = generate(model, tokenizer, prompt=prompt, verbose=True)
@@ -130,7 +130,7 @@ prompt = "Write a story about Einstein"
 
 messages = [{"role": "user", "content": prompt}]
 prompt = tokenizer.apply_chat_template(
-    messages, add_generation_prompt=True
+    messages, add_generation_prompt=True,
 )
 
 for response in stream_generate(model, tokenizer, prompt, max_tokens=512):
@@ -170,7 +170,7 @@ mlx_lm.generate --help
 To quantize a model from the command line run:
 
 ```
-mlx_lm.convert --hf-path mistralai/Mistral-7B-Instruct-v0.3 -q
+mlx_lm.convert --model mistralai/Mistral-7B-Instruct-v0.3 -q
 ```
 
 For more options run:
@@ -185,7 +185,7 @@ You can upload new models to Hugging Face by specifying `--upload-repo` to
 
 ```
 mlx_lm.convert \
-    --hf-path mistralai/Mistral-7B-Instruct-v0.3 \
+    --model mistralai/Mistral-7B-Instruct-v0.3 \
     -q \
     --upload-repo mlx-community/my-4bit-mistral
 ```
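
The README hunks above track the CLI rename from `--hf-path` to `--model`. As a non-authoritative illustration, here is one way to drive the renamed flag from Python via `subprocess`; the model id is simply the one already used in the README, and nothing in this sketch is part of the commit itself:

```python
# Minimal sketch: invoke the mlx_lm.convert console script with the new --model flag.
# Assumes mlx-lm is installed so that "mlx_lm.convert" is on PATH.
import subprocess

subprocess.run(
    [
        "mlx_lm.convert",
        "--model", "mistralai/Mistral-7B-Instruct-v0.3",  # local path or Hub id
        "-q",  # quantize with the default settings
    ],
    check=True,  # raise CalledProcessError if the conversion fails
)
```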

mlx_lm/_version.py

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 # Copyright © 2023-2025 Apple Inc.
 
-__version__ = "0.28.4"
+__version__ = "0.30.0"

mlx_lm/benchmark.py

Lines changed: 11 additions & 2 deletions
@@ -6,7 +6,7 @@
 
 from mlx_lm import batch_generate, load, stream_generate
 from mlx_lm.generate import DEFAULT_MODEL
-from mlx_lm.utils import pipeline_load
+from mlx_lm.utils import pipeline_load, sharded_load
 
 
 def setup_arg_parser():
@@ -49,6 +49,11 @@ def setup_arg_parser():
         help="Number of timing trials",
         type=int,
     )
+    parser.add_argument(
+        "--pipeline",
+        action="store_true",
+        help="Use pipelining instead of tensor parallelism",
+    )
     return parser
 
 
@@ -59,6 +64,8 @@ def main():
 
     group = mx.distributed.init()
     rank = group.rank()
+    pipeline_group = group if args.pipeline else None
+    tensor_group = group if not args.pipeline else None
 
     def rprint(*args, **kwargs):
         if rank == 0:
@@ -67,7 +74,9 @@ def rprint(*args, **kwargs):
     model_path = args.model or DEFAULT_MODEL
 
     if group.size() > 1:
-        model, tokenizer, config = pipeline_load(args.model, return_config=True)
+        model, tokenizer, config = sharded_load(
+            args.model, pipeline_group, tensor_group, return_config=True
+        )
     else:
         model, tokenizer, config = load(
            args.model, return_config=True, tokenizer_config={"trust_remote_code": True}
mlx_lm/cache_prompt.py

Lines changed: 4 additions & 17 deletions
@@ -41,16 +41,6 @@ def setup_arg_parser():
         default=None,
         help="End of sequence token for tokenizer",
     )
-    parser.add_argument(
-        "--ignore-chat-template",
-        action="store_true",
-        help="Use the raw prompt without the tokenizer's chat template.",
-    )
-    parser.add_argument(
-        "--use-default-chat-template",
-        action="store_true",
-        help="Use the default chat template",
-    )
     parser.add_argument(
         "--max-kv-size",
         type=int,
@@ -107,14 +97,12 @@ def main():
 
     args.prompt = sys.stdin.read() if args.prompt == "-" else args.prompt
 
-    if args.use_default_chat_template:
-        if tokenizer.chat_template is None:
-            tokenizer.chat_template = tokenizer.default_chat_template
-
-    if not args.ignore_chat_template and tokenizer.chat_template is not None:
+    if tokenizer.has_chat_template:
         messages = [{"role": "user", "content": args.prompt}]
         prompt = tokenizer.apply_chat_template(
-            messages, add_generation_prompt=False, continue_final_message=True
+            messages,
+            add_generation_prompt=False,
+            continue_final_message=True,
         )
 
     else:
@@ -153,7 +141,6 @@ def callback(processed, total_tokens):
     print("Saving...")
     metadata = {}
     metadata["model"] = args.model
-    metadata["chat_template"] = json.dumps(tokenizer.chat_template)
     metadata["tokenizer_config"] = json.dumps(tokenizer_config)
     save_prompt_cache(args.prompt_cache_file, cache, metadata)
 
mlx_lm/chat.py

Lines changed: 39 additions & 17 deletions
@@ -7,7 +7,7 @@
 from .generate import stream_generate
 from .models.cache import make_prompt_cache
 from .sample_utils import make_sampler
-from .utils import load
+from .utils import load, sharded_load
 
 DEFAULT_TEMP = 0.0
 DEFAULT_TOP_P = 1.0
@@ -79,35 +79,54 @@ def setup_arg_parser():
         default=None,
         help="System prompt to be used for the chat template",
     )
+    parser.add_argument(
+        "--pipeline",
+        action="store_true",
+        help="Use pipelining instead of tensor parallelism",
+    )
     return parser
 
 
 def main():
     parser = setup_arg_parser()
     args = parser.parse_args()
 
+    group = mx.distributed.init()
+    rank = group.rank()
+    pipeline_group = group if args.pipeline else None
+    tensor_group = group if not args.pipeline else None
+
+    def rprint(*args, **kwargs):
+        if rank == 0:
+            print(*args, **kwargs)
+
     if args.seed is not None:
         mx.random.seed(args.seed)
 
-    model, tokenizer = load(
-        args.model,
-        adapter_path=args.adapter_path,
-        tokenizer_config={
-            "trust_remote_code": True if args.trust_remote_code else None
-        },
-    )
+    if group.size() > 1:
+        if args.adapter_path:
+            parser.error("Adapters not supported in distributed mode")
+        model, tokenizer = sharded_load(args.model, pipeline_group, tensor_group)
+    else:
+        model, tokenizer = load(
+            args.model,
+            adapter_path=args.adapter_path,
+            tokenizer_config={
+                "trust_remote_code": True if args.trust_remote_code else None
+            },
+        )
 
     def print_help():
-        print("The command list:")
-        print("- 'q' to exit")
-        print("- 'r' to reset the chat")
-        print("- 'h' to display these commands")
+        rprint("The command list:")
+        rprint("- 'q' to exit")
+        rprint("- 'r' to reset the chat")
+        rprint("- 'h' to display these commands")
 
-    print(f"[INFO] Starting chat session with {args.model}.")
+    rprint(f"[INFO] Starting chat session with {args.model}.")
     print_help()
     prompt_cache = make_prompt_cache(model, args.max_kv_size)
     while True:
-        query = input(">> ")
+        query = input(">> " if rank == 0 else "")
         if query == "q":
             break
         if query == "r":
@@ -120,7 +139,10 @@ def print_help():
         if args.system_prompt is not None:
             messages.append({"role": "system", "content": args.system_prompt})
         messages.append({"role": "user", "content": query})
-        prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
+        prompt = tokenizer.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+        )
         for response in stream_generate(
             model,
             tokenizer,
@@ -137,8 +159,8 @@ def print_help():
             ),
             prompt_cache=prompt_cache,
         ):
-            print(response.text, flush=True, end="")
-        print()
+            rprint(response.text, flush=True, end="")
+        rprint()
 
 
 if __name__ == "__main__":

mlx_lm/convert.py

Lines changed: 15 additions & 4 deletions
@@ -179,25 +179,36 @@ def configure_parser() -> argparse.ArgumentParser:
         description="Convert Hugging Face model to MLX format"
     )
 
-    parser.add_argument("--hf-path", type=str, help="Path to the Hugging Face model.")
+    parser.add_argument(
+        "--hf-path",
+        "--model",
+        type=str,
+        help="Path to the model. This can be a local path or a Hugging Face Hub model identifier.",
+    )
     parser.add_argument(
         "--mlx-path", type=str, default="mlx_model", help="Path to save the MLX model."
     )
     parser.add_argument(
         "-q", "--quantize", help="Generate a quantized model.", action="store_true"
     )
     parser.add_argument(
-        "--q-group-size", help="Group size for quantization.", type=int, default=64
+        "--q-group-size",
+        help="Group size for quantization.",
+        type=int,
+        default=None,
     )
     parser.add_argument(
-        "--q-bits", help="Bits per weight for quantization.", type=int, default=4
+        "--q-bits",
+        help="Bits per weight for quantization.",
+        type=int,
+        default=None,
    )
     parser.add_argument(
         "--q-mode",
         help="The quantization mode.",
         type=str,
         default="affine",
-        choices=["affine", "mxfp4"],
+        choices=["affine", "mxfp4", "nvfp4", "mxfp8"],
     )
     parser.add_argument(
         "--quant-predicate",
mlx_lm/evaluate.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 from .generate import batch_generate
 from .models.cache import make_prompt_cache
 from .sample_utils import make_sampler
-from .utils import common_prefix_len, load
+from .utils import load
 
 DEFAULT_MAX_TOKENS = 8192
 

mlx_lm/examples/batch_generate_response.py

Lines changed: 22 additions & 3 deletions
@@ -26,7 +26,26 @@
 ]
 
 # Set `verbose=True` to see generation statistics
-result = batch_generate(model, tokenizer, prompts, verbose=False, max_tokens=128)
+result = batch_generate(
+    model, tokenizer, prompts, verbose=False, return_prompt_caches=True
+)
+print(result.texts[-1])
 
-# The returned result contains texts completions in the same order as prompts
-print(result.texts[0])
+prompts = [
+    "Could you summarize that?",
+    "And what about the sea?",
+    "Try again?",
+    "And Mt Olympus?",
+]
+prompts = [
+    tokenizer.apply_chat_template(
+        [{"role": "user", "content": p}],
+        add_generation_prompt=True,
+    )
+    for p in prompts
+]
+
+result = batch_generate(
+    model, tokenizer, prompts, verbose=False, prompt_caches=result.caches
+)
+print(result.texts[-1])