
Commit d3dc2e3

lazarust and awni authored
Add logits_processors support to batch_generate (#635)
* Update generate.py

* add samplers and logits processors with per example option and server support

---------

Co-authored-by: Awni Hannun <awni@apple.com>
1 parent 7423bf6 commit d3dc2e3
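A minimal usage sketch of the new keyword (not part of this commit; it assumes mlx_lm.load, mlx_lm.generate.batch_generate, the existing mlx_lm.sample_utils.make_logits_processors helper, a hypothetical model repo, and that prompts are passed as token id lists):

from mlx_lm import load
from mlx_lm.generate import batch_generate
from mlx_lm.sample_utils import make_logits_processors

# Hypothetical model choice, for illustration only.
model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.3-4bit")

prompts = [
    tokenizer.encode("Write a haiku about the ocean."),
    tokenizer.encode("Explain KV cache quantization in one sentence."),
]

# One shared list of processors applied to every prompt in the batch,
# here a repetition penalty built with the existing helper.
processors = make_logits_processors(repetition_penalty=1.1)

response = batch_generate(
    model,
    tokenizer,
    prompts,
    max_tokens=64,
    logits_processors=processors,
    verbose=True,
)
# The returned BatchResponse holds the per-prompt completions.

Per-example control is available through BatchGenerator.insert, as sketched after the generate.py diff below.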

3 files changed: +199 −52 lines changed


mlx_lm/generate.py

Lines changed: 93 additions & 14 deletions
@@ -181,8 +181,7 @@ def setup_arg_parser():
     parser.add_argument(
         "--kv-bits",
         type=int,
-        help="Number of bits for KV cache quantization. "
-        "Defaults to no quantization.",
+        help="Number of bits for KV cache quantization. Defaults to no quantization.",
         default=None,
     )
     parser.add_argument(
@@ -548,7 +547,9 @@ def _step(model, cache, y, n_predict=1):
         y = y[: -(n_predict - 1)]
         for i in range(n_predict):
             prev_tokens = (
-                mx.concat([prev_tokens, y]) if prev_tokens is not None else y
+                mx.concatenate([prev_tokens, y])
+                if prev_tokens is not None
+                else y
             )
             y, logprobs = _process_and_sample(prev_tokens, logits[:, i, :])
             out_y.append(y)
@@ -840,6 +841,9 @@ class Batch:
     max_tokens: List[int]
     num_tokens: List[int]
    cache: List[Any]
+    samplers: List[Any]
+    logits_processors: List[Any]
+    tokens: List[mx.array]

    def __len__(self):
        return len(self.uids)
@@ -849,6 +853,9 @@ def filter(self, keep_idx: List[int]):
         self.logprobs = [self.logprobs[k] for k in keep_idx]
         self.max_tokens = [self.max_tokens[k] for k in keep_idx]
         self.num_tokens = [self.num_tokens[k] for k in keep_idx]
+        self.samplers = [self.samplers[k] for k in keep_idx]
+        self.logits_processors = [self.logits_processors[k] for k in keep_idx]
+        self.tokens = [self.tokens[k] for k in keep_idx]
         keep_idx = mx.array(keep_idx, mx.int32)
         self.y = self.y[keep_idx]
         for c in self.cache:
@@ -860,6 +867,9 @@ def extend(self, other):
         self.logprobs.extend(other.logprobs)
         self.num_tokens.extend(other.num_tokens)
         self.max_tokens.extend(other.max_tokens)
+        self.samplers.extend(other.samplers)
+        self.logits_processors.extend(other.logits_processors)
+        self.tokens.extend(other.tokens)
         for c, o in zip(self.cache, other.cache):
             c.extend(o)

@@ -912,7 +922,6 @@ def _merge_caches(caches):


 class BatchGenerator:
-
     @dataclass
     class Response:
         uid: int
@@ -927,6 +936,9 @@ def __init__(
         max_tokens: int = 128,
         stop_tokens: Optional[set] = None,
         sampler: Optional[Callable[[mx.array], mx.array]] = None,
+        logits_processors: Optional[
+            List[Callable[[mx.array, mx.array], mx.array]]
+        ] = None,
         completion_batch_size: int = 32,
         prefill_batch_size: int = 8,
         prefill_step_size: int = 2048,
@@ -939,6 +951,7 @@ def __init__(
         self.max_tokens = max_tokens
         self.stop_tokens = stop_tokens or set()
         self.sampler = sampler or (lambda x: mx.argmax(x, axis=-1))
+        self.logits_processors = logits_processors or []
         self.uid_count = 0
         self.prefill_step_size = prefill_step_size
         self.prefill_batch_size = prefill_batch_size
@@ -965,7 +978,12 @@ def __del__(self):
         self.close()

     def insert(
-        self, prompts, max_tokens: Union[List[int], int, None] = None, caches=None
+        self,
+        prompts,
+        max_tokens: Union[List[int], int, None] = None,
+        caches=None,
+        samplers: list | None = None,
+        logits_processors: list | None = None,
     ):
         uids = []

@@ -978,8 +996,13 @@ def insert(
             if caches[i] is None:
                 caches[i] = cache.make_prompt_cache(self.model)

-        for p, m, c in zip(prompts, max_tokens, caches):
-            self.unprocessed_prompts.append((self.uid_count, p, m, c))
+        samplers = samplers or [None] * len(prompts)
+        logits_processors = logits_processors or [self.logits_processors] * len(prompts)
+
+        for p, m, c, s, lp in zip(
+            prompts, max_tokens, caches, samplers, logits_processors
+        ):
+            self.unprocessed_prompts.append((self.uid_count, p, m, c, s, lp))
             uids.append(self.uid_count)
             self.uid_count += 1
         # Sort in ascending order of length
@@ -1003,7 +1026,7 @@ def remove(self, uids: List[int]):
                 self.unprocessed_prompts.pop(i)

     def _process_prompts(self, prompts):
-        uids, inputs, max_tokens, caches = zip(*prompts)
+        uids, inputs, max_tokens, caches, samplers, logits_processors = zip(*prompts)

         cache_lengths = [cache.cache_length(c) for c in caches]
         max_cache_length = max(cache_lengths)
@@ -1013,6 +1036,7 @@ def _process_prompts(self, prompts):

         self._stats.prompt_tokens += sum(lengths)

+        tokens = [mx.array(inp) for inp in inputs]
         processed_tokens = 0

         # New prompts so
@@ -1069,17 +1093,56 @@ def _process_prompts(self, prompts):
             mx.clear_cache()
         inputs = last_inputs

-        y, logprobs = self._step(inputs, prompt_cache)
+        y, logprobs = self._step(
+            inputs, prompt_cache, samplers, logits_processors, tokens
+        )
         mx.async_eval(y, logprobs)
+
         return Batch(
-            list(uids), y, logprobs, list(max_tokens), [0] * len(uids), prompt_cache
+            list(uids),
+            y,
+            logprobs,
+            list(max_tokens),
+            [0] * len(uids),
+            prompt_cache,
+            list(samplers),
+            list(logits_processors),
+            tokens,
         )

-    def _step(self, input_tokens: mx.array, prompt_cache: List[Any]):
+    def _step(
+        self,
+        input_tokens: mx.array,
+        prompt_cache: List[Any],
+        samplers: list | None,
+        logits_processors: list | None,
+        tokens: List[mx.array],
+    ):
+        batch_size = input_tokens.shape[0]
+
         logits = self.model(input_tokens, cache=prompt_cache)
         logits = logits[:, -1, :]
+
+        if any(logits_processors):
+            processed_logits = []
+            for e in range(batch_size):
+                sample_logits = logits[e : e + 1]
+                for processor in logits_processors[e]:
+                    sample_logits = processor(tokens[e], sample_logits)
+                processed_logits.append(sample_logits)
+            logits = mx.concatenate(processed_logits, axis=0)
+
         logprobs = logits - mx.logsumexp(logits, axis=-1, keepdims=True)
-        sampled = self.sampler(logprobs)
+        if any(samplers):
+            all_samples = []
+            for e in range(batch_size):
+                sample_sampler = samplers[e] or self.sampler
+                sampled = sample_sampler(logprobs[e : e + 1])
+                all_samples.append(sampled)
+            sampled = mx.concatenate(all_samples, axis=0)
+        else:
+            sampled = self.sampler(logprobs)
+
         return sampled, list(logprobs)

     def stats(self):
@@ -1129,7 +1192,16 @@ def _next(self):

         batch = self.active_batch
         y, logprobs = batch.y, batch.logprobs
-        batch.y, batch.logprobs = self._step(y[:, None], batch.cache)
+        for i, toks in enumerate(batch.tokens):
+            batch.tokens[i] = mx.concatenate((toks, y[i : i + 1]))
+        batch.y, batch.logprobs = self._step(
+            y[:, None],
+            batch.cache,
+            batch.samplers,
+            batch.logits_processors,
+            batch.tokens,
+        )
+
         mx.async_eval(batch.y, batch.logprobs)

         y = y.tolist()
@@ -1184,6 +1256,7 @@ def batch_generate(
     max_tokens: Union[int, List[int]] = 128,
     verbose: bool = False,
     return_prompt_caches: bool = False,
+    logits_processors: Optional[List[Callable[[mx.array, mx.array], mx.array]]] = None,
     **kwargs,
 ) -> BatchResponse:
     """
@@ -1202,11 +1275,17 @@ def batch_generate(
             can be per prompt if a list is provided.
         return_prompt_caches (bool): Return the prompt caches in the batch
             responses. Default: ``False``.
+        logits_processors (List[Callable[[mx.array, mx.array], mx.array]], optional):
+            A list of functions that take tokens and logits and return the processed logits. Default: ``None``.
         kwargs: The remaining options get passed to :obj:`BatchGenerator`.
             See :obj:`BatchGenerator` for more details.
     """

-    gen = BatchGenerator(model, stop_tokens=tokenizer.eos_token_ids, **kwargs)
+    gen = BatchGenerator(
+        model,
+        stop_tokens=tokenizer.eos_token_ids,
+        **kwargs,
+    )
     num_samples = len(prompts)
     fin = 0
     if verbose:
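The per-example hooks added above can also be driven directly through BatchGenerator.insert, e.g. to give each prompt in a batch its own sampler and processors. A hypothetical sketch based only on the signatures in this diff (model repo and prompt encoding are assumptions; the response-draining loop is omitted):

from mlx_lm import load
from mlx_lm.generate import BatchGenerator
from mlx_lm.sample_utils import make_logits_processors, make_sampler

# Hypothetical model choice, for illustration only.
model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.3-4bit")

gen = BatchGenerator(model, stop_tokens=tokenizer.eos_token_ids)

prompts = [
    tokenizer.encode("List three prime numbers."),
    tokenizer.encode("Write a limerick about GPUs."),
]

# Per-example settings: greedy decoding for the first prompt,
# creative sampling plus a repetition penalty for the second.
samplers = [make_sampler(temp=0.0), make_sampler(temp=0.9, top_p=0.95)]
processors = [[], make_logits_processors(repetition_penalty=1.2)]

uids = gen.insert(
    prompts,
    max_tokens=64,
    samplers=samplers,
    logits_processors=processors,
)
# Completions for each uid are then drained through the generator's
# normal response loop (not shown here).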

mlx_lm/server.py

Lines changed: 30 additions & 38 deletions
@@ -467,6 +467,29 @@ def validate_draft_tokenizer(draft_tokenizer):
         return self.model, self.tokenizer


+def _make_sampler(args, tokenizer):
+    return make_sampler(
+        args.sampling.temperature,
+        top_p=args.sampling.top_p,
+        top_k=args.sampling.top_k,
+        min_p=args.sampling.min_p,
+        xtc_probability=args.sampling.xtc_probability,
+        xtc_threshold=args.sampling.xtc_threshold,
+        xtc_special_tokens=[
+            tokenizer.eos_token_id,
+            tokenizer.encode("\n"),
+        ],
+    )
+
+
+def _make_logits_processors(args):
+    return make_logits_processors(
+        args.logits.logit_bias,
+        args.logits.repetition_penalty,
+        args.logits.repetition_context_size,
+    )
+
+
 class ResponseGenerator:
     def __init__(self, model_provider: ModelProvider, prompt_cache: LRUPromptCache):
         self.model_provider = model_provider
@@ -510,12 +533,6 @@ def _is_batchable(self, args):
         for c in self.model_provider.cache_types:
             if c not in (KVCache, RotatingKVCache):
                 return False
-        if args.logits.logit_bias is not None:
-            return False
-        if args.logits.repetition_penalty != 0:
-            return False
-        if args.logprobs > 0:
-            return False
         if args.seed is not None:
             return False

@@ -569,7 +586,6 @@ def progress_callback(info):
                 if (
                     batch_generator is not None
                     and current_model == args.model
-                    and current_sampling == args.sampling
                     and is_batchable
                 ):
                     prompt = self._tokenize(current_tokenizer, request)
@@ -593,7 +609,11 @@ def progress_callback(info):
                     cache = make_prompt_cache(self.model_provider.model)

                     (uid,) = batch_generator.insert(
-                        [rest], args.max_tokens, caches=[cache]
+                        [rest],
+                        args.max_tokens,
+                        caches=[cache],
+                        samplers=[_make_sampler(args, tokenizer)],
+                        logits_processors=[_make_logits_processors(args)],
                     )
                     batch_results[uid] = {
                         "ctx": ctx,
620640
continue
621641

622642
current_model = args.model
623-
current_sampling = args.sampling
624643
current_tokenizer = tokenizer
625644
current_model_key = self.model_provider.model_key
626645
batch_results = {}
627646
batch_generator = BatchGenerator(
628647
model,
629648
stop_tokens=tokenizer.eos_token_ids,
630-
sampler=make_sampler(
631-
args.sampling.temperature,
632-
top_p=args.sampling.top_p,
633-
top_k=args.sampling.top_k,
634-
min_p=args.sampling.min_p,
635-
xtc_probability=args.sampling.xtc_probability,
636-
xtc_threshold=args.sampling.xtc_threshold,
637-
xtc_special_tokens=[
638-
tokenizer.eos_token_id,
639-
tokenizer.encode("\n"),
640-
],
641-
),
642649
prompt_progress_callback=progress_callback,
643650
)
644651
unprocessed_requests.append((rqueue, request, args))
@@ -750,23 +757,8 @@ def progress(tokens_processed, tokens_total):
            mx.random.seed(args.seed)

        # Make the sampler and logit processor
-        sampler = make_sampler(
-            args.sampling.temperature,
-            top_p=args.sampling.top_p,
-            top_k=args.sampling.top_k,
-            min_p=args.sampling.min_p,
-            xtc_probability=args.sampling.xtc_probability,
-            xtc_threshold=args.sampling.xtc_threshold,
-            xtc_special_tokens=[
-                tokenizer.eos_token_id,
-                tokenizer.encode("\n"),
-            ],
-        )
-        logits_processors = make_logits_processors(
-            args.logits.logit_bias,
-            args.logits.repetition_penalty,
-            args.logits.repetition_context_size,
-        )
+        sampler = _make_sampler(args, tokenizer)
+        logits_processors = _make_logits_processors(args)

        # Load the KV cache
        cache, rest = self.prompt_cache.fetch_nearest_cache(
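With the per-request helpers above, requests that differ only in sampling or repetition-penalty settings no longer fall out of the batched path. A hypothetical client-side sketch, assuming a server started via mlx_lm.server on the default 127.0.0.1:8080 and accepting these OpenAI-style request fields:

import requests  # assumed to be installed; any HTTP client works

# Two chat requests with different sampling settings; after this change they
# can share the same BatchGenerator instead of forcing the non-batched path.
for temperature, penalty in [(0.0, 1.0), (0.8, 1.3)]:
    resp = requests.post(
        "http://127.0.0.1:8080/v1/chat/completions",  # assumed default address
        json={
            "messages": [{"role": "user", "content": "Name a prime number."}],
            "max_tokens": 32,
            "temperature": temperature,
            "repetition_penalty": penalty,
        },
    )
    print(resp.json()["choices"][0]["message"]["content"])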
