@@ -21,10 +21,7 @@
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.mm_input_cache import MirroredProcessingCache
-from vllm.v1.structured_output.backend_guidance import (
-    validate_guidance_grammar)
-from vllm.v1.structured_output.backend_xgrammar import (
-    validate_xgrammar_grammar)
+from vllm.v1.structured_output import StructuredOutputManager
 
 
 class Processor:
@@ -81,7 +78,7 @@ def _validate_sampling_params(
         params: SamplingParams,
         lora_request: Optional[LoRARequest],
     ) -> None:
-        self._validate_structured_output(params)
+        StructuredOutputManager.validate_request(params, self.vllm_config)
         self._validate_logit_bias(params)
 
         if params.allowed_token_ids is None:
@@ -148,59 +145,6 @@ def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                              "not enabled!")
 
-    def _validate_structured_output(self, params: SamplingParams) -> None:
-        if not params.guided_decoding or not self.decoding_config:
-            return
-
-        engine_level_backend = self.decoding_config.backend
-        if params.guided_decoding.backend:
-            # Request-level backend selection is not supported in V1.
-            # The values may differ if `params` is reused and was set
-            # to a specific backend based on `auto` behavior in a previous
-            # request. We remember that it was set as a result of `auto`
-            # using the `_auto` option set on the backend in the params.
-            if (params.guided_decoding.backend != engine_level_backend
-                    and not (engine_level_backend == "auto"
-                             and params.guided_decoding.backend_was_auto)):
-                raise ValueError(
-                    "Request-level structured output backend selection is no "
-                    "longer supported. The request specified "
-                    f"'{params.guided_decoding.backend}', but vLLM was "
-                    f"initialised with '{engine_level_backend}'. This error "
-                    "can be resolved by removing backend selection from the "
-                    "request.")
-        else:
-            params.guided_decoding.backend = engine_level_backend
-
-        # Request content validation
-        if engine_level_backend.startswith("xgrammar"):
-            # xgrammar with no fallback
-            validate_xgrammar_grammar(params)
-        elif engine_level_backend.startswith("guidance"):
-            # TODO: ideally we would have the LLTokenizer here as Lark syntax
-            # allows <|special_token|> and similar, see
-            # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens
-            # Without tokenizer these are disallowed in grammars.
-            validate_guidance_grammar(params, tokenizer=None)
-        else:
-            # NOTE: engine_level_backend must be "auto" here, because we have
-            # checked supported_backends above.
-            # "auto" is an opt-in to opinionated behavior where we try to
-            # choose a backend based on request contents. This is not the
-            # default as it is less predictable and subject to change
-            # between releases as feature support changes.
-            try:
-                validate_xgrammar_grammar(params)
-                params.guided_decoding.backend = "xgrammar"
-            except ValueError:
-                # The request either failed validation
-                # or includes some jsonschema feature(s) that
-                # are not supported in xgrammar. Fall back to guidance.
-                validate_guidance_grammar(params, tokenizer=None)
-                params.guided_decoding.backend = "guidance"
-            # Remember that this backend was set automatically
-            params.guided_decoding.backend_was_auto = True
-
     def process_inputs(
         self,
         request_id: str,
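
The diff only shows the new call site; the relocated validation itself lives under vllm.v1.structured_output. As a rough, non-authoritative sketch of what that consolidation could look like, the snippet below simply moves the removed Processor method onto StructuredOutputManager as a static helper, reading the engine-level backend from vllm_config.decoding_config. Only the method name and call signature come from the diff above; the body and its exact location are assumptions, and the real implementation may differ.

# Hypothetical sketch: assumes the removed validation moves over largely
# unchanged. Name and signature are taken from the call site in the diff;
# everything else is an assumption.
from vllm.config import VllmConfig
from vllm.sampling_params import SamplingParams
from vllm.v1.structured_output.backend_guidance import validate_guidance_grammar
from vllm.v1.structured_output.backend_xgrammar import validate_xgrammar_grammar


class StructuredOutputManager:

    @staticmethod
    def validate_request(params: SamplingParams,
                         vllm_config: VllmConfig) -> None:
        decoding_config = vllm_config.decoding_config
        if not params.guided_decoding or not decoding_config:
            return

        engine_level_backend = decoding_config.backend
        if params.guided_decoding.backend:
            # Request-level backend selection is rejected unless it matches
            # the engine-level backend (or was set by a prior "auto" pass).
            if (params.guided_decoding.backend != engine_level_backend
                    and not (engine_level_backend == "auto"
                             and params.guided_decoding.backend_was_auto)):
                raise ValueError(
                    "Request-level structured output backend selection is "
                    "no longer supported; remove it from the request.")
        else:
            params.guided_decoding.backend = engine_level_backend

        # Content validation, mirroring the removed Processor logic.
        if engine_level_backend.startswith("xgrammar"):
            validate_xgrammar_grammar(params)
        elif engine_level_backend.startswith("guidance"):
            validate_guidance_grammar(params, tokenizer=None)
        else:
            # "auto": try xgrammar first, fall back to guidance on failure.
            try:
                validate_xgrammar_grammar(params)
                params.guided_decoding.backend = "xgrammar"
            except ValueError:
                validate_guidance_grammar(params, tokenizer=None)
                params.guided_decoding.backend = "guidance"
            params.guided_decoding.backend_was_auto = True

Keeping the validation as a class-level helper means the Processor no longer needs to import the individual backends, which matches the import change at the top of the diff.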