
Commit affd1ab

Merge branch 'develop' into develop
2 parents d2d196c + e679567 commit affd1ab

8 files changed, +187 −57 lines changed

docs/get_started/installation/kunlunxin_xpu.md

Lines changed: 2 additions & 2 deletions
@@ -118,5 +118,5 @@ python -c "from fastdeploy.model_executor.ops.xpu import block_attn"
 
 If all the above steps execute successfully, FastDeploy is installed correctly.
 
-## How to deploy services on kunlunxin XPU
-Refer to [**Supported Models and Service Deployment**](../../usage/kunlunxin_xpu_deployment.md) for the details about the supported models and the way to deploy services on kunlunxin XPU.
+## How to deploy services on Kunlunxin XPU
+Refer to [**Supported Models and Service Deployment**](../../usage/kunlunxin_xpu_deployment.md) for the details about the supported models and the way to deploy services on Kunlunxin XPU.
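As an aside, the verification import referenced in the hunk header above can be run directly. A minimal sanity-check sketch, assuming a FastDeploy build with XPU support is installed:

```python
# Minimal install sanity check, mirroring the doc's verification step.
# Raises ImportError if the XPU custom ops are not available.
from fastdeploy.model_executor.ops.xpu import block_attn

print("XPU op loaded:", block_attn)
```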

docs/parameters.md

Lines changed: 38 additions & 7 deletions
@@ -33,9 +33,8 @@ When using FastDeploy to deploy models (including offline inference and service
 | ```long_prefill_token_threshold``` | `int` | When Chunked Prefill is enabled, requests with token count exceeding this value are considered long requests, default: max_model_len*0.04 |
 | ```static_decode_blocks``` | `int` | During inference, each request is forced to allocate corresponding number of blocks from Prefill's KVCache for Decode use, default: 2 |
 | ```reasoning_parser``` | `str` | Specify the reasoning parser to extract reasoning content from model output |
-| ```enable_static_graph_inference``` | `bool` | Whether to use static graph inference mode, default: False |
 | ```use_cudagraph``` | `bool` | Whether to use cuda graph, default: False |
-| ```max_capture_batch_size``` | `int` | When cuda graph is enabled, maximum batch size of captured cuda graph, default: 64 |
+| ```graph_optimization_config``` | `str` | Parameters related to graph optimization, default: '{"use_cudagraph":false, "graph_opt_level":0, "cudagraph_capture_sizes": null }' |
 | ```enable_custom_all_reduce``` | `bool` | Enable Custom all-reduce, default: False |
 | ```splitwise_role``` | `str` | Whether to enable splitwise inference, default value: mixed, supported parameters: ["mixed", "decode", "prefill"] |
 | ```innode_prefill_ports``` | `str` | Internal engine startup ports for prefill instances (only required for single-machine PD separation), default: None |
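Since `graph_optimization_config` is passed as a JSON string, here is a minimal illustrative parse of the default value shown above (not FastDeploy's own parsing code):

```python
import json

# Default value from the table above; parsed here only to show the nested structure.
default_cfg = '{"use_cudagraph": false, "graph_opt_level": 0, "cudagraph_capture_sizes": null}'
cfg = json.loads(default_cfg)

print(cfg["use_cudagraph"])            # False
print(cfg["graph_opt_level"])          # 0
print(cfg["cudagraph_capture_sizes"])  # None -> capture list is derived automatically
```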
@@ -72,20 +71,53 @@ When `enable_chunked_prefill` is enabled, the service processes long input seque
 To optimize scheduling priority for short requests, new `max_long_partial_prefills` and `long_prefill_token_threshold` parameter combination is added. The former limits the number of long requests in a single prefill batch, the latter defines the token threshold for long requests. The system will prioritize batch space for short requests, thereby reducing short request latency in mixed workload scenarios while maintaining stable throughput.
 
 ## 4. GraphOptimizationBackend related configuration parameters
+Currently, only user configuration of the following parameters is supported:
+- `use_cudagraph` : bool = False
+- `graph_optimization_config` : Dict[str, Any]
+  - `graph_opt_level`: int = 0
+  - `use_cudagraph`: bool = False
+  - `cudagraph_capture_sizes` : List[int] = None
 
-### Static graph inference related parameters
+CudaGraph can be enabled by setting `--use-cudagraph` or `--graph-optimization-config '{"use_cudagraph":true}'`. Setting it through both methods at the same time may cause conflicts.
+
+The `graph_opt_level` parameter within `--graph-optimization-config` is used to configure the graph optimization level, with the following available options:
+- `0`: Use dynamic compute graph (default)
+- `1`: Use static compute graph; during the initialization phase, the Paddle API is used to convert the dynamic graph into a static graph
+- `2`: Based on the static compute graph, use Paddle's compiler (CINN, Compiler Infrastructure for Neural Networks) to compile and optimize
+
+In general, static graphs have lower kernel launch overhead than dynamic graphs, and it is recommended to use static graphs.
+For adapted models, FastDeploy's CudaGraph **can support both dynamic and static graphs** simultaneously.
+
+When CudaGraph is enabled in the default configuration, a list of Batch Sizes that CudaGraph needs to capture is automatically set based on the `max_num_seqs` parameter. The logic for generating the list of Batch Sizes to capture is as follows:
+
+1. Generate a candidate list of Batch Sizes in the range [1, 1024].
+```
+# Batch Size [1, 2, 4, 8, 16, ... 120, 128]
+candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
+# Batch Size (128, 144, ... 240, 256]
+candidate_capture_sizes += [16 * i for i in range(9, 17)]
+# Batch Size (256, 288, ... 992, 1024]
+candidate_capture_sizes += [32 * i for i in range(17, 33)]
+```
+2. Crop the candidate list based on the user-set `max_num_seqs` to obtain a CudaGraph capture list in the range [1, `max_num_seqs`].
+
+Users can also customize the batch size list that CudaGraph needs to capture through the `cudagraph_capture_sizes` parameter in `--graph-optimization-config`:
+```
+--graph-optimization-config '{"cudagraph_capture_sizes": [1, 3, 5, 7, 9]}'
+```
 
-- When ```enable_static_graph_inference``` is enabled, dynamic-to-static graph conversion will be performed, using static graph for inference.
 
 ### CudaGraph related parameters
 
-For adapted models, FastDeploy's CudaGraph can support both dynamic and static graphs. Using CudaGraph incurs some additional memory overhead, divided into two categories in FastDeploy:
+Using CudaGraph incurs some additional memory overhead, divided into two categories in FastDeploy:
 * Additional input Buffer overhead
 * CudaGraph uses dedicated memory pool, thus holding some intermediate activation memory isolated from main framework
 
 FastDeploy initialization sequence first uses `gpu_memory_utilization` parameter to calculate available memory for `KVCache`, after initializing `KVCache` then uses remaining memory to initialize CudaGraph. Since CudaGraph is not enabled by default currently, using default startup parameters may encounter `Out of memory` errors, can try following solutions:
 * Lower `gpu_memory_utilization` value, reserve more memory for CudaGraph.
-* Lower `max_capture_batch_size` value, reduce CudaGraph memory usage, but also reduce CudaGraph usage during inference.
+* Lower `max_num_seqs` to decrease the maximum concurrency.
+* Customize the batch size list that CudaGraph needs to capture through `graph_optimization_config`, and reduce the number of captured graphs by using `cudagraph_capture_sizes`.
 
 - Before use, must ensure loaded model is properly decorated with ```@support_graph_optimization```.
 
@@ -118,4 +150,3 @@ FastDeploy initialization sequence first uses `gpu_memory_utilization` parameter
 ```
 - When ```use_cudagraph``` is enabled, currently only supports single-GPU inference, i.e. ```tensor_parallel_size``` set to 1.
 - When ```use_cudagraph``` is enabled, cannot enable ```enable_prefix_caching``` or ```enable_chunked_prefill```.
-- When ```use_cudagraph``` is enabled, batches with size ≤ ```max_capture_batch_size``` will be executed by CudaGraph, batches > ```max_capture_batch_size``` will be executed by original dynamic/static graph. To have all batch sizes executed by CudaGraph, ```max_capture_batch_size``` value should match ```max_num_seqs```. ```max_capture_batch_size``` > ```max_num_seqs``` will cause waste by capturing batches that won't be encountered during inference, occupying more time and memory.
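For reference, a short sketch of the capture-list logic described in the parameters documentation above: build the candidate Batch Size list, then crop it to [1, `max_num_seqs`]. Illustrative only, with an example `max_num_seqs` value; not FastDeploy's actual implementation.

```python
# Illustrative sketch of the two-step capture-list generation described above.
max_num_seqs = 256  # example value; in practice this is the engine's max concurrency setting

# Step 1: candidate Batch Size list covering [1, 1024] (same construction as the doc snippet)
candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
candidate_capture_sizes += [16 * i for i in range(9, 17)]
candidate_capture_sizes += [32 * i for i in range(17, 33)]

# Step 2: crop to [1, max_num_seqs]
cudagraph_capture_sizes = [bs for bs in candidate_capture_sizes if bs <= max_num_seqs]
print(cudagraph_capture_sizes)  # ends at 256 for this example
```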

docs/zh/get_started/installation/kunlunxin_xpu.md

Lines changed: 1 addition & 1 deletion
@@ -119,5 +119,5 @@ python -c "from fastdeploy.model_executor.ops.xpu import block_attn"
 
 If all the above steps execute successfully, FastDeploy is installed correctly.
 
-## How to deploy services on Kunlunxin XPU (昆仑新XPU)
+## How to deploy services on Kunlunxin XPU (昆仑芯 XPU)
 Refer to [**Supported Models and Service Deployment**](../../usage/kunlunxin_xpu_deployment.md) for the models supported on Kunlunxin XPU and how to deploy services.

docs/zh/parameters.md

Lines changed: 40 additions & 11 deletions
@@ -32,9 +32,8 @@
 | ```long_prefill_token_threshold``` | `int` | When Chunked Prefill is enabled, requests whose token count exceeds this value are treated as long requests, default: max_model_len*0.04 |
 | ```static_decode_blocks``` | `int` | During inference, each request is forced to allocate the corresponding number of blocks from the Prefill KVCache for Decode use, default: 2 |
 | ```reasoning_parser``` | `str` | Specify the reasoning parser to use, in order to extract reasoning content from model output |
-| ```enable_static_graph_inference``` | `bool` | Whether to use static graph inference mode, default: False |
 | ```use_cudagraph``` | `bool` | Whether to use cuda graph, default: False |
-| ```max_capture_batch_size``` | `int` | When cuda graph is enabled, maximum batch size of the captured cuda graph, default: 64 |
+| ```graph_optimization_config``` | `str` | Parameters related to graph optimization, default: '{"use_cudagraph":false, "graph_opt_level":0, "cudagraph_capture_sizes": null }' |
 | ```enable_custom_all_reduce``` | `bool` | Enable Custom all-reduce, default: False |
 | ```splitwise_role``` | `str` | Whether to enable splitwise inference, default: mixed, supported values: ["mixed", "decode", "prefill"] |
 | ```innode_prefill_ports``` | `str` | Internal engine startup ports for prefill instances (only required for single-machine PD separation), default: None |
@@ -70,22 +69,53 @@ During FastDeploy inference, GPU memory is occupied by the ```model weights```, the ```pre-allocated KVCache
 To optimize scheduling priority for short requests, the new `max_long_partial_prefills` and `long_prefill_token_threshold` parameter combination is added. The former limits the number of long requests in a single prefill batch, the latter defines the token threshold for long requests. The system prioritizes batch space for short requests, reducing short-request latency in mixed workload scenarios while keeping overall throughput stable.
 
 ## 4. GraphOptimizationBackend related configuration parameters
+Currently, only user configuration of the following parameters is supported:
+- `use_cudagraph` : bool = False
+- `graph_optimization_config` : Dict[str, Any]
+  - `graph_opt_level`: int = 0
+  - `use_cudagraph`: bool = False
+  - `cudagraph_capture_sizes` : List[int] = None
+
+CudaGraph can be enabled by setting `--use-cudagraph` or `--graph-optimization-config '{"use_cudagraph":true}'`.
+
+The `graph_opt_level` parameter in `--graph-optimization-config` configures the graph optimization level; the options are:
+- `0`: dynamic graph, the default
+- `1`: static graph; during the initialization phase, the Paddle API is used to convert the dynamic graph into a static graph
+- `2`: on top of the static graph, use the Paddle framework compiler (CINN, Compiler Infrastructure for Neural Networks) for compilation optimization
+
+In general, static graphs have lower kernel launch overhead than dynamic graphs, so static graphs are recommended.
+For adapted models, FastDeploy's CudaGraph **supports both dynamic and static graphs**.
+
+When CudaGraph is enabled with the default configuration, the list of Batch Sizes that CudaGraph needs to capture is set automatically based on the `max_num_seqs` parameter; the list is generated as follows:
+1. Generate a candidate list of Batch Sizes in the range [1, 1024].
+```
+# Batch Size [1, 2, 4, 8, 16, ... 120, 128]
+candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
+# Batch Size (128, 144, ... 240, 256]
+candidate_capture_sizes += [16 * i for i in range(9, 17)]
+# Batch Size (256, 288, ... 992, 1024]
+candidate_capture_sizes += [32 * i for i in range(17, 33)]
+```
+2. Crop the candidate list based on the user-set `max_num_seqs` to obtain a CudaGraph capture list in the range [1, `max_num_seqs`].
+
+Users can also customize the list of Batch Sizes to be captured by CudaGraph through the `cudagraph_capture_sizes` parameter in `--graph-optimization-config`:
+```
+--graph-optimization-config '{"cudagraph_capture_sizes": [1, 3, 5, 7, 9]}'
+```
 
-### Dynamic-to-static graph related parameters
-
-- When ```enable_static_graph_inference``` is enabled, dynamic-to-static graph conversion is performed and inference uses the static graph.
 
 ### CudaGraph related parameters
-
-For adapted models, FastDeploy's CudaGraph supports both dynamic and static graphs. Using CudaGraph incurs some additional GPU memory overhead, divided into the following two categories in FastDeploy:
+Using CudaGraph incurs some additional GPU memory overhead, divided into the following two categories in FastDeploy:
 * Additional input Buffer overhead
 * CudaGraph uses a dedicated memory pool and therefore holds some intermediate activation memory isolated from the main framework
 
-FastDeploy initializes by first using the `gpu_memory_utilization` parameter to compute the GPU memory available to `KVCache`; only after `KVCache` is initialized is the remaining memory used to initialize CudaGraph. Since CudaGraph is not enabled by default yet, using the default startup parameters may hit `Out of memory` errors, which can be addressed in the following two ways
+FastDeploy initializes by first using the `gpu_memory_utilization` parameter to compute the GPU memory available to `KVCache`; only after `KVCache` is initialized is the remaining memory used to initialize CudaGraph. Since CudaGraph is not enabled by default yet, using the default startup parameters may hit `Out Of Memory` errors, which can be addressed in the following three ways
 * Lower the value of `gpu_memory_utilization` to reserve more memory for CudaGraph.
-* Lower the value of `max_capture_batch_size` to reduce CudaGraph memory usage, which also reduces CudaGraph utilization during inference.
+* Lower the value of `max_num_seqs` to decrease the maximum concurrency.
+* Customize the list of Batch Sizes that CudaGraph needs to capture via `cudagraph_capture_sizes` in `graph_optimization_config`, reducing the number of captured graphs.
+
 
-- Before use, make sure the loaded model is properly decorated with ```@support_graph_optimization```.
+Before using CudaGraph, make sure the loaded model is properly decorated with ```@support_graph_optimization```.
 
 ```python
 # 1. import the decorator
@@ -116,4 +146,3 @@ FastDeploy initializes by first using the `gpu_memory_utilization` parameter to compute
 ```
 - When ```use_cudagraph``` is enabled, only single-GPU inference is supported for now, i.e. ```tensor_parallel_size``` set to 1.
 - When ```use_cudagraph``` is enabled, ```enable_prefix_caching``` and ```enable_chunked_prefill``` cannot be enabled.
-- When ```use_cudagraph``` is enabled, batches with size ≤ ```max_capture_batch_size``` run forward computation via CudaGraph, and larger batches fall back to the original dynamic/static graph. To have all batch sizes run via CudaGraph, ```max_capture_batch_size``` should match ```max_num_seqs```; setting ```max_capture_batch_size``` greater than ```max_num_seqs``` is wasteful, capturing batches that never occur during inference and taking more time and memory.

fastdeploy/engine/request.py

Lines changed: 0 additions & 4 deletions
@@ -310,10 +310,6 @@ class RequestOutput:
         None if decoder-only.
         num_cached_tokens: The number of tokens with prefix cache hit.
     """
-    __slots__ = (
-        'request_id', 'prompt', 'prompt_token_ids', 'outputs',
-        'finished', 'metrics', 'num_cached_tokens', 'error_code', 'error_msg'
-    )
 
     def __init__(
         self,

fastdeploy/entrypoints/openai/api_server.py

Lines changed: 2 additions & 6 deletions
@@ -24,7 +24,7 @@
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 from prometheus_client import CONTENT_TYPE_LATEST
-from fastdeploy.metrics.trace_util import inject_to_metadata
+from fastdeploy.metrics.trace_util import inject_to_metadata, instrument
 
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.engine import LLMEngine
@@ -141,6 +141,7 @@ async def lifespan(app: FastAPI):
 
 
 app = FastAPI(lifespan=lifespan)
+instrument(app)
 
 
 # TODO: pass real engine values; get status via pid
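`instrument` is imported from `fastdeploy.metrics.trace_util` and applied to the app above; its body is not part of this diff. A plausible sketch of such a helper, assuming it delegates to OpenTelemetry's FastAPI instrumentation (an assumption, not FastDeploy's actual code):

```python
# Hypothetical sketch only; the real trace_util.instrument is not shown in this diff.
from fastapi import FastAPI
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor


def instrument(app: FastAPI) -> None:
    """Attach OpenTelemetry tracing middleware to the FastAPI app."""
    FastAPIInstrumentor.instrument_app(app)
```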
@@ -397,11 +398,6 @@ def launch_controller_server():
     """Controller server running the sub thread"""
     if args.controller_port < 0:
         return
-
-    if not is_port_available(args.host, args.controller_port):
-        raise Exception(
-            f"The parameter `controller_port`:{args.controller_port} is already in use."
-        )
 
     if not is_port_available(args.host, args.controller_port):
         raise Exception(

fastdeploy/envs.py

Lines changed: 24 additions & 0 deletions
@@ -105,6 +105,30 @@
     # Whether to use aggregate send.
     "FD_USE_AGGREGATE_SEND":
     lambda: bool(int(os.getenv("FD_USE_AGGREGATE_SEND", "0"))),
+
+    # Whether to enable tracing.
+    "TRACES_ENABLE":
+    lambda: os.getenv("TRACES_ENABLE", "false"),
+
+    # Set the trace service name.
+    "FD_SERVICE_NAME":
+    lambda: os.getenv("FD_SERVICE_NAME", "FastDeploy"),
+
+    # Set the trace host name.
+    "FD_HOST_NAME":
+    lambda: os.getenv("FD_HOST_NAME", "localhost"),
+
+    # Set the trace exporter.
+    "TRACES_EXPORTER":
+    lambda: os.getenv("TRACES_EXPORTER", "console"),
+
+    # Set the trace exporter OTLP endpoint.
+    "EXPORTER_OTLP_ENDPOINT":
+    lambda: os.getenv("EXPORTER_OTLP_ENDPOINT"),
+
+    # Set the trace exporter OTLP headers.
+    "EXPORTER_OTLP_HEADERS":
+    lambda: os.getenv("EXPORTER_OTLP_HEADERS"),
 }
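The new tracing-related environment variables above are read at startup. A minimal sketch of setting them before launching the server; values other than the documented defaults (such as the OTLP endpoint) are examples only, and how each value is consumed depends on `trace_util`:

```python
import os

# Illustrative configuration using the environment variables added in this diff.
os.environ["TRACES_ENABLE"] = "true"                             # default is "false"
os.environ["FD_SERVICE_NAME"] = "FastDeploy"                     # service name reported in traces
os.environ["FD_HOST_NAME"] = "localhost"                         # default is "localhost"
os.environ["TRACES_EXPORTER"] = "console"                        # default exporter; other names depend on trace_util
os.environ["EXPORTER_OTLP_ENDPOINT"] = "http://localhost:4317"   # example endpoint, used only by an OTLP exporter
```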