From 7dfaf924906cfc179d046f926ff2220456ed0ea3 Mon Sep 17 00:00:00 2001
From: Rui Qiao <ruisearch42@gmail.com>
Date: Thu, 24 Apr 2025 16:09:59 -0700
Subject: [PATCH 01/12] [Doc][Serve][LLM] Add doc for deploying DeepSeek

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
---
 doc/source/serve/llm/serving-llms.rst | 49 ++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/doc/source/serve/llm/serving-llms.rst b/doc/source/serve/llm/serving-llms.rst
index d5ee81ebb3d64..5d21417587759 100644
--- a/doc/source/serve/llm/serving-llms.rst
+++ b/doc/source/serve/llm/serving-llms.rst
@@ -60,7 +60,6 @@ Quickstart Examples
 -------------------
 
 
-
 Deployment through :class:`LLMRouter <ray.serve.llm.LLMRouter>`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -249,6 +248,54 @@ For deploying multiple models, you can pass a list of :class:`LLMConfig <ray.ser
             serve.run(llm_app, blocking=True)
 
 
+Example: Deploying DeepSeek
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following example shows how you can deploy DeepSeek R1 or V3:
+
+.. tab-set::
+
+    .. tab-item:: Builder Pattern
+        :sync: builder
+
+        .. code-block:: python
+
+            from ray import serve
+            from ray.serve.llm import LLMConfig, LLMRouter, LLMServer
+
+            llm_config = LLMConfig(
+                model_loading_config=dict(
+                    model_id="deepseek",
+                    # Change to model download path
+                    model_source="/path/to/the/model",
+                ),
+                deployment_config=dict(autoscaling_config=dict(
+                    min_replicas=1,
+                    max_replicas=1,
+                )),
+                # Change to the accelerator type of the node
+                accelerator_type="H100",
+                runtime_env=dict(env_vars=dict(VLLM_USE_V1="1")),
+                # Customize engine arguments as needed (e.g. vLLM engine kwargs)
+                engine_kwargs=dict(
+                    tensor_parallel_size=8,
+                    pipeline_parallel_size=2,
+                    gpu_memory_utilization=0.92,
+                    dtype="auto",
+                    max_num_seqs=40,
+                    max_model_len=16384,
+                    enable_chunked_prefill=True,
+                    enable_prefix_caching=True,
+                    trust_remote_code=True,
+                ),
+            )
+
+            # Deploy the application
+            deployment = LLMServer.as_deployment(
+                llm_config.get_serve_options(name_prefix="vLLM:")).bind(llm_config)
+            llm_app = LLMRouter.as_deployment().bind([deployment])
+            serve.run(llm_app)
+
 Production Deployment
 ---------------------
 

From a92b1b867b2a9332488643a1a9951b7a5e555b8b Mon Sep 17 00:00:00 2001
From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Date: Thu, 24 Apr 2025 16:32:05 -0700
Subject: [PATCH 02/12] Update doc/source/serve/llm/serving-llms.rst

Co-authored-by: Gene Der Su <gdsu@ucdavis.edu>
Signed-off-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
---
 doc/source/serve/llm/serving-llms.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/serve/llm/serving-llms.rst b/doc/source/serve/llm/serving-llms.rst
index 5d21417587759..dff9c17cb9268 100644
--- a/doc/source/serve/llm/serving-llms.rst
+++ b/doc/source/serve/llm/serving-llms.rst
@@ -275,7 +275,7 @@ The following example shows how you can deploy DeepSeek R1 or V3:
                 )),
                 # Change to the accelerator type of the node
                 accelerator_type="H100",
-                runtime_env=dict(env_vars=dict(VLLM_USE_V1="1")),
+                runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
                 # Customize engine arguments as needed (e.g. vLLM engine kwargs)
                 engine_kwargs=dict(
                     tensor_parallel_size=8,

From 8e4f6752eabe7003869aa5767f280fc3e2495788 Mon Sep 17 00:00:00 2001
From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Date: Thu, 24 Apr 2025 16:32:46 -0700
Subject: [PATCH 03/12] Update doc/source/serve/llm/serving-llms.rst

Co-authored-by: Gene Der Su <gdsu@ucdavis.edu>
Signed-off-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
---
 doc/source/serve/llm/serving-llms.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/serve/llm/serving-llms.rst b/doc/source/serve/llm/serving-llms.rst
index dff9c17cb9268..f42c3869829f1 100644
--- a/doc/source/serve/llm/serving-llms.rst
+++ b/doc/source/serve/llm/serving-llms.rst
@@ -293,7 +293,7 @@ The following example shows how you can deploy DeepSeek R1 or V3:
             # Deploy the application
             deployment = LLMServer.as_deployment(
                 llm_config.get_serve_options(name_prefix="vLLM:")).bind(llm_config)
-            llm_app = LLMRouter.as_deployment().bind([deployment])
+            llm_app = build_openai_app({"llm_configs": [llm_config]})
             serve.run(llm_app)
 
 Production Deployment

From 3d6ca1ccaa4b0902f9f6af057b5476d4e660fc61 Mon Sep 17 00:00:00 2001
From: Rui Qiao <ruisearch42@gmail.com>
Date: Thu, 24 Apr 2025 18:42:28 -0700
Subject: [PATCH 04/12] [Doc][Serve][LLM] Move DeepSeek example into a dedicated tutorial

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
---
 .../serve/doc_code/tutorial_deepseek.py       | 37 ++++++++
 doc/source/serve/examples.yml                 |  8 ++
 doc/source/serve/llm/serving-llms.rst         | 49 +---------
 doc/source/serve/tutorials/serve-deepseek.md  | 90 +++++++++++++++++++
 4 files changed, 136 insertions(+), 48 deletions(-)
 create mode 100644 doc/source/serve/doc_code/tutorial_deepseek.py
 create mode 100644 doc/source/serve/tutorials/serve-deepseek.md

diff --git a/doc/source/serve/doc_code/tutorial_deepseek.py b/doc/source/serve/doc_code/tutorial_deepseek.py
new file mode 100644
index 0000000000000..221e7cb05e838
--- /dev/null
+++ b/doc/source/serve/doc_code/tutorial_deepseek.py
@@ -0,0 +1,37 @@
+# __deepseek_setup_start__
+
+from ray import serve
+from ray.serve.llm import LLMConfig, build_openai_app
+
+llm_config = LLMConfig(
+    model_loading_config=dict(
+        model_id="deepseek",
+        model_source="deepseek-ai/DeepSeek-R1",
+    ),
+    deployment_config=dict(autoscaling_config=dict(
+        min_replicas=1,
+        max_replicas=1,
+    )),
+    # Change to the accelerator type of the node
+    accelerator_type="H100",
+    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
+    # Customize engine arguments as needed (e.g. vLLM engine kwargs)
+    engine_kwargs=dict(
+        tensor_parallel_size=8,
+        pipeline_parallel_size=2,
+        gpu_memory_utilization=0.92,
+        dtype="auto",
+        max_num_seqs=40,
+        max_model_len=16384,
+        enable_chunked_prefill=True,
+        enable_prefix_caching=True,
+        trust_remote_code=True,
+    ),
+)
+
+# Deploy the application
+llm_app = build_openai_app({"llm_configs": [llm_config]})
+serve.run(llm_app)
+
+# __deepseek_setup_end__
+
diff --git a/doc/source/serve/examples.yml b/doc/source/serve/examples.yml
index b14d5ccbcc54d..886ebdd10c8b9 100644
--- a/doc/source/serve/examples.yml
+++ b/doc/source/serve/examples.yml
@@ -74,6 +74,14 @@ examples:
       - natural language processing
     link: tutorials/streaming
     related_technology: ml applications
+  - title: Serve DeepSeek
+    skill_level: beginner
+    use_cases:
+      - generative ai
+      - large language models
+      - natural language processing
+    link: tutorials/serve-deepseek
+    related_technology: ml applications
   - title: Serving models with Triton Server in Ray Serve
     skill_level: intermediate
     use_cases:
diff --git a/doc/source/serve/llm/serving-llms.rst b/doc/source/serve/llm/serving-llms.rst
index f42c3869829f1..d5ee81ebb3d64 100644
--- a/doc/source/serve/llm/serving-llms.rst
+++ b/doc/source/serve/llm/serving-llms.rst
@@ -60,6 +60,7 @@ Quickstart Examples
 -------------------
 
 
+
 Deployment through :class:`LLMRouter <ray.serve.llm.LLMRouter>`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -248,54 +249,6 @@ For deploying multiple models, you can pass a list of :class:`LLMConfig <ray.ser
             serve.run(llm_app, blocking=True)
 
 
-Example: Deploying DeepSeek
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The following example shows how you can deploy DeepSeek R1 or V3:
-
-.. tab-set::
-
-    .. tab-item:: Builder Pattern
-        :sync: builder
-
-        .. code-block:: python
-
-            from ray import serve
-            from ray.serve.llm import LLMConfig, LLMRouter, LLMServer
-
-            llm_config = LLMConfig(
-                model_loading_config=dict(
-                    model_id="deepseek",
-                    # Change to model download path
-                    model_source="/path/to/the/model",
-                ),
-                deployment_config=dict(autoscaling_config=dict(
-                    min_replicas=1,
-                    max_replicas=1,
-                )),
-                # Change to the accelerator type of the node
-                accelerator_type="H100",
-                runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
-                # Customize engine arguments as needed (e.g. vLLM engine kwargs)
-                engine_kwargs=dict(
-                    tensor_parallel_size=8,
-                    pipeline_parallel_size=2,
-                    gpu_memory_utilization=0.92,
-                    dtype="auto",
-                    max_num_seqs=40,
-                    max_model_len=16384,
-                    enable_chunked_prefill=True,
-                    enable_prefix_caching=True,
-                    trust_remote_code=True,
-                ),
-            )
-
-            # Deploy the application
-            deployment = LLMServer.as_deployment(
-                llm_config.get_serve_options(name_prefix="vLLM:")).bind(llm_config)
-            llm_app = build_openai_app({"llm_configs": [llm_config]})
-            serve.run(llm_app)
-
 Production Deployment
 ---------------------
 
diff --git a/doc/source/serve/tutorials/serve-deepseek.md b/doc/source/serve/tutorials/serve-deepseek.md
new file mode 100644
index 0000000000000..0a5687743157d
--- /dev/null
+++ b/doc/source/serve/tutorials/serve-deepseek.md
@@ -0,0 +1,90 @@
+---
+orphan: true
+---
+
+(serve-deepseek-tutorial)=
+
+# Serve DeepSeek
+
+This example shows how to deploy DeepSeek R1 or V3 with Ray Serve LLM.
+
+## Installation
+
+To run this example, install the following:
+
+```bash
+pip install "ray[serve]"
+```
+
+## Code Structure
+
+Save the following code to a file named `deepseek.py`:
+
+```{literalinclude} ../doc_code/tutorial_deepseek.py
+:language: python
+:start-after: __deepseek_setup_start__
+:end-before: __deepseek_setup_end__
+```
+
+## Configuration
+
+You may need to adjust configurations in the above code based on your setup, specifically:
+
+* `accelerator_type`: for NVIDIA GPUs, DeepSeek requires Hopper GPUs or later ones. Therefore, you can specify `H200`, `H100`, `H20` etc. based on your hardware.
+* `tensor_parallel_size` and `pipeline_parallel_size`: DeepSeek requires a single node of 8xH200, or two nodes of 8xH100. The typical setup of using H100 is setting `tensor_parallel_size` to `8` and `pipeline_parallel_size` to `2` as in the code example. When using H200, you can set `tensor_parallel_size` to `8` and leave out the `pipeline_parallel_size` parameter (it is `1` by default).
+* `model_source`: although you could specify a HuggingFace model ID like `deepseek-ai/DeepSeek-R1` in the code example, it is recommended to pre-download the model because it is huge. You can download it to the local file system (e.g., `/path/to/downloaded/model`) or to a remote object store (e.g., `s3://my-bucket/path/to/downloaded/model`), and specify it as `model_source`. It is recommended to download it to a remote object store, using :ref:`Ray model caching utilities <_model_cache>`. Note that if you have two nodes and would like to download to local file system, you need to download the model to the same path on both nodes.
+
+## Deployment
+
+Deploy the service with `python3 deepseek.py`.
+
+## Testing the Service
+
+You can query the deployed model using the following request and get the corresponding response.
+
+.. tab-set::
+
+    .. tab-item:: Request
+        :sync: request
+
+        .. code-block:: bash
+
+            curl -X POST http://localhost:8000/v1/chat/completions \
+                -H "Content-Type: application/json" \
+                -H "Authorization: Bearer fake-key" \
+                -d '{
+                    "model": "deepseek",
+                    "messages": [{"role": "user", "content": "Hello!"}]
+                    }'
+
+    .. tab-item:: Response
+        :sync: response
+
+        .. code-block:: bash
+
+            {"id":"deepseek-68b5d5c5-fd34-42fc-be26-0a36f8457ffe","object":"chat.completion","created":1743646776,"model":"deepseek","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"Hello! How can I assist you today? 😊","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":6,"total_tokens":18,"completion_tokens":12,"prompt_tokens_details":null},"prompt_logprobs":null}
+
+Another example request and response:
+
+.. tab-set::
+
+    .. tab-item:: Request
+        :sync: request
+
+        .. code-block:: bash
+
+            curl -X POST http://localhost:8000/v1/chat/completions \
+                -H "Content-Type: application/json" \
+                -H "Authorization: Bearer fake-key" \
+                -d '{
+                    "model": "deepseek",
+                    "messages": [{"role": "user", "content": "The future of AI is"}]
+                    }'
+
+
+    .. tab-item:: Response
+        :sync: response
+
+        .. code-block:: bash
+
+            {"id":"deepseek-b81ff9be-3ffc-4811-80ff-225006eff27c","object":"chat.completion","created":1743646860,"model":"deepseek","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"The future of AI is multifaceted and holds immense potential across various domains. Here are some key aspects that are likely to shape its trajectory:\n\n1. **Advanced Automation**: AI will continue to automate routine and complex tasks across industries, increasing efficiency and productivity. This includes everything from manufacturing and logistics to healthcare and finance.\n\n2. **Enhanced Decision-Making**: AI systems will provide deeper insights and predictive analytics, aiding in better decision-making processes for businesses, governments, and individuals.\n\n3. **Personalization**: AI will drive more personalized experiences in areas such as shopping, education, and entertainment, tailoring services and products to individual preferences and behaviors.\n\n4. **Healthcare Revolution**: AI will play a significant role in diagnosing diseases, personalizing treatment plans, and even predicting health issues before they become critical, potentially transforming the healthcare industry.\n\n5. **Ethical and Responsible AI**: As AI becomes more integrated into society, there will be a growing focus on developing ethical guidelines and frameworks to ensure AI is used responsibly and transparently, addressing issues like bias, privacy, and security.\n\n6. **Human-AI Collaboration**: The future will see more seamless collaboration between humans and AI, with AI augmenting human capabilities rather than replacing them. This includes areas like creative industries, where AI can assist in generating ideas and content.\n\n7. **AI in Education**: AI will personalize learning experiences, adapt to individual learning styles, and provide real-time feedback, making education more accessible and effective.\n\n8. 
**Robotics and Autonomous Systems**: Advances in AI will lead to more sophisticated robots and autonomous systems, impacting industries like transportation (e.g., self-driving cars), agriculture, and home automation.\n\n9. **AI and Sustainability**: AI will play a crucial role in addressing environmental challenges by optimizing resource use, improving energy efficiency, and aiding in climate modeling and conservation efforts.\n\n10. **Regulation and Governance**: As AI technologies advance, there will be increased efforts to establish international standards and regulations to govern their development and use, ensuring they benefit society as a whole.\n\n11. **Quantum Computing and AI**: The integration of quantum computing with AI could revolutionize data processing capabilities, enabling the solving of complex problems that are currently intractable.\n\n12. **AI in Creative Fields**: AI will continue to make strides in creative domains such as music, art, and literature, collaborating with human creators to push the boundaries of innovation and expression.\n\nOverall, the future of AI is both promising and challenging, requiring careful consideration of its societal impact and the ethical implications of its widespread adoption.","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":9,"total_tokens":518,"completion_tokens":509,"prompt_tokens_details":null},"prompt_logprobs":null}
\ No newline at end of file

From ebd533e4a53ae18bd77290a3cdce8359e633b1c9 Mon Sep 17 00:00:00 2001
From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Date: Fri, 25 Apr 2025 11:03:05 -0700
Subject: [PATCH 05/12] Update doc/source/serve/tutorials/serve-deepseek.md

Co-authored-by: Gene Der Su <gdsu@ucdavis.edu>
Signed-off-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
---
 doc/source/serve/tutorials/serve-deepseek.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/serve/tutorials/serve-deepseek.md b/doc/source/serve/tutorials/serve-deepseek.md
index 0a5687743157d..74fdf9230baca 100644
--- a/doc/source/serve/tutorials/serve-deepseek.md
+++ b/doc/source/serve/tutorials/serve-deepseek.md
@@ -13,7 +13,7 @@ This example shows how to deploy DeepSeek R1 or V3 with Ray Serve LLM.
 To run this example, install the following:
 
 ```bash
-pip install "ray[serve]"
+pip install "ray[llm]"
 ```
 
 ## Code Structure

From a3fbb45bd82cb9e8cc6522edfb3eabcf8eca5328 Mon Sep 17 00:00:00 2001
From: Rui Qiao <ruisearch42@gmail.com>
Date: Fri, 25 Apr 2025 12:03:38 -0700
Subject: [PATCH 06/12] [Doc][Serve][LLM] Convert DeepSeek tutorial to MyST syntax and wrap long lines

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
---
 doc/source/serve/tutorials/serve-deepseek.md | 95 +++++++++++---------
 1 file changed, 51 insertions(+), 44 deletions(-)

diff --git a/doc/source/serve/tutorials/serve-deepseek.md b/doc/source/serve/tutorials/serve-deepseek.md
index 74fdf9230baca..4f7fcaae9c593 100644
--- a/doc/source/serve/tutorials/serve-deepseek.md
+++ b/doc/source/serve/tutorials/serve-deepseek.md
@@ -30,9 +30,21 @@ Save the following code to a file named `deepseek.py`:
 
 You may need to adjust configurations in the above code based on your setup, specifically:
 
-* `accelerator_type`: for NVIDIA GPUs, DeepSeek requires Hopper GPUs or later ones. Therefore, you can specify `H200`, `H100`, `H20` etc. based on your hardware.
-* `tensor_parallel_size` and `pipeline_parallel_size`: DeepSeek requires a single node of 8xH200, or two nodes of 8xH100. The typical setup of using H100 is setting `tensor_parallel_size` to `8` and `pipeline_parallel_size` to `2` as in the code example. When using H200, you can set `tensor_parallel_size` to `8` and leave out the `pipeline_parallel_size` parameter (it is `1` by default).
-* `model_source`: although you could specify a HuggingFace model ID like `deepseek-ai/DeepSeek-R1` in the code example, it is recommended to pre-download the model because it is huge. You can download it to the local file system (e.g., `/path/to/downloaded/model`) or to a remote object store (e.g., `s3://my-bucket/path/to/downloaded/model`), and specify it as `model_source`. It is recommended to download it to a remote object store, using :ref:`Ray model caching utilities <_model_cache>`. Note that if you have two nodes and would like to download to local file system, you need to download the model to the same path on both nodes.
+* `accelerator_type`: for NVIDIA GPUs, DeepSeek requires Hopper GPUs or later ones. 
+Therefore, you can specify `H200`, `H100`, `H20` etc. based on your hardware.
+* `tensor_parallel_size` and `pipeline_parallel_size`: DeepSeek requires a single node of 8xH200,
+or two nodes of 8xH100. The typical setup of using H100 is setting `tensor_parallel_size` to `8`
+and `pipeline_parallel_size` to `2` as in the code example. When using H200, you can set
+`tensor_parallel_size` to `8` and leave out the `pipeline_parallel_size` parameter
+(it is `1` by default).
+* `model_source`: although you could specify a HuggingFace model ID like `deepseek-ai/DeepSeek-R1` 
+in the code example, it is recommended to pre-download the model because it is huge.
+You can download it to the local file system (e.g., `/path/to/downloaded/model`)
+or to a remote object store (e.g., `s3://my-bucket/path/to/downloaded/model`),
+and specify it as `model_source`. It is recommended to download it to a remote object store,
+using {ref}`Ray model caching utilities <model_cache>`. 
+Note that if you have two nodes and would like to download to local file system,
+you need to download the model to the same path on both nodes.
 
 ## Deployment
 
@@ -42,49 +54,44 @@ Deploy the service with `python3 deepseek.py`.
 
 You can query the deployed model using the following request and get the corresponding response.
 
-.. tab-set::
-
-    .. tab-item:: Request
-        :sync: request
-
-        .. code-block:: bash
-
-            curl -X POST http://localhost:8000/v1/chat/completions \
-                -H "Content-Type: application/json" \
-                -H "Authorization: Bearer fake-key" \
-                -d '{
-                    "model": "deepseek",
-                    "messages": [{"role": "user", "content": "Hello!"}]
-                    }'
-
-    .. tab-item:: Response
-        :sync: response
-
-        .. code-block:: bash
+::::{tab-set}
+:::{tab-item} Request
+```bash
+curl -X POST http://localhost:8000/v1/chat/completions \
+     -H "Content-Type: application/json" \
+     -H "Authorization: Bearer fake-key" \
+     -d '{
+           "model": "deepseek",
+           "messages": [{"role": "user", "content": "Hello!"}]
+         }'
+```
+:::
 
-            {"id":"deepseek-68b5d5c5-fd34-42fc-be26-0a36f8457ffe","object":"chat.completion","created":1743646776,"model":"deepseek","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"Hello! How can I assist you today? 😊","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":6,"total_tokens":18,"completion_tokens":12,"prompt_tokens_details":null},"prompt_logprobs":null}
+:::{tab-item} Response
+```bash
+{"id":"deepseek-68b5d5c5-fd34-42fc-be26-0a36f8457ffe","object":"chat.completion","created":1743646776,"model":"deepseek","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"Hello! How can I assist you today? 😊","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":6,"total_tokens":18,"completion_tokens":12,"prompt_tokens_details":null},"prompt_logprobs":null}
+```
+:::
+::::
 
 Another example request and response:
 
-.. tab-set::
-
-    .. tab-item:: Request
-        :sync: request
-
-        .. code-block:: bash
-
-            curl -X POST http://localhost:8000/v1/chat/completions \
-                -H "Content-Type: application/json" \
-                -H "Authorization: Bearer fake-key" \
-                -d '{
-                    "model": "deepseek",
-                    "messages": [{"role": "user", "content": "The future of AI is"}]
-                    }'
-
-
-    .. tab-item:: Response
-        :sync: response
-
-        .. code-block:: bash
+::::{tab-set}
+:::{tab-item} Request
+```bash
+curl -X POST http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer fake-key" \
+    -d '{
+        "model": "deepseek",
+        "messages": [{"role": "user", "content": "The future of AI is"}]
+        }'
+```
+:::
 
-            {"id":"deepseek-b81ff9be-3ffc-4811-80ff-225006eff27c","object":"chat.completion","created":1743646860,"model":"deepseek","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"The future of AI is multifaceted and holds immense potential across various domains. Here are some key aspects that are likely to shape its trajectory:\n\n1. **Advanced Automation**: AI will continue to automate routine and complex tasks across industries, increasing efficiency and productivity. This includes everything from manufacturing and logistics to healthcare and finance.\n\n2. **Enhanced Decision-Making**: AI systems will provide deeper insights and predictive analytics, aiding in better decision-making processes for businesses, governments, and individuals.\n\n3. **Personalization**: AI will drive more personalized experiences in areas such as shopping, education, and entertainment, tailoring services and products to individual preferences and behaviors.\n\n4. **Healthcare Revolution**: AI will play a significant role in diagnosing diseases, personalizing treatment plans, and even predicting health issues before they become critical, potentially transforming the healthcare industry.\n\n5. **Ethical and Responsible AI**: As AI becomes more integrated into society, there will be a growing focus on developing ethical guidelines and frameworks to ensure AI is used responsibly and transparently, addressing issues like bias, privacy, and security.\n\n6. **Human-AI Collaboration**: The future will see more seamless collaboration between humans and AI, with AI augmenting human capabilities rather than replacing them. This includes areas like creative industries, where AI can assist in generating ideas and content.\n\n7. **AI in Education**: AI will personalize learning experiences, adapt to individual learning styles, and provide real-time feedback, making education more accessible and effective.\n\n8. 
**Robotics and Autonomous Systems**: Advances in AI will lead to more sophisticated robots and autonomous systems, impacting industries like transportation (e.g., self-driving cars), agriculture, and home automation.\n\n9. **AI and Sustainability**: AI will play a crucial role in addressing environmental challenges by optimizing resource use, improving energy efficiency, and aiding in climate modeling and conservation efforts.\n\n10. **Regulation and Governance**: As AI technologies advance, there will be increased efforts to establish international standards and regulations to govern their development and use, ensuring they benefit society as a whole.\n\n11. **Quantum Computing and AI**: The integration of quantum computing with AI could revolutionize data processing capabilities, enabling the solving of complex problems that are currently intractable.\n\n12. **AI in Creative Fields**: AI will continue to make strides in creative domains such as music, art, and literature, collaborating with human creators to push the boundaries of innovation and expression.\n\nOverall, the future of AI is both promising and challenging, requiring careful consideration of its societal impact and the ethical implications of its widespread adoption.","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":9,"total_tokens":518,"completion_tokens":509,"prompt_tokens_details":null},"prompt_logprobs":null}
\ No newline at end of file
+:::{tab-item} Response
+```bash
+{"id":"deepseek-b81ff9be-3ffc-4811-80ff-225006eff27c","object":"chat.completion","created":1743646860,"model":"deepseek","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"The future of AI is multifaceted and holds immense potential across various domains. Here are some key aspects that are likely to shape its trajectory:\n\n1. **Advanced Automation**: AI will continue to automate routine and complex tasks across industries, increasing efficiency and productivity. This includes everything from manufacturing and logistics to healthcare and finance.\n\n2. **Enhanced Decision-Making**: AI systems will provide deeper insights and predictive analytics, aiding in better decision-making processes for businesses, governments, and individuals.\n\n3. **Personalization**: AI will drive more personalized experiences in areas such as shopping, education, and entertainment, tailoring services and products to individual preferences and behaviors.\n\n4. **Healthcare Revolution**: AI will play a significant role in diagnosing diseases, personalizing treatment plans, and even predicting health issues before they become critical, potentially transforming the healthcare industry.\n\n5. **Ethical and Responsible AI**: As AI becomes more integrated into society, there will be a growing focus on developing ethical guidelines and frameworks to ensure AI is used responsibly and transparently, addressing issues like bias, privacy, and security.\n\n6. **Human-AI Collaboration**: The future will see more seamless collaboration between humans and AI, with AI augmenting human capabilities rather than replacing them. This includes areas like creative industries, where AI can assist in generating ideas and content.\n\n7. **AI in Education**: AI will personalize learning experiences, adapt to individual learning styles, and provide real-time feedback, making education more accessible and effective.\n\n8. 
**Robotics and Autonomous Systems**: Advances in AI will lead to more sophisticated robots and autonomous systems, impacting industries like transportation (e.g., self-driving cars), agriculture, and home automation.\n\n9. **AI and Sustainability**: AI will play a crucial role in addressing environmental challenges by optimizing resource use, improving energy efficiency, and aiding in climate modeling and conservation efforts.\n\n10. **Regulation and Governance**: As AI technologies advance, there will be increased efforts to establish international standards and regulations to govern their development and use, ensuring they benefit society as a whole.\n\n11. **Quantum Computing and AI**: The integration of quantum computing with AI could revolutionize data processing capabilities, enabling the solving of complex problems that are currently intractable.\n\n12. **AI in Creative Fields**: AI will continue to make strides in creative domains such as music, art, and literature, collaborating with human creators to push the boundaries of innovation and expression.\n\nOverall, the future of AI is both promising and challenging, requiring careful consideration of its societal impact and the ethical implications of its widespread adoption.","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":9,"total_tokens":518,"completion_tokens":509,"prompt_tokens_details":null},"prompt_logprobs":null}
+```
+:::
+::::

From c53e587f3059d96948ac4a3eab11e8b6bf313139 Mon Sep 17 00:00:00 2001
From: Rui Qiao <ruisearch42@gmail.com>
Date: Fri, 25 Apr 2025 13:20:29 -0700
Subject: [PATCH 07/12] [Doc][Serve][LLM] Reformat deployment_config in DeepSeek tutorial code

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
---
 doc/source/serve/doc_code/tutorial_deepseek.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/doc/source/serve/doc_code/tutorial_deepseek.py b/doc/source/serve/doc_code/tutorial_deepseek.py
index 221e7cb05e838..4af9e2129b9ac 100644
--- a/doc/source/serve/doc_code/tutorial_deepseek.py
+++ b/doc/source/serve/doc_code/tutorial_deepseek.py
@@ -8,10 +8,12 @@
         model_id="deepseek",
         model_source="deepseek-ai/DeepSeek-R1",
     ),
-    deployment_config=dict(autoscaling_config=dict(
-        min_replicas=1,
-        max_replicas=1,
-    )),
+    deployment_config=dict(
+        autoscaling_config=dict(
+            min_replicas=1,
+            max_replicas=1,
+        )
+    ),
     # Change to the accelerator type of the node
     accelerator_type="H100",
     runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
@@ -34,4 +36,3 @@
 serve.run(llm_app)
 
 # __deepseek_setup_end__
-

From 807fff05fc832dc0aa15e9484371cfdcc9c53091 Mon Sep 17 00:00:00 2001
From: Rui Qiao <ruisearch42@gmail.com>
Date: Fri, 25 Apr 2025 13:46:17 -0700
Subject: [PATCH 08/12] Add production deployment YAML and LLM Applications category to DeepSeek docs

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
---
 doc/source/custom_directives.py              |  1 +
 doc/source/serve/examples.yml                | 16 +++----
 doc/source/serve/llm/serving-llms.rst        |  3 +-
 doc/source/serve/tutorials/serve-deepseek.md | 46 +++++++++++++++++---
 4 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/doc/source/custom_directives.py b/doc/source/custom_directives.py
index 19059d40153ef..eb8f8c5bf19b1 100644
--- a/doc/source/custom_directives.py
+++ b/doc/source/custom_directives.py
@@ -506,6 +506,7 @@ def key(cls: type) -> str:
 
 class RelatedTechnology(ExampleEnum):
     ML_APPLICATIONS = "ML Applications"
+    LLM_APPLICATIONS = "LLM Applications"
     INTEGRATIONS = "Integrations"
     AI_ACCELERATORS = "AI Accelerators"
 
diff --git a/doc/source/serve/examples.yml b/doc/source/serve/examples.yml
index 886ebdd10c8b9..6ea0d026b611d 100644
--- a/doc/source/serve/examples.yml
+++ b/doc/source/serve/examples.yml
@@ -28,6 +28,14 @@ examples:
       - computer vision
     link: tutorials/object-detection
     related_technology: ml applications
+ - title: Serve DeepSeek
+    skill_level: beginner
+    use_cases:
+      - generative ai
+      - large language models
+      - natural language processing
+    link: tutorials/serve-deepseek
+    related_technology: llm applications
   - title: Serve an Inference Model on AWS NeuronCores Using FastAPI
     skill_level: intermediate
     use_cases:
@@ -74,14 +82,6 @@ examples:
       - natural language processing
     link: tutorials/streaming
     related_technology: ml applications
-  - title: Serve DeepSeek
-    skill_level: beginner
-    use_cases:
-      - generative ai
-      - large language models
-      - natural language processing
-    link: tutorials/serve-deepseek
-    related_technology: ml applications
   - title: Serving models with Triton Server in Ray Serve
     skill_level: intermediate
     use_cases:
diff --git a/doc/source/serve/llm/serving-llms.rst b/doc/source/serve/llm/serving-llms.rst
index d5ee81ebb3d64..6b1d8485c718e 100644
--- a/doc/source/serve/llm/serving-llms.rst
+++ b/doc/source/serve/llm/serving-llms.rst
@@ -59,8 +59,6 @@ The :class:`LLMConfig <ray.serve.llm.LLMConfig>` class specifies model details s
 Quickstart Examples
 -------------------
 
-
-
 Deployment through :class:`LLMRouter <ray.serve.llm.LLMRouter>`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -248,6 +246,7 @@ For deploying multiple models, you can pass a list of :class:`LLMConfig <ray.ser
             llm_app = LLMRouter.as_deployment().bind([deployment1, deployment2])
             serve.run(llm_app, blocking=True)
 
+See also :ref:`serve-deepseek-tutorial` for an example of deploying DeepSeek models.
 
 Production Deployment
 ---------------------
diff --git a/doc/source/serve/tutorials/serve-deepseek.md b/doc/source/serve/tutorials/serve-deepseek.md
index 4f7fcaae9c593..8284321674c32 100644
--- a/doc/source/serve/tutorials/serve-deepseek.md
+++ b/doc/source/serve/tutorials/serve-deepseek.md
@@ -16,9 +16,12 @@ To run this example, install the following:
 pip install "ray[llm]"
 ```
 
-## Code Structure
+## Deployment
+
+### Quick Deployment
 
-Save the following code to a file named `deepseek.py`:
+For quick deployment and testing, save the following code to a file named `deepseek.py`,
+and run `python3 deepseek.py`.
 
 ```{literalinclude} ../doc_code/tutorial_deepseek.py
 :language: python
@@ -26,6 +29,42 @@ Save the following code to a file named `deepseek.py`:
 :end-before: __deepseek_setup_end__
 ```
 
+### Production Deployment
+
+For production deployments, save the following to a YAML file named `deepseek.yaml`
+and run `serve run deepseek.yaml`.
+
+```yaml
+# config.yaml
+applications:
+- args:
+    llm_configs:
+      - model_loading_config:
+          model_id: "deepseek"
+          model_source: "deepseek-ai/DeepSeek-R1"
+        accelerator_type: "H100"
+        deployment_config:
+          autoscaling_config:
+            min_replicas: 1
+            max_replicas: 1
+        runtime_env:
+          env_vars:
+            VLLM_USE_V1: "1"
+        engine_kwargs:
+          tensor_parallel_size: 8
+          pipeline_parallel_size: 2
+          gpu_memory_utilization: 0.92
+          dtype: "auto"
+          max_num_seqs: 40
+          max_model_len: 16384
+          enable_chunked_prefill: true
+          enable_prefix_caching: true
+          trust_remote_code: true
+  import_path: ray.serve.llm:build_openai_app
+  name: llm_app
+  route_prefix: "/"
+```
+
 ## Configuration
 
 You may need to adjust configurations in the above code based on your setup, specifically:
@@ -46,9 +85,6 @@ using {ref}`Ray model caching utilities <model_cache>`.
 Note that if you have two nodes and would like to download to local file system,
 you need to download the model to the same path on both nodes.
 
-## Deployment
-
-Deploy the service with `python3 deepseek.py`.
 
 ## Testing the Service
 

From 536c521ddd470ed201a7935e4946310c176f7e04 Mon Sep 17 00:00:00 2001
From: Rui Qiao <ruisearch42@gmail.com>
Date: Fri, 25 Apr 2025 14:12:21 -0700
Subject: [PATCH 09/12] Remove stale config.yaml comment from DeepSeek YAML example

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
---
 doc/source/serve/tutorials/serve-deepseek.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doc/source/serve/tutorials/serve-deepseek.md b/doc/source/serve/tutorials/serve-deepseek.md
index 8284321674c32..69550ef6a4443 100644
--- a/doc/source/serve/tutorials/serve-deepseek.md
+++ b/doc/source/serve/tutorials/serve-deepseek.md
@@ -35,7 +35,6 @@ For production deployments, save the following to a YAML file named `deepseek.ya
 and run `serve run deepseek.yaml`.
 
 ```yaml
-# config.yaml
 applications:
 - args:
     llm_configs:

From c00f8b475af3dd1ef722ff478bd923eb73177fe1 Mon Sep 17 00:00:00 2001
From: Rui Qiao <ruisearch42@gmail.com>
Date: Fri, 25 Apr 2025 14:25:28 -0700
Subject: [PATCH 10/12] Fix indentation and position of Serve DeepSeek entry in examples.yml

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
---
 doc/source/serve/examples.yml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/doc/source/serve/examples.yml b/doc/source/serve/examples.yml
index 6ea0d026b611d..bd830b01e8d8f 100644
--- a/doc/source/serve/examples.yml
+++ b/doc/source/serve/examples.yml
@@ -28,14 +28,6 @@ examples:
       - computer vision
     link: tutorials/object-detection
     related_technology: ml applications
- - title: Serve DeepSeek
-    skill_level: beginner
-    use_cases:
-      - generative ai
-      - large language models
-      - natural language processing
-    link: tutorials/serve-deepseek
-    related_technology: llm applications
   - title: Serve an Inference Model on AWS NeuronCores Using FastAPI
     skill_level: intermediate
     use_cases:
@@ -74,6 +66,14 @@ examples:
       - natural language processing
     link: tutorials/batch
     related_technology: integrations
+  - title: Serve DeepSeek
+    skill_level: beginner
+    use_cases:
+      - generative ai
+      - large language models
+      - natural language processing
+    link: tutorials/serve-deepseek
+    related_technology: llm applications
   - title: Serve a Chatbot with Request and Response Streaming
     skill_level: intermediate
     use_cases:

From 38d979d1e7512ecf0e4c6e2f0002b9bb6700f6e0 Mon Sep 17 00:00:00 2001
From: Rui Qiao <ruisearch42@gmail.com>
Date: Fri, 25 Apr 2025 14:27:16 -0700
Subject: [PATCH 11/12] Use dict literals instead of dict() calls in DeepSeek tutorial code

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
---
 .../serve/doc_code/tutorial_deepseek.py       | 42 +++++++++----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/doc/source/serve/doc_code/tutorial_deepseek.py b/doc/source/serve/doc_code/tutorial_deepseek.py
index 4af9e2129b9ac..146bf239adc47 100644
--- a/doc/source/serve/doc_code/tutorial_deepseek.py
+++ b/doc/source/serve/doc_code/tutorial_deepseek.py
@@ -4,31 +4,31 @@
 from ray.serve.llm import LLMConfig, build_openai_app
 
 llm_config = LLMConfig(
-    model_loading_config=dict(
-        model_id="deepseek",
-        model_source="deepseek-ai/DeepSeek-R1",
-    ),
-    deployment_config=dict(
-        autoscaling_config=dict(
-            min_replicas=1,
-            max_replicas=1,
-        )
-    ),
+    model_loading_config={
+        "model_id": "deepseek",
+        "model_source": "deepseek-ai/DeepSeek-R1",
+    },
+    deployment_config={
+        "autoscaling_config": {
+            "min_replicas": 1,
+            "max_replicas": 1,
+        }
+    },
     # Change to the accelerator type of the node
     accelerator_type="H100",
     runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
     # Customize engine arguments as needed (e.g. vLLM engine kwargs)
-    engine_kwargs=dict(
-        tensor_parallel_size=8,
-        pipeline_parallel_size=2,
-        gpu_memory_utilization=0.92,
-        dtype="auto",
-        max_num_seqs=40,
-        max_model_len=16384,
-        enable_chunked_prefill=True,
-        enable_prefix_caching=True,
-        trust_remote_code=True,
-    ),
+    engine_kwargs={
+        "tensor_parallel_size": 8,
+        "pipeline_parallel_size": 2,
+        "gpu_memory_utilization": 0.92,
+        "dtype": "auto",
+        "max_num_seqs": 40,
+        "max_model_len": 16384,
+        "enable_chunked_prefill": True,
+        "enable_prefix_caching": True,
+        "trust_remote_code": True,
+    },
 )
 
 # Deploy the application

From 23336d8574fd5f879271fa975e2354f2fbaad76a Mon Sep 17 00:00:00 2001
From: Rui Qiao <ruisearch42@gmail.com>
Date: Fri, 25 Apr 2025 15:51:35 -0700
Subject: [PATCH 12/12] Inline DeepSeek tutorial code into serve-deepseek.md

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
---
 .../serve/doc_code/tutorial_deepseek.py       | 38 ------------------
 doc/source/serve/tutorials/serve-deepseek.md  | 39 +++++++++++++++++--
 2 files changed, 35 insertions(+), 42 deletions(-)
 delete mode 100644 doc/source/serve/doc_code/tutorial_deepseek.py

diff --git a/doc/source/serve/doc_code/tutorial_deepseek.py b/doc/source/serve/doc_code/tutorial_deepseek.py
deleted file mode 100644
index 146bf239adc47..0000000000000
--- a/doc/source/serve/doc_code/tutorial_deepseek.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# __deepseek_setup_start__
-
-from ray import serve
-from ray.serve.llm import LLMConfig, build_openai_app
-
-llm_config = LLMConfig(
-    model_loading_config={
-        "model_id": "deepseek",
-        "model_source": "deepseek-ai/DeepSeek-R1",
-    },
-    deployment_config={
-        "autoscaling_config": {
-            "min_replicas": 1,
-            "max_replicas": 1,
-        }
-    },
-    # Change to the accelerator type of the node
-    accelerator_type="H100",
-    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
-    # Customize engine arguments as needed (e.g. vLLM engine kwargs)
-    engine_kwargs={
-        "tensor_parallel_size": 8,
-        "pipeline_parallel_size": 2,
-        "gpu_memory_utilization": 0.92,
-        "dtype": "auto",
-        "max_num_seqs": 40,
-        "max_model_len": 16384,
-        "enable_chunked_prefill": True,
-        "enable_prefix_caching": True,
-        "trust_remote_code": True,
-    },
-)
-
-# Deploy the application
-llm_app = build_openai_app({"llm_configs": [llm_config]})
-serve.run(llm_app)
-
-# __deepseek_setup_end__
diff --git a/doc/source/serve/tutorials/serve-deepseek.md b/doc/source/serve/tutorials/serve-deepseek.md
index 69550ef6a4443..11b759ef4cb72 100644
--- a/doc/source/serve/tutorials/serve-deepseek.md
+++ b/doc/source/serve/tutorials/serve-deepseek.md
@@ -23,10 +23,41 @@ pip install "ray[llm]"
 For quick deployment and testing, save the following code to a file named `deepseek.py`,
 and run `python3 deepseek.py`.
 
-```{literalinclude} ../doc_code/tutorial_deepseek.py
-:language: python
-:start-after: __deepseek_setup_start__
-:end-before: __deepseek_setup_end__
+```python
+from ray import serve
+from ray.serve.llm import LLMConfig, build_openai_app
+
+llm_config = LLMConfig(
+    model_loading_config={
+        "model_id": "deepseek",
+        "model_source": "deepseek-ai/DeepSeek-R1",
+    },
+    deployment_config={
+        "autoscaling_config": {
+            "min_replicas": 1,
+            "max_replicas": 1,
+        }
+    },
+    # Change to the accelerator type of the node
+    accelerator_type="H100",
+    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
+    # Customize engine arguments as needed (e.g. vLLM engine kwargs)
+    engine_kwargs={
+        "tensor_parallel_size": 8,
+        "pipeline_parallel_size": 2,
+        "gpu_memory_utilization": 0.92,
+        "dtype": "auto",
+        "max_num_seqs": 40,
+        "max_model_len": 16384,
+        "enable_chunked_prefill": True,
+        "enable_prefix_caching": True,
+        "trust_remote_code": True,
+    },
+)
+
+# Deploy the application
+llm_app = build_openai_app({"llm_configs": [llm_config]})
+serve.run(llm_app)
 ```
 
 ### Production Deployment