Commit fa1162e

Adaptations to vllm-project#6484 and merge remote-tracking branch 'github/main' into continous_batching_mamba_from_scratch
2 parents: 906379d + f519902

113 files changed: +4134 / -895 lines


.buildkite/run-cpu-test-ppc64le.sh

Lines changed: 7 additions & 1 deletion
@@ -18,7 +18,13 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
 # Run basic model test
 docker exec cpu-test bash -c "
   pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+  pytest -v -s tests/models -m \"not vlm\" \
+    --ignore=tests/models/test_embedding.py \
+    --ignore=tests/models/test_oot_registration.py \
+    --ignore=tests/models/test_registry.py \
+    --ignore=tests/models/test_jamba.py \
+    --ignore=tests/models/test_mamba.py \
+    --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
 
 # online inference
 docker exec cpu-test bash -c "

.buildkite/run-cpu-test.sh

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ docker exec cpu-test bash -c "
   pytest -v -s tests/models/decoder_only/language \
     --ignore=tests/models/test_fp8.py \
     --ignore=tests/models/decoder_only/language/test_jamba.py \
+    --ignore=tests/models/decoder_only/language/test_mamba.py \
     --ignore=tests/models/decoder_only/language/test_granitemoe.py \
     --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported

.buildkite/test-pipeline.yaml

Lines changed: 14 additions & 9 deletions
@@ -121,7 +121,9 @@ steps:
   - vllm/core/
   - tests/distributed
   - tests/spec_decode/e2e/test_integration_dist_tp4
+  - tests/compile
   commands:
+  - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

@@ -231,14 +233,16 @@ steps:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_full_graph_smoke.py
+  - pytest -v -s compile/test_basic_correctness.py
 
-- label: "PyTorch Fullgraph Test" # 18min
-  source_file_dependencies:
-  - vllm/
-  - tests/compile
-  commands:
-  - pytest -v -s compile/test_full_graph.py
+# TODO: re-write in comparison tests, and fix symbolic shape
+# for quantization ops.
+# - label: "PyTorch Fullgraph Test" # 18min
+#   source_file_dependencies:
+#   - vllm/
+#   - tests/compile
+#   commands:
+#   - pytest -v -s compile/test_full_graph.py
 
 - label: Kernels Test %N # 1h each
   mirror_hardwares: [amd]

@@ -343,10 +347,11 @@ steps:
   - pytest -v -s models/encoder_decoder/language
   - pytest -v -s models/encoder_decoder/vision_language
 
+# This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
-  #mirror_hardwares: [amd]
   optional: true
   commands:
+  - echo 'Testing custom models...'
   # PR authors can temporarily add commands below to test individual models
   # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
   # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

@@ -394,7 +399,7 @@ steps:
   - tests/distributed/
   - vllm/compilation
   commands:
-  - pytest -v -s ./compile/test_full_graph_multi_gpu.py
+  - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
   - TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus

.github/CODEOWNERS

Lines changed: 16 additions & 5 deletions
@@ -1,19 +1,30 @@
 # See https://help.github.com/articles/about-codeowners/
 # for more info about CODEOWNERS file
 
+# This lists cover the "core" components of vLLM that require careful review
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+CMakeLists.txt @tlrmchlsmth @WoosukKwon
+
+# Test ownership
 /tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
+/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96
-/tests/prefix_caching @comaniac @KuntaiDu
+/tests/prefix_caching @comaniac @KuntaiDu
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
-/tests/kernels @tlrmchlsmth @WoosukKwon
+/tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/quantization @mgoin @robertgshaw2-neuralmagic
-/.buildkite/lm-eval-harness @mgoin @simon-mo
+/.buildkite/lm-eval-harness @mgoin @simon-mo
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/multi_step @alexm-neuralmagic @SolitaryThinker @comaniac
+/tests/multi_step @alexm-neuralmagic @comaniac
 /tests/weight_loading @mgoin @youkaichao
 /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac

.github/workflows/actionlint.yml

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Checkout"
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
         with:
           fetch-depth: 0

.github/workflows/add_label_automerge.yml

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Add label
-        uses: actions/github-script@v6
+        uses: actions/github-script@v7
        with:
          script: |
            github.rest.issues.addLabels({

.github/workflows/clang-format.yml

Lines changed: 2 additions & 2 deletions
@@ -17,9 +17,9 @@ jobs:
       matrix:
         python-version: ["3.11"]
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies

.github/workflows/mypy.yaml

Lines changed: 2 additions & 2 deletions
@@ -17,9 +17,9 @@ jobs:
       matrix:
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies

.github/workflows/publish.yml

Lines changed: 4 additions & 4 deletions
@@ -21,7 +21,7 @@ jobs:
       upload_url: ${{ steps.create_release.outputs.upload_url }}
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Extract branch info
         shell: bash

@@ -30,7 +30,7 @@
 
       - name: Create Release
         id: create_release
-        uses: "actions/github-script@v6"
+        uses: "actions/github-script@v7"
        env:
          RELEASE_TAG: ${{ env.release_tag }}
        with:

@@ -54,7 +54,7 @@
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Setup ccache
         uses: hendrikmuhs/ccache-action@v1.2

@@ -68,7 +68,7 @@
           bash -x .github/workflows/scripts/env.sh
 
       - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

.github/workflows/reminder_comment.yml

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Remind to run full CI on PR
-        uses: actions/github-script@v6
+        uses: actions/github-script@v7
        with:
          script: |
            github.rest.issues.createComment({

.github/workflows/ruff.yml

Lines changed: 2 additions & 2 deletions
@@ -17,9 +17,9 @@ jobs:
       matrix:
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies

.github/workflows/yapf.yml

Lines changed: 2 additions & 2 deletions
@@ -16,9 +16,9 @@ jobs:
       matrix:
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies

CMakeLists.txt

Lines changed: 20 additions & 15 deletions
@@ -144,27 +144,32 @@ else()
 endif()
 
 
-#
-# For cuda we want to be able to control which architectures we compile for on
-# a per-file basis in order to cut down on compile time. So here we extract
-# the set of architectures we want to compile for and remove the from the
-# CMAKE_CUDA_FLAGS so that they are not applied globally.
-#
 if(VLLM_GPU_LANG STREQUAL "CUDA")
+  #
+  # For cuda we want to be able to control which architectures we compile for on
+  # a per-file basis in order to cut down on compile time. So here we extract
+  # the set of architectures we want to compile for and remove the from the
+  # CMAKE_CUDA_FLAGS so that they are not applied globally.
+  #
   clear_cuda_arches(CUDA_ARCH_FLAGS)
   extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
   message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
+  # Filter the target architectures by the supported supported archs
+  # since for some files we will build for all CUDA_ARCHS.
+  cuda_archs_loose_intersection(CUDA_ARCHS
+    "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
+  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
+else()
+  #
+  # For other GPU targets override the GPU architectures detected by cmake/torch
+  # and filter them by the supported versions for the current language.
+  # The final set of arches is stored in `VLLM_GPU_ARCHES`.
+  #
+  override_gpu_arches(VLLM_GPU_ARCHES
+    ${VLLM_GPU_LANG}
+    "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
 endif()
 
-#
-# Override the GPU architectures detected by cmake/torch and filter them by
-# the supported versions for the current language.
-# The final set of arches is stored in `VLLM_GPU_ARCHES`.
-#
-override_gpu_arches(VLLM_GPU_ARCHES
-  ${VLLM_GPU_LANG}
-  "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
-
 #
 # Query torch for additional GPU compilation flags for the given
 # `VLLM_GPU_LANG`.

Dockerfile.cpu

Lines changed: 2 additions & 1 deletion
@@ -26,7 +26,8 @@ RUN pip install intel_extension_for_pytorch==2.4.0
 
 WORKDIR /workspace
 
-ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
     pip install --upgrade pip && \
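
A note on the ARG/ENV pair introduced above: ARG declares a build-time variable (overridable with docker build --build-arg PIP_EXTRA_INDEX_URL=<url>) whose default is the previous PyTorch CPU index, and the following ENV line copies that value into the image environment so pip sees PIP_EXTRA_INDEX_URL both in the later RUN steps and at runtime. Behaviour is unchanged when no override is supplied, but CI or users can now point the build at a different wheel index.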

benchmarks/benchmark_serving.py

Lines changed: 4 additions & 4 deletions
@@ -176,9 +176,9 @@ def sample_sonnet_requests(
     # Sample the rest of lines per request.
     sampled_requests: List[Tuple[str, int, int]] = []
     for _ in range(num_requests):
-        sampled_lines = "".join(
-            prefix_lines +
-            random.sample(poem_lines, num_input_lines - num_prefix_lines))
+        num_lines_needed = num_input_lines - num_prefix_lines
+        sampled_lines = "".join(prefix_lines +
+                                random.choices(poem_lines, k=num_lines_needed))
 
         prompt = f"{base_prompt}{sampled_lines}"
         message = [

@@ -536,7 +536,7 @@ def process_one_metric(
         # E.g., "Time to First Token"
         metric_header: str,
     ):
-        # This function print and add statistics of the specified
+        # This function prints and adds statistics of the specified
         # metric.
         if metric_attribute_name not in selected_percentile_metrics:
             return
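
The sampling change in the first hunk above swaps random.sample, which draws without replacement and raises ValueError as soon as more sonnet lines are requested than the file contains, for random.choices, which draws with replacement and accepts any k at the cost of possibly repeated lines. A minimal sketch of the difference, not part of the commit, with poem_lines and the counts as made-up placeholder values:

import random

poem_lines = [f"line {i}\n" for i in range(5)]  # placeholder pool of five "sonnet" lines
num_lines_needed = 8                            # deliberately larger than the pool

# Without replacement: fails once the request exceeds the pool size.
try:
    random.sample(poem_lines, num_lines_needed)
except ValueError as err:
    print("random.sample:", err)

# With replacement: always succeeds, though individual lines may repeat.
sampled_lines = "".join(random.choices(poem_lines, k=num_lines_needed))
print("random.choices drew", num_lines_needed, "lines")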

csrc/quantization/machete/machete_pytorch.cu

Lines changed: 4 additions & 0 deletions
@@ -89,6 +89,10 @@ torch::Tensor prepack_B(torch::Tensor const& B,
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
   m.impl("machete_prepack_B", &prepack_B);
   m.impl("machete_gemm", &gemm);
+}
+
+// use CatchAll since supported_schedules has no tensor arguments
+TORCH_LIBRARY_IMPL(TORCH_EXTENSION_NAME, CatchAll, m) {
   m.impl("machete_supported_schedules", &supported_schedules);
 }