diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 6314afd65234..42ae2a8a71a7 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -106,19 +106,3 @@ steps: - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" env: DOCKER_BUILDKIT: "1" - - - block: "Build Neuron release image" - key: block-neuron-release-image-build - depends_on: ~ - - - label: "Build and publish Neuron release image" - depends_on: block-neuron-release-image-build - agents: - queue: neuron-postmerge - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ." - - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest" - - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)" - env: - DOCKER_BUILDKIT: "1" diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh deleted file mode 100644 index a397457c8326..000000000000 --- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -# This script build the Neuron docker image and run the API server inside the container. -# It serves a sanity check for compilation and basic model usage. -set -e -set -v - -image_name="neuron/vllm-ci" -container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" - -HF_CACHE="$(realpath ~)/huggingface" -mkdir -p "${HF_CACHE}" -HF_MOUNT="/root/.cache/huggingface" -HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN) - -NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache" -mkdir -p "${NEURON_COMPILE_CACHE_URL}" -NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache" - -# Try building the docker image -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws - -# prune old image and containers to save disk space, and only once a day -# by using a timestamp file in tmp. -if [ -f /tmp/neuron-docker-build-timestamp ]; then - last_build=$(cat /tmp/neuron-docker-build-timestamp) - current_time=$(date +%s) - if [ $((current_time - last_build)) -gt 86400 ]; then - # Remove dangling images (those that are not tagged and not used by any container) - docker image prune -f - # Remove unused volumes / force the system prune for old images as well. - docker volume prune -f && docker system prune -f - echo "$current_time" > /tmp/neuron-docker-build-timestamp - fi -else - date "+%s" > /tmp/neuron-docker-build-timestamp -fi - -docker build -t "${image_name}" -f docker/Dockerfile.neuron . 
- -# Setup cleanup -remove_docker_container() { - docker image rm -f "${image_name}" || true; -} -trap remove_docker_container EXIT - -# Run the image -docker run --rm -it --device=/dev/neuron0 --network bridge \ - -v "${HF_CACHE}:${HF_MOUNT}" \ - -e "HF_HOME=${HF_MOUNT}" \ - -e "HF_TOKEN=${HF_TOKEN}" \ - -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ - -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ - --name "${container_name}" \ - ${image_name} \ - /bin/bash -c " - set -e; # Exit on first error - python3 /workspace/vllm/examples/offline_inference/neuron.py; - python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys; - for f in /workspace/vllm/tests/neuron/2_core/*.py; do - echo \"Running test file: \$f\"; - python3 -m pytest \$f -v --capture=tee-sys; - done - " \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index 82fd22b845f0..fb3cccbb4a9c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,7 +2,6 @@ include LICENSE include requirements/common.txt include requirements/cuda.txt include requirements/rocm.txt -include requirements/neuron.txt include requirements/cpu.txt include CMakeLists.txt diff --git a/docker/Dockerfile.neuron b/docker/Dockerfile.neuron deleted file mode 100644 index 8bc23554718d..000000000000 --- a/docker/Dockerfile.neuron +++ /dev/null @@ -1,56 +0,0 @@ -# default base image -# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx -ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04" - -FROM $BASE_IMAGE - -RUN echo "Base image is $BASE_IMAGE" - -# Install some basic utilities -RUN apt-get update && \ - apt-get install -y \ - git \ - python3 \ - python3-pip \ - ffmpeg libsm6 libxext6 libgl1 - -### Mount Point ### -# When launching the container, mount the code directory to /workspace -ARG APP_MOUNT=/workspace -VOLUME [ ${APP_MOUNT} ] -WORKDIR ${APP_MOUNT}/vllm - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity -RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U -RUN python3 -m pip install pytest - -# uninstall transformers-neuronx package explicitly to avoid version conflict -RUN python3 -m pip uninstall -y transformers-neuronx - -COPY . . -ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi - -RUN python3 -m pip install -U \ - 'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ - -r requirements/neuron.txt - -ENV VLLM_TARGET_DEVICE neuron -RUN --mount=type=bind,source=.git,target=.git \ - pip install --no-build-isolation -v -e . 
- -# install development dependencies (for testing) -RUN python3 -m pip install -e tests/vllm_test_utils - -# install transformers-neuronx package as an optional dependencies (for V0) -# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict -RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps - -RUN python3 -m pip install sentencepiece transformers==4.48.0 -U - -# overwrite entrypoint to run bash script -RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py - -CMD ["/bin/bash"] diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py deleted file mode 100644 index 7826629a36d0..000000000000 --- a/examples/offline_inference/neuron.py +++ /dev/null @@ -1,49 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from vllm import LLM, SamplingParams - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - -def main(): - # Create an LLM. - llm = LLM( - model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - max_num_seqs=8, - # The max_model_len and block_size arguments are required to be same as - # max sequence length when targeting neuron device. - # Currently, this is a known limitation in continuous batching support - # in transformers-neuronx. - # TODO(liangfu): Support paged-attention in transformers-neuronx. - max_model_len=1024, - block_size=1024, - # ruff: noqa: E501 - # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, - # or explicitly assigned. - device="neuron", - tensor_parallel_size=2, - ) - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. - print("-" * 50) - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") - print("-" * 50) - - -if __name__ == "__main__": - main() diff --git a/examples/offline_inference/neuron_eagle.py b/examples/offline_inference/neuron_eagle.py deleted file mode 100644 index 0b2070c8e253..000000000000 --- a/examples/offline_inference/neuron_eagle.py +++ /dev/null @@ -1,61 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This example shows how to run offline inference with an EAGLE speculative -decoding model on neuron. To use EAGLE speculative decoding, you must use -a draft model that is specifically fine-tuned for EAGLE speculation. -Additionally, to use EAGLE with NxD Inference, the draft model must include -the LM head weights from the target model. These weights are shared between -the draft and target model. -""" - -from vllm import LLM, SamplingParams - -# Sample prompts. -prompts = [ - "What is annapurna labs?", -] - - -def main(): - # Create a sampling params object. - sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True) - - # Create an LLM. 
- llm = LLM( - model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct", - speculative_config={ - "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft", - "num_speculative_tokens": 5, - "max_model_len": 2048, - }, - max_num_seqs=4, - # The max_model_len and block_size arguments are required to be same as - # max sequence length when targeting neuron device. - # Currently, this is a known limitation in continuous batching support - # in neuronx-distributed-inference. - max_model_len=2048, - block_size=2048, - # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, - # or explicitly assigned. - device="neuron", - tensor_parallel_size=32, - override_neuron_config={ - "enable_eagle_speculation": True, - "enable_fused_speculation": True, - }, - ) - - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}") - - -if __name__ == "__main__": - main() diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py deleted file mode 100644 index c0ecfac50899..000000000000 --- a/examples/offline_inference/neuron_int8_quantization.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os - -from vllm import LLM, SamplingParams - -# creates XLA hlo graphs for all the context length buckets. -os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048" -# creates XLA hlo graphs for all the token gen buckets. -os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048" -# Quantizes neuron model weight to int8 , -# The default config for quantization is int8 dtype. -os.environ["NEURON_QUANT_DTYPE"] = "s8" - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - -def main(): - # Create an LLM. - llm = LLM( - model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - max_num_seqs=8, - # The max_model_len and block_size arguments are required to be same as - # max sequence length when targeting neuron device. - # Currently, this is a known limitation in continuous batching support - # in transformers-neuronx. - # TODO(liangfu): Support paged-attention in transformers-neuronx. - max_model_len=2048, - block_size=2048, - # ruff: noqa: E501 - # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, - # or explicitly assigned. - device="neuron", - quantization="neuron_quant", - override_neuron_config={ - "cast_logits_dtype": "bfloat16", - }, - tensor_parallel_size=2, - ) - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. 
- print("-" * 50) - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") - print("-" * 50) - - -if __name__ == "__main__": - main() diff --git a/examples/offline_inference/neuron_multimodal.py b/examples/offline_inference/neuron_multimodal.py deleted file mode 100644 index 26f7505f2fa5..000000000000 --- a/examples/offline_inference/neuron_multimodal.py +++ /dev/null @@ -1,110 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import requests -import torch -from neuronx_distributed_inference.models.mllama.utils import add_instruct -from PIL import Image - -from vllm import LLM, SamplingParams, TextPrompt - - -def get_image(image_url): - image = Image.open(requests.get(image_url, stream=True).raw) - return image - - -# Model Inputs -PROMPTS = [ - "What is in this image? Tell me a story", - "What is the recipe of mayonnaise in two sentences?", - "Describe this image", - "What is the capital of Italy famous for?", -] -IMAGES = [ - get_image( - "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500" - ), - None, - get_image( - "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500" - ), - None, -] -SAMPLING_PARAMS = [ - dict(top_k=1, temperature=1.0, top_p=1.0, max_tokens=16) - for _ in range(len(PROMPTS)) -] - - -def get_VLLM_mllama_model_inputs(prompt, single_image, sampling_params): - # Prepare all inputs for mllama generation, including: - # 1. put text prompt into instruct chat template - # 2. compose single text and single image prompt into Vllm's prompt class - # 3. prepare sampling parameters - input_image = single_image - has_image = torch.tensor([1]) - if isinstance(single_image, torch.Tensor) and single_image.numel() == 0: - has_image = torch.tensor([0]) - - instruct_prompt = add_instruct(prompt, has_image) - inputs = TextPrompt(prompt=instruct_prompt) - - if input_image is not None: - inputs["multi_modal_data"] = {"image": input_image} - - sampling_params = SamplingParams(**sampling_params) - return inputs, sampling_params - - -def print_outputs(outputs): - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - -def main(): - assert ( - len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS) - ), f"""Text, image prompts and sampling parameters should have the - same batch size; but got {len(PROMPTS)}, {len(IMAGES)}, - and {len(SAMPLING_PARAMS)}""" - - # Create an LLM. 
- llm = LLM( - model="meta-llama/Llama-3.2-11B-Vision-Instruct", - max_num_seqs=1, - max_model_len=4096, - block_size=4096, - device="neuron", - tensor_parallel_size=32, - override_neuron_config={ - "sequence_parallel_enabled": False, - "skip_warmup": True, - "save_sharded_checkpoint": True, - "on_device_sampling_config": { - "global_topk": 1, - "dynamic": False, - "deterministic": False, - }, - }, - ) - - batched_inputs = [] - batched_sample_params = [] - for pmpt, img, params in zip(PROMPTS, IMAGES, SAMPLING_PARAMS): - inputs, sampling_params = get_VLLM_mllama_model_inputs(pmpt, img, params) - # test batch-size = 1 - outputs = llm.generate(inputs, sampling_params) - print_outputs(outputs) - batched_inputs.append(inputs) - batched_sample_params.append(sampling_params) - - # test batch-size = 4 - outputs = llm.generate(batched_inputs, batched_sample_params) - print_outputs(outputs) - - -if __name__ == "__main__": - main() diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py deleted file mode 100644 index 2ef69f29863d..000000000000 --- a/examples/offline_inference/neuron_speculation.py +++ /dev/null @@ -1,65 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This example shows how to run offline inference with a speculative -decoding model on neuron. -""" - -import os - -from vllm import LLM, SamplingParams - -# Sample prompts. -prompts = [ - "Hello, I am a language model and I can help", - "The president of the United States is", - "The capital of France is", -] - - -def config_buckets(): - """Configure context length and token gen buckets.""" - # creates XLA hlo graphs for all the context length buckets. - os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048" - # creates XLA hlo graphs for all the token gen buckets. - os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048" - - -def initialize_model(): - """Create an LLM with speculative decoding.""" - return LLM( - model="openlm-research/open_llama_7b", - speculative_config={ - "model": "openlm-research/open_llama_3b", - "num_speculative_tokens": 4, - "max_model_len": 2048, - }, - max_num_seqs=4, - max_model_len=2048, - block_size=2048, - use_v2_block_manager=True, - device="neuron", - tensor_parallel_size=32, - ) - - -def process_requests(model: LLM, sampling_params: SamplingParams): - """Generate texts from prompts and print them.""" - outputs = model.generate(prompts, sampling_params) - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - -def main(): - """Main function that sets up the model and processes prompts.""" - config_buckets() - model = initialize_model() - # Create a sampling params object. 
- sampling_params = SamplingParams(max_tokens=100, top_k=1) - process_requests(model, sampling_params) - - -if __name__ == "__main__": - main() diff --git a/requirements/neuron.txt b/requirements/neuron.txt deleted file mode 100644 index 7df478eddde3..000000000000 --- a/requirements/neuron.txt +++ /dev/null @@ -1,9 +0,0 @@ -# Common dependencies --r common.txt - -# Dependencies for Neuron devices -packaging>=24.2 -setuptools>=77.0.3,<80.0.0 -torch-neuronx >= 2.5.0 -neuronx-cc>=2.0.0a0 -torchvision # Required for Llama3.2 multimodal image preprocessing diff --git a/setup.py b/setup.py index 9a5ca3456a0f..895933c60417 100644 --- a/setup.py +++ b/setup.py @@ -416,8 +416,7 @@ def _no_device() -> bool: def _is_cuda() -> bool: has_cuda = torch.version.cuda is not None - return (VLLM_TARGET_DEVICE == "cuda" and has_cuda - and not (_is_neuron() or _is_tpu())) + return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu()) def _is_hip() -> bool: @@ -425,10 +424,6 @@ def _is_hip() -> bool: or VLLM_TARGET_DEVICE == "rocm") and torch.version.hip is not None -def _is_neuron() -> bool: - return VLLM_TARGET_DEVICE == "neuron" - - def _is_tpu() -> bool: return VLLM_TARGET_DEVICE == "tpu" @@ -473,25 +468,6 @@ def get_rocm_version(): return None -def get_neuronxcc_version(): - import sysconfig - site_dir = sysconfig.get_paths()["purelib"] - version_file = os.path.join(site_dir, "neuronxcc", "version", - "__init__.py") - - # Check if the command was executed successfully - with open(version_file) as fp: - content = fp.read() - - # Extract the version using a regular expression - match = re.search(r"__version__ = '(\S+)'", content) - if match: - # Return the version string - return match.group(1) - else: - raise RuntimeError("Could not find Neuron version in the output") - - def get_nvcc_cuda_version() -> Version: """Get the CUDA version from nvcc. 
@@ -544,12 +520,6 @@ def get_vllm_version() -> str: rocm_version = get_rocm_version() or torch.version.hip if rocm_version and rocm_version != MAIN_CUDA_VERSION: version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}" - elif _is_neuron(): - # Get the Neuron version - neuron_version = str(get_neuronxcc_version()) - if neuron_version != MAIN_CUDA_VERSION: - neuron_version_str = neuron_version.replace(".", "")[:3] - version += f"{sep}neuron{neuron_version_str}" elif _is_tpu(): version += f"{sep}tpu" elif _is_cpu(): @@ -594,8 +564,6 @@ def _read_requirements(filename: str) -> list[str]: requirements = modified_requirements elif _is_hip(): requirements = _read_requirements("rocm.txt") - elif _is_neuron(): - requirements = _read_requirements("neuron.txt") elif _is_tpu(): requirements = _read_requirements("tpu.txt") elif _is_cpu(): @@ -604,7 +572,7 @@ def _read_requirements(filename: str) -> list[str]: requirements = _read_requirements("xpu.txt") else: raise ValueError( - "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.") + "Unsupported platform, please use CUDA, ROCm, or CPU.") return requirements diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 5a91758414a5..d0b937896835 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -311,15 +311,6 @@ def test_prefix_cache_default(): }, "mm-processor-kwargs" ), - ( - '{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}', - { - "cast_logits_dtype": "bfloat16", - "sequence_parallel_norm": True, - "sequence_parallel_norm_threshold": 2048, - }, - "override-neuron-config" - ), ]) # yapf: enable def test_composite_arg_parser(arg, expected, option): diff --git a/tests/neuron/1_core/test_activation.py b/tests/neuron/1_core/test_activation.py deleted file mode 100644 index 2d6e5f523cb8..000000000000 --- a/tests/neuron/1_core/test_activation.py +++ /dev/null @@ -1,43 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch -import torch.nn.functional as F - -from vllm.model_executor.layers.activation import FastGELU, SiluAndMul -from vllm.platforms import current_platform - - -@pytest.mark.parametrize("activation", ["silu_and_mul", "gelu_fast"]) -@pytest.mark.parametrize("num_tokens,d,dtype", [ - (7, 512, torch.half), - (7, 512, torch.float), - (83, 512, torch.half), -]) -@torch.inference_mode() -def test_act_and_mul( - activation: str, - num_tokens: int, - d: int, - dtype: torch.dtype, -) -> None: - import torch_xla.core.xla_model as xm - - device = xm.xla_device() - current_platform.seed_everything(0) - torch.set_default_device("cpu") - x = torch.randn(num_tokens, 2 * d, dtype=dtype).to(device=device) - if activation == "silu_and_mul": - layer = SiluAndMul() - fn = layer.forward_native - elif activation == "gelu_fast": - layer = FastGELU() - fn = F.gelu - else: - raise NotImplementedError( - f"activation {activation} is not implemented.") - assert x.is_xla, "input tensor under testing is expected to be XLA tensor." 
- out = layer.to(device=device).forward_neuron(x) - ref_out = fn(x.cpu()) - torch.testing.assert_close(out.cpu(), ref_out, atol=0.01, rtol=0.0) diff --git a/tests/neuron/1_core/test_block_table.py b/tests/neuron/1_core/test_block_table.py deleted file mode 100644 index efec56360c14..000000000000 --- a/tests/neuron/1_core/test_block_table.py +++ /dev/null @@ -1,154 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import neuronxcc.nki.language as nl -import pytest -import torch -import torch.nn.functional as F -from neuronxcc import nki - -from vllm.attention.ops.nki_flash_attn import ( - load_block_tables, transform_block_tables_for_indirect_load) - - -def is_power_of_2(n): - return n > 0 and (n & (n - 1) == 0) - - -def nki_load_and_transform_block_tables( - block_tables, - num_tiles, - num_blocks_per_tile, - num_head, - head_id, - block_size_tiling_factor, -): - assert is_power_of_2( - num_blocks_per_tile), f"{num_blocks_per_tile=} must be power of 2" - block_tables_sbuf = load_block_tables(block_tables, num_tiles, - num_blocks_per_tile) - - # we need to pass an Index as head_id - head_id = nl.arange(1)[None, :] + head_id - - block_tables_transposed = transform_block_tables_for_indirect_load( - block_tables_sbuf, block_size_tiling_factor, num_head, head_id) - B_P_SIZE = 128 - assert block_tables_transposed.shape[1] == B_P_SIZE - - out = nl.ndarray( - block_tables_transposed.shape, - dtype=nl.int32, - buffer=nl.shared_hbm, - ) - for i in nl.affine_range(block_tables_transposed.shape[0]): - nl.store(dst=out[i], value=block_tables_transposed[i]) - return out - - -def ref_block_tables_transform( - block_tables, - num_tiles, - num_blocks_per_tile, - num_head, - head_id, - block_size_tiling_factor, -): - assert block_tables.numel() == num_tiles * num_blocks_per_tile - block_tables = block_tables.view(num_tiles, num_blocks_per_tile) - B_F_SIZE = 128 - num_tiles_padded = (num_tiles + B_F_SIZE - 1) // B_F_SIZE * B_F_SIZE - block_tables = F.pad( - block_tables, - (0, 0, 0, num_tiles_padded - num_tiles), - "constant", - 0, - ) - - block_tables = block_tables * num_head + head_id - block_tables = block_tables.view(num_tiles_padded, num_blocks_per_tile, 1) - offset = torch.arange(0, block_size_tiling_factor).view(1, 1, -1) - block_tables = block_tables * block_size_tiling_factor + offset - block_tables_transposed = block_tables.view(num_tiles_padded, -1).t() - - num_blocks_per_tile = block_tables_transposed.shape[0] - assert num_blocks_per_tile % B_F_SIZE == 0 - return block_tables_transposed.view(num_blocks_per_tile // B_F_SIZE, - B_F_SIZE, num_tiles_padded) - - -@pytest.mark.parametrize( - "q_head_per_kv_head,head_id", - [ - (1, 0), - (3, 1), - ], -) -@pytest.mark.parametrize( - "num_tiles,num_blocks_per_tile", - [ - (1, 1), - (13, 16), - (17, 128), - (35, 512), - (128, 128), - (130, 64), - (280, 256), - (315, 1), - ], -) -@torch.inference_mode() -def test_load_and_transform_block_tables( - monkeypatch: pytest.MonkeyPatch, - num_tiles, - num_blocks_per_tile, - q_head_per_kv_head, - head_id, -) -> None: - import torch_xla.core.xla_model as xm - - device = xm.xla_device() - - compiler_flags_str = " ".join([ - "-O1", - "--retry_failed_compilation", - ]) - with monkeypatch.context() as m: - m.setenv("NEURON_CC_FLAGS", compiler_flags_str) - - torch.manual_seed(10000) - torch.set_printoptions(sci_mode=False) - - # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient - B_P_SIZE = 128 - if num_blocks_per_tile < B_P_SIZE: - 
assert B_P_SIZE % num_blocks_per_tile == 0 - block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile - else: - block_size_tiling_factor = 1 - max_num_blocks = 100000 - block_tables = torch.randint( - 0, - max_num_blocks, - (num_tiles * num_blocks_per_tile, ), - dtype=torch.int32, - ) - nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1]( - block_tables.to(device=device), - num_tiles, - num_blocks_per_tile, - q_head_per_kv_head, - head_id, - block_size_tiling_factor, - ).cpu() - ref_out = ref_block_tables_transform( - block_tables, - num_tiles, - num_blocks_per_tile, - q_head_per_kv_head, - head_id, - block_size_tiling_factor, - ) - assert (nki_out.shape == ref_out.shape - ), f"{nki_out.shape=} != {ref_out.shape=}" - assert torch.all(nki_out == ref_out) diff --git a/tests/neuron/1_core/test_cache.py b/tests/neuron/1_core/test_cache.py deleted file mode 100644 index 670889ad6b58..000000000000 --- a/tests/neuron/1_core/test_cache.py +++ /dev/null @@ -1,86 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm.attention.ops.nki_flash_attn import reshape_and_cache - - -@pytest.mark.parametrize( - "num_tokens, n_kv_head, d_head, num_blocks, block_size", - [ - # Small model configuration (e.g., GPT-2 small) - (32, 12, 64, 4, 128), # Typical sequence processing - (1, 12, 64, 4, 128), # Single token update - (128, 12, 64, 4, 128), # Longer sequence - - # Medium model configuration (e.g., GPT-2 medium) - (64, 16, 96, 8, 256), # Standard batch - (256, 16, 96, 8, 256), # Large batch - - # Large model configuration (e.g., GPT-3 style) - (48, 32, 128, 16, 512), # Typical processing window - (512, 32, 128, 16, 512), # Full context window - - # Edge cases and stress tests - (1024, 8, 32, 32, 32), # Many tokens, small heads - (16, 64, 256, 4, 64), # Few tokens, many heads - (2048, 24, 128, 64, 128), # Large scale test - - # Minimal configurations for debugging - (4, 2, 16, 2, 16), # Tiny test case - (1, 1, 8, 1, 8), # Minimal possible - ]) -def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks, - block_size): - # Set random seed for reproducibility - torch.manual_seed(42) - - # Create CPU tensors for reference implementation - key_cpu = torch.randn(num_tokens, n_kv_head, d_head) / torch.sqrt( - torch.tensor(d_head)) - value_cpu = torch.randn(num_tokens, n_kv_head, d_head) / torch.sqrt( - torch.tensor(d_head)) - key_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head) - value_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head) - slot_mapping_cpu = torch.randperm(num_blocks * block_size)[:num_tokens] - - # Run reference implementation on CPU - block_indices = torch.div(slot_mapping_cpu, - block_size, - rounding_mode="floor") - block_offsets = slot_mapping_cpu % block_size - - for i in range(num_tokens): - block_idx = block_indices[i] - block_offset = block_offsets[i] - key_cache_cpu[block_idx, :, block_offset, :] = key_cpu[i] - value_cache_cpu[block_idx, :, block_offset, :] = value_cpu[i] - - # Create XLA device tensors - device = torch.device('xla') - key = key_cpu.to(device) - value = value_cpu.to(device) - key_cache = torch.zeros_like(key_cache_cpu, device=device) - value_cache = torch.zeros_like(value_cache_cpu, device=device) - slot_mapping = slot_mapping_cpu.to(device) - kv_cache = torch.stack([key_cache, value_cache]) - - # Run vectorized implementation on XLA device - reshape_and_cache(key, value, kv_cache, slot_mapping) - key_cache, 
value_cache = torch.unbind(kv_cache, dim=0) - - # Move results back to CPU for comparison - key_cache_result = key_cache.cpu() - value_cache_result = value_cache.cpu() - - # Assert results match - torch.testing.assert_close(key_cache_result, - key_cache_cpu, - rtol=1e-5, - atol=1e-5) - torch.testing.assert_close(value_cache_result, - value_cache_cpu, - rtol=1e-5, - atol=1e-5) diff --git a/tests/neuron/1_core/test_layernorm.py b/tests/neuron/1_core/test_layernorm.py deleted file mode 100644 index c6fce1d1a063..000000000000 --- a/tests/neuron/1_core/test_layernorm.py +++ /dev/null @@ -1,57 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.platforms import current_platform - - -@pytest.mark.parametrize("num_tokens,hidden_size,add_residual,dtype", [ - (7, 8, False, torch.half), - (83, 768, False, torch.half), - (83, 768, True, torch.half), - (83, 768, True, torch.bfloat16), - (83, 768, True, torch.float32), -]) -@torch.inference_mode() -def test_rms_norm( - num_tokens: int, - hidden_size: int, - add_residual: bool, - dtype: torch.dtype, -) -> None: - import torch_xla.core.xla_model as xm - - device = xm.xla_device() - current_platform.seed_everything(0) - torch.set_default_device("cpu") - layer = RMSNorm(hidden_size).to(dtype=dtype) - layer.weight.data.normal_(mean=1.0, std=0.1) - scale = 1 / (2 * hidden_size) - x = torch.randn(num_tokens, hidden_size, dtype=dtype).to(device=device) - x *= scale - residual = torch.randn_like(x) * scale if add_residual else None - - residual_cpu = residual.cpu() if add_residual else None - ref_out = layer.to(device="cpu").forward_native(x.cpu(), residual_cpu) - assert x.is_xla, "input tensor under testing is expected to be XLA tensor." - out = layer.to(device=device)(x, residual) - - # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger - # numerical errors than other operators because they involve reductions. - # Therefore, we use a larger tolerance. 
- if add_residual: - assert out[0].is_xla, "output tensor is expected to be XLA tensor" - torch.testing.assert_close(out[0].cpu(), - ref_out[0], - atol=1e-2, - rtol=1e-2) - torch.testing.assert_close(out[1].cpu(), - ref_out[1], - atol=1e-2, - rtol=1e-2) - else: - assert out.is_xla, "output tensor is expected to be XLA tensor" - torch.testing.assert_close(out.cpu(), ref_out, atol=1e-2, rtol=1e-2) diff --git a/tests/neuron/1_core/test_logits_processor.py b/tests/neuron/1_core/test_logits_processor.py deleted file mode 100644 index ce9eadf5a883..000000000000 --- a/tests/neuron/1_core/test_logits_processor.py +++ /dev/null @@ -1,95 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -from unittest.mock import patch - -import pytest -import torch - -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import is_pin_memory_available - - -class MockLogitsProcessor(LogitsProcessor): - - def __init__(self, vocab_size: int, scale: float, - fake_logits: torch.Tensor): - super().__init__(vocab_size=vocab_size, scale=scale) - self.fake_logits = fake_logits.clone() - - def forward(self, *args, **kwargs): - with patch( - "vllm.model_executor.layers.logits_processor._prune_hidden_states", - lambda x, y: x - ), patch( - "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits", - lambda *args, **kwargs: self.fake_logits): - return super().forward(*args, **kwargs) - - -def _prepare_test( - batch_size: int -) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]: - vocab_size = 32000 - input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) - fake_logits = torch.full((batch_size, vocab_size), - 1e-2, - dtype=input_tensor.dtype) - logits_processor = MockLogitsProcessor(32000, 0.5, fake_logits) - return input_tensor, fake_logits, logits_processor - - -RANDOM_SEEDS = list(range(8)) - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_logits_processors(seed: int): - import torch_xla.core.xla_model as xm - - device = xm.xla_device() - set_random_seed(seed) - torch.set_default_device("cpu") - batch_size = random.randint(1, 256) - input_tensor, fake_logits, logits_processor = _prepare_test(batch_size) - - # This sample logits processor gives infinite score to the i-th token, - # where i is the length of the input sequence. - # We therefore expect the output token sequence to be [0, 1, 2, ...] 
- def pick_ith(token_ids, logits): - logits[len(token_ids)] = float("inf") - return logits - - seq_group_metadata_list = [] - seq_lens = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=SamplingParams(temperature=0, - logits_processors=[pick_ith]), - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - logits_processor_output = logits_processor( - lm_head=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) - - fake_logits *= logits_processor.scale - torch.testing.assert_close(logits_processor_output[:, 1], - fake_logits[:, 1], - rtol=1e-4, - atol=0.0) diff --git a/tests/neuron/1_core/test_neuron_model_runner.py b/tests/neuron/1_core/test_neuron_model_runner.py deleted file mode 100644 index 5f3268810f9f..000000000000 --- a/tests/neuron/1_core/test_neuron_model_runner.py +++ /dev/null @@ -1,127 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os -from unittest.mock import MagicMock - -from vllm.config import VllmConfig -from vllm.engine.arg_utils import EngineArgs -from vllm.platforms import current_platform -from vllm.platforms.neuron import NeuronFramework -from vllm.sampling_params import SamplingParams -from vllm.sequence import SequenceData, SequenceGroupMetadata -from vllm.worker.neuron_model_runner import NeuronModelRunner - -os.environ[ - 'VLLM_NEURON_FRAMEWORK'] = NeuronFramework.TRANSFORMERS_NEURONX.value - - -def _create_neuron_model_runner(model: str, *args, - **kwargs) -> NeuronModelRunner: - engine_args = EngineArgs(model, *args, **kwargs) - engine_config = engine_args.create_engine_config() - vllm_config = VllmConfig( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - ) - neuron_model_runner = NeuronModelRunner(vllm_config=vllm_config) - return neuron_model_runner - - -def test_update_neuron_sampling_params_not_full_batch(): - os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0" - model_runner = _create_neuron_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - max_num_seqs=2, - ) - assert not model_runner._on_device_sampling_disabled - # Test sampling param updating only when TNx is framework - # NxDI handles sampling parameter updating inside model - if current_platform.use_transformers_neuronx(): - model_mock = MagicMock() - model_runner.model = model_mock - - seq_group_metadata_list = [ - SequenceGroupMetadata( - request_id="test_0", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=SamplingParams(temperature=0.5, - top_k=1, - top_p=0.5), - block_tables={0: [1]}, - ) - ] - - model_runner.prepare_model_input(seq_group_metadata_list) - - # Index neuron sampling parameters based on block_tables indices. - # The first block_id of the sequence 0 is 1, so its parameters are - # placed at index 1. So the sampling parameters will be: - # Index 0: default sampling parameters - # Index 1: sequecne 0's sampling parameters. 
- neuron_sampling_params = ( - model_runner.model_config.neuron_sampling_params) - assert neuron_sampling_params.temperature == [1.0, 0.5] - assert neuron_sampling_params.top_k == [ - model_runner._MAX_NEURON_SAMPLING_TOP_K, 1 - ] - assert neuron_sampling_params.top_p == [1.0, 0.5] - model_mock.model.update_generation_config.assert_called_once_with( - neuron_sampling_params) - - -def test_update_neuron_sampling_params_full_batch(): - os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0" - model_runner = _create_neuron_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - max_num_seqs=2, - ) - assert not model_runner._on_device_sampling_disabled - - # Test sampling param updating only when TNx is framework - # NxDI handles sampling parameter updating inside model - if current_platform.use_transformers_neuronx(): - model_mock = MagicMock() - model_runner.model = model_mock - - seq_group_metadata_list = [ - SequenceGroupMetadata( - request_id="test_0", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=SamplingParams(temperature=0.5, - top_k=1, - top_p=0.5), - block_tables={0: [1]}, - ), - SequenceGroupMetadata( - request_id="test_0", - is_prompt=True, - seq_data={1: SequenceData.from_seqs([4, 5, 6])}, - sampling_params=SamplingParams(temperature=0.2, - top_k=2, - top_p=0.2), - block_tables={1: [0]}, - ) - ] - - model_runner.prepare_model_input(seq_group_metadata_list) - - # Index neuron sampling parameters based on block_tables indices. - # The first block_id of the sequence 0 is 1, so its parameters are - # placed at index 1. So the sampling parameters will be: - # Index 0: sequence 1's sampling parameters - # Index 1: sequecne 0's sampling parameters. - neuron_sampling_params = ( - model_runner.model_config.neuron_sampling_params) - assert neuron_sampling_params.temperature == [0.2, 0.5] - assert neuron_sampling_params.top_k == [2, 1] - assert neuron_sampling_params.top_p == [0.2, 0.5] - model_mock.model.update_generation_config.assert_called_once_with( - neuron_sampling_params) diff --git a/tests/neuron/1_core/test_neuron_quant.py b/tests/neuron/1_core/test_neuron_quant.py deleted file mode 100644 index 086300269592..000000000000 --- a/tests/neuron/1_core/test_neuron_quant.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.model_executor.layers.quantization.neuron_quant import ( - NeuronQuantConfig) - - -def test_get_supported_act_dtypes(): - neuron_quant_config = NeuronQuantConfig() - supported_act_dtypes = neuron_quant_config.get_supported_act_dtypes() - target_list = ["any_dtype1", "any_dtype2"] - for dtype in target_list: - assert dtype in supported_act_dtypes diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py deleted file mode 100644 index abf7febc2955..000000000000 --- a/tests/neuron/1_core/test_prefix_prefill.py +++ /dev/null @@ -1,514 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Optional - -import pytest -import torch -import torch.nn.functional as F - -from vllm.utils import cdiv - - -class BlockDiagonalCausalFromBottomRightMask: - - @staticmethod - def _from_seqlens(query_lens, seq_lens, block_size=None): - from torch import logical_and, logical_or - - contexted = block_size is None - context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) - n_queries = sum(query_lens) - 
num_seqs = len(query_lens) - if contexted: - key_lens_blockaligned = seq_lens - else: - n_blocks_per_seq = (context_lens + block_size - 1) // block_size - offset_per_seq = n_blocks_per_seq * block_size - key_lens_blockaligned = offset_per_seq[:num_seqs].tolist() - n_keys = sum(key_lens_blockaligned) - - a = (torch.arange(n_queries).reshape(n_queries, - 1).expand(n_queries, n_keys)) - b = torch.arange(n_keys).reshape(1, n_keys).expand(n_queries, n_keys) - q_cumsum = torch.tensor([0] + query_lens).cumsum(dim=0) - k_cumsum = torch.tensor([0] + key_lens_blockaligned).cumsum(dim=0) - - prior_mask = torch.zeros(n_queries, n_keys) - new_masks: list[torch.Tensor] = [] - for seq_id in range(num_seqs): - ri = q_cumsum[seq_id] - ci = k_cumsum[seq_id] - nr = query_lens[seq_id] - - if contexted: - nc = seq_lens[seq_id] - a_offset = ci + nc - ri - nr - new_mask = (a + a_offset) >= b - else: - nc = context_lens[seq_id] - a_offset = ci + nc - 1 - new_mask = a_offset >= b - - left_mask = b >= ci - top_mask = a >= ri - bottom_mask = a < (ri + nr) - - new_mask = logical_and( - logical_and(logical_and(new_mask, left_mask), top_mask), - bottom_mask, - ) - prior_mask = logical_or(prior_mask, new_mask) - new_masks = new_masks + [new_mask] - return prior_mask - - @staticmethod - def from_seqlens(query_lens, seq_lens, block_size=None): - contexted = block_size is None - if contexted: - prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( - query_lens, seq_lens) - active_mask = None - else: - prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( - query_lens, seq_lens, block_size) - active_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( - query_lens, query_lens) - return prior_mask, active_mask - - -def ref_softmax(x: torch.Tensor, - dim: int, - mixed_precision=False, - return_max_reduce=False): - max_value = torch.amax(x, dim=dim, keepdims=True) - exp = torch.exp(x - max_value) - if mixed_precision: - sum_value = torch.sum(exp.astype(torch.float32), - dim=dim, - keepdims=True).astype(x.dtype) - else: - sum_value = torch.sum(exp, dim=dim, keepdims=True) - if return_max_reduce: - return exp / sum_value, max_value, torch.reciprocal(sum_value) - return exp / sum_value - - -def ref_masked_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - scale: float, - attn_mask: Optional[torch.Tensor] = None, - return_max_reduce: Optional[bool] = False, -) -> torch.Tensor: - scaled_qk = scale * torch.einsum("qhd,khd->hqk", query, key).float() - if attn_mask is not None: - masked_score = scaled_qk + attn_mask.float() - if return_max_reduce: - norm_score, cached_max, cached_sum_reciprocal = ref_softmax( - masked_score, dim=-1, return_max_reduce=True) - else: - norm_score = ref_softmax(masked_score, dim=-1) - out = torch.einsum("hqk,khd->qhd", norm_score.to(value.dtype), value) - if return_max_reduce: - return ( - out, - cached_max, - cached_sum_reciprocal, - norm_score, - masked_score, - scaled_qk, - ) - else: - return (out, ) - - -def ref_context_attention( - query, - key, - value, - query_lens, - seq_lens, - head_size, - num_queries_per_kv, - return_max_reduce=False, -): - scale = float(1.0 / (head_size**0.5)) - if num_queries_per_kv > 1: - # Handle MQA and GQA - key = torch.repeat_interleave(key, num_queries_per_kv, dim=1) - value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) - - attn_mask, _ = BlockDiagonalCausalFromBottomRightMask.from_seqlens( - query_lens, seq_lens) - - # convert binary mask to -inf values - attn_mask = 
torch.logical_not(attn_mask) - attn_mask = attn_mask.float() * -30000 - - output, *debug_tensors = ref_masked_attention( - query, - key, - value, - scale, - attn_mask, - return_max_reduce=return_max_reduce, - ) - - output = output.unsqueeze(1) - if return_max_reduce: - cached_max, cached_sum_reciprocal, lse, masked_score, scaled_qk = ( - debug_tensors) - return ( - output, - cached_max, - cached_sum_reciprocal, - lse, - masked_score, - scaled_qk, - ) - else: - return output - - -def sample_inputs( - prefill_batch_size, - decode_batch_size, - min_query_len, - max_query_len, - min_ctx_len, - max_ctx_len, - block_size, - num_heads, - num_kv_heads, - head_size, - dtype, -): - batch_size = prefill_batch_size + decode_batch_size - max_model_len = (max_query_len + max_ctx_len) * 4 - max_block_per_request = max_model_len // block_size - cache_size = (batch_size * max_block_per_request) + 2 - prefill_ctx_lens = torch.randint(min_ctx_len, - max_ctx_len + 1, (prefill_batch_size, ), - dtype=torch.long).tolist() - decode_ctx_lens = torch.randint(min_ctx_len, - max_ctx_len + 1, (decode_batch_size, ), - dtype=torch.long).tolist() - ctx_lens = prefill_ctx_lens + decode_ctx_lens - query_lens = torch.randint( - min_query_len, - max_query_len + 1, - (prefill_batch_size, ), - dtype=torch.long, - ).tolist() + [1 for _ in range(decode_batch_size)] - seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)] - - num_tokens = sum(query_lens) - query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) - query.uniform_(-1, 1) - torch.empty(num_tokens, num_heads, head_size, dtype=dtype) - - kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype) - kv.uniform_(-1, 1) - key, value = kv.unbind(dim=1) - - k_cache = torch.zeros(cache_size, - block_size, - num_kv_heads, - head_size, - dtype=dtype) - v_cache = torch.zeros(cache_size, - block_size, - num_kv_heads, - head_size, - dtype=dtype) - k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) - v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) - values = torch.arange(0, cache_size, dtype=torch.long) - values = values[torch.randperm(cache_size)] - block_table = values[:batch_size * max_block_per_request].view( - batch_size, max_block_per_request) - b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long) - b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1], - dtype=torch.long), - dim=0) - # copy kv to cache - b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1], - dtype=torch.long), - dim=0) - for i in range(batch_size): - for j in range(query_lens[i]): - k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + - j]) - v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + - b_ctx_len[i] + j]) - cur_ctx = 0 - block_id = 0 - while cur_ctx < b_ctx_len[i]: - start_loc = b_seq_start_loc[i] + cur_ctx - if cur_ctx + block_size > b_ctx_len[i]: - end_loc = b_seq_start_loc[i] + b_ctx_len[i] - else: - end_loc = start_loc + block_size - start_slot = block_table[i, block_id] * block_size - end_slot = start_slot + end_loc - start_loc - k_cache.view(-1, num_kv_heads, - head_size)[start_slot:end_slot].copy_( - key[start_loc:end_loc]) - v_cache.view(-1, num_kv_heads, - head_size)[start_slot:end_slot].copy_( - value[start_loc:end_loc]) - cur_ctx += block_size - block_id += 1 - kv_cache = torch.stack([k_cache, v_cache]) - - return ( - query, - k, - v, - kv_cache, - block_table, - key, - value, - query_lens, - seq_lens, - ) - - -def get_active_block_tables(block_tables, query_lens, seq_lens, 
block_size, - num_blocks): - context_lens = seq_lens - query_lens - blocks_per_seq = (context_lens + block_size - 1) // block_size - num_seqs = len(seq_lens) - active_blocks: list[int] = [] - for seq_id in range(num_seqs): - active_blocks = ( - active_blocks + - block_tables[seq_id, :blocks_per_seq[seq_id]].tolist()) - return F.pad( - torch.tensor(active_blocks, dtype=torch.int32), - (0, num_blocks - len(active_blocks)), - "constant", - 0, - ) - - -@pytest.mark.parametrize( - "prefill_batch_size,decode_batch_size,block_size,large_tile_size,num_heads,num_queries_per_kv,head_size,mixed_precision", - [ - # Test minimal configurations (small block size) - (1, 199, 1, 512, 4, 2, 8, False - ), # minimal block size, small dimensions - (1, 199, 1, 512, 4, 2, 8, True), # same with mixed precision - - # Test common/medium configurations - (4, 12, 32, 2048, 32, 8, 64, False), # common case, larger heads - (4, 12, 32, 2048, 16, 4, 32, - True), # medium size, mixed precision, grouped-query attention (GQA) - - # Test large configurations - (4, 12, 256, 8192, 8, 1, 128, False), # large blocks, large head size - (4, 12, 256, 8192, 64, 8, 64, True), # large blocks, many heads - - # Test asymmetric configurations - (2, 24, 64, 4096, 12, 4, 96, False), # varied batch sizes - (8, 8, 128, 2048, 24, 2, 48, True), # balanced batches - - # Test edge cases - (1, 128, 16, 1024, 4, 2, 16, False), # large decode batch - (16, 4, 8, 1024, 4, 2, 128, True), # large prefill batch - (4, 12, 32, 2048, 16, 1, 32, True), # multi-head attention (MHA) - (4, 12, 32, 2048, 16, 16, 32, True), # multi-query attention (MQA) - ]) -@torch.inference_mode() -def test_contexted_kv_attention( - monkeypatch: pytest.MonkeyPatch, - prefill_batch_size: int, - decode_batch_size: int, - num_heads: int, - num_queries_per_kv: int, - head_size: int, - block_size: int, - large_tile_size, - mixed_precision: bool, -) -> None: - - import torch_xla.core.xla_model as xm - - from vllm.attention.ops.nki_flash_attn import (flash_attn_varlen_nkifunc, - reorder_context_mask) - - assert large_tile_size % block_size == 0 - - device = xm.xla_device() - - compiler_flags_str = " ".join([ - "-O1", - "--retry_failed_compilation", - ]) - with monkeypatch.context() as m: - m.setenv("NEURON_CC_FLAGS", compiler_flags_str) - - torch.manual_seed(0) - torch.set_printoptions(sci_mode=False) - torch.set_default_device("cpu") - dtype = torch.float32 - - min_ctx_len = 32 - max_ctx_len = 1024 - min_query_len = 16 - max_query_len = 512 - num_kv_heads = num_heads // num_queries_per_kv - ( - query, - k_active, - v_active, - kv_cache, - block_table, - key, - value, - query_lens, - seq_lens, - ) = sample_inputs( - prefill_batch_size=prefill_batch_size, - decode_batch_size=decode_batch_size, - min_query_len=min_query_len, - max_query_len=max_query_len, - min_ctx_len=min_ctx_len, - max_ctx_len=max_ctx_len, - block_size=block_size, - num_heads=num_heads, - num_kv_heads=num_kv_heads, - head_size=head_size, - dtype=dtype, - ) - - output_ref = ref_context_attention( - query, - key, - value, - query_lens, - seq_lens, - head_size, - num_queries_per_kv, - return_max_reduce=False, - ) - - # build neuron program - B_P_SIZE = 128 - assert (large_tile_size >= B_P_SIZE - ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" - - def pad_to_multiple(a, b): - return cdiv(a, b) * b - - def pad_to_next_power_of_2(a): - assert a > 0 - return 2**int(a - 1).bit_length() - - # calculate input shapes - max_num_queries = pad_to_next_power_of_2(sum(query_lens)) - context_lens = 
torch.tensor(seq_lens) - torch.tensor(query_lens) - num_active_blocks = cdiv(context_lens, block_size).sum().item() - num_active_blocks = pad_to_multiple(num_active_blocks, - large_tile_size // block_size) - context_kv_len = num_active_blocks * block_size - assert ( - context_kv_len % - large_tile_size == 0), f"invalid context_kv_len={context_kv_len}" - - # pad QKV tensors - pad_dims = ( - 0, - 0, - 0, - 0, - 0, - max_num_queries - query.shape[0], - ) - query = F.pad(query, pad_dims, "constant", 0) - k = F.pad(k_active, pad_dims, "constant", 0) - v = F.pad(v_active, pad_dims, "constant", 0) - - # permute QKV tensors - # query: (1, n_heads, d, seq_q) - # key: (1, n_kv_heads, d, seq_k) - # value: (1, n_kv_heads, seq_v, d) - query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() - k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() - v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() - kv_cache = kv_cache.permute(0, 1, 3, 2, 4).contiguous() - - # transform block table - active_block_table = get_active_block_tables( - block_table.cpu(), - torch.tensor(query_lens).cpu(), - torch.tensor(seq_lens).cpu(), - block_size, - num_active_blocks, - ) - - # Build attention masks - prior_mask, active_mask = ( - BlockDiagonalCausalFromBottomRightMask.from_seqlens( - query_lens, seq_lens, block_size=block_size)) - prior_mask_padded = F.pad( - prior_mask, - ( - 0, - context_kv_len - prior_mask.shape[1], - 0, - max_num_queries - prior_mask.shape[0], - ), - "constant", - 0, - ).bool() - active_mask_padded = F.pad( - active_mask, - ( - 0, - max_num_queries - active_mask.shape[1], - 0, - max_num_queries - active_mask.shape[0], - ), - "constant", - 0, - ).bool() - attn_mask = torch.concat([prior_mask_padded, active_mask_padded], - dim=1) - - attn_mask = reorder_context_mask(attn_mask, large_tile_size, - block_size) - - input_args = ( - query.to(device=device), - k.to(device=device), - v.to(device=device), - kv_cache.to(device=device), - active_block_table.to(device=device), - attn_mask.to(device=device), - ) - input_kwargs = dict( - n_kv_head=num_kv_heads, - head_size=head_size, - mixed_precision=mixed_precision, - LARGE_TILE_SZ=large_tile_size, - ) - - output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) - - num_actual_tokens = sum(query_lens) - # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) - output_nki = output_nki.cpu().permute(0, 2, 1, 3) - output_nki = output_nki[0, :num_actual_tokens, :, :] - output_ref_padded = F.pad( - output_ref, - (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]), - "constant", - 0, - ) - output_ref = output_ref_padded.transpose( - 0, 1)[0, :num_actual_tokens, :, :] - - torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) diff --git a/tests/neuron/1_core/test_rotary_embedding.py b/tests/neuron/1_core/test_rotary_embedding.py deleted file mode 100644 index a7ac79729986..000000000000 --- a/tests/neuron/1_core/test_rotary_embedding.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Tests for miscellaneous utilities -""" - -import pytest -import torch - -from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding -from vllm.platforms import current_platform - - -@pytest.mark.parametrize( - "max_position,is_neox_style,rotary_dim,head_size,seq_len,use_key", [ - (16, False, 32, 32, 1024, True), - (16, False, 32, 128, 1024, True), - (16, True, 32, 32, 1024, True), - (16, True, 32, 128, 1024, True), - (16, False, 32, 128, 
1024, False), - (16, True, 32, 128, 1024, False), - ]) -def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim, - head_size, seq_len, use_key): - import torch_xla.core.xla_model as xm - - device = xm.xla_device() - current_platform.seed_everything(0) - torch.set_default_device("cpu") - - batch_size = 1 - base = 10000 - num_heads = 8 - - rot = RotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style, torch.float32) - - positions = torch.randint(0, - max_position, (batch_size, seq_len), - device="cpu") - query = torch.randn(batch_size, - seq_len, - num_heads * head_size, - dtype=torch.float32, - device="cpu") - key = torch.randn_like(query) if use_key else None - assert positions.is_cpu, \ - "reference input tensor is expected to be CPU tensor." - ref_query, ref_key = rot.to(device="cpu").forward_native( - positions, query, key) - out_query, out_key = rot.to(device=device).forward_neuron( - positions.to(device=device), query.to(device=device), - key.to(device=device) if key is not None else None) - if use_key: - assert out_query.is_xla and out_key.is_xla, \ - "output tensor is expected to be XLA tensor" - torch.testing.assert_close(out_key.cpu(), - ref_key, - atol=1e-2, - rtol=1e-2) - else: - assert out_key is None, "expected returned key to be None" - assert out_query.is_xla, \ - "output tensor is expected to be XLA tensor" - torch.testing.assert_close(out_query.cpu(), - ref_query, - atol=1e-2, - rtol=1e-2) diff --git a/tests/neuron/2_core/test_comm_ops.py b/tests/neuron/2_core/test_comm_ops.py deleted file mode 100644 index 85a48dae58aa..000000000000 --- a/tests/neuron/2_core/test_comm_ops.py +++ /dev/null @@ -1,101 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import functools -from typing import Callable -from unittest.mock import patch - -import pytest -import torch -import torch_xla.distributed.xla_multiprocessing as xmp -from typing_extensions import ParamSpec - -from vllm.distributed.communication_op import ( - tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) -from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.utils import get_distributed_init_method, get_open_port - -_P = ParamSpec("_P") - - -def reinitialize_neuron_runtime(f: Callable[_P, None]) -> Callable[_P, None]: - """Decorator to reinitialize the Neuron Runtime before executing a test. - This is necessary for distributed tests which need to reallocate Neuron - Cores to separate subprocesses. 
- """ - - @functools.wraps(f) - def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: - runtime = torch.classes.neuron.Runtime() - runtime.initialize() - runtime.unsafe_close() - - f(*args, **kwargs) - runtime.initialize() - - return wrapper - - -def all_gather_test_worker(index, tp_degree, distributed_init_method): - init_distributed_environment(tp_degree, - index, - distributed_init_method, - index, - backend="xla") - ensure_model_parallel_initialized(tp_degree, 1) - - num_dimensions = 3 - tensor_size = list(range(2, num_dimensions + 2)) - total_size = 1 - for s in tensor_size: - total_size *= s - - all_gather_dimension = -1 - all_tensors = [ - torch.arange(total_size, dtype=torch.float32, - device="xla").reshape(tensor_size) * (r + 1) - for r in range(tp_degree) - ] - expected = torch.cat(all_tensors, dim=all_gather_dimension) - t = all_tensors[index % tp_degree] - t = tensor_model_parallel_all_gather(t, all_gather_dimension) - torch.testing.assert_close(t, expected) - - -def all_reduce_test_worker(index, tp_degree, distributed_init_method): - init_distributed_environment(tp_degree, - index, - distributed_init_method, - index, - backend="xla") - ensure_model_parallel_initialized(tp_degree, 1) - - num_elements = 8 - all_tensors = [ - torch.arange(num_elements, dtype=torch.float32, device="xla") * (r + 1) - for r in range(tp_degree) - ] - expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0) - t = all_tensors[index % tp_degree] - t = tensor_model_parallel_all_reduce(t) - torch.testing.assert_close(t, expected) - - -@pytest.mark.parametrize("tp_size", [2]) -@pytest.mark.parametrize("test_target", - [all_reduce_test_worker, all_gather_test_worker]) -@reinitialize_neuron_runtime -def test_neuron_multi_process_tensor_parallel(monkeypatch, tp_size, - test_target): - - with patch('torch_xla._XLAC._xla_runtime_is_initialized', - return_value=False): - distributed_init_method = get_distributed_init_method( - "127.0.0.1", get_open_port()) - - monkeypatch.setenv("VLLM_USE_V1", "1") - monkeypatch.setenv("NEURONCORE_NUM_DEVICES", str(tp_size)) - monkeypatch.setenv("NEURON_PJRT_PROCESSES_NUM_DEVICES", - ','.join(['1' for _ in range(tp_size)])) - - xmp.spawn(test_target, args=(tp_size, distributed_init_method)) diff --git a/tests/neuron/2_core/test_eagle.py b/tests/neuron/2_core/test_eagle.py deleted file mode 100644 index cac642af0310..000000000000 --- a/tests/neuron/2_core/test_eagle.py +++ /dev/null @@ -1,83 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import json -import os -import shutil -import tempfile - -import torch -from huggingface_hub import snapshot_download -from safetensors import safe_open - -from vllm import LLM, SamplingParams - - -def patch_eagle_draft_with_lm_head(target_model_id: str, - draft_model_id: str) -> str: - # In NxDI, draft model checkpoint must include lm_head weights from target - # model. 
For more details see https://awsdocs-neuron.readthedocs-hosted.com - # /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html - # #eagle-checkpoint-compatibility - final_draft_dir = "/tmp/patched_eagle_draft" - - with tempfile.TemporaryDirectory() as tmp_dir: - target_dir = snapshot_download(repo_id=target_model_id, - local_dir=os.path.join( - tmp_dir, "target")) - draft_dir = snapshot_download(repo_id=draft_model_id, - local_dir=os.path.join(tmp_dir, "draft")) - - lm_head_key = "lm_head.weight" - index_path = os.path.join(target_dir, "model.safetensors.index.json") - with open(index_path) as f: - index = json.load(f) - shard_name = index["weight_map"][lm_head_key] - target_safetensor_path = os.path.join(target_dir, shard_name) - - with safe_open(target_safetensor_path, framework="pt") as f: - target_lm_head = f.get_tensor(lm_head_key) - - draft_path = os.path.join(draft_dir, "pytorch_model.bin") - draft_state_dict = torch.load(draft_path, map_location="cpu") - draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16) - torch.save(draft_state_dict, draft_path) - - shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True) - - return final_draft_dir - - -def test_eagle(): - patched_draft_path = patch_eagle_draft_with_lm_head( - target_model_id="meta-llama/Llama-2-7b-hf", - draft_model_id="yuhuili/EAGLE-llama2-chat-7B") - llm = LLM( - model="meta-llama/Llama-2-7b-hf", - speculative_config={ - "model": patched_draft_path, - "num_speculative_tokens": 5, - "max_model_len": 128 - }, - max_num_seqs=1, - max_model_len=128, - tensor_parallel_size=2, - override_neuron_config={ - "enable_eagle_speculation": True, - "enable_fused_speculation": True, - "fused_qkv": True - }, - ) - prompts = [ - "The president of the United States is", - ] - outputs = llm.generate(prompts, SamplingParams(top_k=1)) - expected_output = " the head of state and head of government of " \ - "the United States. The president direct" - - for output in outputs: - generated_text = output.outputs[0].text - print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}") - assert (expected_output == generated_text) - - print("Neuron Eagle speculation test passed.") diff --git a/tests/neuron/2_core/test_mistral.py b/tests/neuron/2_core/test_mistral.py deleted file mode 100644 index d02fff943e90..000000000000 --- a/tests/neuron/2_core/test_mistral.py +++ /dev/null @@ -1,65 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from vllm import LLM, SamplingParams - - -def test_mistral(): - llm = LLM(model="mistralai/Mistral-7B-v0.1", - tensor_parallel_size=2, - max_num_seqs=4, - max_model_len=128, - use_v2_block_manager=True, - override_neuron_config={ - "sequence_parallel_enabled": False, - "skip_warmup": True - }) - - # Send more prompts than the compiled batch size (4) and request - # varying generation lengths to test accuracy related to Neuron - # specific sequence id sorting. 
- prompts = [ - "The president of the United States is", - "The capital of France is", - "What is Annapurna labs?", - "I believe the meaning of life is", - "Tell me a story about a brave knight", - "Hello, my name is Llama", - ] - - sampling_params = [ - SamplingParams(top_k=1, max_tokens=10), - SamplingParams(top_k=1, max_tokens=20), - SamplingParams(top_k=1, max_tokens=30), - SamplingParams(top_k=1, max_tokens=40), - SamplingParams(top_k=1, max_tokens=50), - SamplingParams(top_k=1, max_tokens=60) - ] - - outputs = llm.generate(prompts, sampling_params) - - expected_outputs = [ - " the most powerful person in the world. He is", - " a city of many faces. It is a city of history, culture, art, " - "fashion, and", - "\n\nAnnapurna Labs is a semiconductor company that was founded " - "in 2013 by Amazon. The company is", - " to be happy.\n\nI believe that happiness is a choice.\n\nI " - "believe that happiness is a state of mind.\n\nI believe that " - "happiness is a journey.\n\nI believe", - " who rescued a princess from a dragon.\n\nTell me a story about" - " a princess who rescued herself from a dragon.\n\nTell me a " - "story about a princess who rescued herself from a dragon and " - "then rescued a knight from", - " and I am a 10 year old male. I am a very friendly and " - "affectionate boy who loves to be around people. I am a very " - "active boy who loves to play and run around. I am a very smart " - "boy who loves to learn new things. I am a very loyal boy" - ] - - for expected_output, output in zip(expected_outputs, outputs): - generated_text = output.outputs[0].text - print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}") - assert (expected_output == generated_text) - - print("Neuron Mistral test passed.") diff --git a/tests/neuron/2_core/test_multi_lora.py b/tests/neuron/2_core/test_multi_lora.py deleted file mode 100644 index 6b97f47d4db3..000000000000 --- a/tests/neuron/2_core/test_multi_lora.py +++ /dev/null @@ -1,99 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from huggingface_hub import snapshot_download - -from vllm import LLM, SamplingParams -from vllm.lora.request import LoRARequest - - -def test_llama_single_lora(): - sql_lora_files = snapshot_download( - repo_id="yard1/llama-2-7b-sql-lora-test") - llm = LLM(model="meta-llama/Llama-2-7b-hf", - tensor_parallel_size=2, - max_num_seqs=4, - max_model_len=512, - use_v2_block_manager=True, - override_neuron_config={ - "sequence_parallel_enabled": False, - "skip_warmup": True, - "lora_modules": [{ - "name": "lora_id_1", - "path": sql_lora_files - }] - }, - enable_lora=True, - max_loras=1, - max_lora_rank=256, - device="neuron") - """For multi-lora requests using NxDI as the backend, only the lora_name - needs to be specified. The lora_id and lora_path are supplied at the LLM - class/server initialization, after which the paths are handled by NxDI""" - lora_req_1 = LoRARequest("lora_id_1", 0, " ") - prompts = [ - "The president of the United States is", - "The capital of France is", - ] - outputs = llm.generate(prompts, - SamplingParams(top_k=1), - lora_request=[lora_req_1, lora_req_1]) - - expected_outputs = [ - " the head of state and head of government of the United States. " - "The president direct", - " a city of contrasts. 
The city is home to the Eiffel Tower" - ] - - for expected_output, output in zip(expected_outputs, outputs): - generated_text = output.outputs[0].text - assert (expected_output == generated_text) - - -def test_llama_multiple_lora(): - sql_lora_files = snapshot_download( - repo_id="yard1/llama-2-7b-sql-lora-test") - llm = LLM(model="meta-llama/Llama-2-7b-hf", - tensor_parallel_size=2, - max_num_seqs=4, - max_model_len=512, - use_v2_block_manager=True, - override_neuron_config={ - "sequence_parallel_enabled": - False, - "skip_warmup": - True, - "lora_modules": [{ - "name": "lora_id_1", - "path": sql_lora_files - }, { - "name": "lora_id_2", - "path": sql_lora_files - }] - }, - enable_lora=True, - max_loras=2, - max_lora_rank=256, - device="neuron") - """For multi-lora requests using NxDI as the backend, only the lora_name - needs to be specified. The lora_id and lora_path are supplied at the LLM - class/server initialization, after which the paths are handled by NxDI""" - lora_req_1 = LoRARequest("lora_id_1", 0, " ") - lora_req_2 = LoRARequest("lora_id_2", 1, " ") - prompts = [ - "The president of the United States is", - "The capital of France is", - ] - outputs = llm.generate(prompts, - SamplingParams(top_k=1), - lora_request=[lora_req_1, lora_req_2]) - - expected_outputs = [ - " the head of state and head of government of the United States. " - "The president direct", - " a city of contrasts. The city is home to the Eiffel Tower" - ] - - for expected_output, output in zip(expected_outputs, outputs): - generated_text = output.outputs[0].text - assert (expected_output == generated_text) diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py deleted file mode 100644 index 29fa43201761..000000000000 --- a/vllm/attention/ops/nki_flash_attn.py +++ /dev/null @@ -1,903 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import neuronxcc.nki.isa as nisa -import neuronxcc.nki.language as nl -import numpy as np -import torch -from neuronxcc import nki -from neuronxcc.nki.language import par_dim - -from vllm.utils import cdiv - - -def is_power_of_2(x): - return x > 0 and (x & (x - 1)) == 0 - - -@nki.jit -def load_block_tables(block_tables_hbm, num_tiles, num_blocks_per_tile): - """ - Load block tables from HBM into SRAM - - `block_tables_hbm` has shape `(num_tiles * num_blocks_per_tile, )`. - In case `num_tiles > B_P_SIZE`, we need further tile `num_tile` dimension. - """ - B_P_SIZE = 128 - - # reshape as `(num_tiles, num_blocks_per_tile)` - assert len(block_tables_hbm.shape) == 1 - (num_total_blocks, ) = block_tables_hbm.shape - assert num_blocks_per_tile * num_tiles == num_total_blocks - block_tables_hbm = block_tables_hbm.reshape( - (num_tiles, num_blocks_per_tile)) - - block_tables_sbuf = nl.zeros( - (cdiv(num_tiles, B_P_SIZE), par_dim(B_P_SIZE), num_blocks_per_tile), - dtype=nl.int32, - ) - for i in nl.affine_range(cdiv(num_tiles, B_P_SIZE)): - i_p = nl.arange(B_P_SIZE)[:, None] - i_f = nl.arange(num_blocks_per_tile)[None, :] - block_tables_sbuf[i, i_p, i_f] = nl.load( - block_tables_hbm[i_p + i * B_P_SIZE, i_f], - dtype=nl.int32, - mask=(i_p + i * B_P_SIZE < num_tiles), - ) - return block_tables_sbuf - - -@nki.jit -def transform_block_tables_for_indirect_load( - block_tables, - block_size_tiling_factor, - num_head, - head_id, -): - """ - This function does two things: - 1. 
calculate new `block_tables` for a `head_id` after flattening - `num_block`, `num_head`, and `block_size_tiling_factor` dimensions - 2. transpose the result so that `block_table` for each tile is mapped to - SBUF Partition dimension for vectorized DMA - - Tiling trick to further improve DMA performance: - Given KV cache shape `(num_block, num_head, block_size, D)`, when loading M - blocks of a given `head_id` from HBM, the load `cache[block_tables, - head_id]` has shape `(M, block_size, D)`. If M < B_P_SIZE = 128, DMA may not - fully utilize hardware parallelization. The solution is to tile `block_size` - into `(block_size_tiling_factor, tiled_block_size)` s.t. `M * - block_size_tiling_factor = B_P_SIZE`. After tiling, KV cache has shape - `(num_block, num_head, block_size_tiling_factor, tiled_block_size, D)`. - - Note: - We don't further tile D dimension as small DMA size also hurts performance. - """ - B_P_SIZE = 128 - num_partitions, num_tiles_per_partition, num_blocks_per_tile = ( - block_tables.shape) - assert num_tiles_per_partition == B_P_SIZE - assert is_power_of_2( - num_blocks_per_tile), f"{num_blocks_per_tile=} is not power of 2" - - num_loads = cdiv(num_blocks_per_tile, B_P_SIZE) - block_tables_transposed = nl.ndarray( - ( - num_loads, - par_dim(B_P_SIZE), - num_partitions * num_tiles_per_partition, - ), - dtype=nl.int32, - ) - - # prepare iota ahead of time to avoid repeatedly using Gpsimd - if num_head > 1: - head_id = nisa.iota(head_id, dtype=nl.int32).reshape((1, 1)) - head_id = nl.transpose( - head_id.broadcast_to((1, num_tiles_per_partition))) - if num_blocks_per_tile > 1: - head_id = head_id.broadcast_to( - (num_tiles_per_partition, num_blocks_per_tile)) - - if block_size_tiling_factor > 1: - broadcast_shape = ( - num_tiles_per_partition, - num_blocks_per_tile, - block_size_tiling_factor, - ) - offset = nisa.iota(nl.arange(block_size_tiling_factor)[None, None, :], - dtype=nl.int32).broadcast_to(broadcast_shape) - - for partition_id in nl.affine_range(num_partitions): - block_tables_partition = block_tables[partition_id] - if num_head > 1: - # fuse num_block and num_head dimension - block_tables_partition = block_tables_partition * num_head + head_id - - if block_size_tiling_factor > 1: - # need to apply block size tiling trick - assert num_blocks_per_tile * block_size_tiling_factor == B_P_SIZE - block_tables_partition = ((block_tables_partition * - block_size_tiling_factor).reshape( - (num_tiles_per_partition, - num_blocks_per_tile, - 1)).broadcast_to(broadcast_shape)) - new_block_tables = block_tables_partition + offset - new_block_tables = new_block_tables.reshape( - (num_tiles_per_partition, B_P_SIZE)) - else: - new_block_tables = block_tables_partition - - # transpose the block table so that it can be used by vector DGE - for i in nl.affine_range(num_loads): - i_p = nl.arange(B_P_SIZE)[:, None] - i_f = (partition_id * num_tiles_per_partition + - nl.arange(num_tiles_per_partition)[None, :]) - block_tables_transposed[i, i_p, i_f] = nl.transpose( - new_block_tables[:, nl.ds(i * B_P_SIZE, B_P_SIZE)]) - return block_tables_transposed - - -@nki.jit -def load_kv_tile_from_cache( - cur_k_tile, - cur_v_tile, - kv_cache, - block_tables, - large_k_tile_idx, - num_blocks_per_large_tile, - tiled_block_size, - B_P_SIZE, - B_D_SIZE, -): - """ - Load KV cache and transform Key and Value into layout required by Matmul - - Vectorized DMA Load layout: - Key and Value: (par_dim(B_P_SIZE), seqlen_kv // B_P_SIZE * B_D_SIZE) - - Layout used by attention matmuls: - Key: (par_dim(B_D_SIZE), 
seqlen_kv) - Value: (seqlen_kv // B_P_SIZE, par_dim(B_P_SIZE), B_D_SIZE) - equivalent to (par_dim(B_P_SIZE), seqlen_kv // B_P_SIZE * B_D_SIZE) - """ - # load key cache - num_loads = cdiv(num_blocks_per_large_tile, B_P_SIZE) - for load_idx in nl.affine_range(num_loads): - i_p = nl.arange(B_P_SIZE)[:, None] - i_f = nl.arange(tiled_block_size * B_D_SIZE)[None, :] - loaded = nl.load(kv_cache[0, block_tables[load_idx, i_p, - large_k_tile_idx], i_f]) - if cur_k_tile.dtype != loaded.dtype: - loaded = nl.copy(loaded, dtype=cur_k_tile.dtype) - # Transpose SBUF tensor using PE - for tb_i in nl.affine_range(tiled_block_size): - cur_k_tile[ - :, - nl.ds( - load_idx * B_P_SIZE * tiled_block_size + tb_i * B_P_SIZE, - B_P_SIZE, - ), - ] = nl.transpose(loaded[:, nl.ds(tb_i * B_D_SIZE, B_D_SIZE)]) - - # load value cache - for load_idx in nl.affine_range(num_loads): - loaded = nl.load(kv_cache[1, block_tables[load_idx, i_p, - large_k_tile_idx], i_f]) - if cur_v_tile.dtype != loaded.dtype: - loaded = nl.copy(loaded, dtype=cur_v_tile.dtype) - i_p = nl.arange(B_P_SIZE)[:, None] - i_f = nl.arange(tiled_block_size * B_D_SIZE)[None, :] - cur_v_tile[ - :, - nl.ds( - load_idx * tiled_block_size * B_D_SIZE, - tiled_block_size * B_D_SIZE, - ), - ] = loaded - - -@nki.jit -def transpose_p_local(p_local_transposed, - p_local, - LARGE_TILE_SZ, - B_F_SIZE=512): - for i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE): - if nisa.get_nc_version() == nisa.nc_version.gen3: - p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE), - buffer=nl.sbuf, - dtype=p_local.dtype) - else: - p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE), - buffer=nl.psum, - dtype=np.float32) - - for j in nl.affine_range(B_F_SIZE // 128): - j_128_slice = nl.ds(j * 128, 128) - i_j_128_slice = nl.ds(i * B_F_SIZE + j * 128, 128) - - if nisa.get_nc_version() == nisa.nc_version.gen3: - p_local_t_tmp[:, j_128_slice] = nisa.dma_transpose( - p_local[:, i_j_128_slice]) - else: - p_local_t_tmp[:, j_128_slice] = nisa.nc_transpose( - p_local[:, i_j_128_slice]) - - p_local_transposed[:, nl.ds(i * B_F_SIZE, B_F_SIZE)] = nl.copy( - p_local_t_tmp, dtype=p_local_transposed.dtype) - - -@nki.jit -def _flash_attention_core( - q_local_tile, - k, - v, - o_buffer, - l_buffer, - m_buffer, - kernel_dtype, - acc_type, - tile_mask, - use_causal_mask, - q_tile_idx=None, - initialize=False, - LARGE_TILE_SZ=2048, - B_P_SIZE=128, - B_F_SIZE=512, - B_D_SIZE=128, - qk_res_buffer=None, -): - """ - The flash attention core function to calculate self attention between a tile - of q and a block of K and V. - The q_local_tile has (B_P_SIZE, B_D_SIZE) - The K and V have shape (B_D_SIZE, LARGE_TILE_SZ), whose free dimension will - be split into size B_F_SIZE tiles - - The results are stored in the following three buffers - o_buffer: (B_P_SIZE, d) - l_buffer: (B_P_SIZE, 1) - m_buffer: (B_P_SIZE, 1) - - All IO buffers are in SBUF. 
- """ - num_k_tile_per_large_tile = LARGE_TILE_SZ // B_F_SIZE - - qk_res_buf = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), - buffer=nl.sbuf, - dtype=acc_type) - max_local = nl.ndarray((par_dim(B_P_SIZE), num_k_tile_per_large_tile), - dtype=acc_type) - for k_i in nl.affine_range(num_k_tile_per_large_tile): - k_i_b_f_slice = nl.ds(k_i * B_F_SIZE, B_F_SIZE) - - if use_causal_mask: - # mask are used to only apply computation to the lower half of the - # matrix, which reduce the arithmetic intensity by up to 50% - multiplication_required_selection = (q_tile_idx * B_P_SIZE - >= k_i * B_F_SIZE) - else: - multiplication_required_selection = True - - if multiplication_required_selection: - qk_psum = nl.ndarray((par_dim(B_P_SIZE), B_F_SIZE), - dtype=np.float32, - buffer=nl.psum) # (128, 512) - qk_psum[:, :] = nl.matmul(q_local_tile, - k[:, k_i_b_f_slice], - transpose_x=True) # (p(128), 512) - qk_res_buf[:, k_i_b_f_slice] = nl.where( - tile_mask[:, k_i_b_f_slice], - qk_psum[:, nl.ds(0, B_F_SIZE)], - -9984.0, - dtype=acc_type, - ) - else: - qk_res_buf[:, k_i_b_f_slice] = -9984.0 - - # Calculate max of the current tile - max_local[:, k_i] = nisa.tensor_reduce( - np.max, - qk_res_buf[:, k_i_b_f_slice], - axis=(1, ), - dtype=acc_type, - negate=False, - ) - - if qk_res_buffer is not None: - qk_res_buffer[:, :] = nl.copy(qk_res_buf[:, :]) - - max_ = nisa.tensor_reduce( - np.max, - max_local[:, :], - axis=(1, ), - dtype=acc_type, - negate=False, - ) - - o_previous_scaled = nl.ndarray((par_dim(B_P_SIZE), B_D_SIZE), - dtype=o_buffer.dtype) - - if initialize: - m_buffer[:, 0] = nl.copy(max_) - m_current = max_ - else: - m_previous = nl.copy(m_buffer[:, 0]) - m_buffer[:, 0] = nl.maximum(m_previous, max_) # (128,1) - - m_current = m_buffer[:, 0] - # Compute scaling factor - alpha = nisa.activation( - np.exp, - m_previous, - bias=-1 * m_current, - scale=1.0, - ) - o_previous_scaled[...] = nl.multiply(o_buffer[:, :], alpha) - - p_local = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), - dtype=kernel_dtype) - REDUCTION_TILE = min(2048, LARGE_TILE_SZ // 2) - - p_partial_sum = nl.ndarray( - (par_dim(B_P_SIZE), LARGE_TILE_SZ // REDUCTION_TILE), - dtype=acc_type, - ) - - for k_r_i in nl.affine_range(LARGE_TILE_SZ // REDUCTION_TILE): - k_r_i_reduce_slice = nl.ds(k_r_i * REDUCTION_TILE, REDUCTION_TILE) - - # compute exp(qk - max) - # Compute partial row - tile sum of exp(qk - max)) - # FIXME : Use activation accumulate to accumulate over k_r_i loop ? 
- p_local[:, k_r_i_reduce_slice] = nisa.activation_reduce( - np.exp, - qk_res_buf[:, k_r_i_reduce_slice], - bias=-1 * m_current, - scale=1.0, - reduce_op=nl.add, - reduce_res=p_partial_sum[:, k_r_i], - dtype=kernel_dtype, - ) - - ps = nl.sum(p_partial_sum, axis=1, dtype=acc_type) - - p_local_transposed = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), - dtype=kernel_dtype) - transpose_p_local( - p_local_transposed=p_local_transposed, - p_local=p_local, - LARGE_TILE_SZ=LARGE_TILE_SZ, - B_F_SIZE=B_F_SIZE, - ) - - pv_psum = nl.zeros( - (par_dim(B_P_SIZE), B_D_SIZE), - dtype=np.float32, - buffer=nl.psum, - ) - for k_i in nl.affine_range(LARGE_TILE_SZ // B_P_SIZE): - pv_psum[:, :] += nl.matmul( - p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)], - v[:, nl.ds(k_i * B_D_SIZE, B_D_SIZE)], - transpose_x=True, - ) # (128, 128) (p(Br), d) - - if initialize: - o_buffer[:, :] = nl.copy(pv_psum[:, :]) - l_buffer[:, 0] = nl.add(nl.log(ps), max_) - else: - o_buffer[:, :] = nl.add(o_previous_scaled, pv_psum) - - l_prev = l_buffer[:, 0] - l_exp = nl.add( - nl.exp(nl.subtract(l_prev, m_current)), - ps, - ) - l_buffer[:, 0] = nl.add(m_current, nl.log(l_exp)) - - -@nki.jit -def load_v_tile(v_hbm_tile, cur_v_tile, large_tile_idx, v_i, LARGE_TILE_SZ): - B_P_SIZE = 128 - B_D_SIZE = v_hbm_tile.shape[-1] - loaded = nl.load(v_hbm_tile[ - nl.ds(large_tile_idx * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE), - :, - ]) - if cur_v_tile.dtype != loaded.dtype: - loaded = nl.copy(loaded, dtype=cur_v_tile.dtype) - cur_v_tile[:, nl.ds(v_i * B_D_SIZE, B_D_SIZE)] = loaded - - -@nki.jit -def flash_paged_attention( - query, - key, - value, - kv_cache, - block_tables, - mask, - softmax_scale=None, - mixed_precision=True, - LARGE_TILE_SZ=2048, - return_debug_tensors=False, -): - """ - Flash PagedAttention Forward Kernel. - - IO tensor layouts: - - query: shape (1, n_heads, d, seq_q) - - key: shape (1, n_kv_heads, d, seq_k) - - value: shape (1, n_kv_heads, seq_v, d) - - kv_cache: (2, num_blocks, n_kv_heads, block_size, d) - - block_tables: (num_active_blocks, ) - - mask: (seq_q, num_active_blocks * block_size + seq_q) - - o: shape (1, n_heads, seq_q, d) - - - This kernel requires seq_k == seq_v - - We use continuous batching by default, so the batch dimension is - always 1, and different requests are concatenated along sequence - dimension. - - We use paged cache blocks (kv_cache) to store KV cache. - - IO tensor dtypes: - - This kernel assumes all IO tensors have the same dtype except for - block_tables (int32) and mask (int32) - - If mixed_precision is True, then all Tensor Engine operation will be - performed in bfloat16 and accumulation will be performed in float32. - Otherwise the intermediates will be in the same type as the inputs. 
- - Compile-time Constants: - - softmax_scale: scaling for softmax, is None, default is `1.0/(d**0.5)` - - mixed_precision: flag to set non-matmul ops in fp32 precision, default - is set to `true`, if false, we use same precision as input types - - LARGE_TILE_SZ: `default=2048`, size of the kv tile size for attention - computation reduction - - GQA support Notes: - the spmd kernel for launching kernel should be on kv_heads instead of - nheads - - Example usage: - MHA: q: [b, h, d, s], k: [b, h, d, s], v: [b, h, s, d] - usage: `flash_fwd[b, h](q, k, v, ...)` - GQA: q: [b, h, d, s], k: [b, kv_h, d, s], v: [b, kv_h, s, d] - usage: `flash_fwd[b, kv_h](q, k, v, ...)` - """ - B_F_SIZE = 512 - B_P_SIZE = 128 - b, h, d, seqlen_q = query.shape - B_D_SIZE = d - n_tile_q = seqlen_q // B_P_SIZE # since q will be loaded on tensor engine - _, num_blocks, k_h, block_size, _ = kv_cache.shape - q_h_per_k_h = h // k_h - assert b == 1, f"invalid batch size {b=}" - assert d <= 128, f" we do not support head_dim > 128, got head dim {d=}" - cache_shape = (2, num_blocks, k_h, block_size, d) - assert (tuple(kv_cache.shape) == cache_shape - ), f"{kv_cache.shape=} mismatch, expect {cache_shape}" - assert key is None or tuple(key.shape) == ( - 1, - k_h, - d, - seqlen_q, - ), f"key shape {key.shape} mismatch!" - assert value is None or tuple(value.shape) == ( - 1, - k_h, - seqlen_q, - d, - ), f"value shape {value.shape} mismatch!" - - assert ( - nl.program_ndim() == 2 - ), f"Expect spmd grid with 2 dimensions, got {nl.program_ndim()} instead!" - batch_id = nl.program_id(axis=0) - head_id = nl.program_id(axis=1) - - (num_active_blocks, ) = block_tables.shape - context_kv_len = num_active_blocks * block_size - assert ( - LARGE_TILE_SZ % B_F_SIZE == 0 - ), f"Need {LARGE_TILE_SZ=} to be divisible by {B_F_SIZE=} in transpose_p" - assert (context_kv_len % LARGE_TILE_SZ == 0 - ), f"Need {context_kv_len=} to be divisible by {LARGE_TILE_SZ=}" - - num_blocks_per_large_tile = LARGE_TILE_SZ // block_size - assert is_power_of_2( - num_blocks_per_large_tile - ), f"{num_blocks_per_large_tile=} is expected of be power of 2" - if seqlen_q > B_F_SIZE: - MAX_REDUCTION_TILE = 2048 - if seqlen_q // 2 > MAX_REDUCTION_TILE: - assert ( - seqlen_q % MAX_REDUCTION_TILE == 0 - ), f"{seqlen_q=} should be divisible by {MAX_REDUCTION_TILE=}" - else: - assert (seqlen_q % B_F_SIZE == 0 - ), f"{seqlen_q=} should be divisible by {B_F_SIZE=})" - - kernel_dtype = nl.bfloat16 if mixed_precision else query.dtype - acc_type = np.dtype(np.float32) if mixed_precision else kernel_dtype - softmax_scale = softmax_scale or (1.0 / (d**0.5)) - num_large_k_tile = context_kv_len // LARGE_TILE_SZ - - o = nl.ndarray((b, h, seqlen_q, d), - dtype=query.dtype, - buffer=nl.shared_hbm) - hbm_l_buffer, hbm_m_buffer, hbm_qk_res, qk_res_buffer = ( - None, - None, - None, - None, - ) - if return_debug_tensors: - hbm_l_buffer = nl.ndarray((b, h, seqlen_q), - dtype=acc_type, - buffer=nl.shared_hbm) - hbm_m_buffer = nl.ndarray((b, h, seqlen_q), - dtype=acc_type, - buffer=nl.shared_hbm) - hbm_qk_res = nl.ndarray((b, h, B_P_SIZE, seqlen_q), - dtype=acc_type, - buffer=nl.shared_hbm) - qk_res_buffer = nl.zeros( - (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), seqlen_q), - dtype=acc_type, - buffer=nl.sbuf, - lazy_initialization=True, - ) - block_tables_sbuf = load_block_tables( - block_tables_hbm=block_tables, - num_tiles=num_large_k_tile, - num_blocks_per_tile=num_blocks_per_large_tile, - ) - - # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient - if 
num_blocks_per_large_tile < B_P_SIZE: - # we checked num_blocks_per_tile is a power of 2 - assert B_P_SIZE % num_blocks_per_large_tile == 0 - block_size_tiling_factor = B_P_SIZE // num_blocks_per_large_tile - # We assume block_size >= block_size_tiling_factor - assert block_size % block_size_tiling_factor == 0 - else: - block_size_tiling_factor = 1 - tiled_block_size = block_size // block_size_tiling_factor - - # Indirect DMA load must be placed along Partition Dimension - block_tables_sbuf = transform_block_tables_for_indirect_load( - block_tables_sbuf, - block_size_tiling_factor=block_size_tiling_factor, - num_head=k_h, - head_id=head_id, - ) - - # Flatten KV cache to be 3D for loading into SBUF - new_cache_shape = ( - 2, - num_blocks * k_h * block_size_tiling_factor, - tiled_block_size * d, - ) - kv_cache = kv_cache.reshape(new_cache_shape) - - # Global Flash Attention accumulators - o_buffer = nl.zeros( - (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), d), - dtype=acc_type, - buffer=nl.sbuf, - lazy_initialization=True, - ) - l_buffer = nl.zeros( - (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), 1), - dtype=acc_type, - buffer=nl.sbuf, - lazy_initialization=True, - ) - m_buffer = nl.zeros( - (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), 1), - dtype=acc_type, - buffer=nl.sbuf, - lazy_initialization=True, - ) - - for large_k_tile_idx in nl.sequential_range(0, num_large_k_tile): - num_loads = cdiv(num_blocks_per_large_tile, B_P_SIZE) - cur_k_tile = nl.ndarray( - (par_dim(B_D_SIZE), LARGE_TILE_SZ), - dtype=kernel_dtype, - ) - cur_v_tile = nl.ndarray( - (par_dim(B_P_SIZE), num_loads * tiled_block_size * B_D_SIZE), - dtype=kernel_dtype, - ) - load_kv_tile_from_cache( - cur_k_tile=cur_k_tile, - cur_v_tile=cur_v_tile, - kv_cache=kv_cache, - block_tables=block_tables_sbuf, - large_k_tile_idx=large_k_tile_idx, - num_blocks_per_large_tile=num_blocks_per_large_tile, - tiled_block_size=tiled_block_size, - B_P_SIZE=B_P_SIZE, - B_D_SIZE=B_D_SIZE, - ) - - for i in nl.affine_range(n_tile_q): - cur_mask = nl.load(mask[ - nl.ds(i * B_P_SIZE, B_P_SIZE), - nl.ds(large_k_tile_idx * LARGE_TILE_SZ, LARGE_TILE_SZ), - ]) - for i_q_h in nl.affine_range(q_h_per_k_h): - q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) - q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h] - q_sbuf_tile = nl.load(q_hbm_tile[:, - nl.ds(i * - B_P_SIZE, B_P_SIZE)]) - if q_sbuf_tile.dtype != kernel_dtype: - q_sbuf_tile = nl.copy(q_sbuf_tile, dtype=kernel_dtype) - q_tile[:, :] = q_sbuf_tile * softmax_scale - - _flash_attention_core( - q_local_tile=q_tile, - k=cur_k_tile, - v=cur_v_tile, - o_buffer=o_buffer[i, i_q_h], - l_buffer=l_buffer[i, i_q_h], - m_buffer=m_buffer[i, i_q_h], - kernel_dtype=kernel_dtype, - acc_type=acc_type, - tile_mask=cur_mask, - use_causal_mask=False, - q_tile_idx=i, - initialize=large_k_tile_idx == 0, - LARGE_TILE_SZ=LARGE_TILE_SZ, - B_P_SIZE=B_P_SIZE, - B_F_SIZE=B_F_SIZE, - B_D_SIZE=B_D_SIZE, - ) - - # compute attention between input query, key and value - if key is not None and value is not None: - B_F_SIZE = min(seqlen_q, B_F_SIZE) - LARGE_TILE_SZ = seqlen_q - - cur_k_tile = nl.ndarray((par_dim(B_D_SIZE), LARGE_TILE_SZ), - dtype=kernel_dtype) - cur_v_tile = nl.ndarray( - (par_dim(B_P_SIZE), LARGE_TILE_SZ // B_P_SIZE * B_D_SIZE), - dtype=kernel_dtype, - ) - - loaded = nl.load(key[batch_id, head_id, :, :]) - if loaded.dtype != kernel_dtype: - loaded = nl.copy(loaded, dtype=kernel_dtype) - cur_k_tile[:, :] = loaded - - v_hbm_tile = value[batch_id, head_id] - for v_i in nl.affine_range(LARGE_TILE_SZ // 
B_P_SIZE): - load_v_tile( - v_hbm_tile=v_hbm_tile, - cur_v_tile=cur_v_tile, - large_tile_idx=0, - v_i=v_i, - LARGE_TILE_SZ=LARGE_TILE_SZ, - ) - - for i in nl.affine_range(n_tile_q): - cur_mask = nl.load(mask[ - nl.ds(i * B_P_SIZE, B_P_SIZE), - nl.ds(context_kv_len, LARGE_TILE_SZ), - ]) - for i_q_h in nl.affine_range(q_h_per_k_h): - - q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) - q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h] - q_sbuf_tile = nl.load(q_hbm_tile[:, - nl.ds(i * - B_P_SIZE, B_P_SIZE)]) - if q_sbuf_tile.dtype != kernel_dtype: - q_sbuf_tile = nl.copy(q_sbuf_tile, dtype=kernel_dtype) - q_tile[:, :] = q_sbuf_tile * softmax_scale - _flash_attention_core( - q_local_tile=q_tile, - k=cur_k_tile, - v=cur_v_tile, - o_buffer=o_buffer[i, i_q_h], - l_buffer=l_buffer[i, i_q_h], - m_buffer=m_buffer[i, i_q_h], - kernel_dtype=kernel_dtype, - acc_type=acc_type, - tile_mask=cur_mask, - use_causal_mask=True, - q_tile_idx=i, - initialize=False, - LARGE_TILE_SZ=LARGE_TILE_SZ, - B_P_SIZE=B_P_SIZE, - B_F_SIZE=B_F_SIZE, - B_D_SIZE=B_D_SIZE, - qk_res_buffer=(qk_res_buffer[i, i_q_h] - if qk_res_buffer is not None else None), - ) - - # -- -- -- -- write output to buffer on HBM -- -- -- -- -- -- # - for i_q_h in nl.affine_range(q_h_per_k_h): - for i in nl.affine_range(n_tile_q): - out = nl.multiply( - o_buffer[i, i_q_h], - nl.exp(m_buffer[i, i_q_h] - l_buffer[i, i_q_h]), - dtype=kernel_dtype, - ) - - nl.store( - o[ - batch_id, - head_id * q_h_per_k_h + i_q_h, - nl.ds(i * B_P_SIZE, B_P_SIZE), - :, - ], - out, - ) - # maximum and summation statistics - if return_debug_tensors: - nl.store( - hbm_m_buffer[ - batch_id, - head_id * q_h_per_k_h + i_q_h, - nl.ds(i * B_P_SIZE, B_P_SIZE), - ], - m_buffer[i, i_q_h, :, :], - ) - nl.store( - hbm_l_buffer[ - batch_id, - head_id * q_h_per_k_h + i_q_h, - nl.ds(i * B_P_SIZE, B_P_SIZE), - ], - l_buffer[i, i_q_h], - ) - nl.store( - hbm_qk_res[batch_id, head_id * q_h_per_k_h + i_q_h, :, :], - qk_res_buffer[batch_id, i_q_h, :, :], - ) - - if return_debug_tensors: - return o, hbm_m_buffer, hbm_l_buffer, hbm_qk_res - return o - - -def reorder_context_mask(mask, LARGE_TILE_SZ, block_size): - """ - Reorder the mask to make it compatible with the flash attention kernel. - - We vectorize KV cache read to improve DMA utilization. However, the layout - that maximizes DMA bandwidth changes the order tokens are consumed. - - The token layout (inner 2 dimensions) after vectorized load is (B_P_SIZE, - tiled_block_size) in a tile of `B_P_SIZE * tiled_block_size` tokens. And - each step the engine consumes a column (rather than a row) of B_P_SIZE - tokens. Therefore, the tokens are visited in a strided way. - - To make sure mask matches the order tokens are consumed, we need to properly - transpose mask. 
- """ - total_query_len, total_seq_len = mask.shape - context_kv_len = total_seq_len - total_query_len - - B_P_SIZE = 128 - assert (LARGE_TILE_SZ - >= B_P_SIZE), f"{LARGE_TILE_SZ=} must be larger than {B_P_SIZE=}" - num_tiled_blocks = max(B_P_SIZE, LARGE_TILE_SZ // block_size) - tiled_block_size = LARGE_TILE_SZ // num_tiled_blocks - if tiled_block_size > 1: - # Mask reordering is needed when tiled_block_size > 1 - device = mask.device - mask = mask.cpu() - context_mask = mask[:, :context_kv_len] - context_mask = context_mask.view( - total_query_len, - context_kv_len // LARGE_TILE_SZ, - num_tiled_blocks // B_P_SIZE, - B_P_SIZE, - tiled_block_size, - ) - context_mask = context_mask.transpose(3, 4).reshape( - total_query_len, context_kv_len) - new_mask = mask[:, context_kv_len:] - return torch.concat([context_mask, new_mask], dim=1).to(device) - else: - return mask - - -def flash_attn_varlen_nkifunc( - query, - key, - value, - kv_cache, - block_table, - attn_mask, - n_kv_head=None, - head_size=None, - LARGE_TILE_SZ=2048, - mixed_precision=True, -): - """ - Compute flash paged attention for variable length sequences. - - This function is a wrapper around the flash attention NKI kernel. It takes - in the following arguments: - - query: (1, n_heads, d, seq_q) - - key: (1, n_kv_heads, d, seq_k) - - value: (1, n_kv_heads, seq_v, d) - - kv_cache: (2, n_blocks, n_kv_heads, block_size, d) - - block_tables: (n_active_blocks, ) - - attn_mask: (seq_q, n_active_blocks * block_size + seq_q) - - Notes: - - attn_mask must be reordered outside using `reorder_context_mask` - - Key/value cache layout must be (n_blocks, n_kv_heads, block_size, d) - for better DMA throughput - """ - if n_kv_head is None: - n_kv_head = kv_cache.shape[2] - assert kv_cache.shape[0] == 2 - assert kv_cache.shape[2] == n_kv_head - if head_size is None: - head_size = kv_cache.shape[-1] - - kwargs = dict( - query=query, - key=key, - value=value, - kv_cache=kv_cache, - block_tables=block_table, - mask=attn_mask, - softmax_scale=1.0 / (head_size**0.5), - mixed_precision=mixed_precision, - LARGE_TILE_SZ=LARGE_TILE_SZ, - ) - - o = flash_paged_attention[1, n_kv_head](**kwargs) - return o - - -def reshape_and_cache( - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - slot_mapping: torch.Tensor, -) -> None: - """ - Writes key-value pairs to the KV cache at specified positions. 
- - Args: - key (torch.Tensor): Key tensor with shape - (num_tokens, n_kv_head, d_head) - value (torch.Tensor): Value tensor with shape - (num_tokens, n_kv_head, d_head) - kv_cache (torch.Tensor): Key/value cache tensor with shape - (2, num_blocks, n_kv_head, block_size, d_head) - slot_mapping (torch.Tensor): Mapping tensor indicating cache positions - with shape (num_tokens) - - Returns: - None: Updates the kv_cache tensor in-place - """ - block_size = kv_cache.size(3) - n_kv_head = key.size(1) - - # Calculate indices with explicit floor division - block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_offsets = slot_mapping % block_size - - # Create the head indices tensor - head_indices = torch.arange(n_kv_head, device=key.device) - - # Update caches using index_put_ - kv_cache.index_put_( - (torch.tensor([0], device=key.device), block_indices[:, None], - head_indices[None, :], block_offsets[:, None]), key) - - kv_cache.index_put_( - (torch.tensor([1], device=key.device), block_indices[:, None], - head_indices[None, :], block_offsets[:, None]), value) diff --git a/vllm/collect_env.py b/vllm/collect_env.py index ee43ad12e8a5..5b3883a482f2 100644 --- a/vllm/collect_env.py +++ b/vllm/collect_env.py @@ -54,7 +54,6 @@ 'is_xnnpack_available', 'cpu_info', 'rocm_version', # vllm specific field - 'neuron_sdk_version', # vllm specific field 'vllm_version', # vllm specific field 'vllm_build_flags', # vllm specific field 'gpu_topo', # vllm specific field @@ -275,15 +274,6 @@ def get_rocm_version(run_lambda): r'HIP version: (\S+)') -def get_neuron_sdk_version(run_lambda): - # Adapted from your install script - try: - result = run_lambda(["neuron-ls"]) - return result if result[0] == 0 else 'N/A' - except Exception: - return 'N/A' - - def get_vllm_version(): from vllm import __version__, __version_tuple__ @@ -306,10 +296,9 @@ def get_vllm_version(): def summarize_vllm_build_flags(): # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc. 
- return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format( + return 'CUDA Archs: {}; ROCm: {}'.format( os.environ.get('TORCH_CUDA_ARCH_LIST', 'Not Set'), 'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled', - 'Enabled' if os.environ.get('NEURON_CORES') else 'Disabled', ) @@ -601,7 +590,6 @@ def get_version_or_na(cfg, prefix): conda_packages = get_conda_packages(run_lambda) rocm_version = get_rocm_version(run_lambda) - neuron_sdk_version = get_neuron_sdk_version(run_lambda) vllm_version = get_vllm_version() vllm_build_flags = summarize_vllm_build_flags() gpu_topo = get_gpu_topo(run_lambda) @@ -635,7 +623,6 @@ def get_version_or_na(cfg, prefix): is_xnnpack_available=is_xnnpack_available(), cpu_info=get_cpu_info(run_lambda), rocm_version=rocm_version, - neuron_sdk_version=neuron_sdk_version, vllm_version=vllm_version, vllm_build_flags=vllm_build_flags, gpu_topo=gpu_topo, @@ -702,7 +689,6 @@ def get_version_or_na(cfg, prefix): vLLM Info ============================== ROCM Version : {rocm_version} -Neuron SDK Version : {neuron_sdk_version} vLLM Version : {vllm_version} vLLM Build Flags: {vllm_build_flags} diff --git a/vllm/config.py b/vllm/config.py index 526b5db235fd..253ecde81668 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -376,11 +376,6 @@ class ModelConfig: disable_mm_preprocessor_cache: bool = False """If `True`, disable caching of the multi-modal preprocessor/mapper (not recommended).""" - override_neuron_config: dict[str, Any] = field(default_factory=dict) - """Initialize non-default neuron config or override default neuron config - that are specific to Neuron devices, this argument will be used to - configure the neuron config that can not be gathered from the vllm - arguments. e.g. `{"cast_logits_dtype": "bfloat16"}`.""" pooler_config: Optional["PoolerConfig"] = field(init=False) """Pooler config which controls the behaviour of output pooling in pooling models.""" @@ -650,10 +645,6 @@ def __post_init__(self) -> None: self.has_noops = self._init_has_noops() self.has_inner_state = self._init_has_inner_state() - if (not current_platform.is_neuron() and self.override_neuron_config): - raise ValueError( - "`override_neuron_config` is only supported on Neuron.") - self._verify_quantization() self._verify_cuda_graph() self._verify_bnb_config() @@ -1493,13 +1484,7 @@ def is_encoder_decoder(self) -> bool: """ For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to True to enable cross-attention - Neuron needs all multimodal data to be in the decoder and does not - need to explicitly enable cross-attention """ - if (current_platform.is_neuron() - and self.hf_config.model_type == "mllama"): - return False - return is_encoder_decoder(self.hf_config) @property @@ -2075,10 +2060,7 @@ def __post_init__(self) -> None: from vllm.executor import ray_utils backend: DistributedExecutorBackend = "mp" ray_found = ray_utils.ray_is_available() - if current_platform.is_neuron(): - # neuron uses single process to control multiple devices - backend = "uni" - elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD: + if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD: backend = "uni" elif (current_platform.is_cuda() and cuda_device_count_stateless() < self.world_size): @@ -2452,7 +2434,7 @@ def is_multi_step(self) -> bool: return self.num_scheduler_steps > 1 -Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"] +Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"] @config @@ -2508,9 +2490,7 @@ def __post_init__(self): self.device_type = self.device.type # Some device 
types require processing inputs on CPU - if self.device_type in ["neuron"]: - self.device = torch.device("cpu") - elif self.device_type in ["tpu"]: + if self.device_type in ["tpu"]: self.device = None else: # Set device with device type @@ -4845,7 +4825,6 @@ def __str__(self): f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}," f" tokenizer_mode={self.model_config.tokenizer_mode}, " f"revision={self.model_config.revision}, " - f"override_neuron_config={self.model_config.override_neuron_config}," f" tokenizer_revision={self.model_config.tokenizer_revision}, " f"trust_remote_code={self.model_config.trust_remote_code}, " f"dtype={self.model_config.dtype}, " diff --git a/vllm/distributed/device_communicators/neuron_communicator.py b/vllm/distributed/device_communicators/neuron_communicator.py deleted file mode 100644 index 5b61a1687a01..000000000000 --- a/vllm/distributed/device_communicators/neuron_communicator.py +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import torch - -from vllm.distributed.device_communicators.base_device_communicator import ( - DeviceCommunicatorBase) -from vllm.platforms import current_platform - -if current_platform.is_neuron(): - import torch_xla.core.xla_model as xm - - -class NeuronCommunicator(DeviceCommunicatorBase): - - def all_reduce(self, x: torch.Tensor) -> torch.Tensor: - return xm.all_reduce(xm.REDUCE_SUM, x) - - def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor: - assert dim == -1, "Neuron only supports dim=-1 for all-gather." - return xm.all_gather(x, dim=dim) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b20defde73ed..6786d9a2b4e4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -408,8 +408,6 @@ class EngineArgs: scheduling_policy: SchedulerPolicy = SchedulerConfig.policy scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls - override_neuron_config: dict[str, Any] = \ - get_field(ModelConfig, "override_neuron_config") override_pooler_config: Optional[Union[dict, PoolerConfig]] = \ ModelConfig.override_pooler_config compilation_config: CompilationConfig = \ @@ -525,8 +523,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help=model_kwargs["hf_token"]["help"]) model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"]) - model_group.add_argument("--override-neuron-config", - **model_kwargs["override_neuron_config"]) model_group.add_argument("--override-pooler-config", **model_kwargs["override_pooler_config"]) model_group.add_argument("--logits-processor-pattern", @@ -929,7 +925,6 @@ def create_model_config(self) -> ModelConfig: config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache, - override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, generation_config=self.generation_config, diff --git a/vllm/envs.py b/vllm/envs.py index ba0c55160b7c..a10dcf72da81 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -200,7 +200,7 @@ def get_vllm_port() -> Optional[int]: # ================== Installation Time Env Vars ================== # Target device of vLLM, supporting [cuda (by default), - # rocm, neuron, cpu] + # rocm, cpu] "VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"), diff --git a/vllm/model_executor/custom_op.py 
b/vllm/model_executor/custom_op.py index f6e79cd676f8..70a0e37add5b 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -73,11 +73,6 @@ def forward_tpu(self, *args, **kwargs): # NOTE(woosuk): This is a placeholder for future extensions. return self.forward_native(*args, **kwargs) - def forward_neuron(self, *args, **kwargs): - # By default, we assume that Neuron ops are compatible with the - # PyTorch-native implementation. - return self.forward_native(*args, **kwargs) - def forward_oot(self, *args, **kwargs): # By default, we assume that OOT ops are compatible with the # PyTorch-native implementation. @@ -105,8 +100,6 @@ def dispatch_forward(self): return self.forward_tpu elif current_platform.is_xpu(): return self.forward_xpu - elif current_platform.is_neuron(): - return self.forward_neuron elif current_platform.is_out_of_tree(): return self.forward_oot else: diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 1fd96fe405b9..3f6116d893ee 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -90,13 +90,6 @@ def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: self.op(out, x) return out - def forward_neuron(self, x: torch.Tensor) -> torch.Tensor: - d = x.shape[-1] // 2 - x_reshaped = x.view(-1, x.shape[-1]) - s = x_reshaped[:, :d] * F.sigmoid(x_reshaped[:, :d]) - result = s * x_reshaped[:, d:] - return result.view(*x.shape[:-1], d) - @CustomOp.register("mul_and_silu") class MulAndSilu(CustomOp): diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 95aea912a150..a5325c96b120 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -29,7 +29,6 @@ "qqq", "hqq", "experts_int8", - "neuron_quant", "ipex", "quark", "moe_wna16", @@ -110,7 +109,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .marlin import MarlinConfig from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config from .moe_wna16 import MoeWNA16Config - from .neuron_quant import NeuronQuantConfig from .ptpc_fp8 import PTPCFp8Config from .qqq import QQQConfig from .rtn import RTNConfig @@ -140,7 +138,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "qqq": QQQConfig, "hqq": HQQMarlinConfig, "experts_int8": ExpertsInt8Config, - "neuron_quant": NeuronQuantConfig, "ipex": IPEXConfig, "quark": QuarkConfig, "moe_wna16": MoeWNA16Config, diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py deleted file mode 100644 index 8040236663dd..000000000000 --- a/vllm/model_executor/layers/quantization/neuron_quant.py +++ /dev/null @@ -1,76 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -from importlib.util import find_spec -from typing import Any, Optional - -from torch.nn import Module - -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) - -SUPPORTED_QUANT_DTYPE_LIST = ['s8', 'f8e4m3fn'] - - -class AlwaysSupportedDtypes(list): - - def __contains__(self, item): - return True - - -class NeuronQuantConfig(QuantizationConfig): - """Int8 Quantization Config class for Neuron Backend.""" - - def __init__( - self, - dequant_dtype: str = "f16", - 
quantize_method: str = "vector_dynamic", - ) -> None: - super().__init__() - self.quant_dtype = os.getenv("NEURON_QUANT_DTYPE", "s8") - if self.quant_dtype not in SUPPORTED_QUANT_DTYPE_LIST: - raise ValueError( - f"Neuron quantization datatype {self.quant_dtype} is not valid," - f" the quantization datatype should match one of the below " - f"types {SUPPORTED_QUANT_DTYPE_LIST}") - self.dequant_dtype = dequant_dtype - self.quantize_method = quantize_method - - def get_name(self) -> QuantizationMethods: - return "neuron_quant" - - def get_supported_act_dtypes(self) -> list[str]: - # Neuron implements custom handling logic for quantization support - return AlwaysSupportedDtypes() - - @classmethod - def get_min_capability(cls) -> int: - raise NotImplementedError( - "This function should not be called with Neuron Backend") - - @staticmethod - def get_config_filenames() -> list[str]: - return [] - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "NeuronQuantConfig": - quantize_method = cls.get_from_keys(config, ["quantize_method"]) - dequant_dtype = cls.get_from_keys(config, ["dequant_dtype"]) - return cls(dequant_dtype=dequant_dtype, - quantize_method=quantize_method) - - def get_quant_method(self, layer: Module, prefix: str) -> Optional[Any]: - if find_spec("transformers_neuronx") is not None: - return self.get_quantization_config() - else: - raise NotImplementedError( - "Neuron Quantization is only supported through" - " transformers_neuronx.") - - def get_quantization_config(self): - from transformers_neuronx.config import QuantizationConfig - return QuantizationConfig(quant_dtype=self.quant_dtype, - dequant_dtype=self.dequant_dtype, - quantize_method=self.quantize_method) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index dddd4d6a7117..d21b3d9863fd 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -229,85 +229,6 @@ def forward_xpu( self.cos_sin_cache, self.is_neox_style) return query, key - def forward_neuron( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - - def _apply_rotary_emb_neuron( - x: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - is_neox_style: bool, - ) -> torch.Tensor: - cos = cos.unsqueeze(-2).to(x.dtype) - sin = sin.unsqueeze(-2).to(x.dtype) - if is_neox_style: - x1, x2 = torch.chunk(x, 2, dim=-1) - else: - # x1 = x[..., ::2] - - # x2 = x[..., 1::2] - d = x.shape[-1] // 2 - x_reshaped = x.view(-1, x.shape[-1]) - x1 = x_reshaped[:, ::2].view(*x.shape[:-1], d) - x2 = x_reshaped[:, 1::2].view(*x.shape[:-1], d) - o1 = x1 * cos - x2 * sin - o2 = x2 * cos + x1 * sin - if is_neox_style: - return torch.cat((o1, o2), dim=-1) - else: - return torch.stack((o1, o2), dim=-1).flatten(-2) - - if offsets is not None: - positions = positions + offsets - - self.cos_sin_cache = self.cos_sin_cache.to(query.device, - dtype=query.dtype) - - positions = positions.flatten() - num_tokens = positions.shape[0] - cos_sin = self.cos_sin_cache.index_select(0, positions) - cos, sin = cos_sin.chunk(2, dim=-1) - - query_shape = query.shape - query = query.view(num_tokens, -1, self.head_size) - if key is not None: - key_shape = key.shape - key = key.view(num_tokens, -1, self.head_size) - - if self.rotary_dim == self.head_size: - query = _apply_rotary_emb(query, cos, sin, self.is_neox_style) - query = 
query.reshape(query_shape) - if key is not None: - key = _apply_rotary_emb(key, cos, sin, self.is_neox_style) - key = key.reshape(key_shape) - else: - head_size = query.shape[-1] - query_reshaped = query.view(-1, head_size) - query_pass = query_reshaped[:, self.rotary_dim:].view( - *query.shape[:-1], head_size - self.rotary_dim) - query_rot = query_reshaped[:, :self.rotary_dim].view( - *query.shape[:-1], self.rotary_dim) - query_rot = _apply_rotary_emb_neuron(query_rot, cos, sin, - self.is_neox_style) - query = torch.cat((query_rot, query_pass), - dim=-1).reshape(query_shape) - - if key is not None: - key_reshaped = key.view(-1, head_size) - key_pass = key_reshaped[:, self.rotary_dim:].view( - *key.shape[:-1], head_size - self.rotary_dim) - key_rot = key_reshaped[:, :self.rotary_dim].view( - *key.shape[:-1], self.rotary_dim) - key_rot = _apply_rotary_emb_neuron(key_rot, cos, sin, - self.is_neox_style) - key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) - return query, key - def extra_repr(self) -> str: s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" s += f", max_position_embeddings={self.max_position_embeddings}" diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py deleted file mode 100644 index fad97aba84b6..000000000000 --- a/vllm/model_executor/model_loader/neuron.py +++ /dev/null @@ -1,476 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Utilities for selecting and loading Neuron models in transformers-neuronx -framework.""" -import ast -import copy -import importlib -import os -from typing import Optional - -import torch -import torch.nn as nn -from transformers import PretrainedConfig - -from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig, - SpeculativeConfig) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import get_quantization_config -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, - SequenceOutput) - -TORCH_DTYPE_TO_NEURON_AMP = { - "auto": "f32", - "half": "f16", - "float16": "f16", - "bfloat16": "bf16", - "float": "f32", - "float32": "f32", - torch.float16: "f16", - torch.bfloat16: "bf16", - torch.float32: "f32", -} - -# Models supported by Neuron. 
-_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str, str]] = { - "LlamaForCausalLM": ("transformers_neuronx.llama.model", - "LlamaForSampling", "LlamaForCausalLM"), - "MistralForCausalLM": ("transformers_neuronx.mistral.model", - "MistralForSampling", "MistralForCausalLM") -} - - -class NeuronCausalLM(nn.Module): - - def __init__(self, - config: PretrainedConfig, - on_device_sampling_disabled: bool = False) -> None: - super().__init__() - self.config = config - self.logits_processor = LogitsProcessor(config.vocab_size, - logits_as_input=True) - - self.on_device_sampling_disabled = on_device_sampling_disabled - if self.on_device_sampling_disabled: - # Use default sampler - self.sampler = Sampler() - - # Lazy initialized - self.model: nn.Module - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - input_block_ids: torch.Tensor, - ) -> torch.Tensor: - logits = self.model(input_ids, - cache_ids=positions, - start_ids=input_block_ids) - return logits - - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(None, hidden_states, sampling_metadata) - return logits - - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - - if self.on_device_sampling_disabled: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - - # On-device sampling outputs the token ids directly. - sampled_token_ids = logits.flatten() - next_tokens = [] - sample_idx = 0 - for seq_group in sampling_metadata.seq_groups: - samples = [] - for seq_id in seq_group.seq_ids: - token_id = sampled_token_ids[sample_idx].item() - samples.append( - SequenceOutput(parent_seq_id=seq_id, - output_token=token_id, - logprobs={token_id: Logprob(token_id)})) - sample_idx += 1 - next_tokens.append( - CompletionSequenceGroupOutput(samples=samples, - prompt_logprobs=None)) - - return SamplerOutput(outputs=next_tokens) - - def load_weights(self, model_name_or_path: str, **kwargs): - arch = _get_model_architecture(self.config) - neuronx_module_path, neuronx_model_cls_name, hf_model_cls_name = ( - _NEURON_SUPPORTED_MODELS[arch]) - neuronx_module = importlib.import_module(neuronx_module_path) - neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name) - - self.model = neuronx_model_cls.from_pretrained(model_name_or_path, - **kwargs) - self.model.to_neuron() - - -class NeuronSpeculationCausalLM(nn.Module): - """A Neuron-optimized causal language model with speculative decoding.""" - - SPECULATION_TERMINATION_ID = -1 - - def __init__(self, speculation_model) -> None: - super().__init__() - self.model = speculation_model - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - input_block_ids: torch.Tensor, - ) -> torch.Tensor: - tokens, counts = self.model.speculative_iteration( - input_ids, positions, input_block_ids) - - # Mark the end of accepted speculative tokens for each sequence with the - # speculation termination id. - batch_size, steps = tokens.shape - mask = torch.arange(steps).expand(batch_size, -1) >= counts - tokens[mask] = self.SPECULATION_TERMINATION_ID - - return tokens - - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[list[SamplerOutput]]: - batch_size, num_steps = logits.shape - seq_ids = [ - seq_id for sg in sampling_metadata.seq_groups - for seq_id in sg.seq_ids - ] - # Organize input tensors by step instead of by sequence. 
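A small made-up example makes the by-step reorganization below concrete: with batch_size=2 and num_steps=3, transposing the per-sequence matrix of accepted token ids yields one list per decoding step, and the first step consisting entirely of SPECULATION_TERMINATION_ID (-1) ends the loop. The token values here are arbitrary.

import torch

tokens = torch.tensor([[11, 42, -1],    # sequence 0 accepted 2 speculative tokens
                       [23, -1, -1]])   # sequence 1 accepted 1 speculative token
by_step = tokens.transpose(0, 1).tolist()
assert by_step == [[11, 23], [42, -1], [-1, -1]]
# Steps 0 and 1 still contain accepted tokens and produce SamplerOutput entries;
# step 2 is all -1, so iteration stops there.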
- accepted_token_ids_by_step = logits.transpose(0, 1) - accepted_token_ids_by_step = accepted_token_ids_by_step.tolist() - - sampler_output_list = [] - for step_index in range(num_steps): - if all(token_id == self.SPECULATION_TERMINATION_ID - for token_id in accepted_token_ids_by_step[step_index]): - break - step_output_token_ids = [] - for sequence_index in range(batch_size): - token_id = accepted_token_ids_by_step[step_index][ - sequence_index] - step_output_token_ids.append( - CompletionSequenceGroupOutput(samples=[ - SequenceOutput(parent_seq_id=seq_ids[sequence_index], - output_token=token_id, - logprobs={token_id: Logprob(token_id)}) - ], - prompt_logprobs=None)) - sampler_output_list.append( - SamplerOutput(outputs=step_output_token_ids)) - return sampler_output_list - - -def _get_model_architecture(config: PretrainedConfig) -> str: - architectures = getattr(config, "architectures", []) - for arch in architectures: - if arch in _NEURON_SUPPORTED_MODELS: - return arch - raise ValueError( - f"Model architectures {architectures} are not supported on Neuron " - f"for now. Supported architectures: " - f"{list(_NEURON_SUPPORTED_MODELS.keys())}") - - -def _get_buckets(env: str, default_value: list[int]) -> list[int]: - env_value = os.getenv(env) - if env_value is None: - return default_value - buckets_remove_empty = filter( - lambda x: x is not None and len(x.strip()) > 0, env_value.split(",")) - buckets_int = map(int, buckets_remove_empty) - buckets_list = list(buckets_int) - return buckets_list - - -def _get_default_neuron_config(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig): - """Generate a neuron config based on vllm config args.""" - from transformers_neuronx.config import ContinuousBatchingConfig - from transformers_neuronx.constants import LAYOUT_BSH - - continuous_batching_config = ContinuousBatchingConfig( - batch_size_for_shared_caches=scheduler_config.max_num_seqs) - quant_config = dict( - dequant_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], - quantize_method="vector_dynamic") - neuron_quantization_config_builder = lambda quant: get_quantization_config( - quant).from_config(quant_config).get_quant_method(None, "") - # TODO: Add Paged attention config to the default neuron arguments. 
- default_neuron_args = dict( - collectives_layout=LAYOUT_BSH, - attention_layout=LAYOUT_BSH, - fuse_qkv=True, - quant=neuron_quantization_config_builder(model_config.quantization) - if model_config.quantization else None, - continuous_batching=continuous_batching_config, - weight_tiling=bool(model_config.quantization), - on_device_generation=_get_neuron_on_device_generation_config( - model_config)) - return default_neuron_args - - -def _get_default_neuron_config_for_speculation( - model_config: ModelConfig, parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig): - """Generate a neuron config for speculative decoding based on - vllm config args.""" - from transformers_neuronx.config import ContinuousBatchingConfig - from transformers_neuronx.constants import LAYOUT_BSH - - continuous_batching_config = ContinuousBatchingConfig( - batch_size_for_shared_caches=scheduler_config.max_num_seqs) - - default_neuron_args = dict(collectives_layout=LAYOUT_BSH, - attention_layout=LAYOUT_BSH, - fuse_qkv=True, - on_device_embedding=True, - continuous_batching=continuous_batching_config, - on_device_generation=copy.deepcopy( - model_config.neuron_sampling_params)) - return default_neuron_args - - -def _get_neuron_on_device_generation_config(model_config: ModelConfig): - if not _is_neuron_on_device_sampling_disabled(model_config): - return copy.deepcopy(model_config.neuron_sampling_params) - return None - - -def _is_neuron_on_device_sampling_disabled(model_config: ModelConfig) -> bool: - return not getattr(model_config, "neuron_sampling_params", None) - - -def _get_neuron_config_after_override(default_neuron_config, - overridden_neuron_config): - from transformers_neuronx.config import (ContinuousBatchingConfig, - GenerationConfig, - KVCacheQuantizationConfig, - NeuronConfig, QuantizationConfig, - SparseAttnConfig) - - sparse_attn = overridden_neuron_config.pop("sparse_attn", {}) - if sparse_attn: - overridden_neuron_config["sparse_attn"] = SparseAttnConfig( - **sparse_attn) - - kv_cache_quant = overridden_neuron_config.pop("kv_cache_quant", {}) - if kv_cache_quant: - overridden_neuron_config["kv_cache_quant"] = KVCacheQuantizationConfig( - **kv_cache_quant) - - continuous_batching = overridden_neuron_config.pop("continuous_batching", - {}) - if continuous_batching: - overridden_neuron_config[ - "continuous_batching"] = ContinuousBatchingConfig( - **continuous_batching) - - quant = overridden_neuron_config.pop("quant", {}) - if quant: - overridden_neuron_config["quant"] = QuantizationConfig(**quant) - - on_device_generation = overridden_neuron_config.pop( - "on_device_generation", {}) - if on_device_generation: - overridden_neuron_config["on_device_generation"] = GenerationConfig( - **on_device_generation) - default_neuron_config.update(overridden_neuron_config) - return NeuronConfig(**default_neuron_config) - - -def get_neuron_model(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig) -> nn.Module: - """Initializes a neuron-optimized model for inference.""" - # Create a model instance. 
- model = NeuronCausalLM( - model_config.hf_config, - _is_neuron_on_device_sampling_disabled(model_config)) - - default_neuron_config_args = _get_default_neuron_config( - model_config, parallel_config, scheduler_config) - - neuron_config = _get_neuron_config_after_override( - default_neuron_config_args, model_config.override_neuron_config) - - context_length_estimates = _get_buckets("NEURON_CONTEXT_LENGTH_BUCKETS", - [scheduler_config.max_model_len]) - n_positions = _get_buckets("NEURON_TOKEN_GEN_BUCKETS", - [scheduler_config.max_model_len]) - - model.load_weights(model_config.model, - tp_degree=parallel_config.tensor_parallel_size, - amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], - neuron_config=neuron_config, - context_length_estimate=context_length_estimates, - n_positions=n_positions, - batch_size=scheduler_config.max_num_seqs) - - return model.eval() - - -def get_neuron_speculation_model(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - speculation_config: SpeculativeConfig): - """Initializes a neuron-optimized speculation model for inference. - - This method is only applicable for speculation with a standalone draft model - """ - from transformers_neuronx.fused_speculation import FusedSpeculativeDecoder - - # For Eagle SD, we need to pass in additional parameters in neuron config. - is_eagle = getattr(speculation_config.draft_model_config.hf_config, - "is_eagle", False) - - # Create target model instance. - target_model = NeuronCausalLM(model_config.hf_config) - - default_neuron_config_args = _get_default_neuron_config_for_speculation( - model_config, parallel_config, scheduler_config) - if is_eagle: - default_neuron_config_args['is_eagle_target'] = True - - neuron_config = _get_neuron_config_after_override( - default_neuron_config_args, model_config.override_neuron_config) - - context_length_estimates = _get_buckets("NEURON_CONTEXT_LENGTH_BUCKETS", - [scheduler_config.max_model_len]) - n_positions = _get_buckets("NEURON_TOKEN_GEN_BUCKETS", - [scheduler_config.max_model_len]) - - target_model.load_weights( - model_config.model, - tp_degree=parallel_config.tensor_parallel_size, - amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], - neuron_config=neuron_config, - context_length_estimate=context_length_estimates, - n_positions=n_positions, - batch_size=scheduler_config.max_num_seqs) - - target_model.eval() - - # Create draft model instance. - draft_model = NeuronCausalLM( - speculation_config.draft_model_config.hf_config) - - default_draft_neuron_config_args = ( - _get_default_neuron_config_for_speculation( - speculation_config.draft_model_config, parallel_config, - scheduler_config)) - if is_eagle: - default_draft_neuron_config_args['is_eagle_draft'] = True - default_draft_neuron_config_args['has_pre_attention_norm'] = False - - draft_neuron_config = _get_neuron_config_after_override( - default_draft_neuron_config_args, - speculation_config.draft_model_config.override_neuron_config) - - draft_model.load_weights(speculation_config.draft_model_config.model, - tp_degree=speculation_config. - draft_parallel_config.tensor_parallel_size, - amp=TORCH_DTYPE_TO_NEURON_AMP[ - speculation_config.draft_model_config.dtype], - neuron_config=draft_neuron_config, - context_length_estimate=context_length_estimates, - n_positions=n_positions, - batch_size=scheduler_config.max_num_seqs) - - draft_model.eval() - - num_speculative_tokens = speculation_config.num_speculative_tokens - # Create speculation model instance. 
- speculation_model = FusedSpeculativeDecoder(draft_model.model, - target_model.model, - num_speculative_tokens) - speculation_model.to_neuron() - - return NeuronSpeculationCausalLM(speculation_model) - - -def get_neuron_eagle_speculation_model(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - speculation_config: SpeculativeConfig): - """Initializes a neuron-optimized EAGLE speculation model for inference.""" - from transformers_neuronx.eagle_speculation import EagleSpeculativeDecoder - - # Create target model instance. - target_model = NeuronCausalLM(model_config.hf_config) - - default_neuron_config_args = _get_default_neuron_config_for_speculation( - model_config, parallel_config, scheduler_config) - default_neuron_config_args['is_eagle_target'] = True - neuron_config = _get_neuron_config_after_override( - default_neuron_config_args, model_config.override_neuron_config) - - context_length_estimates = _get_buckets("NEURON_CONTEXT_LENGTH_BUCKETS", - [scheduler_config.max_model_len]) - n_positions = _get_buckets("NEURON_TOKEN_GEN_BUCKETS", - [scheduler_config.max_model_len]) - - target_model.load_weights( - model_config.model, - tp_degree=parallel_config.tensor_parallel_size, - amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], - neuron_config=neuron_config, - context_length_estimate=context_length_estimates, - n_positions=n_positions, - batch_size=scheduler_config.max_num_seqs) - - target_model.eval() - - # Create draft model instance. - draft_model = NeuronCausalLM( - speculation_config.draft_model_config.hf_config) - - default_draft_neuron_config_args = ( - _get_default_neuron_config_for_speculation( - speculation_config.draft_model_config, parallel_config, - scheduler_config)) - default_draft_neuron_config_args['is_eagle_draft'] = True - default_draft_neuron_config_args['has_pre_attention_norm'] = False - draft_neuron_config = _get_neuron_config_after_override( - default_draft_neuron_config_args, - speculation_config.draft_model_config.override_neuron_config) - - draft_model.load_weights(speculation_config.draft_model_config.model, - tp_degree=speculation_config. 
- draft_parallel_config.tensor_parallel_size, - amp=TORCH_DTYPE_TO_NEURON_AMP[ - speculation_config.draft_model_config.dtype], - neuron_config=draft_neuron_config, - context_length_estimate=context_length_estimates, - n_positions=n_positions, - batch_size=scheduler_config.max_num_seqs) - - draft_model.eval() - - token_tree: dict[int, list[int]] = ast.literal_eval( - speculation_config.speculative_token_tree) - - speculation_model = EagleSpeculativeDecoder(draft_model.model, - target_model.model, - token_tree=token_tree) - speculation_model.to_neuron() - - return NeuronSpeculationCausalLM(speculation_model) diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py deleted file mode 100644 index f450961c64ff..000000000000 --- a/vllm/model_executor/model_loader/neuronx_distributed.py +++ /dev/null @@ -1,685 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Utilities for selecting and loading Neuron models in -neuronx-distributed-inference framework.""" -# Disabling yapf because yapf and isort have conflicts for the below imports -# yapf: disable -import copy -import hashlib -import importlib -import multiprocessing -import os -import shutil -from typing import Optional - -import torch -import torch.nn as nn -from neuronx_distributed_inference.models.config import ( - FusedSpecNeuronConfig, OnDeviceSamplingConfig) -from neuronx_distributed_inference.models.mllama.utils import ( - create_vision_mask) -from neuronx_distributed_inference.modules.lora_serving import ( - LoraServingConfig) -from neuronx_distributed_inference.utils.hf_adapter import ( - load_pretrained_config) -from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig - -from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig, - SpeculativeConfig) -from vllm.logger import init_logger -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, - SequenceOutput) - -# yapf: enable -logger = init_logger(__name__) - -TORCH_DTYPE_TO_NEURON_AMP = { - "auto": "float32", - "half": "float16", - "float16": "float16", - "bfloat16": "bfloat16", - "float": "float32", - "float32": "float32", - torch.float16: "float16", - torch.bfloat16: "bfloat16", - torch.float32: "float32", -} - -# Models supported by Neuronx distributed for inference. 
-_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str]] = { - "LlamaForCausalLM": - ("neuronx_distributed_inference.models.llama.modeling_llama", - "NeuronLlamaForCausalLM"), - "MistralForCausalLM": - ("neuronx_distributed_inference.models.llama.modeling_llama", - "NeuronLlamaForCausalLM"), - "DbrxForCausalLM": - ("neuronx_distributed_inference.models.dbrx.modeling_dbrx", - "NeuronDbrxForCausalLM"), - "MixtralForCausalLM": - ("neuronx_distributed_inference.models.mixtral.modeling_mixtral", - "NeuronMixtralForCausalLM"), - "MllamaForConditionalGeneration": - ("neuronx_distributed_inference.models.mllama.modeling_mllama", - "NeuronMllamaForCausalLM"), -} - - -class NeuronCausalLM(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - ) -> None: - super().__init__() - self.config = config - self.logits_processor = LogitsProcessor(config.vocab_size, - logits_as_input=True) - self.sampler = Sampler() - - # Lazy initialized - self.model: nn.Module - - def forward(self, - input_ids: torch.Tensor, - positions: torch.Tensor, - input_block_ids: torch.Tensor, - sampling_params: torch.Tensor, - prev_hidden: Optional[torch.Tensor] = None, - adapter_ids: Optional[torch.Tensor] = None) -> torch.Tensor: - # sort block ids sequentially for perf/neuron support reasons - sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids) - input_ids = torch.index_select(input_ids, 0, sorted_indices) - positions = torch.index_select(positions, 0, sorted_indices) - sampling_params = torch.index_select(sampling_params, 0, - sorted_indices) - output = self.model(input_ids, - attention_mask=None, - position_ids=positions, - seq_ids=sorted_input_block_ids, - sampling_params=sampling_params, - prev_hidden=prev_hidden, - adapter_ids=adapter_ids) - # on-device sampling - if self.config.neuron_config.on_device_sampling_config: - output = output.hidden_states - else: - output = output.logits[:, -1, :] - - restored_indices = torch.argsort(sorted_indices) - if input_block_ids.shape[0] != 1: - output = torch.index_select(output, 0, restored_indices) - - return output - - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(None, hidden_states, sampling_metadata) - return logits - - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - # on-device sampling - if self.config.neuron_config.on_device_sampling_config: - batch_size = logits.shape - seq_ids = [ - seq_id for sg in sampling_metadata.seq_groups - for seq_id in sg.seq_ids - ] - assert len(seq_ids) == list(batch_size)[0], "batch size mismatch" - # Organize input tensors by step instead of by sequence. 
- accepted_token_ids_by_step = logits.flatten() - accepted_token_ids_by_step = accepted_token_ids_by_step.tolist() - - step_output_token_ids = [] - for i, seq_id in enumerate(seq_ids): - token_id = accepted_token_ids_by_step[i] - step_output_token_ids.append( - CompletionSequenceGroupOutput(samples=[ - SequenceOutput(parent_seq_id=seq_id, - output_token=token_id, - logprobs={token_id: Logprob(token_id)}) - ], - prompt_logprobs=None)) - return SamplerOutput(outputs=step_output_token_ids) - else: - return self.sampler(logits, sampling_metadata) - - def load_weights(self, model_name_or_path: str, **kwargs): - arch = _get_model_architecture(self.config) - neuronx_module_path, neuronx_model_cls_name = ( - _NEURON_SUPPORTED_MODELS[arch]) - neuronx_module = importlib.import_module(neuronx_module_path) - neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name) - neuron_config = neuronx_model_cls.get_neuron_config_cls()( - **kwargs['neuron_config']) - self.config.neuron_config = neuron_config - config = neuronx_model_cls.get_config_cls()( - neuron_config, - load_config=load_pretrained_config(model_name_or_path)) - hashed_config = hashlib.md5(config.to_json_string().encode('utf-8'), - usedforsecurity=False).hexdigest() - if os.getenv("NEURON_COMPILED_ARTIFACTS") is not None: - compiled_model_path = os.getenv("NEURON_COMPILED_ARTIFACTS") - elif os.path.exists(model_name_or_path): - compiled_model_path = os.path.join(model_name_or_path, - "neuron-compiled-artifacts", - hashed_config) - shutil.rmtree(compiled_model_path, ignore_errors=True) - else: - compiled_model_path = os.path.join("local-models", - model_name_or_path, - "neuron-compiled-artifacts", - hashed_config) - shutil.rmtree(compiled_model_path, ignore_errors=True) - try: - self.model = neuronx_model_cls(compiled_model_path) - override_neuron_config = kwargs["override_neuron_config"] - for k, v in override_neuron_config.items(): - setattr(self.model.config.neuron_config, k, v) - self.model.load(compiled_model_path) - return - except (FileNotFoundError, ValueError) as e: - logger.warning("Exception: %s", e) - logger.warning("Failed to load the model from %s, Recompiling...", - compiled_model_path) - if not os.path.exists(model_name_or_path): - hf_model = AutoModelForCausalLM.from_pretrained(model_name_or_path) - saved_path = os.path.join("local-models", model_name_or_path) - hf_model.save_pretrained(saved_path) - model_name_or_path = saved_path - self.model = neuronx_model_cls(model_name_or_path, config) - self.model.compile(compiled_model_path) - self.model.load(compiled_model_path) - - -class NeuronMllamaForCausalLM(nn.Module): - - def __init__(self, - config: PretrainedConfig, - on_device_sampling_disabled: bool = False) -> None: - super().__init__() - # has_image is the only multimodal input that is used in - # token-generation - # This is a cache (on CPU) that saves has_image data per sequence id - # The number of entries in this cache is <= Batch-Size - self.has_image_cache: dict[int, torch.Tensor] = {} - self.config = config - self.logits_processor = LogitsProcessor( - config.get_text_config().vocab_size, logits_as_input=True) - - self.on_device_sampling_disabled = on_device_sampling_disabled - if self.on_device_sampling_disabled: - # Use default sampler - self.sampler = Sampler() - - # Lazy initialized - self.model: nn.Module - self.is_reorder_needed: bool = True - - def read_from_has_image_cache(self, seq_ids: torch.Tensor): - has_image_list = [] - for index in range(len(seq_ids)): - seq_id = seq_ids[index].item() - if 
seq_id in self.has_image_cache: - has_image_list.append(self.has_image_cache[seq_id]) - else: - has_image_list.append(torch.tensor([0])) - return torch.tensor(has_image_list) - - def write_to_has_image_cache(self, seq_ids: torch.Tensor, - has_image: torch.Tensor): - for index in range(len(seq_ids)): - seq_id = seq_ids[index].item() - if index < len(has_image): - self.has_image_cache[seq_id] = has_image[index] - else: - self.has_image_cache[seq_id] = torch.zeros(1) - - def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, - seq_ids: torch.Tensor, pixel_values: torch.Tensor, - aspect_ratios: torch.Tensor, num_chunks: torch.Tensor, - has_image: torch.Tensor, sampling_params) -> torch.Tensor: - - # We update the has_image cache during prefill - # and read the has_image cache during decode - if input_ids.shape[-1] > 1: # prefill - self.write_to_has_image_cache(seq_ids, has_image) - else: - has_image = self.read_from_has_image_cache(seq_ids) - bs = input_ids.shape[0] - num_chunks = torch.zeros((bs, 1)) - aspect_ratios = torch.zeros((bs, 1, 2)) - - input_block_ids = seq_ids - origin_input_block_ids = seq_ids - if self.is_reorder_needed: - # sort block ids sequentially for perf/neuron support reasons - input_block_ids, sorted_indices = torch.sort(input_block_ids) - input_ids = torch.index_select(input_ids, 0, sorted_indices) - positions = torch.index_select(positions, 0, sorted_indices) - sampling_params = torch.index_select(sampling_params, 0, - sorted_indices) - pixel_values = torch.index_select(pixel_values, 0, sorted_indices) - aspect_ratios = torch.index_select(aspect_ratios, 0, - sorted_indices) - num_chunks = torch.index_select(num_chunks, 0, sorted_indices) - has_image = torch.index_select(has_image, 0, sorted_indices) - - self.vision_mask = create_vision_mask(input_ids, self.vision_token_id) - output = self.model( - input_ids.to(torch.int32), - attention_mask=None, - position_ids=positions.to(torch.int32), - seq_ids=seq_ids.flatten().to(torch.int32), - pixel_values=pixel_values.to( - self.config.vision_config.torch_dtype), - aspect_ratios=aspect_ratios.to(torch.int32), - vision_mask=self.vision_mask.to(torch.int32), - sampling_params=sampling_params, - num_chunks=num_chunks.to(torch.int32), - has_image=has_image.to(torch.int32), - ) - if self.config.neuron_config.on_device_sampling_config: - output = output.hidden_states - else: - output = output.logits[:, -1, :] - - if self.is_reorder_needed and origin_input_block_ids.shape[0] != 1: - restored_indices = torch.argsort(sorted_indices) - output = torch.index_select(output, 0, restored_indices) - return output - - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(None, hidden_states, sampling_metadata) - return logits - - def sample(self, hidden_states, sampling_metadata): - if not self.on_device_sampling_disabled: - with torch.profiler.record_function("sample"): - hidden_states = hidden_states.flatten() - res = [] - sample_idx = 0 - for seq_group in sampling_metadata.seq_groups: - seq_ids = seq_group.seq_ids - samples = [] - for seq_id in seq_ids: - token_id = hidden_states[sample_idx].item() - samples.append( - SequenceOutput( - parent_seq_id=seq_id, - output_token=token_id, - logprobs={token_id: Logprob(token_id)})) - sample_idx += 1 - res.append( - CompletionSequenceGroupOutput(samples=samples, - prompt_logprobs=None)) - next_tokens = SamplerOutput(outputs=res) - else: - next_tokens = self.sampler(None, hidden_states, 
sampling_metadata) - return next_tokens - - def load_weights(self, model_name_or_path: str, **kwargs): - arch = _get_model_architecture(self.config) - neuronx_module_path, neuronx_model_cls_name = ( - _NEURON_SUPPORTED_MODELS[arch]) - neuronx_module = importlib.import_module(neuronx_module_path) - neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name) - neuron_config = neuronx_model_cls.get_neuron_config_cls()( - **kwargs['neuron_config']) - self.config.neuron_config = neuron_config - logger.info("neuron_config buckets: %s", - self.config.neuron_config.buckets) - config = neuronx_model_cls.get_config_cls()( - neuron_config, - load_config=load_pretrained_config(model_name_or_path)) - hashed_config = hashlib.md5(config.to_json_string().encode('utf-8'), - usedforsecurity=False).hexdigest() - if os.getenv("NEURON_COMPILED_ARTIFACTS") is not None: - compiled_model_path = os.getenv("NEURON_COMPILED_ARTIFACTS") - elif os.path.exists(model_name_or_path): - compiled_model_path = os.path.join(model_name_or_path, - "neuron-compiled-artifacts", - hashed_config) - else: - compiled_model_path = os.path.join("local-models", - model_name_or_path, - "neuron-compiled-artifacts", - hashed_config) - try: - self.model = neuronx_model_cls(compiled_model_path) - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - self.vision_token_id = tokenizer( - "<|image|>", add_special_tokens=False).input_ids[0] - self.model.load(compiled_model_path) - return - except (FileNotFoundError, ValueError): - logger.warning("Failed to load the model from %s, Recompiling...", - compiled_model_path) - if not os.path.exists(model_name_or_path): - hf_model = AutoModelForCausalLM.from_pretrained(model_name_or_path) - saved_path = os.path.join("local-models", model_name_or_path) - hf_model.save_pretrained(saved_path) - model_name_or_path = saved_path - self.model = neuronx_model_cls(model_name_or_path, config) - - logger.info("\nCompiling and saving model to %s", model_name_or_path) - - p = multiprocessing.Process(target=compile_model, - args=(self, compiled_model_path)) - p.start() - p.join() - - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - tokenizer.save_pretrained(compiled_model_path) - logger.info("Successfully compiled and saved the model in %s", - compiled_model_path) - - # Read "<|image|>" token_id from the tokenizer - self.vision_token_id = tokenizer("<|image|>", - add_special_tokens=False).input_ids[0] - logger.info("\nLoading model from compiled checkpoint...") - self.model.load(compiled_model_path) - - -def compile_model(neuron_model, traced_model_path): - neuron_model.model.compile(traced_model_path) - - -class NeuronSpeculationCausalLM(nn.Module): - """A Neuron-optimized causal language model with speculative decoding.""" - - def __init__( - self, - config: PretrainedConfig, - ) -> None: - super().__init__() - self.config = config - self.logits_processor = LogitsProcessor(config.vocab_size, - logits_as_input=True) - # Lazy initialized - self.model: nn.Module - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - input_block_ids: torch.Tensor, - sampling_params: torch.Tensor, - ) -> torch.Tensor: - # sort block ids sequentially for perf/neuron support reasons - sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids) - input_ids = torch.index_select(input_ids, 0, sorted_indices) - positions = torch.index_select(positions, 0, sorted_indices) - sampling_params = torch.index_select(sampling_params, 0, - sorted_indices) - - output = 
self.model(input_ids, - attention_mask=None, - position_ids=positions, - seq_ids=sorted_input_block_ids, - sampling_params=sampling_params) - restored_indices = torch.argsort(sorted_indices) - - # CTX encoding - if (positions[:, 0]).sum().item() == 0: - output = output.fused_outputs[0][:, 0:1] - if input_block_ids.shape[0] != 1: - output = torch.index_select(output, 0, restored_indices) - return output - - # Fused Spec (Generation) - accepted_tokens_with_padding = output.fused_outputs[0] - next_pos_ids = output.fused_outputs[-1] - generated_token_counts = next_pos_ids - positions - - assert torch.any(generated_token_counts == 0).item() is False, \ - "NxDI model generated no output for one or more sequences." - - batch_size, steps = accepted_tokens_with_padding.shape - mask = torch.arange(steps).expand(batch_size, - -1) >= generated_token_counts - accepted_tokens_with_padding[mask] = -1 - - if input_block_ids.shape[0] != 1: - accepted_tokens_with_padding = torch.index_select( - accepted_tokens_with_padding, 0, restored_indices) - - return accepted_tokens_with_padding - - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[list[SamplerOutput]]: - batch_size, num_steps = logits.shape - seq_ids = [ - seq_id for sg in sampling_metadata.seq_groups - for seq_id in sg.seq_ids - ] - # Organize input tensors by step instead of by sequence. - accepted_token_ids_by_step = logits.transpose(0, 1) - accepted_token_ids_by_step = accepted_token_ids_by_step.tolist() - - sampler_output_list = [] - for step_index in range(num_steps): - if all(token_id == -1 - for token_id in accepted_token_ids_by_step[step_index]): - break - step_output_token_ids = [] - for sequence_index in range(batch_size): - token_id = accepted_token_ids_by_step[step_index][ - sequence_index] - step_output_token_ids.append( - CompletionSequenceGroupOutput(samples=[ - SequenceOutput(parent_seq_id=seq_ids[sequence_index], - output_token=token_id, - logprobs={token_id: Logprob(token_id)}) - ], - prompt_logprobs=None)) - sampler_output_list.append( - SamplerOutput(outputs=step_output_token_ids)) - return sampler_output_list - - def load_weights(self, model_name_or_path: str, - draft_model_name_or_path: str, **kwargs): - arch = _get_model_architecture(self.config) - neuronx_module_path, neuronx_model_cls_name = ( - _NEURON_SUPPORTED_MODELS[arch]) - neuronx_module = importlib.import_module(neuronx_module_path) - neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name) - neuron_config = neuronx_model_cls.get_neuron_config_cls()( - **kwargs['neuron_config']) - config = neuronx_model_cls.get_config_cls()( - neuron_config, - load_config=load_pretrained_config(model_name_or_path)) - - draft_neuron_config = copy.deepcopy(config.neuron_config) - if not config.neuron_config.enable_eagle_speculation: - draft_neuron_config.speculation_length = 0 - draft_neuron_config.trace_tokengen_model = True - draft_neuron_config.enable_fused_speculation = False - if getattr(config.neuron_config, "draft_model_modules_to_not_convert", - None): - draft_neuron_config.modules_to_not_convert = ( - draft_neuron_config.draft_model_modules_to_not_convert) - if config.neuron_config.enable_eagle_speculation: - draft_neuron_config.is_eagle_draft = True - draft_neuron_config.sequence_parallel_enabled = False - draft_config = neuronx_model_cls.get_config_cls()( - draft_neuron_config, - load_config=load_pretrained_config(draft_model_name_or_path)) - fused_spec_config = (FusedSpecNeuronConfig( - neuronx_model_cls._model_cls, 
- draft_config=draft_config, - draft_model_path=draft_model_name_or_path)) - config.fused_spec_config = fused_spec_config - self.config.neuron_config = neuron_config - - hashed_config = hashlib.md5(config.to_json_string().encode('utf-8'), - usedforsecurity=False).hexdigest() - if os.getenv("NEURON_COMPILED_ARTIFACTS") is not None: - compiled_model_path = os.getenv("NEURON_COMPILED_ARTIFACTS") - elif os.path.exists(model_name_or_path): - compiled_model_path = os.path.join(model_name_or_path, - "neuron-compiled-artifacts", - hashed_config) - shutil.rmtree(compiled_model_path, ignore_errors=True) - else: - compiled_model_path = os.path.join("local-models", - model_name_or_path, - "neuron-compiled-artifacts", - hashed_config) - shutil.rmtree(compiled_model_path, ignore_errors=True) - try: - self.model = neuronx_model_cls(compiled_model_path) - override_neuron_config = kwargs["override_neuron_config"] - for k, v in override_neuron_config.items(): - setattr(self.model.config.neuron_config, k, v) - self.model.load(compiled_model_path) - return - except (FileNotFoundError, ValueError) as e: - logger.warning("Exception: %s", e) - logger.warning("Failed to load the model from %s Recompiling...", - compiled_model_path) - if not os.path.exists(model_name_or_path): - hf_model = AutoModelForCausalLM.from_pretrained(model_name_or_path) - saved_path = os.path.join("local-models", model_name_or_path) - hf_model.save_pretrained(saved_path) - model_name_or_path = saved_path - if not os.path.exists(draft_model_name_or_path): - if draft_model_name_or_path != model_name_or_path: - hf_model = AutoModelForCausalLM.from_pretrained( - draft_model_name_or_path) - saved_path = os.path.join("local-models", - draft_model_name_or_path) - hf_model.save_pretrained(saved_path) - draft_model_name_or_path = saved_path - else: - draft_model_name_or_path = model_name_or_path - config.fused_spec_config.draft_model_path = draft_model_name_or_path - self.model = neuronx_model_cls(model_name_or_path, config) - self.model.compile(compiled_model_path) - self.model.load(compiled_model_path) - - -def _get_model_architecture(config: PretrainedConfig) -> str: - architectures = getattr(config, "architectures", []) - for arch in architectures: - if arch in _NEURON_SUPPORTED_MODELS: - return arch - raise ValueError( - f"Model architectures {architectures} are not supported on Neuron " - f"for now. 
Supported architectures: " - f"{list(_NEURON_SUPPORTED_MODELS.keys())}") - - -def _get_default_neuron_config(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - lora_serving_config: LoraServingConfig): - """Generate a neuron config based on vllm config args.""" - on_device_sampling_config = OnDeviceSamplingConfig(dynamic=True, - deterministic=False) - batch_size = scheduler_config.max_num_seqs - - neuron_config = dict( - tp_degree=parallel_config.tensor_parallel_size, - ctx_batch_size=1, - batch_size=batch_size, - max_context_length=scheduler_config.max_model_len, - seq_len=scheduler_config.max_model_len, - enable_bucketing=True, - is_continuous_batching=True, - quantized=False, - torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], - padding_side="right", - on_device_sampling_config=on_device_sampling_config, - sequence_parallel_enabled=True, - lora_serving_config=lora_serving_config) - return neuron_config - - -def _get_default_speculation_config(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - speculation_config: SpeculativeConfig): - """Generate a neuron config for speculative decoding based on vllm config - args.""" - neuron_config = dict( - tp_degree=parallel_config.tensor_parallel_size, - ctx_batch_size=1, - batch_size=scheduler_config.max_num_seqs, - max_context_length=scheduler_config.max_model_len, - seq_len=scheduler_config.max_model_len, - speculation_length=speculation_config.num_speculative_tokens, - trace_tokengen_model=False, - enable_fused_speculation=True, - enable_bucketing=True, - is_continuous_batching=True, - quantized=False, - torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], - on_device_sampling_config=dict( - top_k=1, - do_sample=False, - )) - return neuron_config - - -def _get_neuron_config_after_override(default_neuron_config, - overridden_neuron_config): - """Update default neuron config values with override args""" - overridden_neuron_config = overridden_neuron_config or {} - default_neuron_config.update(overridden_neuron_config) - return default_neuron_config - - -def get_neuron_model(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - lora_serving_config: LoraServingConfig) -> nn.Module: - """Initializes a neuron-optimized model for inference.""" - model_arch = _get_model_architecture(model_config.hf_config) - if model_arch == "MllamaForConditionalGeneration": - model = NeuronMllamaForCausalLM(model_config.hf_config) - else: - model = NeuronCausalLM(model_config.hf_config) - default_neuron_config_args = _get_default_neuron_config( - model_config, parallel_config, scheduler_config, lora_serving_config) - neuron_config = _get_neuron_config_after_override( - default_neuron_config_args, model_config.override_neuron_config) - - override_neuron_config = model_config.override_neuron_config - model.load_weights(model_config.model, - neuron_config=neuron_config, - override_neuron_config=override_neuron_config) - return model.eval() - - -def get_neuron_speculation_model(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - speculation_config: SpeculativeConfig): - """Initializes a neuron-optimized speculation model for inference. - - This model handles speculation using both a draft model and an EAGLE draft. 
- """ - model = NeuronSpeculationCausalLM(model_config.hf_config) - default_neuron_config_args = _get_default_speculation_config( - model_config, parallel_config, scheduler_config, speculation_config) - neuron_config = _get_neuron_config_after_override( - default_neuron_config_args, model_config.override_neuron_config) - - override_neuron_config = model_config.override_neuron_config - model.load_weights(model_config.model, - speculation_config.draft_model_config.model, - neuron_config=neuron_config, - override_neuron_config=override_neuron_config) - return model.eval() diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index c13659f8a06e..77c44a74ed3e 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -163,37 +163,12 @@ def cpu_platform_plugin() -> Optional[str]: return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None -def neuron_platform_plugin() -> Optional[str]: - tnx_installed = False - nxd_installed = False - logger.debug("Checking if Neuron platform is available.") - try: - import transformers_neuronx # noqa: F401 - tnx_installed = True - logger.debug("Confirmed Neuron platform is available because" - " transformers_neuronx is found.") - except ImportError: - pass - - try: - import neuronx_distributed_inference # noqa: F401 - nxd_installed = True - logger.debug("Confirmed Neuron platform is available because" - " neuronx_distributed_inference is found.") - except ImportError: - pass - - is_neuron = tnx_installed or nxd_installed - return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None - - builtin_platform_plugins = { 'tpu': tpu_platform_plugin, 'cuda': cuda_platform_plugin, 'rocm': rocm_platform_plugin, 'xpu': xpu_platform_plugin, 'cpu': cpu_platform_plugin, - 'neuron': neuron_platform_plugin, } diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index b8e788de11c6..f40e18aef215 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -158,9 +158,6 @@ def is_xpu(self) -> bool: def is_cpu(self) -> bool: return self._enum == PlatformEnum.CPU - def is_neuron(self) -> bool: - return self._enum == PlatformEnum.NEURON - def is_out_of_tree(self) -> bool: return self._enum == PlatformEnum.OOT diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py deleted file mode 100644 index cb8ac8db669f..000000000000 --- a/vllm/platforms/neuron.py +++ /dev/null @@ -1,151 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import enum -import os -from functools import lru_cache -from typing import TYPE_CHECKING, Optional - -from vllm import envs -from vllm.logger import init_logger -from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS - -from .interface import Platform, PlatformEnum - -if TYPE_CHECKING: - from vllm.config import VllmConfig -else: - VllmConfig = None - -logger = init_logger(__name__) - - -class NeuronFramework(enum.Enum): - TRANSFORMERS_NEURONX = "transformers-neuronx" - NEURONX_DISTRIBUTED_INFERENCE = "neuronx-distributed-inference" - - -class NeuronPlatform(Platform): - _enum = PlatformEnum.NEURON - device_name: str = "neuron" - device_type: str = "neuron" - ray_device_key: str = "neuron_cores" - supported_quantization: list[str] = ["neuron_quant", "fbgemm_fp8"] - dist_backend: str = "gloo" - device_control_env_var: str = "NEURON_RT_VISIBLE_CORES" - - @classmethod - def get_device_name(cls, device_id: int = 0) -> str: - return "neuron" - - @classmethod - def is_async_output_supported(cls, enforce_eager: 
Optional[bool]) -> bool: - return False - - @classmethod - def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - parallel_config = vllm_config.parallel_config - if parallel_config.worker_cls == "auto": - parallel_config.worker_cls = \ - "vllm.worker.neuron_worker.NeuronWorker" - - if parallel_config.world_size > 1: - parallel_config.distributed_executor_backend = "uni" - - if vllm_config.cache_config and vllm_config.model_config: - # neuron needs block_size = max_model_len - vllm_config.cache_config.block_size = \ - vllm_config.model_config.max_model_len # type: ignore - - if vllm_config.model_config and vllm_config.model_config.use_mla: - logger.info( - "MLA is enabled on a non-GPU platform; forcing chunked " - "prefill and prefix caching to be disabled.") - vllm_config.scheduler_config.enable_chunked_prefill = False - vllm_config.scheduler_config.chunked_prefill_enabled = False - vllm_config.scheduler_config.max_num_batched_tokens = max( - vllm_config.scheduler_config.max_model_len, - DEFAULT_MAX_NUM_BATCHED_TOKENS) - - @classmethod - def is_pin_memory_available(cls) -> bool: - logger.warning("Pin memory is not supported on Neuron.") - return False - - @classmethod - def get_device_communicator_cls(cls) -> str: - if envs.VLLM_USE_V1: - return "vllm.distributed.device_communicators.neuron_communicator.NeuronCommunicator" # noqa - else: - return Platform.get_device_communicator_cls() - - @classmethod - def use_all_gather(cls) -> bool: - return True - - @classmethod - @lru_cache - def is_neuronx_distributed_inference(cls) -> bool: - try: - import neuronx_distributed_inference - except ImportError: - neuronx_distributed_inference = None - return neuronx_distributed_inference is not None - - @classmethod - @lru_cache - def is_transformers_neuronx(cls) -> bool: - try: - import transformers_neuronx - except ImportError: - transformers_neuronx = None - return transformers_neuronx is not None - - def get_neuron_framework_to_use(self): - """Return the specified framework if corresponding installations are - available. - - If no framework is specified, use neuronx-distributed-inference by - default. - If that's unavailable, check and switch to transformers-neuronx. - """ - if not self.is_neuron(): - raise AssertionError( - f"Neuron Framework unavailable for platform: {self}") - - tnx_installed = self.is_transformers_neuronx() - nxd_installed = self.is_neuronx_distributed_inference() - - specified_framework = os.environ.get("VLLM_NEURON_FRAMEWORK") - tnx_framework = NeuronFramework.TRANSFORMERS_NEURONX.value - nxd_framework = NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE.value - if specified_framework == tnx_framework and tnx_installed: - return self.TRANSFORMERS_NEURONX - - if ((specified_framework == nxd_framework and nxd_installed) - or (specified_framework is None and nxd_installed)): - return NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE - - if specified_framework is None and tnx_installed: - return NeuronFramework.TRANSFORMERS_NEURONX - - return None - - def use_neuronx_distributed(self): - """ - Return True if the framework determined in get_neuron_framework_to_use() - is NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE, False otherwise. This - is used to select the Neuron model framework and framework-specific - configuration to apply during model compilation. 
- """ - nxd_framework = NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE - return self.get_neuron_framework_to_use() == nxd_framework - - def use_transformers_neuronx(self): - """ - Return True if the framework determined in get_neuron_framework_to_use() - is NeuronFramework.TRANSFORMERS_NEURONX, False otherwise. This is used - to select the Neuron model framework and framework-specific - configuration to apply during model compilation. - """ - return self.get_neuron_framework_to_use( - ) == NeuronFramework.TRANSFORMERS_NEURONX diff --git a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py deleted file mode 100644 index 25f588077cb4..000000000000 --- a/vllm/worker/multi_step_neuron_model_runner.py +++ /dev/null @@ -1,84 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from importlib.util import find_spec -from typing import List, Optional - -import torch - -from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors -from vllm.worker.neuron_model_runner import (ModelInputForNeuron, - NeuronModelRunner) - - -class MultiStepNeuronModelRunner(NeuronModelRunner): - """A model runner for multi step decoding using the transformers_neuronx - framework""" - - def __init__( - self, - vllm_config: VllmConfig, - ): - super().__init__(vllm_config) - self.speculation_config = self.speculative_config - from transformers_neuronx.config import GenerationConfig - self.speculation_config.draft_model_config.neuron_sampling_params = ( - GenerationConfig( - max_length=self.scheduler_config.max_model_len, - do_sample=True, - per_batch_line=True, - top_k=[self._MAX_NEURON_SAMPLING_TOP_K] \ - * self.scheduler_config.max_num_seqs, - top_p=[1.0] * self.scheduler_config.max_num_seqs, - temperature=[1.0] * self.scheduler_config.max_num_seqs, - dynamic=True, - global_top_k=self._MAX_NEURON_SAMPLING_TOP_K - )) - - def load_model(self) -> None: - if find_spec("transformers_neuronx") is not None: - from vllm.model_executor.model_loader.neuron import ( - get_neuron_eagle_speculation_model, - get_neuron_speculation_model) - if self.speculation_config.speculative_token_tree is not None: - self.model = get_neuron_eagle_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculation_config) - else: - self.model = get_neuron_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculation_config) - else: - raise NotImplementedError( - "Supports only Transformer-NeuronX based models.") - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNeuron, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - logits = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - - output = self.model.sample( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - return output diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py 
b/vllm/worker/multi_step_neuronx_distributed_model_runner.py deleted file mode 100644 index dd521dd67dad..000000000000 --- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import List, Optional - -import torch - -from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors -from vllm.worker.neuronx_distributed_model_runner import ( - NeuronxDistributedModelRunner) - - -class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner): - """A model runner for multi-step decoding using the - neuronx-distributed-inference framework""" - - def __init__( - self, - vllm_config: VllmConfig, - ): - super().__init__(vllm_config) - - def load_model(self) -> None: - from vllm.model_executor.model_loader.neuronx_distributed import ( - get_neuron_speculation_model) - self.model = get_neuron_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculative_config) - - @torch.inference_mode() - def execute_model( - self, - model_input, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - sampling_params = torch.tensor([[ - seq_group.sampling_params.top_k, - seq_group.sampling_params.top_p, - seq_group.sampling_params.temperature, - ] for seq_group in model_input.sampling_metadata.seq_groups]) - - logits = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - sampling_params=sampling_params, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - - output = self.model.sample( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - return output diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py deleted file mode 100644 index 7ccf1a2c0a87..000000000000 --- a/vllm/worker/neuron_model_runner.py +++ /dev/null @@ -1,460 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union - -import torch -from torch import nn - -from vllm.config import DeviceConfig, VllmConfig -from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.neuron import get_neuron_model -from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalKwargs) -from vllm.platforms import current_platform -from vllm.sampling_params import SamplingParams -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.utils import is_pin_memory_available, make_tensor_with_pad -from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - - -@dataclass(frozen=True) -class 
ModelInputForNeuron(ModelRunnerInputBase): - """ - Used by the NeuronModelRunner. - """ - input_tokens: Optional[torch.Tensor] = None - input_positions: Optional[torch.Tensor] = None - input_block_ids: Optional[torch.Tensor] = None - sampling_metadata: SamplingMetadata = None - multi_modal_kwargs: BatchedTensorInputs = None - adapter_ids: Optional[str] = None - - def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: - return { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - "input_block_ids": self.input_block_ids, - "sampling_metadata": self.sampling_metadata, - "multi_modal_kwargs": self.multi_modal_kwargs, - } - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForNeuron": - return ModelInputForNeuron( - input_tokens=tensor_dict["input_tokens"], - input_positions=tensor_dict["input_positions"], - input_block_ids=tensor_dict["input_block_ids"], - sampling_metadata=tensor_dict["sampling_metadata"], - multi_modal_kwargs=tensor_dict["multi_modal_kwargs"], - ) - - -class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): - """A model runner for AWS Neuron hardware""" - - # NEURON has an upper limit on the top_k - _MAX_NEURON_SAMPLING_TOP_K = 256 - - def __init__( - self, - vllm_config: VllmConfig, - ): - ModelRunnerBase.__init__(self, vllm_config) - - if (self.model_config is not None - and self.model_config.get_sliding_window()): - logger.warning("Sliding window is not supported on Neuron. " - "The model will run without sliding window.") - self.device_config = (self.device_config if self.device_config - is not None else DeviceConfig()) - self.lora_config = vllm_config.lora_config - self.device = self.device_config.device - self.pin_memory = is_pin_memory_available() - - # Multi-modal data support - self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ - .create_input_mapper(self.model_config) - - # Lazy initialization. - self.model: nn.Module # initialize after load_model. - - # Once NEURON_ON_DEVICE_SAMPLING_DISABLED is set to a non-zero value, - # turn off on-device sampling. - self._on_device_sampling_disabled = int( - os.getenv("NEURON_ON_DEVICE_SAMPLING_DISABLED", "0")) - - # NEURON needs to update sampling parameters when request IDs change - # across batches. This variable stores the previous batch's request IDs - # to determine if an update is needed. - self._previous_batch_request_ids: List[str] = [] - - if not self._on_device_sampling_disabled: - self._init_neuron_sampling() - - def _init_neuron_sampling(self) -> None: - if current_platform.use_transformers_neuronx(): - from transformers_neuronx.config import GenerationConfig - else: - from transformers import GenerationConfig - logger.warning( - "On-device sampling is turned on in Neuron by default, only " - "top_k, top_p, and temperature are current supported sampling " - "parameters. 
To turn off the on-device sampling, please set " - "the environment variable NEURON_ON_DEVICE_SAMPLING_DISABLED=1.") - self.model_config.neuron_sampling_params = GenerationConfig( - max_length=self.scheduler_config.max_model_len, - do_sample=True, - per_batch_line=True, - top_k=[self._MAX_NEURON_SAMPLING_TOP_K] \ - * self.scheduler_config.max_num_seqs, - top_p=[1.0] * self.scheduler_config.max_num_seqs, - temperature=[1.0] * self.scheduler_config.max_num_seqs, - dynamic=True, - global_top_k=self._MAX_NEURON_SAMPLING_TOP_K) - - def load_model(self) -> None: - self.model = get_neuron_model(self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) - - def get_model(self) -> nn.Module: - return self.model - - def _prepare_prompt( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int], - BatchedTensorInputs]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - input_block_ids: List[int] = [] - - seq_lens: List[int] = [] - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - for seq_group_metadata in seq_group_metadata_list: - assert seq_group_metadata.is_prompt - seq_ids = list(seq_group_metadata.seq_data.keys()) - assert len(seq_ids) == 1 - seq_id = seq_ids[0] - - seq_data = seq_group_metadata.seq_data[seq_id] - prompt_tokens = seq_data.get_token_ids() - seq_len = len(prompt_tokens) - seq_lens.append(seq_len) - - input_tokens.append(prompt_tokens) - input_positions.append(list(range(seq_len))) - - assert seq_group_metadata.block_tables is not None - block_table = seq_group_metadata.block_tables[seq_id] - assert len(block_table) == 1 - input_block_ids.append(block_table[0]) - - mm_kwargs = seq_group_metadata.multi_modal_data - if mm_kwargs: - mm_kwargs = self.process_multi_modal_data_neuron(mm_kwargs) - multi_modal_kwargs_list.append(mm_kwargs) - - max_seq_len = max(seq_lens) - assert max_seq_len > 0 - input_tokens = make_tensor_with_pad(input_tokens, - pad=0, - max_len=max_seq_len, - dtype=torch.long, - device=self.device) - input_positions = make_tensor_with_pad(input_positions, - pad=0, - max_len=max_seq_len, - dtype=torch.long, - device=self.device) - input_block_ids = torch.tensor(input_block_ids, - dtype=torch.long, - device=self.device) - - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - return (input_tokens, input_positions, input_block_ids, seq_lens, - multi_modal_kwargs) - - def _prepare_decode( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - input_block_ids: List[int] = [] - context_lens: List[int] = [] - - for seq_group_metadata in seq_group_metadata_list: - assert not seq_group_metadata.is_prompt - - seq_ids = list(seq_group_metadata.seq_data.keys()) - - for seq_id in seq_ids: - seq_data = seq_group_metadata.seq_data[seq_id] - generation_token = seq_data.get_last_token_id() - input_tokens.append([generation_token]) - - seq_len = seq_data.get_len() - position = seq_len - 1 - input_positions.append([position]) - context_lens.append(seq_len) - - assert seq_group_metadata.block_tables is not None - block_table = seq_group_metadata.block_tables[seq_id] - assert len(block_table) == 1 - input_block_ids.append(block_table[0]) - - input_tokens = make_tensor_with_pad(input_tokens, - pad=0, - 
max_len=1, - dtype=torch.long, - device=self.device) - input_positions = make_tensor_with_pad(input_positions, - pad=0, - max_len=1, - dtype=torch.long, - device=self.device) - context_lens = torch.tensor(context_lens, - dtype=torch.int, - device=self.device) - input_block_ids = torch.tensor(input_block_ids, - dtype=torch.long, - device=self.device) - - return input_tokens, input_positions, input_block_ids - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> ModelInputForNeuron: - return ModelInputForNeuron.from_broadcasted_tensor_dict(tensor_dict) - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForNeuron: - multi_modal_kwargs = None - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt - # Prepare input tensors. - if is_prompt: - (input_tokens, input_positions, input_block_ids, seq_lens, - multi_modal_kwargs - ) = self._prepare_prompt(seq_group_metadata_list) - else: - (input_tokens, input_positions, - input_block_ids) = self._prepare_decode(seq_group_metadata_list) - seq_lens = None - - if not self._on_device_sampling_disabled: - for seq_group_metadata in seq_group_metadata_list: - sampling_params = seq_group_metadata.sampling_params - top_k, top_p, temperature = ( - self._convert_to_neuron_sampling_params(sampling_params)) - sampling_params.top_k = top_k - sampling_params.top_p = top_p - sampling_params.temperature = temperature - - # we need multi_modal_data for later tokens as well - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - for seq_group_metadata in seq_group_metadata_list: - mm_data = seq_group_metadata.multi_modal_data - if mm_data: - multi_modal_kwargs_list.append(mm_data) - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - # query_lens is not needed if chunked prefill is not - # supported. Since neuron worker doesn't support chunked prefill - # just use seq_lens instead. - seq_lens, - self.device, - self.pin_memory, - generators=self.get_generators(finished_requests_ids)) - - if current_platform.use_transformers_neuronx( - ) and not self._on_device_sampling_disabled: - # Once the request IDs are changed in current iteration, we will - # update the on-device sampling parameters. 
- current_batch_request_ids = [ - seq_group_meta_data.request_id - for seq_group_meta_data in seq_group_metadata_list - ] - if current_batch_request_ids != self._previous_batch_request_ids: - self._update_neuron_sampling_params(seq_group_metadata_list) - self._previous_batch_request_ids = current_batch_request_ids - - return ModelInputForNeuron(input_tokens=input_tokens, - input_positions=input_positions, - input_block_ids=input_block_ids, - sampling_metadata=sampling_metadata, - multi_modal_kwargs=multi_modal_kwargs) - - def _update_neuron_sampling_params( - self, seq_group_metadata_list: List[SequenceGroupMetadata]): - # Update Neuron sampling parameters (GenerationConfig in Neuron) - current_sampling_params = self.model_config.neuron_sampling_params - assert current_sampling_params is not None, ( - f"Failed to update sampling_params, " - f"current sampling params is {current_sampling_params}") - - is_update_needed = False - - top_k = current_sampling_params.top_k - top_p = current_sampling_params.top_p - temperature = current_sampling_params.temperature - - # The index of a sequence's sampling parameters in neuron is equal to - # its index in `input_block_ids`. - for seq_group_metadata in seq_group_metadata_list: - seq_ids = list(seq_group_metadata.seq_data.keys()) - sampling_params = seq_group_metadata.sampling_params - - seq_group_top_k = sampling_params.top_k - seq_group_top_p = sampling_params.top_p - seq_group_temperature = sampling_params.temperature - - for seq_id in seq_ids: - index = seq_group_metadata.block_tables[seq_id][0] - if (top_k[index] != seq_group_top_k - or top_p[index] != seq_group_top_p - or temperature[index] != seq_group_temperature): - is_update_needed = True - - top_k[index] = seq_group_top_k - top_p[index] = seq_group_top_p - temperature[index] = seq_group_temperature - - # update_generation_config is only available in transformers-neuronx - if is_update_needed and current_platform.use_transformers_neuronx(): - self.model.model.update_generation_config(current_sampling_params) - - def _convert_to_neuron_sampling_params( - self, sampling_params: SamplingParams) -> Tuple[int, float, float]: - # Returns the top_k, top_p and temperature parameters for neuron. 
- top_k = sampling_params.top_k - top_p = sampling_params.top_p - temperature = sampling_params.temperature - - if temperature == 0.0: - # Enable greedy sampling on zero temperature - return (1, 1.0, 1.0) - if top_k < 1 or top_k > self._MAX_NEURON_SAMPLING_TOP_K: - top_k = self._MAX_NEURON_SAMPLING_TOP_K - - return (top_k, top_p, temperature) - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNeuron, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError( - "NeuronModelRunner does not support multi-step execution.") - - # extract top_k, top_p and temperature from model_input for neuron - # forward call - sampling_params = (torch.tensor([[ - seq_group.sampling_params.top_k, seq_group.sampling_params.top_p, - seq_group.sampling_params.temperature - ] for seq_group in model_input.sampling_metadata.seq_groups])) - - if current_platform.use_neuronx_distributed(): - hidden_states = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - sampling_params=sampling_params, - adapter_ids=model_input.adapter_ids, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - elif current_platform.use_transformers_neuronx(): - # [TODO] validate on-device sampling - # The model signature may need change for on-device sampling - hidden_states = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - - # Compute the logits only if the on-device sampling is turned off as - # on-device sampling outputs the token ids. - if self._on_device_sampling_disabled: - logits = self.model.compute_logits(hidden_states, - model_input.sampling_metadata) - else: - logits = hidden_states - - # Sample the next token. 
- output = self.model.sample( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - return [output] - - @property - def vocab_size(self) -> int: - return self.model_config.get_vocab_size() - - def process_multi_modal_data_neuron(self, mm_data): - # this is a no-op for NeuronModelRunner - return mm_data - - def remove_all_loras(self): - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def add_lora(self, lora_request: LoRARequest): - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def list_loras(self) -> Set[int]: - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py deleted file mode 100644 index 4e1408300fb8..000000000000 --- a/vllm/worker/neuron_worker.py +++ /dev/null @@ -1,193 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A Neuron worker class.""" -import os -from typing import List, Optional, Set, Tuple - -import torch.distributed - -from vllm.config import VllmConfig -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor import set_random_seed -from vllm.platforms import current_platform -from vllm.platforms.neuron import NeuronFramework -from vllm.sequence import ExecuteModelRequest -from vllm.worker.neuron_model_runner import NeuronModelRunner -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, - WorkerInput) - -logger = init_logger(__name__) - - -class NeuronWorker(LocalOrDistributedWorkerBase): - """A worker class that executes the model on a group of neuron cores. - """ - - model_runner: NeuronModelRunner - - def __init__(self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - is_driver_worker: bool = False) -> None: - WorkerBase.__init__(self, vllm_config=vllm_config) - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - self.lora_config = vllm_config.lora_config - - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() - - neuron_framework = current_platform.get_neuron_framework_to_use() - if neuron_framework == NeuronFramework.TRANSFORMERS_NEURONX: - self.model_runner = self.get_tnx_model_runner(vllm_config) - elif neuron_framework == NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE: - self.model_runner = self.get_neuronx_distributed_model_runner( - vllm_config) - else: - raise NotImplementedError( - "Specified framework" + - f" {os.environ.get('VLLM_NEURON_FRAMEWORK')}" + - " is either not installed or not supported." 
+ - " Supported frameworks: " + - "[transformers-neuronx, neuronx-distributed-inference]") - - def get_tnx_model_runner(self, vllm_config): - assert (self.lora_config - is None), ("LoRA is not supported for TransformersNeuronX " - "framework.") - from vllm.worker.multi_step_neuron_model_runner import ( - MultiStepNeuronModelRunner) - if self.speculative_config is not None: - return MultiStepNeuronModelRunner(vllm_config=vllm_config) - else: - return NeuronModelRunner(vllm_config=vllm_config) - - def get_neuronx_distributed_model_runner(self, vllm_config): - from vllm.worker.multi_step_neuronx_distributed_model_runner import ( - MultiStepNeuronxDistributedModelRunner) - from vllm.worker.neuronx_distributed_model_runner import ( - NeuronxDistributedModelRunner) - if self.speculative_config is not None: - assert (self.lora_config - is None), "LoRA is not supported for Speculative Decoding" - return MultiStepNeuronxDistributedModelRunner( - vllm_config=vllm_config) - else: - return NeuronxDistributedModelRunner(vllm_config=vllm_config) - - def init_device(self) -> None: - self.init_distributed_environment() - - # Set random seed. - set_random_seed(self.model_config.seed) - - def load_model(self): - self.model_runner.load_model() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available KV blocks. - - Swapping is not yet supported, so always return num_cpu_blocks=0. - - We configure num_gpu_blocks to be equal to max_num_seqs. - """ - # Set the number of GPU blocks to be the same as the maximum number of - # sequences that can be processed in a single batch. This is equivalent - # to schedule without PagedAttention. - num_gpu_blocks = self.scheduler_config.max_num_seqs + 1 - - # Swap not yet supported with Neuron backend. - num_cpu_blocks = 0 - - return num_gpu_blocks, num_cpu_blocks - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the KV cache. - """ - - # Different values are not tested. - assert num_cpu_blocks == 0 - assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1 - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - @property - def do_metadata_broadcast(self) -> bool: - return False - - @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - return None - - @torch.inference_mode() - def prepare_worker_input( - self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - return WorkerInput(num_seq_groups=len( - execute_model_req.seq_group_metadata_list), ) - - def execute_worker(self, worker_input: WorkerInput) -> None: - pass - - def get_cache_block_size_bytes(self) -> int: - """Determine the size in bytes of a cache block. - - This is required for speculative decoding; it is not yet implemented. - """ - raise NotImplementedError - - def init_distributed_environment(self): - """Neuron uses transformers-neuronx for tensor parallelism. 
- - vLLM still needs the environment initialized when TP/PP > 1 - """ - init_distributed_environment( - world_size=1, - rank=self.rank, - local_rank=self.local_rank, - distributed_init_method=self.distributed_init_method, - backend=current_platform.dist_backend, - ) - - ensure_model_parallel_initialized( - 1, - 1, - ) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if current_platform.use_transformers_neuronx(): - raise NotImplementedError( - f"{type(self)} does not support LoRA with Neuron Framework " - f"Transformers NeuronX") - return self.model_runner.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - if current_platform.use_transformers_neuronx(): - raise NotImplementedError( - f"{type(self)} does not support LoRA with Neuron Framework " - f"Transformers NeuronX") - return self.model_runner.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - if current_platform.use_transformers_neuronx(): - raise NotImplementedError( - f"{type(self)} does not support LoRA with Neuron Framework " - f"Transformers NeuronX") - return self.model_runner.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - if current_platform.use_transformers_neuronx(): - raise NotImplementedError( - f"{type(self)} does not support LoRA with Neuron Framework " - f"Transformers NeuronX") - return self.model_runner.list_loras() diff --git a/vllm/worker/neuronx_distributed_model_runner.py b/vllm/worker/neuronx_distributed_model_runner.py deleted file mode 100644 index 2a0f4e77c99e..000000000000 --- a/vllm/worker/neuronx_distributed_model_runner.py +++ /dev/null @@ -1,294 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Set - -import torch -from neuronx_distributed_inference.models.mllama.aspect_ratio_utils import ( - get_all_supported_aspect_ratios) -from neuronx_distributed_inference.modules.generation.sampling import ( - prepare_sampling_params) -from neuronx_distributed_inference.modules.lora_serving import ( - LoraCheckpoint, LoraServingConfig) - -from vllm.config import VllmConfig -from vllm.entrypoints.openai.serving_models import LoRAModulePath -from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.neuronx_distributed import ( - _get_model_architecture, get_neuron_model) -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.worker.neuron_model_runner import (ModelInputForNeuron, - NeuronModelRunner) - -logger = init_logger(__name__) - - -class NeuronxDistributedModelRunner(NeuronModelRunner): - - def __init__( - self, - vllm_config: VllmConfig, - ): - super().__init__(vllm_config) - self.lora_checkpoint = None - self.model = None - self.lora_serving_config = None - - @staticmethod - def _get_lora_paths_strings(lora_modules: List[LoRAModulePath]): - if not lora_modules: - return None - return {_.get("name"): _.get("path") for _ in lora_modules} - - def _get_nxdi_lora_config(self): - override_neuron_config = self.model_config.override_neuron_config - lora_modules = override_neuron_config.pop("lora_modules", None) - target_modules = override_neuron_config.pop("target_modules", None) - lora_ckpt_paths = self._get_lora_paths_strings(lora_modules) - if self.lora_config.max_loras 
< len(lora_ckpt_paths): - raise ValueError( - f"Number of LoRAs ({len(lora_ckpt_paths)}) exceeds maximum " - f"allowed ({self.lora_config.max_loras})") - - return LoraServingConfig( - max_loras=self.lora_config.max_loras, - max_lora_rank=self.lora_config.max_lora_rank, - target_modules=target_modules, - lora_ckpt_paths=lora_ckpt_paths, - ) - - def load_model(self) -> None: - # Update LoRA config - if self.lora_config is not None: - self.lora_serving_config = self._get_nxdi_lora_config() - self.lora_checkpoint = LoraCheckpoint(self.lora_serving_config) - self.model = get_neuron_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - lora_serving_config=self.lora_serving_config) - - def get_nxd_sampling_params(self, sampling_metadata): - if self.model.config.neuron_config.on_device_sampling_config: - max_topk = (self.model.config.neuron_config. - on_device_sampling_config.global_topk) - else: - max_topk = self.model.config.vocab_size - - top_k = [1] * self.scheduler_config.max_num_seqs - top_p = [1.0] * self.scheduler_config.max_num_seqs - temperature = [1.0] * self.scheduler_config.max_num_seqs - - for index, sequenceGroupToSample in enumerate( - sampling_metadata.seq_groups): - top_k[index] = (sequenceGroupToSample.sampling_params.top_k - if sequenceGroupToSample.sampling_params.top_k > 0 - else max_topk) - top_p[index] = sequenceGroupToSample.sampling_params.top_p - temperature[index] = ( - sequenceGroupToSample.sampling_params.temperature) - - sampling_params = prepare_sampling_params( - batch_size=self.scheduler_config.max_num_seqs, - top_k=top_k, - top_p=top_p, - temperature=temperature) - return sampling_params - - def get_multi_modal_data_neuron(self, input_images): - raise NotImplementedError("need to restore multi-modal support") - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNeuron, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError( - "NeuronModelRunner does not support multi-step execution.") - - if _get_model_architecture( - self.model.config) != "MllamaForConditionalGeneration": - return super().execute_model(model_input, kv_caches, - intermediate_tensors, num_steps) - - sampling_params = self.get_nxd_sampling_params( - model_input.sampling_metadata) - - if model_input.multi_modal_kwargs.get('pixel_values') is not None: - hidden_states = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - seq_ids=model_input.input_block_ids, - pixel_values=model_input.multi_modal_kwargs.get( - 'pixel_values'), - aspect_ratios=model_input.multi_modal_kwargs.get( - 'aspect_ratios'), - sampling_params=sampling_params, - num_chunks=model_input.multi_modal_kwargs.get('num_chunks'), - has_image=model_input.multi_modal_kwargs.get( - 'has_image').squeeze(1), - ) - else: - bs = model_input.input_tokens.shape[0] if (model_input.input_tokens - is not None) else 1 - empty_pixel_values = torch.zeros([bs, 1, 4, 3, 560, 560], - dtype=torch.bfloat16) - empty_aspect_ratios = torch.ones([bs, 1, 2], dtype=torch.int64) - num_chunks = torch.zeros((bs, 1), dtype=torch.int32) - has_image = torch.zeros([bs], dtype=torch.int32) - hidden_states = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - seq_ids=model_input.input_block_ids, - pixel_values=empty_pixel_values, -
aspect_ratios=empty_aspect_ratios, - sampling_params=sampling_params, - num_chunks=num_chunks, - has_image=has_image, - ) - - output = self.model.sample( - hidden_states=hidden_states, - sampling_metadata=model_input.sampling_metadata, - ) - - return [output] - - def process_multi_modal_data_neuron(self, mm_data): - # Neuron uses aspect_ratios instead of aspect_ratio_ids - all_supported_aspect_ratios = get_all_supported_aspect_ratios( - self.model.config.vision_config.max_num_tiles) - aspect_ratio_ids = mm_data.get("aspect_ratio_ids") - mm_data["aspect_ratios"] = torch.tensor( - all_supported_aspect_ratios[aspect_ratio_ids]).unsqueeze(0) - - # Neuron's num_chunks is HF's num_tiles - mm_data["num_chunks"] = mm_data.get("num_tiles") - - # Input has an image if it has pixel_values - bs = mm_data["num_chunks"].shape[0] - pixel_values = mm_data.get("pixel_values") - if pixel_values is not None and not torch.all(pixel_values == 0): - mm_data["has_image"] = torch.ones(bs) - - else: - mm_data["has_image"] = torch.zeros(bs) - return mm_data - - def _get_lora_adapter_ids(self, seq_group_metadata_list): - # set LoRA adapter IDs for multi-lora serving - batch_size = len(seq_group_metadata_list) - if self.lora_checkpoint is not None: - # "0" indicates NxDI to use the base model for inference - adapter_ids = ["0"] * batch_size - for idx, seq_group_metadata in enumerate(seq_group_metadata_list): - if seq_group_metadata.lora_request is not None: - adapter_ids[ - idx] = seq_group_metadata.lora_request.lora_name - - # convert adapter_ids from strings to integers - adapter_ids = self.lora_checkpoint.convert_adapter_ids_to_indices( - adapter_ids, batch_size) - else: - adapter_ids = torch.zeros((batch_size), dtype=torch.int32) - - return adapter_ids - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForNeuron: - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt - # Prepare input tensors. - if is_prompt: - (input_tokens, input_positions, input_block_ids, seq_lens, - multi_modal_kwargs - ) = self._prepare_prompt(seq_group_metadata_list) - else: - (input_tokens, input_positions, - input_block_ids) = self._prepare_decode(seq_group_metadata_list) - seq_lens = None - - if not self._on_device_sampling_disabled: - for seq_group_metadata in seq_group_metadata_list: - sampling_params = seq_group_metadata.sampling_params - top_k, top_p, temperature = ( - self._convert_to_neuron_sampling_params(sampling_params)) - sampling_params.top_k = top_k - sampling_params.top_p = top_p - sampling_params.temperature = temperature - - # we need multi_modal_data for later tokens as well - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - for seq_group_metadata in seq_group_metadata_list: - mm_data = seq_group_metadata.multi_modal_data - if mm_data: - multi_modal_kwargs_list.append(mm_data) - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - lora_adapter_ids = self._get_lora_adapter_ids(seq_group_metadata_list) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - # query_lens is not needed if chunked prefill is not - # supported. Since neuron worker doesn't support chunked prefill - # just use seq_lens instead. 
- seq_lens, - self.device, - self.pin_memory, - generators=self.get_generators(finished_requests_ids)) - - return ModelInputForNeuron(input_tokens=input_tokens, - input_positions=input_positions, - input_block_ids=input_block_ids, - sampling_metadata=sampling_metadata, - multi_modal_kwargs=multi_modal_kwargs, - adapter_ids=lora_adapter_ids) - - def remove_all_loras(self): - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config") - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config") - - def add_lora(self, lora_request: LoRARequest): - logger.warning( - "Adding LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config. If you supplied " - "the parameter, you can ignore this warning. Ignoring " - "LoRA request: %s", lora_request) - - def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config") - - def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config") - - def list_loras(self) -> Set[int]: - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config")
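
For reference, the sampling-parameter mapping that the removed NeuronModelRunner applied before each forward pass can be reproduced as a small standalone helper. This is a minimal sketch, not part of the diff above: the 256 cap is an assumption standing in for the class constant _MAX_NEURON_SAMPLING_TOP_K, which is defined outside the hunks shown here.

from typing import Tuple

# Assumed cap; the removed runner reads this from _MAX_NEURON_SAMPLING_TOP_K.
MAX_NEURON_SAMPLING_TOP_K = 256


def convert_to_neuron_sampling_params(
        top_k: int, top_p: float,
        temperature: float) -> Tuple[int, float, float]:
    """Clamp vLLM sampling params to what the Neuron on-device sampler accepts."""
    if temperature == 0.0:
        # Zero temperature means greedy decoding: top_k=1, neutral top_p/temperature.
        return 1, 1.0, 1.0
    if top_k < 1 or top_k > MAX_NEURON_SAMPLING_TOP_K:
        # Unbounded (-1) or oversized top_k falls back to the largest supported value.
        top_k = MAX_NEURON_SAMPLING_TOP_K
    return top_k, top_p, temperature


# Example: a greedy request versus a bounded top-k request.
assert convert_to_neuron_sampling_params(-1, 0.9, 0.0) == (1, 1.0, 1.0)
assert convert_to_neuron_sampling_params(40, 0.9, 0.7) == (40, 0.9, 0.7)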