1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
* text=auto eol=lf
9 changes: 9 additions & 0 deletions .gitignore
@@ -0,0 +1,9 @@
.ruff_cache
_version.py
build
*.so
*.pyc
.pytest_cache

# Ascend Specific
fusion_result.json
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "third_party/kvcache-ops"]
path = third_party/kvcache-ops
url = https://gitee.com/openeuler/kvcache-ops.git
60 changes: 60 additions & 0 deletions CMakeLists.txt
@@ -0,0 +1,60 @@
# Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.

# CMake lowest version requirement
cmake_minimum_required(VERSION 3.16.0)
# project information
project(c_ops)

set(CMAKE_CXX_STANDARD 17)
set(LMC_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}")
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0)

set(SOC_VERSION ${SOC_VERSION})
set(ARCH ${ARCH})

if (NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Release)" FORCE)
endif()

if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
set(ARCH_SUBDIR "aarch64-linux")
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
set(ARCH_SUBDIR "x86_64-linux")
else()
message(FATAL_ERROR "Unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}")
endif()

add_subdirectory(third_party/kvcache-ops)
add_subdirectory(csrc)


set(TORCH_LIBS_DIR "${TORCH_PATH}/lib")

target_link_options(c_ops PRIVATE
"-Wl,-rpath,$ORIGIN:$ORIGIN/lib"
"-Wl,-rpath,${LMC_INSTALL_PATH}"
)

target_link_directories(
c_ops
PRIVATE
${TORCH_LIBS_DIR}
${TORCH_NPU_PATH}/lib/
${ASCEND_CANN_PACKAGE_PATH}/lib64
${ASCEND_CANN_PACKAGE_PATH}/${ARCH_SUBDIR}/devlib
)

target_link_libraries(
c_ops
PUBLIC
${TORCH_LIBRARIES}
libtorch_npu.so
cache_kernels
ascendcl
platform
ascend_hal
tiling_api
)


install(TARGETS c_ops cache_kernels DESTINATION ${LMC_INSTALL_PATH})
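
This library is normally built through the pip install described in the README below, but for a standalone configure the cache variables referenced above (`TORCH_PATH`, `TORCH_NPU_PATH`, `ASCEND_CANN_PACKAGE_PATH`, `SOC_VERSION`, `ARCH`) must be supplied on the command line. A minimal sketch, assuming a default CANN install location; the `SOC_VERSION` and `ARCH` values are illustrative only:

```bash
# resolve the torch and torch_npu package directories from the active Python env
cmake -S . -B build \
  -DCMAKE_BUILD_TYPE=Release \
  -DTORCH_PATH="$(python3 -c 'import torch, os; print(os.path.dirname(torch.__file__))')" \
  -DTORCH_NPU_PATH="$(python3 -c 'import torch_npu, os; print(os.path.dirname(torch_npu.__file__))')" \
  -DASCEND_CANN_PACKAGE_PATH=/usr/local/Ascend/ascend-toolkit/latest \
  -DSOC_VERSION=Ascend910B1 \
  -DARCH=aarch64
cmake --build build -j
```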
117 changes: 117 additions & 0 deletions README.md
@@ -14,4 +14,121 @@

--------------------------------------------------------------------------------

## Overview

LMCache-Ascend is a community-maintained plugin that enables LMCache on Ascend NPUs.


## Prerequisites

To use LMCache-Ascend on Ascend NPU hardware, make sure the following prerequisites are satisfied.

- Hardware: Atlas 800I A2 Inference series. Support for other series, such as the A3 Inference/Training and the 300I Duo, is experimental.
- OS: Linux-based.
- Software:
- **Python**: >= 3.10, <= 3.11
- **CANN Toolkit**: >= 8.2rc1
- **Ascend Driver**: >= 24.1
- **PyTorch**: == 2.5.1, **Torch-npu**: == 2.5.1.post1.dev20250619
- **vLLM**: v0.9.2 & **vLLM-Ascend**: v0.9.2rc1

## Getting Started

### Clone LMCache-Ascend Repo

Our repo contains a kvcache-ops submodule for ease of maintenance, so we recommend cloning the repo with submodules. If you have already cloned without them, see the note after the command below.

```bash
cd /workspace
git clone --recurse-submodules https://github.com/LMCache/LMCache-Ascend.git
```
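
If the repo was already cloned without submodules, you can fetch them afterwards:

```bash
cd /workspace/LMCache-Ascend
git submodule update --init --recursive
```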

### Docker

```bash
cd /workspace/LMCache-Ascend
docker build -f docker/Dockerfile.a2.openEuler -t lmcache-ascend:v0.3.3-vllm-ascend-v0.9.2rc1-910b-cann-8.2rc1-py3.11-openeuler-22.03 .
```

Once the image is built, run the container with the following command:
```bash
DEVICE_LIST="0,1,2,3,4,5,6,7"
docker run -it \
--privileged \
--cap-add=SYS_PTRACE \
--net=host \
--name lmcache-ascend-dev \
--rm \
-e ASCEND_VISIBLE_DEVICES=${DEVICE_LIST} \
-e ASCEND_RT_VISIBLE_DEVICES=${DEVICE_LIST} \
-e ASCEND_TOTAL_MEMORY_GB=32 \
-e VLLM_TARGET_DEVICE=npu \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
-v /etc/localtime:/etc/localtime \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /var/log/npu:/var/log/npu \
-v /sys/fs/cgroup:/sys/fs/cgroup:ro \
-v /dev/davinci_manager:/dev/davinci_manager \
-v /dev/devmm_svm:/dev/devmm_svm \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /etc/hccn.conf:/etc/hccn.conf \
lmcache-ascend:v0.3.3-vllm-ascend-v0.9.2rc1-910b-cann-8.2rc1-py3.11-openeuler-22.03 \
/bin/bash
```
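
To sanity-check the container, you can verify that the NPUs are visible using the mounted `npu-smi` tool (output format varies by driver version):

```bash
# run inside the container
npu-smi info
```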

### Manual Installation

Assuming your working directory is `/workspace`.

1. Clone and Install vLLM Repo
```bash
VLLM_REPO=https://github.com/vllm-project/vllm.git
VLLM_TAG=v0.9.2
git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /workspace/vllm
# NOTE: An Ascend Triton build exists, but we do not currently support it properly, so we uninstall it.
VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /workspace/vllm/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton
```
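
To confirm the editable install resolved correctly, a quick check (the version should match the tag above):

```bash
python3 -c "import vllm; print(vllm.__version__)"   # expect 0.9.2 for the tag above
```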

2. Clone and Install vLLM Ascend Repo
```bash
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh

VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
VLLM_ASCEND_TAG=v0.9.2rc1
git clone --depth 1 $VLLM_ASCEND_REPO --branch $VLLM_ASCEND_TAG /workspace/vllm-ascend
# apply patch to v0.9.2rc1
cd /workspace/vllm-ascend && \
git apply -p1 /workspace/LMCache-Ascend/docker/kv-connector-v1.diff

export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index-url https://download.pytorch.org/whl/cpu/
```
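
Before moving on, you can verify that torch_npu initializes and sees the devices; a minimal check, assuming the CANN environment scripts above are sourced in the current shell:

```bash
python3 -c "import torch, torch_npu; print(torch_npu.npu.is_available(), torch_npu.npu.device_count())"
```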

3. Clone and Install LMCache Repo

```bash
LMCACHE_REPO=https://github.com/LMCache/LMCache.git
LMCACHE_TAG=v0.3.3
git clone --depth 1 $LMCACHE_REPO --branch $LMCACHE_TAG /workspace/LMCache
# our build targets arm64; infinistore only supports x86_64, so restrict it to that platform
sed -i "s/^infinistore$/infinistore; platform_machine == 'x86_64'/" /workspace/LMCache/requirements/common.txt
export NO_CUDA_EXT=1 && python3 -m pip install -v -e /workspace/LMCache
```
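
To confirm LMCache installed without its CUDA extensions:

```bash
python3 -m pip show lmcache
python3 -c "import lmcache"   # should import cleanly with NO_CUDA_EXT=1
```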

4. Install LMCache-Ascend Repo

```bash
cd /workspace/LMCache-Ascend
python3 -m pip install -v --no-build-isolation -e .
```
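
As a smoke test you can import the package; note that the module name `lmcache_ascend` is an assumption about the package layout:

```bash
python3 -c "import lmcache_ascend  # hypothetical module name; adjust if the package differs"
```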

## FAQ

1. Why do I get a HostRegisterError?
- If you encounter this error inside a container, make sure the container is granted the IPC_LOCK capability (see the sketch below).
- Otherwise, check that your driver version is >= 24.0.
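
For example, with Docker the IPC_LOCK capability can be added at container start; a minimal sketch extending the run command from the Docker section above (device mounts and other flags omitted for brevity):

```bash
docker run -it \
    --cap-add=SYS_PTRACE \
    --cap-add=IPC_LOCK \
    lmcache-ascend:v0.3.3-vllm-ascend-v0.9.2rc1-910b-cann-8.2rc1-py3.11-openeuler-22.03 \
    /bin/bash
```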
44 changes: 44 additions & 0 deletions csrc/CMakeLists.txt
@@ -0,0 +1,44 @@
include(utils.cmake)
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")

find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11")
find_package(pybind11 REQUIRED)

message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")

file(GLOB SRC_FILES
${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)

find_package(Torch REQUIRED)

include_directories(
${CMAKE_CURRENT_SOURCE_DIR}
${pybind11_INCLUDE_DIRS}
${PYTHON_INCLUDE_PATH}
${TORCH_INCLUDE_DIRS}
${TORCH_NPU_PATH}/include
${ASCEND_CANN_PACKAGE_PATH}/include
${ASCEND_CANN_PACKAGE_PATH}/aarch64-linux/ascendc/include
${ASCEND_CANN_PACKAGE_PATH}/aarch64-linux/include/experiment/platform
${ASCEND_CANN_PACKAGE_PATH}/aarch64-linux/include/experiment/ascend_hal
${ASCEND_CANN_PACKAGE_PATH}/x86_64-linux/include/experiment/platform
${ASCEND_CANN_PACKAGE_PATH}/x86_64-linux/include/experiment/ascend_hal
)


set(
INCLUDES
${TORCH_INCLUDE_DIRS}
${TORCH_NPU_PATH}/include
${ASCEND_CANN_PACKAGE_PATH}/include
${ASCEND_CANN_PACKAGE_PATH}/aarch64-linux/ascendc/include
${ASCEND_CANN_PACKAGE_PATH}/aarch64-linux/include/experiment/platform
${ASCEND_CANN_PACKAGE_PATH}/aarch64-linux/include/experiment/ascend_hal
)

set(PYMODULE_FILES
${SRC_FILES}
)

pybind11_add_module(c_ops ${PYMODULE_FILES})
32 changes: 32 additions & 0 deletions csrc/cachegen_kernels.cpp
@@ -0,0 +1,32 @@
#include "cachegen_kernels.h"
#include <pybind11/pybind11.h>
#include <Python.h>

namespace py = pybind11;
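
// NOTE: The CacheGen compression kernels have not been ported to Ascend yet.
// Each stub below sets a Python NotImplementedError and rethrows it through
// pybind11, so Python callers see an ordinary exception rather than a crash.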

void encode_cuda_new(const at::Tensor& cdf, const at::Tensor& input_sym,
at::Tensor& output_buffer, at::Tensor& output_lengths) {
// TODO:
PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend.");
throw py::error_already_set();
}

void decode_cuda_new(const at::Tensor& cdf, const at::Tensor& bytestreams,
const at::Tensor& lengths, at::Tensor& output) {
// TODO:
PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend.");
throw py::error_already_set();
}

void decode_cuda_prefsum(const at::Tensor& cdf, const at::Tensor& bytestreams,
const at::Tensor& lengths, at::Tensor& output) {
// TODO:
PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend.");
throw py::error_already_set();
}

at::Tensor calculate_cdf(const at::Tensor& input, const int max_bins) {
// TODO:
PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend.");
throw py::error_already_set();
}
16 changes: 16 additions & 0 deletions csrc/cachegen_kernels.h
@@ -0,0 +1,16 @@
#pragma once
#include <ATen/ATen.h>
#include <pybind11/pybind11.h>
#include <torch/torch.h>
#include <torch/extension.h>

void encode_cuda_new(const at::Tensor& cdf, const at::Tensor& input_sym,
at::Tensor& output_buffer, at::Tensor& output_lengths);

void decode_cuda_new(const at::Tensor& cdf, const at::Tensor& bytestreams,
const at::Tensor& lengths, at::Tensor& output);

void decode_cuda_prefsum(const at::Tensor& cdf, const at::Tensor& bytestreams,
const at::Tensor& lengths, at::Tensor& output);

at::Tensor calculate_cdf(const at::Tensor& input, const int max_bins);