1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
* text=auto eol=lf
9 changes: 9 additions & 0 deletions .gitignore
@@ -0,0 +1,9 @@
.ruff_cache
_version.py
build
*.so
*.pyc
.pytest_cache

# Ascend Specific
fusion_result.json
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "third_party/kvcache-ops"]
path = third_party/kvcache-ops
url = https://gitee.com/openeuler/kvcache-ops.git
60 changes: 60 additions & 0 deletions CMakeLists.txt
@@ -0,0 +1,60 @@
# Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.

# CMake lowest version requirement
cmake_minimum_required(VERSION 3.16.0)
# project information
project(c_ops)

set(CMAKE_CXX_STANDARD 17)
set(LMC_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}")
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0)

set(SOC_VERSION ${SOC_VERSION})
set(ARCH ${ARCH})

if (NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Release)" FORCE)
endif()

if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
set(ARCH_SUBDIR "aarch64-linux")
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
set(ARCH_SUBDIR "x86_64-linux")
else()
message(FATAL_ERROR "Unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}")
endif()

add_subdirectory(third_party/kvcache-ops)
add_subdirectory(csrc)


set(TORCH_LIBS_DIR "${TORCH_PATH}/lib")

target_link_options(c_ops PRIVATE
"-Wl,-rpath,$ORIGIN:$ORIGIN/lib"
"-Wl,-rpath,${LMC_INSTALL_PATH}"
)

target_link_directories(
c_ops
PRIVATE
${TORCH_LIBS_DIR}
${TORCH_NPU_PATH}/lib/
${ASCEND_CANN_PACKAGE_PATH}/lib64
${ASCEND_CANN_PACKAGE_PATH}/${ARCH_SUBDIR}/devlib
)

target_link_libraries(
c_ops
PUBLIC
${TORCH_LIBRARIES}
libtorch_npu.so
cache_kernels
ascendcl
platform
ascend_hal
tiling_api
)


install(TARGETS c_ops cache_kernels DESTINATION ${LMC_INSTALL_PATH})
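
This library is normally built through the pip install described in the README below, but for a standalone configure the cache variables referenced above (`TORCH_PATH`, `TORCH_NPU_PATH`, `ASCEND_CANN_PACKAGE_PATH`, `SOC_VERSION`, `ARCH`) must be supplied on the command line. A minimal sketch, assuming a default CANN install location; the `SOC_VERSION` and `ARCH` values are illustrative only:

```bash
# resolve the torch and torch_npu package directories from the active Python env
cmake -S . -B build \
  -DCMAKE_BUILD_TYPE=Release \
  -DTORCH_PATH="$(python3 -c 'import torch, os; print(os.path.dirname(torch.__file__))')" \
  -DTORCH_NPU_PATH="$(python3 -c 'import torch_npu, os; print(os.path.dirname(torch_npu.__file__))')" \
  -DASCEND_CANN_PACKAGE_PATH=/usr/local/Ascend/ascend-toolkit/latest \
  -DSOC_VERSION=Ascend910B1 \
  -DARCH=aarch64
cmake --build build -j
```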
117 changes: 117 additions & 0 deletions README.md
@@ -14,4 +14,121 @@

--------------------------------------------------------------------------------

## Overview

LMCache-Ascend is a community-maintained plugin that enables LMCache on Ascend NPUs.


## Prerequisites

To use LMCache-Ascend on Ascend NPU hardware, make sure the following prerequisites are satisfied.

- Hardware: Atlas 800I A2 Inference series. Support for other series, such as the A3 Inference/Training and the 300I Duo, is experimental.
- OS: Linux-based.
- Software:
- **Python**: >= 3.10, <= 3.11
- **CANN Toolkit**: >= 8.2rc1
- **Ascend Driver**: >= 24.1
- **PyTorch**: == 2.5.1, **Torch-npu**: == 2.5.1.post1.dev20250619
- **vLLM**: v0.9.2 & **vLLM-Ascend**: v0.9.2rc1

## Getting Started

### Clone LMCache-Ascend Repo

Our repo contains a kvcache-ops submodule for ease of maintenance, so we recommend cloning the repo with submodules. If you have already cloned without them, see the note after the command below.

```bash
cd /workspace
git clone --recurse-submodules https://github.com/LMCache/LMCache-Ascend.git
```
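
If the repo was already cloned without submodules, you can fetch them afterwards:

```bash
cd /workspace/LMCache-Ascend
git submodule update --init --recursive
```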

### Docker

```bash
cd /workspace/LMCache-Ascend
docker build -f docker/Dockerfile.a2.openEuler -t lmcache-ascend:v0.3.3-vllm-ascend-v0.9.2rc1-910b-cann-8.2rc1-py3.11-openeuler-22.03 .
```

Once the image is built, run the container with the following command:
```bash
DEVICE_LIST="0,1,2,3,4,5,6,7"
docker run -it \
--privileged \
--cap-add=SYS_PTRACE \
--net=host \
--name lmcache-ascend-dev \
--rm \
-e ASCEND_VISIBLE_DEVICES=${DEVICE_LIST} \
-e ASCEND_RT_VISIBLE_DEVICES=${DEVICE_LIST} \
-e ASCEND_TOTAL_MEMORY_GB=32 \
-e VLLM_TARGET_DEVICE=npu \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
-v /etc/localtime:/etc/localtime \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /var/log/npu:/var/log/npu \
-v /sys/fs/cgroup:/sys/fs/cgroup:ro \
-v /dev/davinci_manager:/dev/davinci_manager \
-v /dev/devmm_svm:/dev/devmm_svm \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /etc/hccn.conf:/etc/hccn.conf \
lmcache-ascend:v0.3.3-vllm-ascend-v0.9.2rc1-910b-cann-8.2rc1-py3.11-openeuler-22.03 \
/bin/bash
```
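
To sanity-check the container, you can verify that the NPUs are visible using the mounted `npu-smi` tool (output format varies by driver version):

```bash
# run inside the container
npu-smi info
```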

### Manual Installation

Assuming your working directory is `/workspace`.

1. Clone and Install vLLM Repo
```bash
VLLM_REPO=https://github.com/vllm-project/vllm.git
VLLM_TAG=v0.9.2
git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /workspace/vllm
# NOTE: An Ascend Triton build exists, but we do not currently support it properly, so we uninstall it.
VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /workspace/vllm/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton
```
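
To confirm the editable install resolved correctly, a quick check (the version should match the tag above):

```bash
python3 -c "import vllm; print(vllm.__version__)"   # expect 0.9.2 for the tag above
```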

2. Clone and Install vLLM Ascend Repo
```bash
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh

VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
VLLM_ASCEND_TAG=v0.9.2rc1
git clone --depth 1 $VLLM_ASCEND_REPO --branch $VLLM_ASCEND_TAG /workspace/vllm-ascend
# apply patch to v0.9.2rc1
cd /workspace/vllm-ascend && \
git apply -p1 /workspace/LMCache-Ascend/docker/kv-connector-v1.diff

export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index-url https://download.pytorch.org/whl/cpu/
```
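
Before moving on, you can verify that torch_npu initializes and sees the devices; a minimal check, assuming the CANN environment scripts above are sourced in the current shell:

```bash
python3 -c "import torch, torch_npu; print(torch_npu.npu.is_available(), torch_npu.npu.device_count())"
```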

3. Clone and Install LMCache Repo

```bash
LMCACHE_REPO=https://github.com/LMCache/LMCache.git
LMCACHE_TAG=v0.3.3
git clone --depth 1 $LMCACHE_REPO --branch $LMCACHE_TAG /workspace/LMCache
# our build targets arm64; infinistore only supports x86_64, so restrict it to that platform
sed -i "s/^infinistore$/infinistore; platform_machine == 'x86_64'/" /workspace/LMCache/requirements/common.txt
export NO_CUDA_EXT=1 && python3 -m pip install -v -e /workspace/LMCache
```
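
To confirm LMCache installed without its CUDA extensions:

```bash
python3 -m pip show lmcache
python3 -c "import lmcache"   # should import cleanly with NO_CUDA_EXT=1
```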

4. Install LMCache-Ascend Repo

```bash
cd /workspace/LMCache-Ascend
python3 -m pip install -v --no-build-isolation -e .
```
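
As a smoke test you can import the package; note that the module name `lmcache_ascend` is an assumption about the package layout:

```bash
python3 -c "import lmcache_ascend  # hypothetical module name; adjust if the package differs"
```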

## FAQ

1. Why do I get a HostRegisterError?
- If you encounter this error inside a container, make sure the container is granted the IPC_LOCK capability (see the sketch below).
- Otherwise, check that your driver version is >= 24.0.
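
For example, with Docker the IPC_LOCK capability can be added at container start; a minimal sketch extending the run command from the Docker section above (device mounts and other flags omitted for brevity):

```bash
docker run -it \
    --cap-add=SYS_PTRACE \
    --cap-add=IPC_LOCK \
    lmcache-ascend:v0.3.3-vllm-ascend-v0.9.2rc1-910b-cann-8.2rc1-py3.11-openeuler-22.03 \
    /bin/bash
```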
44 changes: 44 additions & 0 deletions csrc/CMakeLists.txt
@@ -0,0 +1,44 @@
include(utils.cmake)
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")

find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11")
find_package(pybind11 REQUIRED)

message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")

file(GLOB SRC_FILES
${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)

find_package(Torch REQUIRED)

include_directories(
${CMAKE_CURRENT_SOURCE_DIR}
${pybind11_INCLUDE_DIRS}
${PYTHON_INCLUDE_PATH}
${TORCH_INCLUDE_DIRS}
${TORCH_NPU_PATH}/include
${ASCEND_CANN_PACKAGE_PATH}/include
${ASCEND_CANN_PACKAGE_PATH}/aarch64-linux/ascendc/include
${ASCEND_CANN_PACKAGE_PATH}/aarch64-linux/include/experiment/platform
${ASCEND_CANN_PACKAGE_PATH}/aarch64-linux/include/experiment/ascend_hal
${ASCEND_CANN_PACKAGE_PATH}/x86_64-linux/include/experiment/platform
${ASCEND_CANN_PACKAGE_PATH}/x86_64-linux/include/experiment/ascend_hal
)


set(
INCLUDES
${TORCH_INCLUDE_DIRS}
${TORCH_NPU_PATH}/include
${ASCEND_CANN_PACKAGE_PATH}/include
${ASCEND_CANN_PACKAGE_PATH}/aarch64-linux/ascendc/include
${ASCEND_CANN_PACKAGE_PATH}/aarch64-linux/include/experiment/platform
${ASCEND_CANN_PACKAGE_PATH}/aarch64-linux/include/experiment/ascend_hal
)

set(PYMODULE_FILES
${SRC_FILES}
)

pybind11_add_module(c_ops ${PYMODULE_FILES})
32 changes: 32 additions & 0 deletions csrc/cachegen_kernels.cpp
@@ -0,0 +1,32 @@
#include "cachegen_kernels.h"
#include <pybind11/pybind11.h>
#include <Python.h>

namespace py = pybind11;
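
// NOTE: The CacheGen compression kernels have not been ported to Ascend yet.
// Each stub below sets a Python NotImplementedError and rethrows it through
// pybind11, so Python callers see an ordinary exception rather than a crash.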

void encode_cuda_new(const at::Tensor& cdf, const at::Tensor& input_sym,
at::Tensor& output_buffer, at::Tensor& output_lengths) {
// TODO:
PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend.");
throw py::error_already_set();
}

void decode_cuda_new(const at::Tensor& cdf, const at::Tensor& bytestreams,
const at::Tensor& lengths, at::Tensor& output) {
// TODO:
PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend.");
throw py::error_already_set();
}

void decode_cuda_prefsum(const at::Tensor& cdf, const at::Tensor& bytestreams,
const at::Tensor& lengths, at::Tensor& output) {
// TODO:
PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend.");
throw py::error_already_set();
}

at::Tensor calculate_cdf(const at::Tensor& input, const int max_bins) {
// TODO:
PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend.");
throw py::error_already_set();
}
16 changes: 16 additions & 0 deletions csrc/cachegen_kernels.h
@@ -0,0 +1,16 @@
#pragma once
#include <ATen/ATen.h>
#include <pybind11/pybind11.h>
#include <torch/torch.h>
#include <torch/extension.h>

void encode_cuda_new(const at::Tensor& cdf, const at::Tensor& input_sym,
at::Tensor& output_buffer, at::Tensor& output_lengths);

void decode_cuda_new(const at::Tensor& cdf, const at::Tensor& bytestreams,
const at::Tensor& lengths, at::Tensor& output);

void decode_cuda_prefsum(const at::Tensor& cdf, const at::Tensor& bytestreams,
const at::Tensor& lengths, at::Tensor& output);

at::Tensor calculate_cdf(const at::Tensor& input, const int max_bins);