From 1c1afe09a5f738ad12ffc73ea983e93d69c070d7 Mon Sep 17 00:00:00 2001
From: l1cacheDell <d31409163@163.com>
Date: Mon, 12 May 2025 05:49:14 +0000
Subject: [PATCH 1/2] Speed up compilation in csrc/setup_cuda.py

---
 csrc/setup_cuda.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/csrc/setup_cuda.py b/csrc/setup_cuda.py
index d5b197c8ce38..cfd5b2489448 100644
--- a/csrc/setup_cuda.py
+++ b/csrc/setup_cuda.py
@@ -138,6 +138,7 @@ def get_gencode_flags():
 
 nvcc_compile_args = gencode_flags
 update_git_submodule()
+os.environ.pop('PADDLE_CUDA_ARCH_LIST', None)
 nvcc_compile_args += [
     "-O3",
     "-DNDEBUG",
@@ -192,11 +193,8 @@ def get_gencode_flags():
     ]
 
 if cc >= 80 and nvcc_version >= Version("12.4"):
-    os.environ.pop('PADDLE_CUDA_ARCH_LIST', None)
     nvcc_compile_args += [
-        "-std=c++17",
         "--use_fast_math",
-        "--threads=8",
         "-D_GLIBCXX_USE_CXX11_ABI=1",
     ]
     sources += ["./gpu/sage_attn_kernels/sageattn_fused.cu"]
@@ -235,7 +233,7 @@ def get_gencode_flags():
     ext_modules=CUDAExtension(
         sources=sources,
         extra_compile_args={
-            "cxx": ["-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
+            "cxx": ["-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16", "--threads=8"],
             "nvcc": nvcc_compile_args,
         },
         libraries=["cublasLt"],

From bbe6fe6333aaa1e3cee8ed971e00b9798a652801 Mon Sep 17 00:00:00 2001
From: l1cacheDell <d31409163@163.com>
Date: Tue, 13 May 2025 07:58:44 +0000
Subject: [PATCH 2/2] Move PADDLE_CUDA_ARCH_LIST pop back into the sage_attn branch

---
 csrc/setup_cuda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/setup_cuda.py b/csrc/setup_cuda.py
index cfd5b2489448..13a27b15e0f2 100644
--- a/csrc/setup_cuda.py
+++ b/csrc/setup_cuda.py
@@ -138,7 +138,6 @@ def get_gencode_flags():
 
 nvcc_compile_args = gencode_flags
 update_git_submodule()
-os.environ.pop('PADDLE_CUDA_ARCH_LIST', None)
 nvcc_compile_args += [
     "-O3",
     "-DNDEBUG",
@@ -193,6 +192,7 @@ def get_gencode_flags():
     ]
 
 if cc >= 80 and nvcc_version >= Version("12.4"):
+    os.environ.pop('PADDLE_CUDA_ARCH_LIST', None)
     nvcc_compile_args += [
         "--use_fast_math",
         "-D_GLIBCXX_USE_CXX11_ABI=1",