Commit 650721c

add global clear_workspaces
1 parent 04ff4c4 commit 650721c

5 files changed: +26 -4 lines changed

example/test_linear.py

Lines changed: 2 additions & 0 deletions
@@ -116,3 +116,5 @@ def dequantize(qweight, qzeros, scales, group_size: int = 128):
 print(f'abs_diff {abs_diff:4f}, '
       f'rel_diff {rel_diff:4f}, '
       f'outliers {outliers:4f}')
+
+tm.Linear.clear_workspaces()

src/turbomind/api/python/bind.cpp

Lines changed: 2 additions & 1 deletion
@@ -342,5 +342,6 @@ PYBIND11_MODULE(_turbomind_ext, m)
             auto _out = TorchTensorToTurbomindTensor(out);
             auto stream = reinterpret_cast<cudaStream_t>(stream_id);
             return self->forward(*_in, *_out, stream);
-        });
+        })
+        .def_static("clear_workspaces", &turbomind::Linear::clearWorkspaces);
 }
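Note: the semicolon after the forward lambda's closing "})" moves down one line so the new .def_static call can be chained onto the same binding expression, which exposes clearWorkspaces to Python without requiring an instance. A minimal, self-contained pybind11 sketch of the same def_static pattern (the module and class names below are placeholders, not turbomind's):

// def_static_sketch.cpp -- illustrative only; "Cache" and "example_ext" are
// placeholder names, not part of turbomind.
#include <pybind11/pybind11.h>

namespace py = pybind11;

struct Cache {
    static void clear() { /* drop any statically cached resources */ }
};

PYBIND11_MODULE(example_ext, m)
{
    py::class_<Cache>(m, "Cache")
        .def(py::init<>())
        // def_static binds the function as a Python staticmethod, so it is
        // callable as example_ext.Cache.clear() with no instance.
        .def_static("clear", &Cache::clear);
}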

src/turbomind/api/python/linear.cc

Lines changed: 16 additions & 3 deletions
@@ -283,6 +283,10 @@ struct Linear::Impl {
         }
     }

+    static void clearWorkspaces() {
+        workspace_cache_.clear();
+    }
+
 private:
     static gemm::Workspace& getWorkspace(int device_id, cudaStream_t stream)
     {
@@ -295,9 +299,13 @@ struct Linear::Impl {
         }

         // create a new workspace if cache missed
-        auto workspace = std::shared_ptr<gemm::Workspace>(new gemm::Workspace, [](gemm::Workspace* p) {
-            cudaFreeAsync(p->barriers, 0);
-            cudaFreeAsync(p->partials, 0);
+        auto workspace = std::shared_ptr<gemm::Workspace>(new gemm::Workspace, [device_id](gemm::Workspace* p) {
+            int old{};
+            check_cuda_error(cudaGetDevice(&old));
+            check_cuda_error(cudaSetDevice(device_id));
+            check_cuda_error(cudaFree(p->barriers));
+            check_cuda_error(cudaFree(p->partials));
+            check_cuda_error(cudaSetDevice(old));
         });

         workspace->barriers_size = gemm::Gemm::kBarriersSize;
@@ -349,4 +357,9 @@ void Linear::forward(const Tensor& in, Tensor& out, cudaStream_t stream)
 {
     impl_->forward(in, out, stream);
 }
+
+void Linear::clearWorkspaces() {
+    Linear::Impl::clearWorkspaces();
+}
+
 } // namespace turbomind
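Two changes land in linear.cc: the workspace deleter now switches to the workspace's device before freeing (and frees synchronously with cudaFree instead of cudaFreeAsync on the default stream), and a static clearWorkspaces() empties the static workspace cache, which the new public Linear::clearWorkspaces() forwards to. A minimal sketch of the device-scoped free pattern, assuming a placeholder DeviceBuffer type rather than the repo's gemm::Workspace:

// device_scoped_free_sketch.cu -- illustrative only; DeviceBuffer is a
// placeholder, not turbomind's gemm::Workspace.
#include <cuda_runtime.h>
#include <memory>

struct DeviceBuffer {
    void* data{};
};

std::shared_ptr<DeviceBuffer> make_buffer(int device_id, size_t bytes)
{
    int old{};
    cudaGetDevice(&old);
    cudaSetDevice(device_id);

    auto* buf = new DeviceBuffer{};
    cudaMalloc(&buf->data, bytes);
    cudaSetDevice(old);

    // The deleter captures device_id so the free runs on the device that owns
    // the allocation, then restores whatever device the caller had current.
    return std::shared_ptr<DeviceBuffer>(buf, [device_id](DeviceBuffer* p) {
        int prev{};
        cudaGetDevice(&prev);
        cudaSetDevice(device_id);
        cudaFree(p->data);
        cudaSetDevice(prev);
        delete p;
    });
}

Capturing the device id in the deleter matters because a shared_ptr's deleter can run later on any thread, with any device current at that point.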

src/turbomind/api/python/linear.h

Lines changed: 2 additions & 0 deletions
@@ -28,6 +28,8 @@ class Linear {
     void forward(const Tensor& in, Tensor& out, cudaStream_t stream = nullptr);
     ~Linear() {}

+    static void clearWorkspaces();
+
 private:
     struct Impl;
     std::shared_ptr<Impl> impl_;
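Since the workspace cache lives in the Impl struct, the public class only needs this static declaration; the definition in linear.cc forwards to Impl. A minimal sketch of routing a static operation through a pimpl struct, using placeholder names (Widget is not turbomind's API):

// pimpl_static_sketch.cpp -- illustrative only; Widget/reset are placeholders.
#include <memory>

class Widget {
public:
    static void reset();         // public static API, callable with no instance

private:
    struct Impl;                 // defined out of line in the .cc file
    std::shared_ptr<Impl> impl_;
};

// --- normally in the .cc file ---
struct Widget::Impl {
    // static state shared by all Widget instances would live here
    static void reset() { /* clear the shared cache */ }
};

void Widget::reset()
{
    Widget::Impl::reset();
}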

turbomind/linear.py

Lines changed: 4 additions & 0 deletions
@@ -188,3 +188,7 @@ def __call__(self, x: torch.Tensor):

     def to_half(x: torch.Tensor):
         return x.to(torch.half)
+
+    @classmethod
+    def clear_workspaces(cls):
+        return _turbomind_ext.Linear.clear_workspaces()
