From e9bcfecd95837027801d965c12cbb30be3cf7f36 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Fri, 28 Mar 2025 17:47:16 +0000
Subject: [PATCH 1/8] create multiple cublas handles
---
include/tensor.cuh | 29 ++++++++++++++++++-----------
main.cu | 1 +
2 files changed, 19 insertions(+), 11 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 5089ed7..cb97fe3 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -127,32 +127,39 @@ inline void gpuAssert(T code, const char *file, int line, bool abort = true) {
*/
class Session {
public:
- static Session &getInstance() {
- static Session instance;
+ static Session &getInstance(size_t numStreams=1) {
+ static Session instance(numStreams);
return instance;
}
private:
- Session() {
- gpuErrChk(cublasCreate(&m_cublasHandle));
+ Session(size_t numStreams=1) {
+ m_numBublasHandlesStreams = numStreams;
+ m_cublasHandles.reserve(m_numBublasHandlesStreams);
+ for (size_t i=0; i<m_numBublasHandlesStreams; i++) {
+ gpuErrChk(cublasCreate(&m_cublasHandles[i]));
+ }
gpuErrChk(cusolverDnCreate(&m_cusolverHandle));
}
~Session() {
- gpuErrChk(cublasDestroy(m_cublasHandle));
+ for (size_t i=0; i<m_numBublasHandlesStreams; i++) {
+ gpuErrChk(cublasDestroy(m_cublasHandles[i]));
+ }
gpuErrChk(cusolverDnDestroy(m_cusolverHandle));
}
- cublasHandle_t m_cublasHandle;
+ std::vector<cublasHandle_t> m_cublasHandles;
cusolverDnHandle_t m_cusolverHandle;
- size_t bytesAllocated = 0;
+ size_t m_bytesAllocated = 0;
+ size_t m_numBublasHandlesStreams = 1;
public:
Session(Session const &) = delete;
void operator=(Session const &) = delete;
- cublasHandle_t &cuBlasHandle() { return m_cublasHandle; }
+ cublasHandle_t &cuBlasHandle() { return m_cublasHandles[0]; }
cusolverDnHandle_t &cuSolverHandle() { return m_cusolverHandle; }
@@ -168,7 +175,7 @@ public:
cudaError_t cudaAllocate(void** d, size_t s) {
cudaError_t err = cudaMalloc(d, s);
if (err == cudaSuccess) {
- bytesAllocated += s;
+ m_bytesAllocated += s;
}
return err;
}
@@ -176,9 +183,9 @@ public:
/**
* @return Total allocated bytes
*/
- size_t totalAllocatedBytes() const { return bytesAllocated; }
+ size_t totalAllocatedBytes() const { return m_bytesAllocated; }
- void incrementAllocatedBytes(size_t s) { bytesAllocated += s; }
+ void incrementAllocatedBytes(size_t s) { m_bytesAllocated += s; }
};
diff --git a/main.cu b/main.cu
index 742321d..5dbe058 100644
--- a/main.cu
+++ b/main.cu
@@ -21,6 +21,7 @@ void xyz() {
int main() {
+ Session::getInstance(5);
xyz();
std::cout << "Memory (outside): " << std::setprecision(3)
<< (float) Session::getInstance().totalAllocatedBytes() / 1e6
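Because the instance is a function-local static, the `numStreams` argument only takes effect on the very first call to `getInstance`; every later call returns the already-constructed object and ignores the argument. A minimal sketch of that behaviour (illustration only, not part of the patch; the include path is an assumption):
```c++
#include "tensor.cuh"  // assumed header providing Session

int main() {
    Session &s1 = Session::getInstance(5); // first call: the Session is built with 5 cuBLAS handles
    Session &s2 = Session::getInstance(2); // same instance as s1; the argument has no effect now
    return 0;
}
```
This is the limitation the next patch removes by moving the stream count into `Session::setStreams`.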
From 954f175dbcaba54430d5519ed1c5a98f8ab542e6 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Fri, 28 Mar 2025 18:51:41 +0000
Subject: [PATCH 2/8] streams fully supported
---
include/tensor.cuh | 110 ++++++++++++++++++++++++++++-----------------
main.cu | 11 +++--
2 files changed, 78 insertions(+), 43 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index cb97fe3..6d257e3 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -125,19 +125,29 @@ inline void gpuAssert(T code, const char *file, int line, bool abort = true) {
* The cuBlas handle can be accessed anywhere by `Session::getInstance().cuBlasHandle()`
* The cuSolver handle can be accessed anywhere by `Session::getInstance().cuSolverHandle()`
*/
+static size_t s_numStreams = 1;
+
class Session {
public:
- static Session &getInstance(size_t numStreams=1) {
- static Session instance(numStreams);
+
+ static void setStreams(size_t numStreams) {
+ s_numStreams = numStreams;
+ }
+
+ static Session &getInstance() {
+ static Session instance(s_numStreams);
return instance;
}
private:
- Session(size_t numStreams=1) {
+ Session(size_t numStreams=10) {
m_numBublasHandlesStreams = numStreams;
- m_cublasHandles.reserve(m_numBublasHandlesStreams);
+ m_cublasHandles.resize(m_numBublasHandlesStreams);
+ m_cublasStreams.resize(m_numBublasHandlesStreams);
for (size_t i=0; i<m_numBublasHandlesStreams; i++) {
gpuErrChk(cublasCreate(&m_cublasHandles[i]));
+ gpuErrChk(cudaStreamCreate(&m_cublasStreams[i]));
+ gpuErrChk(cublasSetStream(m_cublasHandles[i], m_cublasStreams[i]));
}
gpuErrChk(cusolverDnCreate(&m_cusolverHandle));
}
~Session() {
for (size_t i=0; i<m_numBublasHandlesStreams; i++) {
gpuErrChk(cublasDestroy(m_cublasHandles[i]));
+ gpuErrChk(cudaStreamDestroy(m_cublasStreams[i]));
}
gpuErrChk(cusolverDnDestroy(m_cusolverHandle));
}
std::vector<cublasHandle_t> m_cublasHandles;
+ std::vector<cudaStream_t> m_cublasStreams;
cusolverDnHandle_t m_cusolverHandle;
size_t m_bytesAllocated = 0;
size_t m_numBublasHandlesStreams = 1;
@@ -159,7 +170,7 @@ public:
void operator=(Session const &) = delete;
- cublasHandle_t &cuBlasHandle() { return m_cublasHandles[0]; }
+ cublasHandle_t &cuBlasHandle(size_t idx=0) { return m_cublasHandles[idx]; }
cusolverDnHandle_t &cuSolverHandle() { return m_cusolverHandle; }
@@ -220,6 +231,7 @@ private:
size_t m_numMats = 0; ///< Number of matrices
bool m_doDestroyData = false; ///< Whether to destroy memory
bool m_doDestroyPtrMatrices = false; ///< Whether to destroy memory
+ size_t m_idxStream = 0; ///< Stream index (defaults to 0)
void destroy() {
if (m_doDestroyData) {
@@ -273,6 +285,13 @@ private:
void initialisePointersToMatricesData();
public:
+ /**
+ * Set the stream ID
+ */
+ void setStreamIdx(size_t);
+
+ size_t streamIdx() const { return m_idxStream; }
+
/**
* Create a tensor with random elements
* @param numRows number of rows
@@ -594,6 +613,14 @@ public:
}
}; /* END OF DTENSOR */
+template<typename T>
+void DTensor<T>::setStreamIdx(size_t idx) {
+ if (idx >= s_numStreams) {
+ throw std::invalid_argument("Invalid stream index; it exceeds the max allocated streams");
+ }
+ m_idxStream = idx;
+}
+
template<typename T>
void DTensor<T>::initialisePointersToMatricesData() {
/* Make sure m_d_ptrMatrices has been allocated */
@@ -813,7 +840,7 @@ DTensor<T>::DTensor(const DTensor<T> &other) {
m_numMats = other.m_numMats;
m_numRows = other.m_numRows;
m_numCols = other.m_numCols;
-
+ m_idxStream = other.m_idxStream;
allocateOnDevice(m_numRows * m_numCols * m_numMats);
gpuErrChk(cudaMemcpy(m_d_data, other.raw(), m_numRows * m_numCols * m_numMats * sizeof(T),
cudaMemcpyDeviceToDevice));
@@ -846,6 +873,7 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
m_d_data = other.m_d_data + offset;
m_doDestroyData = false;
m_doDestroyPtrMatrices = false;
+ m_idxStream = other.m_idxStream;
}
template<typename T>
@@ -858,6 +886,7 @@ DTensor<T>::DTensor(DTensor<T> &&other) {
m_doDestroyData = other.m_doDestroyData;
m_doDestroyPtrMatrices = other.m_doDestroyPtrMatrices;
m_d_ptrMatrices = other.m_d_ptrMatrices;
+ m_idxStream = other.m_idxStream;
/* Invalidate other */
other.m_doDestroyPtrMatrices = false;
other.m_doDestroyData = false;
@@ -894,7 +923,7 @@ inline double DTensor<double>::dotF(const DTensor<double> &other) {
throw std::invalid_argument("[dotF] incompatible dimensions");
size_t n = numEl();
double result;
- gpuErrChk(cublasDdot(Session::getInstance().cuBlasHandle(), n,
+ gpuErrChk(cublasDdot(Session::getInstance().cuBlasHandle(m_idxStream), n,
raw(), 1,
other.raw(), 1,
&result));
@@ -907,7 +936,7 @@ inline float DTensor<float>::dotF(const DTensor<float> &other) {
throw std::invalid_argument("[dotF] incompatible dimensions");
size_t n = numEl();
float result;
- gpuErrChk(cublasSdot(Session::getInstance().cuBlasHandle(), n,
+ gpuErrChk(cublasSdot(Session::getInstance().cuBlasHandle(m_idxStream), n,
raw(), 1,
other.raw(), 1,
&result));
@@ -917,7 +946,7 @@ inline float DTensor<float>::dotF(const DTensor<float> &other) {
template<>
inline double DTensor<double>::normF() const {
double the_norm;
- gpuErrChk(cublasDnrm2(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ gpuErrChk(cublasDnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
&the_norm));
return the_norm;
}
@@ -925,7 +954,7 @@ inline double DTensor<double>::normF() const {
template<>
inline float DTensor<float>::normF() const {
float the_norm;
- gpuErrChk(cublasSnrm2(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ gpuErrChk(cublasSnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
&the_norm));
return the_norm;
}
@@ -933,7 +962,7 @@ inline float DTensor<float>::normF() const {
template<>
inline float DTensor<float>::sumAbs() const {
float sumAbsAllElements;
- gpuErrChk(cublasSasum(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ gpuErrChk(cublasSasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
&sumAbsAllElements));
return sumAbsAllElements;
}
@@ -941,7 +970,7 @@ inline float DTensor<float>::sumAbs() const {
template<>
inline double DTensor<double>::sumAbs() const {
double sumAbsAllElements;
- gpuErrChk(cublasDasum(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ gpuErrChk(cublasDasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
&sumAbsAllElements));
return sumAbsAllElements;
}
@@ -950,7 +979,7 @@ template<>
inline float DTensor<float>::maxAbs() const {
int idx;
float hostDst;
- gpuErrChk(cublasIsamax(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ gpuErrChk(cublasIsamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
&idx));
gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(float), cudaMemcpyDeviceToHost));
return std::signbit(hostDst) ? -hostDst : hostDst;
@@ -960,7 +989,7 @@ template<>
inline double DTensor<double>::maxAbs() const {
int idx;
double hostDst;
- gpuErrChk(cublasIdamax(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ gpuErrChk(cublasIdamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
&idx));
gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(double), cudaMemcpyDeviceToHost));
return std::signbit(hostDst) ? -hostDst : hostDst;
@@ -970,7 +999,7 @@ template<>
inline float DTensor<float>::minAbs() const {
int idx;
float hostDst;
- gpuErrChk(cublasIsamin(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ gpuErrChk(cublasIsamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
&idx));
gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(float), cudaMemcpyDeviceToHost));
return std::signbit(hostDst) ? -hostDst : hostDst;
@@ -980,7 +1009,7 @@ template<>
inline double DTensor<double>::minAbs() const {
int idx;
double hostDst;
- gpuErrChk(cublasIdamin(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ gpuErrChk(cublasIdamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
&idx));
gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(double), cudaMemcpyDeviceToHost));
return std::signbit(hostDst) ? -hostDst : hostDst;
@@ -992,10 +1021,10 @@ void DTensor<T>::applyRightGivensRotation(size_t i, size_t j, const T *c, const
T *col_i = m_d_data + i * m_numRows;
T *col_j = m_d_data + j * m_numRows;
if constexpr (std::is_same_v<T, double>) {
- gpuErrChk(cublasDrot(Session::getInstance().cuBlasHandle(), m_numRows,
+ gpuErrChk(cublasDrot(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows,
col_i, 1, col_j, 1, c, minus_s));
} else if constexpr (std::is_same_v<T, float>) {
- gpuErrChk(cublasSrot(Session::getInstance().cuBlasHandle(), m_numRows,
+ gpuErrChk(cublasSrot(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows,
col_i, 1, col_j, 1, c, minus_s));
}
}
@@ -1004,12 +1033,12 @@ template<typename T>
void DTensor<T>::applyLeftGivensRotation(size_t i, size_t j, const T *c, const T *minus_s) {
if (m_numMats > 1) throw std::invalid_argument("[applyLeftGivensRotation] tensors (nMat>1) not supported");
if constexpr (std::is_same_v<T, double>) {
- gpuErrChk(cublasDrot(Session::getInstance().cuBlasHandle(), m_numCols,
+ gpuErrChk(cublasDrot(Session::getInstance().cuBlasHandle(m_idxStream), m_numCols,
m_d_data + i, m_numRows,
m_d_data + j, m_numRows,
c, minus_s));
} else if constexpr (std::is_same_v<T, float>) {
- gpuErrChk(cublasSrot(Session::getInstance().cuBlasHandle(), m_numCols,
+ gpuErrChk(cublasSrot(Session::getInstance().cuBlasHandle(m_idxStream), m_numCols,
m_d_data + i, m_numRows,
m_d_data + j, m_numRows,
c, minus_s));
@@ -1085,7 +1114,7 @@ inline DTensor<float> DTensor<float>::tr() const {
float alpha = 1.0f, beta = 0;
size_t numElMat = m_numCols * m_numRows;
for (size_t i = 0; i < m_numMats; i++) {
- gpuErrChk(cublasSgeam(Session::getInstance().cuBlasHandle(),
+ gpuErrChk(cublasSgeam(Session::getInstance().cuBlasHandle(m_idxStream),
CUBLAS_OP_T, CUBLAS_OP_N,
m_numCols, m_numRows,
&alpha, raw() + numElMat * i, m_numRows,
@@ -1101,7 +1130,7 @@ inline DTensor<double> DTensor<double>::tr() const {
double alpha = 1.0f, beta = 0;
size_t numElMat = m_numCols * m_numRows;
for (size_t i = 0; i < m_numMats; i++) {
- gpuErrChk(cublasDgeam(Session::getInstance().cuBlasHandle(),
+ gpuErrChk(cublasDgeam(Session::getInstance().cuBlasHandle(m_idxStream),
CUBLAS_OP_T, CUBLAS_OP_N,
m_numCols, m_numRows,
&alpha, raw() + numElMat * i, m_numRows,
@@ -1126,7 +1155,7 @@ template<>
inline DTensor<double> &DTensor<double>::operator*=(double scalar) {
double alpha = scalar;
gpuErrChk(
- cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+ cublasDscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
return *this;
}
@@ -1137,6 +1166,7 @@ DTensor<T> &DTensor<T>::operator=(const DTensor<T> &other) {
m_numCols = other.m_numCols;
m_doDestroyData = false;
m_d_data = other.m_d_data;
+ m_idxStream = other.m_idxStream;
return *this;
}
@@ -1144,7 +1174,7 @@ template<>
inline DTensor<float> &DTensor<float>::operator*=(float scalar) {
float alpha = scalar;
gpuErrChk(
- cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+ cublasSscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
return *this;
}
@@ -1152,7 +1182,7 @@ template<>
inline DTensor<double> &DTensor<double>::operator+=(const DTensor<double> &rhs) {
const double alpha = 1.;
gpuErrChk(
- cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+ cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
1, m_d_data, 1));
return *this;
}
@@ -1161,7 +1191,7 @@ template<>
inline DTensor<float> &DTensor<float>::operator+=(const DTensor<float> &rhs) {
const float alpha = 1.;
gpuErrChk(
- cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+ cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
1, m_d_data, 1));
return *this;
}
@@ -1169,7 +1199,7 @@ inline DTensor<float> &DTensor<float>::operator+=(const DTensor<float> &rhs) {
template<>
inline DTensor<float> &DTensor<float>::operator-=(const DTensor<float> &rhs) {
const float alpha = -1.;
- cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, 1,
+ cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, 1,
m_d_data, 1);
return *this;
}
@@ -1178,7 +1208,7 @@ template<>
inline DTensor<double> &DTensor<double>::operator-=(const DTensor<double> &rhs) {
const double alpha = -1.;
gpuErrChk(
- cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+ cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
1, m_d_data, 1));
return *this;
}
@@ -1199,7 +1229,7 @@ inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<double> &B, double alpha, double beta) {
size_t nCB = B.numCols();
double _alpha = alpha, _beta = beta;
if (nMat > 1) {
- gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
+ gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(m_idxStream),
CUBLAS_OP_N, CUBLAS_OP_N,
nRA, nCB, nCA, &_alpha,
A.m_d_ptrMatrices, nRA,
@@ -1208,7 +1238,7 @@ inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<double> &B, double alpha, double beta) {
m_d_ptrMatrices, nRA,
nMat));
} else {
- gpuErrChk(cublasDgemm(Session::getInstance().cuBlasHandle(),
+ gpuErrChk(cublasDgemm(Session::getInstance().cuBlasHandle(m_idxStream),
CUBLAS_OP_N, CUBLAS_OP_N,
nRA, nCB, nCA, &_alpha,
A.raw(), nRA,
@@ -1217,7 +1247,7 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float> &B, float alpha, float beta) {
size_t nCB = B.numCols();
float _alpha = alpha, _beta = beta;
if (nMat > 1) {
- gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
+ gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(m_idxStream),
CUBLAS_OP_N, CUBLAS_OP_N,
nRA, nCB, nCA, &_alpha,
A.m_d_ptrMatrices, nRA,
@@ -1235,7 +1265,7 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float> &B, float alpha, float beta) {
m_d_ptrMatrices, nRA,
nMat));
} else {
- gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(),
+ gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(m_idxStream),
CUBLAS_OP_N, CUBLAS_OP_N,
nRA, nCB, nCA, &_alpha,
A.raw(), nRA,
@@ -1259,7 +1289,7 @@ inline void DTensor<double>::leastSquaresBatched(DTensor<double> &B) {
throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
int info = 0;
DTensor<int> infoArray(batchSize); // TODO consider preallocating?
- gpuErrChk(cublasDgelsBatched(Session::getInstance().cuBlasHandle(),
+ gpuErrChk(cublasDgelsBatched(Session::getInstance().cuBlasHandle(m_idxStream),
CUBLAS_OP_N,
m_numRows,
m_numCols,
@@ -1287,7 +1317,7 @@ inline void DTensor<float>::leastSquaresBatched(DTensor<float> &B) {
throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
int info = 0;
DTensor<int> infoArray(batchSize); // TODO consider preallocating?
- gpuErrChk(cublasSgelsBatched(Session::getInstance().cuBlasHandle(),
+ gpuErrChk(cublasSgelsBatched(Session::getInstance().cuBlasHandle(m_idxStream),
CUBLAS_OP_N,
m_numRows,
m_numCols,
@@ -1307,7 +1337,7 @@ inline DTensor<double> DTensor<double>::getRows(size_t rowsFrom, size_t rowsTo,
size_t n = numCols(), m = numRows();
DTensor<double> rowsOnly(rowsRangeLength, numCols(), 1);
for (size_t i = 0; i < rowsRangeLength; i++) {
- gpuErrChk(cublasDcopy(Session::getInstance().cuBlasHandle(),
+ gpuErrChk(cublasDcopy(Session::getInstance().cuBlasHandle(m_idxStream),
n, // # values to copy
raw() + rowsFrom + i + matIdx * n * m, m,
rowsOnly.raw() + i,
@@ -1322,7 +1352,7 @@ inline DTensor<float> DTensor<float>::getRows(size_t rowsFrom, size_t rowsTo, si
size_t n = numCols(), m = numRows();
DTensor<float> rowsOnly(rowsRangeLength, numCols(), 1);
for (size_t i = 0; i < rowsRangeLength; i++) {
- gpuErrChk(cublasScopy(Session::getInstance().cuBlasHandle(),
+ gpuErrChk(cublasScopy(Session::getInstance().cuBlasHandle(m_idxStream),
n, // # values to copy
raw() + rowsFrom + i + matIdx * n * m, m,
rowsOnly.raw() + i,
@@ -1808,7 +1838,7 @@ inline void QRFactoriser<double>::leastSquares(DTensor<double> &rhs) {
rhs.raw(), m,
m_workspace->raw(), m_workspaceSize,
m_info->raw()));
- gpuErrChk(cublasDtrsm(Session::getInstance().cuBlasHandle(),
+ gpuErrChk(cublasDtrsm(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()),
CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1,
&alpha,
m_matrix->raw(), m,
@@ -1827,7 +1857,7 @@ inline void QRFactoriser<float>::leastSquares(DTensor<float> &rhs) {
rhs.raw(), m,
m_workspace->raw(), m_workspaceSize,
m_info->raw()));
- gpuErrChk(cublasStrsm(Session::getInstance().cuBlasHandle(),
+ gpuErrChk(cublasStrsm(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()),
CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1,
&alpha,
m_matrix->raw(), m,
@@ -2201,7 +2231,7 @@ inline void GivensAnnihilator<T>::annihilate(size_t i, size_t k, size_t j) {
* Pass cosine and sine as device pointers
* (Avoid having to download first)
*/
- gpuErrChk(cublasSetPointerMode(Session::getInstance().cuBlasHandle(), CUBLAS_POINTER_MODE_DEVICE));
+ gpuErrChk(cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_DEVICE));
/* Useful definitions */
T *aux = m_d_rhyp_cos_sin->raw();
@@ -2214,7 +2244,7 @@ inline void GivensAnnihilator<T>::annihilate(size_t i, size_t k, size_t j) {
m_matrix->applyLeftGivensRotation(i, k, aux + 1, aux + 2);
/* Change back to default behaviour */
- gpuErrChk(cublasSetPointerMode(Session::getInstance().cuBlasHandle(), CUBLAS_POINTER_MODE_HOST));
+ gpuErrChk(cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_HOST));
}
diff --git a/main.cu b/main.cu
index 5dbe058..1d60028 100644
--- a/main.cu
+++ b/main.cu
@@ -7,11 +7,16 @@
void xyz() {
/* Write to binary file */
auto r = DTensor<double>::createRandomTensor(3, 6, 4, -1, 1);
- auto r2 = DTensor<double>::createRandomTensor(300, 600, 4, -1, 1);
- std::string fName = "tensor.bt"; // binary tensor file extension: .bt
+ r.setStreamIdx(1);
+ std::string fName = "abcd.bt"; // binary tensor file extension: .bt
+ r.saveToFile(fName);
/* Parse binary file */
auto recov = DTensor<double>::parseFromFile(fName);
+
+ std::cout << r;
+ std::cout << recov;
+
auto err = r - recov;
std::cout << "max error : " << err.maxAbs() << std::endl;
std::cout << "Memory: " << std::setprecision(3)
@@ -21,7 +26,7 @@ void xyz() {
int main() {
- Session::getInstance(5);
+ Session::setStreams(5);
xyz();
std::cout << "Memory (outside): " << std::setprecision(3)
<< (float) Session::getInstance().totalAllocatedBytes() / 1e6
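After this patch, each cuBLAS call made by a `DTensor` goes through the handle selected by its stream index, so independent tensors can keep their BLAS work on separate streams. A short usage sketch, assuming the API as it stands here (`Session::setStreams`, `setStreamIdx`, one handle per stream; the include path is an assumption):
```c++
#include "tensor.cuh"

int main() {
    Session::setStreams(2); // must run before the Session is first used
    auto x = DTensor<double>::createRandomTensor(1000, 1000, 1, -1, 1);
    auto y = DTensor<double>::createRandomTensor(1000, 1000, 1, -1, 1);
    x.setStreamIdx(0); // cuBLAS calls on x use the handle bound to stream 0
    y.setStreamIdx(1); // cuBLAS calls on y use the handle bound to stream 1
    x *= 2.0;          // scal enqueued on stream 0
    y *= 3.0;          // scal enqueued on stream 1; may overlap with the line above
    gpuErrChk(cudaDeviceSynchronize()); // wait for both streams before reading results
    return 0;
}
```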
From 212cd37917725cc3326c387f7b19c028fc16866a Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Fri, 28 Mar 2025 19:02:22 +0000
Subject: [PATCH 3/8] changelog updated
---
CHANGELOG.md | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 60d3173..5d853fe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## v1.9.0 - 28-03-2025
+
+### Added
+
+- Support for multiple CUDA streams (one cuBLAS handle per stream)
+
+
From 0a17614474fbd51226d0313a995ae58ec52013ae Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Fri, 28 Mar 2025 19:24:19 +0000
Subject: [PATCH 4/8] minor
---
include/tensor.cuh | 27 +++++++++++++++------------
main.cu | 3 +--
2 files changed, 16 insertions(+), 14 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 6d257e3..57f55ab 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -116,6 +116,7 @@ inline void gpuAssert(T code, const char *file, int line, bool abort = true) {
/* ================================================================================================
* SESSION
* ================================================================================================ */
+static size_t s_numStreams = 1;
/**
* Singleton for Cuda library handles.
@@ -125,11 +126,12 @@ inline void gpuAssert(T code, const char *file, int line, bool abort = true) {
* The cuBlas handle can be accessed anywhere by `Session::getInstance().cuBlasHandle()`
* The cuSolver handle can be accessed anywhere by `Session::getInstance().cuSolverHandle()`
*/
-static size_t s_numStreams = 1;
-
class Session {
public:
-
+ /**
+ *
+ * @param numStreams
+ */
static void setStreams(size_t numStreams) {
s_numStreams = numStreams;
}
@@ -140,11 +142,11 @@ public:
}
private:
- Session(size_t numStreams=10) {
- m_numBublasHandlesStreams = numStreams;
- m_cublasHandles.resize(m_numBublasHandlesStreams);
- m_cublasStreams.resize(m_numBublasHandlesStreams);
- for (size_t i=0; i<m_numBublasHandlesStreams; i++) {
+ Session(size_t numStreams=1) {
+ m_numCublasHandlesStreams = numStreams;
+ m_cublasHandles.resize(m_numCublasHandlesStreams);
+ m_cublasStreams.resize(m_numCublasHandlesStreams);
+ for (size_t i=0; i<m_numCublasHandlesStreams; i++) {
gpuErrChk(cublasCreate(&m_cublasHandles[i]));
gpuErrChk(cudaStreamCreate(&m_cublasStreams[i]));
gpuErrChk(cublasSetStream(m_cublasHandles[i], m_cublasStreams[i]));
}
std::vector<cublasHandle_t> m_cublasHandles;
std::vector<cudaStream_t> m_cublasStreams;
cusolverDnHandle_t m_cusolverHandle;
size_t m_bytesAllocated = 0;
- size_t m_numBublasHandlesStreams = 1;
+ size_t m_numCublasHandlesStreams = 1;
public:
Session(Session const &) = delete;
@@ -288,7 +290,7 @@ public:
/**
* Set the stream ID
*/
- void setStreamIdx(size_t);
+ DTensor setStreamIdx(size_t);
size_t streamIdx() const { return m_idxStream; }
@@ -614,11 +616,12 @@ public:
}; /* END OF DTENSOR */
template<typename T>
-void DTensor<T>::setStreamIdx(size_t idx) {
+DTensor<T> DTensor<T>::setStreamIdx(size_t idx) {
if (idx >= s_numStreams) {
throw std::invalid_argument("Invalid stream index; it exceeds the max allocated streams");
}
m_idxStream = idx;
+ return *this;
}
template<typename T>
diff --git a/main.cu b/main.cu
index 1d60028..f1e0d70 100644
--- a/main.cu
+++ b/main.cu
@@ -6,8 +6,7 @@
void xyz() {
/* Write to binary file */
- auto r = DTensor<double>::createRandomTensor(3, 6, 4, -1, 1);
- r.setStreamIdx(1);
+ DTensor<double> r = DTensor<double>::createRandomTensor(3, 6, 4, -1, 1).setStreamIdx(1);
std::string fName = "abcd.bt"; // binary tensor file extension: .bt
r.saveToFile(fName);
From 901ba1404dc5858f3012850caad7a07a5a2a38d8 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Fri, 28 Mar 2025 19:39:15 +0000
Subject: [PATCH 5/8] streams and cusolver handles
---
include/tensor.cuh | 35 ++++++++++++++++++++++++++++-------
1 file changed, 28 insertions(+), 7 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 57f55ab..a2ff121 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -129,13 +129,18 @@ static size_t s_numStreams = 1;
class Session {
public:
/**
- *
- * @param numStreams
+ * Sets the total number of available streams
+ * @param numStreams number of streams (default: 1)
*/
static void setStreams(size_t numStreams) {
s_numStreams = numStreams;
}
+ /**
+ * Returns the unique instance of Session (constructed upon first
+ * invocation)
+ * @return instance of Session
+ */
static Session &getInstance() {
static Session instance(s_numStreams);
return instance;
@@ -146,24 +151,26 @@ private:
m_numCublasHandlesStreams = numStreams;
m_cublasHandles.resize(m_numCublasHandlesStreams);
m_cublasStreams.resize(m_numCublasHandlesStreams);
+ m_cusolverHandles.resize(m_numCublasHandlesStreams);
for (size_t i=0; i<m_numCublasHandlesStreams; i++) {
gpuErrChk(cublasCreate(&m_cublasHandles[i]));
gpuErrChk(cudaStreamCreate(&m_cublasStreams[i]));
gpuErrChk(cublasSetStream(m_cublasHandles[i], m_cublasStreams[i]));
+ gpuErrChk(cusolverDnCreate(&m_cusolverHandles[i]));
+ gpuErrChk(cusolverDnSetStream(m_cusolverHandles[i], m_cublasStreams[i]));
}
- gpuErrChk(cusolverDnCreate(&m_cusolverHandle));
}
~Session() {
for (size_t i=0; i<m_numCublasHandlesStreams; i++) {
gpuErrChk(cublasDestroy(m_cublasHandles[i]));
gpuErrChk(cudaStreamDestroy(m_cublasStreams[i]));
+ gpuErrChk(cusolverDnDestroy(m_cusolverHandles[i]));
}
- gpuErrChk(cusolverDnDestroy(m_cusolverHandle));
}
std::vector<cublasHandle_t> m_cublasHandles;
std::vector<cudaStream_t> m_cublasStreams;
- cusolverDnHandle_t m_cusolverHandle;
+ std::vector<cusolverDnHandle_t> m_cusolverHandles;
size_t m_bytesAllocated = 0;
size_t m_numCublasHandlesStreams = 1;
@@ -172,9 +179,19 @@ public:
void operator=(Session const &) = delete;
+ /**
+ * cuBLAS handle
+ * @param idx index of stream
+ * @return cuBLAS handle
+ */
cublasHandle_t &cuBlasHandle(size_t idx=0) { return m_cublasHandles[idx]; }
- cusolverDnHandle_t &cuSolverHandle() { return m_cusolverHandle; }
+ /**
+ * cuSolver handle
+ * @param idx index of stream
+ * @return cuSolver handle
+ */
+ cusolverDnHandle_t &cuSolverHandle(size_t idx=0) { return m_cusolverHandles[idx]; }
/**
* Preferred method for CUDA memory allocation; it allocates memory on the device
@@ -198,7 +215,11 @@ public:
*/
size_t totalAllocatedBytes() const { return m_bytesAllocated; }
- void incrementAllocatedBytes(size_t s) { m_bytesAllocated += s; }
+ /**
+ * Increment counter of allocated bytes
+ * @param s allocated bytes (can be negative)
+ */
+ void incrementAllocatedBytes(int s) { m_bytesAllocated += s; }
};
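With a cuSolver handle available for every stream index, cuBLAS and cuSolver work for the same tensor can be kept together by fetching both handles with the tensor's `streamIdx()`. A sketch of that pattern (the helper function is hypothetical; the include path is an assumption):
```c++
#include "tensor.cuh"

// Hypothetical helper: fetch the pair of handles registered under a tensor's stream index.
template<typename T>
void handlesFor(const DTensor<T> &t, cublasHandle_t &blas, cusolverDnHandle_t &solver) {
    size_t idx = t.streamIdx();
    blas = Session::getInstance().cuBlasHandle(idx);     // cuBLAS handle for this stream index
    solver = Session::getInstance().cuSolverHandle(idx); // cuSolver handle for the same index
}
```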
From 6929a08a35a8d02a1f900ab89e38d890587c2435 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Fri, 28 Mar 2025 20:43:44 +0000
Subject: [PATCH 6/8] docs for streams
---
README.md | 11 ++++++++++-
include/tensor.cuh | 4 ++++
2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index b6e560e..00cd8a8 100644
--- a/README.md
+++ b/README.md
@@ -403,6 +403,15 @@ size_t allocatedBytes =
Session::getInstance().totalAllocatedBytes();
```
-
+GPUtils supports multiple CUDA streams. By default, a single stream is created,
+but you can set the number of streams you need with
+```c++
+/* This needs to be the first line in your code */
+Session::setStreams(4); // create 4 streams
+```
+Then, you can use `setStreamIdx` to select a stream to go with your instance of `DTensor`
+```c++
+auto a = DTensor<double>::createRandomTensor(3, 6, 4, -1, 1).setStreamIdx(2);
+```
## Happy number crunching!
diff --git a/include/tensor.cuh b/include/tensor.cuh
index a2ff121..32a62d5 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -116,6 +116,10 @@ inline void gpuAssert(T code, const char *file, int line, bool abort = true) {
/* ================================================================================================
* SESSION
* ================================================================================================ */
+/**
+ * Total number of allocated streams
+ * Can be changed with Session::setStreams()
+ */
static size_t s_numStreams = 1;
/**
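The comment in the README snippet above is load-bearing: the `Session` is constructed lazily with whatever value `s_numStreams` holds at the moment of first use, so `setStreams` must run before any `DTensor` (or other user of the `Session`) is created. A counter-example sketch of the ordering to avoid (illustration only):
```c++
#include "tensor.cuh"

int main() {
    auto t = DTensor<double>::createRandomTensor(3, 3, 1, -1, 1); // first use: Session is built with 1 stream
    Session::setStreams(4); // too late: the singleton already exists with a single handle
    return 0;
}
```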
From b5d9008b9bb18e2f2d0a1f4cc2c40377d7d96094 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Fri, 28 Mar 2025 21:03:06 +0000
Subject: [PATCH 7/8] sync streams
---
include/tensor.cuh | 96 +++++++++++++++++++++++++++++++---------------
main.cu | 2 +
2 files changed, 68 insertions(+), 30 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 32a62d5..b73e280 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -156,7 +156,7 @@ private:
m_cublasHandles.resize(m_numCublasHandlesStreams);
m_cublasStreams.resize(m_numCublasHandlesStreams);
m_cusolverHandles.resize(m_numCublasHandlesStreams);
- for (size_t i=0; i<m_numCublasHandlesStreams; i++) {
+ for (size_t i = 0; i < m_numCublasHandlesStreams; i++) {
gpuErrChk(cublasCreate(&m_cublasHandles[i]));
gpuErrChk(cudaStreamCreate(&m_cublasStreams[i]));
gpuErrChk(cublasSetStream(m_cublasHandles[i], m_cublasStreams[i]));
@@ -215,6 +215,26 @@ public:
void incrementAllocatedBytes(int s) { m_bytesAllocated += s; }
+
+ /**
+ * Synchronize a given stream
+ * @param idx index of stream
+ */
+ void synchronizeStream(size_t idx) const {
+ if (idx >= m_numCublasHandlesStreams) {
+ throw std::runtime_error("stream index out of range");
+ }
+ gpuErrChk(cudaStreamSynchronize(m_cublasStreams[idx]));
+ }
+
+ /**
+ * Synchronize all streams
+ */
+ void synchronizeAllStreams() const {
+ for (size_t i = 0; i < m_numCublasHandlesStreams; i++) {
+ synchronizeStream(i);
+ }
+ }
};
@@ -974,32 +994,36 @@ inline float DTensor<float>::dotF(const DTensor<float> &other) {
template<>
inline double DTensor<double>::normF() const {
double the_norm;
- gpuErrChk(cublasDnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
- &the_norm));
+ gpuErrChk(
+ cublasDnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ &the_norm));
return the_norm;
}
template<>
inline float DTensor<float>::normF() const {
float the_norm;
- gpuErrChk(cublasSnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
- &the_norm));
+ gpuErrChk(
+ cublasSnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ &the_norm));
return the_norm;
}
template<>
inline float DTensor<float>::sumAbs() const {
float sumAbsAllElements;
- gpuErrChk(cublasSasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
- &sumAbsAllElements));
+ gpuErrChk(
+ cublasSasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ &sumAbsAllElements));
return sumAbsAllElements;
}
template<>
inline double DTensor<double>::sumAbs() const {
double sumAbsAllElements;
- gpuErrChk(cublasDasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
- &sumAbsAllElements));
+ gpuErrChk(
+ cublasDasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ &sumAbsAllElements));
return sumAbsAllElements;
}
@@ -1007,8 +1031,9 @@ template<>
inline float DTensor<float>::maxAbs() const {
int idx;
float hostDst;
- gpuErrChk(cublasIsamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
- &idx));
+ gpuErrChk(
+ cublasIsamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ &idx));
gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(float), cudaMemcpyDeviceToHost));
return std::signbit(hostDst) ? -hostDst : hostDst;
}
@@ -1017,8 +1042,9 @@ template<>
inline double DTensor<double>::maxAbs() const {
int idx;
double hostDst;
- gpuErrChk(cublasIdamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
- &idx));
+ gpuErrChk(
+ cublasIdamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ &idx));
gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(double), cudaMemcpyDeviceToHost));
return std::signbit(hostDst) ? -hostDst : hostDst;
}
@@ -1027,8 +1053,9 @@ template<>
inline float DTensor<float>::minAbs() const {
int idx;
float hostDst;
- gpuErrChk(cublasIsamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
- &idx));
+ gpuErrChk(
+ cublasIsamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ &idx));
gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(float), cudaMemcpyDeviceToHost));
return std::signbit(hostDst) ? -hostDst : hostDst;
}
@@ -1037,8 +1064,9 @@ template<>
inline double DTensor<double>::minAbs() const {
int idx;
double hostDst;
- gpuErrChk(cublasIdamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
- &idx));
+ gpuErrChk(
+ cublasIdamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
+ &idx));
gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(double), cudaMemcpyDeviceToHost));
return std::signbit(hostDst) ? -hostDst : hostDst;
}
@@ -1087,7 +1115,7 @@ inline void DTensor<T>::allocateOnDevice(size_t size, bool zero) {
if (numMats() > 1) {
m_doDestroyPtrMatrices = true;
- cudaStatus = Session::getInstance().cudaAllocate((void**) &m_d_ptrMatrices, numMats() * sizeof(T *));
+ cudaStatus = Session::getInstance().cudaAllocate((void **) &m_d_ptrMatrices, numMats() * sizeof(T *));
if (cudaStatus != cudaSuccess) {
gpuErrChk(cudaFree(m_d_data)); // ... free previously allocated memory
gpuErrChk(cudaStatus); // ... and memento mori
@@ -1183,7 +1211,8 @@ template<>
inline DTensor<double> &DTensor<double>::operator*=(double scalar) {
double alpha = scalar;
gpuErrChk(
- cublasDscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+ cublasDscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha,
+ m_d_data, 1));
return *this;
}
@@ -1202,7 +1231,8 @@ template<>
inline DTensor<float> &DTensor<float>::operator*=(float scalar) {
float alpha = scalar;
gpuErrChk(
- cublasSscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+ cublasSscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha,
+ m_d_data, 1));
return *this;
}
@@ -1210,7 +1240,8 @@ template<>
inline DTensor<double> &DTensor<double>::operator+=(const DTensor<double> &rhs) {
const double alpha = 1.;
gpuErrChk(
- cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+ cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.
+ m_d_data,
1, m_d_data, 1));
return *this;
}
@@ -1219,7 +1250,8 @@ template<>
inline DTensor<float> &DTensor<float>::operator+=(const DTensor<float> &rhs) {
const float alpha = 1.;
gpuErrChk(
- cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+ cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.
+ m_d_data,
1, m_d_data, 1));
return *this;
}
@@ -1227,7 +1259,8 @@ inline DTensor<float> &DTensor<float>::operator+=(const DTensor<float> &rhs) {
template<>
inline DTensor<float> &DTensor<float>::operator-=(const DTensor<float> &rhs) {
const float alpha = -1.;
- cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, 1,
+ cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha,
+ rhs.m_d_data, 1,
m_d_data, 1);
return *this;
}
@@ -1236,7 +1269,8 @@ template<>
inline DTensor<double> &DTensor<double>::operator-=(const DTensor<double> &rhs) {
const double alpha = -1.;
gpuErrChk(
- cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+ cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.
+ m_d_data,
1, m_d_data, 1));
return *this;
}
@@ -2259,7 +2293,8 @@ inline void GivensAnnihilator<T>::annihilate(size_t i, size_t k, size_t j) {
* Pass cosine and sine as device pointers
* (Avoid having to download first)
*/
- gpuErrChk(cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_DEVICE));
+ gpuErrChk(
+ cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_DEVICE));
/* Useful definitions */
T *aux = m_d_rhyp_cos_sin->raw();
@@ -2272,7 +2307,8 @@ inline void GivensAnnihilator<T>::annihilate(size_t i, size_t k, size_t j) {
m_matrix->applyLeftGivensRotation(i, k, aux + 1, aux + 2);
/* Change back to default behaviour */
- gpuErrChk(cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_HOST));
+ gpuErrChk(
+ cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_HOST));
}
diff --git a/main.cu b/main.cu
index f1e0d70..2aa9f8d 100644
--- a/main.cu
+++ b/main.cu
@@ -21,6 +21,8 @@ void xyz() {
std::cout << "Memory: " << std::setprecision(3)
<< (float) Session::getInstance().totalAllocatedBytes() / 1e6
<< " MB" << std::endl;
+
+ Session::getInstance().synchronizeAllStreams();
}
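The two helpers introduced here give finer-grained control than a device-wide sync: a single stream can be drained while the others keep working, or every stream can be drained at once (as `main.cu` now does before reporting memory). A usage sketch under the API of this patch (include path assumed):
```c++
#include "tensor.cuh"

int main() {
    Session::setStreams(2);
    auto a = DTensor<double>::createRandomTensor(500, 500, 1, -1, 1).setStreamIdx(0);
    auto b = DTensor<double>::createRandomTensor(500, 500, 1, -1, 1).setStreamIdx(1);
    a *= 2.0; // enqueued on stream 0
    b *= 3.0; // enqueued on stream 1
    Session::getInstance().synchronizeStream(0);    // wait for stream 0 only
    Session::getInstance().synchronizeAllStreams(); // wait for every stream
    return 0;
}
```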
From 8258c61620bb3ea08df18091d0bce2588ed59877 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Fri, 28 Mar 2025 21:17:19 +0000
Subject: [PATCH 8/8] update readme
---
README.md | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 00cd8a8..2728855 100644
--- a/README.md
+++ b/README.md
@@ -411,7 +411,10 @@ Session::setStreams(4); // create 4 streams
```
Then, you can use `setStreamIdx` to select a stream to go with your instance of `DTensor`
```c++
-auto a = DTensor<double>::createRandomTensor(3, 6, 4, -1, 1).setStreamIdx(2);
+auto a = DTensor<double>::createRandomTensor(3, 6, 4, -1, 1).setStreamIdx(0);
+auto b = DTensor<double>::createRandomTensor(3, 6, 4, -1, 1).setStreamIdx(1);
+// do stuff...
+Session::getInstance().synchronizeAllStreams();
```
## Happy number crunching!