From e9bcfecd95837027801d965c12cbb30be3cf7f36 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Fri, 28 Mar 2025 17:47:16 +0000 Subject: [PATCH 1/8] create multiple cublas handles --- include/tensor.cuh | 29 ++++++++++++++++++----------- main.cu | 1 + 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/include/tensor.cuh b/include/tensor.cuh index 5089ed7..cb97fe3 100644 --- a/include/tensor.cuh +++ b/include/tensor.cuh @@ -127,32 +127,39 @@ inline void gpuAssert(T code, const char *file, int line, bool abort = true) { */ class Session { public: - static Session &getInstance() { - static Session instance; + static Session &getInstance(size_t numStreams=1) { + static Session instance(numStreams); return instance; } private: - Session() { - gpuErrChk(cublasCreate(&m_cublasHandle)); + Session(size_t numStreams=1) { + m_numBublasHandlesStreams = numStreams; + m_cublasHandles.reserve(m_numBublasHandlesStreams); + for (size_t i=0; i m_cublasHandles; cusolverDnHandle_t m_cusolverHandle; - size_t bytesAllocated = 0; + size_t m_bytesAllocated = 0; + size_t m_numBublasHandlesStreams = 1; public: Session(Session const &) = delete; void operator=(Session const &) = delete; - cublasHandle_t &cuBlasHandle() { return m_cublasHandle; } + cublasHandle_t &cuBlasHandle() { return m_cublasHandles[0]; } cusolverDnHandle_t &cuSolverHandle() { return m_cusolverHandle; } @@ -168,7 +175,7 @@ public: cudaError_t cudaAllocate(void** d, size_t s) { cudaError_t err = cudaMalloc(d, s); if (err == cudaSuccess) { - bytesAllocated += s; + m_bytesAllocated += s; } return err; } @@ -176,9 +183,9 @@ public: /** * @return Total allocated bytes */ - size_t totalAllocatedBytes() const { return bytesAllocated; } + size_t totalAllocatedBytes() const { return m_bytesAllocated; } - void incrementAllocatedBytes(size_t s) { bytesAllocated += s; } + void incrementAllocatedBytes(size_t s) { m_bytesAllocated += s; } }; diff --git a/main.cu b/main.cu index 742321d..5dbe058 100644 --- a/main.cu +++ b/main.cu @@ -21,6 +21,7 @@ void xyz() { int main() { + Session::getInstance(5); xyz(); std::cout << "Memory (outside): " << std::setprecision(3) << (float) Session::getInstance().totalAllocatedBytes() / 1e6 From 954f175dbcaba54430d5519ed1c5a98f8ab542e6 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Fri, 28 Mar 2025 18:51:41 +0000 Subject: [PATCH 2/8] streams fully supported --- include/tensor.cuh | 110 ++++++++++++++++++++++++++++----------------- main.cu | 11 +++-- 2 files changed, 78 insertions(+), 43 deletions(-) diff --git a/include/tensor.cuh b/include/tensor.cuh index cb97fe3..6d257e3 100644 --- a/include/tensor.cuh +++ b/include/tensor.cuh @@ -125,19 +125,29 @@ inline void gpuAssert(T code, const char *file, int line, bool abort = true) { * The cuBlas handle can be accessed anywhere by `Session::getInstance().cuBlasHandle()` * The cuSolver handle can be accessed anywhere by `Session::getInstance().cuSolverHandle()` */ +static size_t s_numStreams = 1; + class Session { public: - static Session &getInstance(size_t numStreams=1) { - static Session instance(numStreams); + + static void setStreams(size_t numStreams) { + s_numStreams = numStreams; + } + + static Session &getInstance() { + static Session instance(s_numStreams); return instance; } private: - Session(size_t numStreams=1) { + Session(size_t numStreams=10) { m_numBublasHandlesStreams = numStreams; - m_cublasHandles.reserve(m_numBublasHandlesStreams); + m_cublasHandles.resize(m_numBublasHandlesStreams); + m_cublasStreams.resize(m_numBublasHandlesStreams); 
for (size_t i=0; i m_cublasHandles; + std::vector m_cublasStreams; cusolverDnHandle_t m_cusolverHandle; size_t m_bytesAllocated = 0; size_t m_numBublasHandlesStreams = 1; @@ -159,7 +170,7 @@ public: void operator=(Session const &) = delete; - cublasHandle_t &cuBlasHandle() { return m_cublasHandles[0]; } + cublasHandle_t &cuBlasHandle(size_t idx=0) { return m_cublasHandles[idx]; } cusolverDnHandle_t &cuSolverHandle() { return m_cusolverHandle; } @@ -220,6 +231,7 @@ private: size_t m_numMats = 0; ///< Number of matrices bool m_doDestroyData = false; ///< Whether to destroy memory bool m_doDestroyPtrMatrices = false; ///< Whether to destroy memory + size_t m_idxStream = 0; ///< Stream index (defaults to 0) void destroy() { if (m_doDestroyData) { @@ -273,6 +285,13 @@ private: void initialisePointersToMatricesData(); public: + /** + * Set the stream ID + */ + void setStreamIdx(size_t); + + size_t streamIdx() const { return m_idxStream; } + /** * Create a tensor with random elements * @param numRows number of rows @@ -594,6 +613,14 @@ public: } }; /* END OF DTENSOR */ +template +void DTensor::setStreamIdx(size_t idx) { + if (idx >= s_numStreams) { + throw std::invalid_argument("Invalid stream index; it exceeds the max allocated streams"); + } + m_idxStream = idx; +} + template void DTensor::initialisePointersToMatricesData() { /* Make sure m_d_ptrMatrices has been allocated */ @@ -813,7 +840,7 @@ DTensor::DTensor(const DTensor &other) { m_numMats = other.m_numMats; m_numRows = other.m_numRows; m_numCols = other.m_numCols; - + m_idxStream = other.m_idxStream; allocateOnDevice(m_numRows * m_numCols * m_numMats); gpuErrChk(cudaMemcpy(m_d_data, other.raw(), m_numRows * m_numCols * m_numMats * sizeof(T), cudaMemcpyDeviceToDevice)); @@ -846,6 +873,7 @@ DTensor::DTensor(const DTensor &other, size_t axis, size_t from, size_t to m_d_data = other.m_d_data + offset; m_doDestroyData = false; m_doDestroyPtrMatrices = false; + m_idxStream = other.m_idxStream; } template @@ -858,6 +886,7 @@ DTensor::DTensor(DTensor &&other) { m_doDestroyData = other.m_doDestroyData; m_doDestroyPtrMatrices = other.m_doDestroyPtrMatrices; m_d_ptrMatrices = other.m_d_ptrMatrices; + m_idxStream = other.m_idxStream; /* Invalidate other */ other.m_doDestroyPtrMatrices = false; other.m_doDestroyData = false; @@ -894,7 +923,7 @@ inline double DTensor::dotF(const DTensor &other) { throw std::invalid_argument("[dotF] incompatible dimensions"); size_t n = numEl(); double result; - gpuErrChk(cublasDdot(Session::getInstance().cuBlasHandle(), n, + gpuErrChk(cublasDdot(Session::getInstance().cuBlasHandle(m_idxStream), n, raw(), 1, other.raw(), 1, &result)); @@ -907,7 +936,7 @@ inline float DTensor::dotF(const DTensor &other) { throw std::invalid_argument("[dotF] incompatible dimensions"); size_t n = numEl(); float result; - gpuErrChk(cublasSdot(Session::getInstance().cuBlasHandle(), n, + gpuErrChk(cublasSdot(Session::getInstance().cuBlasHandle(m_idxStream), n, raw(), 1, other.raw(), 1, &result)); @@ -917,7 +946,7 @@ inline float DTensor::dotF(const DTensor &other) { template<> inline double DTensor::normF() const { double the_norm; - gpuErrChk(cublasDnrm2(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, + gpuErrChk(cublasDnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, &the_norm)); return the_norm; } @@ -925,7 +954,7 @@ inline double DTensor::normF() const { template<> inline float DTensor::normF() const { float the_norm; - 
gpuErrChk(cublasSnrm2(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, + gpuErrChk(cublasSnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, &the_norm)); return the_norm; } @@ -933,7 +962,7 @@ inline float DTensor::normF() const { template<> inline float DTensor::sumAbs() const { float sumAbsAllElements; - gpuErrChk(cublasSasum(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, + gpuErrChk(cublasSasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, &sumAbsAllElements)); return sumAbsAllElements; } @@ -941,7 +970,7 @@ inline float DTensor::sumAbs() const { template<> inline double DTensor::sumAbs() const { double sumAbsAllElements; - gpuErrChk(cublasDasum(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, + gpuErrChk(cublasDasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, &sumAbsAllElements)); return sumAbsAllElements; } @@ -950,7 +979,7 @@ template<> inline float DTensor::maxAbs() const { int idx; float hostDst; - gpuErrChk(cublasIsamax(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, + gpuErrChk(cublasIsamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, &idx)); gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(float), cudaMemcpyDeviceToHost)); return std::signbit(hostDst) ? -hostDst : hostDst; @@ -960,7 +989,7 @@ template<> inline double DTensor::maxAbs() const { int idx; double hostDst; - gpuErrChk(cublasIdamax(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, + gpuErrChk(cublasIdamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, &idx)); gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(double), cudaMemcpyDeviceToHost)); return std::signbit(hostDst) ? -hostDst : hostDst; @@ -970,7 +999,7 @@ template<> inline float DTensor::minAbs() const { int idx; float hostDst; - gpuErrChk(cublasIsamin(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, + gpuErrChk(cublasIsamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, &idx)); gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(float), cudaMemcpyDeviceToHost)); return std::signbit(hostDst) ? -hostDst : hostDst; @@ -980,7 +1009,7 @@ template<> inline double DTensor::minAbs() const { int idx; double hostDst; - gpuErrChk(cublasIdamin(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, + gpuErrChk(cublasIdamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, &idx)); gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(double), cudaMemcpyDeviceToHost)); return std::signbit(hostDst) ? 
-hostDst : hostDst; @@ -992,10 +1021,10 @@ void DTensor::applyRightGivensRotation(size_t i, size_t j, const T *c, const T *col_i = m_d_data + i * m_numRows; T *col_j = m_d_data + j * m_numRows; if constexpr (std::is_same_v) { - gpuErrChk(cublasDrot(Session::getInstance().cuBlasHandle(), m_numRows, + gpuErrChk(cublasDrot(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows, col_i, 1, col_j, 1, c, minus_s)); } else if constexpr (std::is_same_v) { - gpuErrChk(cublasSrot(Session::getInstance().cuBlasHandle(), m_numRows, + gpuErrChk(cublasSrot(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows, col_i, 1, col_j, 1, c, minus_s)); } } @@ -1004,12 +1033,12 @@ template void DTensor::applyLeftGivensRotation(size_t i, size_t j, const T *c, const T *minus_s) { if (m_numMats > 1) throw std::invalid_argument("[applyLeftGivensRotation] tensors (nMat>1) not supported"); if constexpr (std::is_same_v) { - gpuErrChk(cublasDrot(Session::getInstance().cuBlasHandle(), m_numCols, + gpuErrChk(cublasDrot(Session::getInstance().cuBlasHandle(m_idxStream), m_numCols, m_d_data + i, m_numRows, m_d_data + j, m_numRows, c, minus_s)); } else if constexpr (std::is_same_v) { - gpuErrChk(cublasSrot(Session::getInstance().cuBlasHandle(), m_numCols, + gpuErrChk(cublasSrot(Session::getInstance().cuBlasHandle(m_idxStream), m_numCols, m_d_data + i, m_numRows, m_d_data + j, m_numRows, c, minus_s)); @@ -1085,7 +1114,7 @@ inline DTensor DTensor::tr() const { float alpha = 1.0f, beta = 0; size_t numElMat = m_numCols * m_numRows; for (size_t i = 0; i < m_numMats; i++) { - gpuErrChk(cublasSgeam(Session::getInstance().cuBlasHandle(), + gpuErrChk(cublasSgeam(Session::getInstance().cuBlasHandle(m_idxStream), CUBLAS_OP_T, CUBLAS_OP_N, m_numCols, m_numRows, &alpha, raw() + numElMat * i, m_numRows, @@ -1101,7 +1130,7 @@ inline DTensor DTensor::tr() const { double alpha = 1.0f, beta = 0; size_t numElMat = m_numCols * m_numRows; for (size_t i = 0; i < m_numMats; i++) { - gpuErrChk(cublasDgeam(Session::getInstance().cuBlasHandle(), + gpuErrChk(cublasDgeam(Session::getInstance().cuBlasHandle(m_idxStream), CUBLAS_OP_T, CUBLAS_OP_N, m_numCols, m_numRows, &alpha, raw() + numElMat * i, m_numRows, @@ -1126,7 +1155,7 @@ template<> inline DTensor &DTensor::operator*=(double scalar) { double alpha = scalar; gpuErrChk( - cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1)); + cublasDscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1)); return *this; } @@ -1137,6 +1166,7 @@ DTensor &DTensor::operator=(const DTensor &other) { m_numCols = other.m_numCols; m_doDestroyData = false; m_d_data = other.m_d_data; + m_idxStream = other.m_idxStream; return *this; } @@ -1144,7 +1174,7 @@ template<> inline DTensor &DTensor::operator*=(float scalar) { float alpha = scalar; gpuErrChk( - cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1)); + cublasSscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1)); return *this; } @@ -1152,7 +1182,7 @@ template<> inline DTensor &DTensor::operator+=(const DTensor &rhs) { const double alpha = 1.; gpuErrChk( - cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, + cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, 1, m_d_data, 1)); return *this; } @@ -1161,7 +1191,7 @@ 
template<> inline DTensor &DTensor::operator+=(const DTensor &rhs) { const float alpha = 1.; gpuErrChk( - cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, + cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, 1, m_d_data, 1)); return *this; } @@ -1169,7 +1199,7 @@ inline DTensor &DTensor::operator+=(const DTensor &rhs) { template<> inline DTensor &DTensor::operator-=(const DTensor &rhs) { const float alpha = -1.; - cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, 1, + cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, 1, m_d_data, 1); return *this; } @@ -1178,7 +1208,7 @@ template<> inline DTensor &DTensor::operator-=(const DTensor &rhs) { const double alpha = -1.; gpuErrChk( - cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, + cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, 1, m_d_data, 1)); return *this; } @@ -1199,7 +1229,7 @@ inline void DTensor::addAB(const DTensor &A, const DTensor 1) { - gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(), + gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(m_idxStream), CUBLAS_OP_N, CUBLAS_OP_N, nRA, nCB, nCA, &_alpha, A.m_d_ptrMatrices, nRA, @@ -1208,7 +1238,7 @@ inline void DTensor::addAB(const DTensor &A, const DTensor::addAB(const DTensor &A, const DTensor size_t nCB = B.numCols(); float _alpha = alpha, _beta = beta; if (nMat > 1) { - gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(), + gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(m_idxStream), CUBLAS_OP_N, CUBLAS_OP_N, nRA, nCB, nCA, &_alpha, A.m_d_ptrMatrices, nRA, @@ -1235,7 +1265,7 @@ inline void DTensor::addAB(const DTensor &A, const DTensor m_d_ptrMatrices, nRA, nMat)); } else { - gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(), + gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(m_idxStream), CUBLAS_OP_N, CUBLAS_OP_N, nRA, nCB, nCA, &_alpha, A.raw(), nRA, @@ -1259,7 +1289,7 @@ inline void DTensor::leastSquaresBatched(DTensor &B) { throw std::invalid_argument("[Least squares batched] supports square or tall matrices only"); int info = 0; DTensor infoArray(batchSize); // TODO consider preallocating? - gpuErrChk(cublasDgelsBatched(Session::getInstance().cuBlasHandle(), + gpuErrChk(cublasDgelsBatched(Session::getInstance().cuBlasHandle(m_idxStream), CUBLAS_OP_N, m_numRows, m_numCols, @@ -1287,7 +1317,7 @@ inline void DTensor::leastSquaresBatched(DTensor &B) { throw std::invalid_argument("[Least squares batched] supports square or tall matrices only"); int info = 0; DTensor infoArray(batchSize); // TODO consider preallocating? 
- gpuErrChk(cublasSgelsBatched(Session::getInstance().cuBlasHandle(), + gpuErrChk(cublasSgelsBatched(Session::getInstance().cuBlasHandle(m_idxStream), CUBLAS_OP_N, m_numRows, m_numCols, @@ -1307,7 +1337,7 @@ inline DTensor DTensor::getRows(size_t rowsFrom, size_t rowsTo, size_t n = numCols(), m = numRows(); DTensor rowsOnly(rowsRangeLength, numCols(), 1); for (size_t i = 0; i < rowsRangeLength; i++) { - gpuErrChk(cublasDcopy(Session::getInstance().cuBlasHandle(), + gpuErrChk(cublasDcopy(Session::getInstance().cuBlasHandle(m_idxStream), n, // # values to copy raw() + rowsFrom + i + matIdx * n * m, m, rowsOnly.raw() + i, @@ -1322,7 +1352,7 @@ inline DTensor DTensor::getRows(size_t rowsFrom, size_t rowsTo, si size_t n = numCols(), m = numRows(); DTensor rowsOnly(rowsRangeLength, numCols(), 1); for (size_t i = 0; i < rowsRangeLength; i++) { - gpuErrChk(cublasScopy(Session::getInstance().cuBlasHandle(), + gpuErrChk(cublasScopy(Session::getInstance().cuBlasHandle(m_idxStream), n, // # values to copy raw() + rowsFrom + i + matIdx * n * m, m, rowsOnly.raw() + i, @@ -1808,7 +1838,7 @@ inline void QRFactoriser::leastSquares(DTensor &rhs) { rhs.raw(), m, m_workspace->raw(), m_workspaceSize, m_info->raw())); - gpuErrChk(cublasDtrsm(Session::getInstance().cuBlasHandle(), + gpuErrChk(cublasDtrsm(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1, &alpha, m_matrix->raw(), m, @@ -1827,7 +1857,7 @@ inline void QRFactoriser::leastSquares(DTensor &rhs) { rhs.raw(), m, m_workspace->raw(), m_workspaceSize, m_info->raw())); - gpuErrChk(cublasStrsm(Session::getInstance().cuBlasHandle(), + gpuErrChk(cublasStrsm(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1, &alpha, m_matrix->raw(), m, @@ -2201,7 +2231,7 @@ inline void GivensAnnihilator::annihilate(size_t i, size_t k, size_t j) { * Pass cosine and sine as device pointers * (Avoid having to download first) */ - gpuErrChk(cublasSetPointerMode(Session::getInstance().cuBlasHandle(), CUBLAS_POINTER_MODE_DEVICE)); + gpuErrChk(cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_DEVICE)); /* Useful definitions */ T *aux = m_d_rhyp_cos_sin->raw(); @@ -2214,7 +2244,7 @@ inline void GivensAnnihilator::annihilate(size_t i, size_t k, size_t j) { m_matrix->applyLeftGivensRotation(i, k, aux + 1, aux + 2); /* Change back to default behaviour */ - gpuErrChk(cublasSetPointerMode(Session::getInstance().cuBlasHandle(), CUBLAS_POINTER_MODE_HOST)); + gpuErrChk(cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_HOST)); } diff --git a/main.cu b/main.cu index 5dbe058..1d60028 100644 --- a/main.cu +++ b/main.cu @@ -7,11 +7,16 @@ void xyz() { /* Write to binary file */ auto r = DTensor::createRandomTensor(3, 6, 4, -1, 1); - auto r2 = DTensor::createRandomTensor(300, 600, 4, -1, 1); - std::string fName = "tensor.bt"; // binary tensor file extension: .bt + r.setStreamIdx(1); + std::string fName = "abcd.bt"; // binary tensor file extension: .bt + r.saveToFile(fName); /* Parse binary file */ auto recov = DTensor::parseFromFile(fName); + + std::cout << r; + std::cout << recov; + auto err = r - recov; std::cout << "max error : " << err.maxAbs() << std::endl; std::cout << "Memory: " << std::setprecision(3) @@ -21,7 +26,7 @@ void xyz() { int main() { - Session::getInstance(5); + Session::setStreams(5); xyz(); 
std::cout << "Memory (outside): " << std::setprecision(3) << (float) Session::getInstance().totalAllocatedBytes() / 1e6 From 212cd37917725cc3326c387f7b19c028fc16866a Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Fri, 28 Mar 2025 19:02:22 +0000 Subject: [PATCH 3/8] changelog updated --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 60d3173..5d853fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## v1.9.0 - 28-03-2025 + +### Added + +- Multiple streams supported + + From 0a17614474fbd51226d0313a995ae58ec52013ae Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Fri, 28 Mar 2025 19:24:19 +0000 Subject: [PATCH 4/8] minor --- include/tensor.cuh | 27 +++++++++++++++------------ main.cu | 3 +-- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/include/tensor.cuh b/include/tensor.cuh index 6d257e3..57f55ab 100644 --- a/include/tensor.cuh +++ b/include/tensor.cuh @@ -116,6 +116,7 @@ inline void gpuAssert(T code, const char *file, int line, bool abort = true) { /* ================================================================================================ * SESSION * ================================================================================================ */ +static size_t s_numStreams = 1; /** * Singleton for Cuda library handles. @@ -125,11 +126,12 @@ inline void gpuAssert(T code, const char *file, int line, bool abort = true) { * The cuBlas handle can be accessed anywhere by `Session::getInstance().cuBlasHandle()` * The cuSolver handle can be accessed anywhere by `Session::getInstance().cuSolverHandle()` */ -static size_t s_numStreams = 1; - class Session { public: - + /** + * + * @param numStreams + */ static void setStreams(size_t numStreams) { s_numStreams = numStreams; } @@ -140,11 +142,11 @@ public: } private: - Session(size_t numStreams=10) { - m_numBublasHandlesStreams = numStreams; - m_cublasHandles.resize(m_numBublasHandlesStreams); - m_cublasStreams.resize(m_numBublasHandlesStreams); - for (size_t i=0; i m_cublasStreams; cusolverDnHandle_t m_cusolverHandle; size_t m_bytesAllocated = 0; - size_t m_numBublasHandlesStreams = 1; + size_t m_numCublasHandlesStreams = 1; public: Session(Session const &) = delete; @@ -288,7 +290,7 @@ public: /** * Set the stream ID */ - void setStreamIdx(size_t); + DTensor setStreamIdx(size_t); size_t streamIdx() const { return m_idxStream; } @@ -614,11 +616,12 @@ public: }; /* END OF DTENSOR */ template -void DTensor::setStreamIdx(size_t idx) { +DTensor DTensor::setStreamIdx(size_t idx) { if (idx >= s_numStreams) { throw std::invalid_argument("Invalid stream index; it exceeds the max allocated streams"); } m_idxStream = idx; + return *this; } template diff --git a/main.cu b/main.cu index 1d60028..f1e0d70 100644 --- a/main.cu +++ b/main.cu @@ -6,8 +6,7 @@ void xyz() { /* Write to binary file */ - auto r = DTensor::createRandomTensor(3, 6, 4, -1, 1); - r.setStreamIdx(1); + DTensor r = DTensor::createRandomTensor(3, 6, 4, -1, 1).setStreamIdx(1); std::string fName = "abcd.bt"; // binary tensor file extension: .bt r.saveToFile(fName); From 901ba1404dc5858f3012850caad7a07a5a2a38d8 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Fri, 28 Mar 2025 19:39:15 +0000 Subject: [PATCH 5/8] streams and cusolver 
handles --- include/tensor.cuh | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/include/tensor.cuh b/include/tensor.cuh index 57f55ab..a2ff121 100644 --- a/include/tensor.cuh +++ b/include/tensor.cuh @@ -129,13 +129,18 @@ static size_t s_numStreams = 1; class Session { public: /** - * - * @param numStreams + * Sets the total number of available streams + * @param numStreams number of streams (default: 1) */ static void setStreams(size_t numStreams) { s_numStreams = numStreams; } + /** + * Returns the unique instance of Session (constructed upon first + * invocation) + * @return instance of Session + */ static Session &getInstance() { static Session instance(s_numStreams); return instance; @@ -146,24 +151,26 @@ private: m_numCublasHandlesStreams = numStreams; m_cublasHandles.resize(m_numCublasHandlesStreams); m_cublasStreams.resize(m_numCublasHandlesStreams); + m_cusolverHandles.resize(m_numCublasHandlesStreams); for (size_t i=0; i m_cublasHandles; std::vector m_cublasStreams; - cusolverDnHandle_t m_cusolverHandle; + std::vector m_cusolverHandles; size_t m_bytesAllocated = 0; size_t m_numCublasHandlesStreams = 1; @@ -172,9 +179,19 @@ public: void operator=(Session const &) = delete; + /** + * cuBLAS handle + * @param idx index of stream + * @return cuBLAS handle + */ cublasHandle_t &cuBlasHandle(size_t idx=0) { return m_cublasHandles[idx]; } - cusolverDnHandle_t &cuSolverHandle() { return m_cusolverHandle; } + /** + * cuSolver handle + * @param idx index of stream + * @return cuSolver handle + */ + cusolverDnHandle_t &cuSolverHandle(size_t idx=0) { return m_cusolverHandles[idx]; } /** * Preferred method for CUDA memory allocation; it allocated memory on the device @@ -198,7 +215,11 @@ public: */ size_t totalAllocatedBytes() const { return m_bytesAllocated; } - void incrementAllocatedBytes(size_t s) { m_bytesAllocated += s; } + /** + * Increment counter of allocated bytes + * @param s allocated bytes (can be negative) + */ + void incrementAllocatedBytes(int s) { m_bytesAllocated += s; } }; From 6929a08a35a8d02a1f900ab89e38d890587c2435 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Fri, 28 Mar 2025 20:43:44 +0000 Subject: [PATCH 6/8] docs for streams --- README.md | 11 ++++++++++- include/tensor.cuh | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b6e560e..00cd8a8 100644 --- a/README.md +++ b/README.md @@ -403,6 +403,15 @@ size_t allocatedBytes = Session::getInstance().totalAllocatedBytes(); ``` - +GPUtils supports multiple streams. By default a single stream is created, +but you can set the number of streams you need with +```c++ +/* This needs to be the first line in your code */ +Session::setStreams(4); // create 4 strems +``` +Then, you can use `setStreamIdx` to select a stream to go with your instance of `DTensor` +```c++ +auto a = DTensor::createRandomTensor(3, 6, 4, -1, 1).setStreamIdx(2); +``` ## Happy number crunching! 
diff --git a/include/tensor.cuh b/include/tensor.cuh index a2ff121..32a62d5 100644 --- a/include/tensor.cuh +++ b/include/tensor.cuh @@ -116,6 +116,10 @@ inline void gpuAssert(T code, const char *file, int line, bool abort = true) { /* ================================================================================================ * SESSION * ================================================================================================ */ +/** + * Total number of allocated streams + * Can be changed with Session::setStreams() + */ static size_t s_numStreams = 1; /** From b5d9008b9bb18e2f2d0a1f4cc2c40377d7d96094 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Fri, 28 Mar 2025 21:03:06 +0000 Subject: [PATCH 7/8] sync streams --- include/tensor.cuh | 96 +++++++++++++++++++++++++++++++--------------- main.cu | 2 + 2 files changed, 68 insertions(+), 30 deletions(-) diff --git a/include/tensor.cuh b/include/tensor.cuh index 32a62d5..b73e280 100644 --- a/include/tensor.cuh +++ b/include/tensor.cuh @@ -156,7 +156,7 @@ private: m_cublasHandles.resize(m_numCublasHandlesStreams); m_cublasStreams.resize(m_numCublasHandlesStreams); m_cusolverHandles.resize(m_numCublasHandlesStreams); - for (size_t i=0; i= m_numCublasHandlesStreams) { + throw std::runtime_error("stream index out of range"); + } + gpuErrChk(cudaStreamSynchronize(m_cublasStreams[idx])); + } + + /** + * Synchronize all streams + */ + void synchronizeAllStreams() const { + for (size_t i = 0; i < m_numCublasHandlesStreams; i++) { + synchronizeStream(i); + } + } }; @@ -974,32 +994,36 @@ inline float DTensor::dotF(const DTensor &other) { template<> inline double DTensor::normF() const { double the_norm; - gpuErrChk(cublasDnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &the_norm)); + gpuErrChk( + cublasDnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, + &the_norm)); return the_norm; } template<> inline float DTensor::normF() const { float the_norm; - gpuErrChk(cublasSnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &the_norm)); + gpuErrChk( + cublasSnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, + &the_norm)); return the_norm; } template<> inline float DTensor::sumAbs() const { float sumAbsAllElements; - gpuErrChk(cublasSasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &sumAbsAllElements)); + gpuErrChk( + cublasSasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, + &sumAbsAllElements)); return sumAbsAllElements; } template<> inline double DTensor::sumAbs() const { double sumAbsAllElements; - gpuErrChk(cublasDasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &sumAbsAllElements)); + gpuErrChk( + cublasDasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, + &sumAbsAllElements)); return sumAbsAllElements; } @@ -1007,8 +1031,9 @@ template<> inline float DTensor::maxAbs() const { int idx; float hostDst; - gpuErrChk(cublasIsamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &idx)); + gpuErrChk( + cublasIsamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, + &idx)); gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx 
- 1, sizeof(float), cudaMemcpyDeviceToHost)); return std::signbit(hostDst) ? -hostDst : hostDst; } @@ -1017,8 +1042,9 @@ template<> inline double DTensor::maxAbs() const { int idx; double hostDst; - gpuErrChk(cublasIdamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &idx)); + gpuErrChk( + cublasIdamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, + &idx)); gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(double), cudaMemcpyDeviceToHost)); return std::signbit(hostDst) ? -hostDst : hostDst; } @@ -1027,8 +1053,9 @@ template<> inline float DTensor::minAbs() const { int idx; float hostDst; - gpuErrChk(cublasIsamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &idx)); + gpuErrChk( + cublasIsamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, + &idx)); gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(float), cudaMemcpyDeviceToHost)); return std::signbit(hostDst) ? -hostDst : hostDst; } @@ -1037,8 +1064,9 @@ template<> inline double DTensor::minAbs() const { int idx; double hostDst; - gpuErrChk(cublasIdamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &idx)); + gpuErrChk( + cublasIdamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1, + &idx)); gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(double), cudaMemcpyDeviceToHost)); return std::signbit(hostDst) ? -hostDst : hostDst; } @@ -1087,7 +1115,7 @@ inline void DTensor::allocateOnDevice(size_t size, bool zero) { if (numMats() > 1) { m_doDestroyPtrMatrices = true; - cudaStatus = Session::getInstance().cudaAllocate((void**) &m_d_ptrMatrices, numMats() * sizeof(T *)); + cudaStatus = Session::getInstance().cudaAllocate((void **) &m_d_ptrMatrices, numMats() * sizeof(T *)); if (cudaStatus != cudaSuccess) { gpuErrChk(cudaFree(m_d_data)); // ... free previously allocated memory gpuErrChk(cudaStatus); // ... and memento mori @@ -1183,7 +1211,8 @@ template<> inline DTensor &DTensor::operator*=(double scalar) { double alpha = scalar; gpuErrChk( - cublasDscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1)); + cublasDscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, + m_d_data, 1)); return *this; } @@ -1202,7 +1231,8 @@ template<> inline DTensor &DTensor::operator*=(float scalar) { float alpha = scalar; gpuErrChk( - cublasSscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1)); + cublasSscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, + m_d_data, 1)); return *this; } @@ -1210,7 +1240,8 @@ template<> inline DTensor &DTensor::operator+=(const DTensor &rhs) { const double alpha = 1.; gpuErrChk( - cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, + cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs. 
+ m_d_data, 1, m_d_data, 1)); return *this; } @@ -1219,7 +1250,8 @@ template<> inline DTensor &DTensor::operator+=(const DTensor &rhs) { const float alpha = 1.; gpuErrChk( - cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, + cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs. + m_d_data, 1, m_d_data, 1)); return *this; } @@ -1227,7 +1259,8 @@ inline DTensor &DTensor::operator+=(const DTensor &rhs) { template<> inline DTensor &DTensor::operator-=(const DTensor &rhs) { const float alpha = -1.; - cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, 1, + cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, + rhs.m_d_data, 1, m_d_data, 1); return *this; } @@ -1236,7 +1269,8 @@ template<> inline DTensor &DTensor::operator-=(const DTensor &rhs) { const double alpha = -1.; gpuErrChk( - cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, + cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs. + m_d_data, 1, m_d_data, 1)); return *this; } @@ -2259,7 +2293,8 @@ inline void GivensAnnihilator::annihilate(size_t i, size_t k, size_t j) { * Pass cosine and sine as device pointers * (Avoid having to download first) */ - gpuErrChk(cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_DEVICE)); + gpuErrChk( + cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_DEVICE)); /* Useful definitions */ T *aux = m_d_rhyp_cos_sin->raw(); @@ -2272,7 +2307,8 @@ inline void GivensAnnihilator::annihilate(size_t i, size_t k, size_t j) { m_matrix->applyLeftGivensRotation(i, k, aux + 1, aux + 2); /* Change back to default behaviour */ - gpuErrChk(cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_HOST)); + gpuErrChk( + cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_HOST)); } diff --git a/main.cu b/main.cu index f1e0d70..2aa9f8d 100644 --- a/main.cu +++ b/main.cu @@ -21,6 +21,8 @@ void xyz() { std::cout << "Memory: " << std::setprecision(3) << (float) Session::getInstance().totalAllocatedBytes() / 1e6 << " MB" << std::endl; + + Session::getInstance().synchronizeAllStreams(); } From 8258c61620bb3ea08df18091d0bce2588ed59877 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Fri, 28 Mar 2025 21:17:19 +0000 Subject: [PATCH 8/8] update readme --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 00cd8a8..2728855 100644 --- a/README.md +++ b/README.md @@ -411,7 +411,10 @@ Session::setStreams(4); // create 4 strems ``` Then, you can use `setStreamIdx` to select a stream to go with your instance of `DTensor` ```c++ -auto a = DTensor::createRandomTensor(3, 6, 4, -1, 1).setStreamIdx(2); +auto a = DTensor::createRandomTensor(3, 6, 4, -1, 1).setStreamIdx(0); +auto b = DTensor::createRandomTensor(3, 6, 4, -1, 1).setStreamIdx(1); +// do stuff... +Session::getInstance().synchronizeAllStreams(); ``` ## Happy number crunching!
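
Putting the series together, below is a minimal, illustrative sketch (not taken from the patches above) of how the pieces introduced here combine: `Session::setStreams`, the per-tensor `setStreamIdx`, and `Session::getInstance().synchronizeAllStreams()` are the names defined in these patches, while the include path, the explicit `DTensor<float>` template argument, and the particular workload are assumptions made only for the example.

```c++
// Illustrative sketch of the multi-stream workflow added in this series.
// Assumptions (not from the patches): the include path below and the
// <float> specialisation; the API names come from the patched tensor.cuh.
#include "tensor.cuh"   // adjust to your include path
#include <iostream>

int main() {
    /* Must run before the first call to Session::getInstance(),
     * since the singleton is constructed on first use. */
    Session::setStreams(4);

    /* Each tensor is pinned to one of the four cuBLAS/cuSolver streams;
     * setStreamIdx throws if the index exceeds the allocated streams. */
    auto a = DTensor<float>::createRandomTensor(3, 6, 4, -1, 1).setStreamIdx(0);
    auto b = DTensor<float>::createRandomTensor(3, 6, 4, -1, 1).setStreamIdx(1);

    /* Reductions run on the handle bound to each tensor's stream */
    float na = a.normF();
    float sb = b.sumAbs();

    /* Block the host until all Session streams have drained,
     * as the patched main.cu does before exiting. */
    Session::getInstance().synchronizeAllStreams();

    std::cout << "||a||_F = " << na << ", sum|b| = " << sb << std::endl;
    return 0;
}
```

Note that, as changed in patch 4, `setStreamIdx` returns the tensor rather than `void`, which is what permits the chained `createRandomTensor(...).setStreamIdx(...)` call shown in the README example above.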