From d9c8bddccc909e31a9bee2b29dd717fad6d7c5d2 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 13:17:22 +0000
Subject: [PATCH 1/7] introducing raw pointer to matrices (on device)

---
 include/tensor.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 0a8d1d1..d5fd8bd 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -185,6 +185,7 @@ private:
     size_t m_numCols = 0; ///< Number of columns
     size_t m_numMats = 0; ///< Number of matrices
     bool m_doDestroy = false; ///< Whether to destroy memory
+    T** m_rawPtrToMatrices = nullptr; ///< Raw pointers to matrices

     bool destroy() {
         if (!m_doDestroy) return false;

From c7f92180836125fa59ad9798130d9fe44d6548cc Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 17:19:02 +0000
Subject: [PATCH 2/7] Getting rid of pointersToMatrices

Introduce m_d_ptrMatrices, which preallocates device memory for an
array of pointers to each sub-matrix of the tensor, and destroy it
when necessary. The new private method makePointersToMatrices
populates this array. Slicing works; all constructors updated.
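In a nutshell, the auxiliary array mirrors this minimal standalone
sketch (assuming d_data is a device buffer holding k contiguous
column-major m-by-n matrices, and gpuErrChk is the error-check macro
from tensor.cuh; all names here are illustrative, not from the code):

    // One device pointer per sub-matrix, assembled on the host...
    std::vector<float *> h_ptrs(k);
    for (size_t i = 0; i < k; i++) h_ptrs[i] = d_data + i * m * n;
    // ...then shipped to the device, where batched cuBLAS/cuSOLVER
    // routines expect exactly such an array of matrix pointers.
    float **d_ptrs = nullptr;
    gpuErrChk(cudaMalloc(&d_ptrs, k * sizeof(float *)));
    gpuErrChk(cudaMemcpy(d_ptrs, h_ptrs.data(), k * sizeof(float *),
                         cudaMemcpyHostToDevice));

Building this once at construction time avoids reassembling and
re-uploading it on every call to a batched routine.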
---
 include/tensor.cuh | 97 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 68 insertions(+), 29 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index d5fd8bd..3e2cfb7 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -181,17 +181,35 @@ class DTensor {

 private:
     T *m_d_data = nullptr; ///< Pointer to device data
+    T** m_d_ptrMatrices = nullptr; ///< Pointers to matrices in tensor
     size_t m_numRows = 0; ///< Number of rows
     size_t m_numCols = 0; ///< Number of columns
     size_t m_numMats = 0; ///< Number of matrices
-    bool m_doDestroy = false; ///< Whether to destroy memory
-    T** m_rawPtrToMatrices = nullptr; ///< Raw pointers to matrices
+    bool m_doDestroyData = false; ///< Whether to destroy m_d_data
+    bool m_doDestroyPointersToMatrices = false; ///< Whether to destroy m_d_ptrMatrices
+
+    void makePointersToMatrices() {
+        std::vector<T *> h_pointers(m_numMats);
+        size_t numelMat = m_numRows * m_numCols;
+        h_pointers[0] = m_d_data;
+        for (size_t i = 1; i < m_numMats; i++) {
+            h_pointers[i] = m_d_data + i * numelMat;
+        }
+        size_t buffer_size = m_numMats * sizeof(T*);
+        gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers.data(), buffer_size, cudaMemcpyHostToDevice));
+        m_doDestroyPointersToMatrices = true;
+    }

     bool destroy() {
-        if (!m_doDestroy) return false;
-        if (m_d_data) cudaFree(m_d_data);
-        m_d_data = nullptr;
-        return true;
+        if (m_doDestroyData) {
+            if (m_d_data) gpuErrChk(cudaFree(m_d_data));
+            m_d_data = nullptr;
+        }
+        if (m_doDestroyPointersToMatrices){
+            if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
+            m_d_ptrMatrices = nullptr;
+        }
+        return m_doDestroyData || m_doDestroyPointersToMatrices;
     }

     /**
@@ -516,9 +534,8 @@ DTensor<T> DTensor<T>::createRandomTensor(size_t numRows, size_t numCols, size_t
         auto randVec = generateIntRandomVector(numRows * numCols * numMats, low, hi);
         DTensor<T> a(randVec, numRows, numCols, numMats);
         return a;
-    } else {
-        throw std::invalid_argument("[createRandomTensor] unsupported type T");
     }
+    throw std::invalid_argument("[createRandomTensor] unsupported type T");
 }

 template<typename T>
@@ -543,6 +560,7 @@ DTensor<T>::DTensor(size_t m, size_t n, size_t k, bool zero) {
     m_numRows = m;
     m_numCols = n;
     m_numMats = k;
     size_t size = m * n * k;
     allocateOnDevice(size, zero);
+    makePointersToMatrices();
 }
@@ -553,6 +571,7 @@ DTensor<T>::DTensor(const std::vector<T> &data, size_t m, size_t n, size_t k, St
     size_t size = m * n * k;
     allocateOnDevice(size);
     upload(data, mode);
+    makePointersToMatrices();
 }

 template<typename T>
@@ -564,6 +583,7 @@ DTensor<T>::DTensor(const DTensor<T> &other) {
     allocateOnDevice(m_numRows * m_numCols * m_numMats);
     gpuErrChk(cudaMemcpy(m_d_data,
                          other.raw(),
                          m_numRows * m_numCols * m_numMats * sizeof(T),
                          cudaMemcpyDeviceToDevice));
+    makePointersToMatrices();
 }
@@ -575,6 +595,7 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
         m_numRows = other.m_numRows;
         m_numCols = other.m_numCols;
         m_numMats = len;
+        m_d_ptrMatrices = other.m_d_ptrMatrices + from;
     } else if (axis == 1) {
         offset = other.m_numRows * from;
         m_numRows = other.m_numRows;
@@ -587,7 +608,13 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
         m_numMats = 1;
     }
     m_d_data = other.m_d_data + offset;
-    m_doDestroy = false;
+    m_doDestroyData = false;
+    if (axis == 1 || axis == 0) {
+        gpuErrChk(cudaMalloc(&m_d_ptrMatrices, sizeof(T *))); // allocate memory for an array of one T* element
+        T* h_pointers[1] = {m_d_data}; // transfer datum to device
+        gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers, sizeof(T*), cudaMemcpyHostToDevice));
+        m_doDestroyPointersToMatrices = true; // make sure to free memory later
+    }
 }

 template<typename T>
@@ -596,9 +623,11 @@ DTensor<T>::DTensor(DTensor<T> &&other) {
     m_numRows = other.m_numRows;
     m_numMats = other.m_numMats;
     m_d_data = other.m_d_data;
-    m_doDestroy = true;
-    other.m_doDestroy = false;
+    m_d_ptrMatrices = other.m_d_ptrMatrices;
+    m_doDestroyData = true;
+    other.m_doDestroyData = false;
     other.m_d_data = nullptr;
+    other.m_d_ptrMatrices = nullptr;
     other.m_numCols = 0;
     other.m_numRows = 0;
     other.m_numMats = 0;
@@ -758,12 +787,18 @@ template<typename T>
 inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
     if (size <= 0) return false;
     destroy();
-    m_doDestroy = true;
-    size_t buffer_size = size * sizeof(T);
-    bool cudaStatus = cudaMalloc(&m_d_data, buffer_size);
+    m_doDestroyData = true;
+    /* Allocate memory for m_d_data */
+    size_t data_size_bytes = size * sizeof(T);
+    cudaError_t cudaStatus = cudaMalloc(&m_d_data, data_size_bytes);
     if (cudaStatus != cudaSuccess) return false;
-    if (zero) gpuErrChk(cudaMemset(m_d_data, 0, buffer_size)); // set to zero all elements
-    return true;
+    if (zero) gpuErrChk(cudaMemset(m_d_data, 0, data_size_bytes)); // set to zero all elements
+
+    /* Allocate memory for m_d_ptrMatrices */
+    size_t ptr_matrices_bytes = m_numMats * sizeof(T*);
+    cudaStatus = cudaMalloc(&m_d_ptrMatrices, ptr_matrices_bytes);
+
+    return (cudaStatus == cudaSuccess);
 }
@@ -840,6 +875,10 @@ inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
                          m_d_data,
                          m_numRows * m_numCols * m_numMats * sizeof(T),
                          cudaMemcpyDeviceToDevice));
+    gpuErrChk(cudaMemcpy(elsewhere.m_d_ptrMatrices,
+                         m_d_ptrMatrices,
+                         m_numMats * sizeof(T*),
+                         cudaMemcpyDeviceToDevice));
 }

 template<>
@@ -855,7 +894,7 @@ DTensor<T> &DTensor<T>::operator=(const DTensor<T> &other) {
     m_numMats = other.m_numMats;
     m_numRows = other.m_numRows;
     m_numCols = other.m_numCols;
-    m_doDestroy = false;
+    m_doDestroyData = false;
     m_d_data = other.m_d_data;
     return *this;
 }
@@ -929,17 +968,17 @@ inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<doub
     size_t nRA = A.numRows();
     size_t nCA = A.numCols();
     size_t nCB = B.numCols();
-    DTensor<double *> ptrA = A.pointersToMatrices();
-    DTensor<double *> ptrB = B.pointersToMatrices();
-    DTensor<double *> ptr = pointersToMatrices();
     double _alpha = alpha, _beta = beta;
+    double **ptrAraw = A.m_d_ptrMatrices;
+    double **ptrBraw = B.m_d_ptrMatrices;
+    double **ptrRaw = m_d_ptrMatrices;
     gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
                                  CUBLAS_OP_N, CUBLAS_OP_N,
                                  nRA, nCB, nCA, &_alpha,
-                                 ptrA.raw(), nRA,
-                                 ptrB.raw(), nCA,
+                                 ptrAraw, nRA,
+                                 ptrBraw, nCA,
                                  &_beta,
-                                 ptr.raw(), nRA,
+                                 ptrRaw, nRA,
                                  nMat));
 }
@@ -949,17 +988,17 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float>
     size_t nRA = A.numRows();
     size_t nCA = A.numCols();
     size_t nCB = B.numCols();
-    DTensor<float *> ptrA = A.pointersToMatrices();
-    DTensor<float *> ptrB = B.pointersToMatrices();
-    DTensor<float *> ptr = pointersToMatrices();
+    float **ptrAraw = A.m_d_ptrMatrices;
+    float **ptrBraw = B.m_d_ptrMatrices;
+    float **ptrRaw = m_d_ptrMatrices;
     float _alpha = alpha, _beta = beta;
     gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
                                  CUBLAS_OP_N, CUBLAS_OP_N,
                                  nRA, nCB, nCA, &_alpha,
-                                 ptrA.raw(), nRA,
-                                 ptrB.raw(), nCA,
+                                 ptrAraw, nRA,
+                                 ptrBraw, nCA,
                                  &_beta,
-                                 ptr.raw(), nRA,
+                                 ptrRaw, nRA,
                                  nMat));
 }

From 9b17bb2ca9a14e9a306d8166c2e260fb704bfe27 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 17:24:04 +0000
Subject: [PATCH 3/7] DTensor/copy constructor: warning in docs
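A minimal sketch of what the documented warning refers to (tensor
sizes here are arbitrary):

    DTensor<double> a(2, 3, 5, true);  // 2x3x5 tensor, zero-initialised
    DTensor<double> s(a, 2, 1, 3);     // axis=2: reuses a's pointer array, no allocation
    DTensor<double> b(4, 4, 1, true);
    DTensor<double> c(b, 1, 0, 1);     // axis=1: allocates and uploads a single double*

Slicing along axis 0 or 1 yields a single (sub)matrix, so the
constructor allocates room for exactly one T* on the device and copies
one pointer from the host.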
---
 include/tensor.cuh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 3e2cfb7..8631a2d 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -305,6 +305,10 @@ public:
      * @param axis axis to slice (0=rows, 1=columns, 2=matrices)
      * @param from index to slice axis from (zero-indexed)
      * @param to index to slice axis to (inclusive)
+     *
+     * @warning If axis=0 or axis=1, this method will (i) allocate memory on the GPU
+     * to store one element of size T* (pointer-to-T), and (ii) transfer one such
+     * element from the host to the device.
      */
     DTensor(const DTensor &other, size_t axis, size_t from, size_t to);

From 452a6ca3b954e9b2bffef374127682c79040f2cc Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 19:02:17 +0000
Subject: [PATCH 4/7] DTensor::addAB: use gemm instead of gemmBatched when nMats=1
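For example (a sketch; the dimensions are arbitrary and alpha/beta are
passed explicitly):

    size_t m = 8, n = 4, p = 6;
    DTensor<double> A(m, p, 1, true), B(p, n, 1, true), C(m, n, 1, true);
    C.addAB(A, B, 1.0, 0.0);  // nMat == 1: plain cublasDgemm, no pointer-array indirection

A batch of one gains nothing from cublasDgemmBatched and still pays for
the pointer-array lookup, so the single-matrix case now calls the plain
gemm on the raw data directly.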
---
 include/tensor.cuh | 73 ++++++++++++++++++++++++++++------------------
 1 file changed, 44 insertions(+), 29 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 8631a2d..9134c12 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -181,7 +181,7 @@ class DTensor {

 private:
     T *m_d_data = nullptr; ///< Pointer to device data
-    T** m_d_ptrMatrices = nullptr; ///< Pointers to matrices in tensor
+    T **m_d_ptrMatrices = nullptr; ///< Pointers to matrices in tensor
     size_t m_numRows = 0; ///< Number of rows
     size_t m_numCols = 0; ///< Number of columns
     size_t m_numMats = 0; ///< Number of matrices
@@ -195,7 +195,7 @@ private:
         for (size_t i = 1; i < m_numMats; i++) {
             h_pointers[i] = m_d_data + i * numelMat;
         }
-        size_t buffer_size = m_numMats * sizeof(T*);
+        size_t buffer_size = m_numMats * sizeof(T *);
         gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers.data(), buffer_size, cudaMemcpyHostToDevice));
         m_doDestroyPointersToMatrices = true;
     }
@@ -205,7 +205,7 @@ private:
             if (m_d_data) gpuErrChk(cudaFree(m_d_data));
             m_d_data = nullptr;
         }
-        if (m_doDestroyPointersToMatrices){
+        if (m_doDestroyPointersToMatrices) {
             if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
             m_d_ptrMatrices = nullptr;
         }
@@ -555,6 +555,7 @@ void DTensor<T>::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats
     m_numRows = newNumRows;
     m_numCols = newNumCols;
     m_numMats = newNumMats;
+    // TODO rebuild m_d_ptrMatrices here
 }

 template<typename T>
@@ -615,8 +616,8 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
     m_doDestroyData = false;
     if (axis == 1 || axis == 0) {
         gpuErrChk(cudaMalloc(&m_d_ptrMatrices, sizeof(T *))); // allocate memory for an array of one T* element
-        T* h_pointers[1] = {m_d_data}; // transfer datum to device
-        gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers, sizeof(T*), cudaMemcpyHostToDevice));
+        T *h_pointers[1] = {m_d_data}; // transfer datum to device
+        gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers, sizeof(T *), cudaMemcpyHostToDevice));
         m_doDestroyPointersToMatrices = true; // make sure to free memory later
     }
 }
@@ -799,7 +800,7 @@ inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
     if (zero) gpuErrChk(cudaMemset(m_d_data, 0, data_size_bytes)); // set to zero all elements

     /* Allocate memory for m_d_ptrMatrices */
-    size_t ptr_matrices_bytes = m_numMats * sizeof(T*);
+    size_t ptr_matrices_bytes = m_numMats * sizeof(T *);
     cudaStatus = cudaMalloc(&m_d_ptrMatrices, ptr_matrices_bytes);

     return (cudaStatus == cudaSuccess);
@@ -881,7 +882,7 @@ inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
     gpuErrChk(cudaMemcpy(elsewhere.m_d_ptrMatrices,
                          m_d_ptrMatrices,
-                         m_numMats * sizeof(T*),
+                         m_numMats * sizeof(T *),
                          cudaMemcpyDeviceToDevice));
 }

 template<>
@@ -973,17 +974,24 @@ inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<doub
     size_t nRA = A.numRows();
     size_t nCA = A.numCols();
     size_t nCB = B.numCols();
     double _alpha = alpha, _beta = beta;
-    double **ptrAraw = A.m_d_ptrMatrices;
-    double **ptrBraw = B.m_d_ptrMatrices;
-    double **ptrRaw = m_d_ptrMatrices;
-    gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
-                                 CUBLAS_OP_N, CUBLAS_OP_N,
-                                 nRA, nCB, nCA, &_alpha,
-                                 ptrAraw, nRA,
-                                 ptrBraw, nCA,
-                                 &_beta,
-                                 ptrRaw, nRA,
-                                 nMat));
+    if (nMat > 1) {
+        gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
+                                     CUBLAS_OP_N, CUBLAS_OP_N,
+                                     nRA, nCB, nCA, &_alpha,
+                                     A.m_d_ptrMatrices, nRA,
+                                     B.m_d_ptrMatrices, nCA,
+                                     &_beta,
+                                     m_d_ptrMatrices, nRA,
+                                     nMat));
+    } else {
+        gpuErrChk(cublasDgemm(Session::getInstance().cuBlasHandle(),
+                              CUBLAS_OP_N, CUBLAS_OP_N,
+                              nRA, nCB, nCA, &_alpha,
+                              A.raw(), nRA,
+                              B.raw(), nCA,
+                              &_beta,
+                              raw(), nRA));
+    }
 }
@@ -992,18 +1000,25 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float>
     size_t nRA = A.numRows();
     size_t nCA = A.numCols();
     size_t nCB = B.numCols();
-    float **ptrAraw = A.m_d_ptrMatrices;
-    float **ptrBraw = B.m_d_ptrMatrices;
-    float **ptrRaw = m_d_ptrMatrices;
     float _alpha = alpha, _beta = beta;
-    gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
-                                 CUBLAS_OP_N, CUBLAS_OP_N,
-                                 nRA, nCB, nCA, &_alpha,
-                                 ptrAraw, nRA,
-                                 ptrBraw, nCA,
-                                 &_beta,
-                                 ptrRaw, nRA,
-                                 nMat));
+    if (nMat > 1) {
+        gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
+                                     CUBLAS_OP_N, CUBLAS_OP_N,
+                                     nRA, nCB, nCA, &_alpha,
+                                     A.m_d_ptrMatrices, nRA,
+                                     B.m_d_ptrMatrices, nCA,
+                                     &_beta,
+                                     m_d_ptrMatrices, nRA,
+                                     nMat));
+    } else {
+        gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(),
+                                 CUBLAS_OP_N, CUBLAS_OP_N,
+                                 nRA, nCB, nCA, &_alpha,
+                                 A.raw(), nRA,
+                                 B.raw(), nCA,
+                                 &_beta,
+                                 raw(), nRA));
+    }
 }

 template<>

From 0d3bfc021695025a86ac8163c1c8ad8fb2e2537d Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 19:26:12 +0000
Subject: [PATCH 5/7] DTensor::allocateOnDevice: set m_doDestroyPointersToMatrices

Fix an issue with memory allocation and freeing: allocateOnDevice now
records ownership of m_d_ptrMatrices so that destroy() frees it, and
slices no longer allocate (or own) an auxiliary pointer array.
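The ownership rules after this commit, in a sketch (sizes arbitrary):

    DTensor<float> a(4, 4, 3, true);   // owns m_d_data and m_d_ptrMatrices;
                                       // its destructor frees both, exactly once
    {
        DTensor<float> s(a, 2, 0, 1);  // slice: borrows a's memory, owns nothing
    }                                  // s is destroyed without freeing anything
    // a remains valid here

Previously, allocateOnDevice allocated m_d_ptrMatrices without marking
it as owned, so the pointer array was never freed.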
---
 include/tensor.cuh | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 9134c12..4f59982 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -613,12 +613,11 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
         m_numMats = 1;
     }
     m_d_data = other.m_d_data + offset;
-    m_doDestroyData = false;
-    if (axis == 1 || axis == 0) {
-        gpuErrChk(cudaMalloc(&m_d_ptrMatrices, sizeof(T *))); // allocate memory for an array of one T* element
-        T *h_pointers[1] = {m_d_data}; // transfer datum to device
-        gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers, sizeof(T *), cudaMemcpyHostToDevice));
-        m_doDestroyPointersToMatrices = true; // make sure to free memory later
+    m_doDestroyData = false; // no new memory allocated!
+    m_doDestroyPointersToMatrices = false; // no new auxiliary memory allocated!
+    if (axis != 2) {
+        // m_d_ptrMatrices is not needed for vectors and matrices
+        m_d_ptrMatrices = nullptr;
     }
 }

@@ -802,6 +801,7 @@ inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
     /* Allocate memory for m_d_ptrMatrices */
     size_t ptr_matrices_bytes = m_numMats * sizeof(T *);
     cudaStatus = cudaMalloc(&m_d_ptrMatrices, ptr_matrices_bytes);
+    m_doDestroyPointersToMatrices = true;

     return (cudaStatus == cudaSuccess);
 }
@@ -880,10 +880,12 @@ inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
                          m_d_data,
                          m_numRows * m_numCols * m_numMats * sizeof(T),
                          cudaMemcpyDeviceToDevice));
-    gpuErrChk(cudaMemcpy(elsewhere.m_d_ptrMatrices,
-                         m_d_ptrMatrices,
-                         m_numMats * sizeof(T *),
-                         cudaMemcpyDeviceToDevice));
+    if (m_d_ptrMatrices) {
+        gpuErrChk(cudaMemcpy(elsewhere.m_d_ptrMatrices,
+                             m_d_ptrMatrices,
+                             m_numMats * sizeof(T *),
+                             cudaMemcpyDeviceToDevice));
+    }
 }

 template<>

From 82af79f4ed827183c87a1593e996a10663c94474 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 19:42:34 +0000
Subject: [PATCH 6/7] update destroy(): reset ownership flags and report prior state
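The idiom in isolation (illustrative member names, not from
tensor.cuh):

    bool release() {
        bool owned = m_ownsData || m_ownsAux;  // remember before clearing
        if (m_ownsData) { /* free data buffer */ m_ownsData = false; }
        if (m_ownsAux)  { /* free aux buffer */  m_ownsAux  = false; }
        return owned;  // reports whether anything was actually freed
    }

Capturing willDestroy up front keeps the return value truthful after
the flags have been cleared, and clearing the flags makes a repeated
call a harmless no-op.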
---
 include/tensor.cuh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 4f59982..050d5d1 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -201,15 +201,18 @@ private:
         m_doDestroyPointersToMatrices = true;
     }

     bool destroy() {
+        bool willDestroy = m_doDestroyData || m_doDestroyPointersToMatrices;
         if (m_doDestroyData) {
             if (m_d_data) gpuErrChk(cudaFree(m_d_data));
             m_d_data = nullptr;
+            m_doDestroyData = false;
         }
         if (m_doDestroyPointersToMatrices) {
             if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
             m_d_ptrMatrices = nullptr;
+            m_doDestroyPointersToMatrices = false;
         }
-        return m_doDestroyData || m_doDestroyPointersToMatrices;
+        return willDestroy;
     }

From b7ecb3f13158c289e57a7de0761e026a3c083276 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 20:02:12 +0000
Subject: [PATCH 7/7] Get rid of pointersToMatrices
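Batched call sites now read the cached device array through the new
accessor instead of materialising a DTensor<T*> on every call; for
example (hypothetical sizes):

    DTensor<double> A(16, 16, 64, true);  // batch of 64 matrices
    double **devPtrs = A.ptrMatrices();   // no allocation, no copy
    // cuBLAS/cuSOLVER batched routines consume it directly, e.g.
    // cusolverDnDpotrfBatched(..., A.ptrMatrices(), ...);

The old pointersToMatrices() built a fresh host-side vector and
uploaded it on every factorisation or batched solve.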
---
 include/tensor.cuh | 71 +++++++++++++++++++++------------------------
 test/testTensor.cu | 23 ---------------------
 2 files changed, 32 insertions(+), 62 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 050d5d1..8007209 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -205,12 +205,10 @@ private:
         if (m_doDestroyData) {
             if (m_d_data) gpuErrChk(cudaFree(m_d_data));
             m_d_data = nullptr;
-            m_doDestroyData = false;
         }
         if (m_doDestroyPointersToMatrices) {
             if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
             m_d_ptrMatrices = nullptr;
-            m_doDestroyPointersToMatrices = false;
         }
         return willDestroy;
     }
@@ -320,6 +318,10 @@ public:
      */
     T *raw() const;

+    T **ptrMatrices() const {
+        return m_d_ptrMatrices;
+    }
+
     /**
      * @return number of rows
      */
@@ -842,6 +844,7 @@ inline T *DTensor<T>::raw() const {
     return m_d_data;
 }

+
 template<>
 inline DTensor<double> DTensor<double>::tr() const {
     DTensor<double> transposes(m_numCols, m_numRows, m_numMats);
@@ -960,17 +963,17 @@ inline T DTensor<T>::operator()(size_t i, size_t j, size_t k) const {
     return hostDst;
 }

-template<typename T>
-inline DTensor<T *> DTensor<T>::pointersToMatrices() const {
-    std::vector<T *> h_pointers(m_numMats);
-    size_t numelMat = m_numRows * m_numCols;
-    h_pointers[0] = m_d_data;
-    for (size_t i = 1; i < m_numMats; i++) {
-        h_pointers[i] = m_d_data + i * numelMat;
-    }
-    DTensor<T *> t(h_pointers, m_numMats, 1, 1);
-    return t;
-}
+//template<typename T>
+//inline DTensor<T *> DTensor<T>::pointersToMatrices() const {
+//    std::vector<T *> h_pointers(m_numMats);
+//    size_t numelMat = m_numRows * m_numCols;
+//    h_pointers[0] = m_d_data;
+//    for (size_t i = 1; i < m_numMats; i++) {
+//        h_pointers[i] = m_d_data + i * numelMat;
+//    }
+//    DTensor<T *> t(h_pointers, m_numMats, 1, 1);
+//    return t;
+//}

 template<>
 inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<double> &B, double alpha, double beta) {
@@ -1017,12 +1020,12 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float>
                                      nMat));
     } else {
         gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(),
-                                 CUBLAS_OP_N, CUBLAS_OP_N,
-                                 nRA, nCB, nCA, &_alpha,
-                                 A.raw(), nRA,
-                                 B.raw(), nCA,
-                                 &_beta,
-                                 raw(), nRA));
+                              CUBLAS_OP_N, CUBLAS_OP_N,
+                              nRA, nCB, nCA, &_alpha,
+                              A.raw(), nRA,
+                              B.raw(), nCA,
+                              &_beta,
+                              raw(), nRA));
     }
 }
@@ -1040,16 +1043,14 @@ inline void DTensor<double>::leastSquaresBatched(DTensor<double> &B) {
         throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
     int info = 0;
     DTensor<int> infoArray(batchSize);
-    DTensor<double *> As = pointersToMatrices();
-    DTensor<double *> Bs = B.pointersToMatrices();
     gpuErrChk(cublasDgelsBatched(Session::getInstance().cuBlasHandle(),
                                  CUBLAS_OP_N,
                                  m_numRows,
                                  m_numCols,
                                  nColsB,
-                                 As.raw(),
+                                 m_d_ptrMatrices,
                                  m_numRows,
-                                 Bs.raw(),
+                                 B.m_d_ptrMatrices,
                                  m_numRows,
                                  &info,
                                  infoArray.raw(),
@@ -1070,16 +1071,14 @@ inline void DTensor<float>::leastSquaresBatched(DTensor<float> &B) {
         throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
     int info = 0;
     DTensor<int> infoArray(batchSize);
-    DTensor<float *> As = pointersToMatrices();
-    DTensor<float *> Bs = B.pointersToMatrices();
     gpuErrChk(cublasSgelsBatched(Session::getInstance().cuBlasHandle(),
                                  CUBLAS_OP_N,
                                  m_numRows,
                                  m_numCols,
                                  nColsB,
-                                 As.raw(),
+                                 m_d_ptrMatrices,
                                  m_numRows,
-                                 Bs.raw(),
+                                 B.m_d_ptrMatrices,
                                  m_numRows,
                                  &info,
                                  infoArray.raw(),
@@ -1839,11 +1838,10 @@ public:
 template<>
 inline void CholeskyBatchFactoriser<double>::factorise() {
     if (m_factorisationDone) return;
-    DTensor<double *> ptrA = m_matrix->pointersToMatrices();
     gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
@@ -1853,11 +1851,10 @@ inline void CholeskyBatchFactoriser<float>::factorise() {
 template<>
 inline void CholeskyBatchFactoriser<float>::factorise() {
     if (m_factorisationDone) return;
-    DTensor<float *> ptrA = m_matrix->pointersToMatrices();
     gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
@@ -1871,15 +1868,13 @@ inline void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
         throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
     }
     if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
-    DTensor<double *> ptrA = m_matrix->pointersToMatrices();
-    DTensor<double *> ptrB = b.pointersToMatrices();
     gpuErrChk(cusolverDnDpotrsBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
                                       1, ///< only supports rhs = 1
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
-                                      ptrB.raw(),
+                                      b.ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
@@ -1892,15 +1887,13 @@ inline void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
         throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
     }
     if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
-    DTensor<float *> ptrA = m_matrix->pointersToMatrices();
-    DTensor<float *> ptrB = b.pointersToMatrices();
     gpuErrChk(cusolverDnSpotrsBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
                                       1, ///< only supports rhs = 1
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
-                                      ptrB.raw(),
+                                      b.ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
diff --git a/test/testTensor.cu b/test/testTensor.cu
index 0a33ffc..3cc7580 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -635,29 +635,6 @@ TEST_F(TensorTest, tensorMinusTensor) {
     tensorMinusTensor<double>();
 }

-/* ---------------------------------------
- * Tensor: pointers to matrices (on device)
- * --------------------------------------- */
-
-TEMPLATE_WITH_TYPE_T
-void tensorPointersToMatrices() {
-    std::vector<T> dataA = TENSOR_DATA_234A;
-    DTensor<T> A(dataA, 2, 3, 4);
-    DTensor<T *> pointers = A.pointersToMatrices();
-    EXPECT_EQ(4, pointers.numRows());
-    EXPECT_EQ(1, pointers.numCols());
-    EXPECT_EQ(1, pointers.numMats());
-    T *p1 = pointers(1, 0, 0); // pointer to matrix #1
-    T hostDst; // let's see what's there...
-    cudaMemcpy(&hostDst, p1, sizeof(T), cudaMemcpyDeviceToHost);
-    EXPECT_EQ(dataA[6], hostDst);
-}
-
-TEST_F(TensorTest, tensorPointersToMatrices) {
-    tensorPointersToMatrices<float>();
-    tensorPointersToMatrices<double>();
-    tensorPointersToMatrices<long>();
-}

 /* ---------------------------------------
  * Tensor: C = AB