diff --git a/include/tensor.cuh b/include/tensor.cuh
index 0a8d1d1..8007209 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -181,16 +181,36 @@ class DTensor {
 private:
     T *m_d_data = nullptr; ///< Pointer to device data
+    T **m_d_ptrMatrices = nullptr; ///< Pointer to matrices in tensor
     size_t m_numRows = 0; ///< Number of rows
     size_t m_numCols = 0; ///< Number of columns
     size_t m_numMats = 0; ///< Number of matrices
-    bool m_doDestroy = false; ///< Whether to destroy memory
+    bool m_doDestroyData = false; ///< Whether to destroy m_d_data
+    bool m_doDestroyPointersToMatrices = false; ///< Whether to destroy m_d_ptrMatrices
+
+    void makePointersToMatrices() {
+        std::vector<T *> h_pointers(m_numMats);
+        size_t numelMat = m_numRows * m_numCols;
+        h_pointers[0] = m_d_data;
+        for (size_t i = 1; i < m_numMats; i++) {
+            h_pointers[i] = m_d_data + i * numelMat;
+        }
+        size_t buffer_size = m_numMats * sizeof(T *);
+        gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers.data(), buffer_size, cudaMemcpyHostToDevice));
+        m_doDestroyPointersToMatrices = true;
+    }
 
     bool destroy() {
-        if (!m_doDestroy) return false;
-        if (m_d_data) cudaFree(m_d_data);
-        m_d_data = nullptr;
-        return true;
+        bool willDestroy = m_doDestroyData || m_doDestroyPointersToMatrices;
+        if (m_doDestroyData) {
+            if (m_d_data) gpuErrChk(cudaFree(m_d_data));
+            m_d_data = nullptr;
+        }
+        if (m_doDestroyPointersToMatrices) {
+            if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
+            m_d_ptrMatrices = nullptr;
+        }
+        return willDestroy;
     }
 
     /**
@@ -286,6 +306,10 @@ public:
      * @param axis axis to slice (0=rows, 1=columns, 2=matrices)
      * @param from index to slice axis from (zero-indexed)
      * @param to index to slice axis to (inclusive)
+     *
+     * @warning If axis=0 or axis=2, this method will (i) allocate memory on the GPU
+     * to store one element of size T* (pointer-to-T), and (ii) transfer one such
+     * element from the host to the device.
      */
     DTensor(const DTensor &other, size_t axis, size_t from, size_t to);
 
@@ -294,6 +318,10 @@
      */
     T *raw() const;
 
+    T **ptrMatrices() const {
+        return m_d_ptrMatrices;
+    }
+
     /**
      * @return number of rows
      */
@@ -515,9 +543,8 @@ DTensor<T> DTensor<T>::createRandomTensor(size_t numRows, size_t numCols, size_t
         auto randVec = generateIntRandomVector(numRows * numCols * numMats, low, hi);
         DTensor<T> a(randVec, numRows, numCols, numMats);
         return a;
-    } else {
-        throw std::invalid_argument("[createRandomTensor] unsupported type T");
     }
+    throw std::invalid_argument("[createRandomTensor] unsupported type T");
 }
 
 template<typename T>
@@ -533,6 +560,7 @@ void DTensor<T>::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats
     m_numRows = newNumRows;
     m_numCols = newNumCols;
     m_numMats = newNumMats;
+    // TODO create
 }
 
 template<typename T>
@@ -542,6 +570,7 @@ DTensor<T>::DTensor(size_t m, size_t n, size_t k, bool zero) {
     m_numMats = k;
     size_t size = m * n * k;
     allocateOnDevice(size, zero);
+    makePointersToMatrices();
 }
 
 template<typename T>
@@ -552,6 +581,7 @@ DTensor<T>::DTensor(const std::vector<T> &data, size_t m, size_t n, size_t k, St
     size_t size = m * n * k;
     allocateOnDevice(size);
     upload(data, mode);
+    makePointersToMatrices();
 }
 
 template<typename T>
@@ -563,6 +593,7 @@ DTensor<T>::DTensor(const DTensor<T> &other) {
     allocateOnDevice(m_numRows * m_numCols * m_numMats);
     gpuErrChk(cudaMemcpy(m_d_data, other.raw(), m_numRows * m_numCols * m_numMats * sizeof(T),
                          cudaMemcpyDeviceToDevice));
+    makePointersToMatrices();
 }
 
 template<typename T>
@@ -574,6 +605,7 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
         m_numRows = other.m_numRows;
         m_numCols = other.m_numCols;
         m_numMats = len;
+        m_d_ptrMatrices = other.m_d_ptrMatrices + from;
     } else if (axis == 1) {
         offset = other.m_numRows * from;
         m_numRows = other.m_numRows;
@@ -586,7 +618,12 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
         m_numMats = 1;
     }
     m_d_data = other.m_d_data + offset;
-    m_doDestroy = false;
+    m_doDestroyData = false; // no new memory allocated!
+    m_doDestroyPointersToMatrices = false; // no new auxiliary memory allocated!
+    if (axis != 2) {
+        // m_d_ptrMatrices is not needed for vectors and matrices
+        m_d_ptrMatrices = nullptr;
+    }
 }
 
 template<typename T>
@@ -595,9 +632,11 @@ DTensor<T>::DTensor(DTensor<T> &&other) {
     m_numRows = other.m_numRows;
     m_numMats = other.m_numMats;
     m_d_data = other.m_d_data;
-    m_doDestroy = true;
-    other.m_doDestroy = false;
+    m_d_ptrMatrices = other.m_d_ptrMatrices;
+    m_doDestroyData = true;
+    other.m_doDestroyData = false;
     other.m_d_data = nullptr;
+    other.m_d_ptrMatrices = nullptr;
     other.m_numCols = 0;
     other.m_numRows = 0;
     other.m_numMats = 0;
@@ -757,12 +796,19 @@ template<typename T>
 inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
     if (size <= 0) return false;
     destroy();
-    m_doDestroy = true;
-    size_t buffer_size = size * sizeof(T);
-    bool cudaStatus = cudaMalloc(&m_d_data, buffer_size);
+    m_doDestroyData = true;
+    /* Allocate memory for m_d_data */
+    size_t data_size_bytes = size * sizeof(T);
+    bool cudaStatus = cudaMalloc(&m_d_data, data_size_bytes);
     if (cudaStatus != cudaSuccess) return false;
-    if (zero) gpuErrChk(cudaMemset(m_d_data, 0, buffer_size)); // set to zero all elements
-    return true;
+    if (zero) gpuErrChk(cudaMemset(m_d_data, 0, data_size_bytes)); // set to zero all elements
+
+    /* Allocate memory for m_d_ptrMatrices */
+    size_t ptr_matrices_bytes = m_numMats * sizeof(T *);
+    cudaStatus = cudaMalloc(&m_d_ptrMatrices, ptr_matrices_bytes);
+    m_doDestroyPointersToMatrices = true;
+
+    return (cudaStatus == cudaSuccess);
 }
 
 template<typename T>
@@ -798,6 +844,7 @@ inline T *DTensor<T>::raw() const {
     return m_d_data;
 }
 
+
 template<>
 inline DTensor<double> DTensor<double>::tr() const {
     DTensor<double> transposes(m_numCols, m_numRows, m_numMats);
@@ -839,6 +886,12 @@ inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
                          m_d_data,
                          m_numRows * m_numCols * m_numMats * sizeof(T),
                          cudaMemcpyDeviceToDevice));
+    if (m_d_ptrMatrices) {
+        gpuErrChk(cudaMemcpy(elsewhere.m_d_ptrMatrices,
+                             m_d_ptrMatrices,
+                             m_numMats * sizeof(T *),
+                             cudaMemcpyDeviceToDevice));
+    }
 }
 
 template<>
@@ -854,7 +907,7 @@ DTensor<T> &DTensor<T>::operator=(const DTensor<T> &other) {
     m_numMats = other.m_numMats;
     m_numRows = other.m_numRows;
     m_numCols = other.m_numCols;
-    m_doDestroy = false;
+    m_doDestroyData = false;
     m_d_data = other.m_d_data;
     return *this;
 }
@@ -910,17 +963,17 @@ inline T DTensor<T>::operator()(size_t i, size_t j, size_t k) const {
     return hostDst;
 }
 
-template<typename T>
-inline DTensor<T *> DTensor<T>::pointersToMatrices() const {
-    std::vector<T *> h_pointers(m_numMats);
-    size_t numelMat = m_numRows * m_numCols;
-    h_pointers[0] = m_d_data;
-    for (size_t i = 1; i < m_numMats; i++) {
-        h_pointers[i] = m_d_data + i * numelMat;
-    }
-    DTensor<T *> t(h_pointers, m_numMats, 1, 1);
-    return t;
-}
+//template<typename T>
+//inline DTensor<T *> DTensor<T>::pointersToMatrices() const {
+//    std::vector<T *> h_pointers(m_numMats);
+//    size_t numelMat = m_numRows * m_numCols;
+//    h_pointers[0] = m_d_data;
+//    for (size_t i = 1; i < m_numMats; i++) {
+//        h_pointers[i] = m_d_data + i * numelMat;
+//    }
+//    DTensor<T *> t(h_pointers, m_numMats, 1, 1);
+//    return t;
+//}
 
 template<>
 inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<double> &B, double alpha, double beta) {
@@ -928,18 +981,25 @@ inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<doub
     size_t nRA = A.numRows();
     size_t nCA = A.numCols();
     size_t nCB = B.numCols();
-    DTensor<double *> ptrA = A.pointersToMatrices();
-    DTensor<double *> ptrB = B.pointersToMatrices();
-    DTensor<double *> ptr = pointersToMatrices();
     double _alpha = alpha, _beta = beta;
-    gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
-                                 CUBLAS_OP_N, CUBLAS_OP_N,
-                                 nRA, nCB, nCA, &_alpha,
-                                 ptrA.raw(), nRA,
-                                 ptrB.raw(), nCA,
-                                 &_beta,
-                                 ptr.raw(), nRA,
-                                 nMat));
+    if (nMat > 1) {
+        gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
+                                     CUBLAS_OP_N, CUBLAS_OP_N,
+                                     nRA, nCB, nCA, &_alpha,
+                                     A.m_d_ptrMatrices, nRA,
+                                     B.m_d_ptrMatrices, nCA,
+                                     &_beta,
+                                     m_d_ptrMatrices, nRA,
+                                     nMat));
+    } else {
+        gpuErrChk(cublasDgemm(Session::getInstance().cuBlasHandle(),
+                              CUBLAS_OP_N, CUBLAS_OP_N,
+                              nRA, nCB, nCA, &_alpha,
+                              A.raw(), nRA,
+                              B.raw(), nCA,
+                              &_beta,
+                              raw(), nRA));
+    }
 }
 
 template<>
@@ -948,18 +1008,25 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float>
     size_t nRA = A.numRows();
     size_t nCA = A.numCols();
     size_t nCB = B.numCols();
-    DTensor<float *> ptrA = A.pointersToMatrices();
-    DTensor<float *> ptrB = B.pointersToMatrices();
-    DTensor<float *> ptr = pointersToMatrices();
     float _alpha = alpha, _beta = beta;
-    gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
-                                 CUBLAS_OP_N, CUBLAS_OP_N,
-                                 nRA, nCB, nCA, &_alpha,
-                                 ptrA.raw(), nRA,
-                                 ptrB.raw(), nCA,
-                                 &_beta,
-                                 ptr.raw(), nRA,
-                                 nMat));
+    if (nMat > 1) {
+        gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
+                                     CUBLAS_OP_N, CUBLAS_OP_N,
+                                     nRA, nCB, nCA, &_alpha,
+                                     A.m_d_ptrMatrices, nRA,
+                                     B.m_d_ptrMatrices, nCA,
+                                     &_beta,
+                                     m_d_ptrMatrices, nRA,
+                                     nMat));
+    } else {
+        gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(),
+                              CUBLAS_OP_N, CUBLAS_OP_N,
+                              nRA, nCB, nCA, &_alpha,
+                              A.raw(), nRA,
+                              B.raw(), nCA,
+                              &_beta,
+                              raw(), nRA));
+    }
 }
 
 template<>
@@ -976,16 +1043,14 @@ inline void DTensor<double>::leastSquaresBatched(DTensor<double> &B) {
         throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
     int info = 0;
     DTensor<int> infoArray(batchSize);
-    DTensor<double *> As = pointersToMatrices();
-    DTensor<double *> Bs = B.pointersToMatrices();
     gpuErrChk(cublasDgelsBatched(Session::getInstance().cuBlasHandle(),
                                  CUBLAS_OP_N,
                                  m_numRows,
                                  m_numCols,
                                  nColsB,
-                                 As.raw(),
+                                 m_d_ptrMatrices,
                                  m_numRows,
-                                 Bs.raw(),
+                                 B.m_d_ptrMatrices,
                                  m_numRows,
                                  &info,
                                  infoArray.raw(),
@@ -1006,16 +1071,14 @@ inline void DTensor<float>::leastSquaresBatched(DTensor<float> &B) {
         throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
    int info = 0;
     DTensor<int> infoArray(batchSize);
-    DTensor<float *> As = pointersToMatrices();
-    DTensor<float *> Bs = B.pointersToMatrices();
     gpuErrChk(cublasSgelsBatched(Session::getInstance().cuBlasHandle(),
                                  CUBLAS_OP_N,
                                  m_numRows,
                                  m_numCols,
                                  nColsB,
-                                 As.raw(),
+                                 m_d_ptrMatrices,
                                  m_numRows,
-                                 Bs.raw(),
+                                 B.m_d_ptrMatrices,
                                  m_numRows,
                                  &info,
                                  infoArray.raw(),
@@ -1775,11 +1838,10 @@ public:
 template<>
 inline void CholeskyBatchFactoriser<double>::factorise() {
     if (m_factorisationDone) return;
-    DTensor<double *> ptrA = m_matrix->pointersToMatrices();
     gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
@@ -1789,11 +1851,10 @@ inline void CholeskyBatchFactoriser<float>::factorise() {
     if (m_factorisationDone) return;
-    DTensor<float *> ptrA = m_matrix->pointersToMatrices();
     gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
@@ -1807,15 +1868,13 @@ inline void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
         throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
     }
     if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
-    DTensor<double *> ptrA = m_matrix->pointersToMatrices();
-    DTensor<double *> ptrB = b.pointersToMatrices();
     gpuErrChk(cusolverDnDpotrsBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
                                       1, ///< only supports rhs = 1
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
-                                      ptrB.raw(),
+                                      b.ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
@@ -1828,15 +1887,13 @@ inline void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
         throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
     }
     if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
-    DTensor<float *> ptrA = m_matrix->pointersToMatrices();
-    DTensor<float *> ptrB = b.pointersToMatrices();
     gpuErrChk(cusolverDnSpotrsBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
                                       1, ///< only supports rhs = 1
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
-                                      ptrB.raw(),
+                                      b.ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
diff --git a/test/testTensor.cu b/test/testTensor.cu
index 0a33ffc..3cc7580 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -635,29 +635,6 @@ TEST_F(TensorTest, tensorMinusTensor) {
     tensorMinusTensor<double>();
 }
 
-/* ---------------------------------------
- * Tensor: pointers to matrices (on device)
- * --------------------------------------- */
-
-TEMPLATE_WITH_TYPE_T
-void tensorPointersToMatrices() {
-    std::vector<T> dataA = TENSOR_DATA_234A;
-    DTensor<T> A(dataA, 2, 3, 4);
-    DTensor<T *> pointers = A.pointersToMatrices();
-    EXPECT_EQ(4, pointers.numRows());
-    EXPECT_EQ(1, pointers.numCols());
-    EXPECT_EQ(1, pointers.numMats());
-    T *p1 = pointers(1, 0, 0); // pointer to matrix #1
-    T hostDst; // let's see what's there...
-    cudaMemcpy(&hostDst, p1, sizeof(T), cudaMemcpyDeviceToHost);
-    EXPECT_EQ(dataA[6], hostDst);
-}
-
-TEST_F(TensorTest, tensorPointersToMatrices) {
-    tensorPointersToMatrices<float>();
-    tensorPointersToMatrices<double>();
-    tensorPointersToMatrices<int>();
-}
 
 /* ---------------------------------------
  * Tensor: C = AB