From aa1317b6f25213016b6f6dbaa208086b9b53145a Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Thu, 7 Nov 2024 01:06:30 +0000
Subject: [PATCH 01/17] Create pointer-to-matrices memory (m_d_ptrMatrices)
Allocating and destroying memory
No memory leaks evident atm
---
include/tensor.cuh | 34 ++++++++++++++++++++++------------
1 file changed, 22 insertions(+), 12 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 0a8d1d1..00e5068 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -181,16 +181,22 @@ class DTensor {
private:
T *m_d_data = nullptr; ///< Pointer to device data
+ T **m_d_ptrMatrices = nullptr; ///< Pointer to matrices in tensor
size_t m_numRows = 0; ///< Number of rows
size_t m_numCols = 0; ///< Number of columns
size_t m_numMats = 0; ///< Number of matrices
- bool m_doDestroy = false; ///< Whether to destroy memory
+ bool m_doDestroyData = false; ///< Whether to destroy memory
+ bool m_doDestroyPtrMatrices = false; ///< Whether to destroy memory
- bool destroy() {
- if (!m_doDestroy) return false;
- if (m_d_data) cudaFree(m_d_data);
- m_d_data = nullptr;
- return true;
+ void destroy() {
+ if (m_doDestroyData) {
+ if (m_d_data) gpuErrChk(cudaFree(m_d_data));
+ m_d_data = nullptr;
+ }
+ if (m_doDestroyPtrMatrices) {
+ if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
+ m_d_ptrMatrices = nullptr;
+ }
}
/**
@@ -586,7 +592,7 @@ DTensor::DTensor(const DTensor &other, size_t axis, size_t from, size_t to
m_numMats = 1;
}
m_d_data = other.m_d_data + offset;
- m_doDestroy = false;
+ m_doDestroyData = false;
}
template
@@ -595,8 +601,8 @@ DTensor::DTensor(DTensor &&other) {
m_numRows = other.m_numRows;
m_numMats = other.m_numMats;
m_d_data = other.m_d_data;
- m_doDestroy = true;
- other.m_doDestroy = false;
+ m_doDestroyData = true;
+ other.m_doDestroyData = false;
other.m_d_data = nullptr;
other.m_numCols = 0;
other.m_numRows = 0;
@@ -757,12 +763,16 @@ template
inline bool DTensor::allocateOnDevice(size_t size, bool zero) {
if (size <= 0) return false;
destroy();
- m_doDestroy = true;
+ m_doDestroyData = true;
size_t buffer_size = size * sizeof(T);
bool cudaStatus = cudaMalloc(&m_d_data, buffer_size);
if (cudaStatus != cudaSuccess) return false;
if (zero) gpuErrChk(cudaMemset(m_d_data, 0, buffer_size)); // set to zero all elements
- return true;
+
+ m_doDestroyPtrMatrices = true;
+ cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T*));
+
+ return (cudaStatus != cudaSuccess);
}
template
@@ -854,7 +864,7 @@ DTensor &DTensor::operator=(const DTensor &other) {
m_numMats = other.m_numMats;
m_numRows = other.m_numRows;
m_numCols = other.m_numCols;
- m_doDestroy = false;
+ m_doDestroyData = false;
m_d_data = other.m_d_data;
return *this;
}
From 2ac61cf073fdcecd4229aac49c774c0e3a01dddf Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Thu, 7 Nov 2024 01:18:25 +0000
Subject: [PATCH 02/17] Initialise m_d_ptrMatrices
Introduce initialisePointersToMatricesData to initialise
m_d_ptrMatrices; appropriate checks for safety.
---
include/tensor.cuh | 25 ++++++++++++++++++++++---
1 file changed, 22 insertions(+), 3 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 00e5068..1be39a9 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -42,8 +42,7 @@ static std::random_device RND_DEVICE;
* @param hi
* @return
*/
-TEMPLATE_WITH_TYPE_T
-TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
std::vector generateRealRandomVector(size_t n, T low, T hi) {
std::mt19937_64 mersenne_engine(RND_DEVICE());
std::uniform_real_distribution dist(low, hi);
@@ -231,6 +230,23 @@ private:
*/
std::ostream &print(std::ostream &out) const;
+ void initialisePointersToMatricesData() {
+ /* Make sure m_d_ptrMatrices has been allocated */
+ if (!m_d_ptrMatrices || !m_doDestroyPtrMatrices) {
+ throw std::runtime_error("Unallocated memory (m_d_ptrMatrices)");
+ }
+ /* Host-based vector of pointers */
+ std::vector h_pointers(m_numMats);
+ size_t numelMat = m_numRows * m_numCols;
+ h_pointers[0] = m_d_data;
+ for (size_t i = 1; i < m_numMats; i++) {
+ h_pointers[i] = m_d_data + i * numelMat;
+ }
+ /* Upload data to m_d_ptrMatrices */
+ size_t buffer_size = m_numMats * sizeof(T *);
+ gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers.data(), buffer_size, cudaMemcpyHostToDevice));
+ }
+
public:
/**
@@ -770,7 +786,10 @@ inline bool DTensor::allocateOnDevice(size_t size, bool zero) {
if (zero) gpuErrChk(cudaMemset(m_d_data, 0, buffer_size)); // set to zero all elements
m_doDestroyPtrMatrices = true;
- cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T*));
+ cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T *));
+
+ /* Initialise m_d_ptrMatrices */
+ initialisePointersToMatricesData();
return (cudaStatus != cudaSuccess);
}
From 3338852def77ee30e7923f7a0a42f0a4394e3c94 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Thu, 7 Nov 2024 01:24:28 +0000
Subject: [PATCH 03/17] Use initialisePointersToMatricesData to initialise
m_d_ptrMatrices
Also in the slice constructor, keep m_d_ptrMatrices=nullptr
when we dont slice along axis=2
---
include/tensor.cuh | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 1be39a9..8ef0d90 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -564,6 +564,8 @@ DTensor::DTensor(size_t m, size_t n, size_t k, bool zero) {
m_numMats = k;
size_t size = m * n * k;
allocateOnDevice(size, zero);
+ /* Initialise m_d_ptrMatrices */
+ initialisePointersToMatricesData();
}
template
@@ -574,6 +576,8 @@ DTensor::DTensor(const std::vector &data, size_t m, size_t n, size_t k, St
size_t size = m * n * k;
allocateOnDevice(size);
upload(data, mode);
+ /* Initialise m_d_ptrMatrices */
+ initialisePointersToMatricesData();
}
template
@@ -585,6 +589,8 @@ DTensor::DTensor(const DTensor &other) {
allocateOnDevice(m_numRows * m_numCols * m_numMats);
gpuErrChk(cudaMemcpy(m_d_data, other.raw(), m_numRows * m_numCols * m_numMats * sizeof(T),
cudaMemcpyDeviceToDevice));
+ /* Initialise m_d_ptrMatrices */
+ initialisePointersToMatricesData();
}
template
@@ -609,6 +615,11 @@ DTensor::DTensor(const DTensor &other, size_t axis, size_t from, size_t to
}
m_d_data = other.m_d_data + offset;
m_doDestroyData = false;
+ m_doDestroyPtrMatrices = false;
+ if (axis != 2) {
+ // m_d_ptrMatrices is not needed for vectors and matrices
+ m_d_ptrMatrices = nullptr;
+ }
}
template
@@ -788,9 +799,6 @@ inline bool DTensor::allocateOnDevice(size_t size, bool zero) {
m_doDestroyPtrMatrices = true;
cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T *));
- /* Initialise m_d_ptrMatrices */
- initialisePointersToMatricesData();
-
return (cudaStatus != cudaSuccess);
}
From 36bf6ed6c60b9157b915f8fc651824cbb547df1e Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Thu, 7 Nov 2024 01:35:41 +0000
Subject: [PATCH 04/17] update addAB to use preallocated memory
use cublasDGemm if nMats=1
---
include/tensor.cuh | 37 +++++++++++++++++++++++++------------
1 file changed, 25 insertions(+), 12 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 8ef0d90..65ebdd5 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -624,12 +624,18 @@ DTensor::DTensor(const DTensor &other, size_t axis, size_t from, size_t to
template
DTensor::DTensor(DTensor &&other) {
+ /* Steal everything from other */
m_numCols = other.m_numCols;
m_numRows = other.m_numRows;
m_numMats = other.m_numMats;
m_d_data = other.m_d_data;
- m_doDestroyData = true;
+ m_doDestroyData = other.m_doDestroyData;
+ m_doDestroyPtrMatrices = other.m_doDestroyPtrMatrices;
+ m_d_ptrMatrices = other.m_d_ptrMatrices;
+ /* Invalidate other */
+ other.m_doDestroyPtrMatrices = false;
other.m_doDestroyData = false;
+ other.m_d_ptrMatrices = nullptr;
other.m_d_data = nullptr;
other.m_numCols = 0;
other.m_numRows = 0;
@@ -965,18 +971,25 @@ inline void DTensor::addAB(const DTensor &A, const DTensor ptrA = A.pointersToMatrices();
- DTensor ptrB = B.pointersToMatrices();
- DTensor ptr = pointersToMatrices();
double _alpha = alpha, _beta = beta;
- gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
- CUBLAS_OP_N, CUBLAS_OP_N,
- nRA, nCB, nCA, &_alpha,
- ptrA.raw(), nRA,
- ptrB.raw(), nCA,
- &_beta,
- ptr.raw(), nRA,
- nMat));
+ if (nMat > 1) {
+ gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
+ CUBLAS_OP_N, CUBLAS_OP_N,
+ nRA, nCB, nCA, &_alpha,
+ A.m_d_ptrMatrices, nRA,
+ B.m_d_ptrMatrices, nCA,
+ &_beta,
+ m_d_ptrMatrices, nRA,
+ nMat));
+ } else {
+ gpuErrChk(cublasDgemm(Session::getInstance().cuBlasHandle(),
+ CUBLAS_OP_N, CUBLAS_OP_N,
+ nRA, nCB, nCA, &_alpha,
+ A.raw(), nRA,
+ B.raw(), nCA,
+ &_beta,
+ raw(), nRA));
+ }
}
template<>
From 4e5da4dd952be5a881ddb094e35c8796f6f2e770 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Thu, 7 Nov 2024 01:39:25 +0000
Subject: [PATCH 05/17] mirror implementation for DTensor::addAB
---
include/tensor.cuh | 29 ++++++++++++++++++-----------
1 file changed, 18 insertions(+), 11 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 65ebdd5..3d650d4 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -998,18 +998,25 @@ inline void DTensor::addAB(const DTensor &A, const DTensor
size_t nRA = A.numRows();
size_t nCA = A.numCols();
size_t nCB = B.numCols();
- DTensor ptrA = A.pointersToMatrices();
- DTensor ptrB = B.pointersToMatrices();
- DTensor ptr = pointersToMatrices();
float _alpha = alpha, _beta = beta;
- gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
- CUBLAS_OP_N, CUBLAS_OP_N,
- nRA, nCB, nCA, &_alpha,
- ptrA.raw(), nRA,
- ptrB.raw(), nCA,
- &_beta,
- ptr.raw(), nRA,
- nMat));
+ if (nMat > 1) {
+ gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
+ CUBLAS_OP_N, CUBLAS_OP_N,
+ nRA, nCB, nCA, &_alpha,
+ A.m_d_ptrMatrices, nRA,
+ B.m_d_ptrMatrices, nCA,
+ &_beta,
+ m_d_ptrMatrices, nRA,
+ nMat));
+ } else {
+ gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(),
+ CUBLAS_OP_N, CUBLAS_OP_N,
+ nRA, nCB, nCA, &_alpha,
+ A.raw(), nRA,
+ B.raw(), nCA,
+ &_beta,
+ raw(), nRA));
+ }
}
template<>
From 296efd4b18b6cc68339462d6d953fdfbd59fbbea Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Thu, 7 Nov 2024 01:46:59 +0000
Subject: [PATCH 06/17] Get rid of pointersToMatrices() in LS
---
include/tensor.cuh | 17 +++++++----------
1 file changed, 7 insertions(+), 10 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 3d650d4..465eacd 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -359,6 +359,7 @@ public:
* Creates a vector of pointers to the matrices of this tensor.
* The vector is an (n,1,1)-tensor, where n is the number of matrices in this tensor.
* @return vector of pointers to the first element of each matrix
+ * @deprecated
*/
DTensor pointersToMatrices() const;
@@ -1032,17 +1033,15 @@ inline void DTensor::leastSquaresBatched(DTensor &B) {
if (m_numCols > m_numRows)
throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
int info = 0;
- DTensor infoArray(batchSize);
- DTensor As = pointersToMatrices();
- DTensor Bs = B.pointersToMatrices();
+ DTensor infoArray(batchSize); // TODO consider preallocating?
gpuErrChk(cublasDgelsBatched(Session::getInstance().cuBlasHandle(),
CUBLAS_OP_N,
m_numRows,
m_numCols,
nColsB,
- As.raw(),
+ m_d_ptrMatrices,
m_numRows,
- Bs.raw(),
+ B.m_d_ptrMatrices,
m_numRows,
&info,
infoArray.raw(),
@@ -1062,17 +1061,15 @@ inline void DTensor::leastSquaresBatched(DTensor &B) {
if (m_numCols > m_numRows)
throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
int info = 0;
- DTensor infoArray(batchSize);
- DTensor As = pointersToMatrices();
- DTensor Bs = B.pointersToMatrices();
+ DTensor infoArray(batchSize); // TODO consider preallocating?
gpuErrChk(cublasSgelsBatched(Session::getInstance().cuBlasHandle(),
CUBLAS_OP_N,
m_numRows,
m_numCols,
nColsB,
- As.raw(),
+ m_d_ptrMatrices,
m_numRows,
- Bs.raw(),
+ B.m_d_ptrMatrices,
m_numRows,
&info,
infoArray.raw(),
From 02564d57cb8de04850df8a4f961ecad7e3338ff5 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Thu, 7 Nov 2024 01:53:38 +0000
Subject: [PATCH 07/17] New method DTensor::ptrMatrices() exposes
m_d_ptrMatrices
---
include/tensor.cuh | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 465eacd..6f15916 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -316,6 +316,12 @@ public:
*/
T *raw() const;
+ /**
+ * Pointers to matrices (on device)
+ * @return
+ */
+ T **ptrMatrices();
+
/**
* @return number of rows
*/
@@ -842,6 +848,12 @@ inline T *DTensor::raw() const {
return m_d_data;
}
+template
+inline T **DTensor::ptrMatrices() {
+ return m_d_ptrMatrices;
+}
+
+
template<>
inline DTensor DTensor::tr() const {
DTensor transposes(m_numCols, m_numRows, m_numMats);
From f2e86c79e274a3f022965e07008be071152154ed Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Thu, 7 Nov 2024 01:57:44 +0000
Subject: [PATCH 08/17] Completely got rid of pointersToMatrices()
---
include/tensor.cuh | 38 ++++++--------------------------------
test/testTensor.cu | 24 ------------------------
2 files changed, 6 insertions(+), 56 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 6f15916..9fc4703 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -361,14 +361,6 @@ public:
*/
void deviceCopyTo(DTensor &other) const;
- /**
- * Creates a vector of pointers to the matrices of this tensor.
- * The vector is an (n,1,1)-tensor, where n is the number of matrices in this tensor.
- * @return vector of pointers to the first element of each matrix
- * @deprecated
- */
- DTensor pointersToMatrices() const;
-
/**
* Slices rows from specified matrix.
* @param rowsFrom index to slice rows from (zero-indexed)
@@ -966,18 +958,6 @@ inline T DTensor::operator()(size_t i, size_t j, size_t k) const {
return hostDst;
}
-template
-inline DTensor DTensor::pointersToMatrices() const {
- std::vector h_pointers(m_numMats);
- size_t numelMat = m_numRows * m_numCols;
- h_pointers[0] = m_d_data;
- for (size_t i = 1; i < m_numMats; i++) {
- h_pointers[i] = m_d_data + i * numelMat;
- }
- DTensor t(h_pointers, m_numMats, 1, 1);
- return t;
-}
-
template<>
inline void DTensor::addAB(const DTensor &A, const DTensor &B, double alpha, double beta) {
size_t nMat = A.numMats();
@@ -1841,11 +1821,10 @@ public:
template<>
inline void CholeskyBatchFactoriser::factorise() {
if (m_factorisationDone) return;
- DTensor ptrA = m_matrix->pointersToMatrices();
gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
CUBLAS_FILL_MODE_LOWER,
m_numRows,
- ptrA.raw(),
+ m_matrix->ptrMatrices(),
m_numRows,
m_deviceInfo->raw(),
m_numMats));
@@ -1855,11 +1834,10 @@ inline void CholeskyBatchFactoriser::factorise() {
template<>
inline void CholeskyBatchFactoriser::factorise() {
if (m_factorisationDone) return;
- DTensor ptrA = m_matrix->pointersToMatrices();
gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
CUBLAS_FILL_MODE_LOWER,
m_numRows,
- ptrA.raw(),
+ m_matrix->ptrMatrices(),
m_numRows,
m_deviceInfo->raw(),
m_numMats));
@@ -1873,15 +1851,13 @@ inline void CholeskyBatchFactoriser::solve(DTensor &b) {
throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
}
if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
- DTensor ptrA = m_matrix->pointersToMatrices();
- DTensor ptrB = b.pointersToMatrices();
gpuErrChk(cusolverDnDpotrsBatched(Session::getInstance().cuSolverHandle(),
CUBLAS_FILL_MODE_LOWER,
m_numRows,
1, ///< only supports rhs = 1
- ptrA.raw(),
+ m_matrix->ptrMatrices(),
m_numRows,
- ptrB.raw(),
+ b.ptrMatrices(),
m_numRows,
m_deviceInfo->raw(),
m_numMats));
@@ -1894,15 +1870,13 @@ inline void CholeskyBatchFactoriser::solve(DTensor &b) {
throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
}
if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
- DTensor ptrA = m_matrix->pointersToMatrices();
- DTensor ptrB = b.pointersToMatrices();
gpuErrChk(cusolverDnSpotrsBatched(Session::getInstance().cuSolverHandle(),
CUBLAS_FILL_MODE_LOWER,
m_numRows,
1, ///< only supports rhs = 1
- ptrA.raw(),
+ m_matrix->ptrMatrices(),
m_numRows,
- ptrB.raw(),
+ b.ptrMatrices(),
m_numRows,
m_deviceInfo->raw(),
m_numMats));
diff --git a/test/testTensor.cu b/test/testTensor.cu
index 0a33ffc..902a27c 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -635,30 +635,6 @@ TEST_F(TensorTest, tensorMinusTensor) {
tensorMinusTensor();
}
-/* ---------------------------------------
- * Tensor: pointers to matrices (on device)
- * --------------------------------------- */
-
-TEMPLATE_WITH_TYPE_T
-void tensorPointersToMatrices() {
- std::vector dataA = TENSOR_DATA_234A;
- DTensor A(dataA, 2, 3, 4);
- DTensor pointers = A.pointersToMatrices();
- EXPECT_EQ(4, pointers.numRows());
- EXPECT_EQ(1, pointers.numCols());
- EXPECT_EQ(1, pointers.numMats());
- T *p1 = pointers(1, 0, 0); // pointer to matrix #1
- T hostDst; // let's see what's there...
- cudaMemcpy(&hostDst, p1, sizeof(T), cudaMemcpyDeviceToHost);
- EXPECT_EQ(dataA[6], hostDst);
-}
-
-TEST_F(TensorTest, tensorPointersToMatrices) {
- tensorPointersToMatrices();
- tensorPointersToMatrices();
- tensorPointersToMatrices();
-}
-
/* ---------------------------------------
* Tensor: C = AB
* --------------------------------------- */
From 424a251a1d4008932e8791d1a0843b6575e6770e Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Thu, 7 Nov 2024 02:36:51 +0000
Subject: [PATCH 09/17] No memory allocation for m_d_ptrMatrices unless nMats >
1
---
CHANGELOG.md | 9 +++++++++
include/tensor.cuh | 14 ++++++++++----
2 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e409628..4d19eb6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## v1.3.1 - 8-11-2024
+
+### Fixed
+
+- Memory management improvements: we got rid of `pointerToMatrices`, which would unnecessarily allocate memory and `addAB` does not allocate any new memory internally.
+
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 9fc4703..6cd64e1 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -232,8 +232,8 @@ private:
void initialisePointersToMatricesData() {
/* Make sure m_d_ptrMatrices has been allocated */
- if (!m_d_ptrMatrices || !m_doDestroyPtrMatrices) {
- throw std::runtime_error("Unallocated memory (m_d_ptrMatrices)");
+ if (m_numMats <= 1 | !m_d_ptrMatrices || !m_doDestroyPtrMatrices) {
+ return;
}
/* Host-based vector of pointers */
std::vector h_pointers(m_numMats);
@@ -287,6 +287,7 @@ public:
* @param n number of columns
* @param k number of matrices
*/
+
DTensor(const std::vector &data, size_t m, size_t n = 1, size_t k = 1,
StorageMode mode = StorageMode::defaultMajor);
@@ -553,6 +554,7 @@ void DTensor::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats
}
m_numRows = newNumRows;
m_numCols = newNumCols;
+ // TODO allocate or reallocate new memory
m_numMats = newNumMats;
}
@@ -801,8 +803,12 @@ inline bool DTensor::allocateOnDevice(size_t size, bool zero) {
if (cudaStatus != cudaSuccess) return false;
if (zero) gpuErrChk(cudaMemset(m_d_data, 0, buffer_size)); // set to zero all elements
- m_doDestroyPtrMatrices = true;
- cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T *));
+ if (numMats() > 1) {
+ m_doDestroyPtrMatrices = true;
+ cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T *));
+ } else {
+ m_doDestroyPtrMatrices = false;
+ }
return (cudaStatus != cudaSuccess);
}
From b2857cd6c135c241ce655dc71422688b94c217b5 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Thu, 7 Nov 2024 02:47:51 +0000
Subject: [PATCH 10/17] ptrMatrices now declared as a const method
---
include/tensor.cuh | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 6cd64e1..3c8c83c 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -287,7 +287,6 @@ public:
* @param n number of columns
* @param k number of matrices
*/
-
DTensor(const std::vector &data, size_t m, size_t n = 1, size_t k = 1,
StorageMode mode = StorageMode::defaultMajor);
@@ -321,7 +320,7 @@ public:
* Pointers to matrices (on device)
* @return
*/
- T **ptrMatrices();
+ T **ptrMatrices() const;
/**
* @return number of rows
@@ -847,7 +846,7 @@ inline T *DTensor::raw() const {
}
template
-inline T **DTensor::ptrMatrices() {
+inline T **DTensor::ptrMatrices() const {
return m_d_ptrMatrices;
}
From 7c32dae4ca0a2e6fd91281225cdee35d66521ae4 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Thu, 7 Nov 2024 13:55:36 +0000
Subject: [PATCH 11/17] Update DTensor::reshape
When necessary, memory is reallocated for m_d_ptrMatrices
within reshape
---
include/tensor.cuh | 64 ++++++++++++++++++++++++++++++----------------
1 file changed, 42 insertions(+), 22 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 3c8c83c..ff35317 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -230,32 +230,24 @@ private:
*/
std::ostream &print(std::ostream &out) const;
- void initialisePointersToMatricesData() {
- /* Make sure m_d_ptrMatrices has been allocated */
- if (m_numMats <= 1 | !m_d_ptrMatrices || !m_doDestroyPtrMatrices) {
- return;
- }
- /* Host-based vector of pointers */
- std::vector h_pointers(m_numMats);
- size_t numelMat = m_numRows * m_numCols;
- h_pointers[0] = m_d_data;
- for (size_t i = 1; i < m_numMats; i++) {
- h_pointers[i] = m_d_data + i * numelMat;
- }
- /* Upload data to m_d_ptrMatrices */
- size_t buffer_size = m_numMats * sizeof(T *);
- gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers.data(), buffer_size, cudaMemcpyHostToDevice));
- }
+ /**
+ * Initialises an array of pointers to the sub-matrices of the
+ * tensor (on the device). No allocation takes place if the tensor
+ * has only one matrix.
+ */
+ void initialisePointersToMatricesData();
public:
/**
* Create a tensor with random elements
- * @param numRows
- * @param numCols
- * @param numMats
- * @param low
- * @param hi
+ * @param numRows number of rows
+ * @param numCols number of columns
+ * @param numMats number of matrices
+ * @param low minimum value of random elements
+ * @param hi maximum value of random elements
+ *
+ * @throws std::invalid_argument if T is other than double, float, or int
*/
static DTensor createRandomTensor(size_t numRows, size_t numCols, size_t numMats, T low, T hi);
@@ -526,6 +518,24 @@ public:
}; /* END OF DTENSOR */
+template
+void DTensor::initialisePointersToMatricesData() {
+ /* Make sure m_d_ptrMatrices has been allocated */
+ if (m_numMats <= 1 || !m_d_ptrMatrices || !m_doDestroyPtrMatrices) {
+ return;
+ }
+ /* Host-based vector of pointers */
+ std::vector h_pointers(m_numMats);
+ size_t numelMat = m_numRows * m_numCols;
+ h_pointers[0] = m_d_data;
+ for (size_t i = 1; i < m_numMats; i++) {
+ h_pointers[i] = m_d_data + i * numelMat;
+ }
+ /* Upload data to m_d_ptrMatrices */
+ size_t buffer_size = m_numMats * sizeof(T *);
+ gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers.data(), buffer_size, cudaMemcpyHostToDevice));
+}
+
template
DTensor DTensor::createRandomTensor(size_t numRows, size_t numCols, size_t numMats, T low, T hi) {
if constexpr (std::is_floating_point::value) {
@@ -543,6 +553,7 @@ DTensor DTensor::createRandomTensor(size_t numRows, size_t numCols, size_t
template
void DTensor::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats) {
+ if (m_numRows == newNumRows && m_numCols == newNumCols && m_numMats == newNumMats) return;
size_t newNumElements = newNumRows * newNumCols * newNumMats;
if (numEl() != newNumElements) {
char errMessage[256];
@@ -553,8 +564,17 @@ void DTensor::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats
}
m_numRows = newNumRows;
m_numCols = newNumCols;
- // TODO allocate or reallocate new memory
m_numMats = newNumMats;
+ /* Free the memory for m_d_ptrMatrices */
+ if (m_d_ptrMatrices && m_doDestroyPtrMatrices) {
+ gpuErrChk(cudaFree(m_d_ptrMatrices));
+ m_d_ptrMatrices = nullptr;
+ }
+ /* Reallocate memory for m_d_ptrMatrices, if necessary */
+ if (m_numMats > 1) {
+ gpuErrChk(cudaMalloc(&m_d_ptrMatrices, m_numMats * sizeof(T *)));
+ }
+ initialisePointersToMatricesData();
}
template
From 225ba0da797f37cc4911efc9a33b99d721401d3d Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Thu, 7 Nov 2024 14:03:26 +0000
Subject: [PATCH 12/17] Better memory management when reshaping
No need to free/reallocate when we can reuse the already
allocated memory space (when the number of matrices
decreases); no new memory allocation when numMats=1
---
include/tensor.cuh | 23 ++++++++++++++---------
1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index ff35317..44a448a 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -555,6 +555,7 @@ template
void DTensor::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats) {
if (m_numRows == newNumRows && m_numCols == newNumCols && m_numMats == newNumMats) return;
size_t newNumElements = newNumRows * newNumCols * newNumMats;
+ /* Check whether dimensions are compatible */
if (numEl() != newNumElements) {
char errMessage[256];
sprintf(errMessage,
@@ -562,18 +563,22 @@ void DTensor::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats
numRows(), numRows(), numMats(), numEl(), newNumRows, newNumCols, newNumMats, newNumElements);
throw std::invalid_argument(errMessage);
}
+
+ /* Only free/reallocate if newNumMats > m_numMats
+ * otherwise, reuse the already allocated memory space */
+ if (newNumMats > m_numMats) {
+ /* Free the memory for m_d_ptrMatrices */
+ if (m_d_ptrMatrices && m_doDestroyPtrMatrices) {
+ gpuErrChk(cudaFree(m_d_ptrMatrices));
+ m_d_ptrMatrices = nullptr;
+ }
+ /* Reallocate memory for m_d_ptrMatrices, if necessary */
+ if (newNumMats > 1) gpuErrChk(cudaMalloc(&m_d_ptrMatrices, newNumMats * sizeof(T *)));
+ }
+
m_numRows = newNumRows;
m_numCols = newNumCols;
m_numMats = newNumMats;
- /* Free the memory for m_d_ptrMatrices */
- if (m_d_ptrMatrices && m_doDestroyPtrMatrices) {
- gpuErrChk(cudaFree(m_d_ptrMatrices));
- m_d_ptrMatrices = nullptr;
- }
- /* Reallocate memory for m_d_ptrMatrices, if necessary */
- if (m_numMats > 1) {
- gpuErrChk(cudaMalloc(&m_d_ptrMatrices, m_numMats * sizeof(T *)));
- }
initialisePointersToMatricesData();
}
From d4b5472bd3d555815e9e69e7a51ed01330a69152 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Thu, 7 Nov 2024 14:32:49 +0000
Subject: [PATCH 13/17] Safe memory allocation in reshape and unit test
---
include/tensor.cuh | 6 +-
test/testTensor.cu | 752 +++++++++++++++++++++++++++------------------
2 files changed, 451 insertions(+), 307 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 44a448a..55b661b 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -571,9 +571,13 @@ void DTensor::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats
if (m_d_ptrMatrices && m_doDestroyPtrMatrices) {
gpuErrChk(cudaFree(m_d_ptrMatrices));
m_d_ptrMatrices = nullptr;
+ m_doDestroyPtrMatrices = false;
}
/* Reallocate memory for m_d_ptrMatrices, if necessary */
- if (newNumMats > 1) gpuErrChk(cudaMalloc(&m_d_ptrMatrices, newNumMats * sizeof(T *)));
+ if (newNumMats > 1) {
+ gpuErrChk(cudaMalloc(&m_d_ptrMatrices, newNumMats * sizeof(T *)));
+ m_doDestroyPtrMatrices = true;
+ }
}
m_numRows = newNumRows;
diff --git a/test/testTensor.cu b/test/testTensor.cu
index 902a27c..982934c 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -26,7 +26,7 @@ protected:
TEMPLATE_WITH_TYPE_T
void tensorConstructionZero() {
- DTensor zero(2, 3, 4, true);
+ DTensor zero(2, 3, 4, true);
EXPECT_EQ(2, zero.numRows());
EXPECT_EQ(3, zero.numCols());
EXPECT_EQ(4, zero.numMats());
@@ -36,10 +36,14 @@ void tensorConstructionZero() {
EXPECT_EQ(expectedResult, zeroDown);
}
-TEST_F(TensorTest, tensorConstructionZero) {
- tensorConstructionZero();
- tensorConstructionZero();
- tensorConstructionZero();
+TEST_F(TensorTest, tensorConstructionZero
+) {
+tensorConstructionZero();
+
+tensorConstructionZero();
+
+tensorConstructionZero();
+
}
/* ---------------------------------------
@@ -65,21 +69,21 @@ void tensorConstructionStorageMode() {
std::vector Rm = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
std::vector hostData(rows * cols * mats);
// test constructor
- DTensor testCm(Cm, rows, cols, mats, columnMajor);
- DTensor testRm(Rm, rows, cols, mats, rowMajor);
+ DTensor testCm(Cm, rows, cols, mats, columnMajor);
+ DTensor testRm(Rm, rows, cols, mats, rowMajor);
testCm.download(hostData);
EXPECT_EQ(Cm, hostData);
testRm.download(hostData);
EXPECT_EQ(Cm, hostData);
// test .upload()
- DTensor testSplitCm(rows, cols, mats);
- DTensor ACm(testSplitCm, 2, 0, 0);
- DTensor BCm(testSplitCm, 2, 1, 1);
+ DTensor testSplitCm(rows, cols, mats);
+ DTensor ACm(testSplitCm, 2, 0, 0);
+ DTensor BCm(testSplitCm, 2, 1, 1);
ACm.upload(aCm, columnMajor);
BCm.upload(bCm, columnMajor);
- DTensor testSplitRm(rows, cols, mats);
- DTensor ARm(testSplitRm, 2, 0, 0);
- DTensor BRm(testSplitRm, 2, 1, 1);
+ DTensor testSplitRm(rows, cols, mats);
+ DTensor ARm(testSplitRm, 2, 0, 0);
+ DTensor BRm(testSplitRm, 2, 1, 1);
ARm.upload(aRm, rowMajor);
BRm.upload(bRm, rowMajor);
testSplitCm.download(hostData);
@@ -88,10 +92,14 @@ void tensorConstructionStorageMode() {
EXPECT_EQ(Cm, hostData);
}
-TEST_F(TensorTest, tensorConstructionStorageMode) {
- tensorConstructionStorageMode();
- tensorConstructionStorageMode();
- tensorConstructionStorageMode();
+TEST_F(TensorTest, tensorConstructionStorageMode
+) {
+tensorConstructionStorageMode();
+
+tensorConstructionStorageMode();
+
+tensorConstructionStorageMode();
+
}
/* ---------------------------------------
@@ -109,10 +117,14 @@ void randomTensorCreation() {
EXPECT_TRUE(rEle >= -1 && rEle <= 1);
}
-TEST_F(TensorTest, randomTensorCreation) {
- randomTensorCreation();
- randomTensorCreation();
- randomTensorCreation();
+TEST_F(TensorTest, randomTensorCreation
+) {
+randomTensorCreation();
+
+randomTensorCreation();
+
+randomTensorCreation();
+
}
/* ---------------------------------------
@@ -121,17 +133,23 @@ TEST_F(TensorTest, randomTensorCreation) {
TEMPLATE_WITH_TYPE_T
void tensorMoveConstructor() {
- DTensor zero(2, 3, 4, true);
- DTensor x(std::move(zero));
- DTensor y(DTensor{100, 10, 1000});
+ DTensor zero(2, 3, 4, true);
+ DTensor x(std::move(zero));
+ DTensor y(DTensor < T > {100, 10, 1000});
}
-TEST_F(TensorTest, tensorMoveConstructor) {
- tensorMoveConstructor();
- tensorMoveConstructor();
- tensorMoveConstructor();
- tensorMoveConstructor();
- tensorMoveConstructor();
+TEST_F(TensorTest, tensorMoveConstructor
+) {
+tensorMoveConstructor();
+
+tensorMoveConstructor();
+
+tensorMoveConstructor();
+
+tensorMoveConstructor();
+
+tensorMoveConstructor();
+
}
/* ---------------------------------------
@@ -142,17 +160,21 @@ TEST_F(TensorTest, tensorMoveConstructor) {
TEMPLATE_WITH_TYPE_T
void tensorConstructionFromVector() {
std::vector data = TENSOR_DATA_234A;
- DTensor tenz(data, 2, 3, 4);
+ DTensor tenz(data, 2, 3, 4);
EXPECT_EQ(2, tenz.numRows());
EXPECT_EQ(3, tenz.numCols());
EXPECT_EQ(4, tenz.numMats());
EXPECT_EQ(2 * 3 * 4, tenz.numEl());
}
-TEST_F(TensorTest, tensorConstructionFromVector) {
- tensorConstructionFromVector();
- tensorConstructionFromVector();
- tensorConstructionFromVector();
+TEST_F(TensorTest, tensorConstructionFromVector
+) {
+tensorConstructionFromVector();
+
+tensorConstructionFromVector();
+
+tensorConstructionFromVector();
+
}
/* ---------------------------------------
@@ -162,8 +184,8 @@ TEST_F(TensorTest, tensorConstructionFromVector) {
TEMPLATE_WITH_TYPE_T
void tensorCopyConstructor() {
std::vector data = TENSOR_DATA_234A;
- DTensor tenz(data, 2, 3, 4);
- DTensor tenzCp(tenz);
+ DTensor tenz(data, 2, 3, 4);
+ DTensor tenzCp(tenz);
EXPECT_EQ(2, tenzCp.numRows());
EXPECT_EQ(3, tenzCp.numCols());
EXPECT_EQ(4, tenzCp.numMats());
@@ -174,10 +196,14 @@ void tensorCopyConstructor() {
EXPECT_NE(tenz.raw(), tenzCp.raw());
}
-TEST_F(TensorTest, tensorCopyConstructor) {
- tensorCopyConstructor();
- tensorCopyConstructor();
- tensorCopyConstructor();
+TEST_F(TensorTest, tensorCopyConstructor
+) {
+tensorCopyConstructor();
+
+tensorCopyConstructor();
+
+tensorCopyConstructor();
+
}
/* ---------------------------------------
@@ -188,18 +214,22 @@ TEST_F(TensorTest, tensorCopyConstructor) {
TEMPLATE_WITH_TYPE_T
void tensorSlicingConstructorAxis2() {
std::vector data = TENSOR_DATA_234A;
- DTensor tens(data, 2, 3, 4);
- DTensor tensSlice(tens, 2, 0, 1); // matrices #0 and #1
+ DTensor tens(data, 2, 3, 4);
+ DTensor tensSlice(tens, 2, 0, 1); // matrices #0 and #1
EXPECT_EQ(2, tensSlice.numRows());
EXPECT_EQ(3, tensSlice.numCols());
EXPECT_EQ(2, tensSlice.numMats());
EXPECT_EQ(tens.raw(), tensSlice.raw()); // it is indeed a slice
}
-TEST_F(TensorTest, tensorSlicingConstructorAxis2) {
- tensorSlicingConstructorAxis2();
- tensorSlicingConstructorAxis2();
- tensorSlicingConstructorAxis2();
+TEST_F(TensorTest, tensorSlicingConstructorAxis2
+) {
+tensorSlicingConstructorAxis2();
+
+tensorSlicingConstructorAxis2();
+
+tensorSlicingConstructorAxis2();
+
}
/* ---------------------------------------
@@ -210,8 +240,8 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis2) {
TEMPLATE_WITH_TYPE_T
void tensorSlicingConstructorAxis1() {
std::vector data = TENSOR_DATA_234A;
- DTensor tenz(data, 2, 3, 4);
- DTensor tenzSlice(tenz, 1, 1, 2); // columns from 1 to 2
+ DTensor tenz(data, 2, 3, 4);
+ DTensor tenzSlice(tenz, 1, 1, 2); // columns from 1 to 2
EXPECT_EQ(2, tenzSlice.numRows());
EXPECT_EQ(2, tenzSlice.numCols());
EXPECT_EQ(1, tenzSlice.numMats());
@@ -221,10 +251,14 @@ void tensorSlicingConstructorAxis1() {
EXPECT_EQ(expected, tenzSliceDown);
}
-TEST_F(TensorTest, tensorSlicingConstructorAxis1) {
- tensorSlicingConstructorAxis1();
- tensorSlicingConstructorAxis1();
- tensorSlicingConstructorAxis1();
+TEST_F(TensorTest, tensorSlicingConstructorAxis1
+) {
+tensorSlicingConstructorAxis1();
+
+tensorSlicingConstructorAxis1();
+
+tensorSlicingConstructorAxis1();
+
}
/* ---------------------------------------
@@ -235,8 +269,8 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis1) {
TEMPLATE_WITH_TYPE_T
void tensorSlicingConstructorAxis0() {
std::vector data = TENSOR_DATA_234A;
- DTensor tenz(data, 2, 3, 4);
- DTensor tenzSlice(tenz, 0, 2, 3); // elements 2..3
+ DTensor tenz(data, 2, 3, 4);
+ DTensor tenzSlice(tenz, 0, 2, 3); // elements 2..3
EXPECT_EQ(2, tenzSlice.numRows());
EXPECT_EQ(1, tenzSlice.numCols());
EXPECT_EQ(1, tenzSlice.numMats());
@@ -246,10 +280,14 @@ void tensorSlicingConstructorAxis0() {
EXPECT_EQ(expected, tenzSliceDown);
}
-TEST_F(TensorTest, tensorSlicingConstructorAxis0) {
- tensorSlicingConstructorAxis0();
- tensorSlicingConstructorAxis0();
- tensorSlicingConstructorAxis0();
+TEST_F(TensorTest, tensorSlicingConstructorAxis0
+) {
+tensorSlicingConstructorAxis0();
+
+tensorSlicingConstructorAxis0();
+
+tensorSlicingConstructorAxis0();
+
}
/* ---------------------------------------
@@ -259,7 +297,7 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis0) {
TEMPLATE_WITH_TYPE_T
void tensorUpload() {
std::vector data = TENSOR_DATA_234A;
- DTensor tenz(2, 3, 4);
+ DTensor tenz(2, 3, 4);
tenz.upload(data);
EXPECT_EQ(2, tenz.numRows());
EXPECT_EQ(3, tenz.numCols());
@@ -269,10 +307,14 @@ void tensorUpload() {
EXPECT_EQ(8, tenz(1, 2, 3));
}
-TEST_F(TensorTest, tensorUpload) {
- tensorUpload();
- tensorUpload();
- tensorUpload();
+TEST_F(TensorTest, tensorUpload
+) {
+tensorUpload();
+
+tensorUpload();
+
+tensorUpload();
+
}
/* ---------------------------------------
@@ -282,9 +324,9 @@ TEST_F(TensorTest, tensorUpload) {
TEMPLATE_WITH_TYPE_T
void tensorDeviceCopyTo() {
std::vector data = TENSOR_DATA_234A;
- DTensor