From d9c8bddccc909e31a9bee2b29dd717fad6d7c5d2 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 13:17:22 +0000
Subject: [PATCH 1/7] introducing raw pointer to matrices (on device)
---
include/tensor.cuh | 1 +
1 file changed, 1 insertion(+)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 0a8d1d1..d5fd8bd 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -185,6 +185,7 @@ private:
size_t m_numCols = 0; ///< Number of columns
size_t m_numMats = 0; ///< Number of matrices
bool m_doDestroy = false; ///< Whether to destroy memory
+ T** m_rawPtrToMatrices; ///< raw pointers to matrices
bool destroy() {
if (!m_doDestroy) return false;
From c7f92180836125fa59ad9798130d9fe44d6548cc Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 17:19:02 +0000
Subject: [PATCH 2/7] Getting rid of pointersToMatrices
Introduce m_d_ptrMatrices to preallocate device memory for an array of
pointers to each sub-matrix of a tensor, and destroy it when necessary.
Add the new method makePointersToMatrices, update all constructors, and
keep slicing working.
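For illustration, a minimal self-contained sketch of what
makePointersToMatrices builds (the free function makeMatrixPointers and
its parameter names are hypothetical, not part of this patch):

    #include <cuda_runtime.h>
    #include <vector>

    // Build, on the device, the array of per-matrix base pointers of a
    // contiguous, column-major (rows x cols x mats) tensor.
    template<typename T>
    T **makeMatrixPointers(T *d_data, size_t rows, size_t cols, size_t mats) {
        std::vector<T *> h_ptrs(mats);
        for (size_t i = 0; i < mats; i++)
            h_ptrs[i] = d_data + i * rows * cols; // base of the i-th matrix
        T **d_ptrs = nullptr;
        cudaMalloc(&d_ptrs, mats * sizeof(T *));
        cudaMemcpy(d_ptrs, h_ptrs.data(), mats * sizeof(T *),
                   cudaMemcpyHostToDevice);
        return d_ptrs; // caller releases with cudaFree
    }

An array of this exact shape is what cublas<t>gemmBatched consumes as its
Aarray/Barray/Carray arguments.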
---
include/tensor.cuh | 97 ++++++++++++++++++++++++++++++++--------------
1 file changed, 68 insertions(+), 29 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index d5fd8bd..3e2cfb7 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -181,17 +181,35 @@ class DTensor {
private:
T *m_d_data = nullptr; ///< Pointer to device data
+ T** m_d_ptrMatrices = nullptr; ///< Pointer to matrices in tensor
size_t m_numRows = 0; ///< Number of rows
size_t m_numCols = 0; ///< Number of columns
size_t m_numMats = 0; ///< Number of matrices
- bool m_doDestroy = false; ///< Whether to destroy memory
- T** m_rawPtrToMatrices; ///< raw pointers to matrices
+ bool m_doDestroyData = false; ///< Whether to destroy m_d_data
+ bool m_doDestroyPointersToMatrices = false; ///< Whether to destroy m_d_ptrMatrices
+
+ void makePointersToMatrices() {
+ std::vector<T*> h_pointers(m_numMats);
+ size_t numelMat = m_numRows * m_numCols;
+ h_pointers[0] = m_d_data;
+ for (size_t i = 1; i < m_numMats; i++) {
+ h_pointers[i] = m_d_data + i * numelMat;
+ }
+ size_t buffer_size = m_numMats * sizeof(T*);
+ gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers.data(), buffer_size, cudaMemcpyHostToDevice));
+ m_doDestroyPointersToMatrices = true;
+ }
bool destroy() {
- if (!m_doDestroy) return false;
- if (m_d_data) cudaFree(m_d_data);
- m_d_data = nullptr;
- return true;
+ if (m_doDestroyData) {
+ if (m_d_data) gpuErrChk(cudaFree(m_d_data));
+ m_d_data = nullptr;
+ }
+ if (m_doDestroyPointersToMatrices){
+ if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
+ m_d_ptrMatrices = nullptr;
+ }
+ return m_doDestroyData || m_doDestroyPointersToMatrices;
}
/**
@@ -516,9 +534,8 @@ DTensor<T> DTensor<T>::createRandomTensor(size_t numRows, size_t numCols, size_t
auto randVec = generateIntRandomVector(numRows * numCols * numMats, low, hi);
DTensor a(randVec, numRows, numCols, numMats);
return a;
- } else {
- throw std::invalid_argument("[createRandomTensor] unsupported type T");
}
+ throw std::invalid_argument("[createRandomTensor] unsupported type T");
}
template<typename T>
@@ -543,6 +560,7 @@ DTensor<T>::DTensor(size_t m, size_t n, size_t k, bool zero) {
m_numMats = k;
size_t size = m * n * k;
allocateOnDevice(size, zero);
+ makePointersToMatrices();
}
template<typename T>
@@ -553,6 +571,7 @@ DTensor<T>::DTensor(const std::vector<T> &data, size_t m, size_t n, size_t k, St
size_t size = m * n * k;
allocateOnDevice(size);
upload(data, mode);
+ makePointersToMatrices();
}
template<typename T>
@@ -564,6 +583,7 @@ DTensor<T>::DTensor(const DTensor<T> &other) {
allocateOnDevice(m_numRows * m_numCols * m_numMats);
gpuErrChk(cudaMemcpy(m_d_data, other.raw(), m_numRows * m_numCols * m_numMats * sizeof(T),
cudaMemcpyDeviceToDevice));
+ makePointersToMatrices();
}
template<typename T>
@@ -575,6 +595,7 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
m_numRows = other.m_numRows;
m_numCols = other.m_numCols;
m_numMats = len;
+ m_d_ptrMatrices = other.m_d_ptrMatrices + from;
} else if (axis == 1) {
offset = other.m_numRows * from;
m_numRows = other.m_numRows;
@@ -587,7 +608,13 @@ DTensor::DTensor(const DTensor &other, size_t axis, size_t from, size_t to
m_numMats = 1;
}
m_d_data = other.m_d_data + offset;
- m_doDestroy = false;
+ m_doDestroyData = false;
+ if (axis == 1 || axis == 0) {
+ gpuErrChk(cudaMalloc(&m_d_ptrMatrices, sizeof(T *))); // allocate memory for an array of one T* element
+ T* h_pointers[1] = {m_d_data}; // transfer datum to device
+ gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers, sizeof(T*), cudaMemcpyHostToDevice));
+ m_doDestroyPointersToMatrices = true; // make sure to free memory later
+ }
}
template<typename T>
@@ -596,9 +623,11 @@ DTensor<T>::DTensor(DTensor<T> &&other) {
m_numRows = other.m_numRows;
m_numMats = other.m_numMats;
m_d_data = other.m_d_data;
- m_doDestroy = true;
- other.m_doDestroy = false;
+ m_d_ptrMatrices = other.m_d_ptrMatrices;
+ m_doDestroyData = true;
+ other.m_doDestroyData = false;
other.m_d_data = nullptr;
+ other.m_d_ptrMatrices = nullptr;
other.m_numCols = 0;
other.m_numRows = 0;
other.m_numMats = 0;
@@ -758,12 +787,18 @@ template<typename T>
inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
if (size <= 0) return false;
destroy();
- m_doDestroy = true;
- size_t buffer_size = size * sizeof(T);
- bool cudaStatus = cudaMalloc(&m_d_data, buffer_size);
+ m_doDestroyData = true;
+ /* Allocate memory for m_d_data */
+ size_t data_size_bytes = size * sizeof(T);
+ cudaError_t cudaStatus = cudaMalloc(&m_d_data, data_size_bytes);
if (cudaStatus != cudaSuccess) return false;
- if (zero) gpuErrChk(cudaMemset(m_d_data, 0, buffer_size)); // set to zero all elements
- return true;
+ if (zero) gpuErrChk(cudaMemset(m_d_data, 0, data_size_bytes)); // set to zero all elements
+
+ /* Allocate memory for m_d_ptrMatrices */
+ size_t ptr_matrices_bytes = m_numMats * sizeof(T*);
+ cudaStatus = cudaMalloc(&m_d_ptrMatrices, ptr_matrices_bytes);
+
+ return (cudaStatus == cudaSuccess);
}
template<typename T>
@@ -840,6 +875,10 @@ inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
m_d_data,
m_numRows * m_numCols * m_numMats * sizeof(T),
cudaMemcpyDeviceToDevice));
+ gpuErrChk(cudaMemcpy(elsewhere.m_d_ptrMatrices,
+ m_d_ptrMatrices,
+ m_numMats * sizeof(T*),
+ cudaMemcpyDeviceToDevice));
}
template<>
@@ -855,7 +894,7 @@ DTensor<T> &DTensor<T>::operator=(const DTensor<T> &other) {
m_numMats = other.m_numMats;
m_numRows = other.m_numRows;
m_numCols = other.m_numCols;
- m_doDestroy = false;
+ m_doDestroyData = false;
m_d_data = other.m_d_data;
return *this;
}
@@ -929,17 +968,17 @@ inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<double> &B,
- DTensor<double *> ptrA = A.pointersToMatrices();
- DTensor<double *> ptrB = B.pointersToMatrices();
- DTensor<double *> ptr = pointersToMatrices();
double _alpha = alpha, _beta = beta;
+ double** ptrAraw = A.m_d_ptrMatrices;
+ double** ptrBraw = B.m_d_ptrMatrices;
+ double** ptrRaw = m_d_ptrMatrices;
gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
CUBLAS_OP_N, CUBLAS_OP_N,
nRA, nCB, nCA, &_alpha,
- ptrA.raw(), nRA,
- ptrB.raw(), nCA,
+ ptrAraw, nRA,
+ ptrBraw, nCA,
&_beta,
- ptr.raw(), nRA,
+ ptrRaw, nRA,
nMat));
}
@@ -949,17 +988,17 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float> &B,
size_t nRA = A.numRows();
size_t nCA = A.numCols();
size_t nCB = B.numCols();
- DTensor<float *> ptrA = A.pointersToMatrices();
- DTensor<float *> ptrB = B.pointersToMatrices();
- DTensor<float *> ptr = pointersToMatrices();
+ float** ptrAraw = A.m_d_ptrMatrices;
+ float** ptrBraw = B.m_d_ptrMatrices;
+ float** ptrRaw = m_d_ptrMatrices;
float _alpha = alpha, _beta = beta;
gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
CUBLAS_OP_N, CUBLAS_OP_N,
nRA, nCB, nCA, &_alpha,
- ptrA.raw(), nRA,
- ptrB.raw(), nCA,
+ ptrAraw, nRA,
+ ptrBraw, nCA,
&_beta,
- ptr.raw(), nRA,
+ ptrRaw, nRA,
nMat));
}
From 9b17bb2ca9a14e9a306d8166c2e260fb704bfe27 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 17:24:04 +0000
Subject: [PATCH 3/7] DTensor/slicing constructor: warning in docs
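The cost of slicing now depends on the axis: axis=2 slices reuse the
parent's preallocated pointer array, while axis=0 and axis=1 slices pay
one small cudaMalloc plus one host-to-device copy. A usage sketch,
assuming the constructor defaults n = k = 1 (shapes are arbitrary):

    DTensor<double> A(4, 5);          // one 4x5 matrix
    DTensor<double> cols(A, 1, 1, 3); // axis=1: columns 1..3; allocates one double*
    DTensor<double> Q(10, 10, 8);
    DTensor<double> mid(Q, 2, 2, 5);  // axis=2: matrices 2..5; no new allocation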
---
include/tensor.cuh | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 3e2cfb7..8631a2d 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -305,6 +305,10 @@ public:
* @param axis axis to slice (0=rows, 1=columns, 2=matrices)
* @param from index to slice axis from (zero-indexed)
* @param to index to slice axis to (inclusive)
+ *
+ * @warning If axis=0 or axis=1, this method will (i) allocate memory on the GPU
+ * to store one element of type T* (pointer-to-T) and (ii) transfer one such
+ * element from the host to the device.
*/
DTensor(const DTensor &other, size_t axis, size_t from, size_t to);
From 452a6ca3b954e9b2bffef374127682c79040f2cc Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 19:02:17 +0000
Subject: [PATCH 4/7] DTensor::addAB: use gemm instead of gemmBatched when
nMats=1
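For a single matrix the pointer-array indirection buys nothing, so the
batched kernel is now reserved for nMat > 1. A call-site sketch, assuming
the declaration's default scaling factors alpha = 1 and beta = 0:

    DTensor<double> A(40, 60);          // numMats() == 1
    DTensor<double> B(60, 50);
    DTensor<double> C(40, 50, 1, true); // zero-initialised result
    C.addAB(A, B);                      // dispatches to cublasDgemm, not the batched call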
---
include/tensor.cuh | 73 ++++++++++++++++++++++++++++------------------
1 file changed, 44 insertions(+), 29 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 8631a2d..9134c12 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -181,7 +181,7 @@ class DTensor {
private:
T *m_d_data = nullptr; ///< Pointer to device data
- T** m_d_ptrMatrices = nullptr; ///< Pointer to matrices in tensor
+ T **m_d_ptrMatrices = nullptr; ///< Pointer to matrices in tensor
size_t m_numRows = 0; ///< Number of rows
size_t m_numCols = 0; ///< Number of columns
size_t m_numMats = 0; ///< Number of matrices
@@ -195,7 +195,7 @@ private:
for (size_t i = 1; i < m_numMats; i++) {
h_pointers[i] = m_d_data + i * numelMat;
}
- size_t buffer_size = m_numMats * sizeof(T*);
+ size_t buffer_size = m_numMats * sizeof(T *);
gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers.data(), buffer_size, cudaMemcpyHostToDevice));
m_doDestroyPointersToMatrices = true;
}
@@ -205,7 +205,7 @@ private:
if (m_d_data) gpuErrChk(cudaFree(m_d_data));
m_d_data = nullptr;
}
- if (m_doDestroyPointersToMatrices){
+ if (m_doDestroyPointersToMatrices) {
if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
m_d_ptrMatrices = nullptr;
}
@@ -555,6 +555,7 @@ void DTensor<T>::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats
m_numRows = newNumRows;
m_numCols = newNumCols;
m_numMats = newNumMats;
+ // TODO recreate m_d_ptrMatrices to match the new shape
}
template<typename T>
@@ -615,8 +616,8 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
m_doDestroyData = false;
if (axis == 1 || axis == 0) {
gpuErrChk(cudaMalloc(&m_d_ptrMatrices, sizeof(T *))); // allocate memory for an array of one T* element
- T* h_pointers[1] = {m_d_data}; // transfer datum to device
- gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers, sizeof(T*), cudaMemcpyHostToDevice));
+ T *h_pointers[1] = {m_d_data}; // transfer datum to device
+ gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers, sizeof(T *), cudaMemcpyHostToDevice));
m_doDestroyPointersToMatrices = true; // make sure to free memory later
}
}
@@ -799,7 +800,7 @@ inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
if (zero) gpuErrChk(cudaMemset(m_d_data, 0, data_size_bytes)); // set to zero all elements
/* Allocate memory for m_d_ptrMatrices */
- size_t ptr_matrices_bytes = m_numMats * sizeof(T*);
+ size_t ptr_matrices_bytes = m_numMats * sizeof(T *);
cudaStatus = cudaMalloc(&m_d_ptrMatrices, ptr_matrices_bytes);
return (cudaStatus == cudaSuccess);
@@ -881,7 +882,7 @@ inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
cudaMemcpyDeviceToDevice));
gpuErrChk(cudaMemcpy(elsewhere.m_d_ptrMatrices,
m_d_ptrMatrices,
- m_numMats * sizeof(T*),
+ m_numMats * sizeof(T *),
cudaMemcpyDeviceToDevice));
}
@@ -973,17 +974,24 @@ inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<double> &B,
size_t nRA = A.numRows();
size_t nCA = A.numCols();
size_t nCB = B.numCols();
double _alpha = alpha, _beta = beta;
- double** ptrAraw = A.m_d_ptrMatrices;
- double** ptrBraw = B.m_d_ptrMatrices;
- double** ptrRaw = m_d_ptrMatrices;
- gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
- CUBLAS_OP_N, CUBLAS_OP_N,
- nRA, nCB, nCA, &_alpha,
- ptrAraw, nRA,
- ptrBraw, nCA,
- &_beta,
- ptrRaw, nRA,
- nMat));
+ if (nMat > 1) {
+ gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
+ CUBLAS_OP_N, CUBLAS_OP_N,
+ nRA, nCB, nCA, &_alpha,
+ A.m_d_ptrMatrices, nRA,
+ B.m_d_ptrMatrices, nCA,
+ &_beta,
+ m_d_ptrMatrices, nRA,
+ nMat));
+ } else {
+ gpuErrChk(cublasDgemm(Session::getInstance().cuBlasHandle(),
+ CUBLAS_OP_N, CUBLAS_OP_N,
+ nRA, nCB, nCA, &_alpha,
+ A.raw(), nRA,
+ B.raw(), nCA,
+ &_beta,
+ raw(), nRA));
+ }
}
template<>
@@ -992,18 +1000,25 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float> &B,
size_t nRA = A.numRows();
size_t nCA = A.numCols();
size_t nCB = B.numCols();
- float** ptrAraw = A.m_d_ptrMatrices;
- float** ptrBraw = B.m_d_ptrMatrices;
- float** ptrRaw = m_d_ptrMatrices;
float _alpha = alpha, _beta = beta;
- gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
- CUBLAS_OP_N, CUBLAS_OP_N,
- nRA, nCB, nCA, &_alpha,
- ptrAraw, nRA,
- ptrBraw, nCA,
- &_beta,
- ptrRaw, nRA,
- nMat));
+ if (nMat > 1) {
+ gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
+ CUBLAS_OP_N, CUBLAS_OP_N,
+ nRA, nCB, nCA, &_alpha,
+ A.m_d_ptrMatrices, nRA,
+ B.m_d_ptrMatrices, nCA,
+ &_beta,
+ m_d_ptrMatrices, nRA,
+ nMat));
+ } else {
+ gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(),
+ CUBLAS_OP_N, CUBLAS_OP_N,
+ nRA, nCB, nCA, &_alpha,
+ A.raw(), nRA,
+ B.raw(), nCA,
+ &_beta,
+ raw(), nRA));
+ }
}
template<>
From 0d3bfc021695025a86ac8163c1c8ad8fb2e2537d Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 19:26:12 +0000
Subject: [PATCH 5/7] DTensor::allocateOnDevice: set
m_doDestroyPointersToMatrices
Fix a memory-ownership issue: the slicing constructor no longer allocates
an auxiliary array, and allocateOnDevice now marks m_d_ptrMatrices for
destruction.
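Setting the flag at the allocation site keeps ownership of the auxiliary
array independent of whether makePointersToMatrices runs afterwards. A
sketch of the intended lifetime (shapes arbitrary):

    {
        DTensor<float> x(8, 8, 16); // cudaMalloc: data buffer + 16 float* entries
    }                               // ~DTensor -> destroy() frees both buffers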
---
include/tensor.cuh | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 9134c12..4f59982 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -613,12 +613,11 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
m_numMats = 1;
}
m_d_data = other.m_d_data + offset;
- m_doDestroyData = false;
- if (axis == 1 || axis == 0) {
- gpuErrChk(cudaMalloc(&m_d_ptrMatrices, sizeof(T *))); // allocate memory for an array of one T* element
- T *h_pointers[1] = {m_d_data}; // transfer datum to device
- gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers, sizeof(T *), cudaMemcpyHostToDevice));
- m_doDestroyPointersToMatrices = true; // make sure to free memory later
+ m_doDestroyData = false; // no new memory allocated!
+ m_doDestroyPointersToMatrices = false; // no new auxiliary memory allocated!
+ if (axis != 2) {
+ // m_d_ptrMatrices is not needed for vectors and matrices
+ m_d_ptrMatrices = nullptr;
}
}
@@ -802,6 +801,7 @@ inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
/* Allocate memory for m_d_ptrMatrices */
size_t ptr_matrices_bytes = m_numMats * sizeof(T *);
cudaStatus = cudaMalloc(&m_d_ptrMatrices, ptr_matrices_bytes);
+ m_doDestroyPointersToMatrices = true;
return (cudaStatus == cudaSuccess);
}
@@ -880,10 +880,12 @@ inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
m_d_data,
m_numRows * m_numCols * m_numMats * sizeof(T),
cudaMemcpyDeviceToDevice));
- gpuErrChk(cudaMemcpy(elsewhere.m_d_ptrMatrices,
- m_d_ptrMatrices,
- m_numMats * sizeof(T *),
- cudaMemcpyDeviceToDevice));
+ if (m_d_ptrMatrices) {
+ gpuErrChk(cudaMemcpy(elsewhere.m_d_ptrMatrices,
+ m_d_ptrMatrices,
+ m_numMats * sizeof(T *),
+ cudaMemcpyDeviceToDevice));
+ }
}
template<>
From 82af79f4ed827183c87a1593e996a10663c94474 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 19:42:34 +0000
Subject: [PATCH 6/7] DTensor::destroy: reset ownership flags
Capture the return value before the flags are cleared, so destroy()
still reports whether any memory was actually freed, and reset the
flags so a repeated call is a no-op.
---
include/tensor.cuh | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 4f59982..050d5d1 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -201,15 +201,18 @@ private:
}
bool destroy() {
+ bool willDestroy = m_doDestroyData || m_doDestroyPointersToMatrices;
if (m_doDestroyData) {
if (m_d_data) gpuErrChk(cudaFree(m_d_data));
m_d_data = nullptr;
+ m_doDestroyData = false;
}
if (m_doDestroyPointersToMatrices) {
if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
m_d_ptrMatrices = nullptr;
+ m_doDestroyPointersToMatrices = false;
}
- return m_doDestroyData || m_doDestroyPointersToMatrices;
+ return willDestroy;
}
/**
From b7ecb3f13158c289e57a7de0761e026a3c083276 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 20:02:12 +0000
Subject: [PATCH 7/7] Get rid of pointersToMatrices
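External callers that previously materialised a DTensor of pointers via
pointersToMatrices() can now read the preallocated device array through
the new ptrMatrices() accessor. A sketch of a batched call, assuming a
valid cuBLAS handle and double tensors A, B, C of compatible shapes:

    double alpha = 1.0, beta = 0.0;
    gpuErrChk(cublasDgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                 (int) A.numRows(), (int) B.numCols(), (int) A.numCols(),
                                 &alpha,
                                 A.ptrMatrices(), (int) A.numRows(),
                                 B.ptrMatrices(), (int) A.numCols(),
                                 &beta,
                                 C.ptrMatrices(), (int) C.numRows(),
                                 (int) A.numMats()));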
---
include/tensor.cuh | 71 +++++++++++++++++++++-------------------------
test/testTensor.cu | 23 ---------------
2 files changed, 32 insertions(+), 62 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 050d5d1..8007209 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -205,12 +205,10 @@ private:
if (m_doDestroyData) {
if (m_d_data) gpuErrChk(cudaFree(m_d_data));
m_d_data = nullptr;
- m_doDestroyData = false;
}
if (m_doDestroyPointersToMatrices) {
if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
m_d_ptrMatrices = nullptr;
- m_doDestroyPointersToMatrices = false;
}
return willDestroy;
}
@@ -320,6 +318,10 @@ public:
*/
T *raw() const;
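+ /**
+ * @return raw device array of pointers to the matrices of this tensor
+ * (nullptr for row- and column-sliced tensors, which do not carry one)
+ */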
+ T **ptrMatrices() const {
+ return m_d_ptrMatrices;
+ }
+
/**
* @return number of rows
*/
@@ -842,6 +844,7 @@ inline T *DTensor<T>::raw() const {
return m_d_data;
}
+
template<>
inline DTensor<double> DTensor<double>::tr() const {
DTensor<double> transposes(m_numCols, m_numRows, m_numMats);
@@ -960,17 +963,17 @@ inline T DTensor<T>::operator()(size_t i, size_t j, size_t k) const {
return hostDst;
}
-template<typename T>
-inline DTensor<T *> DTensor<T>::pointersToMatrices() const {
- std::vector<T *> h_pointers(m_numMats);
- size_t numelMat = m_numRows * m_numCols;
- h_pointers[0] = m_d_data;
- for (size_t i = 1; i < m_numMats; i++) {
- h_pointers[i] = m_d_data + i * numelMat;
- }
- DTensor<T *> t(h_pointers, m_numMats, 1, 1);
- return t;
-}
+//template<typename T>
+//inline DTensor<T *> DTensor<T>::pointersToMatrices() const {
+// std::vector<T *> h_pointers(m_numMats);
+// size_t numelMat = m_numRows * m_numCols;
+// h_pointers[0] = m_d_data;
+// for (size_t i = 1; i < m_numMats; i++) {
+// h_pointers[i] = m_d_data + i * numelMat;
+// }
+// DTensor<T *> t(h_pointers, m_numMats, 1, 1);
+// return t;
+//}
template<>
inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<double> &B, double alpha, double beta) {
@@ -1017,12 +1020,12 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float> &B,
nMat));
} else {
gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(),
- CUBLAS_OP_N, CUBLAS_OP_N,
- nRA, nCB, nCA, &_alpha,
- A.raw(), nRA,
- B.raw(), nCA,
- &_beta,
- raw(), nRA));
+ CUBLAS_OP_N, CUBLAS_OP_N,
+ nRA, nCB, nCA, &_alpha,
+ A.raw(), nRA,
+ B.raw(), nCA,
+ &_beta,
+ raw(), nRA));
}
}
@@ -1040,16 +1043,14 @@ inline void DTensor<double>::leastSquaresBatched(DTensor<double> &B) {
throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
int info = 0;
DTensor<int> infoArray(batchSize);
- DTensor<double *> As = pointersToMatrices();
- DTensor<double *> Bs = B.pointersToMatrices();
gpuErrChk(cublasDgelsBatched(Session::getInstance().cuBlasHandle(),
CUBLAS_OP_N,
m_numRows,
m_numCols,
nColsB,
- As.raw(),
+ m_d_ptrMatrices,
m_numRows,
- Bs.raw(),
+ B.m_d_ptrMatrices,
m_numRows,
&info,
infoArray.raw(),
@@ -1070,16 +1071,14 @@ inline void DTensor<float>::leastSquaresBatched(DTensor<float> &B) {
throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
int info = 0;
DTensor<int> infoArray(batchSize);
- DTensor<float *> As = pointersToMatrices();
- DTensor<float *> Bs = B.pointersToMatrices();
gpuErrChk(cublasSgelsBatched(Session::getInstance().cuBlasHandle(),
CUBLAS_OP_N,
m_numRows,
m_numCols,
nColsB,
- As.raw(),
+ m_d_ptrMatrices,
m_numRows,
- Bs.raw(),
+ B.m_d_ptrMatrices,
m_numRows,
&info,
infoArray.raw(),
@@ -1839,11 +1838,10 @@ public:
template<>
inline void CholeskyBatchFactoriser<double>::factorise() {
if (m_factorisationDone) return;
- DTensor<double *> ptrA = m_matrix->pointersToMatrices();
gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
CUBLAS_FILL_MODE_LOWER,
m_numRows,
- ptrA.raw(),
+ m_matrix->ptrMatrices(),
m_numRows,
m_deviceInfo->raw(),
m_numMats));
@@ -1853,11 +1851,10 @@ inline void CholeskyBatchFactoriser<double>::factorise() {
template<>
inline void CholeskyBatchFactoriser<float>::factorise() {
if (m_factorisationDone) return;
- DTensor<float *> ptrA = m_matrix->pointersToMatrices();
gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
CUBLAS_FILL_MODE_LOWER,
m_numRows,
- ptrA.raw(),
+ m_matrix->ptrMatrices(),
m_numRows,
m_deviceInfo->raw(),
m_numMats));
@@ -1871,15 +1868,13 @@ inline void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
}
if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
- DTensor<double *> ptrA = m_matrix->pointersToMatrices();
- DTensor<double *> ptrB = b.pointersToMatrices();
gpuErrChk(cusolverDnDpotrsBatched(Session::getInstance().cuSolverHandle(),
CUBLAS_FILL_MODE_LOWER,
m_numRows,
1, ///< only supports rhs = 1
- ptrA.raw(),
+ m_matrix->ptrMatrices(),
m_numRows,
- ptrB.raw(),
+ b.ptrMatrices(),
m_numRows,
m_deviceInfo->raw(),
m_numMats));
@@ -1892,15 +1887,13 @@ inline void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
}
if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
- DTensor<float *> ptrA = m_matrix->pointersToMatrices();
- DTensor<float *> ptrB = b.pointersToMatrices();
gpuErrChk(cusolverDnSpotrsBatched(Session::getInstance().cuSolverHandle(),
CUBLAS_FILL_MODE_LOWER,
m_numRows,
1, ///< only supports rhs = 1
- ptrA.raw(),
+ m_matrix->ptrMatrices(),
m_numRows,
- ptrB.raw(),
+ b.ptrMatrices(),
m_numRows,
m_deviceInfo->raw(),
m_numMats));
diff --git a/test/testTensor.cu b/test/testTensor.cu
index 0a33ffc..3cc7580 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -635,29 +635,6 @@ TEST_F(TensorTest, tensorMinusTensor) {
tensorMinusTensor();
}
-/* ---------------------------------------
- * Tensor: pointers to matrices (on device)
- * --------------------------------------- */
-
-TEMPLATE_WITH_TYPE_T
-void tensorPointersToMatrices() {
- std::vector<T> dataA = TENSOR_DATA_234A;
- DTensor<T> A(dataA, 2, 3, 4);
- DTensor<T*> pointers = A.pointersToMatrices();
- EXPECT_EQ(4, pointers.numRows());
- EXPECT_EQ(1, pointers.numCols());
- EXPECT_EQ(1, pointers.numMats());
- T *p1 = pointers(1, 0, 0); // pointer to matrix #1
- T hostDst; // let's see what's there...
- cudaMemcpy(&hostDst, p1, sizeof(T), cudaMemcpyDeviceToHost);
- EXPECT_EQ(dataA[6], hostDst);
-}
-
-TEST_F(TensorTest, tensorPointersToMatrices) {
- tensorPointersToMatrices<float>();
- tensorPointersToMatrices<double>();
- tensorPointersToMatrices<int>();
-}
/* ---------------------------------------
* Tensor: C = AB