From d9c8bddccc909e31a9bee2b29dd717fad6d7c5d2 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 13:17:22 +0000
Subject: [PATCH 1/7] introducing raw pointer to matrices (on device)
---
include/tensor.cuh | 1 +
1 file changed, 1 insertion(+)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 0a8d1d1..d5fd8bd 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -185,6 +185,7 @@ private:
size_t m_numCols = 0; ///< Number of columns
size_t m_numMats = 0; ///< Number of matrices
bool m_doDestroy = false; ///< Whether to destroy memory
+ T** m_rawPtrToMatrices; ///< raw pointers to matrices
bool destroy() {
if (!m_doDestroy) return false;
From c7f92180836125fa59ad9798130d9fe44d6548cc Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 17:19:02 +0000
Subject: [PATCH 2/7] Getting rid of pointersToMatrices
Introduce m_d_ptrMatrices to preallocate device memory for an array of
pointers to each sub-matrix of a tensor, and destroy it when necessary.
Add the new method makePointersToMatrices, update all constructors, and
keep slicing working.
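For illustration, a minimal self-contained sketch of what
makePointersToMatrices builds (the free function makeMatrixPointers and
its parameter names are hypothetical, not part of this patch):

    #include <cuda_runtime.h>
    #include <vector>

    // Build, on the device, the array of per-matrix base pointers of a
    // contiguous, column-major (rows x cols x mats) tensor.
    template<typename T>
    T **makeMatrixPointers(T *d_data, size_t rows, size_t cols, size_t mats) {
        std::vector<T *> h_ptrs(mats);
        for (size_t i = 0; i < mats; i++)
            h_ptrs[i] = d_data + i * rows * cols; // base of the i-th matrix
        T **d_ptrs = nullptr;
        cudaMalloc(&d_ptrs, mats * sizeof(T *));
        cudaMemcpy(d_ptrs, h_ptrs.data(), mats * sizeof(T *),
                   cudaMemcpyHostToDevice);
        return d_ptrs; // caller releases with cudaFree
    }

An array of this exact shape is what cublas<t>gemmBatched consumes as its
Aarray/Barray/Carray arguments.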
---
include/tensor.cuh | 97 ++++++++++++++++++++++++++++++++--------------
1 file changed, 68 insertions(+), 29 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index d5fd8bd..3e2cfb7 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -181,17 +181,35 @@ class DTensor {
private:
T *m_d_data = nullptr; ///< Pointer to device data
+ T** m_d_ptrMatrices = nullptr; ///< Pointer to matrices in tensor
size_t m_numRows = 0; ///< Number of rows
size_t m_numCols = 0; ///< Number of columns
size_t m_numMats = 0; ///< Number of matrices
- bool m_doDestroy = false; ///< Whether to destroy memory
- T** m_rawPtrToMatrices; ///< raw pointers to matrices
+ bool m_doDestroyData = false; ///< Whether to destroy m_d_data
+ bool m_doDestroyPointersToMatrices = false; ///< Whether to destroy m_d_ptrMatrices
+
+ void makePointersToMatrices() {
+ std::vector<T*> h_pointers(m_numMats);
+ size_t numelMat = m_numRows * m_numCols;
+ h_pointers[0] = m_d_data;
+ for (size_t i = 1; i < m_numMats; i++) {
+ h_pointers[i] = m_d_data + i * numelMat;
+ }
+ size_t buffer_size = m_numMats * sizeof(T*);
+ gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers.data(), buffer_size, cudaMemcpyHostToDevice));
+ m_doDestroyPointersToMatrices = true;
+ }
bool destroy() {
- if (!m_doDestroy) return false;
- if (m_d_data) cudaFree(m_d_data);
- m_d_data = nullptr;
- return true;
+ if (m_doDestroyData) {
+ if (m_d_data) gpuErrChk(cudaFree(m_d_data));
+ m_d_data = nullptr;
+ }
+ if (m_doDestroyPointersToMatrices){
+ if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
+ m_d_ptrMatrices = nullptr;
+ }
+ return m_doDestroyData || m_doDestroyPointersToMatrices;
}
/**
@@ -516,9 +534,8 @@ DTensor<T> DTensor<T>::createRandomTensor(size_t numRows, size_t numCols, size_t
auto randVec = generateIntRandomVector(numRows * numCols * numMats, low, hi);
DTensor a(randVec, numRows, numCols, numMats);
return a;
- } else {
- throw std::invalid_argument("[createRandomTensor] unsupported type T");
}
+ throw std::invalid_argument("[createRandomTensor] unsupported type T");
}
template<typename T>
@@ -543,6 +560,7 @@ DTensor<T>::DTensor(size_t m, size_t n, size_t k, bool zero) {
m_numMats = k;
size_t size = m * n * k;
allocateOnDevice(size, zero);
+ makePointersToMatrices();
}
template<typename T>
@@ -553,6 +571,7 @@ DTensor<T>::DTensor(const std::vector<T> &data, size_t m, size_t n, size_t k, St
size_t size = m * n * k;
allocateOnDevice(size);
upload(data, mode);
+ makePointersToMatrices();
}
template<typename T>
@@ -564,6 +583,7 @@ DTensor<T>::DTensor(const DTensor<T> &other) {
allocateOnDevice(m_numRows * m_numCols * m_numMats);
gpuErrChk(cudaMemcpy(m_d_data, other.raw(), m_numRows * m_numCols * m_numMats * sizeof(T),
cudaMemcpyDeviceToDevice));
+ makePointersToMatrices();
}
template<typename T>
@@ -575,6 +595,7 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
m_numRows = other.m_numRows;
m_numCols = other.m_numCols;
m_numMats = len;
+ m_d_ptrMatrices = other.m_d_ptrMatrices + from;
} else if (axis == 1) {
offset = other.m_numRows * from;
m_numRows = other.m_numRows;
@@ -587,7 +608,13 @@ DTensor::DTensor(const DTensor &other, size_t axis, size_t from, size_t to
m_numMats = 1;
}
m_d_data = other.m_d_data + offset;
- m_doDestroy = false;
+ m_doDestroyData = false;
+ if (axis == 1 || axis == 0) {
+ gpuErrChk(cudaMalloc(&m_d_ptrMatrices, sizeof(T *))); // allocate memory for an array of one T* element
+ T* h_pointers[1] = {m_d_data}; // transfer datum to device
+ gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers, sizeof(T*), cudaMemcpyHostToDevice));
+ m_doDestroyPointersToMatrices = true; // make sure to free memory later
+ }
}
template<typename T>
@@ -596,9 +623,11 @@ DTensor<T>::DTensor(DTensor<T> &&other) {
m_numRows = other.m_numRows;
m_numMats = other.m_numMats;
m_d_data = other.m_d_data;
- m_doDestroy = true;
- other.m_doDestroy = false;
+ m_d_ptrMatrices = other.m_d_ptrMatrices;
+ m_doDestroyData = true;
+ other.m_doDestroyData = false;
other.m_d_data = nullptr;
+ other.m_d_ptrMatrices = nullptr;
other.m_numCols = 0;
other.m_numRows = 0;
other.m_numMats = 0;
@@ -758,12 +787,18 @@ template<typename T>
inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
if (size <= 0) return false;
destroy();
- m_doDestroy = true;
- size_t buffer_size = size * sizeof(T);
- bool cudaStatus = cudaMalloc(&m_d_data, buffer_size);
+ m_doDestroyData = true;
+ /* Allocate memory for m_d_data */
+ size_t data_size_bytes = size * sizeof(T);
+ cudaError_t cudaStatus = cudaMalloc(&m_d_data, data_size_bytes);
if (cudaStatus != cudaSuccess) return false;
- if (zero) gpuErrChk(cudaMemset(m_d_data, 0, buffer_size)); // set to zero all elements
- return true;
+ if (zero) gpuErrChk(cudaMemset(m_d_data, 0, data_size_bytes)); // set to zero all elements
+
+ /* Allocate memory for m_d_ptrMatrices */
+ size_t ptr_matrices_bytes = m_numMats * sizeof(T*);
+ cudaStatus = cudaMalloc(&m_d_ptrMatrices, ptr_matrices_bytes);
+
+ return (cudaStatus == cudaSuccess);
}
template<typename T>
@@ -840,6 +875,10 @@ inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
m_d_data,
m_numRows * m_numCols * m_numMats * sizeof(T),
cudaMemcpyDeviceToDevice));
+ gpuErrChk(cudaMemcpy(elsewhere.m_d_ptrMatrices,
+ m_d_ptrMatrices,
+ m_numMats * sizeof(T*),
+ cudaMemcpyDeviceToDevice));
}
template<>
@@ -855,7 +894,7 @@ DTensor<T> &DTensor<T>::operator=(const DTensor<T> &other) {
m_numMats = other.m_numMats;
m_numRows = other.m_numRows;
m_numCols = other.m_numCols;
- m_doDestroy = false;
+ m_doDestroyData = false;
m_d_data = other.m_d_data;
return *this;
}
@@ -929,17 +968,17 @@ inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<double> &B,
- DTensor<double *> ptrA = A.pointersToMatrices();
- DTensor<double *> ptrB = B.pointersToMatrices();
- DTensor<double *> ptr = pointersToMatrices();
double _alpha = alpha, _beta = beta;
+ double** ptrAraw = A.m_d_ptrMatrices;
+ double** ptrBraw = B.m_d_ptrMatrices;
+ double** ptrRaw = m_d_ptrMatrices;
gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
CUBLAS_OP_N, CUBLAS_OP_N,
nRA, nCB, nCA, &_alpha,
- ptrA.raw(), nRA,
- ptrB.raw(), nCA,
+ ptrAraw, nRA,
+ ptrBraw, nCA,
&_beta,
- ptr.raw(), nRA,
+ ptrRaw, nRA,
nMat));
}
@@ -949,17 +988,17 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float> &B,
size_t nRA = A.numRows();
size_t nCA = A.numCols();
size_t nCB = B.numCols();
- DTensor<float *> ptrA = A.pointersToMatrices();
- DTensor<float *> ptrB = B.pointersToMatrices();
- DTensor<float *> ptr = pointersToMatrices();
+ float** ptrAraw = A.m_d_ptrMatrices;
+ float** ptrBraw = B.m_d_ptrMatrices;
+ float** ptrRaw = m_d_ptrMatrices;
float _alpha = alpha, _beta = beta;
gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
CUBLAS_OP_N, CUBLAS_OP_N,
nRA, nCB, nCA, &_alpha,
- ptrA.raw(), nRA,
- ptrB.raw(), nCA,
+ ptrAraw, nRA,
+ ptrBraw, nCA,
&_beta,
- ptr.raw(), nRA,
+ ptrRaw, nRA,
nMat));
}
From 9b17bb2ca9a14e9a306d8166c2e260fb704bfe27 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 17:24:04 +0000
Subject: [PATCH 3/7] DTensor/slicing constructor: warning in docs
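The cost of slicing now depends on the axis: axis=2 slices reuse the
parent's preallocated pointer array, while axis=0 and axis=1 slices pay
one small cudaMalloc plus one host-to-device copy. A usage sketch,
assuming the constructor defaults n = k = 1 (shapes are arbitrary):

    DTensor<double> A(4, 5);          // one 4x5 matrix
    DTensor<double> cols(A, 1, 1, 3); // axis=1: columns 1..3; allocates one double*
    DTensor<double> Q(10, 10, 8);
    DTensor<double> mid(Q, 2, 2, 5);  // axis=2: matrices 2..5; no new allocation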
---
include/tensor.cuh | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 3e2cfb7..8631a2d 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -305,6 +305,10 @@ public:
* @param axis axis to slice (0=rows, 1=columns, 2=matrices)
* @param from index to slice axis from (zero-indexed)
* @param to index to slice axis to (inclusive)
+ *
+ * @warning If axis=0 or axis=1, this method will (i) allocate memory on the GPU
+ * to store one element of type T* (pointer-to-T) and (ii) transfer one such
+ * element from the host to the device.
*/
DTensor(const DTensor &other, size_t axis, size_t from, size_t to);
From 452a6ca3b954e9b2bffef374127682c79040f2cc Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 19:02:17 +0000
Subject: [PATCH 4/7] DTensor::addAB: use gemm instead of gemmBatched when
nMats=1
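For a single matrix the pointer-array indirection buys nothing, so the
batched kernel is now reserved for nMat > 1. A call-site sketch, assuming
the declaration's default scaling factors alpha = 1 and beta = 0:

    DTensor<double> A(40, 60);          // numMats() == 1
    DTensor<double> B(60, 50);
    DTensor<double> C(40, 50, 1, true); // zero-initialised result
    C.addAB(A, B);                      // dispatches to cublasDgemm, not the batched call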
---
include/tensor.cuh | 73 ++++++++++++++++++++++++++++------------------
1 file changed, 44 insertions(+), 29 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 8631a2d..9134c12 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -181,7 +181,7 @@ class DTensor {
private:
T *m_d_data = nullptr; ///< Pointer to device data
- T** m_d_ptrMatrices = nullptr; ///< Pointer to matrices in tensor
+ T **m_d_ptrMatrices = nullptr; ///< Pointer to matrices in tensor
size_t m_numRows = 0; ///< Number of rows
size_t m_numCols = 0; ///< Number of columns
size_t m_numMats = 0; ///< Number of matrices
@@ -195,7 +195,7 @@ private:
for (size_t i = 1; i < m_numMats; i++) {
h_pointers[i] = m_d_data + i * numelMat;
}
- size_t buffer_size = m_numMats * sizeof(T*);
+ size_t buffer_size = m_numMats * sizeof(T *);
gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers.data(), buffer_size, cudaMemcpyHostToDevice));
m_doDestroyPointersToMatrices = true;
}
@@ -205,7 +205,7 @@ private:
if (m_d_data) gpuErrChk(cudaFree(m_d_data));
m_d_data = nullptr;
}
- if (m_doDestroyPointersToMatrices){
+ if (m_doDestroyPointersToMatrices) {
if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
m_d_ptrMatrices = nullptr;
}
@@ -555,6 +555,7 @@ void DTensor<T>::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats
m_numRows = newNumRows;
m_numCols = newNumCols;
m_numMats = newNumMats;
+ // TODO recreate m_d_ptrMatrices to match the new shape
}
template<typename T>
@@ -615,8 +616,8 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
m_doDestroyData = false;
if (axis == 1 || axis == 0) {
gpuErrChk(cudaMalloc(&m_d_ptrMatrices, sizeof(T *))); // allocate memory for an array of one T* element
- T* h_pointers[1] = {m_d_data}; // transfer datum to device
- gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers, sizeof(T*), cudaMemcpyHostToDevice));
+ T *h_pointers[1] = {m_d_data}; // transfer datum to device
+ gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers, sizeof(T *), cudaMemcpyHostToDevice));
m_doDestroyPointersToMatrices = true; // make sure to free memory later
}
}
@@ -799,7 +800,7 @@ inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
if (zero) gpuErrChk(cudaMemset(m_d_data, 0, data_size_bytes)); // set to zero all elements
/* Allocate memory for m_d_ptrMatrices */
- size_t ptr_matrices_bytes = m_numMats * sizeof(T*);
+ size_t ptr_matrices_bytes = m_numMats * sizeof(T *);
cudaStatus = cudaMalloc(&m_d_ptrMatrices, ptr_matrices_bytes);
return (cudaStatus == cudaSuccess);
@@ -881,7 +882,7 @@ inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
cudaMemcpyDeviceToDevice));
gpuErrChk(cudaMemcpy(elsewhere.m_d_ptrMatrices,
m_d_ptrMatrices,
- m_numMats * sizeof(T*),
+ m_numMats * sizeof(T *),
cudaMemcpyDeviceToDevice));
}
@@ -973,17 +974,24 @@ inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<double> &B,
size_t nRA = A.numRows();
size_t nCA = A.numCols();
size_t nCB = B.numCols();
double _alpha = alpha, _beta = beta;
- double** ptrAraw = A.m_d_ptrMatrices;
- double** ptrBraw = B.m_d_ptrMatrices;
- double** ptrRaw = m_d_ptrMatrices;
- gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
- CUBLAS_OP_N, CUBLAS_OP_N,
- nRA, nCB, nCA, &_alpha,
- ptrAraw, nRA,
- ptrBraw, nCA,
- &_beta,
- ptrRaw, nRA,
- nMat));
+ if (nMat > 1) {
+ gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
+ CUBLAS_OP_N, CUBLAS_OP_N,
+ nRA, nCB, nCA, &_alpha,
+ A.m_d_ptrMatrices, nRA,
+ B.m_d_ptrMatrices, nCA,
+ &_beta,
+ m_d_ptrMatrices, nRA,
+ nMat));
+ } else {
+ gpuErrChk(cublasDgemm(Session::getInstance().cuBlasHandle(),
+ CUBLAS_OP_N, CUBLAS_OP_N,
+ nRA, nCB, nCA, &_alpha,
+ A.raw(), nRA,
+ B.raw(), nCA,
+ &_beta,
+ raw(), nRA));
+ }
}
template<>
@@ -992,18 +1000,25 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float> &B,
size_t nRA = A.numRows();
size_t nCA = A.numCols();
size_t nCB = B.numCols();
- float** ptrAraw = A.m_d_ptrMatrices;
- float** ptrBraw = B.m_d_ptrMatrices;
- float** ptrRaw = m_d_ptrMatrices;
float _alpha = alpha, _beta = beta;
- gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
- CUBLAS_OP_N, CUBLAS_OP_N,
- nRA, nCB, nCA, &_alpha,
- ptrAraw, nRA,
- ptrBraw, nCA,
- &_beta,
- ptrRaw, nRA,
- nMat));
+ if (nMat > 1) {
+ gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
+ CUBLAS_OP_N, CUBLAS_OP_N,
+ nRA, nCB, nCA, &_alpha,
+ A.m_d_ptrMatrices, nRA,
+ B.m_d_ptrMatrices, nCA,
+ &_beta,
+ m_d_ptrMatrices, nRA,
+ nMat));
+ } else {
+ gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(),
+ CUBLAS_OP_N, CUBLAS_OP_N,
+ nRA, nCB, nCA, &_alpha,
+ A.raw(), nRA,
+ B.raw(), nCA,
+ &_beta,
+ raw(), nRA));
+ }
}
template<>
From 0d3bfc021695025a86ac8163c1c8ad8fb2e2537d Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 19:26:12 +0000
Subject: [PATCH 5/7] DTensor::allocateOnDevice: set
m_doDestroyPointersToMatrices
Fix a memory-ownership issue: the slicing constructor no longer allocates
an auxiliary array, and allocateOnDevice now marks m_d_ptrMatrices for
destruction.
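Setting the flag at the allocation site keeps ownership of the auxiliary
array independent of whether makePointersToMatrices runs afterwards. A
sketch of the intended lifetime (shapes arbitrary):

    {
        DTensor<float> x(8, 8, 16); // cudaMalloc: data buffer + 16 float* entries
    }                               // ~DTensor -> destroy() frees both buffers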
---
include/tensor.cuh | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 9134c12..4f59982 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -613,12 +613,11 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
m_numMats = 1;
}
m_d_data = other.m_d_data + offset;
- m_doDestroyData = false;
- if (axis == 1 || axis == 0) {
- gpuErrChk(cudaMalloc(&m_d_ptrMatrices, sizeof(T *))); // allocate memory for an array of one T* element
- T *h_pointers[1] = {m_d_data}; // transfer datum to device
- gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers, sizeof(T *), cudaMemcpyHostToDevice));
- m_doDestroyPointersToMatrices = true; // make sure to free memory later
+ m_doDestroyData = false; // no new memory allocated!
+ m_doDestroyPointersToMatrices = false; // no new auxiliary memory allocated!
+ if (axis != 2) {
+ // m_d_ptrMatrices is not needed for vectors and matrices
+ m_d_ptrMatrices = nullptr;
}
}
@@ -802,6 +801,7 @@ inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
/* Allocate memory for m_d_ptrMatrices */
size_t ptr_matrices_bytes = m_numMats * sizeof(T *);
cudaStatus = cudaMalloc(&m_d_ptrMatrices, ptr_matrices_bytes);
+ m_doDestroyPointersToMatrices = true;
return (cudaStatus == cudaSuccess);
}
@@ -880,10 +880,12 @@ inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
m_d_data,
m_numRows * m_numCols * m_numMats * sizeof(T),
cudaMemcpyDeviceToDevice));
- gpuErrChk(cudaMemcpy(elsewhere.m_d_ptrMatrices,
- m_d_ptrMatrices,
- m_numMats * sizeof(T *),
- cudaMemcpyDeviceToDevice));
+ if (m_d_ptrMatrices) {
+ gpuErrChk(cudaMemcpy(elsewhere.m_d_ptrMatrices,
+ m_d_ptrMatrices,
+ m_numMats * sizeof(T *),
+ cudaMemcpyDeviceToDevice));
+ }
}
template<>
From 82af79f4ed827183c87a1593e996a10663c94474 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 19:42:34 +0000
Subject: [PATCH 6/7] DTensor::destroy: reset ownership flags
Capture the return value before the flags are cleared, so destroy()
still reports whether any memory was actually freed, and reset the
flags so a repeated call is a no-op.
---
include/tensor.cuh | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 4f59982..050d5d1 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -201,15 +201,18 @@ private:
}
bool destroy() {
+ bool willDestroy = m_doDestroyData || m_doDestroyPointersToMatrices;
if (m_doDestroyData) {
if (m_d_data) gpuErrChk(cudaFree(m_d_data));
m_d_data = nullptr;
+ m_doDestroyData = false;
}
if (m_doDestroyPointersToMatrices) {
if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
m_d_ptrMatrices = nullptr;
+ m_doDestroyPointersToMatrices = false;
}
- return m_doDestroyData || m_doDestroyPointersToMatrices;
+ return willDestroy;
}
/**
From b7ecb3f13158c289e57a7de0761e026a3c083276 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 6 Nov 2024 20:02:12 +0000
Subject: [PATCH 7/7] Get rid of pointersToMatrices
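External callers that previously materialised a DTensor of pointers via
pointersToMatrices() can now read the preallocated device array through
the new ptrMatrices() accessor. A sketch of a batched call, assuming a
valid cuBLAS handle and double tensors A, B, C of compatible shapes:

    double alpha = 1.0, beta = 0.0;
    gpuErrChk(cublasDgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                 (int) A.numRows(), (int) B.numCols(), (int) A.numCols(),
                                 &alpha,
                                 A.ptrMatrices(), (int) A.numRows(),
                                 B.ptrMatrices(), (int) A.numCols(),
                                 &beta,
                                 C.ptrMatrices(), (int) C.numRows(),
                                 (int) A.numMats()));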
---
include/tensor.cuh | 71 +++++++++++++++++++++-------------------------
test/testTensor.cu | 23 ---------------
2 files changed, 32 insertions(+), 62 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 050d5d1..8007209 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -205,12 +205,10 @@ private:
if (m_doDestroyData) {
if (m_d_data) gpuErrChk(cudaFree(m_d_data));
m_d_data = nullptr;
- m_doDestroyData = false;
}
if (m_doDestroyPointersToMatrices) {
if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
m_d_ptrMatrices = nullptr;
- m_doDestroyPointersToMatrices = false;
}
return willDestroy;
}
@@ -320,6 +318,10 @@ public:
*/
T *raw() const;
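+ /**
+ * @return raw device array of pointers to the matrices of this tensor
+ * (nullptr for row- and column-sliced tensors, which do not carry one)
+ */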
+ T **ptrMatrices() const {
+ return m_d_ptrMatrices;
+ }
+
/**
* @return number of rows
*/
@@ -842,6 +844,7 @@ inline T *DTensor<T>::raw() const {
return m_d_data;
}
+
template<>
inline DTensor<double> DTensor<double>::tr() const {
DTensor<double> transposes(m_numCols, m_numRows, m_numMats);
@@ -960,17 +963,17 @@ inline T DTensor<T>::operator()(size_t i, size_t j, size_t k) const {
return hostDst;
}
-template<typename T>
-inline DTensor<T *> DTensor<T>::pointersToMatrices() const {
- std::vector<T *> h_pointers(m_numMats);
- size_t numelMat = m_numRows * m_numCols;
- h_pointers[0] = m_d_data;
- for (size_t i = 1; i < m_numMats; i++) {
- h_pointers[i] = m_d_data + i * numelMat;
- }
- DTensor<T *> t(h_pointers, m_numMats, 1, 1);
- return t;
-}
+//template<typename T>
+//inline DTensor<T *> DTensor<T>::pointersToMatrices() const {
+// std::vector<T *> h_pointers(m_numMats);
+// size_t numelMat = m_numRows * m_numCols;
+// h_pointers[0] = m_d_data;
+// for (size_t i = 1; i < m_numMats; i++) {
+// h_pointers[i] = m_d_data + i * numelMat;
+// }
+// DTensor<T *> t(h_pointers, m_numMats, 1, 1);
+// return t;
+//}
template<>
inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<double> &B, double alpha, double beta) {
@@ -1017,12 +1020,12 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float> &B,
nMat));
} else {
gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(),
- CUBLAS_OP_N, CUBLAS_OP_N,
- nRA, nCB, nCA, &_alpha,
- A.raw(), nRA,
- B.raw(), nCA,
- &_beta,
- raw(), nRA));
+ CUBLAS_OP_N, CUBLAS_OP_N,
+ nRA, nCB, nCA, &_alpha,
+ A.raw(), nRA,
+ B.raw(), nCA,
+ &_beta,
+ raw(), nRA));
}
}
@@ -1040,16 +1043,14 @@ inline void DTensor<double>::leastSquaresBatched(DTensor<double> &B) {
throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
int info = 0;
DTensor<int> infoArray(batchSize);
- DTensor<double *> As = pointersToMatrices();
- DTensor<double *> Bs = B.pointersToMatrices();
gpuErrChk(cublasDgelsBatched(Session::getInstance().cuBlasHandle(),
CUBLAS_OP_N,
m_numRows,
m_numCols,
nColsB,
- As.raw(),
+ m_d_ptrMatrices,
m_numRows,
- Bs.raw(),
+ B.m_d_ptrMatrices,
m_numRows,
&info,
infoArray.raw(),
@@ -1070,16 +1071,14 @@ inline void DTensor<float>::leastSquaresBatched(DTensor<float> &B) {
throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
int info = 0;
DTensor<int> infoArray(batchSize);
- DTensor<float *> As = pointersToMatrices();
- DTensor<float *> Bs = B.pointersToMatrices();
gpuErrChk(cublasSgelsBatched(Session::getInstance().cuBlasHandle(),
CUBLAS_OP_N,
m_numRows,
m_numCols,
nColsB,
- As.raw(),
+ m_d_ptrMatrices,
m_numRows,
- Bs.raw(),
+ B.m_d_ptrMatrices,
m_numRows,
&info,
infoArray.raw(),
@@ -1839,11 +1838,10 @@ public:
template<>
inline void CholeskyBatchFactoriser<double>::factorise() {
if (m_factorisationDone) return;
- DTensor<double *> ptrA = m_matrix->pointersToMatrices();
gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
CUBLAS_FILL_MODE_LOWER,
m_numRows,
- ptrA.raw(),
+ m_matrix->ptrMatrices(),
m_numRows,
m_deviceInfo->raw(),
m_numMats));
@@ -1853,11 +1851,10 @@ inline void CholeskyBatchFactoriser<double>::factorise() {
template<>
inline void CholeskyBatchFactoriser<float>::factorise() {
if (m_factorisationDone) return;
- DTensor<float *> ptrA = m_matrix->pointersToMatrices();
gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
CUBLAS_FILL_MODE_LOWER,
m_numRows,
- ptrA.raw(),
+ m_matrix->ptrMatrices(),
m_numRows,
m_deviceInfo->raw(),
m_numMats));
@@ -1871,15 +1868,13 @@ inline void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
}
if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
- DTensor<double *> ptrA = m_matrix->pointersToMatrices();
- DTensor<double *> ptrB = b.pointersToMatrices();
gpuErrChk(cusolverDnDpotrsBatched(Session::getInstance().cuSolverHandle(),
CUBLAS_FILL_MODE_LOWER,
m_numRows,
1, ///< only supports rhs = 1
- ptrA.raw(),
+ m_matrix->ptrMatrices(),
m_numRows,
- ptrB.raw(),
+ b.ptrMatrices(),
m_numRows,
m_deviceInfo->raw(),
m_numMats));
@@ -1892,15 +1887,13 @@ inline void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
}
if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
- DTensor<float *> ptrA = m_matrix->pointersToMatrices();
- DTensor<float *> ptrB = b.pointersToMatrices();
gpuErrChk(cusolverDnSpotrsBatched(Session::getInstance().cuSolverHandle(),
CUBLAS_FILL_MODE_LOWER,
m_numRows,
1, ///< only supports rhs = 1
- ptrA.raw(),
+ m_matrix->ptrMatrices(),
m_numRows,
- ptrB.raw(),
+ b.ptrMatrices(),
m_numRows,
m_deviceInfo->raw(),
m_numMats));
diff --git a/test/testTensor.cu b/test/testTensor.cu
index 0a33ffc..3cc7580 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -635,29 +635,6 @@ TEST_F(TensorTest, tensorMinusTensor) {
tensorMinusTensor();
}
-/* ---------------------------------------
- * Tensor: pointers to matrices (on device)
- * --------------------------------------- */
-
-TEMPLATE_WITH_TYPE_T
-void tensorPointersToMatrices() {
- std::vector<T> dataA = TENSOR_DATA_234A;
- DTensor<T> A(dataA, 2, 3, 4);
- DTensor<T*> pointers = A.pointersToMatrices();
- EXPECT_EQ(4, pointers.numRows());
- EXPECT_EQ(1, pointers.numCols());
- EXPECT_EQ(1, pointers.numMats());
- T *p1 = pointers(1, 0, 0); // pointer to matrix #1
- T hostDst; // let's see what's there...
- cudaMemcpy(&hostDst, p1, sizeof(T), cudaMemcpyDeviceToHost);
- EXPECT_EQ(dataA[6], hostDst);
-}
-
-TEST_F(TensorTest, tensorPointersToMatrices) {
- tensorPointersToMatrices<float>();
- tensorPointersToMatrices<double>();
- tensorPointersToMatrices<int>();
-}
/* ---------------------------------------
* Tensor: C = AB