From aa1317b6f25213016b6f6dbaa208086b9b53145a Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 01:06:30 +0000
Subject: [PATCH 01/17] Create pointer-to-matrices memory (m_d_ptrMatrices)

Allocate and free the device memory behind m_d_ptrMatrices.
No memory leaks are evident at the moment.
---
 include/tensor.cuh | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 0a8d1d1..00e5068 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -181,16 +181,22 @@ class DTensor {
 
 private:
     T *m_d_data = nullptr;  ///< Pointer to device data
+    T **m_d_ptrMatrices = nullptr; ///< Pointer to matrices in tensor
     size_t m_numRows = 0;  ///< Number of rows
     size_t m_numCols = 0;  ///< Number of columns
     size_t m_numMats = 0;  ///< Number of matrices
-    bool m_doDestroy = false;  ///< Whether to destroy memory
+    bool m_doDestroyData = false;  ///< Whether to destroy memory
+    bool m_doDestroyPtrMatrices = false;  ///< Whether to destroy memory
 
-    bool destroy() {
-        if (!m_doDestroy) return false;
-        if (m_d_data) cudaFree(m_d_data);
-        m_d_data = nullptr;
-        return true;
+    void destroy() {
+        if (m_doDestroyData) {
+            if (m_d_data) gpuErrChk(cudaFree(m_d_data));
+            m_d_data = nullptr;
+        }
+        if (m_doDestroyPtrMatrices) {
+            if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
+            m_d_ptrMatrices = nullptr;
+        }
     }
 
     /**
@@ -586,7 +592,7 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
         m_numMats = 1;
     }
     m_d_data = other.m_d_data + offset;
-    m_doDestroy = false;
+    m_doDestroyData = false;
 }
 
 template<typename T>
@@ -595,8 +601,8 @@ DTensor<T>::DTensor(DTensor<T> &&other) {
     m_numRows = other.m_numRows;
     m_numMats = other.m_numMats;
     m_d_data = other.m_d_data;
-    m_doDestroy = true;
-    other.m_doDestroy = false;
+    m_doDestroyData = true;
+    other.m_doDestroyData = false;
     other.m_d_data = nullptr;
     other.m_numCols = 0;
     other.m_numRows = 0;
@@ -757,12 +763,16 @@ template<typename T>
 inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
     if (size <= 0) return false;
     destroy();
-    m_doDestroy = true;
+    m_doDestroyData = true;
     size_t buffer_size = size * sizeof(T);
     bool cudaStatus = cudaMalloc(&m_d_data, buffer_size);
     if (cudaStatus != cudaSuccess) return false;
     if (zero) gpuErrChk(cudaMemset(m_d_data, 0, buffer_size)); // set to zero all elements
-    return true;
+
+    m_doDestroyPtrMatrices = true;
+    cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T*));
+
+    return (cudaStatus != cudaSuccess);
 }
 
 template<typename T>
@@ -854,7 +864,7 @@ DTensor<T> &DTensor<T>::operator=(const DTensor<T> &other) {
     m_numMats = other.m_numMats;
     m_numRows = other.m_numRows;
     m_numCols = other.m_numCols;
-    m_doDestroy = false;
+    m_doDestroyData = false;
     m_d_data = other.m_d_data;
     return *this;
 }

From 2ac61cf073fdcecd4229aac49c774c0e3a01dddf Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 01:18:25 +0000
Subject: [PATCH 02/17] Initialise m_d_ptrMatrices

Introduce initialisePointersToMatricesData to initialise
m_d_ptrMatrices; add appropriate safety checks.
---
 include/tensor.cuh | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 00e5068..1be39a9 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -42,8 +42,7 @@ static std::random_device RND_DEVICE;
  * @param hi
  * @return
  */
-TEMPLATE_WITH_TYPE_T
-TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
 std::vector<T> generateRealRandomVector(size_t n, T low, T hi) {
     std::mt19937_64 mersenne_engine(RND_DEVICE());
     std::uniform_real_distribution<T> dist(low, hi);
@@ -231,6 +230,23 @@ private:
      */
     std::ostream &print(std::ostream &out) const;
 
+    void initialisePointersToMatricesData() {
+        /* Make sure m_d_ptrMatrices has been allocated */
+        if (!m_d_ptrMatrices || !m_doDestroyPtrMatrices) {
+            throw std::runtime_error("Unallocated memory (m_d_ptrMatrices)");
+        }
+        /* Host-based vector of pointers */
+        std::vector<T *> h_pointers(m_numMats);
+        size_t numelMat = m_numRows * m_numCols;
+        h_pointers[0] = m_d_data;
+        for (size_t i = 1; i < m_numMats; i++) {
+            h_pointers[i] = m_d_data + i * numelMat;
+        }
+        /* Upload data to m_d_ptrMatrices */
+        size_t buffer_size = m_numMats * sizeof(T *);
+        gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers.data(), buffer_size, cudaMemcpyHostToDevice));
+    }
+
 public:
 
     /**
@@ -770,7 +786,10 @@ inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
     if (zero) gpuErrChk(cudaMemset(m_d_data, 0, buffer_size)); // set to zero all elements
 
     m_doDestroyPtrMatrices = true;
-    cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T*));
+    cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T *));
+
+    /* Initialise m_d_ptrMatrices */
+    initialisePointersToMatricesData();
 
     return (cudaStatus != cudaSuccess);
 }

From 3338852def77ee30e7923f7a0a42f0a4394e3c94 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 01:24:28 +0000
Subject: [PATCH 03/17] Use initialisePointersToMatricesData to initialise
 m_d_ptrMatrices

Also, in the slice constructor, keep m_d_ptrMatrices = nullptr
when we do not slice along axis = 2.
---
 include/tensor.cuh | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 1be39a9..8ef0d90 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -564,6 +564,8 @@ DTensor<T>::DTensor(size_t m, size_t n, size_t k, bool zero) {
     m_numMats = k;
     size_t size = m * n * k;
     allocateOnDevice(size, zero);
+    /* Initialise m_d_ptrMatrices */
+    initialisePointersToMatricesData();
 }
 
 template<typename T>
@@ -574,6 +576,8 @@ DTensor<T>::DTensor(const std::vector<T> &data, size_t m, size_t n, size_t k, St
     size_t size = m * n * k;
     allocateOnDevice(size);
     upload(data, mode);
+    /* Initialise m_d_ptrMatrices */
+    initialisePointersToMatricesData();
 }
 
 template<typename T>
@@ -585,6 +589,8 @@ DTensor<T>::DTensor(const DTensor<T> &other) {
     allocateOnDevice(m_numRows * m_numCols * m_numMats);
     gpuErrChk(cudaMemcpy(m_d_data, other.raw(), m_numRows * m_numCols * m_numMats * sizeof(T),
                          cudaMemcpyDeviceToDevice));
+    /* Initialise m_d_ptrMatrices */
+    initialisePointersToMatricesData();
 }
 
 template<typename T>
@@ -609,6 +615,11 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
     }
     m_d_data = other.m_d_data + offset;
     m_doDestroyData = false;
+    m_doDestroyPtrMatrices = false;
+    if (axis != 2) {
+        // m_d_ptrMatrices is not needed for vectors and matrices
+        m_d_ptrMatrices = nullptr;
+    }
 }
 
 template<typename T>
@@ -788,9 +799,6 @@ inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
     m_doDestroyPtrMatrices = true;
     cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T *));
 
-    /* Initialise m_d_ptrMatrices */
-    initialisePointersToMatricesData();
-
     return (cudaStatus != cudaSuccess);
 }
 

From 36bf6ed6c60b9157b915f8fc651824cbb547df1e Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 01:35:41 +0000
Subject: [PATCH 04/17] update addAB to use preallocated memory

Use the non-batched cublasDgemm when nMat = 1.
---
 include/tensor.cuh | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 8ef0d90..65ebdd5 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -624,12 +624,18 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
 
 template<typename T>
 DTensor<T>::DTensor(DTensor<T> &&other) {
+    /* Steal everything from other */
     m_numCols = other.m_numCols;
     m_numRows = other.m_numRows;
     m_numMats = other.m_numMats;
     m_d_data = other.m_d_data;
-    m_doDestroyData = true;
+    m_doDestroyData = other.m_doDestroyData;
+    m_doDestroyPtrMatrices = other.m_doDestroyPtrMatrices;
+    m_d_ptrMatrices = other.m_d_ptrMatrices;
+    /* Invalidate other */
+    other.m_doDestroyPtrMatrices = false;
     other.m_doDestroyData = false;
+    other.m_d_ptrMatrices = nullptr;
     other.m_d_data = nullptr;
     other.m_numCols = 0;
     other.m_numRows = 0;
@@ -965,18 +971,25 @@ inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<doubl
     size_t nRA = A.numRows();
     size_t nCA = A.numCols();
     size_t nCB = B.numCols();
-    DTensor<double *> ptrA = A.pointersToMatrices();
-    DTensor<double *> ptrB = B.pointersToMatrices();
-    DTensor<double *> ptr = pointersToMatrices();
     double _alpha = alpha, _beta = beta;
-    gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
-                                 CUBLAS_OP_N, CUBLAS_OP_N,
-                                 nRA, nCB, nCA, &_alpha,
-                                 ptrA.raw(), nRA,
-                                 ptrB.raw(), nCA,
-                                 &_beta,
-                                 ptr.raw(), nRA,
-                                 nMat));
+    if (nMat > 1) {
+        gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
+                                     CUBLAS_OP_N, CUBLAS_OP_N,
+                                     nRA, nCB, nCA, &_alpha,
+                                     A.m_d_ptrMatrices, nRA,
+                                     B.m_d_ptrMatrices, nCA,
+                                     &_beta,
+                                     m_d_ptrMatrices, nRA,
+                                     nMat));
+    } else {
+        gpuErrChk(cublasDgemm(Session::getInstance().cuBlasHandle(),
+                              CUBLAS_OP_N, CUBLAS_OP_N,
+                              nRA, nCB, nCA, &_alpha,
+                              A.raw(), nRA,
+                              B.raw(), nCA,
+                              &_beta,
+                              raw(), nRA));
+    }
 }
 
 template<>

From 4e5da4dd952be5a881ddb094e35c8796f6f2e770 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 01:39:25 +0000
Subject: [PATCH 05/17] mirror implementation for DTensor<float>::addAB

---
 include/tensor.cuh | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 65ebdd5..3d650d4 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -998,18 +998,25 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float>
     size_t nRA = A.numRows();
     size_t nCA = A.numCols();
     size_t nCB = B.numCols();
-    DTensor<float *> ptrA = A.pointersToMatrices();
-    DTensor<float *> ptrB = B.pointersToMatrices();
-    DTensor<float *> ptr = pointersToMatrices();
     float _alpha = alpha, _beta = beta;
-    gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
-                                 CUBLAS_OP_N, CUBLAS_OP_N,
-                                 nRA, nCB, nCA, &_alpha,
-                                 ptrA.raw(), nRA,
-                                 ptrB.raw(), nCA,
-                                 &_beta,
-                                 ptr.raw(), nRA,
-                                 nMat));
+    if (nMat > 1) {
+        gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
+                                     CUBLAS_OP_N, CUBLAS_OP_N,
+                                     nRA, nCB, nCA, &_alpha,
+                                     A.m_d_ptrMatrices, nRA,
+                                     B.m_d_ptrMatrices, nCA,
+                                     &_beta,
+                                     m_d_ptrMatrices, nRA,
+                                     nMat));
+    } else {
+        gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(),
+                              CUBLAS_OP_N, CUBLAS_OP_N,
+                              nRA, nCB, nCA, &_alpha,
+                              A.raw(), nRA,
+                              B.raw(), nCA,
+                              &_beta,
+                              raw(), nRA));
+    }
 }
 
 template<>

From 296efd4b18b6cc68339462d6d953fdfbd59fbbea Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 01:46:59 +0000
Subject: [PATCH 06/17] Get rid of pointersToMatrices() in leastSquaresBatched

---
 include/tensor.cuh | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 3d650d4..465eacd 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -359,6 +359,7 @@ public:
      * Creates a vector of pointers to the matrices of this tensor.
      * The vector is an (n,1,1)-tensor, where n is the number of matrices in this tensor.
      * @return vector of pointers to the first element of each matrix
+     * @deprecated
      */
     DTensor<T *> pointersToMatrices() const;
 
@@ -1032,17 +1033,15 @@ inline void DTensor<double>::leastSquaresBatched(DTensor &B) {
     if (m_numCols > m_numRows)
         throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
     int info = 0;
-    DTensor<int> infoArray(batchSize);
-    DTensor<double *> As = pointersToMatrices();
-    DTensor<double *> Bs = B.pointersToMatrices();
+    DTensor<int> infoArray(batchSize); // TODO consider preallocating?
     gpuErrChk(cublasDgelsBatched(Session::getInstance().cuBlasHandle(),
                                  CUBLAS_OP_N,
                                  m_numRows,
                                  m_numCols,
                                  nColsB,
-                                 As.raw(),
+                                 m_d_ptrMatrices,
                                  m_numRows,
-                                 Bs.raw(),
+                                 B.m_d_ptrMatrices,
                                  m_numRows,
                                  &info,
                                  infoArray.raw(),
@@ -1062,17 +1061,15 @@ inline void DTensor<float>::leastSquaresBatched(DTensor &B) {
     if (m_numCols > m_numRows)
         throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
     int info = 0;
-    DTensor<int> infoArray(batchSize);
-    DTensor<float *> As = pointersToMatrices();
-    DTensor<float *> Bs = B.pointersToMatrices();
+    DTensor<int> infoArray(batchSize); // TODO consider preallocating?
     gpuErrChk(cublasSgelsBatched(Session::getInstance().cuBlasHandle(),
                                  CUBLAS_OP_N,
                                  m_numRows,
                                  m_numCols,
                                  nColsB,
-                                 As.raw(),
+                                 m_d_ptrMatrices,
                                  m_numRows,
-                                 Bs.raw(),
+                                 B.m_d_ptrMatrices,
                                  m_numRows,
                                  &info,
                                  infoArray.raw(),

From 02564d57cb8de04850df8a4f961ecad7e3338ff5 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 01:53:38 +0000
Subject: [PATCH 07/17] New method DTensor<T>::ptrMatrices() exposes
 m_d_ptrMatrices

---
 include/tensor.cuh | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 465eacd..6f15916 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -316,6 +316,12 @@ public:
      */
     T *raw() const;
 
+    /**
+     * Pointers to matrices (on device)
+     * @return
+     */
+    T **ptrMatrices();
+
     /**
      * @return number of rows
      */
@@ -842,6 +848,12 @@ inline T *DTensor<T>::raw() const {
     return m_d_data;
 }
 
+template<typename T>
+inline T **DTensor<T>::ptrMatrices()  {
+    return m_d_ptrMatrices;
+}
+
+
 template<>
 inline DTensor<float> DTensor<float>::tr() const {
     DTensor<float> transposes(m_numCols, m_numRows, m_numMats);

From f2e86c79e274a3f022965e07008be071152154ed Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 01:57:44 +0000
Subject: [PATCH 08/17] Completely got rid of pointersToMatrices()

---
 include/tensor.cuh | 38 ++++++--------------------------------
 test/testTensor.cu | 24 ------------------------
 2 files changed, 6 insertions(+), 56 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 6f15916..9fc4703 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -361,14 +361,6 @@ public:
      */
     void deviceCopyTo(DTensor<T> &other) const;
 
-    /**
-     * Creates a vector of pointers to the matrices of this tensor.
-     * The vector is an (n,1,1)-tensor, where n is the number of matrices in this tensor.
-     * @return vector of pointers to the first element of each matrix
-     * @deprecated
-     */
-    DTensor<T *> pointersToMatrices() const;
-
     /**
      * Slices rows from specified matrix.
      * @param rowsFrom index to slice rows from (zero-indexed)
@@ -966,18 +958,6 @@ inline T DTensor<T>::operator()(size_t i, size_t j, size_t k) const {
     return hostDst;
 }
 
-template<typename T>
-inline DTensor<T *> DTensor<T>::pointersToMatrices() const {
-    std::vector<T *> h_pointers(m_numMats);
-    size_t numelMat = m_numRows * m_numCols;
-    h_pointers[0] = m_d_data;
-    for (size_t i = 1; i < m_numMats; i++) {
-        h_pointers[i] = m_d_data + i * numelMat;
-    }
-    DTensor<T *> t(h_pointers, m_numMats, 1, 1);
-    return t;
-}
-
 template<>
 inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<double> &B, double alpha, double beta) {
     size_t nMat = A.numMats();
@@ -1841,11 +1821,10 @@ public:
 template<>
 inline void CholeskyBatchFactoriser<double>::factorise() {
     if (m_factorisationDone) return;
-    DTensor<double *> ptrA = m_matrix->pointersToMatrices();
     gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
@@ -1855,11 +1834,10 @@ inline void CholeskyBatchFactoriser<double>::factorise() {
 template<>
 inline void CholeskyBatchFactoriser<float>::factorise() {
     if (m_factorisationDone) return;
-    DTensor<float *> ptrA = m_matrix->pointersToMatrices();
     gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
@@ -1873,15 +1851,13 @@ inline void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
         throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
     }
     if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
-    DTensor<double *> ptrA = m_matrix->pointersToMatrices();
-    DTensor<double *> ptrB = b.pointersToMatrices();
     gpuErrChk(cusolverDnDpotrsBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
                                       1,  ///< only supports rhs = 1
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
-                                      ptrB.raw(),
+                                      b.ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
@@ -1894,15 +1870,13 @@ inline void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
         throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
     }
     if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
-    DTensor<float *> ptrA = m_matrix->pointersToMatrices();
-    DTensor<float *> ptrB = b.pointersToMatrices();
     gpuErrChk(cusolverDnSpotrsBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
                                       1,  ///< only supports rhs = 1
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
-                                      ptrB.raw(),
+                                      b.ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
diff --git a/test/testTensor.cu b/test/testTensor.cu
index 0a33ffc..902a27c 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -635,30 +635,6 @@ TEST_F(TensorTest, tensorMinusTensor) {
     tensorMinusTensor<double>();
 }
 
-/* ---------------------------------------
- * Tensor: pointers to matrices (on device)
- * --------------------------------------- */
-
-TEMPLATE_WITH_TYPE_T
-void tensorPointersToMatrices() {
-    std::vector<T> dataA = TENSOR_DATA_234A;
-    DTensor<T> A(dataA, 2, 3, 4);
-    DTensor<T *> pointers = A.pointersToMatrices();
-    EXPECT_EQ(4, pointers.numRows());
-    EXPECT_EQ(1, pointers.numCols());
-    EXPECT_EQ(1, pointers.numMats());
-    T *p1 = pointers(1, 0, 0); // pointer to matrix #1
-    T hostDst; // let's see what's there...
-    cudaMemcpy(&hostDst, p1, sizeof(T), cudaMemcpyDeviceToHost);
-    EXPECT_EQ(dataA[6], hostDst);
-}
-
-TEST_F(TensorTest, tensorPointersToMatrices) {
-    tensorPointersToMatrices<float>();
-    tensorPointersToMatrices<double>();
-    tensorPointersToMatrices<int>();
-}
-
 /* ---------------------------------------
  * Tensor: C = AB
  * --------------------------------------- */

From 424a251a1d4008932e8791d1a0843b6575e6770e Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 02:36:51 +0000
Subject: [PATCH 09/17] No memory allocation for m_d_ptrMatrices unless nMats >
 1

---
 CHANGELOG.md       |  9 +++++++++
 include/tensor.cuh | 14 ++++++++++----
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e409628..4d19eb6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 
+<!-- ---------------------
+      v1.3.1
+     --------------------- -->
+## v1.3.1 - 8-11-2024
+
+### Fixed
+
+- Memory management improvements: we got rid of `pointerToMatrices`, which would unnecessarily allocate memory and `addAB` does not allocate any new memory internally.
+
 <!-- ---------------------
       v1.3.0
      --------------------- -->
diff --git a/include/tensor.cuh b/include/tensor.cuh
index 9fc4703..6cd64e1 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -232,8 +232,8 @@ private:
 
     void initialisePointersToMatricesData() {
         /* Make sure m_d_ptrMatrices has been allocated */
-        if (!m_d_ptrMatrices || !m_doDestroyPtrMatrices) {
-            throw std::runtime_error("Unallocated memory (m_d_ptrMatrices)");
+        if (m_numMats <= 1 | !m_d_ptrMatrices || !m_doDestroyPtrMatrices) {
+            return;
         }
         /* Host-based vector of pointers */
         std::vector<T *> h_pointers(m_numMats);
@@ -287,6 +287,7 @@ public:
      * @param n number of columns
      * @param k number of matrices
      */
+
     DTensor(const std::vector<T> &data, size_t m, size_t n = 1, size_t k = 1,
             StorageMode mode = StorageMode::defaultMajor);
 
@@ -553,6 +554,7 @@ void DTensor<T>::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats
     }
     m_numRows = newNumRows;
     m_numCols = newNumCols;
+    // TODO allocate or reallocate new memory
     m_numMats = newNumMats;
 }
 
@@ -801,8 +803,12 @@ inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
     if (cudaStatus != cudaSuccess) return false;
     if (zero) gpuErrChk(cudaMemset(m_d_data, 0, buffer_size)); // set to zero all elements
 
-    m_doDestroyPtrMatrices = true;
-    cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T *));
+    if (numMats() > 1) {
+        m_doDestroyPtrMatrices = true;
+        cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T *));
+    } else {
+        m_doDestroyPtrMatrices = false;
+    }
 
     return (cudaStatus != cudaSuccess);
 }

From b2857cd6c135c241ce655dc71422688b94c217b5 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 02:47:51 +0000
Subject: [PATCH 10/17] ptrMatrices now declared as a const method

---
 include/tensor.cuh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 6cd64e1..3c8c83c 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -287,7 +287,6 @@ public:
      * @param n number of columns
      * @param k number of matrices
      */
-
     DTensor(const std::vector<T> &data, size_t m, size_t n = 1, size_t k = 1,
             StorageMode mode = StorageMode::defaultMajor);
 
@@ -321,7 +320,7 @@ public:
      * Pointers to matrices (on device)
      * @return
      */
-    T **ptrMatrices();
+    T **ptrMatrices() const;
 
     /**
      * @return number of rows
@@ -847,7 +846,7 @@ inline T *DTensor<T>::raw() const {
 }
 
 template<typename T>
-inline T **DTensor<T>::ptrMatrices()  {
+inline T **DTensor<T>::ptrMatrices() const {
     return m_d_ptrMatrices;
 }
 

From 7c32dae4ca0a2e6fd91281225cdee35d66521ae4 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 13:55:36 +0000
Subject: [PATCH 11/17] Update DTensor<T>::reshape

When necessary, memory is reallocated for m_d_ptrMatrices
within reshape
---
 include/tensor.cuh | 64 ++++++++++++++++++++++++++++++----------------
 1 file changed, 42 insertions(+), 22 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 3c8c83c..ff35317 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -230,32 +230,24 @@ private:
      */
     std::ostream &print(std::ostream &out) const;
 
-    void initialisePointersToMatricesData() {
-        /* Make sure m_d_ptrMatrices has been allocated */
-        if (m_numMats <= 1 | !m_d_ptrMatrices || !m_doDestroyPtrMatrices) {
-            return;
-        }
-        /* Host-based vector of pointers */
-        std::vector<T *> h_pointers(m_numMats);
-        size_t numelMat = m_numRows * m_numCols;
-        h_pointers[0] = m_d_data;
-        for (size_t i = 1; i < m_numMats; i++) {
-            h_pointers[i] = m_d_data + i * numelMat;
-        }
-        /* Upload data to m_d_ptrMatrices */
-        size_t buffer_size = m_numMats * sizeof(T *);
-        gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers.data(), buffer_size, cudaMemcpyHostToDevice));
-    }
+    /**
+     * Initialises an array of pointers to the sub-matrices of the
+     * tensor (on the device). No allocation takes place if the tensor
+     * has only one matrix.
+     */
+    void initialisePointersToMatricesData();
 
 public:
 
     /**
      * Create a tensor with random elements
-     * @param numRows
-     * @param numCols
-     * @param numMats
-     * @param low
-     * @param hi
+     * @param numRows number of rows
+     * @param numCols number of columns
+     * @param numMats number of matrices
+     * @param low minimum value of random elements
+     * @param hi maximum value of random elements
+     *
+     * @throws std::invalid_argument if T is other than double, float, or int
      */
     static DTensor<T> createRandomTensor(size_t numRows, size_t numCols, size_t numMats, T low, T hi);
 
@@ -526,6 +518,24 @@ public:
 
 }; /* END OF DTENSOR */
 
+template<typename T>
+void DTensor<T>::initialisePointersToMatricesData() {
+    /* Make sure m_d_ptrMatrices has been allocated */
+    if (m_numMats <= 1 || !m_d_ptrMatrices || !m_doDestroyPtrMatrices) {
+        return;
+    }
+    /* Host-based vector of pointers */
+    std::vector<T *> h_pointers(m_numMats);
+    size_t numelMat = m_numRows * m_numCols;
+    h_pointers[0] = m_d_data;
+    for (size_t i = 1; i < m_numMats; i++) {
+        h_pointers[i] = m_d_data + i * numelMat;
+    }
+    /* Upload data to m_d_ptrMatrices */
+    size_t buffer_size = m_numMats * sizeof(T *);
+    gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers.data(), buffer_size, cudaMemcpyHostToDevice));
+}
+
 template<typename T>
 DTensor<T> DTensor<T>::createRandomTensor(size_t numRows, size_t numCols, size_t numMats, T low, T hi) {
     if constexpr (std::is_floating_point<T>::value) {
@@ -543,6 +553,7 @@ DTensor<T> DTensor<T>::createRandomTensor(size_t numRows, size_t numCols, size_t
 
 template<typename T>
 void DTensor<T>::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats) {
+    if (m_numRows == newNumRows && m_numCols == newNumCols && m_numMats == newNumMats) return;
     size_t newNumElements = newNumRows * newNumCols * newNumMats;
     if (numEl() != newNumElements) {
         char errMessage[256];
@@ -553,8 +564,17 @@ void DTensor<T>::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats
     }
     m_numRows = newNumRows;
     m_numCols = newNumCols;
-    // TODO allocate or reallocate new memory
     m_numMats = newNumMats;
+    /* Free the memory for m_d_ptrMatrices */
+    if (m_d_ptrMatrices && m_doDestroyPtrMatrices) {
+        gpuErrChk(cudaFree(m_d_ptrMatrices));
+        m_d_ptrMatrices = nullptr;
+    }
+    /* Reallocate memory for m_d_ptrMatrices, if necessary */
+    if (m_numMats > 1) {
+        gpuErrChk(cudaMalloc(&m_d_ptrMatrices, m_numMats * sizeof(T *)));
+    }
+    initialisePointersToMatricesData();
 }
 
 template<typename T>

From 225ba0da797f37cc4911efc9a33b99d721401d3d Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 14:03:26 +0000
Subject: [PATCH 12/17] Better memory management when reshaping

No need to free/reallocate when we can reuse the already
allocated memory space (when the number of matrices
decreases); no new memory allocation when numMats=1
---
 include/tensor.cuh | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index ff35317..44a448a 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -555,6 +555,7 @@ template<typename T>
 void DTensor<T>::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats) {
     if (m_numRows == newNumRows && m_numCols == newNumCols && m_numMats == newNumMats) return;
     size_t newNumElements = newNumRows * newNumCols * newNumMats;
+    /* Check whether dimensions are compatible */
     if (numEl() != newNumElements) {
         char errMessage[256];
         sprintf(errMessage,
@@ -562,18 +563,22 @@ void DTensor<T>::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats
                 numRows(), numRows(), numMats(), numEl(), newNumRows, newNumCols, newNumMats, newNumElements);
         throw std::invalid_argument(errMessage);
     }
+
+    /* Only free/reallocate if newNumMats > m_numMats
+     * otherwise, reuse the already allocated memory space */
+    if (newNumMats > m_numMats) {
+        /* Free the memory for m_d_ptrMatrices */
+        if (m_d_ptrMatrices && m_doDestroyPtrMatrices) {
+            gpuErrChk(cudaFree(m_d_ptrMatrices));
+            m_d_ptrMatrices = nullptr;
+        }
+        /* Reallocate memory for m_d_ptrMatrices, if necessary */
+        if (newNumMats > 1) gpuErrChk(cudaMalloc(&m_d_ptrMatrices, newNumMats * sizeof(T *)));
+    }
+
     m_numRows = newNumRows;
     m_numCols = newNumCols;
     m_numMats = newNumMats;
-    /* Free the memory for m_d_ptrMatrices */
-    if (m_d_ptrMatrices && m_doDestroyPtrMatrices) {
-        gpuErrChk(cudaFree(m_d_ptrMatrices));
-        m_d_ptrMatrices = nullptr;
-    }
-    /* Reallocate memory for m_d_ptrMatrices, if necessary */
-    if (m_numMats > 1) {
-        gpuErrChk(cudaMalloc(&m_d_ptrMatrices, m_numMats * sizeof(T *)));
-    }
     initialisePointersToMatricesData();
 }
 

From d4b5472bd3d555815e9e69e7a51ed01330a69152 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 14:32:49 +0000
Subject: [PATCH 13/17] Safe memory allocation in reshape and unit test

---
 include/tensor.cuh |   6 +-
 test/testTensor.cu | 752 +++++++++++++++++++++++++++------------------
 2 files changed, 451 insertions(+), 307 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 44a448a..55b661b 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -571,9 +571,13 @@ void DTensor<T>::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats
         if (m_d_ptrMatrices && m_doDestroyPtrMatrices) {
             gpuErrChk(cudaFree(m_d_ptrMatrices));
             m_d_ptrMatrices = nullptr;
+            m_doDestroyPtrMatrices = false;
         }
         /* Reallocate memory for m_d_ptrMatrices, if necessary */
-        if (newNumMats > 1) gpuErrChk(cudaMalloc(&m_d_ptrMatrices, newNumMats * sizeof(T *)));
+        if (newNumMats > 1) {
+            gpuErrChk(cudaMalloc(&m_d_ptrMatrices, newNumMats * sizeof(T *)));
+            m_doDestroyPtrMatrices = true;
+        }
     }
 
     m_numRows = newNumRows;
diff --git a/test/testTensor.cu b/test/testTensor.cu
index 902a27c..982934c 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -26,7 +26,7 @@ protected:
 
 TEMPLATE_WITH_TYPE_T
 void tensorConstructionZero() {
-    DTensor<T> zero(2, 3, 4, true);
+    DTensor <T> zero(2, 3, 4, true);
     EXPECT_EQ(2, zero.numRows());
     EXPECT_EQ(3, zero.numCols());
     EXPECT_EQ(4, zero.numMats());
@@ -36,10 +36,14 @@ void tensorConstructionZero() {
     EXPECT_EQ(expectedResult, zeroDown);
 }
 
-TEST_F(TensorTest, tensorConstructionZero) {
-    tensorConstructionZero<float>();
-    tensorConstructionZero<double>();
-    tensorConstructionZero<int>();
+TEST_F(TensorTest, tensorConstructionZero
+) {
+tensorConstructionZero<float>();
+
+tensorConstructionZero<double>();
+
+tensorConstructionZero<int>();
+
 }
 
 /* ---------------------------------------
@@ -65,21 +69,21 @@ void tensorConstructionStorageMode() {
     std::vector<T> Rm = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
     std::vector<T> hostData(rows * cols * mats);
     // test constructor
-    DTensor<T> testCm(Cm, rows, cols, mats, columnMajor);
-    DTensor<T> testRm(Rm, rows, cols, mats, rowMajor);
+    DTensor <T> testCm(Cm, rows, cols, mats, columnMajor);
+    DTensor <T> testRm(Rm, rows, cols, mats, rowMajor);
     testCm.download(hostData);
     EXPECT_EQ(Cm, hostData);
     testRm.download(hostData);
     EXPECT_EQ(Cm, hostData);
     // test .upload()
-    DTensor<T> testSplitCm(rows, cols, mats);
-    DTensor<T> ACm(testSplitCm, 2, 0, 0);
-    DTensor<T> BCm(testSplitCm, 2, 1, 1);
+    DTensor <T> testSplitCm(rows, cols, mats);
+    DTensor <T> ACm(testSplitCm, 2, 0, 0);
+    DTensor <T> BCm(testSplitCm, 2, 1, 1);
     ACm.upload(aCm, columnMajor);
     BCm.upload(bCm, columnMajor);
-    DTensor<T> testSplitRm(rows, cols, mats);
-    DTensor<T> ARm(testSplitRm, 2, 0, 0);
-    DTensor<T> BRm(testSplitRm, 2, 1, 1);
+    DTensor <T> testSplitRm(rows, cols, mats);
+    DTensor <T> ARm(testSplitRm, 2, 0, 0);
+    DTensor <T> BRm(testSplitRm, 2, 1, 1);
     ARm.upload(aRm, rowMajor);
     BRm.upload(bRm, rowMajor);
     testSplitCm.download(hostData);
@@ -88,10 +92,14 @@ void tensorConstructionStorageMode() {
     EXPECT_EQ(Cm, hostData);
 }
 
-TEST_F(TensorTest, tensorConstructionStorageMode) {
-    tensorConstructionStorageMode<float>();
-    tensorConstructionStorageMode<double>();
-    tensorConstructionStorageMode<int>();
+TEST_F(TensorTest, tensorConstructionStorageMode
+) {
+tensorConstructionStorageMode<float>();
+
+tensorConstructionStorageMode<double>();
+
+tensorConstructionStorageMode<int>();
+
 }
 
 /* ---------------------------------------
@@ -109,10 +117,14 @@ void randomTensorCreation() {
     EXPECT_TRUE(rEle >= -1 && rEle <= 1);
 }
 
-TEST_F(TensorTest, randomTensorCreation) {
-    randomTensorCreation<float>();
-    randomTensorCreation<double>();
-    randomTensorCreation<int>();
+TEST_F(TensorTest, randomTensorCreation
+) {
+randomTensorCreation<float>();
+
+randomTensorCreation<double>();
+
+randomTensorCreation<int>();
+
 }
 
 /* ---------------------------------------
@@ -121,17 +133,23 @@ TEST_F(TensorTest, randomTensorCreation) {
 
 TEMPLATE_WITH_TYPE_T
 void tensorMoveConstructor() {
-    DTensor<T> zero(2, 3, 4, true);
-    DTensor<T> x(std::move(zero));
-    DTensor<T> y(DTensor<T>{100, 10, 1000});
+    DTensor <T> zero(2, 3, 4, true);
+    DTensor <T> x(std::move(zero));
+    DTensor <T> y(DTensor < T > {100, 10, 1000});
 }
 
-TEST_F(TensorTest, tensorMoveConstructor) {
-    tensorMoveConstructor<float>();
-    tensorMoveConstructor<double>();
-    tensorMoveConstructor<int>();
-    tensorMoveConstructor<int *>();
-    tensorMoveConstructor<double *>();
+TEST_F(TensorTest, tensorMoveConstructor
+) {
+tensorMoveConstructor<float>();
+
+tensorMoveConstructor<double>();
+
+tensorMoveConstructor<int>();
+
+tensorMoveConstructor<int *>();
+
+tensorMoveConstructor<double *>();
+
 }
 
 /* ---------------------------------------
@@ -142,17 +160,21 @@ TEST_F(TensorTest, tensorMoveConstructor) {
 TEMPLATE_WITH_TYPE_T
 void tensorConstructionFromVector() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor<T> tenz(data, 2, 3, 4);
+    DTensor <T> tenz(data, 2, 3, 4);
     EXPECT_EQ(2, tenz.numRows());
     EXPECT_EQ(3, tenz.numCols());
     EXPECT_EQ(4, tenz.numMats());
     EXPECT_EQ(2 * 3 * 4, tenz.numEl());
 }
 
-TEST_F(TensorTest, tensorConstructionFromVector) {
-    tensorConstructionFromVector<float>();
-    tensorConstructionFromVector<double>();
-    tensorConstructionFromVector<int>();
+TEST_F(TensorTest, tensorConstructionFromVector
+) {
+tensorConstructionFromVector<float>();
+
+tensorConstructionFromVector<double>();
+
+tensorConstructionFromVector<int>();
+
 }
 
 /* ---------------------------------------
@@ -162,8 +184,8 @@ TEST_F(TensorTest, tensorConstructionFromVector) {
 TEMPLATE_WITH_TYPE_T
 void tensorCopyConstructor() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor<T> tenz(data, 2, 3, 4);
-    DTensor<T> tenzCp(tenz);
+    DTensor <T> tenz(data, 2, 3, 4);
+    DTensor <T> tenzCp(tenz);
     EXPECT_EQ(2, tenzCp.numRows());
     EXPECT_EQ(3, tenzCp.numCols());
     EXPECT_EQ(4, tenzCp.numMats());
@@ -174,10 +196,14 @@ void tensorCopyConstructor() {
     EXPECT_NE(tenz.raw(), tenzCp.raw());
 }
 
-TEST_F(TensorTest, tensorCopyConstructor) {
-    tensorCopyConstructor<float>();
-    tensorCopyConstructor<double>();
-    tensorCopyConstructor<int>();
+TEST_F(TensorTest, tensorCopyConstructor
+) {
+tensorCopyConstructor<float>();
+
+tensorCopyConstructor<double>();
+
+tensorCopyConstructor<int>();
+
 }
 
 /* ---------------------------------------
@@ -188,18 +214,22 @@ TEST_F(TensorTest, tensorCopyConstructor) {
 TEMPLATE_WITH_TYPE_T
 void tensorSlicingConstructorAxis2() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor<T> tens(data, 2, 3, 4);
-    DTensor<T> tensSlice(tens, 2, 0, 1); // matrices #0 and #1
+    DTensor <T> tens(data, 2, 3, 4);
+    DTensor <T> tensSlice(tens, 2, 0, 1); // matrices #0 and #1
     EXPECT_EQ(2, tensSlice.numRows());
     EXPECT_EQ(3, tensSlice.numCols());
     EXPECT_EQ(2, tensSlice.numMats());
     EXPECT_EQ(tens.raw(), tensSlice.raw()); // it is indeed a slice
 }
 
-TEST_F(TensorTest, tensorSlicingConstructorAxis2) {
-    tensorSlicingConstructorAxis2<float>();
-    tensorSlicingConstructorAxis2<double>();
-    tensorSlicingConstructorAxis2<int>();
+TEST_F(TensorTest, tensorSlicingConstructorAxis2
+) {
+tensorSlicingConstructorAxis2<float>();
+
+tensorSlicingConstructorAxis2<double>();
+
+tensorSlicingConstructorAxis2<int>();
+
 }
 
 /* ---------------------------------------
@@ -210,8 +240,8 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis2) {
 TEMPLATE_WITH_TYPE_T
 void tensorSlicingConstructorAxis1() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor<T> tenz(data, 2, 3, 4);
-    DTensor<T> tenzSlice(tenz, 1, 1, 2); // columns from 1 to 2
+    DTensor <T> tenz(data, 2, 3, 4);
+    DTensor <T> tenzSlice(tenz, 1, 1, 2); // columns from 1 to 2
     EXPECT_EQ(2, tenzSlice.numRows());
     EXPECT_EQ(2, tenzSlice.numCols());
     EXPECT_EQ(1, tenzSlice.numMats());
@@ -221,10 +251,14 @@ void tensorSlicingConstructorAxis1() {
     EXPECT_EQ(expected, tenzSliceDown);
 }
 
-TEST_F(TensorTest, tensorSlicingConstructorAxis1) {
-    tensorSlicingConstructorAxis1<float>();
-    tensorSlicingConstructorAxis1<double>();
-    tensorSlicingConstructorAxis1<int>();
+TEST_F(TensorTest, tensorSlicingConstructorAxis1
+) {
+tensorSlicingConstructorAxis1<float>();
+
+tensorSlicingConstructorAxis1<double>();
+
+tensorSlicingConstructorAxis1<int>();
+
 }
 
 /* ---------------------------------------
@@ -235,8 +269,8 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis1) {
 TEMPLATE_WITH_TYPE_T
 void tensorSlicingConstructorAxis0() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor<T> tenz(data, 2, 3, 4);
-    DTensor<T> tenzSlice(tenz, 0, 2, 3); // elements 2..3
+    DTensor <T> tenz(data, 2, 3, 4);
+    DTensor <T> tenzSlice(tenz, 0, 2, 3); // elements 2..3
     EXPECT_EQ(2, tenzSlice.numRows());
     EXPECT_EQ(1, tenzSlice.numCols());
     EXPECT_EQ(1, tenzSlice.numMats());
@@ -246,10 +280,14 @@ void tensorSlicingConstructorAxis0() {
     EXPECT_EQ(expected, tenzSliceDown);
 }
 
-TEST_F(TensorTest, tensorSlicingConstructorAxis0) {
-    tensorSlicingConstructorAxis0<float>();
-    tensorSlicingConstructorAxis0<double>();
-    tensorSlicingConstructorAxis0<int>();
+TEST_F(TensorTest, tensorSlicingConstructorAxis0
+) {
+tensorSlicingConstructorAxis0<float>();
+
+tensorSlicingConstructorAxis0<double>();
+
+tensorSlicingConstructorAxis0<int>();
+
 }
 
 /* ---------------------------------------
@@ -259,7 +297,7 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis0) {
 TEMPLATE_WITH_TYPE_T
 void tensorUpload() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor<T> tenz(2, 3, 4);
+    DTensor <T> tenz(2, 3, 4);
     tenz.upload(data);
     EXPECT_EQ(2, tenz.numRows());
     EXPECT_EQ(3, tenz.numCols());
@@ -269,10 +307,14 @@ void tensorUpload() {
     EXPECT_EQ(8, tenz(1, 2, 3));
 }
 
-TEST_F(TensorTest, tensorUpload) {
-    tensorUpload<float>();
-    tensorUpload<double>();
-    tensorUpload<int>();
+TEST_F(TensorTest, tensorUpload
+) {
+tensorUpload<float>();
+
+tensorUpload<double>();
+
+tensorUpload<int>();
+
 }
 
 /* ---------------------------------------
@@ -282,9 +324,9 @@ TEST_F(TensorTest, tensorUpload) {
 TEMPLATE_WITH_TYPE_T
 void tensorDeviceCopyTo() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor<T> tenz(data, 2, 3, 4);
-    DTensor<T> other(2, 3, 5, true);
-    DTensor<T> z(other, 2, 1, 4);
+    DTensor <T> tenz(data, 2, 3, 4);
+    DTensor <T> other(2, 3, 5, true);
+    DTensor <T> z(other, 2, 1, 4);
     tenz.deviceCopyTo(z);
     std::vector<T> expected = {0, 0, 0, 0, 0, 0,
                                1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 10, 5, 4, 3, 2, 1, -1, 4, 3, 4, 3, 4, 8};
@@ -293,10 +335,42 @@ void tensorDeviceCopyTo() {
     EXPECT_EQ(expected, actual);
 }
 
-TEST_F(TensorTest, tensorDeviceCopyTo) {
-    tensorDeviceCopyTo<float>();
-    tensorDeviceCopyTo<double>();
-    tensorDeviceCopyTo<int>();
+TEST_F(TensorTest, tensorDeviceCopyTo
+) {
+tensorDeviceCopyTo<float>();
+
+tensorDeviceCopyTo<double>();
+
+tensorDeviceCopyTo<int>();
+
+}
+
+
+/* ---------------------------------------
+ * Tensor: reshape
+ * --------------------------------------- */
+
+TEMPLATE_WITH_TYPE_T
+void tensorReshape() {
+    size_t m = 5, n = 10, k = 3;
+    DTensor <T> a = DTensor<T>::createRandomTensor(m, n, k, -1, 1); // dim = (m, n, k)
+    ASSERT_EQ(m, a.numRows());
+    ASSERT_EQ(n, a.numCols());
+    ASSERT_EQ(k, a.numMats());
+    a.reshape(m, k, n); // dim = (m, k, n)
+    ASSERT_EQ(m, a.numRows());
+    ASSERT_EQ(k, a.numCols());
+    ASSERT_EQ(n, a.numMats());
+    a.reshape(k, n, m); // dim = (k, n, m)
+    ASSERT_EQ(k, a.numRows());
+    ASSERT_EQ(n, a.numCols());
+    ASSERT_EQ(m, a.numMats());
+}
+
+TEST_F(TensorTest, tensorReshape) {
+    tensorReshape<float>();
+    tensorReshape<double>();
+    tensorReshape<int>();
 }
 
 /* ---------------------------------------
@@ -308,20 +382,21 @@ void tensorDotF(T epsilon) {
     // as vectors
     std::vector<T> dataA = TENSOR_DATA_234A;
     std::vector<T> dataB = TENSOR_DATA_234B;
-    DTensor<T> vecA(dataA, dataA.size());
-    DTensor<T> vecB(dataB, dataB.size());
+    DTensor <T> vecA(dataA, dataA.size());
+    DTensor <T> vecB(dataB, dataB.size());
     T dotVector = vecA.dotF(vecB);
     EXPECT_EQ(604, dotVector);  // from MATLAB
     // as matrices
-    DTensor<T> tenA(dataA, 2, 3, 4);
-    DTensor<T> tenB(dataB, 2, 3, 4);
+    DTensor <T> tenA(dataA, 2, 3, 4);
+    DTensor <T> tenB(dataB, 2, 3, 4);
     T dotTensor = tenA.dotF(tenB);
     EXPECT_EQ(604, dotTensor);  // from MATLAB
 }
 
-TEST_F(TensorTest, tensorDotF) {
-    tensorDotF<float>(PRECISION_LOW);
-    tensorDotF<double>(PRECISION_HIGH);
+TEST_F(TensorTest, tensorDotF
+) {
+tensorDotF<float>(PRECISION_LOW);
+tensorDotF<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -331,13 +406,14 @@ TEST_F(TensorTest, tensorDotF) {
 TEMPLATE_WITH_TYPE_T
 void tensorNormF(T epsilon) {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor<T> tenz(data, 2, 3, 4);
+    DTensor <T> tenz(data, 2, 3, 4);
     EXPECT_NEAR(26.153393661244042, tenz.normF(), epsilon); // from MATLAB
 }
 
-TEST_F(TensorTest, tensorNormF) {
-    tensorNormF<float>(PRECISION_LOW);
-    tensorNormF<double>(PRECISION_HIGH);
+TEST_F(TensorTest, tensorNormF
+) {
+tensorNormF<float>(PRECISION_LOW);
+tensorNormF<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -348,13 +424,16 @@ TEST_F(TensorTest, tensorNormF) {
 TEMPLATE_WITH_TYPE_T
 void tensorSumAbs() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor<T> tenz(data, 2, 3, 4);
+    DTensor <T> tenz(data, 2, 3, 4);
     EXPECT_NEAR(112, tenz.sumAbs(), PRECISION_HIGH); // from MATLAB
 }
 
-TEST_F(TensorTest, tensorSumAbs) {
-    tensorSumAbs<float>();
-    tensorSumAbs<double>();
+TEST_F(TensorTest, tensorSumAbs
+) {
+tensorSumAbs<float>();
+
+tensorSumAbs<double>();
+
 }
 
 /* ---------------------------------------
@@ -364,14 +443,17 @@ TEST_F(TensorTest, tensorSumAbs) {
 TEMPLATE_WITH_TYPE_T
 void tensorMax() {
     std::vector<T> data = TENSOR_DATA_234AMB;
-    DTensor<T> tenz(data, 2, 3, 4);
+    DTensor <T> tenz(data, 2, 3, 4);
     T m = tenz.maxAbs();
     EXPECT_EQ(27, m);
 }
 
-TEST_F(TensorTest, tensorMax) {
-    tensorMax<float>();
-    tensorMax<double>();
+TEST_F(TensorTest, tensorMax
+) {
+tensorMax<float>();
+
+tensorMax<double>();
+
 }
 
 /* ---------------------------------------
@@ -381,14 +463,17 @@ TEST_F(TensorTest, tensorMax) {
 TEMPLATE_WITH_TYPE_T
 void tensorMin() {
     std::vector<T> data = TENSOR_DATA_234AMB;
-    DTensor<T> tenz(data, 2, 3, 4);
+    DTensor <T> tenz(data, 2, 3, 4);
     T m = tenz.minAbs();
     EXPECT_EQ(0, m);
 }
 
-TEST_F(TensorTest, tensorMin) {
-    tensorMin<float>();
-    tensorMin<double>();
+TEST_F(TensorTest, tensorMin
+) {
+tensorMin<float>();
+
+tensorMin<double>();
+
 }
 
 /* ---------------------------------------
@@ -421,9 +506,10 @@ void tensorRightGivens(T epsilon) {
     }
 }
 
-TEST_F(TensorTest, tensorRightGivens) {
-    tensorRightGivens<float>(PRECISION_LOW);
-    tensorRightGivens<double>(PRECISION_HIGH);
+TEST_F(TensorTest, tensorRightGivens
+) {
+tensorRightGivens<float>(PRECISION_LOW);
+tensorRightGivens<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -458,9 +544,10 @@ void tensorLeftGivens(T epsilon) {
     }
 }
 
-TEST_F(TensorTest, tensorLeftGivens) {
-    tensorLeftGivens<float>(1e-10);
-    tensorLeftGivens<double>(1e-14);
+TEST_F(TensorTest, tensorLeftGivens
+) {
+tensorLeftGivens<float>(1e-10);
+tensorLeftGivens<double>(1e-14);
 }
 
 /* ---------------------------------------
@@ -471,16 +558,20 @@ TEST_F(TensorTest, tensorLeftGivens) {
 TEMPLATE_WITH_TYPE_T
 void tensorBracketOperator() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor<T> tenz(data, 2, 3, 4);
+    DTensor <T> tenz(data, 2, 3, 4);
     EXPECT_EQ(1, tenz(0, 0, 0));
     EXPECT_EQ(3, tenz(0, 1, 2));
     EXPECT_EQ(8, tenz(1, 2, 3));
 }
 
-TEST_F(TensorTest, tensorBracketOperator) {
-    tensorBracketOperator<float>();
-    tensorBracketOperator<double>();
-    tensorBracketOperator<int>();
+TEST_F(TensorTest, tensorBracketOperator
+) {
+tensorBracketOperator<float>();
+
+tensorBracketOperator<double>();
+
+tensorBracketOperator<int>();
+
 }
 
 /* ---------------------------------------
@@ -490,8 +581,8 @@ TEST_F(TensorTest, tensorBracketOperator) {
 TEMPLATE_WITH_TYPE_T
 void tensorAssignmentOperator() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor<T> tenz(data, 2, 3, 4);
-    DTensor<T> other;
+    DTensor <T> tenz(data, 2, 3, 4);
+    DTensor <T> other;
     other = tenz;
     EXPECT_EQ(tenz.raw(), other.raw());
     EXPECT_EQ(2, other.numRows());
@@ -499,10 +590,14 @@ void tensorAssignmentOperator() {
     EXPECT_EQ(4, other.numMats());
 }
 
-TEST_F(TensorTest, tensorAssignmentOperator) {
-    tensorAssignmentOperator<float>();
-    tensorAssignmentOperator<double>();
-    tensorAssignmentOperator<int>();
+TEST_F(TensorTest, tensorAssignmentOperator
+) {
+tensorAssignmentOperator<float>();
+
+tensorAssignmentOperator<double>();
+
+tensorAssignmentOperator<int>();
+
 }
 
 /* ---------------------------------------
@@ -514,16 +609,19 @@ void tensorTimesEqualsScalar() {
     std::vector<T> data = TENSOR_DATA_234A;
     std::vector<T> dataTimes3 = {3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12,
                                  24};
-    DTensor<T> tenz(data, 2, 3, 4);
+    DTensor <T> tenz(data, 2, 3, 4);
     tenz *= 3.0;
     std::vector<T> actual;
     tenz.download(actual);
     EXPECT_EQ(dataTimes3, actual);
 }
 
-TEST_F(TensorTest, tensorTimesEqualsScalar) {
-    tensorTimesEqualsScalar<float>();
-    tensorTimesEqualsScalar<double>();
+TEST_F(TensorTest, tensorTimesEqualsScalar
+) {
+tensorTimesEqualsScalar<float>();
+
+tensorTimesEqualsScalar<double>();
+
 }
 
 /* ---------------------------------------
@@ -535,16 +633,19 @@ void tensorTimesScalar() {
     std::vector<T> data = TENSOR_DATA_234A;
     std::vector<T> dataTimes3 = {3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12,
                                  24};
-    DTensor<T> tenz(data, 2, 3, 4);
+    DTensor <T> tenz(data, 2, 3, 4);
     auto tripleTensor = 3.0 * tenz;
     std::vector<T> actual;
     tripleTensor.download(actual);
     EXPECT_EQ(dataTimes3, actual);
 }
 
-TEST_F(TensorTest, tensorTimesScalar) {
-    tensorTimesScalar<float>();
-    tensorTimesScalar<double>();
+TEST_F(TensorTest, tensorTimesScalar
+) {
+tensorTimesScalar<float>();
+
+tensorTimesScalar<double>();
+
 }
 
 /* ---------------------------------------
@@ -555,8 +656,8 @@ TEMPLATE_WITH_TYPE_T
 void tensorPlusEqualsTensor() {
     std::vector<T> dataA = TENSOR_DATA_234A;
     std::vector<T> dataB = TENSOR_DATA_234B;
-    DTensor<T> A(dataA, 2, 3, 4);
-    DTensor<T> B(dataB, 2, 3, 4);
+    DTensor <T> A(dataA, 2, 3, 4);
+    DTensor <T> B(dataB, 2, 3, 4);
     A += B;
     std::vector<T> expected = TENSOR_DATA_234APB;
     std::vector<T> actual;
@@ -564,9 +665,12 @@ void tensorPlusEqualsTensor() {
     EXPECT_EQ(expected, actual);
 }
 
-TEST_F(TensorTest, tensorPlusEqualsTensor) {
-    tensorPlusEqualsTensor<float>();
-    tensorPlusEqualsTensor<double>();
+TEST_F(TensorTest, tensorPlusEqualsTensor
+) {
+tensorPlusEqualsTensor<float>();
+
+tensorPlusEqualsTensor<double>();
+
 }
 
 /* ---------------------------------------
@@ -577,8 +681,8 @@ TEMPLATE_WITH_TYPE_T
 void tensorMinusEqualsTensor() {
     std::vector<T> dataA = TENSOR_DATA_234A;
     std::vector<T> dataB = TENSOR_DATA_234B;
-    DTensor<T> A(dataA, 2, 3, 4);
-    DTensor<T> B(dataB, 2, 3, 4);
+    DTensor <T> A(dataA, 2, 3, 4);
+    DTensor <T> B(dataB, 2, 3, 4);
     A -= B;
     std::vector<T> expected = TENSOR_DATA_234AMB;
     std::vector<T> actual;
@@ -586,9 +690,12 @@ void tensorMinusEqualsTensor() {
     EXPECT_EQ(expected, actual);
 }
 
-TEST_F(TensorTest, tensorMinusEqualsTensor) {
-    tensorMinusEqualsTensor<float>();
-    tensorMinusEqualsTensor<double>();
+TEST_F(TensorTest, tensorMinusEqualsTensor
+) {
+tensorMinusEqualsTensor<float>();
+
+tensorMinusEqualsTensor<double>();
+
 }
 
 /* ---------------------------------------
@@ -599,18 +706,21 @@ TEMPLATE_WITH_TYPE_T
 void tensorPlusTensor() {
     std::vector<T> dataA = TENSOR_DATA_234A;
     std::vector<T> dataB = TENSOR_DATA_234B;
-    DTensor<T> A(dataA, 2, 3, 4);
-    DTensor<T> B(dataB, 2, 3, 4);
-    DTensor<T> C = A + B;
+    DTensor <T> A(dataA, 2, 3, 4);
+    DTensor <T> B(dataB, 2, 3, 4);
+    DTensor <T> C = A + B;
     std::vector<T> expected = TENSOR_DATA_234APB;
     std::vector<T> actual;
     C.download(actual);
     EXPECT_EQ(expected, actual);
 }
 
-TEST_F(TensorTest, tensorPlusTensor) {
-    tensorPlusTensor<float>();
-    tensorPlusTensor<double>();
+TEST_F(TensorTest, tensorPlusTensor
+) {
+tensorPlusTensor<float>();
+
+tensorPlusTensor<double>();
+
 }
 
 /* ---------------------------------------
@@ -621,18 +731,21 @@ TEMPLATE_WITH_TYPE_T
 void tensorMinusTensor() {
     std::vector<T> dataA = TENSOR_DATA_234A;
     std::vector<T> dataB = TENSOR_DATA_234B;
-    DTensor<T> A(dataA, 2, 3, 4);
-    DTensor<T> B(dataB, 2, 3, 4);
-    DTensor<T> C = A - B;
+    DTensor <T> A(dataA, 2, 3, 4);
+    DTensor <T> B(dataB, 2, 3, 4);
+    DTensor <T> C = A - B;
     std::vector<T> expected = TENSOR_DATA_234AMB;
     std::vector<T> actual;
     C.download(actual);
     EXPECT_EQ(expected, actual);
 }
 
-TEST_F(TensorTest, tensorMinusTensor) {
-    tensorMinusTensor<float>();
-    tensorMinusTensor<double>();
+TEST_F(TensorTest, tensorMinusTensor
+) {
+tensorMinusTensor<float>();
+
+tensorMinusTensor<double>();
+
 }
 
 /* ---------------------------------------
@@ -647,9 +760,9 @@ void tensorAddAB() {
     std::vector<T> bData = {6, 5, 4, 3, 2, 1,
                             7, 6, 5, 4, 3, 2,
                             1, 2, 1, 5, -6, 8};
-    DTensor<T> A(aData, 2, 3, 3);
-    DTensor<T> B(bData, 3, 2, 3);
-    DTensor<T> C(2, 2, 3, true);
+    DTensor <T> A(aData, 2, 3, 3);
+    DTensor <T> B(bData, 3, 2, 3);
+    DTensor <T> C(2, 2, 3, true);
     C.addAB(A, B);
     std::vector<T> expected = {41, 56, 14, 20, 158, 176, 77, 86, 60, 64, 111, 118};
     std::vector<T> actual;
@@ -657,9 +770,12 @@ void tensorAddAB() {
     EXPECT_EQ(expected, actual);
 }
 
-TEST_F(TensorTest, tensorAddAB) {
-    tensorAddAB<double>();
-    tensorAddAB<float>();
+TEST_F(TensorTest, tensorAddAB
+) {
+tensorAddAB<double>();
+
+tensorAddAB<float>();
+
 }
 
 /* ---------------------------------------
@@ -674,23 +790,26 @@ void tensorGetRows() {
                             5., 6., 7.,
                             8., 9., 10.,
                             11., 12., 13};
-    DTensor<T> A(aData, 3, 3, 2);
-    DTensor<T> Ar0 = A.getRows(1, 1, 0);
+    DTensor <T> A(aData, 3, 3, 2);
+    DTensor <T> Ar0 = A.getRows(1, 1, 0);
     std::vector<T> expected0 = {25., 720., -1.};
     std::vector<T> actual0(3);
     Ar0.download(actual0);
     EXPECT_EQ(expected0, actual0);
 
-    DTensor<T> Ar1 = A.getRows(1, 2, 1);
+    DTensor <T> Ar1 = A.getRows(1, 2, 1);
     std::vector<T> expected1 = {6., 7., 9., 10., 12., 13.};
     std::vector<T> actual1(6);
     Ar1.download(actual1);
     EXPECT_EQ(expected1, actual1);
 }
 
-TEST_F(TensorTest, tensorGetRows) {
-    tensorGetRows<float>();
-    tensorGetRows<double>();
+TEST_F(TensorTest, tensorGetRows
+) {
+tensorGetRows<float>();
+
+tensorGetRows<double>();
+
 }
 
 
@@ -701,8 +820,8 @@ TEST_F(TensorTest, tensorGetRows) {
 TEMPLATE_WITH_TYPE_T
 void tensorTranspose() {
     std::vector<T> aData = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
-    DTensor<T> A(aData, 3, 2, 2);
-    DTensor<T> Atranspose = A.tr();
+    DTensor <T> A(aData, 3, 2, 2);
+    DTensor <T> Atranspose = A.tr();
     EXPECT_EQ(2, Atranspose.numRows());
     EXPECT_EQ(3, Atranspose.numCols());
     EXPECT_EQ(2, Atranspose.numMats());
@@ -713,9 +832,12 @@ void tensorTranspose() {
 
 }
 
-TEST_F(TensorTest, tensorTranspose) {
-    tensorTranspose<float>();
-    tensorTranspose<double>();
+TEST_F(TensorTest, tensorTranspose
+) {
+tensorTranspose<float>();
+
+tensorTranspose<double>();
+
 }
 
 /* ================================================================================================
@@ -742,21 +864,22 @@ void tensorLeastSquares1(T epsilon) {
                             6, 8,
                             -9, 20};
     std::vector<T> bData = {1, 1, -1, 2, 30, -80};
-    DTensor<T> A0(aData, 2, 2, 3);
-    DTensor<T> A(A0);
-    DTensor<T> B(bData, 2, 1, 3);
-    DTensor<T> sol(B);
+    DTensor <T> A0(aData, 2, 2, 3);
+    DTensor <T> A(A0);
+    DTensor <T> B(bData, 2, 1, 3);
+    DTensor <T> sol(B);
     A0.leastSquaresBatched(sol);
-    DTensor<T> C(2, 1, 3);
+    DTensor <T> C(2, 1, 3);
     C.addAB(A, sol);
     C -= B;
     T nrmErr = C.normF();
     EXPECT_LT(nrmErr, epsilon);
 }
 
-TEST_F(LeastSquaresTest, tensorLS1) {
-    tensorLeastSquares1<float>(PRECISION_LOW);
-    tensorLeastSquares1<double>(PRECISION_HIGH);
+TEST_F(LeastSquaresTest, tensorLS1
+) {
+tensorLeastSquares1<float>(PRECISION_LOW);
+tensorLeastSquares1<double>(PRECISION_HIGH);
 }
 
 
@@ -780,8 +903,8 @@ void singularValuesComputation(float epsilon) {
     std::vector<T> bData = {1, 6, 6, 6, 6, 6, 6, 6,
                             2, 7, 7, 7, 7, 7, 7, 7,
                             3, 8, 8, 8, 8, 8, 8, 8,};
-    DTensor<T> B(bData, 8, 3);
-    Svd<T> svd(B, true, false);
+    DTensor <T> B(bData, 8, 3);
+    Svd <T> svd(B, true, false);
     EXPECT_EQ(true, svd.factorise());
     auto S = svd.singularValues();
     EXPECT_NEAR(32.496241123753592, S(0), epsilon); // value from MATLAB
@@ -791,9 +914,10 @@ void singularValuesComputation(float epsilon) {
     EXPECT_TRUE(U.has_value());
 }
 
-TEST_F(SvdTest, singularValuesComputation) {
-    singularValuesComputation<float>(PRECISION_LOW);
-    singularValuesComputation<double>(PRECISION_HIGH);
+TEST_F(SvdTest, singularValuesComputation
+) {
+singularValuesComputation<float>(PRECISION_LOW);
+singularValuesComputation<double>(PRECISION_HIGH);
 }
 
 
@@ -806,15 +930,15 @@ void singularValuesMemory(float epsilon) {
     std::vector<T> bData = {1, 6, 6, 6, 6, 6, 6, 6,
                             2, 7, 7, 7, 7, 7, 7, 7,
                             3, 8, 8, 8, 8, 8, 8, 8,};
-    DTensor<T> B(bData, 8, 3);
-    Svd<T> svd(B, true, false);
+    DTensor <T> B(bData, 8, 3);
+    Svd <T> svd(B, true, false);
     EXPECT_EQ(true, svd.factorise());
-    DTensor<T> const &v1 = svd.rightSingularVectors();
-    DTensor<T> const &v2 = svd.rightSingularVectors();
+    DTensor <T> const &v1 = svd.rightSingularVectors();
+    DTensor <T> const &v2 = svd.rightSingularVectors();
     EXPECT_EQ(&v1, &v2);
     EXPECT_EQ(v1.raw(), v2.raw());
-    DTensor<T> const &s1 = svd.singularValues();
-    DTensor<T> const &s2 = svd.singularValues();
+    DTensor <T> const &s1 = svd.singularValues();
+    DTensor <T> const &s2 = svd.singularValues();
     EXPECT_EQ(&s1, &s2);
     EXPECT_EQ(s1.raw(), s2.raw());
     auto u1 = svd.leftSingularVectors().value();
@@ -823,9 +947,10 @@ void singularValuesMemory(float epsilon) {
     EXPECT_EQ(u1->raw(), u2->raw());
 }
 
-TEST_F(SvdTest, singularValuesMemory) {
-    singularValuesMemory<float>(PRECISION_LOW);
-    singularValuesMemory<double>(PRECISION_HIGH);
+TEST_F(SvdTest, singularValuesMemory
+) {
+singularValuesMemory<float>(PRECISION_LOW);
+singularValuesMemory<double>(PRECISION_HIGH);
 }
 
 
@@ -835,11 +960,11 @@ TEST_F(SvdTest, singularValuesMemory) {
 TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void singularValuesMultipleMatrices(float epsilon) {
     std::vector<T> aData = {1, 2, 3, 4, 5, 6, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1};
-    DTensor<T> A(aData, 3, 2, 3);
-    Svd<T> svd(A, true); // do compute U (A will be destroyed)
+    DTensor <T> A(aData, 3, 2, 3);
+    Svd <T> svd(A, true); // do compute U (A will be destroyed)
     svd.factorise();
-    DTensor<T> const &S = svd.singularValues();
-    DTensor<T> const &V = svd.rightSingularVectors();
+    DTensor <T> const &S = svd.singularValues();
+    DTensor <T> const &V = svd.rightSingularVectors();
     auto Uopt = svd.leftSingularVectors();
     auto U = Uopt.value();
     std::vector<T> expected_v = {-0.386317703118612, -0.922365780077058, -0.922365780077058, 0.386317703118612,
@@ -869,9 +994,10 @@ void singularValuesMultipleMatrices(float epsilon) {
 
 }
 
-TEST_F(SvdTest, singularValuesMultipleMatrices) {
-    singularValuesMultipleMatrices<float>(10 * PRECISION_LOW); // SVD with float performs quite poorly
-    singularValuesMultipleMatrices<double>(PRECISION_HIGH);
+TEST_F(SvdTest, singularValuesMultipleMatrices
+) {
+singularValuesMultipleMatrices<float>(10 * PRECISION_LOW); // SVD with float performs quite poorly
+singularValuesMultipleMatrices<double>(PRECISION_HIGH);
 }
 
 
@@ -884,9 +1010,9 @@ void singularValuesRankMultipleMatrices(float epsilon) {
     std::vector<T> aData = {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 0,
                             1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12,
                             1, 2, 3, 4, 2, 4, 6, 8, 3, 6, 9, 12};
-    DTensor<T> A(aData, 4, 3, 3);
+    DTensor <T> A(aData, 4, 3, 3);
 
-    Svd<T> svd(A);
+    Svd <T> svd(A);
     svd.factorise();
     auto rank = svd.rank(epsilon);
     EXPECT_EQ(3, rank(0, 0, 0));
@@ -894,9 +1020,10 @@ void singularValuesRankMultipleMatrices(float epsilon) {
     EXPECT_EQ(1, rank(0, 0, 2));
 }
 
-TEST_F(SvdTest, singularValuesRankMultipleMatrices) {
-    singularValuesRankMultipleMatrices<float>(PRECISION_LOW); // SVD with float performs quite poorly
-    singularValuesRankMultipleMatrices<double>(PRECISION_HIGH);
+TEST_F(SvdTest, singularValuesRankMultipleMatrices
+) {
+singularValuesRankMultipleMatrices<float>(PRECISION_LOW); // SVD with float performs quite poorly
+singularValuesRankMultipleMatrices<double>(PRECISION_HIGH);
 }
 
 /* ================================================================================================
@@ -919,17 +1046,18 @@ void choleskyFactorisation(T epsilon) {
     std::vector<T> aData = {10.0, 2.0, 3.0,
                             2.0, 20.0, -1.0,
                             3.0, -1.0, 30.0};
-    DTensor<T> A(aData, 3, 3, 1);
-    CholeskyFactoriser<T> chol(A);
+    DTensor <T> A(aData, 3, 3, 1);
+    CholeskyFactoriser <T> chol(A);
     chol.factorise();
     EXPECT_NEAR(3.162277660168380, A(0, 0), epsilon);
     EXPECT_NEAR(-0.361403161162101, A(2, 1), epsilon);
     EXPECT_NEAR(5.382321781081287, A(2, 2), epsilon);
 }
 
-TEST_F(CholeskyTest, choleskyFactorisation) {
-    choleskyFactorisation<float>(PRECISION_LOW);
-    choleskyFactorisation<double>(PRECISION_HIGH);
+TEST_F(CholeskyTest, choleskyFactorisation
+) {
+choleskyFactorisation<float>(PRECISION_LOW);
+choleskyFactorisation<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -941,14 +1069,14 @@ void choleskyFactorisationSolution(T epsilon) {
     std::vector<T> aData = {10.0, 2.0, 3.0,
                             2.0, 20.0, -1.0,
                             3.0, -1.0, 30.0};
-    DTensor<T> A(aData, 3, 3, 1);
-    DTensor<T> L(A); // L = A
-    CholeskyFactoriser<T> chol(L);
+    DTensor <T> A(aData, 3, 3, 1);
+    DTensor <T> L(A); // L = A
+    CholeskyFactoriser <T> chol(L);
     chol.factorise();
 
     std::vector<T> bData = {-1., -3., 5.};
-    DTensor<T> rhs(bData, 3, 1, 1);
-    DTensor<T> sol(rhs);
+    DTensor <T> rhs(bData, 3, 1, 1);
+    DTensor <T> sol(rhs);
     chol.solve(sol);
 
     std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
@@ -956,15 +1084,16 @@ void choleskyFactorisationSolution(T epsilon) {
     sol.download(actual);
     for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon);
 
-    DTensor<T> error = A * sol;
+    DTensor <T> error = A * sol;
     error -= rhs;
     EXPECT_TRUE(error.normF() < epsilon);
 
 }
 
-TEST_F(CholeskyTest, choleskyFactorisationSolution) {
-    choleskyFactorisationSolution<float>(PRECISION_LOW);
-    choleskyFactorisationSolution<double>(PRECISION_HIGH);
+TEST_F(CholeskyTest, choleskyFactorisationSolution
+) {
+choleskyFactorisationSolution<float>(PRECISION_LOW);
+choleskyFactorisationSolution<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -976,12 +1105,12 @@ void choleskyBatchFactorisation(T epsilon) {
     std::vector<T> aData = {10.0, 2.0, 3.0,
                             2.0, 20.0, -1.0,
                             3.0, -1.0, 30.0};
-    DTensor<T> A(3, 3, 2);
-    DTensor<T> A0(A, 2, 0, 0);
-    DTensor<T> A1(A, 2, 1, 1);
+    DTensor <T> A(3, 3, 2);
+    DTensor <T> A0(A, 2, 0, 0);
+    DTensor <T> A1(A, 2, 1, 1);
     A0.upload(aData);
     A1.upload(aData);
-    CholeskyBatchFactoriser<T> chol(A);
+    CholeskyBatchFactoriser <T> chol(A);
     chol.factorise();
     // 0
     EXPECT_NEAR(3.162277660168380, A(0, 0, 0), epsilon);
@@ -993,9 +1122,10 @@ void choleskyBatchFactorisation(T epsilon) {
     EXPECT_NEAR(5.382321781081287, A(2, 2, 1), epsilon);
 }
 
-TEST_F(CholeskyTest, choleskyBatchFactorisation) {
-    choleskyBatchFactorisation<float>(PRECISION_LOW);
-    choleskyBatchFactorisation<double>(PRECISION_HIGH);
+TEST_F(CholeskyTest, choleskyBatchFactorisation
+) {
+choleskyBatchFactorisation<float>(PRECISION_LOW);
+choleskyBatchFactorisation<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -1007,35 +1137,36 @@ void choleskyBatchFactorSolve(T epsilon) {
     std::vector<T> aData = {10.0, 2.0, 3.0,
                             2.0, 20.0, -1.0,
                             3.0, -1.0, 30.0};
-    DTensor<T> A(3, 3, 2);
-    DTensor<T> A0(A, 2, 0, 0);
-    DTensor<T> A1(A, 2, 1, 1);
+    DTensor <T> A(3, 3, 2);
+    DTensor <T> A0(A, 2, 0, 0);
+    DTensor <T> A1(A, 2, 1, 1);
     A0.upload(aData);
     A1.upload(aData);
-    DTensor<T> L(A); // L = A
-    CholeskyBatchFactoriser<T> chol(L);
+    DTensor <T> L(A); // L = A
+    CholeskyBatchFactoriser <T> chol(L);
     chol.factorise();
     std::vector<T> bData = {-1., -3., 5.};
-    DTensor<T> rhs(3, 1, 2);
-    DTensor<T> rhs0(rhs, 2, 0, 0);
-    DTensor<T> rhs1(rhs, 2, 1, 1);
+    DTensor <T> rhs(3, 1, 2);
+    DTensor <T> rhs0(rhs, 2, 0, 0);
+    DTensor <T> rhs1(rhs, 2, 1, 1);
     rhs0.upload(bData);
     rhs1.upload(bData);
-    DTensor<T> sol(rhs);
+    DTensor <T> sol(rhs);
     chol.solve(sol);
     std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
     std::vector<T> actual(6);
     sol.download(actual);
     for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon);  // 0
     for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon);  // 1
-    DTensor<T> error = A * sol;
+    DTensor <T> error = A * sol;
     error -= rhs;
     EXPECT_TRUE(error.normF() < epsilon);
 }
 
-TEST_F(CholeskyTest, choleskyBatchFactorSolve) {
-    choleskyBatchFactorSolve<float>(PRECISION_LOW);
-    choleskyBatchFactorSolve<double>(PRECISION_HIGH);
+TEST_F(CholeskyTest, choleskyBatchFactorSolve
+) {
+choleskyBatchFactorSolve<float>(PRECISION_LOW);
+choleskyBatchFactorSolve<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -1047,42 +1178,43 @@ void choleskyBatchSolve(T epsilon) {
     std::vector<T> aData = {10.0, 2.0, 3.0,
                             2.0, 20.0, -1.0,
                             3.0, -1.0, 30.0};
-    DTensor<T> A(3, 3, 2);
-    DTensor<T> A0(A, 2, 0, 0);
-    DTensor<T> A1(A, 2, 1, 1);
+    DTensor <T> A(3, 3, 2);
+    DTensor <T> A0(A, 2, 0, 0);
+    DTensor <T> A1(A, 2, 1, 1);
     A0.upload(aData);
     A1.upload(aData);
     std::vector<T> lowData = {3.162277660168380, 0, 0,
                               0.632455532033676, 4.427188724235731, 0,
                               0.948683298050514, -0.361403161162101, 5.382321781081287};  // from matlab
-    DTensor<T> low(3, 3, 2);
-    DTensor<T> low0(low, 2, 0, 0);
-    DTensor<T> low1(low, 2, 1, 1);
+    DTensor <T> low(3, 3, 2);
+    DTensor <T> low0(low, 2, 0, 0);
+    DTensor <T> low1(low, 2, 1, 1);
     low0.upload(lowData, rowMajor);
     low1.upload(lowData, rowMajor);
-    DTensor<T> L(low);
-    CholeskyBatchFactoriser<T> chol(L, true);
+    DTensor <T> L(low);
+    CholeskyBatchFactoriser <T> chol(L, true);
     std::vector<T> bData = {-1., -3., 5.};
-    DTensor<T> rhs(3, 1, 2);
-    DTensor<T> rhs0(rhs, 2, 0, 0);
-    DTensor<T> rhs1(rhs, 2, 1, 1);
+    DTensor <T> rhs(3, 1, 2);
+    DTensor <T> rhs0(rhs, 2, 0, 0);
+    DTensor <T> rhs1(rhs, 2, 1, 1);
     rhs0.upload(bData);
     rhs1.upload(bData);
-    DTensor<T> sol(rhs);
+    DTensor <T> sol(rhs);
     chol.solve(sol);
     std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
     std::vector<T> actual(6);
     sol.download(actual);
     for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon);  // 0
     for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon);  // 1
-    DTensor<T> error = A * sol;
+    DTensor <T> error = A * sol;
     error -= rhs;
     EXPECT_TRUE(error.normF() < epsilon);
 }
 
-TEST_F(CholeskyTest, choleskyBatchSolve) {
-    choleskyBatchSolve<float>(PRECISION_LOW);
-    choleskyBatchSolve<double>(PRECISION_HIGH);
+TEST_F(CholeskyTest, choleskyBatchSolve
+) {
+choleskyBatchSolve<float>(PRECISION_LOW);
+choleskyBatchSolve<double>(PRECISION_HIGH);
 }
 
 
@@ -1105,15 +1237,15 @@ TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void qrFactorisation(T epsilon) {
     size_t nR = 4;
     size_t nC = 3;
-    DTensor<T> temp(nR, nC);
-    DTensor<T> A = DTensor<T>::createRandomTensor(nR, nC, 1, -100, 100);
-    QRFactoriser<T> qr(temp);
+    DTensor <T> temp(nR, nC);
+    DTensor <T> A = DTensor<T>::createRandomTensor(nR, nC, 1, -100, 100);
+    QRFactoriser <T> qr(temp);
     A.deviceCopyTo(temp);
     int status = qr.factorise();
     EXPECT_EQ(status, 0);
-    DTensor<T> Q(nR, nC);
-    DTensor<T> R(nC, nC, 1, true);
-    DTensor<T> QR(nR, nC);
+    DTensor <T> Q(nR, nC);
+    DTensor <T> R(nC, nC, 1, true);
+    DTensor <T> QR(nR, nC);
     status = qr.getQR(Q, R);
     EXPECT_EQ(status, 0);
     QR.addAB(Q, R);
@@ -1122,9 +1254,10 @@ void qrFactorisation(T epsilon) {
     EXPECT_NEAR(nrm, 0., epsilon);
 }
 
-TEST_F(QRTest, qrFactorisation) {
-    qrFactorisation<float>(PRECISION_LOW);
-    qrFactorisation<double>(PRECISION_HIGH);
+TEST_F(QRTest, qrFactorisation
+) {
+qrFactorisation<float>(PRECISION_LOW);
+qrFactorisation<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -1136,15 +1269,15 @@ TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void qrFactorisationTall(T epsilon) {
     size_t nR = 20;
     size_t nC = 3;
-    DTensor<T> temp(nR, nC);
-    DTensor<T> A = DTensor<T>::createRandomTensor(nR, nC, 1, -100, 100);
-    QRFactoriser<T> qr(temp);
+    DTensor <T> temp(nR, nC);
+    DTensor <T> A = DTensor<T>::createRandomTensor(nR, nC, 1, -100, 100);
+    QRFactoriser <T> qr(temp);
     A.deviceCopyTo(temp);
     int status = qr.factorise();
     EXPECT_EQ(status, 0);
-    DTensor<T> Q(nR, nC);
-    DTensor<T> R(nC, nC, 1, true);
-    DTensor<T> QR(nR, nC);
+    DTensor <T> Q(nR, nC);
+    DTensor <T> R(nC, nC, 1, true);
+    DTensor <T> QR(nR, nC);
     status = qr.getQR(Q, R);
     EXPECT_EQ(status, 0);
     QR.addAB(Q, R);
@@ -1153,9 +1286,10 @@ void qrFactorisationTall(T epsilon) {
     EXPECT_NEAR(nrm, 0., epsilon);
 }
 
-TEST_F(QRTest, qrFactorisationTall) {
-    qrFactorisationTall<float>(PRECISION_LOW);
-    qrFactorisationTall<double>(PRECISION_HIGH);
+TEST_F(QRTest, qrFactorisationTall
+) {
+qrFactorisationTall<float>(PRECISION_LOW);
+qrFactorisationTall<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -1166,7 +1300,7 @@ TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void qrLeastSquares(T epsilon) {
     size_t nR = 4;
     size_t nC = 3;
-    DTensor<T> temp(nR, nC);
+    DTensor <T> temp(nR, nC);
     std::vector<T> vecA = {85.5638, -59.4001, -80.1992,
                            99.9464, 5.51393, 5.17935,
                            6.87488, -26.7536, 36.0914,
@@ -1175,12 +1309,12 @@ void qrLeastSquares(T epsilon) {
                            -48.5744,
                            43.4229,
                            -56.5081};  // Random vector
-    DTensor<T> A(vecA, nR, nC, 1, rowMajor);
-    DTensor<T> b(vecB, nR);
-    DTensor<T> xFull(nR);
-    DTensor<T> x(xFull, 0, 0, nC - 1);
-    DTensor<T> Ax(nR);
-    QRFactoriser<T> qr(temp);
+    DTensor <T> A(vecA, nR, nC, 1, rowMajor);
+    DTensor <T> b(vecB, nR);
+    DTensor <T> xFull(nR);
+    DTensor <T> x(xFull, 0, 0, nC - 1);
+    DTensor <T> Ax(nR);
+    QRFactoriser <T> qr(temp);
     A.deviceCopyTo(temp);
     int status = qr.factorise();
     EXPECT_EQ(status, 0);
@@ -1193,9 +1327,10 @@ void qrLeastSquares(T epsilon) {
     EXPECT_NEAR(nrm, 80.003169364198072, epsilon);  // From MatLab
 }
 
-TEST_F(QRTest, qrLeastSquares) {
-    qrLeastSquares<float>(PRECISION_LOW);
-    qrLeastSquares<double>(PRECISION_HIGH);
+TEST_F(QRTest, qrLeastSquares
+) {
+qrLeastSquares<float>(PRECISION_LOW);
+qrLeastSquares<double>(PRECISION_HIGH);
 }
 
 
@@ -1221,19 +1356,19 @@ void computeNullspaceTensor(T epsilon) {
                             1, 2, 3, 4, 2, 4, 6, 8, 3, 6, 9, 12,
                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-    DTensor<T> A(aData, 3, 4, 5);
-    Nullspace<T> ns(A);
-    DTensor<T> nA = ns.nullspace();
+    DTensor <T> A(aData, 3, 4, 5);
+    Nullspace <T> ns(A);
+    DTensor <T> nA = ns.nullspace();
     size_t nMats = nA.numMats();
     EXPECT_EQ(nMats, 5);
     for (size_t i = 0; i < nMats; i++) {
-        DTensor<T> nAi(nA, 2, i, i);
-        DTensor<T> Ai(A, 2, i, i);
-        DTensor<T> mustBeZero = Ai * nAi;
+        DTensor <T> nAi(nA, 2, i, i);
+        DTensor <T> Ai(A, 2, i, i);
+        DTensor <T> mustBeZero = Ai * nAi;
         EXPECT_LT(mustBeZero.normF(), epsilon);
 
-        DTensor<T> nAiTr = nAi.tr();
-        DTensor<T> mustBeEye = nAiTr * nAi;
+        DTensor <T> nAiTr = nAi.tr();
+        DTensor <T> mustBeEye = nAiTr * nAi;
         EXPECT_NEAR(1, mustBeEye(0, 0, 0), epsilon);
         for (size_t ir = 0; ir < mustBeEye.numRows(); ir++) {
             for (size_t ic = 0; ic < mustBeEye.numCols(); ic++) {
@@ -1245,9 +1380,10 @@ void computeNullspaceTensor(T epsilon) {
     }
 }
 
-TEST_F(NullspaceTest, computeNullspaceTensor) {
-    computeNullspaceTensor<float>(PRECISION_LOW);
-    computeNullspaceTensor<double>(PRECISION_HIGH);
+TEST_F(NullspaceTest, computeNullspaceTensor
+) {
+computeNullspaceTensor<float>(PRECISION_LOW);
+computeNullspaceTensor<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -1262,15 +1398,16 @@ void computeNullspaceTrivial(T epsilon) {
                         1, 1, 1,
                         5, 6, 7,
                         9, 0, 3};
-    DTensor<T> A(data, 3, 3, 2, rowMajor);
-    Nullspace<T> nullA(A);
-    DTensor<T> N = nullA.nullspace();
+    DTensor <T> A(data, 3, 3, 2, rowMajor);
+    Nullspace <T> nullA(A);
+    DTensor <T> N = nullA.nullspace();
     EXPECT_EQ(N.normF(), 0);
 }
 
-TEST_F(NullspaceTest, computeNullspaceTrivial) {
-    computeNullspaceTrivial<float>(PRECISION_LOW);
-    computeNullspaceTrivial<double>(PRECISION_HIGH);
+TEST_F(NullspaceTest, computeNullspaceTrivial
+) {
+computeNullspaceTrivial<float>(PRECISION_LOW);
+computeNullspaceTrivial<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -1285,34 +1422,35 @@ void projectOnNullspaceTensor(T epsilon) {
     std::vector<T> mat{1, -2, 3, 4, -1, -1, -1,
                        1, 2, -3, 4, -1, -1, -1,
                        -1, 3, 5, -7, -1, -1, -1};
-    DTensor<T> A(m, n, 1);
+    DTensor <T> A(m, n, 1);
     A.upload(mat, rowMajor);
-    Nullspace<T> ns = Nullspace(A);
-    DTensor<T> N = ns.nullspace();
+    Nullspace <T> ns = Nullspace(A);
+    DTensor <T> N = ns.nullspace();
 
     // online
     std::vector<T> vec{1, 2, 3, 4, 5, 6, 7};
-    DTensor<T> x(vec, n);
-    DTensor<T> proj(x);
+    DTensor <T> x(vec, n);
+    DTensor <T> proj(x);
     ns.project(proj);
 
     // Testing that proj is indeed in ker A
-    DTensor<T> error(m, 1, 1, true);
+    DTensor <T> error(m, 1, 1, true);
     error.addAB(A, proj);
     EXPECT_TRUE(error.normF() < epsilon);
 
     // Orthogonality test (other - p) † (p - x)
     std::vector<T> h_other{1, -2, 5, 4, 0, 0, 0};
-    DTensor<T> other(h_other, n);
-    DTensor<T> y = N * other;
-    DTensor<T> delta1 = y - proj;
-    DTensor<T> delta2 = proj - x;
+    DTensor <T> other(h_other, n);
+    DTensor <T> y = N * other;
+    DTensor <T> delta1 = y - proj;
+    DTensor <T> delta2 = proj - x;
     EXPECT_LT(delta1.dotF(delta2), epsilon);
 }
 
-TEST_F(NullspaceTest, projectOnNullspaceTensor) {
-    projectOnNullspaceTensor<float>(PRECISION_LOW);
-    projectOnNullspaceTensor<double>(PRECISION_HIGH);
+TEST_F(NullspaceTest, projectOnNullspaceTensor
+) {
+projectOnNullspaceTensor<float>(PRECISION_LOW);
+projectOnNullspaceTensor<double>(PRECISION_HIGH);
 }
 
 
@@ -1350,9 +1488,10 @@ void givensAnnihilateElement(T epsilon) {
     }
 }
 
-TEST_F(GivensAnnihilatorTest, givensAnnihilateElement) {
-    givensAnnihilateElement<float>(PRECISION_LOW);
-    givensAnnihilateElement<double>(PRECISION_HIGH);
+TEST_F(GivensAnnihilatorTest, givensAnnihilateElement
+) {
+givensAnnihilateElement<float>(PRECISION_LOW);
+givensAnnihilateElement<double>(PRECISION_HIGH);
 }
 
 
@@ -1379,9 +1518,10 @@ void givensAnnihilateCorrectness(T epsilon) {
 
 }
 
-TEST_F(GivensAnnihilatorTest, givensAnnihilateCorrectness) {
-    givensAnnihilateCorrectness<double>(1e-14);
-    givensAnnihilateCorrectness<float>(1e-12);
+TEST_F(GivensAnnihilatorTest, givensAnnihilateCorrectness
+) {
+givensAnnihilateCorrectness<double>(1e-14);
+givensAnnihilateCorrectness<float>(1e-12);
 }
 
 

From 5b3a67798f6f8e8e0642cbe038dfdc50c6a1b3a8 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 14:47:22 +0000
Subject: [PATCH 14/17] Test reshape more thoroughly

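Also reformat the test code: TEST_F declarations are put back on a single
line and their bodies are indented again.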
---
 test/testTensor.cu | 440 ++++++++++++++++++---------------------------
 1 file changed, 172 insertions(+), 268 deletions(-)

diff --git a/test/testTensor.cu b/test/testTensor.cu
index 982934c..3b28a34 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -36,14 +36,10 @@ void tensorConstructionZero() {
     EXPECT_EQ(expectedResult, zeroDown);
 }
 
-TEST_F(TensorTest, tensorConstructionZero
-) {
-tensorConstructionZero<float>();
-
-tensorConstructionZero<double>();
-
-tensorConstructionZero<int>();
-
+TEST_F(TensorTest, tensorConstructionZero) {
+    tensorConstructionZero<float>();
+    tensorConstructionZero<double>();
+    tensorConstructionZero<int>();
 }
 
 /* ---------------------------------------
@@ -92,14 +88,10 @@ void tensorConstructionStorageMode() {
     EXPECT_EQ(Cm, hostData);
 }
 
-TEST_F(TensorTest, tensorConstructionStorageMode
-) {
-tensorConstructionStorageMode<float>();
-
-tensorConstructionStorageMode<double>();
-
-tensorConstructionStorageMode<int>();
-
+TEST_F(TensorTest, tensorConstructionStorageMode) {
+    tensorConstructionStorageMode<float>();
+    tensorConstructionStorageMode<double>();
+    tensorConstructionStorageMode<int>();
 }
 
 /* ---------------------------------------
@@ -117,14 +109,10 @@ void randomTensorCreation() {
     EXPECT_TRUE(rEle >= -1 && rEle <= 1);
 }
 
-TEST_F(TensorTest, randomTensorCreation
-) {
-randomTensorCreation<float>();
-
-randomTensorCreation<double>();
-
-randomTensorCreation<int>();
-
+TEST_F(TensorTest, randomTensorCreation) {
+    randomTensorCreation<float>();
+    randomTensorCreation<double>();
+    randomTensorCreation<int>();
 }
 
 /* ---------------------------------------
@@ -138,18 +126,12 @@ void tensorMoveConstructor() {
     DTensor <T> y(DTensor < T > {100, 10, 1000});
 }
 
-TEST_F(TensorTest, tensorMoveConstructor
-) {
-tensorMoveConstructor<float>();
-
-tensorMoveConstructor<double>();
-
-tensorMoveConstructor<int>();
-
-tensorMoveConstructor<int *>();
-
-tensorMoveConstructor<double *>();
-
+TEST_F(TensorTest, tensorMoveConstructor) {
+    tensorMoveConstructor<float>();
+    tensorMoveConstructor<double>();
+    tensorMoveConstructor<int>();
+    tensorMoveConstructor<int *>();
+    tensorMoveConstructor<double *>();
 }
 
 /* ---------------------------------------
@@ -167,14 +149,10 @@ void tensorConstructionFromVector() {
     EXPECT_EQ(2 * 3 * 4, tenz.numEl());
 }
 
-TEST_F(TensorTest, tensorConstructionFromVector
-) {
-tensorConstructionFromVector<float>();
-
-tensorConstructionFromVector<double>();
-
-tensorConstructionFromVector<int>();
-
+TEST_F(TensorTest, tensorConstructionFromVector) {
+    tensorConstructionFromVector<float>();
+    tensorConstructionFromVector<double>();
+    tensorConstructionFromVector<int>();
 }
 
 /* ---------------------------------------
@@ -196,14 +174,10 @@ void tensorCopyConstructor() {
     EXPECT_NE(tenz.raw(), tenzCp.raw());
 }
 
-TEST_F(TensorTest, tensorCopyConstructor
-) {
-tensorCopyConstructor<float>();
-
-tensorCopyConstructor<double>();
-
-tensorCopyConstructor<int>();
-
+TEST_F(TensorTest, tensorCopyConstructor) {
+    tensorCopyConstructor<float>();
+    tensorCopyConstructor<double>();
+    tensorCopyConstructor<int>();
 }
 
 /* ---------------------------------------
@@ -222,14 +196,10 @@ void tensorSlicingConstructorAxis2() {
     EXPECT_EQ(tens.raw(), tensSlice.raw()); // it is indeed a slice
 }
 
-TEST_F(TensorTest, tensorSlicingConstructorAxis2
-) {
-tensorSlicingConstructorAxis2<float>();
-
-tensorSlicingConstructorAxis2<double>();
-
-tensorSlicingConstructorAxis2<int>();
-
+TEST_F(TensorTest, tensorSlicingConstructorAxis2) {
+    tensorSlicingConstructorAxis2<float>();
+    tensorSlicingConstructorAxis2<double>();
+    tensorSlicingConstructorAxis2<int>();
 }
 
 /* ---------------------------------------
@@ -251,14 +221,10 @@ void tensorSlicingConstructorAxis1() {
     EXPECT_EQ(expected, tenzSliceDown);
 }
 
-TEST_F(TensorTest, tensorSlicingConstructorAxis1
-) {
-tensorSlicingConstructorAxis1<float>();
-
-tensorSlicingConstructorAxis1<double>();
-
-tensorSlicingConstructorAxis1<int>();
-
+TEST_F(TensorTest, tensorSlicingConstructorAxis1) {
+    tensorSlicingConstructorAxis1<float>();
+    tensorSlicingConstructorAxis1<double>();
+    tensorSlicingConstructorAxis1<int>();
 }
 
 /* ---------------------------------------
@@ -280,14 +246,10 @@ void tensorSlicingConstructorAxis0() {
     EXPECT_EQ(expected, tenzSliceDown);
 }
 
-TEST_F(TensorTest, tensorSlicingConstructorAxis0
-) {
-tensorSlicingConstructorAxis0<float>();
-
-tensorSlicingConstructorAxis0<double>();
-
-tensorSlicingConstructorAxis0<int>();
-
+TEST_F(TensorTest, tensorSlicingConstructorAxis0) {
+    tensorSlicingConstructorAxis0<float>();
+    tensorSlicingConstructorAxis0<double>();
+    tensorSlicingConstructorAxis0<int>();
 }
 
 /* ---------------------------------------
@@ -307,14 +269,10 @@ void tensorUpload() {
     EXPECT_EQ(8, tenz(1, 2, 3));
 }
 
-TEST_F(TensorTest, tensorUpload
-) {
-tensorUpload<float>();
-
-tensorUpload<double>();
-
-tensorUpload<int>();
-
+TEST_F(TensorTest, tensorUpload) {
+    tensorUpload<float>();
+    tensorUpload<double>();
+    tensorUpload<int>();
 }
 
 /* ---------------------------------------
@@ -335,14 +293,10 @@ void tensorDeviceCopyTo() {
     EXPECT_EQ(expected, actual);
 }
 
-TEST_F(TensorTest, tensorDeviceCopyTo
-) {
-tensorDeviceCopyTo<float>();
-
-tensorDeviceCopyTo<double>();
-
-tensorDeviceCopyTo<int>();
-
+TEST_F(TensorTest, tensorDeviceCopyTo) {
+    tensorDeviceCopyTo<float>();
+    tensorDeviceCopyTo<double>();
+    tensorDeviceCopyTo<int>();
 }
 
 
@@ -354,6 +308,8 @@ TEMPLATE_WITH_TYPE_T
 void tensorReshape() {
     size_t m = 5, n = 10, k = 3;
     DTensor <T> a = DTensor<T>::createRandomTensor(m, n, k, -1, 1); // dim = (m, n, k)
+    T lastElement = a(m - 1, n - 1, k - 1); // last element
+    T firstElement = a(0, 0, 0);
     ASSERT_EQ(m, a.numRows());
     ASSERT_EQ(n, a.numCols());
     ASSERT_EQ(k, a.numMats());
@@ -365,6 +321,20 @@ void tensorReshape() {
     ASSERT_EQ(k, a.numRows());
     ASSERT_EQ(n, a.numCols());
     ASSERT_EQ(m, a.numMats());
+    a.reshape(k * n, m, 1); // dim = (k*n, m, 1)
+    ASSERT_EQ(k * n, a.numRows());
+    ASSERT_EQ(m, a.numCols());
+    ASSERT_EQ(1, a.numMats());
+    a.reshape(m, k * n, 1); // dim = (m, k*n, 1)
+    ASSERT_EQ(m, a.numRows());
+    ASSERT_EQ(k * n, a.numCols());
+    ASSERT_EQ(1, a.numMats());
+    a.reshape(m * k * n, 1, 1); // dim = (m*k*n, 1, 1)
+    ASSERT_EQ(m * k * n, a.numRows());
+    ASSERT_EQ(1, a.numCols());
+    ASSERT_EQ(1, a.numMats());
+    ASSERT_EQ(lastElement, a(m * n * k - 1, 0, 0));
+    ASSERT_EQ(firstElement, a(0, 0, 0));
 }
 
 TEST_F(TensorTest, tensorReshape) {
@@ -393,10 +363,9 @@ void tensorDotF(T epsilon) {
     EXPECT_EQ(604, dotTensor);  // from MATLAB
 }
 
-TEST_F(TensorTest, tensorDotF
-) {
-tensorDotF<float>(PRECISION_LOW);
-tensorDotF<double>(PRECISION_HIGH);
+TEST_F(TensorTest, tensorDotF) {
+    tensorDotF<float>(PRECISION_LOW);
+    tensorDotF<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -410,10 +379,9 @@ void tensorNormF(T epsilon) {
     EXPECT_NEAR(26.153393661244042, tenz.normF(), epsilon); // from MATLAB
 }
 
-TEST_F(TensorTest, tensorNormF
-) {
-tensorNormF<float>(PRECISION_LOW);
-tensorNormF<double>(PRECISION_HIGH);
+TEST_F(TensorTest, tensorNormF) {
+    tensorNormF<float>(PRECISION_LOW);
+    tensorNormF<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -428,12 +396,9 @@ void tensorSumAbs() {
     EXPECT_NEAR(112, tenz.sumAbs(), PRECISION_HIGH); // from MATLAB
 }
 
-TEST_F(TensorTest, tensorSumAbs
-) {
-tensorSumAbs<float>();
-
-tensorSumAbs<double>();
-
+TEST_F(TensorTest, tensorSumAbs) {
+    tensorSumAbs<float>();
+    tensorSumAbs<double>();
 }
 
 /* ---------------------------------------
@@ -448,12 +413,9 @@ void tensorMax() {
     EXPECT_EQ(27, m);
 }
 
-TEST_F(TensorTest, tensorMax
-) {
-tensorMax<float>();
-
-tensorMax<double>();
-
+TEST_F(TensorTest, tensorMax) {
+    tensorMax<float>();
+    tensorMax<double>();
 }
 
 /* ---------------------------------------
@@ -468,12 +430,9 @@ void tensorMin() {
     EXPECT_EQ(0, m);
 }
 
-TEST_F(TensorTest, tensorMin
-) {
-tensorMin<float>();
-
-tensorMin<double>();
-
+TEST_F(TensorTest, tensorMin) {
+    tensorMin<float>();
+    tensorMin<double>();
 }
 
 /* ---------------------------------------
@@ -506,10 +465,9 @@ void tensorRightGivens(T epsilon) {
     }
 }
 
-TEST_F(TensorTest, tensorRightGivens
-) {
-tensorRightGivens<float>(PRECISION_LOW);
-tensorRightGivens<double>(PRECISION_HIGH);
+TEST_F(TensorTest, tensorRightGivens ) {
+    tensorRightGivens<float>(PRECISION_LOW);
+    tensorRightGivens<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -544,10 +502,9 @@ void tensorLeftGivens(T epsilon) {
     }
 }
 
-TEST_F(TensorTest, tensorLeftGivens
-) {
-tensorLeftGivens<float>(1e-10);
-tensorLeftGivens<double>(1e-14);
+TEST_F(TensorTest, tensorLeftGivens) {
+    tensorLeftGivens<float>(1e-10);
+    tensorLeftGivens<double>(1e-14);
 }
 
 /* ---------------------------------------
@@ -564,14 +521,10 @@ void tensorBracketOperator() {
     EXPECT_EQ(8, tenz(1, 2, 3));
 }
 
-TEST_F(TensorTest, tensorBracketOperator
-) {
-tensorBracketOperator<float>();
-
-tensorBracketOperator<double>();
-
-tensorBracketOperator<int>();
-
+TEST_F(TensorTest, tensorBracketOperator) {
+    tensorBracketOperator<float>();
+    tensorBracketOperator<double>();
+    tensorBracketOperator<int>();
 }
 
 /* ---------------------------------------
@@ -590,14 +543,10 @@ void tensorAssignmentOperator() {
     EXPECT_EQ(4, other.numMats());
 }
 
-TEST_F(TensorTest, tensorAssignmentOperator
-) {
-tensorAssignmentOperator<float>();
-
-tensorAssignmentOperator<double>();
-
-tensorAssignmentOperator<int>();
-
+TEST_F(TensorTest, tensorAssignmentOperator) {
+    tensorAssignmentOperator<float>();
+    tensorAssignmentOperator<double>();
+    tensorAssignmentOperator<int>();
 }
 
 /* ---------------------------------------
@@ -616,12 +565,9 @@ void tensorTimesEqualsScalar() {
     EXPECT_EQ(dataTimes3, actual);
 }
 
-TEST_F(TensorTest, tensorTimesEqualsScalar
-) {
-tensorTimesEqualsScalar<float>();
-
-tensorTimesEqualsScalar<double>();
-
+TEST_F(TensorTest, tensorTimesEqualsScalar) {
+    tensorTimesEqualsScalar<float>();
+    tensorTimesEqualsScalar<double>();
 }
 
 /* ---------------------------------------
@@ -640,12 +586,9 @@ void tensorTimesScalar() {
     EXPECT_EQ(dataTimes3, actual);
 }
 
-TEST_F(TensorTest, tensorTimesScalar
-) {
-tensorTimesScalar<float>();
-
-tensorTimesScalar<double>();
-
+TEST_F(TensorTest, tensorTimesScalar) {
+    tensorTimesScalar<float>();
+    tensorTimesScalar<double>();
 }
 
 /* ---------------------------------------
@@ -665,12 +608,9 @@ void tensorPlusEqualsTensor() {
     EXPECT_EQ(expected, actual);
 }
 
-TEST_F(TensorTest, tensorPlusEqualsTensor
-) {
-tensorPlusEqualsTensor<float>();
-
-tensorPlusEqualsTensor<double>();
-
+TEST_F(TensorTest, tensorPlusEqualsTensor) {
+    tensorPlusEqualsTensor<float>();
+    tensorPlusEqualsTensor<double>();
 }
 
 /* ---------------------------------------
@@ -690,12 +630,9 @@ void tensorMinusEqualsTensor() {
     EXPECT_EQ(expected, actual);
 }
 
-TEST_F(TensorTest, tensorMinusEqualsTensor
-) {
-tensorMinusEqualsTensor<float>();
-
-tensorMinusEqualsTensor<double>();
-
+TEST_F(TensorTest, tensorMinusEqualsTensor) {
+    tensorMinusEqualsTensor<float>();
+    tensorMinusEqualsTensor<double>();
 }
 
 /* ---------------------------------------
@@ -715,12 +652,9 @@ void tensorPlusTensor() {
     EXPECT_EQ(expected, actual);
 }
 
-TEST_F(TensorTest, tensorPlusTensor
-) {
-tensorPlusTensor<float>();
-
-tensorPlusTensor<double>();
-
+TEST_F(TensorTest, tensorPlusTensor) {
+    tensorPlusTensor<float>();
+    tensorPlusTensor<double>();
 }
 
 /* ---------------------------------------
@@ -740,12 +674,9 @@ void tensorMinusTensor() {
     EXPECT_EQ(expected, actual);
 }
 
-TEST_F(TensorTest, tensorMinusTensor
-) {
-tensorMinusTensor<float>();
-
-tensorMinusTensor<double>();
-
+TEST_F(TensorTest, tensorMinusTensor) {
+    tensorMinusTensor<float>();
+    tensorMinusTensor<double>();
 }
 
 /* ---------------------------------------
@@ -770,12 +701,9 @@ void tensorAddAB() {
     EXPECT_EQ(expected, actual);
 }
 
-TEST_F(TensorTest, tensorAddAB
-) {
-tensorAddAB<double>();
-
-tensorAddAB<float>();
-
+TEST_F(TensorTest, tensorAddAB) {
+    tensorAddAB<double>();
+    tensorAddAB<float>();
 }
 
 /* ---------------------------------------
@@ -804,12 +732,9 @@ void tensorGetRows() {
     EXPECT_EQ(expected1, actual1);
 }
 
-TEST_F(TensorTest, tensorGetRows
-) {
-tensorGetRows<float>();
-
-tensorGetRows<double>();
-
+TEST_F(TensorTest, tensorGetRows) {
+    tensorGetRows<float>();
+    tensorGetRows<double>();
 }
 
 
@@ -832,12 +757,9 @@ void tensorTranspose() {
 
 }
 
-TEST_F(TensorTest, tensorTranspose
-) {
-tensorTranspose<float>();
-
-tensorTranspose<double>();
-
+TEST_F(TensorTest, tensorTranspose) {
+    tensorTranspose<float>();
+    tensorTranspose<double>();
 }
 
 /* ================================================================================================
@@ -876,10 +798,9 @@ void tensorLeastSquares1(T epsilon) {
     EXPECT_LT(nrmErr, epsilon);
 }
 
-TEST_F(LeastSquaresTest, tensorLS1
-) {
-tensorLeastSquares1<float>(PRECISION_LOW);
-tensorLeastSquares1<double>(PRECISION_HIGH);
+TEST_F(LeastSquaresTest, tensorLS1) {
+    tensorLeastSquares1<float>(PRECISION_LOW);
+    tensorLeastSquares1<double>(PRECISION_HIGH);
 }
 
 
@@ -914,10 +835,9 @@ void singularValuesComputation(float epsilon) {
     EXPECT_TRUE(U.has_value());
 }
 
-TEST_F(SvdTest, singularValuesComputation
-) {
-singularValuesComputation<float>(PRECISION_LOW);
-singularValuesComputation<double>(PRECISION_HIGH);
+TEST_F(SvdTest, singularValuesComputation) {
+    singularValuesComputation<float>(PRECISION_LOW);
+    singularValuesComputation<double>(PRECISION_HIGH);
 }
 
 
@@ -947,10 +867,9 @@ void singularValuesMemory(float epsilon) {
     EXPECT_EQ(u1->raw(), u2->raw());
 }
 
-TEST_F(SvdTest, singularValuesMemory
-) {
-singularValuesMemory<float>(PRECISION_LOW);
-singularValuesMemory<double>(PRECISION_HIGH);
+TEST_F(SvdTest, singularValuesMemory) {
+    singularValuesMemory<float>(PRECISION_LOW);
+    singularValuesMemory<double>(PRECISION_HIGH);
 }
 
 
@@ -994,10 +913,9 @@ void singularValuesMultipleMatrices(float epsilon) {
 
 }
 
-TEST_F(SvdTest, singularValuesMultipleMatrices
-) {
-singularValuesMultipleMatrices<float>(10 * PRECISION_LOW); // SVD with float performs quite poorly
-singularValuesMultipleMatrices<double>(PRECISION_HIGH);
+TEST_F(SvdTest, singularValuesMultipleMatrices) {
+    singularValuesMultipleMatrices<float>(10 * PRECISION_LOW); // SVD with float performs quite poorly
+    singularValuesMultipleMatrices<double>(PRECISION_HIGH);
 }
 
 
@@ -1020,10 +938,9 @@ void singularValuesRankMultipleMatrices(float epsilon) {
     EXPECT_EQ(1, rank(0, 0, 2));
 }
 
-TEST_F(SvdTest, singularValuesRankMultipleMatrices
-) {
-singularValuesRankMultipleMatrices<float>(PRECISION_LOW); // SVD with float performs quite poorly
-singularValuesRankMultipleMatrices<double>(PRECISION_HIGH);
+TEST_F(SvdTest, singularValuesRankMultipleMatrices) {
+    singularValuesRankMultipleMatrices<float>(PRECISION_LOW); // SVD with float performs quite poorly
+    singularValuesRankMultipleMatrices<double>(PRECISION_HIGH);
 }
 
 /* ================================================================================================
@@ -1054,10 +971,9 @@ void choleskyFactorisation(T epsilon) {
     EXPECT_NEAR(5.382321781081287, A(2, 2), epsilon);
 }
 
-TEST_F(CholeskyTest, choleskyFactorisation
-) {
-choleskyFactorisation<float>(PRECISION_LOW);
-choleskyFactorisation<double>(PRECISION_HIGH);
+TEST_F(CholeskyTest, choleskyFactorisation) {
+    choleskyFactorisation<float>(PRECISION_LOW);
+    choleskyFactorisation<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -1090,10 +1006,9 @@ void choleskyFactorisationSolution(T epsilon) {
 
 }
 
-TEST_F(CholeskyTest, choleskyFactorisationSolution
-) {
-choleskyFactorisationSolution<float>(PRECISION_LOW);
-choleskyFactorisationSolution<double>(PRECISION_HIGH);
+TEST_F(CholeskyTest, choleskyFactorisationSolution) {
+    choleskyFactorisationSolution<float>(PRECISION_LOW);
+    choleskyFactorisationSolution<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -1122,10 +1037,9 @@ void choleskyBatchFactorisation(T epsilon) {
     EXPECT_NEAR(5.382321781081287, A(2, 2, 1), epsilon);
 }
 
-TEST_F(CholeskyTest, choleskyBatchFactorisation
-) {
-choleskyBatchFactorisation<float>(PRECISION_LOW);
-choleskyBatchFactorisation<double>(PRECISION_HIGH);
+TEST_F(CholeskyTest, choleskyBatchFactorisation) {
+    choleskyBatchFactorisation<float>(PRECISION_LOW);
+    choleskyBatchFactorisation<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -1163,10 +1077,9 @@ void choleskyBatchFactorSolve(T epsilon) {
     EXPECT_TRUE(error.normF() < epsilon);
 }
 
-TEST_F(CholeskyTest, choleskyBatchFactorSolve
-) {
-choleskyBatchFactorSolve<float>(PRECISION_LOW);
-choleskyBatchFactorSolve<double>(PRECISION_HIGH);
+TEST_F(CholeskyTest, choleskyBatchFactorSolve) {
+    choleskyBatchFactorSolve<float>(PRECISION_LOW);
+    choleskyBatchFactorSolve<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -1211,10 +1124,9 @@ void choleskyBatchSolve(T epsilon) {
     EXPECT_TRUE(error.normF() < epsilon);
 }
 
-TEST_F(CholeskyTest, choleskyBatchSolve
-) {
-choleskyBatchSolve<float>(PRECISION_LOW);
-choleskyBatchSolve<double>(PRECISION_HIGH);
+TEST_F(CholeskyTest, choleskyBatchSolve) {
+    choleskyBatchSolve<float>(PRECISION_LOW);
+    choleskyBatchSolve<double>(PRECISION_HIGH);
 }
 
 
@@ -1254,10 +1166,9 @@ void qrFactorisation(T epsilon) {
     EXPECT_NEAR(nrm, 0., epsilon);
 }
 
-TEST_F(QRTest, qrFactorisation
-) {
-qrFactorisation<float>(PRECISION_LOW);
-qrFactorisation<double>(PRECISION_HIGH);
+TEST_F(QRTest, qrFactorisation) {
+    qrFactorisation<float>(PRECISION_LOW);
+    qrFactorisation<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -1286,10 +1197,9 @@ void qrFactorisationTall(T epsilon) {
     EXPECT_NEAR(nrm, 0., epsilon);
 }
 
-TEST_F(QRTest, qrFactorisationTall
-) {
-qrFactorisationTall<float>(PRECISION_LOW);
-qrFactorisationTall<double>(PRECISION_HIGH);
+TEST_F(QRTest, qrFactorisationTall) {
+    qrFactorisationTall<float>(PRECISION_LOW);
+    qrFactorisationTall<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -1327,10 +1237,9 @@ void qrLeastSquares(T epsilon) {
     EXPECT_NEAR(nrm, 80.003169364198072, epsilon);  // From MatLab
 }
 
-TEST_F(QRTest, qrLeastSquares
-) {
-qrLeastSquares<float>(PRECISION_LOW);
-qrLeastSquares<double>(PRECISION_HIGH);
+TEST_F(QRTest, qrLeastSquares) {
+    qrLeastSquares<float>(PRECISION_LOW);
+    qrLeastSquares<double>(PRECISION_HIGH);
 }
 
 
@@ -1380,10 +1289,9 @@ void computeNullspaceTensor(T epsilon) {
     }
 }
 
-TEST_F(NullspaceTest, computeNullspaceTensor
-) {
-computeNullspaceTensor<float>(PRECISION_LOW);
-computeNullspaceTensor<double>(PRECISION_HIGH);
+TEST_F(NullspaceTest, computeNullspaceTensor) {
+    computeNullspaceTensor<float>(PRECISION_LOW);
+    computeNullspaceTensor<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -1404,10 +1312,9 @@ void computeNullspaceTrivial(T epsilon) {
     EXPECT_EQ(N.normF(), 0);
 }
 
-TEST_F(NullspaceTest, computeNullspaceTrivial
-) {
-computeNullspaceTrivial<float>(PRECISION_LOW);
-computeNullspaceTrivial<double>(PRECISION_HIGH);
+TEST_F(NullspaceTest, computeNullspaceTrivial) {
+    computeNullspaceTrivial<float>(PRECISION_LOW);
+    computeNullspaceTrivial<double>(PRECISION_HIGH);
 }
 
 /* ---------------------------------------
@@ -1447,10 +1354,9 @@ void projectOnNullspaceTensor(T epsilon) {
     EXPECT_LT(delta1.dotF(delta2), epsilon);
 }
 
-TEST_F(NullspaceTest, projectOnNullspaceTensor
-) {
-projectOnNullspaceTensor<float>(PRECISION_LOW);
-projectOnNullspaceTensor<double>(PRECISION_HIGH);
+TEST_F(NullspaceTest, projectOnNullspaceTensor) {
+    projectOnNullspaceTensor<float>(PRECISION_LOW);
+    projectOnNullspaceTensor<double>(PRECISION_HIGH);
 }
 
 
@@ -1488,10 +1394,9 @@ void givensAnnihilateElement(T epsilon) {
     }
 }
 
-TEST_F(GivensAnnihilatorTest, givensAnnihilateElement
-) {
-givensAnnihilateElement<float>(PRECISION_LOW);
-givensAnnihilateElement<double>(PRECISION_HIGH);
+TEST_F(GivensAnnihilatorTest, givensAnnihilateElement) {
+    givensAnnihilateElement<float>(PRECISION_LOW);
+    givensAnnihilateElement<double>(PRECISION_HIGH);
 }
 
 
@@ -1518,10 +1423,9 @@ void givensAnnihilateCorrectness(T epsilon) {
 
 }
 
-TEST_F(GivensAnnihilatorTest, givensAnnihilateCorrectness
-) {
-givensAnnihilateCorrectness<double>(1e-14);
-givensAnnihilateCorrectness<float>(1e-12);
+TEST_F(GivensAnnihilatorTest, givensAnnihilateCorrectness) {
+    givensAnnihilateCorrectness<double>(1e-14);
+    givensAnnihilateCorrectness<float>(1e-12);
 }
 
 

From 75576ac96237608c2ce384aaf3ba3d3072c11a9c Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 16:49:35 +0000
Subject: [PATCH 15/17] Further unit tests for reshape and docs

Unit test: Slice along axis=2 and reshape
Write documentation for DTensor<T>::reshape
Fix formatting issues in testTensor
---
 include/tensor.cuh |  15 ++
 test/testTensor.cu | 338 +++++++++++++++++++++++++--------------------
 2 files changed, 200 insertions(+), 153 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 55b661b..d62803c 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -471,6 +471,21 @@ public:
      */
     void addAB(const DTensor<T> &A, const DTensor<T> &B, T alpha = 1, T beta = 0);
 
+    /**
+     * Reshapes the tensor
+     *
+     * If the new number of matrices is larger than the current one,
+     * this method will allocate a device array of type T* and length
+     * equal to the new number of matrices.
+     *
+     * No new memory is allocated if newNumMats = 1
+     *
+     * @param newNumRows new number of rows
+     * @param newNumCols new number of columns
+     * @param newNumMats new number of matrices
+     *
+     * @throws std::invalid_argument if the provided dimensions are incompatible
+     */
     void reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats = 1);
 
     /* ------------- OPERATORS ------------- */
diff --git a/test/testTensor.cu b/test/testTensor.cu
index 3b28a34..4c1c147 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -26,7 +26,7 @@ protected:
 
 TEMPLATE_WITH_TYPE_T
 void tensorConstructionZero() {
-    DTensor <T> zero(2, 3, 4, true);
+    DTensor<T> zero(2, 3, 4, true);
     EXPECT_EQ(2, zero.numRows());
     EXPECT_EQ(3, zero.numCols());
     EXPECT_EQ(4, zero.numMats());
@@ -65,21 +65,21 @@ void tensorConstructionStorageMode() {
     std::vector<T> Rm = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
     std::vector<T> hostData(rows * cols * mats);
     // test constructor
-    DTensor <T> testCm(Cm, rows, cols, mats, columnMajor);
-    DTensor <T> testRm(Rm, rows, cols, mats, rowMajor);
+    DTensor<T> testCm(Cm, rows, cols, mats, columnMajor);
+    DTensor<T> testRm(Rm, rows, cols, mats, rowMajor);
     testCm.download(hostData);
     EXPECT_EQ(Cm, hostData);
     testRm.download(hostData);
     EXPECT_EQ(Cm, hostData);
     // test .upload()
-    DTensor <T> testSplitCm(rows, cols, mats);
-    DTensor <T> ACm(testSplitCm, 2, 0, 0);
-    DTensor <T> BCm(testSplitCm, 2, 1, 1);
+    DTensor<T> testSplitCm(rows, cols, mats);
+    DTensor<T> ACm(testSplitCm, 2, 0, 0);
+    DTensor<T> BCm(testSplitCm, 2, 1, 1);
     ACm.upload(aCm, columnMajor);
     BCm.upload(bCm, columnMajor);
-    DTensor <T> testSplitRm(rows, cols, mats);
-    DTensor <T> ARm(testSplitRm, 2, 0, 0);
-    DTensor <T> BRm(testSplitRm, 2, 1, 1);
+    DTensor<T> testSplitRm(rows, cols, mats);
+    DTensor<T> ARm(testSplitRm, 2, 0, 0);
+    DTensor<T> BRm(testSplitRm, 2, 1, 1);
     ARm.upload(aRm, rowMajor);
     BRm.upload(bRm, rowMajor);
     testSplitCm.download(hostData);
@@ -121,9 +121,9 @@ TEST_F(TensorTest, randomTensorCreation) {
 
 TEMPLATE_WITH_TYPE_T
 void tensorMoveConstructor() {
-    DTensor <T> zero(2, 3, 4, true);
-    DTensor <T> x(std::move(zero));
-    DTensor <T> y(DTensor < T > {100, 10, 1000});
+    DTensor<T> zero(2, 3, 4, true);
+    DTensor<T> x(std::move(zero));
+    DTensor<T> y(DTensor < T > {100, 10, 1000});
 }
 
 TEST_F(TensorTest, tensorMoveConstructor) {
@@ -142,7 +142,7 @@ TEST_F(TensorTest, tensorMoveConstructor) {
 TEMPLATE_WITH_TYPE_T
 void tensorConstructionFromVector() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor <T> tenz(data, 2, 3, 4);
+    DTensor<T> tenz(data, 2, 3, 4);
     EXPECT_EQ(2, tenz.numRows());
     EXPECT_EQ(3, tenz.numCols());
     EXPECT_EQ(4, tenz.numMats());
@@ -162,8 +162,8 @@ TEST_F(TensorTest, tensorConstructionFromVector) {
 TEMPLATE_WITH_TYPE_T
 void tensorCopyConstructor() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor <T> tenz(data, 2, 3, 4);
-    DTensor <T> tenzCp(tenz);
+    DTensor<T> tenz(data, 2, 3, 4);
+    DTensor<T> tenzCp(tenz);
     EXPECT_EQ(2, tenzCp.numRows());
     EXPECT_EQ(3, tenzCp.numCols());
     EXPECT_EQ(4, tenzCp.numMats());
@@ -188,8 +188,8 @@ TEST_F(TensorTest, tensorCopyConstructor) {
 TEMPLATE_WITH_TYPE_T
 void tensorSlicingConstructorAxis2() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor <T> tens(data, 2, 3, 4);
-    DTensor <T> tensSlice(tens, 2, 0, 1); // matrices #0 and #1
+    DTensor<T> tens(data, 2, 3, 4);
+    DTensor<T> tensSlice(tens, 2, 0, 1); // matrices #0 and #1
     EXPECT_EQ(2, tensSlice.numRows());
     EXPECT_EQ(3, tensSlice.numCols());
     EXPECT_EQ(2, tensSlice.numMats());
@@ -210,8 +210,8 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis2) {
 TEMPLATE_WITH_TYPE_T
 void tensorSlicingConstructorAxis1() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor <T> tenz(data, 2, 3, 4);
-    DTensor <T> tenzSlice(tenz, 1, 1, 2); // columns from 1 to 2
+    DTensor<T> tenz(data, 2, 3, 4);
+    DTensor<T> tenzSlice(tenz, 1, 1, 2); // columns from 1 to 2
     EXPECT_EQ(2, tenzSlice.numRows());
     EXPECT_EQ(2, tenzSlice.numCols());
     EXPECT_EQ(1, tenzSlice.numMats());
@@ -235,8 +235,8 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis1) {
 TEMPLATE_WITH_TYPE_T
 void tensorSlicingConstructorAxis0() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor <T> tenz(data, 2, 3, 4);
-    DTensor <T> tenzSlice(tenz, 0, 2, 3); // elements 2..3
+    DTensor<T> tenz(data, 2, 3, 4);
+    DTensor<T> tenzSlice(tenz, 0, 2, 3); // elements 2..3
     EXPECT_EQ(2, tenzSlice.numRows());
     EXPECT_EQ(1, tenzSlice.numCols());
     EXPECT_EQ(1, tenzSlice.numMats());
@@ -259,7 +259,7 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis0) {
 TEMPLATE_WITH_TYPE_T
 void tensorUpload() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor <T> tenz(2, 3, 4);
+    DTensor<T> tenz(2, 3, 4);
     tenz.upload(data);
     EXPECT_EQ(2, tenz.numRows());
     EXPECT_EQ(3, tenz.numCols());
@@ -282,9 +282,9 @@ TEST_F(TensorTest, tensorUpload) {
 TEMPLATE_WITH_TYPE_T
 void tensorDeviceCopyTo() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor <T> tenz(data, 2, 3, 4);
-    DTensor <T> other(2, 3, 5, true);
-    DTensor <T> z(other, 2, 1, 4);
+    DTensor<T> tenz(data, 2, 3, 4);
+    DTensor<T> other(2, 3, 5, true);
+    DTensor<T> z(other, 2, 1, 4);
     tenz.deviceCopyTo(z);
     std::vector<T> expected = {0, 0, 0, 0, 0, 0,
                                1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 10, 5, 4, 3, 2, 1, -1, 4, 3, 4, 3, 4, 8};
@@ -301,13 +301,13 @@ TEST_F(TensorTest, tensorDeviceCopyTo) {
 
 
 /* ---------------------------------------
- * Tensor: Frobenius dot product
+ * Tensor: Reshape
  * --------------------------------------- */
 
 TEMPLATE_WITH_TYPE_T
 void tensorReshape() {
     size_t m = 5, n = 10, k = 3;
-    DTensor <T> a = DTensor<T>::createRandomTensor(m, n, k, -1, 1); // dim = (m, n, k)
+    DTensor<T> a = DTensor<T>::createRandomTensor(m, n, k, -1, 1); // dim = (m, n, k)
     T lastElement = a(m - 1, n - 1, k - 1); // last element
     T firstElement = a(0, 0, 0);
     ASSERT_EQ(m, a.numRows());
@@ -343,6 +343,38 @@ TEST_F(TensorTest, tensorReshape) {
     tensorReshape<int>();
 }
 
+/* ---------------------------------------
+ * Tensor: Slice, reshape and add/multiply
+ * --------------------------------------- */
+
+TEMPLATE_WITH_TYPE_T
+void tensorSliceAndReshape(T epsilon) {
+    std::vector<T> dataA = TENSOR_DATA_234A;
+    std::vector<T> dataB = TENSOR_DATA_234B;
+    DTensor<T> a(dataA, 2, 3, 4);
+    DTensor<T> b(dataB, 2, 3, 4);
+
+    /* ---- Slicing axis = 2 ---- */
+    DTensor<T> aSlice(a, 2, 1, 3);
+    DTensor<T> bSlice(b, 2, 1, 3);
+    aSlice.reshape(2, 9, 1);
+    bSlice.reshape(2, 9, 1);
+    aSlice += bSlice;
+
+    std::vector<T> dataAExpected = {1, 2, 3, 4, 5, 6, 41, 7, 5, 5,
+                                    19, 17, 14, 13, 5, 11, -8, -4,
+                                    6, 8, 8, -2, 8, 13};
+    DTensor<T> aExpected(dataAExpected, 2, 3, 4);
+
+    DTensor<T> err = aExpected - a;
+    ASSERT_LT(err.normF(), epsilon);
+}
+
+TEST_F(TensorTest, tensorSliceAndReshape) {
+    tensorSliceAndReshape<float>(PRECISION_LOW);
+    tensorSliceAndReshape<double>(PRECISION_HIGH);
+}
+
 /* ---------------------------------------
  * Tensor: Frobenius dot product
  * --------------------------------------- */
@@ -352,13 +384,13 @@ void tensorDotF(T epsilon) {
     // as vectors
     std::vector<T> dataA = TENSOR_DATA_234A;
     std::vector<T> dataB = TENSOR_DATA_234B;
-    DTensor <T> vecA(dataA, dataA.size());
-    DTensor <T> vecB(dataB, dataB.size());
+    DTensor<T> vecA(dataA, dataA.size());
+    DTensor<T> vecB(dataB, dataB.size());
     T dotVector = vecA.dotF(vecB);
     EXPECT_EQ(604, dotVector);  // from MATLAB
     // as matrices
-    DTensor <T> tenA(dataA, 2, 3, 4);
-    DTensor <T> tenB(dataB, 2, 3, 4);
+    DTensor<T> tenA(dataA, 2, 3, 4);
+    DTensor<T> tenB(dataB, 2, 3, 4);
     T dotTensor = tenA.dotF(tenB);
     EXPECT_EQ(604, dotTensor);  // from MATLAB
 }
@@ -375,7 +407,7 @@ TEST_F(TensorTest, tensorDotF) {
 TEMPLATE_WITH_TYPE_T
 void tensorNormF(T epsilon) {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor <T> tenz(data, 2, 3, 4);
+    DTensor<T> tenz(data, 2, 3, 4);
     EXPECT_NEAR(26.153393661244042, tenz.normF(), epsilon); // from MATLAB
 }
 
@@ -392,7 +424,7 @@ TEST_F(TensorTest, tensorNormF) {
 TEMPLATE_WITH_TYPE_T
 void tensorSumAbs() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor <T> tenz(data, 2, 3, 4);
+    DTensor<T> tenz(data, 2, 3, 4);
     EXPECT_NEAR(112, tenz.sumAbs(), PRECISION_HIGH); // from MATLAB
 }
 
@@ -408,7 +440,7 @@ TEST_F(TensorTest, tensorSumAbs) {
 TEMPLATE_WITH_TYPE_T
 void tensorMax() {
     std::vector<T> data = TENSOR_DATA_234AMB;
-    DTensor <T> tenz(data, 2, 3, 4);
+    DTensor<T> tenz(data, 2, 3, 4);
     T m = tenz.maxAbs();
     EXPECT_EQ(27, m);
 }
@@ -425,7 +457,7 @@ TEST_F(TensorTest, tensorMax) {
 TEMPLATE_WITH_TYPE_T
 void tensorMin() {
     std::vector<T> data = TENSOR_DATA_234AMB;
-    DTensor <T> tenz(data, 2, 3, 4);
+    DTensor<T> tenz(data, 2, 3, 4);
     T m = tenz.minAbs();
     EXPECT_EQ(0, m);
 }
@@ -465,7 +497,7 @@ void tensorRightGivens(T epsilon) {
     }
 }
 
-TEST_F(TensorTest, tensorRightGivens ) {
+TEST_F(TensorTest, tensorRightGivens) {
     tensorRightGivens<float>(PRECISION_LOW);
     tensorRightGivens<double>(PRECISION_HIGH);
 }
@@ -515,7 +547,7 @@ TEST_F(TensorTest, tensorLeftGivens) {
 TEMPLATE_WITH_TYPE_T
 void tensorBracketOperator() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor <T> tenz(data, 2, 3, 4);
+    DTensor<T> tenz(data, 2, 3, 4);
     EXPECT_EQ(1, tenz(0, 0, 0));
     EXPECT_EQ(3, tenz(0, 1, 2));
     EXPECT_EQ(8, tenz(1, 2, 3));
@@ -534,8 +566,8 @@ TEST_F(TensorTest, tensorBracketOperator) {
 TEMPLATE_WITH_TYPE_T
 void tensorAssignmentOperator() {
     std::vector<T> data = TENSOR_DATA_234A;
-    DTensor <T> tenz(data, 2, 3, 4);
-    DTensor <T> other;
+    DTensor<T> tenz(data, 2, 3, 4);
+    DTensor<T> other;
     other = tenz;
     EXPECT_EQ(tenz.raw(), other.raw());
     EXPECT_EQ(2, other.numRows());
@@ -558,7 +590,7 @@ void tensorTimesEqualsScalar() {
     std::vector<T> data = TENSOR_DATA_234A;
     std::vector<T> dataTimes3 = {3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12,
                                  24};
-    DTensor <T> tenz(data, 2, 3, 4);
+    DTensor<T> tenz(data, 2, 3, 4);
     tenz *= 3.0;
     std::vector<T> actual;
     tenz.download(actual);
@@ -579,7 +611,7 @@ void tensorTimesScalar() {
     std::vector<T> data = TENSOR_DATA_234A;
     std::vector<T> dataTimes3 = {3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12,
                                  24};
-    DTensor <T> tenz(data, 2, 3, 4);
+    DTensor<T> tenz(data, 2, 3, 4);
     auto tripleTensor = 3.0 * tenz;
     std::vector<T> actual;
     tripleTensor.download(actual);
@@ -599,8 +631,8 @@ TEMPLATE_WITH_TYPE_T
 void tensorPlusEqualsTensor() {
     std::vector<T> dataA = TENSOR_DATA_234A;
     std::vector<T> dataB = TENSOR_DATA_234B;
-    DTensor <T> A(dataA, 2, 3, 4);
-    DTensor <T> B(dataB, 2, 3, 4);
+    DTensor<T> A(dataA, 2, 3, 4);
+    DTensor<T> B(dataB, 2, 3, 4);
     A += B;
     std::vector<T> expected = TENSOR_DATA_234APB;
     std::vector<T> actual;
@@ -621,8 +653,8 @@ TEMPLATE_WITH_TYPE_T
 void tensorMinusEqualsTensor() {
     std::vector<T> dataA = TENSOR_DATA_234A;
     std::vector<T> dataB = TENSOR_DATA_234B;
-    DTensor <T> A(dataA, 2, 3, 4);
-    DTensor <T> B(dataB, 2, 3, 4);
+    DTensor<T> A(dataA, 2, 3, 4);
+    DTensor<T> B(dataB, 2, 3, 4);
     A -= B;
     std::vector<T> expected = TENSOR_DATA_234AMB;
     std::vector<T> actual;
@@ -643,9 +675,9 @@ TEMPLATE_WITH_TYPE_T
 void tensorPlusTensor() {
     std::vector<T> dataA = TENSOR_DATA_234A;
     std::vector<T> dataB = TENSOR_DATA_234B;
-    DTensor <T> A(dataA, 2, 3, 4);
-    DTensor <T> B(dataB, 2, 3, 4);
-    DTensor <T> C = A + B;
+    DTensor<T> A(dataA, 2, 3, 4);
+    DTensor<T> B(dataB, 2, 3, 4);
+    DTensor<T> C = A + B;
     std::vector<T> expected = TENSOR_DATA_234APB;
     std::vector<T> actual;
     C.download(actual);
@@ -665,9 +697,9 @@ TEMPLATE_WITH_TYPE_T
 void tensorMinusTensor() {
     std::vector<T> dataA = TENSOR_DATA_234A;
     std::vector<T> dataB = TENSOR_DATA_234B;
-    DTensor <T> A(dataA, 2, 3, 4);
-    DTensor <T> B(dataB, 2, 3, 4);
-    DTensor <T> C = A - B;
+    DTensor<T> A(dataA, 2, 3, 4);
+    DTensor<T> B(dataB, 2, 3, 4);
+    DTensor<T> C = A - B;
     std::vector<T> expected = TENSOR_DATA_234AMB;
     std::vector<T> actual;
     C.download(actual);
@@ -691,9 +723,9 @@ void tensorAddAB() {
     std::vector<T> bData = {6, 5, 4, 3, 2, 1,
                             7, 6, 5, 4, 3, 2,
                             1, 2, 1, 5, -6, 8};
-    DTensor <T> A(aData, 2, 3, 3);
-    DTensor <T> B(bData, 3, 2, 3);
-    DTensor <T> C(2, 2, 3, true);
+    DTensor<T> A(aData, 2, 3, 3);
+    DTensor<T> B(bData, 3, 2, 3);
+    DTensor<T> C(2, 2, 3, true);
     C.addAB(A, B);
     std::vector<T> expected = {41, 56, 14, 20, 158, 176, 77, 86, 60, 64, 111, 118};
     std::vector<T> actual;
@@ -718,14 +750,14 @@ void tensorGetRows() {
                             5., 6., 7.,
                             8., 9., 10.,
                             11., 12., 13};
-    DTensor <T> A(aData, 3, 3, 2);
-    DTensor <T> Ar0 = A.getRows(1, 1, 0);
+    DTensor<T> A(aData, 3, 3, 2);
+    DTensor<T> Ar0 = A.getRows(1, 1, 0);
     std::vector<T> expected0 = {25., 720., -1.};
     std::vector<T> actual0(3);
     Ar0.download(actual0);
     EXPECT_EQ(expected0, actual0);
 
-    DTensor <T> Ar1 = A.getRows(1, 2, 1);
+    DTensor<T> Ar1 = A.getRows(1, 2, 1);
     std::vector<T> expected1 = {6., 7., 9., 10., 12., 13.};
     std::vector<T> actual1(6);
     Ar1.download(actual1);
@@ -745,8 +777,8 @@ TEST_F(TensorTest, tensorGetRows) {
 TEMPLATE_WITH_TYPE_T
 void tensorTranspose() {
     std::vector<T> aData = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
-    DTensor <T> A(aData, 3, 2, 2);
-    DTensor <T> Atranspose = A.tr();
+    DTensor<T> A(aData, 3, 2, 2);
+    DTensor<T> Atranspose = A.tr();
     EXPECT_EQ(2, Atranspose.numRows());
     EXPECT_EQ(3, Atranspose.numCols());
     EXPECT_EQ(2, Atranspose.numMats());
@@ -786,12 +818,12 @@ void tensorLeastSquares1(T epsilon) {
                             6, 8,
                             -9, 20};
     std::vector<T> bData = {1, 1, -1, 2, 30, -80};
-    DTensor <T> A0(aData, 2, 2, 3);
-    DTensor <T> A(A0);
-    DTensor <T> B(bData, 2, 1, 3);
-    DTensor <T> sol(B);
+    DTensor<T> A0(aData, 2, 2, 3);
+    DTensor<T> A(A0);
+    DTensor<T> B(bData, 2, 1, 3);
+    DTensor<T> sol(B);
     A0.leastSquaresBatched(sol);
-    DTensor <T> C(2, 1, 3);
+    DTensor<T> C(2, 1, 3);
     C.addAB(A, sol);
     C -= B;
     T nrmErr = C.normF();
@@ -824,8 +856,8 @@ void singularValuesComputation(float epsilon) {
     std::vector<T> bData = {1, 6, 6, 6, 6, 6, 6, 6,
                             2, 7, 7, 7, 7, 7, 7, 7,
                             3, 8, 8, 8, 8, 8, 8, 8,};
-    DTensor <T> B(bData, 8, 3);
-    Svd <T> svd(B, true, false);
+    DTensor<T> B(bData, 8, 3);
+    Svd<T> svd(B, true, false);
     EXPECT_EQ(true, svd.factorise());
     auto S = svd.singularValues();
     EXPECT_NEAR(32.496241123753592, S(0), epsilon); // value from MATLAB
@@ -850,15 +882,15 @@ void singularValuesMemory(float epsilon) {
     std::vector<T> bData = {1, 6, 6, 6, 6, 6, 6, 6,
                             2, 7, 7, 7, 7, 7, 7, 7,
                             3, 8, 8, 8, 8, 8, 8, 8,};
-    DTensor <T> B(bData, 8, 3);
-    Svd <T> svd(B, true, false);
+    DTensor<T> B(bData, 8, 3);
+    Svd<T> svd(B, true, false);
     EXPECT_EQ(true, svd.factorise());
-    DTensor <T> const &v1 = svd.rightSingularVectors();
-    DTensor <T> const &v2 = svd.rightSingularVectors();
+    DTensor<T> const &v1 = svd.rightSingularVectors();
+    DTensor<T> const &v2 = svd.rightSingularVectors();
     EXPECT_EQ(&v1, &v2);
     EXPECT_EQ(v1.raw(), v2.raw());
-    DTensor <T> const &s1 = svd.singularValues();
-    DTensor <T> const &s2 = svd.singularValues();
+    DTensor<T> const &s1 = svd.singularValues();
+    DTensor<T> const &s2 = svd.singularValues();
     EXPECT_EQ(&s1, &s2);
     EXPECT_EQ(s1.raw(), s2.raw());
     auto u1 = svd.leftSingularVectors().value();
@@ -879,11 +911,11 @@ TEST_F(SvdTest, singularValuesMemory) {
 TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void singularValuesMultipleMatrices(float epsilon) {
     std::vector<T> aData = {1, 2, 3, 4, 5, 6, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1};
-    DTensor <T> A(aData, 3, 2, 3);
-    Svd <T> svd(A, true); // do compute U (A will be destroyed)
+    DTensor<T> A(aData, 3, 2, 3);
+    Svd<T> svd(A, true); // do compute U (A will be destroyed)
     svd.factorise();
-    DTensor <T> const &S = svd.singularValues();
-    DTensor <T> const &V = svd.rightSingularVectors();
+    DTensor<T> const &S = svd.singularValues();
+    DTensor<T> const &V = svd.rightSingularVectors();
     auto Uopt = svd.leftSingularVectors();
     auto U = Uopt.value();
     std::vector<T> expected_v = {-0.386317703118612, -0.922365780077058, -0.922365780077058, 0.386317703118612,
@@ -928,9 +960,9 @@ void singularValuesRankMultipleMatrices(float epsilon) {
     std::vector<T> aData = {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 0,
                             1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12,
                             1, 2, 3, 4, 2, 4, 6, 8, 3, 6, 9, 12};
-    DTensor <T> A(aData, 4, 3, 3);
+    DTensor<T> A(aData, 4, 3, 3);
 
-    Svd <T> svd(A);
+    Svd<T> svd(A);
     svd.factorise();
     auto rank = svd.rank(epsilon);
     EXPECT_EQ(3, rank(0, 0, 0));
@@ -963,8 +995,8 @@ void choleskyFactorisation(T epsilon) {
     std::vector<T> aData = {10.0, 2.0, 3.0,
                             2.0, 20.0, -1.0,
                             3.0, -1.0, 30.0};
-    DTensor <T> A(aData, 3, 3, 1);
-    CholeskyFactoriser <T> chol(A);
+    DTensor<T> A(aData, 3, 3, 1);
+    CholeskyFactoriser<T> chol(A);
     chol.factorise();
     EXPECT_NEAR(3.162277660168380, A(0, 0), epsilon);
     EXPECT_NEAR(-0.361403161162101, A(2, 1), epsilon);
@@ -985,14 +1017,14 @@ void choleskyFactorisationSolution(T epsilon) {
     std::vector<T> aData = {10.0, 2.0, 3.0,
                             2.0, 20.0, -1.0,
                             3.0, -1.0, 30.0};
-    DTensor <T> A(aData, 3, 3, 1);
-    DTensor <T> L(A); // L = A
-    CholeskyFactoriser <T> chol(L);
+    DTensor<T> A(aData, 3, 3, 1);
+    DTensor<T> L(A); // L = A
+    CholeskyFactoriser<T> chol(L);
     chol.factorise();
 
     std::vector<T> bData = {-1., -3., 5.};
-    DTensor <T> rhs(bData, 3, 1, 1);
-    DTensor <T> sol(rhs);
+    DTensor<T> rhs(bData, 3, 1, 1);
+    DTensor<T> sol(rhs);
     chol.solve(sol);
 
     std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
@@ -1000,7 +1032,7 @@ void choleskyFactorisationSolution(T epsilon) {
     sol.download(actual);
     for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon);
 
-    DTensor <T> error = A * sol;
+    DTensor<T> error = A * sol;
     error -= rhs;
     EXPECT_TRUE(error.normF() < epsilon);
 
@@ -1020,12 +1052,12 @@ void choleskyBatchFactorisation(T epsilon) {
     std::vector<T> aData = {10.0, 2.0, 3.0,
                             2.0, 20.0, -1.0,
                             3.0, -1.0, 30.0};
-    DTensor <T> A(3, 3, 2);
-    DTensor <T> A0(A, 2, 0, 0);
-    DTensor <T> A1(A, 2, 1, 1);
+    DTensor<T> A(3, 3, 2);
+    DTensor<T> A0(A, 2, 0, 0);
+    DTensor<T> A1(A, 2, 1, 1);
     A0.upload(aData);
     A1.upload(aData);
-    CholeskyBatchFactoriser <T> chol(A);
+    CholeskyBatchFactoriser<T> chol(A);
     chol.factorise();
     // 0
     EXPECT_NEAR(3.162277660168380, A(0, 0, 0), epsilon);
@@ -1051,28 +1083,28 @@ void choleskyBatchFactorSolve(T epsilon) {
     std::vector<T> aData = {10.0, 2.0, 3.0,
                             2.0, 20.0, -1.0,
                             3.0, -1.0, 30.0};
-    DTensor <T> A(3, 3, 2);
-    DTensor <T> A0(A, 2, 0, 0);
-    DTensor <T> A1(A, 2, 1, 1);
+    DTensor<T> A(3, 3, 2);
+    DTensor<T> A0(A, 2, 0, 0);
+    DTensor<T> A1(A, 2, 1, 1);
     A0.upload(aData);
     A1.upload(aData);
-    DTensor <T> L(A); // L = A
-    CholeskyBatchFactoriser <T> chol(L);
+    DTensor<T> L(A); // L = A
+    CholeskyBatchFactoriser<T> chol(L);
     chol.factorise();
     std::vector<T> bData = {-1., -3., 5.};
-    DTensor <T> rhs(3, 1, 2);
-    DTensor <T> rhs0(rhs, 2, 0, 0);
-    DTensor <T> rhs1(rhs, 2, 1, 1);
+    DTensor<T> rhs(3, 1, 2);
+    DTensor<T> rhs0(rhs, 2, 0, 0);
+    DTensor<T> rhs1(rhs, 2, 1, 1);
     rhs0.upload(bData);
     rhs1.upload(bData);
-    DTensor <T> sol(rhs);
+    DTensor<T> sol(rhs);
     chol.solve(sol);
     std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
     std::vector<T> actual(6);
     sol.download(actual);
     for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon);  // 0
     for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon);  // 1
-    DTensor <T> error = A * sol;
+    DTensor<T> error = A * sol;
     error -= rhs;
     EXPECT_TRUE(error.normF() < epsilon);
 }
@@ -1091,35 +1123,35 @@ void choleskyBatchSolve(T epsilon) {
     std::vector<T> aData = {10.0, 2.0, 3.0,
                             2.0, 20.0, -1.0,
                             3.0, -1.0, 30.0};
-    DTensor <T> A(3, 3, 2);
-    DTensor <T> A0(A, 2, 0, 0);
-    DTensor <T> A1(A, 2, 1, 1);
+    DTensor<T> A(3, 3, 2);
+    DTensor<T> A0(A, 2, 0, 0);
+    DTensor<T> A1(A, 2, 1, 1);
     A0.upload(aData);
     A1.upload(aData);
     std::vector<T> lowData = {3.162277660168380, 0, 0,
                               0.632455532033676, 4.427188724235731, 0,
                               0.948683298050514, -0.361403161162101, 5.382321781081287};  // from matlab
-    DTensor <T> low(3, 3, 2);
-    DTensor <T> low0(low, 2, 0, 0);
-    DTensor <T> low1(low, 2, 1, 1);
+    DTensor<T> low(3, 3, 2);
+    DTensor<T> low0(low, 2, 0, 0);
+    DTensor<T> low1(low, 2, 1, 1);
     low0.upload(lowData, rowMajor);
     low1.upload(lowData, rowMajor);
-    DTensor <T> L(low);
-    CholeskyBatchFactoriser <T> chol(L, true);
+    DTensor<T> L(low);
+    CholeskyBatchFactoriser<T> chol(L, true);
     std::vector<T> bData = {-1., -3., 5.};
-    DTensor <T> rhs(3, 1, 2);
-    DTensor <T> rhs0(rhs, 2, 0, 0);
-    DTensor <T> rhs1(rhs, 2, 1, 1);
+    DTensor<T> rhs(3, 1, 2);
+    DTensor<T> rhs0(rhs, 2, 0, 0);
+    DTensor<T> rhs1(rhs, 2, 1, 1);
     rhs0.upload(bData);
     rhs1.upload(bData);
-    DTensor <T> sol(rhs);
+    DTensor<T> sol(rhs);
     chol.solve(sol);
     std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
     std::vector<T> actual(6);
     sol.download(actual);
     for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon);  // 0
     for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon);  // 1
-    DTensor <T> error = A * sol;
+    DTensor<T> error = A * sol;
     error -= rhs;
     EXPECT_TRUE(error.normF() < epsilon);
 }
@@ -1149,15 +1181,15 @@ TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void qrFactorisation(T epsilon) {
     size_t nR = 4;
     size_t nC = 3;
-    DTensor <T> temp(nR, nC);
-    DTensor <T> A = DTensor<T>::createRandomTensor(nR, nC, 1, -100, 100);
-    QRFactoriser <T> qr(temp);
+    DTensor<T> temp(nR, nC);
+    DTensor<T> A = DTensor<T>::createRandomTensor(nR, nC, 1, -100, 100);
+    QRFactoriser<T> qr(temp);
     A.deviceCopyTo(temp);
     int status = qr.factorise();
     EXPECT_EQ(status, 0);
-    DTensor <T> Q(nR, nC);
-    DTensor <T> R(nC, nC, 1, true);
-    DTensor <T> QR(nR, nC);
+    DTensor<T> Q(nR, nC);
+    DTensor<T> R(nC, nC, 1, true);
+    DTensor<T> QR(nR, nC);
     status = qr.getQR(Q, R);
     EXPECT_EQ(status, 0);
     QR.addAB(Q, R);
@@ -1180,15 +1212,15 @@ TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void qrFactorisationTall(T epsilon) {
     size_t nR = 20;
     size_t nC = 3;
-    DTensor <T> temp(nR, nC);
-    DTensor <T> A = DTensor<T>::createRandomTensor(nR, nC, 1, -100, 100);
-    QRFactoriser <T> qr(temp);
+    DTensor<T> temp(nR, nC);
+    DTensor<T> A = DTensor<T>::createRandomTensor(nR, nC, 1, -100, 100);
+    QRFactoriser<T> qr(temp);
     A.deviceCopyTo(temp);
     int status = qr.factorise();
     EXPECT_EQ(status, 0);
-    DTensor <T> Q(nR, nC);
-    DTensor <T> R(nC, nC, 1, true);
-    DTensor <T> QR(nR, nC);
+    DTensor<T> Q(nR, nC);
+    DTensor<T> R(nC, nC, 1, true);
+    DTensor<T> QR(nR, nC);
     status = qr.getQR(Q, R);
     EXPECT_EQ(status, 0);
     QR.addAB(Q, R);
@@ -1210,7 +1242,7 @@ TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void qrLeastSquares(T epsilon) {
     size_t nR = 4;
     size_t nC = 3;
-    DTensor <T> temp(nR, nC);
+    DTensor<T> temp(nR, nC);
     std::vector<T> vecA = {85.5638, -59.4001, -80.1992,
                            99.9464, 5.51393, 5.17935,
                            6.87488, -26.7536, 36.0914,
@@ -1219,12 +1251,12 @@ void qrLeastSquares(T epsilon) {
                            -48.5744,
                            43.4229,
                            -56.5081};  // Random vector
-    DTensor <T> A(vecA, nR, nC, 1, rowMajor);
-    DTensor <T> b(vecB, nR);
-    DTensor <T> xFull(nR);
-    DTensor <T> x(xFull, 0, 0, nC - 1);
-    DTensor <T> Ax(nR);
-    QRFactoriser <T> qr(temp);
+    DTensor<T> A(vecA, nR, nC, 1, rowMajor);
+    DTensor<T> b(vecB, nR);
+    DTensor<T> xFull(nR);
+    DTensor<T> x(xFull, 0, 0, nC - 1);
+    DTensor<T> Ax(nR);
+    QRFactoriser<T> qr(temp);
     A.deviceCopyTo(temp);
     int status = qr.factorise();
     EXPECT_EQ(status, 0);
@@ -1265,19 +1297,19 @@ void computeNullspaceTensor(T epsilon) {
                             1, 2, 3, 4, 2, 4, 6, 8, 3, 6, 9, 12,
                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-    DTensor <T> A(aData, 3, 4, 5);
-    Nullspace <T> ns(A);
-    DTensor <T> nA = ns.nullspace();
+    DTensor<T> A(aData, 3, 4, 5);
+    Nullspace<T> ns(A);
+    DTensor<T> nA = ns.nullspace();
     size_t nMats = nA.numMats();
     EXPECT_EQ(nMats, 5);
     for (size_t i = 0; i < nMats; i++) {
-        DTensor <T> nAi(nA, 2, i, i);
-        DTensor <T> Ai(A, 2, i, i);
-        DTensor <T> mustBeZero = Ai * nAi;
+        DTensor<T> nAi(nA, 2, i, i);
+        DTensor<T> Ai(A, 2, i, i);
+        DTensor<T> mustBeZero = Ai * nAi;
         EXPECT_LT(mustBeZero.normF(), epsilon);
 
-        DTensor <T> nAiTr = nAi.tr();
-        DTensor <T> mustBeEye = nAiTr * nAi;
+        DTensor<T> nAiTr = nAi.tr();
+        DTensor<T> mustBeEye = nAiTr * nAi;
         EXPECT_NEAR(1, mustBeEye(0, 0, 0), epsilon);
         for (size_t ir = 0; ir < mustBeEye.numRows(); ir++) {
             for (size_t ic = 0; ic < mustBeEye.numCols(); ic++) {
@@ -1306,9 +1338,9 @@ void computeNullspaceTrivial(T epsilon) {
                         1, 1, 1,
                         5, 6, 7,
                         9, 0, 3};
-    DTensor <T> A(data, 3, 3, 2, rowMajor);
-    Nullspace <T> nullA(A);
-    DTensor <T> N = nullA.nullspace();
+    DTensor<T> A(data, 3, 3, 2, rowMajor);
+    Nullspace<T> nullA(A);
+    DTensor<T> N = nullA.nullspace();
     EXPECT_EQ(N.normF(), 0);
 }
 
@@ -1329,28 +1361,28 @@ void projectOnNullspaceTensor(T epsilon) {
     std::vector<T> mat{1, -2, 3, 4, -1, -1, -1,
                        1, 2, -3, 4, -1, -1, -1,
                        -1, 3, 5, -7, -1, -1, -1};
-    DTensor <T> A(m, n, 1);
+    DTensor<T> A(m, n, 1);
     A.upload(mat, rowMajor);
-    Nullspace <T> ns = Nullspace(A);
-    DTensor <T> N = ns.nullspace();
+    Nullspace<T> ns = Nullspace(A);
+    DTensor<T> N = ns.nullspace();
 
     // online
     std::vector<T> vec{1, 2, 3, 4, 5, 6, 7};
-    DTensor <T> x(vec, n);
-    DTensor <T> proj(x);
+    DTensor<T> x(vec, n);
+    DTensor<T> proj(x);
     ns.project(proj);
 
     // Testing that proj is indeed in ker A
-    DTensor <T> error(m, 1, 1, true);
+    DTensor<T> error(m, 1, 1, true);
     error.addAB(A, proj);
     EXPECT_TRUE(error.normF() < epsilon);
 
     // Orthogonality test (other - p) † (p - x)
     std::vector<T> h_other{1, -2, 5, 4, 0, 0, 0};
-    DTensor <T> other(h_other, n);
-    DTensor <T> y = N * other;
-    DTensor <T> delta1 = y - proj;
-    DTensor <T> delta2 = proj - x;
+    DTensor<T> other(h_other, n);
+    DTensor<T> y = N * other;
+    DTensor<T> delta1 = y - proj;
+    DTensor<T> delta2 = proj - x;
     EXPECT_LT(delta1.dotF(delta2), epsilon);
 }
 

From 4198ecf5589996055a07a2f7dec037b4c4e0349d Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 23:16:23 +0000
Subject: [PATCH 16/17] Better memory management in
 DTensor<T>::allocateOnDevice

Free the already-allocated m_d_data if allocating m_d_ptrMatrices fails
Fix code formatting in testTensor
allocateOnDevice now returns void instead of bool
---
 include/tensor.cuh | 16 ++++++++--------
 test/testTensor.cu |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index d62803c..0f8b5ee 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -202,9 +202,8 @@ private:
      * Allocate `size` number of `T` data on the device.
      * @param size number of data elements to allocate
      * @param zero sets allocated data to `0`
-     * @return
      */
-    bool allocateOnDevice(size_t size, bool zero = false);
+    void allocateOnDevice(size_t size, bool zero = false);
 
     /**
      * Create column-major `std::vector` from a row-major one.
@@ -837,23 +836,24 @@ void DTensor<T>::applyLeftGivensRotation(size_t i, size_t j, const T *c, const T
 }
 
 template<typename T>
-inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
-    if (size <= 0) return false;
+inline void DTensor<T>::allocateOnDevice(size_t size, bool zero) {
+    cudaError_t cudaStatus;
+    if (size <= 0) return;
     destroy();
     m_doDestroyData = true;
     size_t buffer_size = size * sizeof(T);
-    bool cudaStatus = cudaMalloc(&m_d_data, buffer_size);
-    if (cudaStatus != cudaSuccess) return false;
+    gpuErrChk(cudaMalloc(&m_d_data, buffer_size));
     if (zero) gpuErrChk(cudaMemset(m_d_data, 0, buffer_size)); // set to zero all elements
 
     if (numMats() > 1) {
         m_doDestroyPtrMatrices = true;
         cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T *));
+        if (cudaStatus != cudaSuccess) {
+            gpuErrChk(cudaFree(m_d_data));
+        }
     } else {
         m_doDestroyPtrMatrices = false;
     }
-
-    return (cudaStatus != cudaSuccess);
 }
 
 template<typename T>
diff --git a/test/testTensor.cu b/test/testTensor.cu
index 4c1c147..d68ca02 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -123,7 +123,7 @@ TEMPLATE_WITH_TYPE_T
 void tensorMoveConstructor() {
     DTensor<T> zero(2, 3, 4, true);
     DTensor<T> x(std::move(zero));
-    DTensor<T> y(DTensor < T > {100, 10, 1000});
+    DTensor<T> y(DTensor<T> {100, 10, 1000});
 }
 
 TEST_F(TensorTest, tensorMoveConstructor) {

From 2c326fc3926af7f2a2b44e5ebdcd2b93d00e250e Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis <p.sopasakis@gmail.com>
Date: Thu, 7 Nov 2024 23:19:18 +0000
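Subject: [PATCH 17/17] DTensor<T>::allocateOnDevice: memento mori

If allocating m_d_ptrMatrices fails, free m_d_data and then pass the
failed cudaMalloc status to gpuErrChk so the error is not dropped.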
---
 include/tensor.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 0f8b5ee..40e05f0 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -849,7 +849,8 @@ inline void DTensor<T>::allocateOnDevice(size_t size, bool zero) {
         m_doDestroyPtrMatrices = true;
         cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T *));
         if (cudaStatus != cudaSuccess) {
-            gpuErrChk(cudaFree(m_d_data));
+            gpuErrChk(cudaFree(m_d_data)); // ... free previously allocated memory
+            gpuErrChk(cudaStatus); // ... and memento mori
         }
     } else {
         m_doDestroyPtrMatrices = false;