diff --git a/.gitignore b/.gitignore
index 7fe8987..032fffb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# Custom
+python/*.bt
+
 ## Ignore Visual Studio temporary files, build results, and
 ## files generated by popular Visual Studio add-ons.
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 32569ba..60d3173 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+<!-- ---------------------
+      v1.8.0
+     --------------------- -->
+## v1.8.0 - 21-03-2025
+
+### Added
+
+- Support for counting total allocated memory via `Session::getInstance().totalAllocatedBytes()` 
+
+
 <!-- ---------------------
       v1.7.2
      --------------------- -->
diff --git a/README.md b/README.md
index 5e1e99e..b6e560e 100644
--- a/README.md
+++ b/README.md
@@ -395,4 +395,14 @@ N.project(vectors);
 std::cout << vectors << "\n";
 ```
 
+## 5. Other
+
+We can get the total allocated bytes (on the GPU) with
+```c++
+size_t allocatedBytes = 
+    Session::getInstance().totalAllocatedBytes();
+```
+
+
+
 ## Happy number crunching!
diff --git a/include/tensor.cuh b/include/tensor.cuh
index bca5706..5089ed7 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -1,3 +1,6 @@
+#ifndef TENSOR_CUH
+#define TENSOR_CUH
+
 #include <random>
 #include <algorithm>
 #include <iterator>
@@ -13,8 +16,6 @@
 #include <cassert>
 #include <fstream>
 
-#ifndef TENSOR_CUH
-#define TENSOR_CUH
 
 /**
  * Define defaults
@@ -43,7 +44,8 @@ static std::random_device RND_DEVICE;
  * @param hi
  * @return
  */
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 std::vector<T> generateRealRandomVector(size_t n, T low, T hi) {
     std::mt19937_64 mersenne_engine(RND_DEVICE());
     std::uniform_real_distribution<T> dist(low, hi);
@@ -84,24 +86,25 @@ constexpr size_t numBlocks(size_t n, size_t threads_per_block = THREADS_PER_BLOC
  */
 #define gpuErrChk(status) { gpuAssert((status), __FILE__, __LINE__); } while(false)
 
-TEMPLATE_WITH_TYPE_T inline void gpuAssert(T code, const char *file, int line, bool abort = true) {
+TEMPLATE_WITH_TYPE_T
+inline void gpuAssert(T code, const char *file, int line, bool abort = true) {
     if constexpr (std::is_same_v<T, cudaError_t>) {
         if (code != cudaSuccess) {
             std::cerr << "cuda error. String: " << cudaGetErrorString(code)
-                      << ", file: " << file << ", line: " << line << "\n";
+                    << ", file: " << file << ", line: " << line << "\n";
             if (abort) exit(code);
         }
     } else if constexpr (std::is_same_v<T, cublasStatus_t>) {
         if (code != CUBLAS_STATUS_SUCCESS) {
             std::cerr << "cublas error. Name: " << cublasGetStatusName(code)
-                      << ", string: " << cublasGetStatusString(code)
-                      << ", file: " << file << ", line: " << line << "\n";
+                    << ", string: " << cublasGetStatusString(code)
+                    << ", file: " << file << ", line: " << line << "\n";
             if (abort) exit(code);
         }
     } else if constexpr (std::is_same_v<T, cusolverStatus_t>) {
         if (code != CUSOLVER_STATUS_SUCCESS) {
             std::cerr << "cusolver error. Status: " << code
-                      << ", file: " << file << ", line: " << line << "\n";
+                    << ", file: " << file << ", line: " << line << "\n";
             if (abort) exit(code);
         }
     } else {
@@ -124,7 +127,6 @@ TEMPLATE_WITH_TYPE_T inline void gpuAssert(T code, const char *file, int line, b
  */
 class Session {
 public:
-
     static Session &getInstance() {
         static Session instance;
         return instance;
@@ -143,7 +145,7 @@ private:
 
     cublasHandle_t m_cublasHandle;
     cusolverDnHandle_t m_cusolverHandle;
-
+    size_t bytesAllocated = 0;
 
 public:
     Session(Session const &) = delete;
@@ -153,6 +155,30 @@ public:
     cublasHandle_t &cuBlasHandle() { return m_cublasHandle; }
 
     cusolverDnHandle_t &cuSolverHandle() { return m_cusolverHandle; }
+
+    /**
+     * Preferred method for CUDA memory allocation; it allocated memory on the device
+     * and counts the allocated bytes (you can then call #totalAllocatedBytes()).
+     * In essence, this is a wrapper for cudaMalloc.
+     *
+     * @param d pointer to memory address to be allocated
+     * @param s size to be allocated
+     * @return CUDA error
+     */
+    cudaError_t cudaAllocate(void** d, size_t s) {
+        cudaError_t err = cudaMalloc(d, s);
+        if (err == cudaSuccess) {
+            bytesAllocated += s;
+        }
+        return err;
+    }
+
+    /**
+     * @return Total allocated bytes
+     */
+    size_t totalAllocatedBytes() const { return bytesAllocated; }
+
+    void incrementAllocatedBytes(size_t s) { bytesAllocated += s; }
 };
 
 
@@ -164,8 +190,8 @@ public:
  * Storage mode for the data of a matrix
  */
 enum StorageMode {
-    columnMajor,  ///< column major storage (default)
-    rowMajor,  ///< row major storage
+    columnMajor, ///< column major storage (default)
+    rowMajor, ///< row major storage
     defaultMajor = columnMajor
 };
 
@@ -179,24 +205,25 @@ enum StorageMode {
  */
 TEMPLATE_WITH_TYPE_T
 class DTensor {
-
 private:
-    T *m_d_data = nullptr;  ///< Pointer to device data
+    T *m_d_data = nullptr; ///< Pointer to device data
     T **m_d_ptrMatrices = nullptr; ///< Pointer to matrices in tensor
-    size_t m_numRows = 0;  ///< Number of rows
-    size_t m_numCols = 0;  ///< Number of columns
-    size_t m_numMats = 0;  ///< Number of matrices
-    bool m_doDestroyData = false;  ///< Whether to destroy memory
-    bool m_doDestroyPtrMatrices = false;  ///< Whether to destroy memory
+    size_t m_numRows = 0; ///< Number of rows
+    size_t m_numCols = 0; ///< Number of columns
+    size_t m_numMats = 0; ///< Number of matrices
+    bool m_doDestroyData = false; ///< Whether to destroy memory
+    bool m_doDestroyPtrMatrices = false; ///< Whether to destroy memory
 
     void destroy() {
         if (m_doDestroyData) {
             if (m_d_data) gpuErrChk(cudaFree(m_d_data));
             m_d_data = nullptr;
+            Session::getInstance().incrementAllocatedBytes(-m_numRows * m_numCols * m_numMats * sizeof(T));
         }
         if (m_doDestroyPtrMatrices) {
             if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
             m_d_ptrMatrices = nullptr;
+            Session::getInstance().incrementAllocatedBytes(-m_numMats * sizeof(T *));
         }
     }
 
@@ -239,7 +266,6 @@ private:
     void initialisePointersToMatricesData();
 
 public:
-
     /**
      * Create a tensor with random elements
      * @param numRows number of rows
@@ -559,7 +585,6 @@ public:
     friend std::ostream &operator<<(std::ostream &out, const DTensor<T> &data) {
         return data.print(out);
     }
-
 }; /* END OF DTENSOR */
 
 template<typename T>
@@ -611,9 +636,12 @@ data_t<T> vectorFromTextFile(std::string path_to_file) {
     if (!file.is_open()) { throw std::invalid_argument("[vectorFromTextFile] the file does not exist"); }
 
     std::string line;
-    getline(file, line); dataStruct.numRows = atoi(line.c_str());
-    getline(file, line); dataStruct.numCols = atoi(line.c_str());
-    getline(file, line); dataStruct.numMats = atoi(line.c_str());
+    getline(file, line);
+    dataStruct.numRows = atoi(line.c_str());
+    getline(file, line);
+    dataStruct.numCols = atoi(line.c_str());
+    getline(file, line);
+    dataStruct.numMats = atoi(line.c_str());
 
     size_t numElements = dataStruct.numRows * dataStruct.numCols * dataStruct.numMats;
     std::vector<T> vecDataFromFile(numElements);
@@ -638,7 +666,7 @@ data_t<T> vectorFromTextFile(std::string path_to_file) {
             vecDataFromFile[i] = std::stoull(line.c_str());
         } else if constexpr (std::is_same_v<T, size_t>) {
             sscanf(line.c_str(), "%zu", &vecDataFromFile[i]);
-        }  else {
+        } else {
             throw std::invalid_argument("data type not supported");
         }
 
@@ -675,10 +703,10 @@ template<typename T>
 DTensor<T> DTensor<T>::parseFromFile(std::string path_to_file,
                                      StorageMode mode) {
     // Figure out file extension
-    size_t pathToFileLength = path_to_file.length() ;
-    std::string fileNameExtension = path_to_file.substr(pathToFileLength-3);
+    size_t pathToFileLength = path_to_file.length();
+    std::string fileNameExtension = path_to_file.substr(pathToFileLength - 3);
     typedef data_t<T> (*PARSER)(std::string);
-    PARSER parser = (fileNameExtension == ".bt")  ? vectorFromBinaryFile<T> : vectorFromTextFile<T>;
+    PARSER parser = (fileNameExtension == ".bt") ? vectorFromBinaryFile<T> : vectorFromTextFile<T>;
     auto parsedData = parser(path_to_file);
     DTensor<T> tensorFromData(parsedData.data, parsedData.numRows, parsedData.numCols, parsedData.numMats, mode);
     return tensorFromData;
@@ -690,10 +718,10 @@ void DTensor<T>::saveToFile(std::string pathToFile) {
     download(myData);
 
     // Figure out file extension
-    size_t pathToFileLength = pathToFile.length() ;
-    std::string fileNameExtension = pathToFile.substr(pathToFileLength-3);
+    size_t pathToFileLength = pathToFile.length();
+    std::string fileNameExtension = pathToFile.substr(pathToFileLength - 3);
     // If the extension is .bt...
-     if (fileNameExtension == ".bt") {
+    if (fileNameExtension == ".bt") {
         uint64_t nr = (uint64_t) numRows(),
                 nc = (uint64_t) numCols(),
                 nm = (uint64_t) numMats();
@@ -705,13 +733,13 @@ void DTensor<T>::saveToFile(std::string pathToFile) {
         for (const T &el: myData) outFile.write(reinterpret_cast<const char *>(&el), sizeof(T));
         outFile.close();
     } else {
-         std::ofstream file(pathToFile);
-         file << numRows() << std::endl << numCols() << std::endl << numMats() << std::endl;
-         if constexpr (std::is_floating_point<T>::value) {
-             file << std::setprecision(std::numeric_limits<T>::max_digits10);
-         }
-         for (const T &el: myData) file << el << std::endl;
-     }
+        std::ofstream file(pathToFile);
+        file << numRows() << std::endl << numCols() << std::endl << numMats() << std::endl;
+        if constexpr (std::is_floating_point<T>::value) {
+            file << std::setprecision(std::numeric_limits<T>::max_digits10);
+        }
+        for (const T &el: myData) file << el << std::endl;
+    }
 }
 
 
@@ -739,7 +767,7 @@ void DTensor<T>::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats
         }
         /* Reallocate memory for m_d_ptrMatrices, if necessary */
         if (newNumMats > 1) {
-            gpuErrChk(cudaMalloc(&m_d_ptrMatrices, newNumMats * sizeof(T *)));
+            gpuErrChk(Session::getInstance().cudaAllocate((void**)&m_d_ptrMatrices, newNumMats * sizeof(T *)));
             m_doDestroyPtrMatrices = true;
         }
     }
@@ -781,7 +809,7 @@ DTensor<T>::DTensor(const DTensor<T> &other) {
 
     allocateOnDevice(m_numRows * m_numCols * m_numMats);
     gpuErrChk(cudaMemcpy(m_d_data, other.raw(), m_numRows * m_numCols * m_numMats * sizeof(T),
-                         cudaMemcpyDeviceToDevice));
+        cudaMemcpyDeviceToDevice));
     /* Initialise m_d_ptrMatrices */
     initialisePointersToMatricesData();
 }
@@ -860,9 +888,9 @@ inline double DTensor<double>::dotF(const DTensor<double> &other) {
     size_t n = numEl();
     double result;
     gpuErrChk(cublasDdot(Session::getInstance().cuBlasHandle(), n,
-                         raw(), 1,
-                         other.raw(), 1,
-                         &result));
+        raw(), 1,
+        other.raw(), 1,
+        &result));
     return result;
 }
 
@@ -873,9 +901,9 @@ inline float DTensor<float>::dotF(const DTensor<float> &other) {
     size_t n = numEl();
     float result;
     gpuErrChk(cublasSdot(Session::getInstance().cuBlasHandle(), n,
-                         raw(), 1,
-                         other.raw(), 1,
-                         &result));
+        raw(), 1,
+        other.raw(), 1,
+        &result));
     return result;
 }
 
@@ -883,7 +911,7 @@ template<>
 inline double DTensor<double>::normF() const {
     double the_norm;
     gpuErrChk(cublasDnrm2(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
-                          &the_norm));
+        &the_norm));
     return the_norm;
 }
 
@@ -891,7 +919,7 @@ template<>
 inline float DTensor<float>::normF() const {
     float the_norm;
     gpuErrChk(cublasSnrm2(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
-                          &the_norm));
+        &the_norm));
     return the_norm;
 }
 
@@ -899,7 +927,7 @@ template<>
 inline float DTensor<float>::sumAbs() const {
     float sumAbsAllElements;
     gpuErrChk(cublasSasum(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
-                          &sumAbsAllElements));
+        &sumAbsAllElements));
     return sumAbsAllElements;
 }
 
@@ -907,7 +935,7 @@ template<>
 inline double DTensor<double>::sumAbs() const {
     double sumAbsAllElements;
     gpuErrChk(cublasDasum(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
-                          &sumAbsAllElements));
+        &sumAbsAllElements));
     return sumAbsAllElements;
 }
 
@@ -916,7 +944,7 @@ inline float DTensor<float>::maxAbs() const {
     int idx;
     float hostDst;
     gpuErrChk(cublasIsamax(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
-                           &idx));
+        &idx));
     gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(float), cudaMemcpyDeviceToHost));
     return std::signbit(hostDst) ? -hostDst : hostDst;
 }
@@ -926,7 +954,7 @@ inline double DTensor<double>::maxAbs() const {
     int idx;
     double hostDst;
     gpuErrChk(cublasIdamax(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
-                           &idx));
+        &idx));
     gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(double), cudaMemcpyDeviceToHost));
     return std::signbit(hostDst) ? -hostDst : hostDst;
 }
@@ -936,7 +964,7 @@ inline float DTensor<float>::minAbs() const {
     int idx;
     float hostDst;
     gpuErrChk(cublasIsamin(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
-                           &idx));
+        &idx));
     gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(float), cudaMemcpyDeviceToHost));
     return std::signbit(hostDst) ? -hostDst : hostDst;
 }
@@ -946,7 +974,7 @@ inline double DTensor<double>::minAbs() const {
     int idx;
     double hostDst;
     gpuErrChk(cublasIdamin(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1,
-                           &idx));
+        &idx));
     gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(double), cudaMemcpyDeviceToHost));
     return std::signbit(hostDst) ? -hostDst : hostDst;
 }
@@ -958,10 +986,10 @@ void DTensor<T>::applyRightGivensRotation(size_t i, size_t j, const T *c, const
     T *col_j = m_d_data + j * m_numRows;
     if constexpr (std::is_same_v<T, double>) {
         gpuErrChk(cublasDrot(Session::getInstance().cuBlasHandle(), m_numRows,
-                             col_i, 1, col_j, 1, c, minus_s));
+            col_i, 1, col_j, 1, c, minus_s));
     } else if constexpr (std::is_same_v<T, float>) {
         gpuErrChk(cublasSrot(Session::getInstance().cuBlasHandle(), m_numRows,
-                             col_i, 1, col_j, 1, c, minus_s));
+            col_i, 1, col_j, 1, c, minus_s));
     }
 }
 
@@ -970,14 +998,14 @@ void DTensor<T>::applyLeftGivensRotation(size_t i, size_t j, const T *c, const T
     if (m_numMats > 1) throw std::invalid_argument("[applyLeftGivensRotation] tensors (nMat>1) not supported");
     if constexpr (std::is_same_v<T, double>) {
         gpuErrChk(cublasDrot(Session::getInstance().cuBlasHandle(), m_numCols,
-                             m_d_data + i, m_numRows,
-                             m_d_data + j, m_numRows,
-                             c, minus_s));
+            m_d_data + i, m_numRows,
+            m_d_data + j, m_numRows,
+            c, minus_s));
     } else if constexpr (std::is_same_v<T, float>) {
         gpuErrChk(cublasSrot(Session::getInstance().cuBlasHandle(), m_numCols,
-                             m_d_data + i, m_numRows,
-                             m_d_data + j, m_numRows,
-                             c, minus_s));
+            m_d_data + i, m_numRows,
+            m_d_data + j, m_numRows,
+            c, minus_s));
     } else {
         throw std::invalid_argument("[applyLeftGivensRotation] Unsupported type T");
     }
@@ -990,12 +1018,12 @@ inline void DTensor<T>::allocateOnDevice(size_t size, bool zero) {
     destroy();
     m_doDestroyData = true;
     size_t buffer_size = size * sizeof(T);
-    gpuErrChk(cudaMalloc(&m_d_data, buffer_size));
+    gpuErrChk(Session::getInstance().cudaAllocate((void**)&m_d_data, buffer_size));
     if (zero) gpuErrChk(cudaMemset(m_d_data, 0, buffer_size)); // set to zero all elements
 
     if (numMats() > 1) {
         m_doDestroyPtrMatrices = true;
-        cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T *));
+        cudaStatus = Session::getInstance().cudaAllocate((void**) &m_d_ptrMatrices, numMats() * sizeof(T *));
         if (cudaStatus != cudaSuccess) {
             gpuErrChk(cudaFree(m_d_data)); // ... free previously allocated memory
             gpuErrChk(cudaStatus); // ... and memento mori
@@ -1028,9 +1056,9 @@ template<typename T>
 inline void DTensor<T>::download(std::vector<T> &vec) const {
     vec.resize(m_numRows * m_numCols * m_numMats);
     gpuErrChk(cudaMemcpy(vec.data(),
-                         m_d_data,
-                         m_numRows * m_numCols * m_numMats * sizeof(T),
-                         cudaMemcpyDeviceToHost));
+        m_d_data,
+        m_numRows * m_numCols * m_numMats * sizeof(T),
+        cudaMemcpyDeviceToHost));
 }
 
 template<typename T>
@@ -1051,11 +1079,11 @@ inline DTensor<float> DTensor<float>::tr() const {
     size_t numElMat = m_numCols * m_numRows;
     for (size_t i = 0; i < m_numMats; i++) {
         gpuErrChk(cublasSgeam(Session::getInstance().cuBlasHandle(),
-                              CUBLAS_OP_T, CUBLAS_OP_N,
-                              m_numCols, m_numRows,
-                              &alpha, raw() + numElMat * i, m_numRows,
-                              &beta, nullptr, m_numCols,
-                              transposes.raw() + numElMat * i, m_numCols));
+            CUBLAS_OP_T, CUBLAS_OP_N,
+            m_numCols, m_numRows,
+            &alpha, raw() + numElMat * i, m_numRows,
+            &beta, nullptr, m_numCols,
+            transposes.raw() + numElMat * i, m_numCols));
     }
     return transposes;
 }
@@ -1067,11 +1095,11 @@ inline DTensor<double> DTensor<double>::tr() const {
     size_t numElMat = m_numCols * m_numRows;
     for (size_t i = 0; i < m_numMats; i++) {
         gpuErrChk(cublasDgeam(Session::getInstance().cuBlasHandle(),
-                              CUBLAS_OP_T, CUBLAS_OP_N,
-                              m_numCols, m_numRows,
-                              &alpha, raw() + numElMat * i, m_numRows,
-                              &beta, nullptr, m_numCols,
-                              transposes.raw() + numElMat * i, m_numCols));
+            CUBLAS_OP_T, CUBLAS_OP_N,
+            m_numCols, m_numRows,
+            &alpha, raw() + numElMat * i, m_numRows,
+            &beta, nullptr, m_numCols,
+            transposes.raw() + numElMat * i, m_numCols));
     }
     return transposes;
 }
@@ -1082,16 +1110,16 @@ inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
         throw std::invalid_argument("[deviceCopyTo] tensor does not fit into destination");
     }
     gpuErrChk(cudaMemcpy(elsewhere.raw(),
-                         m_d_data,
-                         m_numRows * m_numCols * m_numMats * sizeof(T),
-                         cudaMemcpyDeviceToDevice));
+        m_d_data,
+        m_numRows * m_numCols * m_numMats * sizeof(T),
+        cudaMemcpyDeviceToDevice));
 }
 
 template<>
 inline DTensor<double> &DTensor<double>::operator*=(double scalar) {
     double alpha = scalar;
     gpuErrChk(
-            cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+        cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
     return *this;
 }
 
@@ -1109,7 +1137,7 @@ template<>
 inline DTensor<float> &DTensor<float>::operator*=(float scalar) {
     float alpha = scalar;
     gpuErrChk(
-            cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+        cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
     return *this;
 }
 
@@ -1117,8 +1145,8 @@ template<>
 inline DTensor<double> &DTensor<double>::operator+=(const DTensor<double> &rhs) {
     const double alpha = 1.;
     gpuErrChk(
-            cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                        1, m_d_data, 1));
+        cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+            1, m_d_data, 1));
     return *this;
 }
 
@@ -1126,8 +1154,8 @@ template<>
 inline DTensor<float> &DTensor<float>::operator+=(const DTensor<float> &rhs) {
     const float alpha = 1.;
     gpuErrChk(
-            cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                        1, m_d_data, 1));
+        cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+            1, m_d_data, 1));
     return *this;
 }
 
@@ -1143,8 +1171,8 @@ template<>
 inline DTensor<double> &DTensor<double>::operator-=(const DTensor<double> &rhs) {
     const double alpha = -1.;
     gpuErrChk(
-            cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                        1, m_d_data, 1));
+        cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+            1, m_d_data, 1));
     return *this;
 }
 
@@ -1165,21 +1193,21 @@ inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<doubl
     double _alpha = alpha, _beta = beta;
     if (nMat > 1) {
         gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
-                                     CUBLAS_OP_N, CUBLAS_OP_N,
-                                     nRA, nCB, nCA, &_alpha,
-                                     A.m_d_ptrMatrices, nRA,
-                                     B.m_d_ptrMatrices, nCA,
-                                     &_beta,
-                                     m_d_ptrMatrices, nRA,
-                                     nMat));
+            CUBLAS_OP_N, CUBLAS_OP_N,
+            nRA, nCB, nCA, &_alpha,
+            A.m_d_ptrMatrices, nRA,
+            B.m_d_ptrMatrices, nCA,
+            &_beta,
+            m_d_ptrMatrices, nRA,
+            nMat));
     } else {
         gpuErrChk(cublasDgemm(Session::getInstance().cuBlasHandle(),
-                              CUBLAS_OP_N, CUBLAS_OP_N,
-                              nRA, nCB, nCA, &_alpha,
-                              A.raw(), nRA,
-                              B.raw(), nCA,
-                              &_beta,
-                              raw(), nRA));
+            CUBLAS_OP_N, CUBLAS_OP_N,
+            nRA, nCB, nCA, &_alpha,
+            A.raw(), nRA,
+            B.raw(), nCA,
+            &_beta,
+            raw(), nRA));
     }
 }
 
@@ -1192,21 +1220,21 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float>
     float _alpha = alpha, _beta = beta;
     if (nMat > 1) {
         gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
-                                     CUBLAS_OP_N, CUBLAS_OP_N,
-                                     nRA, nCB, nCA, &_alpha,
-                                     A.m_d_ptrMatrices, nRA,
-                                     B.m_d_ptrMatrices, nCA,
-                                     &_beta,
-                                     m_d_ptrMatrices, nRA,
-                                     nMat));
+            CUBLAS_OP_N, CUBLAS_OP_N,
+            nRA, nCB, nCA, &_alpha,
+            A.m_d_ptrMatrices, nRA,
+            B.m_d_ptrMatrices, nCA,
+            &_beta,
+            m_d_ptrMatrices, nRA,
+            nMat));
     } else {
         gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(),
-                              CUBLAS_OP_N, CUBLAS_OP_N,
-                              nRA, nCB, nCA, &_alpha,
-                              A.raw(), nRA,
-                              B.raw(), nCA,
-                              &_beta,
-                              raw(), nRA));
+            CUBLAS_OP_N, CUBLAS_OP_N,
+            nRA, nCB, nCA, &_alpha,
+            A.raw(), nRA,
+            B.raw(), nCA,
+            &_beta,
+            raw(), nRA));
     }
 }
 
@@ -1225,17 +1253,17 @@ inline void DTensor<double>::leastSquaresBatched(DTensor &B) {
     int info = 0;
     DTensor<int> infoArray(batchSize); // TODO consider preallocating?
     gpuErrChk(cublasDgelsBatched(Session::getInstance().cuBlasHandle(),
-                                 CUBLAS_OP_N,
-                                 m_numRows,
-                                 m_numCols,
-                                 nColsB,
-                                 m_d_ptrMatrices,
-                                 m_numRows,
-                                 B.m_d_ptrMatrices,
-                                 m_numRows,
-                                 &info,
-                                 infoArray.raw(),
-                                 batchSize));
+        CUBLAS_OP_N,
+        m_numRows,
+        m_numCols,
+        nColsB,
+        m_d_ptrMatrices,
+        m_numRows,
+        B.m_d_ptrMatrices,
+        m_numRows,
+        &info,
+        infoArray.raw(),
+        batchSize));
 }
 
 template<>
@@ -1253,17 +1281,17 @@ inline void DTensor<float>::leastSquaresBatched(DTensor &B) {
     int info = 0;
     DTensor<int> infoArray(batchSize); // TODO consider preallocating?
     gpuErrChk(cublasSgelsBatched(Session::getInstance().cuBlasHandle(),
-                                 CUBLAS_OP_N,
-                                 m_numRows,
-                                 m_numCols,
-                                 nColsB,
-                                 m_d_ptrMatrices,
-                                 m_numRows,
-                                 B.m_d_ptrMatrices,
-                                 m_numRows,
-                                 &info,
-                                 infoArray.raw(),
-                                 batchSize));
+        CUBLAS_OP_N,
+        m_numRows,
+        m_numCols,
+        nColsB,
+        m_d_ptrMatrices,
+        m_numRows,
+        B.m_d_ptrMatrices,
+        m_numRows,
+        &info,
+        infoArray.raw(),
+        batchSize));
 }
 
 template<>
@@ -1273,10 +1301,10 @@ inline DTensor<double> DTensor<double>::getRows(size_t rowsFrom, size_t rowsTo,
     DTensor<double> rowsOnly(rowsRangeLength, numCols(), 1);
     for (size_t i = 0; i < rowsRangeLength; i++) {
         gpuErrChk(cublasDcopy(Session::getInstance().cuBlasHandle(),
-                              n, // # values to copy
-                              raw() + rowsFrom + i + matIdx * n * m, m,
-                              rowsOnly.raw() + i,
-                              rowsRangeLength));
+            n, // # values to copy
+            raw() + rowsFrom + i + matIdx * n * m, m,
+            rowsOnly.raw() + i,
+            rowsRangeLength));
     }
     return rowsOnly;
 }
@@ -1288,10 +1316,10 @@ inline DTensor<float> DTensor<float>::getRows(size_t rowsFrom, size_t rowsTo, si
     DTensor<float> rowsOnly(rowsRangeLength, numCols(), 1);
     for (size_t i = 0; i < rowsRangeLength; i++) {
         gpuErrChk(cublasScopy(Session::getInstance().cuBlasHandle(),
-                              n, // # values to copy
-                              raw() + rowsFrom + i + matIdx * n * m, m,
-                              rowsOnly.raw() + i,
-                              rowsRangeLength));
+            n, // # values to copy
+            raw() + rowsFrom + i + matIdx * n * m, m,
+            rowsOnly.raw() + i,
+            rowsRangeLength));
     }
     return rowsOnly;
 }
@@ -1300,8 +1328,8 @@ template<typename T>
 std::ostream &DTensor<T>::print(std::ostream &out) const {
     size_t nr = m_numRows, nc = m_numCols, nm = m_numMats;
     out << "Tensor [" << m_numRows << " x "
-        << m_numCols << " x "
-        << m_numMats << "]:" << std::endl;
+            << m_numCols << " x "
+            << m_numMats << "]:" << std::endl;
     std::vector<T> temp;
     download(temp);
     for (size_t k = 0; k < nm; k++) {
@@ -1317,7 +1345,6 @@ std::ostream &DTensor<T>::print(std::ostream &out) const {
 }
 
 
-
 /* ================================================================================================
  *  STATUS INTERFACE
  * ================================================================================================ */
@@ -1326,19 +1353,18 @@ std::ostream &DTensor<T>::print(std::ostream &out) const {
  */
 class IStatus {
 protected:
-    std::unique_ptr<DTensor<int>> m_info;  ///< Status code of computation
+    std::unique_ptr<DTensor<int> > m_info; ///< Status code of computation
 
     IStatus(size_t n = 1) {
-        m_info = std::make_unique<DTensor<int>>(1, 1, n);
+        m_info = std::make_unique<DTensor<int> >(1, 1, n);
     }
 
 public:
-
     /**
      * Provides access to the info stored in this class
      * @return tensor of shape (1, 1, n)
      */
-    virtual DTensor<int>& info() {
+    virtual DTensor<int> &info() {
         return *m_info;
     }
 };
@@ -1356,7 +1382,8 @@ public:
  * @param d_count on exit, count of elements (int on device)
  * @param epsilon threshold
  */
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 __global__ void k_countNonzeroSingularValues(const T *d_array, size_t n, unsigned int *d_count, T epsilon) {
     int idx = threadIdx.x + blockIdx.x * blockDim.x;
     if (idx < n && d_array[idx] > epsilon) {
@@ -1371,19 +1398,18 @@ __global__ void k_countNonzeroSingularValues(const T *d_array, size_t n, unsigne
  * Then, many same-type-(m,n,1)-tensor can be factorised using this object's workspace.
  * @tparam T data type of (m,n,1)-tensor to be factorised (must be float or double)
  */
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 class Svd : public IStatus {
-
 private:
-
-    int m_lwork = -1;  ///< Size of workspace needed for SVD
-    DTensor<T> *m_tensor = nullptr;  ///< Pointer to original matrix to be factorised
-    std::shared_ptr<DTensor<T>> m_Vtr;  ///< Matrix V' or right singular vectors
-    std::shared_ptr<DTensor<T>> m_S;  ///< Diagonal matrix S or singular values
-    std::shared_ptr<DTensor<T>> m_U;  ///< Matrix U or left singular vectors
-    std::unique_ptr<DTensor<T>> m_workspace;  ///< Workspace for SVD
-    std::shared_ptr<DTensor<unsigned int>> m_rank;  ///< Rank of original matrix
-    bool m_computeU = false;  ///< Whether to compute U
+    int m_lwork = -1; ///< Size of workspace needed for SVD
+    DTensor<T> *m_tensor = nullptr; ///< Pointer to original matrix to be factorised
+    std::shared_ptr<DTensor<T> > m_Vtr; ///< Matrix V' or right singular vectors
+    std::shared_ptr<DTensor<T> > m_S; ///< Diagonal matrix S or singular values
+    std::shared_ptr<DTensor<T> > m_U; ///< Matrix U or left singular vectors
+    std::unique_ptr<DTensor<T> > m_workspace; ///< Workspace for SVD
+    std::shared_ptr<DTensor<unsigned int> > m_rank; ///< Rank of original matrix
+    bool m_computeU = false; ///< Whether to compute U
     bool m_destroyMatrix = true; ///< Whether to sacrifice original matrix
 
     /**
@@ -1403,10 +1429,7 @@ private:
      */
     void computeWorkspaceSize(size_t m, size_t n);
 
-
 public:
-
-
     /**
      * Constructor.
      * @param mat matrix to be factorised
@@ -1425,11 +1448,11 @@ public:
         size_t minMN = std::min(m, n);
         size_t nMats = mat.numMats();
         computeWorkspaceSize(m, n);
-        m_workspace = std::make_unique<DTensor<T>>(m_lwork, 1, 1);  ///< Allocates required workspace memory
-        m_Vtr = std::make_shared<DTensor<T>>(n, n, nMats);
-        m_S = std::make_shared<DTensor<T>>(minMN, 1, nMats);
-        m_rank = std::make_unique<DTensor<unsigned int>>(1, 1, nMats, true);
-        if (computeU) m_U = std::make_shared<DTensor<T>>(m, m, nMats);
+        m_workspace = std::make_unique<DTensor<T> >(m_lwork, 1, 1); ///< Allocates required workspace memory
+        m_Vtr = std::make_shared<DTensor<T> >(n, n, nMats);
+        m_S = std::make_shared<DTensor<T> >(minMN, 1, nMats);
+        m_rank = std::make_unique<DTensor<unsigned int> >(1, 1, nMats, true);
+        if (computeU) m_U = std::make_shared<DTensor<T> >(m, m, nMats);
     }
 
     /**
@@ -1455,7 +1478,7 @@ public:
     /**
      * @return matrix U, or left singular vectors
      */
-    std::optional<std::shared_ptr<DTensor<T>>> leftSingularVectors() const {
+    std::optional<std::shared_ptr<DTensor<T> > > leftSingularVectors() const {
         if (!m_computeU) return std::nullopt;
         return m_U;
     }
@@ -1481,13 +1504,10 @@ public:
             DTensor<T> Si(*m_S, 2, i, i);
             DTensor<unsigned int> rankI(*m_rank, 2, i, i);
             k_countNonzeroSingularValues<T><<<numBlocks(numElS), THREADS_PER_BLOCK>>>(Si.raw(), numElS,
-                                                                                      rankI.raw(), epsilon);
+                rankI.raw(), epsilon);
         }
         return *m_rank;
     }
-
-
-
 };
 
 
@@ -1507,25 +1527,25 @@ inline void Svd<double>::factorise() {
     size_t m = m_tensor->numRows();
     size_t n = m_tensor->numCols();
     size_t nMats = m_tensor->numMats();
-    std::unique_ptr<DTensor<double>> Ui;
+    std::unique_ptr<DTensor<double> > Ui;
     for (size_t i = 0; i < nMats; i++) {
         DTensor<double> Ai(*m_tensor, 2, i, i); // tensor A[:, :, i]
         DTensor<double> Si(*m_S, 2, i, i); // S[:, :, i]
         DTensor<double> Vtri(*m_Vtr, 2, i, i); // Vtr[:, :, i]
         if (m_computeU)
-            Ui = std::make_unique<DTensor<double>>(*m_U, 2, i, i);
+            Ui = std::make_unique<DTensor<double> >(*m_U, 2, i, i);
         gpuErrChk(
-                cusolverDnDgesvd(Session::getInstance().cuSolverHandle(),
-                                 (m_computeU) ? 'A' : 'N', 'A',
-                                 m, n,
-                                 Ai.raw(), m,
-                                 Si.raw(),
-                                 (m_computeU) ? Ui->raw() : nullptr, m,
-                                 Vtri.raw(), n,
-                                 m_workspace->raw(),
-                                 m_lwork,
-                                 nullptr,  // rwork (used only if SVD fails)
-                                 m_info->raw() + i));
+            cusolverDnDgesvd(Session::getInstance().cuSolverHandle(),
+                (m_computeU) ? 'A' : 'N', 'A',
+                m, n,
+                Ai.raw(), m,
+                Si.raw(),
+                (m_computeU) ? Ui->raw() : nullptr, m,
+                Vtri.raw(), n,
+                m_workspace->raw(),
+                m_lwork,
+                nullptr, // rwork (used only if SVD fails)
+                m_info->raw() + i));
     }
 }
 
@@ -1534,25 +1554,25 @@ inline void Svd<float>::factorise() {
     size_t m = m_tensor->numRows();
     size_t n = m_tensor->numCols();
     size_t nMats = m_tensor->numMats();
-    std::unique_ptr<DTensor<float>> Ui;
+    std::unique_ptr<DTensor<float> > Ui;
     for (size_t i = 0; i < nMats; i++) {
         DTensor<float> Ai(*m_tensor, 2, i, i); // tensor A[:, :, i]
         DTensor<float> Si(*m_S, 2, i, i); // S[:, :, i]
         DTensor<float> Vtri(*m_Vtr, 2, i, i); // Vtr[:, :, i]
         if (m_computeU)
-            Ui = std::make_unique<DTensor<float>>(*m_U, 2, i, i);
+            Ui = std::make_unique<DTensor<float> >(*m_U, 2, i, i);
         gpuErrChk(
-                cusolverDnSgesvd(Session::getInstance().cuSolverHandle(),
-                                 (m_computeU) ? 'A' : 'N', 'A',
-                                 m, n,
-                                 Ai.raw(), m,
-                                 Si.raw(),
-                                 (m_computeU) ? Ui->raw() : nullptr, m,
-                                 Vtri.raw(), n,
-                                 m_workspace->raw(),
-                                 m_lwork,
-                                 nullptr,  // rwork (used only if SVD fails)
-                                 m_info->raw() + i));
+            cusolverDnSgesvd(Session::getInstance().cuSolverHandle(),
+                (m_computeU) ? 'A' : 'N', 'A',
+                m, n,
+                Ai.raw(), m,
+                Si.raw(),
+                (m_computeU) ? Ui->raw() : nullptr, m,
+                Vtri.raw(), n,
+                m_workspace->raw(),
+                m_lwork,
+                nullptr, // rwork (used only if SVD fails)
+                m_info->raw() + i));
     }
 }
 
@@ -1567,13 +1587,13 @@ inline void Svd<float>::factorise() {
  * Then, many same-type-(m,n,1)-tensor can be factorised using this object's workspace
  * @tparam T data type of (m,n,1)-tensor to be factorised (must be float or double)
  */
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 class CholeskyFactoriser : public IStatus {
-
 private:
-    int m_workspaceSize = 0;  ///< Size of workspace needed for CF
-    std::unique_ptr<DTensor<T>> m_workspace;  ///< Workspace for CF
-    DTensor<T> *m_matrix;  ///< Matrix to factorise. Do not destroy!
+    int m_workspaceSize = 0; ///< Size of workspace needed for CF
+    std::unique_ptr<DTensor<T> > m_workspace; ///< Workspace for CF
+    DTensor<T> *m_matrix; ///< Matrix to factorise. Do not destroy!
 
     /**
      * Computes the workspace size required by cuSolver.
@@ -1581,13 +1601,12 @@ private:
     void computeWorkspaceSize();
 
 public:
-
     CholeskyFactoriser(DTensor<T> &A) : IStatus() {
         if (A.numMats() > 1) throw std::invalid_argument("[Cholesky] 3D tensors require `CholeskyBatchFactoriser`");
         if (A.numRows() != A.numCols()) throw std::invalid_argument("[Cholesky] Matrix A must be square");
         m_matrix = &A;
         computeWorkspaceSize();
-        m_workspace = std::make_unique<DTensor<T>>(m_workspaceSize);
+        m_workspace = std::make_unique<DTensor<T> >(m_workspaceSize);
     }
 
     /**
@@ -1603,34 +1622,32 @@ public:
      * @param b provided matrix
      */
     void solve(DTensor<T> &b);
-
-
 };
 
 template<>
 inline void CholeskyFactoriser<double>::computeWorkspaceSize() {
     size_t n = m_matrix->numRows();
     gpuErrChk(cusolverDnDpotrf_bufferSize(Session::getInstance().cuSolverHandle(),
-                                          CUBLAS_FILL_MODE_LOWER, n,
-                                          nullptr, n, &m_workspaceSize));
+        CUBLAS_FILL_MODE_LOWER, n,
+        nullptr, n, &m_workspaceSize));
 }
 
 template<>
 inline void CholeskyFactoriser<float>::computeWorkspaceSize() {
     size_t n = m_matrix->numRows();
     gpuErrChk(cusolverDnSpotrf_bufferSize(Session::getInstance().cuSolverHandle(),
-                                          CUBLAS_FILL_MODE_LOWER, n,
-                                          nullptr, n, &m_workspaceSize));
+        CUBLAS_FILL_MODE_LOWER, n,
+        nullptr, n, &m_workspaceSize));
 }
 
 template<>
 inline void CholeskyFactoriser<double>::factorise() {
     size_t n = m_matrix->numRows();
     gpuErrChk(cusolverDnDpotrf(Session::getInstance().cuSolverHandle(), CUBLAS_FILL_MODE_LOWER, n,
-                               m_matrix->raw(), n,
-                               m_workspace->raw(),
-                               m_workspaceSize,
-                               m_info->raw()));
+        m_matrix->raw(), n,
+        m_workspace->raw(),
+        m_workspaceSize,
+        m_info->raw()));
 }
 
 
@@ -1638,32 +1655,32 @@ template<>
 inline void CholeskyFactoriser<float>::factorise() {
     size_t n = m_matrix->numRows();
     gpuErrChk(cusolverDnSpotrf(Session::getInstance().cuSolverHandle(), CUBLAS_FILL_MODE_LOWER, n,
-                               m_matrix->raw(), n,
-                               m_workspace->raw(),
-                               m_workspaceSize,
-                               m_info->raw()));
+        m_matrix->raw(), n,
+        m_workspace->raw(),
+        m_workspaceSize,
+        m_info->raw()));
 }
 
 template<>
 inline void CholeskyFactoriser<double>::solve(DTensor<double> &rhs) {
     size_t n = m_matrix->numRows();
     gpuErrChk(cusolverDnDpotrs(Session::getInstance().cuSolverHandle(),
-                               CUBLAS_FILL_MODE_LOWER,
-                               n, 1,
-                               m_matrix->raw(), n,
-                               rhs.raw(), n,
-                               m_info->raw()));
+        CUBLAS_FILL_MODE_LOWER,
+        n, 1,
+        m_matrix->raw(), n,
+        rhs.raw(), n,
+        m_info->raw()));
 }
 
 template<>
 inline void CholeskyFactoriser<float>::solve(DTensor<float> &rhs) {
     size_t n = m_matrix->numRows();
     gpuErrChk(cusolverDnSpotrs(Session::getInstance().cuSolverHandle(),
-                               CUBLAS_FILL_MODE_LOWER,
-                               n, 1,
-                               m_matrix->raw(), n,
-                               rhs.raw(), n,
-                               m_info->raw()));
+        CUBLAS_FILL_MODE_LOWER,
+        n, 1,
+        m_matrix->raw(), n,
+        rhs.raw(), n,
+        m_info->raw()));
 }
 
 
@@ -1677,14 +1694,14 @@ inline void CholeskyFactoriser<float>::solve(DTensor<float> &rhs) {
  * Then, many same-type-(m,n,1)-tensor can be factorised using this object's workspace
  * @tparam T data type of (m,n,1)-tensor to be factorised (must be float or double)
  */
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 class QRFactoriser : public IStatus {
-
 private:
-    int m_workspaceSize = 0;  ///< Size of workspace needed for LS
-    std::unique_ptr<DTensor<T>> m_householder;  ///< For storing householder reflectors
-    std::unique_ptr<DTensor<T>> m_workspace;  ///< Workspace for LS
-    DTensor<T> *m_matrix;  ///< Lhs matrix template. Do not destroy!
+    int m_workspaceSize = 0; ///< Size of workspace needed for LS
+    std::unique_ptr<DTensor<T> > m_householder; ///< For storing householder reflectors
+    std::unique_ptr<DTensor<T> > m_workspace; ///< Workspace for LS
+    DTensor<T> *m_matrix; ///< Lhs matrix template. Do not destroy!
 
     /**
      * Computes the workspace size required by cuSolver.
@@ -1692,14 +1709,13 @@ private:
     void computeWorkspaceSize();
 
 public:
-
     QRFactoriser(DTensor<T> &A) : IStatus() {
         if (A.numMats() > 1) throw std::invalid_argument("[QR] 3D tensors require `leastSquaresBatched`");
         if (A.numRows() < A.numCols()) throw std::invalid_argument("[QR] Matrix A must be tall or square");
         m_matrix = &A;
         computeWorkspaceSize();
-        m_workspace = std::make_unique<DTensor<T>>(m_workspaceSize);
-        m_householder = std::make_unique<DTensor<T>>(m_matrix->numCols());
+        m_workspace = std::make_unique<DTensor<T> >(m_workspaceSize);
+        m_householder = std::make_unique<DTensor<T> >(m_matrix->numCols());
     }
 
     /**
@@ -1726,8 +1742,6 @@ public:
      * @throws std::invalid_argument if Q or R have invalid dimensions
      */
     void getQR(DTensor<T> &Q, DTensor<T> &R);
-
-
 };
 
 template<>
@@ -1735,9 +1749,9 @@ inline void QRFactoriser<double>::computeWorkspaceSize() {
     size_t m = m_matrix->numRows();
     size_t n = m_matrix->numCols();
     gpuErrChk(cusolverDnDgeqrf_bufferSize(Session::getInstance().cuSolverHandle(),
-                                          m, n,
-                                          nullptr, m,
-                                          &m_workspaceSize));
+        m, n,
+        nullptr, m,
+        &m_workspaceSize));
 }
 
 template<>
@@ -1745,9 +1759,9 @@ inline void QRFactoriser<float>::computeWorkspaceSize() {
     size_t m = m_matrix->numRows();
     size_t n = m_matrix->numCols();
     gpuErrChk(cusolverDnSgeqrf_bufferSize(Session::getInstance().cuSolverHandle(),
-                                          m, n,
-                                          nullptr, m,
-                                          &m_workspaceSize));
+        m, n,
+        nullptr, m,
+        &m_workspaceSize));
 }
 
 template<>
@@ -1755,11 +1769,11 @@ inline void QRFactoriser<double>::factorise() {
     size_t m = m_matrix->numRows();
     size_t n = m_matrix->numCols();
     gpuErrChk(cusolverDnDgeqrf(Session::getInstance().cuSolverHandle(),
-                               m, n,
-                               m_matrix->raw(), m,
-                               m_householder->raw(),
-                               m_workspace->raw(), m_workspaceSize,
-                               m_info->raw()));
+        m, n,
+        m_matrix->raw(), m,
+        m_householder->raw(),
+        m_workspace->raw(), m_workspaceSize,
+        m_info->raw()));
 }
 
 
@@ -1768,11 +1782,11 @@ inline void QRFactoriser<float>::factorise() {
     size_t m = m_matrix->numRows();
     size_t n = m_matrix->numCols();
     gpuErrChk(cusolverDnSgeqrf(Session::getInstance().cuSolverHandle(),
-                               m, n,
-                               m_matrix->raw(), m,
-                               m_householder->raw(),
-                               m_workspace->raw(), m_workspaceSize,
-                               m_info->raw()));
+        m, n,
+        m_matrix->raw(), m,
+        m_householder->raw(),
+        m_workspace->raw(), m_workspaceSize,
+        m_info->raw()));
 }
 
 template<>
@@ -1781,17 +1795,17 @@ inline void QRFactoriser<double>::leastSquares(DTensor<double> &rhs) {
     size_t n = m_matrix->numCols();
     double alpha = 1.;
     gpuErrChk(cusolverDnDormqr(Session::getInstance().cuSolverHandle(),
-                               CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, 1, n,
-                               m_matrix->raw(), m,
-                               m_householder->raw(),
-                               rhs.raw(), m,
-                               m_workspace->raw(), m_workspaceSize,
-                               m_info->raw()));
+        CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, 1, n,
+        m_matrix->raw(), m,
+        m_householder->raw(),
+        rhs.raw(), m,
+        m_workspace->raw(), m_workspaceSize,
+        m_info->raw()));
     gpuErrChk(cublasDtrsm(Session::getInstance().cuBlasHandle(),
-                          CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1,
-                          &alpha,
-                          m_matrix->raw(), m,
-                          rhs.raw(), m));
+        CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1,
+        &alpha,
+        m_matrix->raw(), m,
+        rhs.raw(), m));
 }
 
 template<>
@@ -1800,17 +1814,17 @@ inline void QRFactoriser<float>::leastSquares(DTensor<float> &rhs) {
     size_t n = m_matrix->numCols();
     float alpha = 1.;
     gpuErrChk(cusolverDnSormqr(Session::getInstance().cuSolverHandle(),
-                               CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, 1, n,
-                               m_matrix->raw(), m,
-                               m_householder->raw(),
-                               rhs.raw(), m,
-                               m_workspace->raw(), m_workspaceSize,
-                               m_info->raw()));
+        CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, 1, n,
+        m_matrix->raw(), m,
+        m_householder->raw(),
+        rhs.raw(), m,
+        m_workspace->raw(), m_workspaceSize,
+        m_info->raw()));
     gpuErrChk(cublasStrsm(Session::getInstance().cuBlasHandle(),
-                          CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1,
-                          &alpha,
-                          m_matrix->raw(), m,
-                          rhs.raw(), m));
+        CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1,
+        &alpha,
+        m_matrix->raw(), m,
+        rhs.raw(), m));
 }
 
 template<>
@@ -1831,12 +1845,12 @@ inline void QRFactoriser<double>::getQR(DTensor<double> &Q, DTensor<double> &R)
     Q.upload(vecQ, rowMajor);
     // Apply Householder reflectors to compute Q
     gpuErrChk(cusolverDnDormqr(Session::getInstance().cuSolverHandle(),
-                               CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, n, n,
-                               m_matrix->raw(), m,
-                               m_householder->raw(),
-                               Q.raw(), m,
-                               m_workspace->raw(), m_workspaceSize,
-                               m_info->raw()));
+        CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, n, n,
+        m_matrix->raw(), m,
+        m_householder->raw(),
+        Q.raw(), m,
+        m_workspace->raw(), m_workspaceSize,
+        m_info->raw()));
     // Extract upper triangular R
     std::vector<double> vecR(n * n, 0.);
     for (size_t r = 0; r < n; r++) {
@@ -1865,12 +1879,12 @@ inline void QRFactoriser<float>::getQR(DTensor<float> &Q, DTensor<float> &R) {
     Q.upload(vecQ, rowMajor);
     // Apply Householder reflectors to compute Q
     gpuErrChk(cusolverDnSormqr(Session::getInstance().cuSolverHandle(),
-                               CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, n, n,
-                               m_matrix->raw(), m,
-                               m_householder->raw(),
-                               Q.raw(), m,
-                               m_workspace->raw(), m_workspaceSize,
-                               m_info->raw()));
+        CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, n, n,
+        m_matrix->raw(), m,
+        m_householder->raw(),
+        Q.raw(), m,
+        m_workspace->raw(), m_workspaceSize,
+        m_info->raw()));
     // Extract upper triangular R
     std::vector<float> vecR(n * n, 0.);
     for (size_t r = 0; r < n; r++) {
@@ -1892,16 +1906,14 @@ inline void QRFactoriser<float>::getQR(DTensor<float> &Q, DTensor<float> &R) {
  * Nullspace computes, pads, and stores the nullspace matrices.
  * @tparam T data type (must be float or double)
  */
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 class Nullspace {
-
 private:
-
-    std::unique_ptr<DTensor<T>> m_nullspace;  ///< Stores all nullspace matrices (N)
-    std::unique_ptr<DTensor<T>> m_projOp;  ///< Stores all projection operators (N*N')
+    std::unique_ptr<DTensor<T> > m_nullspace; ///< Stores all nullspace matrices (N)
+    std::unique_ptr<DTensor<T> > m_projOp; ///< Stores all projection operators (N*N')
 
 public:
-
     /**
      * Constructor (computes nullspace)
      * @param a device tensor
@@ -1935,8 +1947,8 @@ TEMPLATE_CONSTRAINT_REQUIRES_FPX
 inline Nullspace<T>::Nullspace(DTensor<T> &a) {
     size_t m = a.numRows(), n = a.numCols(), nMats = a.numMats();
     if (m > n) throw std::invalid_argument("[nullspace] I was expecting a square or fat matrix");
-    m_nullspace = std::make_unique<DTensor<T>>(n, n, nMats, true);
-    m_projOp = std::make_unique<DTensor<T>>(n, n, nMats, true);
+    m_nullspace = std::make_unique<DTensor<T> >(n, n, nMats, true);
+    m_projOp = std::make_unique<DTensor<T> >(n, n, nMats, true);
     auto aTranspose = a.tr();
     Svd<T> svd(aTranspose, true);
     svd.factorise();
@@ -1945,10 +1957,11 @@ inline Nullspace<T>::Nullspace(DTensor<T> &a) {
     std::vector<unsigned int> hostRankA;
     devRankA.download(hostRankA);
 
-    std::optional<std::shared_ptr<DTensor<T>>> leftSingValsOptional = svd.leftSingularVectors();
+    std::optional<std::shared_ptr<DTensor<T> > > leftSingValsOptional = svd.leftSingularVectors();
     assert(leftSingValsOptional); // make sure left SVs were computed
-    std::shared_ptr<DTensor<T>> leftSingVals = leftSingValsOptional.value();
-    for (size_t i = 0; i < nMats; i++) { // for each matrix
+    std::shared_ptr<DTensor<T> > leftSingVals = leftSingValsOptional.value();
+    for (size_t i = 0; i < nMats; i++) {
+        // for each matrix
         // Slice the matrix of left SVs to get the matrix that spans
         // the nullspace of a[:, :, i]
         unsigned int rankAi = hostRankA[i];
@@ -1983,14 +1996,14 @@ inline void Nullspace<T>::project(DTensor<T> &b) {
  * or you can provide pre-computed Cholesky lower-triangular matrices and then solve.
  * @tparam T data type of tensor (must be float or double)
  */
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 class CholeskyBatchFactoriser : public IStatus {
-
 private:
-    DTensor<T> *m_matrix;  ///< Matrix to factorise or lower-triangular decomposed matrix
-    size_t m_numRows = 0;  ///< Number of rows in `m_matrix`
-    size_t m_numMats = 0;  ///< Number of matrices in `m_matrix`
-    bool m_factorisationDone = false;  ///< Whether `m_matrix` holds original or lower-triangular matrix
+    DTensor<T> *m_matrix; ///< Matrix to factorise or lower-triangular decomposed matrix
+    size_t m_numRows = 0; ///< Number of rows in `m_matrix`
+    size_t m_numMats = 0; ///< Number of matrices in `m_matrix`
+    bool m_factorisationDone = false; ///< Whether `m_matrix` holds original or lower-triangular matrix
 
 public:
     CholeskyBatchFactoriser() = delete;
@@ -2000,7 +2013,8 @@ public:
      * @param A either matrices to be factorised or lower-triangular Cholesky decomposition matrices
      * @param factorised whether A is the original matrices or the factorised ones (default=original matrices)
      */
-    CholeskyBatchFactoriser(DTensor<T> &A, bool factorised = false) : IStatus(A.numMats()), m_factorisationDone(factorised) {
+    CholeskyBatchFactoriser(DTensor<T> &A, bool factorised = false) : IStatus(A.numMats()),
+                                                                      m_factorisationDone(factorised) {
         if (A.numRows() != A.numCols()) throw std::invalid_argument("[CholeskyBatch] A must be square");
         m_matrix = &A;
         m_numRows = A.numRows();
@@ -2017,20 +2031,18 @@ public:
      * @param b vector in system Ax=b to be solved where x is the solution
      */
     void solve(DTensor<T> &b);
-
-
 };
 
 template<>
 inline void CholeskyBatchFactoriser<double>::factorise() {
     if (m_factorisationDone) return;
     gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
-                                      CUBLAS_FILL_MODE_LOWER,
-                                      m_numRows,
-                                      m_matrix->ptrMatrices(),
-                                      m_numRows,
-                                      m_info->raw(),
-                                      m_numMats));
+        CUBLAS_FILL_MODE_LOWER,
+        m_numRows,
+        m_matrix->ptrMatrices(),
+        m_numRows,
+        m_info->raw(),
+        m_numMats));
     m_factorisationDone = true;
 }
 
@@ -2038,12 +2050,12 @@ template<>
 inline void CholeskyBatchFactoriser<float>::factorise() {
     if (m_factorisationDone) return;
     gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
-                                      CUBLAS_FILL_MODE_LOWER,
-                                      m_numRows,
-                                      m_matrix->ptrMatrices(),
-                                      m_numRows,
-                                      m_info->raw(),
-                                      m_numMats));
+        CUBLAS_FILL_MODE_LOWER,
+        m_numRows,
+        m_matrix->ptrMatrices(),
+        m_numRows,
+        m_info->raw(),
+        m_numMats));
     m_factorisationDone = true;
 }
 
@@ -2055,15 +2067,15 @@ inline void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
     }
     if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
     gpuErrChk(cusolverDnDpotrsBatched(Session::getInstance().cuSolverHandle(),
-                                      CUBLAS_FILL_MODE_LOWER,
-                                      m_numRows,
-                                      1,  ///< only supports rhs = 1
-                                      m_matrix->ptrMatrices(),
-                                      m_numRows,
-                                      b.ptrMatrices(),
-                                      m_numRows,
-                                      m_info->raw(),
-                                      m_numMats));
+        CUBLAS_FILL_MODE_LOWER,
+        m_numRows,
+        1, ///< only supports rhs = 1
+        m_matrix->ptrMatrices(),
+        m_numRows,
+        b.ptrMatrices(),
+        m_numRows,
+        m_info->raw(),
+        m_numMats));
 }
 
 template<>
@@ -2074,19 +2086,18 @@ inline void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
     }
     if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
     gpuErrChk(cusolverDnSpotrsBatched(Session::getInstance().cuSolverHandle(),
-                                      CUBLAS_FILL_MODE_LOWER,
-                                      m_numRows,
-                                      1,  ///< only supports rhs = 1
-                                      m_matrix->ptrMatrices(),
-                                      m_numRows,
-                                      b.ptrMatrices(),
-                                      m_numRows,
-                                      m_info->raw(),
-                                      m_numMats));
+        CUBLAS_FILL_MODE_LOWER,
+        m_numRows,
+        1, ///< only supports rhs = 1
+        m_matrix->ptrMatrices(),
+        m_numRows,
+        b.ptrMatrices(),
+        m_numRows,
+        m_info->raw(),
+        m_numMats));
 }
 
 
-
 /* ================================================================================================
  *  GIVENS ANNIHILATOR
  * ================================================================================================ */
@@ -2098,23 +2109,22 @@ inline void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
  *
  * @tparam T data type of tensor (must be float or double)
  */
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 class GivensAnnihilator {
-
 private:
     DTensor<T> *m_matrix;
     /**
      * Auxiliary memory on the device of length 3 used to store
      * rhypot(xij, xkj), cos θ, and sin θ.
      */
-    std::unique_ptr<DTensor<T>> m_d_rhyp_cos_sin;
+    std::unique_ptr<DTensor<T> > m_d_rhyp_cos_sin;
 
     void init() {
-        m_d_rhyp_cos_sin = std::make_unique<DTensor<T>>(3);
+        m_d_rhyp_cos_sin = std::make_unique<DTensor<T> >(3);
     }
 
 public:
-
     GivensAnnihilator() {
         init();
     }
@@ -2156,10 +2166,10 @@ public:
      * @throws std::invalid_argument if i, k, or j are out of bounds
      */
     void annihilate(size_t i, size_t k, size_t j);
-
 };
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 __global__ void k_givensAnnihilateRHypot(const T *data,
                                          T *res,
                                          size_t i, size_t k, size_t j,
@@ -2172,7 +2182,8 @@ __global__ void k_givensAnnihilateRHypot(const T *data,
 }
 
 
-template<typename T> TEMPLATE_CONSTRAINT_REQUIRES_FPX
+template<typename T>
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 inline void GivensAnnihilator<T>::annihilate(size_t i, size_t k, size_t j) {
     /* A few checks */
     size_t nR = m_matrix->numRows(), nC = m_matrix->numCols();
diff --git a/main.cu b/main.cu
index 01ad042..742321d 100644
--- a/main.cu
+++ b/main.cu
@@ -4,17 +4,26 @@
 #include <string>
 #include <vector>
 
-
-int main() {
+void xyz() {
     /* Write to binary file */
     auto r = DTensor<double>::createRandomTensor(3, 6, 4, -1, 1);
+    auto r2 = DTensor<double>::createRandomTensor(300, 600, 4, -1, 1);
     std::string fName = "tensor.bt"; // binary tensor file extension: .bt
-    r.saveToFile(fName);
 
     /* Parse binary file */
     auto recov = DTensor<double>::parseFromFile(fName);
     auto err = r - recov;
-    std::cout << "max error : " << err.maxAbs();
+    std::cout << "max error : " << err.maxAbs() << std::endl;
+    std::cout << "Memory: " << std::setprecision(3)
+            << (float) Session::getInstance().totalAllocatedBytes() / 1e6
+            << " MB" << std::endl;
+}
 
+
+int main() {
+    xyz();
+    std::cout << "Memory (outside): " << std::setprecision(3)
+            << (float) Session::getInstance().totalAllocatedBytes() / 1e6
+            << " MB" << std::endl;
     return 0;
 }
diff --git a/test/testTensor.cu b/test/testTensor.cu
index 6b10636..bff6ac8 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -11,9 +11,11 @@
  * ================================================================================================ */
 class TensorTest : public testing::Test {
 protected:
-    TensorTest() {}
+    TensorTest() {
+    }
 
-    virtual ~TensorTest() {}
+    virtual ~TensorTest() {
+    }
 };
 
 #define TENSOR_DATA_234A {1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 10, 5, 4, 3, 2, 1, -1, 4, 3, 4, 3, 4, 8}
@@ -52,17 +54,25 @@ void tensorConstructionStorageMode() {
     size_t rows = 3;
     size_t cols = 2;
     size_t mats = 2;
-    std::vector<T> aCm = {1, 3, 5,
-                          2, 4, 6};
-    std::vector<T> bCm = {7, 9, 11,
-                          8, 10, 12};
+    std::vector<T> aCm = {
+        1, 3, 5,
+        2, 4, 6
+    };
+    std::vector<T> bCm = {
+        7, 9, 11,
+        8, 10, 12
+    };
     const std::vector<T> Cm = {1, 3, 5, 2, 4, 6, 7, 9, 11, 8, 10, 12};
-    std::vector<T> aRm = {1, 2,
-                          3, 4,
-                          5, 6};
-    std::vector<T> bRm = {7, 8,
-                          9, 10,
-                          11, 12};
+    std::vector<T> aRm = {
+        1, 2,
+        3, 4,
+        5, 6
+    };
+    std::vector<T> bRm = {
+        7, 8,
+        9, 10,
+        11, 12
+    };
     std::vector<T> Rm = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
     std::vector<T> hostData(rows * cols * mats);
     // test constructor
@@ -183,8 +193,8 @@ TEST_F(TensorTest, parseTensorFromFileBinary) {
 TEST_F(TensorTest, parseTensorFromBinaryPython) {
     std::string fName = "../../python/b_d.bt";
     DTensor<double> b = DTensor<double>::parseFromFile(fName);
-    for (size_t i=0; i<3; i++) {
-        for (size_t j=0; j<3; j++) {
+    for (size_t i = 0; i < 3; i++) {
+        for (size_t j = 0; j < 3; j++) {
             EXPECT_NEAR(1 + 2*j + 6*i, b(i, j, 0), PRECISION_HIGH);
             EXPECT_NEAR(2 + 2*j + 6*i, b(i, j, 1), PRECISION_HIGH);
         }
@@ -212,7 +222,7 @@ TEMPLATE_WITH_TYPE_T
 void tensorMoveConstructor() {
     DTensor<T> zero(2, 3, 4, true);
     DTensor<T> x(std::move(zero));
-    DTensor<T> y(DTensor<T> {100, 10, 1000});
+    DTensor<T> y(DTensor<T>{100, 10, 1000});
 }
 
 TEST_F(TensorTest, tensorMoveConstructor) {
@@ -378,8 +388,10 @@ void tensorDeviceCopyTo() {
     DTensor<T> other(2, 3, 5, true);
     DTensor<T> z(other, 2, 1, 4);
     tenz.deviceCopyTo(z);
-    std::vector<T> expected = {0, 0, 0, 0, 0, 0,
-                               1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 10, 5, 4, 3, 2, 1, -1, 4, 3, 4, 3, 4, 8};
+    std::vector<T> expected = {
+        0, 0, 0, 0, 0, 0,
+        1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 10, 5, 4, 3, 2, 1, -1, 4, 3, 4, 3, 4, 8
+    };
     std::vector<T> actual(2 * 3 * 5);
     other.download(actual);
     EXPECT_EQ(expected, actual);
@@ -453,9 +465,11 @@ void tensorSliceAndReshape(T epsilon) {
     bSlice.reshape(2, 9, 1);
     aSlice += bSlice;
 
-    std::vector<T> dataAExpected = {1, 2, 3, 4, 5, 6, 41, 7, 5, 5,
-                                    19, 17, 14, 13, 5, 11, -8, -4,
-                                    6, 8, 8, -2, 8, 13};
+    std::vector<T> dataAExpected = {
+        1, 2, 3, 4, 5, 6, 41, 7, 5, 5,
+        19, 17, 14, 13, 5, 11, -8, -4,
+        6, 8, 8, -2, 8, 13
+    };
     DTensor<T> aExpected(dataAExpected, 2, 3, 4);
 
     DTensor<T> err = aExpected - a;
@@ -479,12 +493,12 @@ void tensorDotF(T epsilon) {
     DTensor<T> vecA(dataA, dataA.size());
     DTensor<T> vecB(dataB, dataB.size());
     T dotVector = vecA.dotF(vecB);
-    EXPECT_EQ(604, dotVector);  // from MATLAB
+    EXPECT_EQ(604, dotVector); // from MATLAB
     // as matrices
     DTensor<T> tenA(dataA, 2, 3, 4);
     DTensor<T> tenB(dataB, 2, 3, 4);
     T dotTensor = tenA.dotF(tenB);
-    EXPECT_EQ(604, dotTensor);  // from MATLAB
+    EXPECT_EQ(604, dotTensor); // from MATLAB
 }
 
 TEST_F(TensorTest, tensorDotF) {
@@ -680,8 +694,10 @@ TEST_F(TensorTest, tensorAssignmentOperator) {
 TEMPLATE_WITH_TYPE_T
 void tensorTimesEqualsScalar() {
     std::vector<T> data = TENSOR_DATA_234A;
-    std::vector<T> dataTimes3 = {3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12,
-                                 24};
+    std::vector<T> dataTimes3 = {
+        3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12,
+        24
+    };
     DTensor<T> tenz(data, 2, 3, 4);
     tenz *= 3.0;
     std::vector<T> actual;
@@ -701,8 +717,10 @@ TEST_F(TensorTest, tensorTimesEqualsScalar) {
 TEMPLATE_WITH_TYPE_T
 void tensorTimesScalar() {
     std::vector<T> data = TENSOR_DATA_234A;
-    std::vector<T> dataTimes3 = {3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12,
-                                 24};
+    std::vector<T> dataTimes3 = {
+        3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12,
+        24
+    };
     DTensor<T> tenz(data, 2, 3, 4);
     auto tripleTensor = 3.0 * tenz;
     std::vector<T> actual;
@@ -809,12 +827,16 @@ TEST_F(TensorTest, tensorMinusTensor) {
 
 TEMPLATE_WITH_TYPE_T
 void tensorAddAB() {
-    std::vector<T> aData = {1, 2, 3, 4, 5, 6,
-                            7, 8, 9, 10, 11, 12,
-                            13, 14, 15, 16, 17, 18};
-    std::vector<T> bData = {6, 5, 4, 3, 2, 1,
-                            7, 6, 5, 4, 3, 2,
-                            1, 2, 1, 5, -6, 8};
+    std::vector<T> aData = {
+        1, 2, 3, 4, 5, 6,
+        7, 8, 9, 10, 11, 12,
+        13, 14, 15, 16, 17, 18
+    };
+    std::vector<T> bData = {
+        6, 5, 4, 3, 2, 1,
+        7, 6, 5, 4, 3, 2,
+        1, 2, 1, 5, -6, 8
+    };
     DTensor<T> A(aData, 2, 3, 3);
     DTensor<T> B(bData, 3, 2, 3);
     DTensor<T> C(2, 2, 3, true);
@@ -876,12 +898,14 @@ TEST_F(TensorTest, tensorSliceAxis01PtrMatrices) {
 
 TEMPLATE_WITH_TYPE_T
 void tensorGetRows() {
-    std::vector<T> aData = {10.5, 25.0, 60.0,
-                            -21.0, 720.0, -1.0,
-                            11.0, -1.0, 30.0,
-                            5., 6., 7.,
-                            8., 9., 10.,
-                            11., 12., 13};
+    std::vector<T> aData = {
+        10.5, 25.0, 60.0,
+        -21.0, 720.0, -1.0,
+        11.0, -1.0, 30.0,
+        5., 6., 7.,
+        8., 9., 10.,
+        11., 12., 13
+    };
     DTensor<T> A(aData, 3, 3, 2);
     DTensor<T> Ar0 = A.getRows(1, 1, 0);
     std::vector<T> expected0 = {25., 720., -1.};
@@ -918,7 +942,6 @@ void tensorTranspose() {
     std::vector<T> actual;
     Atranspose.download(actual);
     EXPECT_EQ(expected, actual);
-
 }
 
 TEST_F(TensorTest, tensorTranspose) {
@@ -926,14 +949,60 @@ TEST_F(TensorTest, tensorTranspose) {
     tensorTranspose<double>();
 }
 
+/* ---------------------------------------
+ * Tensor: total bytes allocated
+ * --------------------------------------- */
+TEMPLATE_WITH_TYPE_T
+void tensorBytesAllocated() {
+    const size_t previouslyAllocatedBytes = Session::getInstance().totalAllocatedBytes();
+    size_t m = 10, n = 10, k = 20;
+    DTensor<T> A(m, n, k);
+    const size_t allocatedBytes = Session::getInstance().totalAllocatedBytes();
+    const size_t currentAllocatedBytes = m * n * k * sizeof(T) + k * sizeof(T *);
+    const size_t expectedBytes = currentAllocatedBytes + previouslyAllocatedBytes;
+    EXPECT_EQ(expectedBytes, allocatedBytes);
+}
+
+TEST_F(TensorTest, tensorBytesAllocated) {
+    tensorBytesAllocated<float>();
+    tensorBytesAllocated<double>();
+    tensorBytesAllocated<int>();
+}
+
+
+/* ---------------------------------------
+ * Tensor: total bytes allocated
+ * --------------------------------------- */
+TEMPLATE_WITH_TYPE_T
+void tensorBytesAllocatedDeallocated() {
+    const size_t previouslyAllocatedBytes = Session::getInstance().totalAllocatedBytes();
+    size_t m = 10, n = 10, k = 20;
+    auto *A = new DTensor<T>(m, n, k); // new allocation (increments session)
+    const size_t allocatedBytes = Session::getInstance().totalAllocatedBytes();
+    const size_t expectedBytes = previouslyAllocatedBytes + m * n * k * sizeof(T) + k * sizeof(T *);
+    EXPECT_EQ(expectedBytes, allocatedBytes);
+    delete A;
+    EXPECT_EQ(previouslyAllocatedBytes, Session::getInstance().totalAllocatedBytes());
+}
+
+TEST_F(TensorTest, tensorBytesAllocatedDeallocated) {
+    tensorBytesAllocatedDeallocated<float>();
+    tensorBytesAllocatedDeallocated<double>();
+    tensorBytesAllocatedDeallocated<int>();
+}
+
+
+
 /* ================================================================================================
  *  LEAST SQUARES TESTS
  * ================================================================================================ */
 class LeastSquaresTest : public testing::Test {
 protected:
-    LeastSquaresTest() {}
+    LeastSquaresTest() {
+    }
 
-    virtual ~LeastSquaresTest() {}
+    virtual ~LeastSquaresTest() {
+    }
 };
 
 /* ---------------------------------------
@@ -943,12 +1012,14 @@ protected:
 TEMPLATE_WITH_TYPE_T
 void tensorLeastSquares1(T epsilon) {
     // TODO test with tall matrices too
-    std::vector<T> aData = {1, 2,
-                            3, 4,
-                            7, 8,
-                            9, 10,
-                            6, 8,
-                            -9, 20};
+    std::vector<T> aData = {
+        1, 2,
+        3, 4,
+        7, 8,
+        9, 10,
+        6, 8,
+        -9, 20
+    };
     std::vector<T> bData = {1, 1, -1, 2, 30, -80};
     DTensor<T> A0(aData, 2, 2, 3);
     DTensor<T> A(A0);
@@ -973,9 +1044,11 @@ TEST_F(LeastSquaresTest, tensorLS1) {
  * ================================================================================================ */
 class SvdTest : public testing::Test {
 protected:
-    SvdTest() {}
+    SvdTest() {
+    }
 
-    virtual ~SvdTest() {}
+    virtual ~SvdTest() {
+    }
 };
 
 /* ---------------------------------------
@@ -983,11 +1056,14 @@ protected:
  * and matrix rank
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void singularValuesComputation(float epsilon) {
-    std::vector<T> bData = {1, 6, 6, 6, 6, 6, 6, 6,
-                            2, 7, 7, 7, 7, 7, 7, 7,
-                            3, 8, 8, 8, 8, 8, 8, 8,};
+    std::vector<T> bData = {
+        1, 6, 6, 6, 6, 6, 6, 6,
+        2, 7, 7, 7, 7, 7, 7, 7,
+        3, 8, 8, 8, 8, 8, 8, 8,
+    };
     DTensor<T> B(bData, 8, 3);
     Svd<T> svd(B, true, false);
     svd.factorise();
@@ -1012,11 +1088,14 @@ TEST_F(SvdTest, singularValuesComputation) {
  * Singular values - memory mgmt
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void singularValuesMemory(float epsilon) {
-    std::vector<T> bData = {1, 6, 6, 6, 6, 6, 6, 6,
-                            2, 7, 7, 7, 7, 7, 7, 7,
-                            3, 8, 8, 8, 8, 8, 8, 8,};
+    std::vector<T> bData = {
+        1, 6, 6, 6, 6, 6, 6, 6,
+        2, 7, 7, 7, 7, 7, 7, 7,
+        3, 8, 8, 8, 8, 8, 8, 8,
+    };
     DTensor<T> B(bData, 8, 3);
     Svd<T> svd(B, true, false);
     svd.factorise();
@@ -1044,7 +1123,8 @@ TEST_F(SvdTest, singularValuesMemory) {
 /* ---------------------------------------
  * SVD with multiple matrices
  * --------------------------------------- */
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void singularValuesMultipleMatrices(float epsilon) {
     std::vector<T> aData = {1, 2, 3, 4, 5, 6, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1};
     DTensor<T> A(aData, 3, 2, 3);
@@ -1054,31 +1134,35 @@ void singularValuesMultipleMatrices(float epsilon) {
     DTensor<T> const &V = svd.rightSingularVectors();
     auto Uopt = svd.leftSingularVectors();
     auto U = Uopt.value();
-    std::vector<T> expected_v = {-0.386317703118612, -0.922365780077058, -0.922365780077058, 0.386317703118612,
-                                 -0.447213595499958, -0.894427190999916, 0.894427190999916, -0.447213595499958,
-                                 0, -1, 1, 0};
+    std::vector<T> expected_v = {
+        -0.386317703118612, -0.922365780077058, -0.922365780077058, 0.386317703118612,
+        -0.447213595499958, -0.894427190999916, 0.894427190999916, -0.447213595499958,
+        0, -1, 1, 0
+    };
     std::vector<T> actual_v(12);
     V.download(actual_v);
-    for (size_t i = 0; i < 4; i++) EXPECT_NEAR(expected_v[i], actual_v[i], epsilon);
+    for (size_t i = 0; i < 4; i++)
+        EXPECT_NEAR(expected_v[i], actual_v[i], epsilon);
     std::vector<T> expected_s = {9.508032000695726, 0.772869635673484, 3.872983346207417, 0, 1, 0};
     std::vector<T> actual_s(6);
     S.download(actual_s);
-    for (size_t i = 0; i < 6; i++) EXPECT_NEAR(expected_s[i], actual_s[i], epsilon);
+    for (size_t i = 0; i < 6; i++)
+        EXPECT_NEAR(expected_s[i], actual_s[i], epsilon);
     std::vector<T> expected_u = {
-            -0.428667133548626, -0.566306918848035, -0.703946704147444,
-            0.805963908589298, 0.112382414096594, -0.581199080396110,
-            0.408248290463863, -0.816496580927726, 0.408248290463863,
-            -0.577350269189626, -0.577350269189626, -0.577350269189626,
-            0.816496580927726, -0.408248290463863, -0.408248290463863,
-            0.000000000000000, -0.707106781186548, 0.707106781186547,
-            0, 0, -1,
-            1, 0, 0,
-            0, -1, 0,
+        -0.428667133548626, -0.566306918848035, -0.703946704147444,
+        0.805963908589298, 0.112382414096594, -0.581199080396110,
+        0.408248290463863, -0.816496580927726, 0.408248290463863,
+        -0.577350269189626, -0.577350269189626, -0.577350269189626,
+        0.816496580927726, -0.408248290463863, -0.408248290463863,
+        0.000000000000000, -0.707106781186548, 0.707106781186547,
+        0, 0, -1,
+        1, 0, 0,
+        0, -1, 0,
     };
     std::vector<T> actual_u(27);
     U->download(actual_u);
-    for (size_t i = 0; i < 27; i++) EXPECT_NEAR(expected_u[i], actual_u[i], epsilon);
-
+    for (size_t i = 0; i < 27; i++)
+        EXPECT_NEAR(expected_u[i], actual_u[i], epsilon);
 }
 
 TEST_F(SvdTest, singularValuesMultipleMatrices) {
@@ -1091,11 +1175,14 @@ TEST_F(SvdTest, singularValuesMultipleMatrices) {
  * SVD for rank computation of multiple
  * matrices
  * --------------------------------------- */
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void singularValuesRankMultipleMatrices(float epsilon) {
-    std::vector<T> aData = {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 0,
-                            1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12,
-                            1, 2, 3, 4, 2, 4, 6, 8, 3, 6, 9, 12};
+    std::vector<T> aData = {
+        1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 0,
+        1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12,
+        1, 2, 3, 4, 2, 4, 6, 8, 3, 6, 9, 12
+    };
     DTensor<T> A(aData, 4, 3, 3);
 
     Svd<T> svd(A);
@@ -1116,9 +1203,11 @@ TEST_F(SvdTest, singularValuesRankMultipleMatrices) {
  * ================================================================================================ */
 class CholeskyTest : public testing::Test {
 protected:
-    CholeskyTest() {}
+    CholeskyTest() {
+    }
 
-    virtual ~CholeskyTest() {}
+    virtual ~CholeskyTest() {
+    }
 };
 
 
@@ -1126,11 +1215,14 @@ protected:
  * Cholesky factorisation
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void choleskyFactorisation(T epsilon) {
-    std::vector<T> aData = {10.0, 2.0, 3.0,
-                            2.0, 20.0, -1.0,
-                            3.0, -1.0, 30.0};
+    std::vector<T> aData = {
+        10.0, 2.0, 3.0,
+        2.0, 20.0, -1.0,
+        3.0, -1.0, 30.0
+    };
     DTensor<T> A(aData, 3, 3, 1);
     CholeskyFactoriser<T> chol(A);
     chol.factorise();
@@ -1149,11 +1241,14 @@ TEST_F(CholeskyTest, choleskyFactorisation) {
  * Cholesky factorisation: solve system
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void choleskyFactorisationSolution(T epsilon) {
-    std::vector<T> aData = {10.0, 2.0, 3.0,
-                            2.0, 20.0, -1.0,
-                            3.0, -1.0, 30.0};
+    std::vector<T> aData = {
+        10.0, 2.0, 3.0,
+        2.0, 20.0, -1.0,
+        3.0, -1.0, 30.0
+    };
     DTensor<T> A(aData, 3, 3, 1);
     DTensor<T> L(A); // L = A
     CholeskyFactoriser<T> chol(L);
@@ -1167,12 +1262,12 @@ void choleskyFactorisationSolution(T epsilon) {
     std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
     std::vector<T> actual(3);
     sol.download(actual);
-    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon);
+    for (size_t i = 0; i < 3; i++)
+        EXPECT_NEAR(expected[i], actual[i], epsilon);
 
     DTensor<T> error = A * sol;
     error -= rhs;
     EXPECT_TRUE(error.normF() < epsilon);
-
 }
 
 TEST_F(CholeskyTest, choleskyFactorisationSolution) {
@@ -1184,11 +1279,14 @@ TEST_F(CholeskyTest, choleskyFactorisationSolution) {
  * Batched Cholesky factorisation
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void choleskyBatchFactorisation(T epsilon) {
-    std::vector<T> aData = {10.0, 2.0, 3.0,
-                            2.0, 20.0, -1.0,
-                            3.0, -1.0, 30.0};
+    std::vector<T> aData = {
+        10.0, 2.0, 3.0,
+        2.0, 20.0, -1.0,
+        3.0, -1.0, 30.0
+    };
     DTensor<T> A(3, 3, 2);
     DTensor<T> A0(A, 2, 0, 0);
     DTensor<T> A1(A, 2, 1, 1);
@@ -1217,11 +1315,14 @@ TEST_F(CholeskyTest, choleskyBatchFactorisation) {
  * Batched Cholesky solve
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void choleskyBatchFactorSolve(T epsilon) {
-    std::vector<T> aData = {10.0, 2.0, 3.0,
-                            2.0, 20.0, -1.0,
-                            3.0, -1.0, 30.0};
+    std::vector<T> aData = {
+        10.0, 2.0, 3.0,
+        2.0, 20.0, -1.0,
+        3.0, -1.0, 30.0
+    };
     DTensor<T> A(3, 3, 2);
     DTensor<T> A0(A, 2, 0, 0);
     DTensor<T> A1(A, 2, 1, 1);
@@ -1241,8 +1342,10 @@ void choleskyBatchFactorSolve(T epsilon) {
     std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
     std::vector<T> actual(6);
     sol.download(actual);
-    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon);  // 0
-    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon);  // 1
+    for (size_t i = 0; i < 3; i++)
+        EXPECT_NEAR(expected[i], actual[i], epsilon); // 0
+    for (size_t i = 0; i < 3; i++)
+        EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1
     DTensor<T> error = A * sol;
     error -= rhs;
     EXPECT_TRUE(error.normF() < epsilon);
@@ -1258,19 +1361,24 @@ TEST_F(CholeskyTest, choleskyBatchFactorSolve) {
  * Batched Cholesky solve (factor provided)
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void choleskyBatchSolve(T epsilon) {
-    std::vector<T> aData = {10.0, 2.0, 3.0,
-                            2.0, 20.0, -1.0,
-                            3.0, -1.0, 30.0};
+    std::vector<T> aData = {
+        10.0, 2.0, 3.0,
+        2.0, 20.0, -1.0,
+        3.0, -1.0, 30.0
+    };
     DTensor<T> A(3, 3, 2);
     DTensor<T> A0(A, 2, 0, 0);
     DTensor<T> A1(A, 2, 1, 1);
     A0.upload(aData);
     A1.upload(aData);
-    std::vector<T> lowData = {3.162277660168380, 0, 0,
-                              0.632455532033676, 4.427188724235731, 0,
-                              0.948683298050514, -0.361403161162101, 5.382321781081287};  // from matlab
+    std::vector<T> lowData = {
+        3.162277660168380, 0, 0,
+        0.632455532033676, 4.427188724235731, 0,
+        0.948683298050514, -0.361403161162101, 5.382321781081287
+    }; // from matlab
     DTensor<T> low(3, 3, 2);
     DTensor<T> low0(low, 2, 0, 0);
     DTensor<T> low1(low, 2, 1, 1);
@@ -1289,8 +1397,10 @@ void choleskyBatchSolve(T epsilon) {
     std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
     std::vector<T> actual(6);
     sol.download(actual);
-    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon);  // 0
-    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon);  // 1
+    for (size_t i = 0; i < 3; i++)
+        EXPECT_NEAR(expected[i], actual[i], epsilon); // 0
+    for (size_t i = 0; i < 3; i++)
+        EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1
     DTensor<T> error = A * sol;
     error -= rhs;
     EXPECT_TRUE(error.normF() < epsilon);
@@ -1308,9 +1418,11 @@ TEST_F(CholeskyTest, choleskyBatchSolve) {
  * ================================================================================================ */
 class QRTest : public testing::Test {
 protected:
-    QRTest() {}
+    QRTest() {
+    }
 
-    virtual ~QRTest() {}
+    virtual ~QRTest() {
+    }
 };
 
 
@@ -1318,7 +1430,8 @@ protected:
  * QR factorisation
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void qrFactorisation(T epsilon) {
     size_t nR = 4;
     size_t nC = 3;
@@ -1349,7 +1462,8 @@ TEST_F(QRTest, qrFactorisation) {
  * - tall and skinny matrix
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void qrFactorisationTall(T epsilon) {
     size_t nR = 20;
     size_t nC = 3;
@@ -1379,19 +1493,24 @@ TEST_F(QRTest, qrFactorisationTall) {
  * QR factorisation: solve least squares
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void qrLeastSquares(T epsilon) {
     size_t nR = 4;
     size_t nC = 3;
     DTensor<T> temp(nR, nC);
-    std::vector<T> vecA = {85.5638, -59.4001, -80.1992,
-                           99.9464, 5.51393, 5.17935,
-                           6.87488, -26.7536, 36.0914,
-                           -44.3857, -32.1268, 54.8915};  // Random matrix
-    std::vector<T> vecB = {-23.3585,
-                           -48.5744,
-                           43.4229,
-                           -56.5081};  // Random vector
+    std::vector<T> vecA = {
+        85.5638, -59.4001, -80.1992,
+        99.9464, 5.51393, 5.17935,
+        6.87488, -26.7536, 36.0914,
+        -44.3857, -32.1268, 54.8915
+    }; // Random matrix
+    std::vector<T> vecB = {
+        -23.3585,
+        -48.5744,
+        43.4229,
+        -56.5081
+    }; // Random vector
     DTensor<T> A(vecA, nR, nC, 1, rowMajor);
     DTensor<T> b(vecB, nR);
     DTensor<T> xFull(nR);
@@ -1407,7 +1526,7 @@ void qrLeastSquares(T epsilon) {
     Ax.addAB(A, x);
     Ax -= b;
     T nrm = Ax.normF();
-    EXPECT_NEAR(nrm, 80.003169364198072, epsilon);  // From MatLab
+    EXPECT_NEAR(nrm, 80.003169364198072, epsilon); // From MatLab
 }
 
 TEST_F(QRTest, qrLeastSquares) {
@@ -1421,9 +1540,11 @@ TEST_F(QRTest, qrLeastSquares) {
  * ================================================================================================ */
 class NullspaceTest : public testing::Test {
 protected:
-    NullspaceTest() {}
+    NullspaceTest() {
+    }
 
-    virtual ~NullspaceTest() {}
+    virtual ~NullspaceTest() {
+    }
 };
 
 
@@ -1431,13 +1552,16 @@ protected:
  * Basic nullspace test
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void computeNullspaceTensor(T epsilon) {
-    std::vector<T> aData = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0,
-                            1, 2, 3, 4, 5, 6, 7, 8, 9, 7, 8, 9,
-                            1, 2, 3, 4, 2, 4, 6, 8, 3, 6, 9, 12,
-                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    std::vector<T> aData = {
+        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0,
+        1, 2, 3, 4, 5, 6, 7, 8, 9, 7, 8, 9,
+        1, 2, 3, 4, 2, 4, 6, 8, 3, 6, 9, 12,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+    };
     DTensor<T> A(aData, 3, 4, 5);
     Nullspace<T> ns(A);
     DTensor<T> nA = ns.nullspace();
@@ -1471,14 +1595,17 @@ TEST_F(NullspaceTest, computeNullspaceTensor) {
  * Nullspace is trivial
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void computeNullspaceTrivial(T epsilon) {
-    std::vector<T> data{4, 5, 7,
-                        4, 1, 8,
-                        4, 5, 0,
-                        1, 1, 1,
-                        5, 6, 7,
-                        9, 0, 3};
+    std::vector<T> data{
+        4, 5, 7,
+        4, 1, 8,
+        4, 5, 0,
+        1, 1, 1,
+        5, 6, 7,
+        9, 0, 3
+    };
     DTensor<T> A(data, 3, 3, 2, rowMajor);
     Nullspace<T> nullA(A);
     DTensor<T> N = nullA.nullspace();
@@ -1494,14 +1621,17 @@ TEST_F(NullspaceTest, computeNullspaceTrivial) {
  * Project onto nullspace
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void projectOnNullspaceTensor(T epsilon) {
     // offline
     size_t m = 3;
     size_t n = 7;
-    std::vector<T> mat{1, -2, 3, 4, -1, -1, -1,
-                       1, 2, -3, 4, -1, -1, -1,
-                       -1, 3, 5, -7, -1, -1, -1};
+    std::vector<T> mat{
+        1, -2, 3, 4, -1, -1, -1,
+        1, 2, -3, 4, -1, -1, -1,
+        -1, 3, 5, -7, -1, -1, -1
+    };
     DTensor<T> A(m, n, 1);
     A.upload(mat, rowMajor);
     Nullspace<T> ns = Nullspace(A);
@@ -1538,9 +1668,11 @@ TEST_F(NullspaceTest, projectOnNullspaceTensor) {
  * ================================================================================================ */
 class GivensAnnihilatorTest : public testing::Test {
 protected:
-    GivensAnnihilatorTest() {}
+    GivensAnnihilatorTest() {
+    }
 
-    virtual ~GivensAnnihilatorTest() {}
+    virtual ~GivensAnnihilatorTest() {
+    }
 };
 
 
@@ -1548,7 +1680,8 @@ protected:
  * GivensAnnihilator works
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void givensAnnihilateElement(T epsilon) {
     size_t m = 10;
     size_t n = 6;
@@ -1573,12 +1706,12 @@ TEST_F(GivensAnnihilatorTest, givensAnnihilateElement) {
 }
 
 
-
 /* ---------------------------------------
  * GivensAnnihilator: correctness
  * --------------------------------------- */
 
-TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
 void givensAnnihilateCorrectness(T epsilon) {
     size_t m = 10, n = 6;
     std::vector<double> v(m * n);
@@ -1593,13 +1726,9 @@ void givensAnnihilateCorrectness(T epsilon) {
     EXPECT_NEAR(2.137186834969645, a(0, 0), epsilon);
     EXPECT_NEAR(44.552125559751822, a(0, 3), epsilon);
     EXPECT_NEAR(-0.328797974610715, a(1, 3), epsilon);
-
 }
 
 TEST_F(GivensAnnihilatorTest, givensAnnihilateCorrectness) {
     givensAnnihilateCorrectness<double>(1e-14);
     givensAnnihilateCorrectness<float>(1e-12);
 }
-
-
-