diff --git a/include/tensor.cuh b/include/tensor.cuh
index 0a8d1d1..8007209 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -181,16 +181,36 @@ class DTensor {
 private:
     T *m_d_data = nullptr; ///< Pointer to device data
+    T **m_d_ptrMatrices = nullptr; ///< Pointer to matrices in tensor
     size_t m_numRows = 0; ///< Number of rows
     size_t m_numCols = 0; ///< Number of columns
     size_t m_numMats = 0; ///< Number of matrices
-    bool m_doDestroy = false; ///< Whether to destroy memory
+    bool m_doDestroyData = false; ///< Whether to destroy m_d_data
+    bool m_doDestroyPointersToMatrices = false; ///< Whether to destroy m_d_ptrMatrices
+
+    void makePointersToMatrices() {
+        std::vector<T *> h_pointers(m_numMats);
+        size_t numelMat = m_numRows * m_numCols;
+        h_pointers[0] = m_d_data;
+        for (size_t i = 1; i < m_numMats; i++) {
+            h_pointers[i] = m_d_data + i * numelMat;
+        }
+        size_t buffer_size = m_numMats * sizeof(T *);
+        gpuErrChk(cudaMemcpy(m_d_ptrMatrices, h_pointers.data(), buffer_size, cudaMemcpyHostToDevice));
+        m_doDestroyPointersToMatrices = true;
+    }
 
     bool destroy() {
-        if (!m_doDestroy) return false;
-        if (m_d_data) cudaFree(m_d_data);
-        m_d_data = nullptr;
-        return true;
+        bool willDestroy = m_doDestroyData || m_doDestroyPointersToMatrices;
+        if (m_doDestroyData) {
+            if (m_d_data) gpuErrChk(cudaFree(m_d_data));
+            m_d_data = nullptr;
+        }
+        if (m_doDestroyPointersToMatrices) {
+            if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices));
+            m_d_ptrMatrices = nullptr;
+        }
+        return willDestroy;
     }
 
     /**
@@ -286,6 +306,10 @@ public:
      * @param axis axis to slice (0=rows, 1=columns, 2=matrices)
      * @param from index to slice axis from (zero-indexed)
      * @param to index to slice axis to (inclusive)
+     *
+     * @warning If axis=0 or axis=2, this method will (i) allocate memory on the GPU
+     * to store one element of size T* (pointer-to-T), and (ii) transfer one such
+     * element from the host to the device.
      */
     DTensor(const DTensor &other, size_t axis, size_t from, size_t to);
 
@@ -294,6 +318,10 @@
      */
     T *raw() const;
 
+    T **ptrMatrices() const {
+        return m_d_ptrMatrices;
+    }
+
     /**
      * @return number of rows
      */
@@ -515,9 +543,8 @@ DTensor<T> DTensor<T>::createRandomTensor(size_t numRows, size_t numCols, size_t
         auto randVec = generateIntRandomVector(numRows * numCols * numMats, low, hi);
         DTensor<T> a(randVec, numRows, numCols, numMats);
         return a;
-    } else {
-        throw std::invalid_argument("[createRandomTensor] unsupported type T");
     }
+    throw std::invalid_argument("[createRandomTensor] unsupported type T");
 }
 
 template<typename T>
@@ -533,6 +560,7 @@ void DTensor<T>::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats
     m_numRows = newNumRows;
     m_numCols = newNumCols;
     m_numMats = newNumMats;
+    // TODO create
 }
 
 template<typename T>
@@ -542,6 +570,7 @@ DTensor<T>::DTensor(size_t m, size_t n, size_t k, bool zero) {
     m_numMats = k;
     size_t size = m * n * k;
     allocateOnDevice(size, zero);
+    makePointersToMatrices();
 }
 
 template<typename T>
@@ -552,6 +581,7 @@ DTensor<T>::DTensor(const std::vector<T> &data, size_t m, size_t n, size_t k, St
     size_t size = m * n * k;
     allocateOnDevice(size);
     upload(data, mode);
+    makePointersToMatrices();
 }
 
 template<typename T>
@@ -563,6 +593,7 @@ DTensor<T>::DTensor(const DTensor<T> &other) {
     allocateOnDevice(m_numRows * m_numCols * m_numMats);
     gpuErrChk(cudaMemcpy(m_d_data, other.raw(), m_numRows * m_numCols * m_numMats * sizeof(T),
                          cudaMemcpyDeviceToDevice));
+    makePointersToMatrices();
 }
 
 template<typename T>
@@ -574,6 +605,7 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
         m_numRows = other.m_numRows;
         m_numCols = other.m_numCols;
         m_numMats = len;
+        m_d_ptrMatrices = other.m_d_ptrMatrices + from;
     } else if (axis == 1) {
         offset = other.m_numRows * from;
         m_numRows = other.m_numRows;
@@ -586,7 +618,12 @@ DTensor<T>::DTensor(const DTensor<T> &other, size_t axis, size_t from, size_t to
         m_numMats = 1;
     }
     m_d_data = other.m_d_data + offset;
-    m_doDestroy = false;
+    m_doDestroyData = false; // no new memory allocated!
+    m_doDestroyPointersToMatrices = false; // no new auxiliary memory allocated!
+    if (axis != 2) {
+        // m_d_ptrMatrices is not needed for vectors and matrices
+        m_d_ptrMatrices = nullptr;
+    }
 }
 
 template<typename T>
@@ -595,9 +632,11 @@ DTensor<T>::DTensor(DTensor<T> &&other) {
     m_numRows = other.m_numRows;
     m_numMats = other.m_numMats;
     m_d_data = other.m_d_data;
-    m_doDestroy = true;
-    other.m_doDestroy = false;
+    m_d_ptrMatrices = other.m_d_ptrMatrices;
+    m_doDestroyData = true;
+    other.m_doDestroyData = false;
     other.m_d_data = nullptr;
+    other.m_d_ptrMatrices = nullptr;
     other.m_numCols = 0;
     other.m_numRows = 0;
     other.m_numMats = 0;
@@ -757,12 +796,19 @@ template<typename T>
 inline bool DTensor<T>::allocateOnDevice(size_t size, bool zero) {
     if (size <= 0) return false;
     destroy();
-    m_doDestroy = true;
-    size_t buffer_size = size * sizeof(T);
-    bool cudaStatus = cudaMalloc(&m_d_data, buffer_size);
+    m_doDestroyData = true;
+    /* Allocate memory for m_d_data */
+    size_t data_size_bytes = size * sizeof(T);
+    bool cudaStatus = cudaMalloc(&m_d_data, data_size_bytes);
     if (cudaStatus != cudaSuccess) return false;
-    if (zero) gpuErrChk(cudaMemset(m_d_data, 0, buffer_size)); // set to zero all elements
-    return true;
+    if (zero) gpuErrChk(cudaMemset(m_d_data, 0, data_size_bytes)); // set to zero all elements
+
+    /* Allocate memory for m_d_ptrMatrices */
+    size_t ptr_matrices_bytes = m_numMats * sizeof(T *);
+    cudaStatus = cudaMalloc(&m_d_ptrMatrices, ptr_matrices_bytes);
+    m_doDestroyPointersToMatrices = true;
+
+    return (cudaStatus == cudaSuccess);
 }
 
 template<typename T>
@@ -798,6 +844,7 @@ inline T *DTensor<T>::raw() const {
     return m_d_data;
 }
 
+
 template<>
 inline DTensor<double> DTensor<double>::tr() const {
     DTensor<double> transposes(m_numCols, m_numRows, m_numMats);
@@ -839,6 +886,12 @@ inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
                          m_d_data,
                          m_numRows * m_numCols * m_numMats * sizeof(T),
                          cudaMemcpyDeviceToDevice));
+    if (m_d_ptrMatrices) {
+        gpuErrChk(cudaMemcpy(elsewhere.m_d_ptrMatrices,
+                             m_d_ptrMatrices,
+                             m_numMats * sizeof(T *),
+                             cudaMemcpyDeviceToDevice));
+    }
 }
 
 template<>
@@ -854,7 +907,7 @@ DTensor<T> &DTensor<T>::operator=(const DTensor<T> &other) {
     m_numMats = other.m_numMats;
     m_numRows = other.m_numRows;
     m_numCols = other.m_numCols;
-    m_doDestroy = false;
+    m_doDestroyData = false;
     m_d_data = other.m_d_data;
     return *this;
 }
@@ -910,17 +963,17 @@ inline T DTensor<T>::operator()(size_t i, size_t j, size_t k) const {
     return hostDst;
 }
 
-template<typename T>
-inline DTensor<T *> DTensor<T>::pointersToMatrices() const {
-    std::vector<T *> h_pointers(m_numMats);
-    size_t numelMat = m_numRows * m_numCols;
-    h_pointers[0] = m_d_data;
-    for (size_t i = 1; i < m_numMats; i++) {
-        h_pointers[i] = m_d_data + i * numelMat;
-    }
-    DTensor<T *> t(h_pointers, m_numMats, 1, 1);
-    return t;
-}
+//template<typename T>
+//inline DTensor<T *> DTensor<T>::pointersToMatrices() const {
+//    std::vector<T *> h_pointers(m_numMats);
+//    size_t numelMat = m_numRows * m_numCols;
+//    h_pointers[0] = m_d_data;
+//    for (size_t i = 1; i < m_numMats; i++) {
+//        h_pointers[i] = m_d_data + i * numelMat;
+//    }
+//    DTensor<T *> t(h_pointers, m_numMats, 1, 1);
+//    return t;
+//}
 
 template<>
 inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<double> &B, double alpha, double beta) {
@@ -928,18 +981,25 @@ inline void DTensor<double>::addAB(const DTensor<double> &A, const DTensor<doub
     size_t nRA = A.numRows();
     size_t nCA = A.numCols();
     size_t nCB = B.numCols();
-    DTensor<double *> ptrA = A.pointersToMatrices();
-    DTensor<double *> ptrB = B.pointersToMatrices();
-    DTensor<double *> ptr = pointersToMatrices();
     double _alpha = alpha, _beta = beta;
-    gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
-                                 CUBLAS_OP_N, CUBLAS_OP_N,
-                                 nRA, nCB, nCA, &_alpha,
-                                 ptrA.raw(), nRA,
-                                 ptrB.raw(), nCA,
-                                 &_beta,
-                                 ptr.raw(), nRA,
-                                 nMat));
+    if (nMat > 1) {
+        gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(),
+                                     CUBLAS_OP_N, CUBLAS_OP_N,
+                                     nRA, nCB, nCA, &_alpha,
+                                     A.m_d_ptrMatrices, nRA,
+                                     B.m_d_ptrMatrices, nCA,
+                                     &_beta,
+                                     m_d_ptrMatrices, nRA,
+                                     nMat));
+    } else {
+        gpuErrChk(cublasDgemm(Session::getInstance().cuBlasHandle(),
+                              CUBLAS_OP_N, CUBLAS_OP_N,
+                              nRA, nCB, nCA, &_alpha,
+                              A.raw(), nRA,
+                              B.raw(), nCA,
+                              &_beta,
+                              raw(), nRA));
+    }
 }
 
 template<>
@@ -948,18 +1008,25 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float>
     size_t nRA = A.numRows();
     size_t nCA = A.numCols();
     size_t nCB = B.numCols();
-    DTensor<float *> ptrA = A.pointersToMatrices();
-    DTensor<float *> ptrB = B.pointersToMatrices();
-    DTensor<float *> ptr = pointersToMatrices();
     float _alpha = alpha, _beta = beta;
-    gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
-                                 CUBLAS_OP_N, CUBLAS_OP_N,
-                                 nRA, nCB, nCA, &_alpha,
-                                 ptrA.raw(), nRA,
-                                 ptrB.raw(), nCA,
-                                 &_beta,
-                                 ptr.raw(), nRA,
-                                 nMat));
+    if (nMat > 1) {
+        gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(),
+                                     CUBLAS_OP_N, CUBLAS_OP_N,
+                                     nRA, nCB, nCA, &_alpha,
+                                     A.m_d_ptrMatrices, nRA,
+                                     B.m_d_ptrMatrices, nCA,
+                                     &_beta,
+                                     m_d_ptrMatrices, nRA,
+                                     nMat));
+    } else {
+        gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(),
+                              CUBLAS_OP_N, CUBLAS_OP_N,
+                              nRA, nCB, nCA, &_alpha,
+                              A.raw(), nRA,
+                              B.raw(), nCA,
+                              &_beta,
+                              raw(), nRA));
+    }
 }
 
 template<>
@@ -976,16 +1043,14 @@ inline void DTensor<double>::leastSquaresBatched(DTensor<double> &B) {
         throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
     int info = 0;
     DTensor<int> infoArray(batchSize);
-    DTensor<double *> As = pointersToMatrices();
-    DTensor<double *> Bs = B.pointersToMatrices();
     gpuErrChk(cublasDgelsBatched(Session::getInstance().cuBlasHandle(),
                                  CUBLAS_OP_N,
                                  m_numRows,
                                  m_numCols,
                                  nColsB,
-                                 As.raw(),
+                                 m_d_ptrMatrices,
                                  m_numRows,
-                                 Bs.raw(),
+                                 B.m_d_ptrMatrices,
                                  m_numRows,
                                  &info,
                                  infoArray.raw(),
@@ -1006,16 +1071,14 @@ inline void DTensor<float>::leastSquaresBatched(DTensor<float> &B) {
         throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
    int info = 0;
     DTensor<int> infoArray(batchSize);
-    DTensor<float *> As = pointersToMatrices();
-    DTensor<float *> Bs = B.pointersToMatrices();
     gpuErrChk(cublasSgelsBatched(Session::getInstance().cuBlasHandle(),
                                  CUBLAS_OP_N,
                                  m_numRows,
                                  m_numCols,
                                  nColsB,
-                                 As.raw(),
+                                 m_d_ptrMatrices,
                                  m_numRows,
-                                 Bs.raw(),
+                                 B.m_d_ptrMatrices,
                                  m_numRows,
                                  &info,
                                  infoArray.raw(),
@@ -1775,11 +1838,10 @@ public:
 template<>
 inline void CholeskyBatchFactoriser<double>::factorise() {
     if (m_factorisationDone) return;
-    DTensor<double *> ptrA = m_matrix->pointersToMatrices();
     gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
@@ -1789,11 +1851,10 @@ inline void CholeskyBatchFactoriser<float>::factorise() {
     if (m_factorisationDone) return;
-    DTensor<float *> ptrA = m_matrix->pointersToMatrices();
     gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
@@ -1807,15 +1868,13 @@ inline void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
         throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
     }
     if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
-    DTensor<double *> ptrA = m_matrix->pointersToMatrices();
-    DTensor<double *> ptrB = b.pointersToMatrices();
     gpuErrChk(cusolverDnDpotrsBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
                                       1, ///< only supports rhs = 1
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
-                                      ptrB.raw(),
+                                      b.ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
@@ -1828,15 +1887,13 @@ inline void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
         throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
     }
     if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
-    DTensor<float *> ptrA = m_matrix->pointersToMatrices();
-    DTensor<float *> ptrB = b.pointersToMatrices();
     gpuErrChk(cusolverDnSpotrsBatched(Session::getInstance().cuSolverHandle(),
                                       CUBLAS_FILL_MODE_LOWER,
                                       m_numRows,
                                       1, ///< only supports rhs = 1
-                                      ptrA.raw(),
+                                      m_matrix->ptrMatrices(),
                                       m_numRows,
-                                      ptrB.raw(),
+                                      b.ptrMatrices(),
                                       m_numRows,
                                       m_deviceInfo->raw(),
                                       m_numMats));
diff --git a/test/testTensor.cu b/test/testTensor.cu
index 0a33ffc..3cc7580 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -635,29 +635,6 @@ TEST_F(TensorTest, tensorMinusTensor) {
     tensorMinusTensor<double>();
 }
 
-/* ---------------------------------------
- * Tensor: pointers to matrices (on device)
- * --------------------------------------- */
-
-TEMPLATE_WITH_TYPE_T
-void tensorPointersToMatrices() {
-    std::vector<T> dataA = TENSOR_DATA_234A;
-    DTensor<T> A(dataA, 2, 3, 4);
-    DTensor<T *> pointers = A.pointersToMatrices();
-    EXPECT_EQ(4, pointers.numRows());
-    EXPECT_EQ(1, pointers.numCols());
-    EXPECT_EQ(1, pointers.numMats());
-    T *p1 = pointers(1, 0, 0); // pointer to matrix #1
-    T hostDst; // let's see what's there...
-    cudaMemcpy(&hostDst, p1, sizeof(T), cudaMemcpyDeviceToHost);
-    EXPECT_EQ(dataA[6], hostDst);
-}
-
-TEST_F(TensorTest, tensorPointersToMatrices) {
-    tensorPointersToMatrices<float>();
-    tensorPointersToMatrices<double>();
-    tensorPointersToMatrices<int>();
-}
 
 /* ---------------------------------------
  * Tensor: C = AB