diff --git a/.gitignore b/.gitignore index 7fe8987..032fffb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Custom +python/*.bt + ## Ignore Visual Studio temporary files, build results, and ## files generated by popular Visual Studio add-ons. diff --git a/CHANGELOG.md b/CHANGELOG.md index 32569ba..60d3173 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## v1.8.0 - 21-03-2025 + +### Added + +- Support for counting total allocated memory via `Session::getInstance().totalAllocatedBytes()` + + diff --git a/README.md b/README.md index 5e1e99e..b6e560e 100644 --- a/README.md +++ b/README.md @@ -395,4 +395,14 @@ N.project(vectors); std::cout << vectors << "\n"; ``` +## 5. Other + +We can get the total allocated bytes (on the GPU) with +```c++ +size_t allocatedBytes = + Session::getInstance().totalAllocatedBytes(); +``` + + + ## Happy number crunching! diff --git a/include/tensor.cuh b/include/tensor.cuh index bca5706..5089ed7 100644 --- a/include/tensor.cuh +++ b/include/tensor.cuh @@ -1,3 +1,6 @@ +#ifndef TENSOR_CUH +#define TENSOR_CUH + #include #include #include @@ -13,8 +16,6 @@ #include #include -#ifndef TENSOR_CUH -#define TENSOR_CUH /** * Define defaults @@ -43,7 +44,8 @@ static std::random_device RND_DEVICE; * @param hi * @return */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX std::vector generateRealRandomVector(size_t n, T low, T hi) { std::mt19937_64 mersenne_engine(RND_DEVICE()); std::uniform_real_distribution dist(low, hi); @@ -84,24 +86,25 @@ constexpr size_t numBlocks(size_t n, size_t threads_per_block = THREADS_PER_BLOC */ #define gpuErrChk(status) { gpuAssert((status), __FILE__, __LINE__); } while(false) -TEMPLATE_WITH_TYPE_T inline void gpuAssert(T code, const char *file, int line, bool abort = true) { +TEMPLATE_WITH_TYPE_T +inline void gpuAssert(T code, const char *file, int line, bool abort = true) { if constexpr (std::is_same_v) { if (code != cudaSuccess) { std::cerr << "cuda error. String: " << cudaGetErrorString(code) - << ", file: " << file << ", line: " << line << "\n"; + << ", file: " << file << ", line: " << line << "\n"; if (abort) exit(code); } } else if constexpr (std::is_same_v) { if (code != CUBLAS_STATUS_SUCCESS) { std::cerr << "cublas error. Name: " << cublasGetStatusName(code) - << ", string: " << cublasGetStatusString(code) - << ", file: " << file << ", line: " << line << "\n"; + << ", string: " << cublasGetStatusString(code) + << ", file: " << file << ", line: " << line << "\n"; if (abort) exit(code); } } else if constexpr (std::is_same_v) { if (code != CUSOLVER_STATUS_SUCCESS) { std::cerr << "cusolver error. Status: " << code - << ", file: " << file << ", line: " << line << "\n"; + << ", file: " << file << ", line: " << line << "\n"; if (abort) exit(code); } } else { @@ -124,7 +127,6 @@ TEMPLATE_WITH_TYPE_T inline void gpuAssert(T code, const char *file, int line, b */ class Session { public: - static Session &getInstance() { static Session instance; return instance; @@ -143,7 +145,7 @@ private: cublasHandle_t m_cublasHandle; cusolverDnHandle_t m_cusolverHandle; - + size_t bytesAllocated = 0; public: Session(Session const &) = delete; @@ -153,6 +155,30 @@ public: cublasHandle_t &cuBlasHandle() { return m_cublasHandle; } cusolverDnHandle_t &cuSolverHandle() { return m_cusolverHandle; } + + /** + * Preferred method for CUDA memory allocation; it allocated memory on the device + * and counts the allocated bytes (you can then call #totalAllocatedBytes()). + * In essence, this is a wrapper for cudaMalloc. + * + * @param d pointer to memory address to be allocated + * @param s size to be allocated + * @return CUDA error + */ + cudaError_t cudaAllocate(void** d, size_t s) { + cudaError_t err = cudaMalloc(d, s); + if (err == cudaSuccess) { + bytesAllocated += s; + } + return err; + } + + /** + * @return Total allocated bytes + */ + size_t totalAllocatedBytes() const { return bytesAllocated; } + + void incrementAllocatedBytes(size_t s) { bytesAllocated += s; } }; @@ -164,8 +190,8 @@ public: * Storage mode for the data of a matrix */ enum StorageMode { - columnMajor, ///< column major storage (default) - rowMajor, ///< row major storage + columnMajor, ///< column major storage (default) + rowMajor, ///< row major storage defaultMajor = columnMajor }; @@ -179,24 +205,25 @@ enum StorageMode { */ TEMPLATE_WITH_TYPE_T class DTensor { - private: - T *m_d_data = nullptr; ///< Pointer to device data + T *m_d_data = nullptr; ///< Pointer to device data T **m_d_ptrMatrices = nullptr; ///< Pointer to matrices in tensor - size_t m_numRows = 0; ///< Number of rows - size_t m_numCols = 0; ///< Number of columns - size_t m_numMats = 0; ///< Number of matrices - bool m_doDestroyData = false; ///< Whether to destroy memory - bool m_doDestroyPtrMatrices = false; ///< Whether to destroy memory + size_t m_numRows = 0; ///< Number of rows + size_t m_numCols = 0; ///< Number of columns + size_t m_numMats = 0; ///< Number of matrices + bool m_doDestroyData = false; ///< Whether to destroy memory + bool m_doDestroyPtrMatrices = false; ///< Whether to destroy memory void destroy() { if (m_doDestroyData) { if (m_d_data) gpuErrChk(cudaFree(m_d_data)); m_d_data = nullptr; + Session::getInstance().incrementAllocatedBytes(-m_numRows * m_numCols * m_numMats * sizeof(T)); } if (m_doDestroyPtrMatrices) { if (m_d_ptrMatrices) gpuErrChk(cudaFree(m_d_ptrMatrices)); m_d_ptrMatrices = nullptr; + Session::getInstance().incrementAllocatedBytes(-m_numMats * sizeof(T *)); } } @@ -239,7 +266,6 @@ private: void initialisePointersToMatricesData(); public: - /** * Create a tensor with random elements * @param numRows number of rows @@ -559,7 +585,6 @@ public: friend std::ostream &operator<<(std::ostream &out, const DTensor &data) { return data.print(out); } - }; /* END OF DTENSOR */ template @@ -611,9 +636,12 @@ data_t vectorFromTextFile(std::string path_to_file) { if (!file.is_open()) { throw std::invalid_argument("[vectorFromTextFile] the file does not exist"); } std::string line; - getline(file, line); dataStruct.numRows = atoi(line.c_str()); - getline(file, line); dataStruct.numCols = atoi(line.c_str()); - getline(file, line); dataStruct.numMats = atoi(line.c_str()); + getline(file, line); + dataStruct.numRows = atoi(line.c_str()); + getline(file, line); + dataStruct.numCols = atoi(line.c_str()); + getline(file, line); + dataStruct.numMats = atoi(line.c_str()); size_t numElements = dataStruct.numRows * dataStruct.numCols * dataStruct.numMats; std::vector vecDataFromFile(numElements); @@ -638,7 +666,7 @@ data_t vectorFromTextFile(std::string path_to_file) { vecDataFromFile[i] = std::stoull(line.c_str()); } else if constexpr (std::is_same_v) { sscanf(line.c_str(), "%zu", &vecDataFromFile[i]); - } else { + } else { throw std::invalid_argument("data type not supported"); } @@ -675,10 +703,10 @@ template DTensor DTensor::parseFromFile(std::string path_to_file, StorageMode mode) { // Figure out file extension - size_t pathToFileLength = path_to_file.length() ; - std::string fileNameExtension = path_to_file.substr(pathToFileLength-3); + size_t pathToFileLength = path_to_file.length(); + std::string fileNameExtension = path_to_file.substr(pathToFileLength - 3); typedef data_t (*PARSER)(std::string); - PARSER parser = (fileNameExtension == ".bt") ? vectorFromBinaryFile : vectorFromTextFile; + PARSER parser = (fileNameExtension == ".bt") ? vectorFromBinaryFile : vectorFromTextFile; auto parsedData = parser(path_to_file); DTensor tensorFromData(parsedData.data, parsedData.numRows, parsedData.numCols, parsedData.numMats, mode); return tensorFromData; @@ -690,10 +718,10 @@ void DTensor::saveToFile(std::string pathToFile) { download(myData); // Figure out file extension - size_t pathToFileLength = pathToFile.length() ; - std::string fileNameExtension = pathToFile.substr(pathToFileLength-3); + size_t pathToFileLength = pathToFile.length(); + std::string fileNameExtension = pathToFile.substr(pathToFileLength - 3); // If the extension is .bt... - if (fileNameExtension == ".bt") { + if (fileNameExtension == ".bt") { uint64_t nr = (uint64_t) numRows(), nc = (uint64_t) numCols(), nm = (uint64_t) numMats(); @@ -705,13 +733,13 @@ void DTensor::saveToFile(std::string pathToFile) { for (const T &el: myData) outFile.write(reinterpret_cast(&el), sizeof(T)); outFile.close(); } else { - std::ofstream file(pathToFile); - file << numRows() << std::endl << numCols() << std::endl << numMats() << std::endl; - if constexpr (std::is_floating_point::value) { - file << std::setprecision(std::numeric_limits::max_digits10); - } - for (const T &el: myData) file << el << std::endl; - } + std::ofstream file(pathToFile); + file << numRows() << std::endl << numCols() << std::endl << numMats() << std::endl; + if constexpr (std::is_floating_point::value) { + file << std::setprecision(std::numeric_limits::max_digits10); + } + for (const T &el: myData) file << el << std::endl; + } } @@ -739,7 +767,7 @@ void DTensor::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats } /* Reallocate memory for m_d_ptrMatrices, if necessary */ if (newNumMats > 1) { - gpuErrChk(cudaMalloc(&m_d_ptrMatrices, newNumMats * sizeof(T *))); + gpuErrChk(Session::getInstance().cudaAllocate((void**)&m_d_ptrMatrices, newNumMats * sizeof(T *))); m_doDestroyPtrMatrices = true; } } @@ -781,7 +809,7 @@ DTensor::DTensor(const DTensor &other) { allocateOnDevice(m_numRows * m_numCols * m_numMats); gpuErrChk(cudaMemcpy(m_d_data, other.raw(), m_numRows * m_numCols * m_numMats * sizeof(T), - cudaMemcpyDeviceToDevice)); + cudaMemcpyDeviceToDevice)); /* Initialise m_d_ptrMatrices */ initialisePointersToMatricesData(); } @@ -860,9 +888,9 @@ inline double DTensor::dotF(const DTensor &other) { size_t n = numEl(); double result; gpuErrChk(cublasDdot(Session::getInstance().cuBlasHandle(), n, - raw(), 1, - other.raw(), 1, - &result)); + raw(), 1, + other.raw(), 1, + &result)); return result; } @@ -873,9 +901,9 @@ inline float DTensor::dotF(const DTensor &other) { size_t n = numEl(); float result; gpuErrChk(cublasSdot(Session::getInstance().cuBlasHandle(), n, - raw(), 1, - other.raw(), 1, - &result)); + raw(), 1, + other.raw(), 1, + &result)); return result; } @@ -883,7 +911,7 @@ template<> inline double DTensor::normF() const { double the_norm; gpuErrChk(cublasDnrm2(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &the_norm)); + &the_norm)); return the_norm; } @@ -891,7 +919,7 @@ template<> inline float DTensor::normF() const { float the_norm; gpuErrChk(cublasSnrm2(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &the_norm)); + &the_norm)); return the_norm; } @@ -899,7 +927,7 @@ template<> inline float DTensor::sumAbs() const { float sumAbsAllElements; gpuErrChk(cublasSasum(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &sumAbsAllElements)); + &sumAbsAllElements)); return sumAbsAllElements; } @@ -907,7 +935,7 @@ template<> inline double DTensor::sumAbs() const { double sumAbsAllElements; gpuErrChk(cublasDasum(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &sumAbsAllElements)); + &sumAbsAllElements)); return sumAbsAllElements; } @@ -916,7 +944,7 @@ inline float DTensor::maxAbs() const { int idx; float hostDst; gpuErrChk(cublasIsamax(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &idx)); + &idx)); gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(float), cudaMemcpyDeviceToHost)); return std::signbit(hostDst) ? -hostDst : hostDst; } @@ -926,7 +954,7 @@ inline double DTensor::maxAbs() const { int idx; double hostDst; gpuErrChk(cublasIdamax(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &idx)); + &idx)); gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(double), cudaMemcpyDeviceToHost)); return std::signbit(hostDst) ? -hostDst : hostDst; } @@ -936,7 +964,7 @@ inline float DTensor::minAbs() const { int idx; float hostDst; gpuErrChk(cublasIsamin(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &idx)); + &idx)); gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(float), cudaMemcpyDeviceToHost)); return std::signbit(hostDst) ? -hostDst : hostDst; } @@ -946,7 +974,7 @@ inline double DTensor::minAbs() const { int idx; double hostDst; gpuErrChk(cublasIdamin(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, m_d_data, 1, - &idx)); + &idx)); gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(double), cudaMemcpyDeviceToHost)); return std::signbit(hostDst) ? -hostDst : hostDst; } @@ -958,10 +986,10 @@ void DTensor::applyRightGivensRotation(size_t i, size_t j, const T *c, const T *col_j = m_d_data + j * m_numRows; if constexpr (std::is_same_v) { gpuErrChk(cublasDrot(Session::getInstance().cuBlasHandle(), m_numRows, - col_i, 1, col_j, 1, c, minus_s)); + col_i, 1, col_j, 1, c, minus_s)); } else if constexpr (std::is_same_v) { gpuErrChk(cublasSrot(Session::getInstance().cuBlasHandle(), m_numRows, - col_i, 1, col_j, 1, c, minus_s)); + col_i, 1, col_j, 1, c, minus_s)); } } @@ -970,14 +998,14 @@ void DTensor::applyLeftGivensRotation(size_t i, size_t j, const T *c, const T if (m_numMats > 1) throw std::invalid_argument("[applyLeftGivensRotation] tensors (nMat>1) not supported"); if constexpr (std::is_same_v) { gpuErrChk(cublasDrot(Session::getInstance().cuBlasHandle(), m_numCols, - m_d_data + i, m_numRows, - m_d_data + j, m_numRows, - c, minus_s)); + m_d_data + i, m_numRows, + m_d_data + j, m_numRows, + c, minus_s)); } else if constexpr (std::is_same_v) { gpuErrChk(cublasSrot(Session::getInstance().cuBlasHandle(), m_numCols, - m_d_data + i, m_numRows, - m_d_data + j, m_numRows, - c, minus_s)); + m_d_data + i, m_numRows, + m_d_data + j, m_numRows, + c, minus_s)); } else { throw std::invalid_argument("[applyLeftGivensRotation] Unsupported type T"); } @@ -990,12 +1018,12 @@ inline void DTensor::allocateOnDevice(size_t size, bool zero) { destroy(); m_doDestroyData = true; size_t buffer_size = size * sizeof(T); - gpuErrChk(cudaMalloc(&m_d_data, buffer_size)); + gpuErrChk(Session::getInstance().cudaAllocate((void**)&m_d_data, buffer_size)); if (zero) gpuErrChk(cudaMemset(m_d_data, 0, buffer_size)); // set to zero all elements if (numMats() > 1) { m_doDestroyPtrMatrices = true; - cudaStatus = cudaMalloc(&m_d_ptrMatrices, numMats() * sizeof(T *)); + cudaStatus = Session::getInstance().cudaAllocate((void**) &m_d_ptrMatrices, numMats() * sizeof(T *)); if (cudaStatus != cudaSuccess) { gpuErrChk(cudaFree(m_d_data)); // ... free previously allocated memory gpuErrChk(cudaStatus); // ... and memento mori @@ -1028,9 +1056,9 @@ template inline void DTensor::download(std::vector &vec) const { vec.resize(m_numRows * m_numCols * m_numMats); gpuErrChk(cudaMemcpy(vec.data(), - m_d_data, - m_numRows * m_numCols * m_numMats * sizeof(T), - cudaMemcpyDeviceToHost)); + m_d_data, + m_numRows * m_numCols * m_numMats * sizeof(T), + cudaMemcpyDeviceToHost)); } template @@ -1051,11 +1079,11 @@ inline DTensor DTensor::tr() const { size_t numElMat = m_numCols * m_numRows; for (size_t i = 0; i < m_numMats; i++) { gpuErrChk(cublasSgeam(Session::getInstance().cuBlasHandle(), - CUBLAS_OP_T, CUBLAS_OP_N, - m_numCols, m_numRows, - &alpha, raw() + numElMat * i, m_numRows, - &beta, nullptr, m_numCols, - transposes.raw() + numElMat * i, m_numCols)); + CUBLAS_OP_T, CUBLAS_OP_N, + m_numCols, m_numRows, + &alpha, raw() + numElMat * i, m_numRows, + &beta, nullptr, m_numCols, + transposes.raw() + numElMat * i, m_numCols)); } return transposes; } @@ -1067,11 +1095,11 @@ inline DTensor DTensor::tr() const { size_t numElMat = m_numCols * m_numRows; for (size_t i = 0; i < m_numMats; i++) { gpuErrChk(cublasDgeam(Session::getInstance().cuBlasHandle(), - CUBLAS_OP_T, CUBLAS_OP_N, - m_numCols, m_numRows, - &alpha, raw() + numElMat * i, m_numRows, - &beta, nullptr, m_numCols, - transposes.raw() + numElMat * i, m_numCols)); + CUBLAS_OP_T, CUBLAS_OP_N, + m_numCols, m_numRows, + &alpha, raw() + numElMat * i, m_numRows, + &beta, nullptr, m_numCols, + transposes.raw() + numElMat * i, m_numCols)); } return transposes; } @@ -1082,16 +1110,16 @@ inline void DTensor::deviceCopyTo(DTensor &elsewhere) const { throw std::invalid_argument("[deviceCopyTo] tensor does not fit into destination"); } gpuErrChk(cudaMemcpy(elsewhere.raw(), - m_d_data, - m_numRows * m_numCols * m_numMats * sizeof(T), - cudaMemcpyDeviceToDevice)); + m_d_data, + m_numRows * m_numCols * m_numMats * sizeof(T), + cudaMemcpyDeviceToDevice)); } template<> inline DTensor &DTensor::operator*=(double scalar) { double alpha = scalar; gpuErrChk( - cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1)); + cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1)); return *this; } @@ -1109,7 +1137,7 @@ template<> inline DTensor &DTensor::operator*=(float scalar) { float alpha = scalar; gpuErrChk( - cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1)); + cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1)); return *this; } @@ -1117,8 +1145,8 @@ template<> inline DTensor &DTensor::operator+=(const DTensor &rhs) { const double alpha = 1.; gpuErrChk( - cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, - 1, m_d_data, 1)); + cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, + 1, m_d_data, 1)); return *this; } @@ -1126,8 +1154,8 @@ template<> inline DTensor &DTensor::operator+=(const DTensor &rhs) { const float alpha = 1.; gpuErrChk( - cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, - 1, m_d_data, 1)); + cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, + 1, m_d_data, 1)); return *this; } @@ -1143,8 +1171,8 @@ template<> inline DTensor &DTensor::operator-=(const DTensor &rhs) { const double alpha = -1.; gpuErrChk( - cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, - 1, m_d_data, 1)); + cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, + 1, m_d_data, 1)); return *this; } @@ -1165,21 +1193,21 @@ inline void DTensor::addAB(const DTensor &A, const DTensor 1) { gpuErrChk(cublasDgemmBatched(Session::getInstance().cuBlasHandle(), - CUBLAS_OP_N, CUBLAS_OP_N, - nRA, nCB, nCA, &_alpha, - A.m_d_ptrMatrices, nRA, - B.m_d_ptrMatrices, nCA, - &_beta, - m_d_ptrMatrices, nRA, - nMat)); + CUBLAS_OP_N, CUBLAS_OP_N, + nRA, nCB, nCA, &_alpha, + A.m_d_ptrMatrices, nRA, + B.m_d_ptrMatrices, nCA, + &_beta, + m_d_ptrMatrices, nRA, + nMat)); } else { gpuErrChk(cublasDgemm(Session::getInstance().cuBlasHandle(), - CUBLAS_OP_N, CUBLAS_OP_N, - nRA, nCB, nCA, &_alpha, - A.raw(), nRA, - B.raw(), nCA, - &_beta, - raw(), nRA)); + CUBLAS_OP_N, CUBLAS_OP_N, + nRA, nCB, nCA, &_alpha, + A.raw(), nRA, + B.raw(), nCA, + &_beta, + raw(), nRA)); } } @@ -1192,21 +1220,21 @@ inline void DTensor::addAB(const DTensor &A, const DTensor float _alpha = alpha, _beta = beta; if (nMat > 1) { gpuErrChk(cublasSgemmBatched(Session::getInstance().cuBlasHandle(), - CUBLAS_OP_N, CUBLAS_OP_N, - nRA, nCB, nCA, &_alpha, - A.m_d_ptrMatrices, nRA, - B.m_d_ptrMatrices, nCA, - &_beta, - m_d_ptrMatrices, nRA, - nMat)); + CUBLAS_OP_N, CUBLAS_OP_N, + nRA, nCB, nCA, &_alpha, + A.m_d_ptrMatrices, nRA, + B.m_d_ptrMatrices, nCA, + &_beta, + m_d_ptrMatrices, nRA, + nMat)); } else { gpuErrChk(cublasSgemm(Session::getInstance().cuBlasHandle(), - CUBLAS_OP_N, CUBLAS_OP_N, - nRA, nCB, nCA, &_alpha, - A.raw(), nRA, - B.raw(), nCA, - &_beta, - raw(), nRA)); + CUBLAS_OP_N, CUBLAS_OP_N, + nRA, nCB, nCA, &_alpha, + A.raw(), nRA, + B.raw(), nCA, + &_beta, + raw(), nRA)); } } @@ -1225,17 +1253,17 @@ inline void DTensor::leastSquaresBatched(DTensor &B) { int info = 0; DTensor infoArray(batchSize); // TODO consider preallocating? gpuErrChk(cublasDgelsBatched(Session::getInstance().cuBlasHandle(), - CUBLAS_OP_N, - m_numRows, - m_numCols, - nColsB, - m_d_ptrMatrices, - m_numRows, - B.m_d_ptrMatrices, - m_numRows, - &info, - infoArray.raw(), - batchSize)); + CUBLAS_OP_N, + m_numRows, + m_numCols, + nColsB, + m_d_ptrMatrices, + m_numRows, + B.m_d_ptrMatrices, + m_numRows, + &info, + infoArray.raw(), + batchSize)); } template<> @@ -1253,17 +1281,17 @@ inline void DTensor::leastSquaresBatched(DTensor &B) { int info = 0; DTensor infoArray(batchSize); // TODO consider preallocating? gpuErrChk(cublasSgelsBatched(Session::getInstance().cuBlasHandle(), - CUBLAS_OP_N, - m_numRows, - m_numCols, - nColsB, - m_d_ptrMatrices, - m_numRows, - B.m_d_ptrMatrices, - m_numRows, - &info, - infoArray.raw(), - batchSize)); + CUBLAS_OP_N, + m_numRows, + m_numCols, + nColsB, + m_d_ptrMatrices, + m_numRows, + B.m_d_ptrMatrices, + m_numRows, + &info, + infoArray.raw(), + batchSize)); } template<> @@ -1273,10 +1301,10 @@ inline DTensor DTensor::getRows(size_t rowsFrom, size_t rowsTo, DTensor rowsOnly(rowsRangeLength, numCols(), 1); for (size_t i = 0; i < rowsRangeLength; i++) { gpuErrChk(cublasDcopy(Session::getInstance().cuBlasHandle(), - n, // # values to copy - raw() + rowsFrom + i + matIdx * n * m, m, - rowsOnly.raw() + i, - rowsRangeLength)); + n, // # values to copy + raw() + rowsFrom + i + matIdx * n * m, m, + rowsOnly.raw() + i, + rowsRangeLength)); } return rowsOnly; } @@ -1288,10 +1316,10 @@ inline DTensor DTensor::getRows(size_t rowsFrom, size_t rowsTo, si DTensor rowsOnly(rowsRangeLength, numCols(), 1); for (size_t i = 0; i < rowsRangeLength; i++) { gpuErrChk(cublasScopy(Session::getInstance().cuBlasHandle(), - n, // # values to copy - raw() + rowsFrom + i + matIdx * n * m, m, - rowsOnly.raw() + i, - rowsRangeLength)); + n, // # values to copy + raw() + rowsFrom + i + matIdx * n * m, m, + rowsOnly.raw() + i, + rowsRangeLength)); } return rowsOnly; } @@ -1300,8 +1328,8 @@ template std::ostream &DTensor::print(std::ostream &out) const { size_t nr = m_numRows, nc = m_numCols, nm = m_numMats; out << "Tensor [" << m_numRows << " x " - << m_numCols << " x " - << m_numMats << "]:" << std::endl; + << m_numCols << " x " + << m_numMats << "]:" << std::endl; std::vector temp; download(temp); for (size_t k = 0; k < nm; k++) { @@ -1317,7 +1345,6 @@ std::ostream &DTensor::print(std::ostream &out) const { } - /* ================================================================================================ * STATUS INTERFACE * ================================================================================================ */ @@ -1326,19 +1353,18 @@ std::ostream &DTensor::print(std::ostream &out) const { */ class IStatus { protected: - std::unique_ptr> m_info; ///< Status code of computation + std::unique_ptr > m_info; ///< Status code of computation IStatus(size_t n = 1) { - m_info = std::make_unique>(1, 1, n); + m_info = std::make_unique >(1, 1, n); } public: - /** * Provides access to the info stored in this class * @return tensor of shape (1, 1, n) */ - virtual DTensor& info() { + virtual DTensor &info() { return *m_info; } }; @@ -1356,7 +1382,8 @@ public: * @param d_count on exit, count of elements (int on device) * @param epsilon threshold */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX __global__ void k_countNonzeroSingularValues(const T *d_array, size_t n, unsigned int *d_count, T epsilon) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < n && d_array[idx] > epsilon) { @@ -1371,19 +1398,18 @@ __global__ void k_countNonzeroSingularValues(const T *d_array, size_t n, unsigne * Then, many same-type-(m,n,1)-tensor can be factorised using this object's workspace. * @tparam T data type of (m,n,1)-tensor to be factorised (must be float or double) */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX class Svd : public IStatus { - private: - - int m_lwork = -1; ///< Size of workspace needed for SVD - DTensor *m_tensor = nullptr; ///< Pointer to original matrix to be factorised - std::shared_ptr> m_Vtr; ///< Matrix V' or right singular vectors - std::shared_ptr> m_S; ///< Diagonal matrix S or singular values - std::shared_ptr> m_U; ///< Matrix U or left singular vectors - std::unique_ptr> m_workspace; ///< Workspace for SVD - std::shared_ptr> m_rank; ///< Rank of original matrix - bool m_computeU = false; ///< Whether to compute U + int m_lwork = -1; ///< Size of workspace needed for SVD + DTensor *m_tensor = nullptr; ///< Pointer to original matrix to be factorised + std::shared_ptr > m_Vtr; ///< Matrix V' or right singular vectors + std::shared_ptr > m_S; ///< Diagonal matrix S or singular values + std::shared_ptr > m_U; ///< Matrix U or left singular vectors + std::unique_ptr > m_workspace; ///< Workspace for SVD + std::shared_ptr > m_rank; ///< Rank of original matrix + bool m_computeU = false; ///< Whether to compute U bool m_destroyMatrix = true; ///< Whether to sacrifice original matrix /** @@ -1403,10 +1429,7 @@ private: */ void computeWorkspaceSize(size_t m, size_t n); - public: - - /** * Constructor. * @param mat matrix to be factorised @@ -1425,11 +1448,11 @@ public: size_t minMN = std::min(m, n); size_t nMats = mat.numMats(); computeWorkspaceSize(m, n); - m_workspace = std::make_unique>(m_lwork, 1, 1); ///< Allocates required workspace memory - m_Vtr = std::make_shared>(n, n, nMats); - m_S = std::make_shared>(minMN, 1, nMats); - m_rank = std::make_unique>(1, 1, nMats, true); - if (computeU) m_U = std::make_shared>(m, m, nMats); + m_workspace = std::make_unique >(m_lwork, 1, 1); ///< Allocates required workspace memory + m_Vtr = std::make_shared >(n, n, nMats); + m_S = std::make_shared >(minMN, 1, nMats); + m_rank = std::make_unique >(1, 1, nMats, true); + if (computeU) m_U = std::make_shared >(m, m, nMats); } /** @@ -1455,7 +1478,7 @@ public: /** * @return matrix U, or left singular vectors */ - std::optional>> leftSingularVectors() const { + std::optional > > leftSingularVectors() const { if (!m_computeU) return std::nullopt; return m_U; } @@ -1481,13 +1504,10 @@ public: DTensor Si(*m_S, 2, i, i); DTensor rankI(*m_rank, 2, i, i); k_countNonzeroSingularValues<<>>(Si.raw(), numElS, - rankI.raw(), epsilon); + rankI.raw(), epsilon); } return *m_rank; } - - - }; @@ -1507,25 +1527,25 @@ inline void Svd::factorise() { size_t m = m_tensor->numRows(); size_t n = m_tensor->numCols(); size_t nMats = m_tensor->numMats(); - std::unique_ptr> Ui; + std::unique_ptr > Ui; for (size_t i = 0; i < nMats; i++) { DTensor Ai(*m_tensor, 2, i, i); // tensor A[:, :, i] DTensor Si(*m_S, 2, i, i); // S[:, :, i] DTensor Vtri(*m_Vtr, 2, i, i); // Vtr[:, :, i] if (m_computeU) - Ui = std::make_unique>(*m_U, 2, i, i); + Ui = std::make_unique >(*m_U, 2, i, i); gpuErrChk( - cusolverDnDgesvd(Session::getInstance().cuSolverHandle(), - (m_computeU) ? 'A' : 'N', 'A', - m, n, - Ai.raw(), m, - Si.raw(), - (m_computeU) ? Ui->raw() : nullptr, m, - Vtri.raw(), n, - m_workspace->raw(), - m_lwork, - nullptr, // rwork (used only if SVD fails) - m_info->raw() + i)); + cusolverDnDgesvd(Session::getInstance().cuSolverHandle(), + (m_computeU) ? 'A' : 'N', 'A', + m, n, + Ai.raw(), m, + Si.raw(), + (m_computeU) ? Ui->raw() : nullptr, m, + Vtri.raw(), n, + m_workspace->raw(), + m_lwork, + nullptr, // rwork (used only if SVD fails) + m_info->raw() + i)); } } @@ -1534,25 +1554,25 @@ inline void Svd::factorise() { size_t m = m_tensor->numRows(); size_t n = m_tensor->numCols(); size_t nMats = m_tensor->numMats(); - std::unique_ptr> Ui; + std::unique_ptr > Ui; for (size_t i = 0; i < nMats; i++) { DTensor Ai(*m_tensor, 2, i, i); // tensor A[:, :, i] DTensor Si(*m_S, 2, i, i); // S[:, :, i] DTensor Vtri(*m_Vtr, 2, i, i); // Vtr[:, :, i] if (m_computeU) - Ui = std::make_unique>(*m_U, 2, i, i); + Ui = std::make_unique >(*m_U, 2, i, i); gpuErrChk( - cusolverDnSgesvd(Session::getInstance().cuSolverHandle(), - (m_computeU) ? 'A' : 'N', 'A', - m, n, - Ai.raw(), m, - Si.raw(), - (m_computeU) ? Ui->raw() : nullptr, m, - Vtri.raw(), n, - m_workspace->raw(), - m_lwork, - nullptr, // rwork (used only if SVD fails) - m_info->raw() + i)); + cusolverDnSgesvd(Session::getInstance().cuSolverHandle(), + (m_computeU) ? 'A' : 'N', 'A', + m, n, + Ai.raw(), m, + Si.raw(), + (m_computeU) ? Ui->raw() : nullptr, m, + Vtri.raw(), n, + m_workspace->raw(), + m_lwork, + nullptr, // rwork (used only if SVD fails) + m_info->raw() + i)); } } @@ -1567,13 +1587,13 @@ inline void Svd::factorise() { * Then, many same-type-(m,n,1)-tensor can be factorised using this object's workspace * @tparam T data type of (m,n,1)-tensor to be factorised (must be float or double) */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX class CholeskyFactoriser : public IStatus { - private: - int m_workspaceSize = 0; ///< Size of workspace needed for CF - std::unique_ptr> m_workspace; ///< Workspace for CF - DTensor *m_matrix; ///< Matrix to factorise. Do not destroy! + int m_workspaceSize = 0; ///< Size of workspace needed for CF + std::unique_ptr > m_workspace; ///< Workspace for CF + DTensor *m_matrix; ///< Matrix to factorise. Do not destroy! /** * Computes the workspace size required by cuSolver. @@ -1581,13 +1601,12 @@ private: void computeWorkspaceSize(); public: - CholeskyFactoriser(DTensor &A) : IStatus() { if (A.numMats() > 1) throw std::invalid_argument("[Cholesky] 3D tensors require `CholeskyBatchFactoriser`"); if (A.numRows() != A.numCols()) throw std::invalid_argument("[Cholesky] Matrix A must be square"); m_matrix = &A; computeWorkspaceSize(); - m_workspace = std::make_unique>(m_workspaceSize); + m_workspace = std::make_unique >(m_workspaceSize); } /** @@ -1603,34 +1622,32 @@ public: * @param b provided matrix */ void solve(DTensor &b); - - }; template<> inline void CholeskyFactoriser::computeWorkspaceSize() { size_t n = m_matrix->numRows(); gpuErrChk(cusolverDnDpotrf_bufferSize(Session::getInstance().cuSolverHandle(), - CUBLAS_FILL_MODE_LOWER, n, - nullptr, n, &m_workspaceSize)); + CUBLAS_FILL_MODE_LOWER, n, + nullptr, n, &m_workspaceSize)); } template<> inline void CholeskyFactoriser::computeWorkspaceSize() { size_t n = m_matrix->numRows(); gpuErrChk(cusolverDnSpotrf_bufferSize(Session::getInstance().cuSolverHandle(), - CUBLAS_FILL_MODE_LOWER, n, - nullptr, n, &m_workspaceSize)); + CUBLAS_FILL_MODE_LOWER, n, + nullptr, n, &m_workspaceSize)); } template<> inline void CholeskyFactoriser::factorise() { size_t n = m_matrix->numRows(); gpuErrChk(cusolverDnDpotrf(Session::getInstance().cuSolverHandle(), CUBLAS_FILL_MODE_LOWER, n, - m_matrix->raw(), n, - m_workspace->raw(), - m_workspaceSize, - m_info->raw())); + m_matrix->raw(), n, + m_workspace->raw(), + m_workspaceSize, + m_info->raw())); } @@ -1638,32 +1655,32 @@ template<> inline void CholeskyFactoriser::factorise() { size_t n = m_matrix->numRows(); gpuErrChk(cusolverDnSpotrf(Session::getInstance().cuSolverHandle(), CUBLAS_FILL_MODE_LOWER, n, - m_matrix->raw(), n, - m_workspace->raw(), - m_workspaceSize, - m_info->raw())); + m_matrix->raw(), n, + m_workspace->raw(), + m_workspaceSize, + m_info->raw())); } template<> inline void CholeskyFactoriser::solve(DTensor &rhs) { size_t n = m_matrix->numRows(); gpuErrChk(cusolverDnDpotrs(Session::getInstance().cuSolverHandle(), - CUBLAS_FILL_MODE_LOWER, - n, 1, - m_matrix->raw(), n, - rhs.raw(), n, - m_info->raw())); + CUBLAS_FILL_MODE_LOWER, + n, 1, + m_matrix->raw(), n, + rhs.raw(), n, + m_info->raw())); } template<> inline void CholeskyFactoriser::solve(DTensor &rhs) { size_t n = m_matrix->numRows(); gpuErrChk(cusolverDnSpotrs(Session::getInstance().cuSolverHandle(), - CUBLAS_FILL_MODE_LOWER, - n, 1, - m_matrix->raw(), n, - rhs.raw(), n, - m_info->raw())); + CUBLAS_FILL_MODE_LOWER, + n, 1, + m_matrix->raw(), n, + rhs.raw(), n, + m_info->raw())); } @@ -1677,14 +1694,14 @@ inline void CholeskyFactoriser::solve(DTensor &rhs) { * Then, many same-type-(m,n,1)-tensor can be factorised using this object's workspace * @tparam T data type of (m,n,1)-tensor to be factorised (must be float or double) */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX class QRFactoriser : public IStatus { - private: - int m_workspaceSize = 0; ///< Size of workspace needed for LS - std::unique_ptr> m_householder; ///< For storing householder reflectors - std::unique_ptr> m_workspace; ///< Workspace for LS - DTensor *m_matrix; ///< Lhs matrix template. Do not destroy! + int m_workspaceSize = 0; ///< Size of workspace needed for LS + std::unique_ptr > m_householder; ///< For storing householder reflectors + std::unique_ptr > m_workspace; ///< Workspace for LS + DTensor *m_matrix; ///< Lhs matrix template. Do not destroy! /** * Computes the workspace size required by cuSolver. @@ -1692,14 +1709,13 @@ private: void computeWorkspaceSize(); public: - QRFactoriser(DTensor &A) : IStatus() { if (A.numMats() > 1) throw std::invalid_argument("[QR] 3D tensors require `leastSquaresBatched`"); if (A.numRows() < A.numCols()) throw std::invalid_argument("[QR] Matrix A must be tall or square"); m_matrix = &A; computeWorkspaceSize(); - m_workspace = std::make_unique>(m_workspaceSize); - m_householder = std::make_unique>(m_matrix->numCols()); + m_workspace = std::make_unique >(m_workspaceSize); + m_householder = std::make_unique >(m_matrix->numCols()); } /** @@ -1726,8 +1742,6 @@ public: * @throws std::invalid_argument if Q or R have invalid dimensions */ void getQR(DTensor &Q, DTensor &R); - - }; template<> @@ -1735,9 +1749,9 @@ inline void QRFactoriser::computeWorkspaceSize() { size_t m = m_matrix->numRows(); size_t n = m_matrix->numCols(); gpuErrChk(cusolverDnDgeqrf_bufferSize(Session::getInstance().cuSolverHandle(), - m, n, - nullptr, m, - &m_workspaceSize)); + m, n, + nullptr, m, + &m_workspaceSize)); } template<> @@ -1745,9 +1759,9 @@ inline void QRFactoriser::computeWorkspaceSize() { size_t m = m_matrix->numRows(); size_t n = m_matrix->numCols(); gpuErrChk(cusolverDnSgeqrf_bufferSize(Session::getInstance().cuSolverHandle(), - m, n, - nullptr, m, - &m_workspaceSize)); + m, n, + nullptr, m, + &m_workspaceSize)); } template<> @@ -1755,11 +1769,11 @@ inline void QRFactoriser::factorise() { size_t m = m_matrix->numRows(); size_t n = m_matrix->numCols(); gpuErrChk(cusolverDnDgeqrf(Session::getInstance().cuSolverHandle(), - m, n, - m_matrix->raw(), m, - m_householder->raw(), - m_workspace->raw(), m_workspaceSize, - m_info->raw())); + m, n, + m_matrix->raw(), m, + m_householder->raw(), + m_workspace->raw(), m_workspaceSize, + m_info->raw())); } @@ -1768,11 +1782,11 @@ inline void QRFactoriser::factorise() { size_t m = m_matrix->numRows(); size_t n = m_matrix->numCols(); gpuErrChk(cusolverDnSgeqrf(Session::getInstance().cuSolverHandle(), - m, n, - m_matrix->raw(), m, - m_householder->raw(), - m_workspace->raw(), m_workspaceSize, - m_info->raw())); + m, n, + m_matrix->raw(), m, + m_householder->raw(), + m_workspace->raw(), m_workspaceSize, + m_info->raw())); } template<> @@ -1781,17 +1795,17 @@ inline void QRFactoriser::leastSquares(DTensor &rhs) { size_t n = m_matrix->numCols(); double alpha = 1.; gpuErrChk(cusolverDnDormqr(Session::getInstance().cuSolverHandle(), - CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, 1, n, - m_matrix->raw(), m, - m_householder->raw(), - rhs.raw(), m, - m_workspace->raw(), m_workspaceSize, - m_info->raw())); + CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, 1, n, + m_matrix->raw(), m, + m_householder->raw(), + rhs.raw(), m, + m_workspace->raw(), m_workspaceSize, + m_info->raw())); gpuErrChk(cublasDtrsm(Session::getInstance().cuBlasHandle(), - CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1, - &alpha, - m_matrix->raw(), m, - rhs.raw(), m)); + CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1, + &alpha, + m_matrix->raw(), m, + rhs.raw(), m)); } template<> @@ -1800,17 +1814,17 @@ inline void QRFactoriser::leastSquares(DTensor &rhs) { size_t n = m_matrix->numCols(); float alpha = 1.; gpuErrChk(cusolverDnSormqr(Session::getInstance().cuSolverHandle(), - CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, 1, n, - m_matrix->raw(), m, - m_householder->raw(), - rhs.raw(), m, - m_workspace->raw(), m_workspaceSize, - m_info->raw())); + CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, 1, n, + m_matrix->raw(), m, + m_householder->raw(), + rhs.raw(), m, + m_workspace->raw(), m_workspaceSize, + m_info->raw())); gpuErrChk(cublasStrsm(Session::getInstance().cuBlasHandle(), - CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1, - &alpha, - m_matrix->raw(), m, - rhs.raw(), m)); + CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1, + &alpha, + m_matrix->raw(), m, + rhs.raw(), m)); } template<> @@ -1831,12 +1845,12 @@ inline void QRFactoriser::getQR(DTensor &Q, DTensor &R) Q.upload(vecQ, rowMajor); // Apply Householder reflectors to compute Q gpuErrChk(cusolverDnDormqr(Session::getInstance().cuSolverHandle(), - CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, n, n, - m_matrix->raw(), m, - m_householder->raw(), - Q.raw(), m, - m_workspace->raw(), m_workspaceSize, - m_info->raw())); + CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, n, n, + m_matrix->raw(), m, + m_householder->raw(), + Q.raw(), m, + m_workspace->raw(), m_workspaceSize, + m_info->raw())); // Extract upper triangular R std::vector vecR(n * n, 0.); for (size_t r = 0; r < n; r++) { @@ -1865,12 +1879,12 @@ inline void QRFactoriser::getQR(DTensor &Q, DTensor &R) { Q.upload(vecQ, rowMajor); // Apply Householder reflectors to compute Q gpuErrChk(cusolverDnSormqr(Session::getInstance().cuSolverHandle(), - CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, n, n, - m_matrix->raw(), m, - m_householder->raw(), - Q.raw(), m, - m_workspace->raw(), m_workspaceSize, - m_info->raw())); + CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, n, n, + m_matrix->raw(), m, + m_householder->raw(), + Q.raw(), m, + m_workspace->raw(), m_workspaceSize, + m_info->raw())); // Extract upper triangular R std::vector vecR(n * n, 0.); for (size_t r = 0; r < n; r++) { @@ -1892,16 +1906,14 @@ inline void QRFactoriser::getQR(DTensor &Q, DTensor &R) { * Nullspace computes, pads, and stores the nullspace matrices. * @tparam T data type (must be float or double) */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX class Nullspace { - private: - - std::unique_ptr> m_nullspace; ///< Stores all nullspace matrices (N) - std::unique_ptr> m_projOp; ///< Stores all projection operators (N*N') + std::unique_ptr > m_nullspace; ///< Stores all nullspace matrices (N) + std::unique_ptr > m_projOp; ///< Stores all projection operators (N*N') public: - /** * Constructor (computes nullspace) * @param a device tensor @@ -1935,8 +1947,8 @@ TEMPLATE_CONSTRAINT_REQUIRES_FPX inline Nullspace::Nullspace(DTensor &a) { size_t m = a.numRows(), n = a.numCols(), nMats = a.numMats(); if (m > n) throw std::invalid_argument("[nullspace] I was expecting a square or fat matrix"); - m_nullspace = std::make_unique>(n, n, nMats, true); - m_projOp = std::make_unique>(n, n, nMats, true); + m_nullspace = std::make_unique >(n, n, nMats, true); + m_projOp = std::make_unique >(n, n, nMats, true); auto aTranspose = a.tr(); Svd svd(aTranspose, true); svd.factorise(); @@ -1945,10 +1957,11 @@ inline Nullspace::Nullspace(DTensor &a) { std::vector hostRankA; devRankA.download(hostRankA); - std::optional>> leftSingValsOptional = svd.leftSingularVectors(); + std::optional > > leftSingValsOptional = svd.leftSingularVectors(); assert(leftSingValsOptional); // make sure left SVs were computed - std::shared_ptr> leftSingVals = leftSingValsOptional.value(); - for (size_t i = 0; i < nMats; i++) { // for each matrix + std::shared_ptr > leftSingVals = leftSingValsOptional.value(); + for (size_t i = 0; i < nMats; i++) { + // for each matrix // Slice the matrix of left SVs to get the matrix that spans // the nullspace of a[:, :, i] unsigned int rankAi = hostRankA[i]; @@ -1983,14 +1996,14 @@ inline void Nullspace::project(DTensor &b) { * or you can provide pre-computed Cholesky lower-triangular matrices and then solve. * @tparam T data type of tensor (must be float or double) */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX class CholeskyBatchFactoriser : public IStatus { - private: - DTensor *m_matrix; ///< Matrix to factorise or lower-triangular decomposed matrix - size_t m_numRows = 0; ///< Number of rows in `m_matrix` - size_t m_numMats = 0; ///< Number of matrices in `m_matrix` - bool m_factorisationDone = false; ///< Whether `m_matrix` holds original or lower-triangular matrix + DTensor *m_matrix; ///< Matrix to factorise or lower-triangular decomposed matrix + size_t m_numRows = 0; ///< Number of rows in `m_matrix` + size_t m_numMats = 0; ///< Number of matrices in `m_matrix` + bool m_factorisationDone = false; ///< Whether `m_matrix` holds original or lower-triangular matrix public: CholeskyBatchFactoriser() = delete; @@ -2000,7 +2013,8 @@ public: * @param A either matrices to be factorised or lower-triangular Cholesky decomposition matrices * @param factorised whether A is the original matrices or the factorised ones (default=original matrices) */ - CholeskyBatchFactoriser(DTensor &A, bool factorised = false) : IStatus(A.numMats()), m_factorisationDone(factorised) { + CholeskyBatchFactoriser(DTensor &A, bool factorised = false) : IStatus(A.numMats()), + m_factorisationDone(factorised) { if (A.numRows() != A.numCols()) throw std::invalid_argument("[CholeskyBatch] A must be square"); m_matrix = &A; m_numRows = A.numRows(); @@ -2017,20 +2031,18 @@ public: * @param b vector in system Ax=b to be solved where x is the solution */ void solve(DTensor &b); - - }; template<> inline void CholeskyBatchFactoriser::factorise() { if (m_factorisationDone) return; gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(), - CUBLAS_FILL_MODE_LOWER, - m_numRows, - m_matrix->ptrMatrices(), - m_numRows, - m_info->raw(), - m_numMats)); + CUBLAS_FILL_MODE_LOWER, + m_numRows, + m_matrix->ptrMatrices(), + m_numRows, + m_info->raw(), + m_numMats)); m_factorisationDone = true; } @@ -2038,12 +2050,12 @@ template<> inline void CholeskyBatchFactoriser::factorise() { if (m_factorisationDone) return; gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(), - CUBLAS_FILL_MODE_LOWER, - m_numRows, - m_matrix->ptrMatrices(), - m_numRows, - m_info->raw(), - m_numMats)); + CUBLAS_FILL_MODE_LOWER, + m_numRows, + m_matrix->ptrMatrices(), + m_numRows, + m_info->raw(), + m_numMats)); m_factorisationDone = true; } @@ -2055,15 +2067,15 @@ inline void CholeskyBatchFactoriser::solve(DTensor &b) { } if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column"); gpuErrChk(cusolverDnDpotrsBatched(Session::getInstance().cuSolverHandle(), - CUBLAS_FILL_MODE_LOWER, - m_numRows, - 1, ///< only supports rhs = 1 - m_matrix->ptrMatrices(), - m_numRows, - b.ptrMatrices(), - m_numRows, - m_info->raw(), - m_numMats)); + CUBLAS_FILL_MODE_LOWER, + m_numRows, + 1, ///< only supports rhs = 1 + m_matrix->ptrMatrices(), + m_numRows, + b.ptrMatrices(), + m_numRows, + m_info->raw(), + m_numMats)); } template<> @@ -2074,19 +2086,18 @@ inline void CholeskyBatchFactoriser::solve(DTensor &b) { } if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column"); gpuErrChk(cusolverDnSpotrsBatched(Session::getInstance().cuSolverHandle(), - CUBLAS_FILL_MODE_LOWER, - m_numRows, - 1, ///< only supports rhs = 1 - m_matrix->ptrMatrices(), - m_numRows, - b.ptrMatrices(), - m_numRows, - m_info->raw(), - m_numMats)); + CUBLAS_FILL_MODE_LOWER, + m_numRows, + 1, ///< only supports rhs = 1 + m_matrix->ptrMatrices(), + m_numRows, + b.ptrMatrices(), + m_numRows, + m_info->raw(), + m_numMats)); } - /* ================================================================================================ * GIVENS ANNIHILATOR * ================================================================================================ */ @@ -2098,23 +2109,22 @@ inline void CholeskyBatchFactoriser::solve(DTensor &b) { * * @tparam T data type of tensor (must be float or double) */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX class GivensAnnihilator { - private: DTensor *m_matrix; /** * Auxiliary memory on the device of length 3 used to store * rhypot(xij, xkj), cos θ, and sin θ. */ - std::unique_ptr> m_d_rhyp_cos_sin; + std::unique_ptr > m_d_rhyp_cos_sin; void init() { - m_d_rhyp_cos_sin = std::make_unique>(3); + m_d_rhyp_cos_sin = std::make_unique >(3); } public: - GivensAnnihilator() { init(); } @@ -2156,10 +2166,10 @@ public: * @throws std::invalid_argument if i, k, or j are out of bounds */ void annihilate(size_t i, size_t k, size_t j); - }; -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX __global__ void k_givensAnnihilateRHypot(const T *data, T *res, size_t i, size_t k, size_t j, @@ -2172,7 +2182,8 @@ __global__ void k_givensAnnihilateRHypot(const T *data, } -template TEMPLATE_CONSTRAINT_REQUIRES_FPX +template +TEMPLATE_CONSTRAINT_REQUIRES_FPX inline void GivensAnnihilator::annihilate(size_t i, size_t k, size_t j) { /* A few checks */ size_t nR = m_matrix->numRows(), nC = m_matrix->numCols(); diff --git a/main.cu b/main.cu index 01ad042..742321d 100644 --- a/main.cu +++ b/main.cu @@ -4,17 +4,26 @@ #include #include - -int main() { +void xyz() { /* Write to binary file */ auto r = DTensor::createRandomTensor(3, 6, 4, -1, 1); + auto r2 = DTensor::createRandomTensor(300, 600, 4, -1, 1); std::string fName = "tensor.bt"; // binary tensor file extension: .bt - r.saveToFile(fName); /* Parse binary file */ auto recov = DTensor::parseFromFile(fName); auto err = r - recov; - std::cout << "max error : " << err.maxAbs(); + std::cout << "max error : " << err.maxAbs() << std::endl; + std::cout << "Memory: " << std::setprecision(3) + << (float) Session::getInstance().totalAllocatedBytes() / 1e6 + << " MB" << std::endl; +} + +int main() { + xyz(); + std::cout << "Memory (outside): " << std::setprecision(3) + << (float) Session::getInstance().totalAllocatedBytes() / 1e6 + << " MB" << std::endl; return 0; } diff --git a/test/testTensor.cu b/test/testTensor.cu index 6b10636..bff6ac8 100644 --- a/test/testTensor.cu +++ b/test/testTensor.cu @@ -11,9 +11,11 @@ * ================================================================================================ */ class TensorTest : public testing::Test { protected: - TensorTest() {} + TensorTest() { + } - virtual ~TensorTest() {} + virtual ~TensorTest() { + } }; #define TENSOR_DATA_234A {1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 10, 5, 4, 3, 2, 1, -1, 4, 3, 4, 3, 4, 8} @@ -52,17 +54,25 @@ void tensorConstructionStorageMode() { size_t rows = 3; size_t cols = 2; size_t mats = 2; - std::vector aCm = {1, 3, 5, - 2, 4, 6}; - std::vector bCm = {7, 9, 11, - 8, 10, 12}; + std::vector aCm = { + 1, 3, 5, + 2, 4, 6 + }; + std::vector bCm = { + 7, 9, 11, + 8, 10, 12 + }; const std::vector Cm = {1, 3, 5, 2, 4, 6, 7, 9, 11, 8, 10, 12}; - std::vector aRm = {1, 2, - 3, 4, - 5, 6}; - std::vector bRm = {7, 8, - 9, 10, - 11, 12}; + std::vector aRm = { + 1, 2, + 3, 4, + 5, 6 + }; + std::vector bRm = { + 7, 8, + 9, 10, + 11, 12 + }; std::vector Rm = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; std::vector hostData(rows * cols * mats); // test constructor @@ -183,8 +193,8 @@ TEST_F(TensorTest, parseTensorFromFileBinary) { TEST_F(TensorTest, parseTensorFromBinaryPython) { std::string fName = "../../python/b_d.bt"; DTensor b = DTensor::parseFromFile(fName); - for (size_t i=0; i<3; i++) { - for (size_t j=0; j<3; j++) { + for (size_t i = 0; i < 3; i++) { + for (size_t j = 0; j < 3; j++) { EXPECT_NEAR(1 + 2*j + 6*i, b(i, j, 0), PRECISION_HIGH); EXPECT_NEAR(2 + 2*j + 6*i, b(i, j, 1), PRECISION_HIGH); } @@ -212,7 +222,7 @@ TEMPLATE_WITH_TYPE_T void tensorMoveConstructor() { DTensor zero(2, 3, 4, true); DTensor x(std::move(zero)); - DTensor y(DTensor {100, 10, 1000}); + DTensor y(DTensor{100, 10, 1000}); } TEST_F(TensorTest, tensorMoveConstructor) { @@ -378,8 +388,10 @@ void tensorDeviceCopyTo() { DTensor other(2, 3, 5, true); DTensor z(other, 2, 1, 4); tenz.deviceCopyTo(z); - std::vector expected = {0, 0, 0, 0, 0, 0, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 10, 5, 4, 3, 2, 1, -1, 4, 3, 4, 3, 4, 8}; + std::vector expected = { + 0, 0, 0, 0, 0, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 10, 5, 4, 3, 2, 1, -1, 4, 3, 4, 3, 4, 8 + }; std::vector actual(2 * 3 * 5); other.download(actual); EXPECT_EQ(expected, actual); @@ -453,9 +465,11 @@ void tensorSliceAndReshape(T epsilon) { bSlice.reshape(2, 9, 1); aSlice += bSlice; - std::vector dataAExpected = {1, 2, 3, 4, 5, 6, 41, 7, 5, 5, - 19, 17, 14, 13, 5, 11, -8, -4, - 6, 8, 8, -2, 8, 13}; + std::vector dataAExpected = { + 1, 2, 3, 4, 5, 6, 41, 7, 5, 5, + 19, 17, 14, 13, 5, 11, -8, -4, + 6, 8, 8, -2, 8, 13 + }; DTensor aExpected(dataAExpected, 2, 3, 4); DTensor err = aExpected - a; @@ -479,12 +493,12 @@ void tensorDotF(T epsilon) { DTensor vecA(dataA, dataA.size()); DTensor vecB(dataB, dataB.size()); T dotVector = vecA.dotF(vecB); - EXPECT_EQ(604, dotVector); // from MATLAB + EXPECT_EQ(604, dotVector); // from MATLAB // as matrices DTensor tenA(dataA, 2, 3, 4); DTensor tenB(dataB, 2, 3, 4); T dotTensor = tenA.dotF(tenB); - EXPECT_EQ(604, dotTensor); // from MATLAB + EXPECT_EQ(604, dotTensor); // from MATLAB } TEST_F(TensorTest, tensorDotF) { @@ -680,8 +694,10 @@ TEST_F(TensorTest, tensorAssignmentOperator) { TEMPLATE_WITH_TYPE_T void tensorTimesEqualsScalar() { std::vector data = TENSOR_DATA_234A; - std::vector dataTimes3 = {3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12, - 24}; + std::vector dataTimes3 = { + 3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12, + 24 + }; DTensor tenz(data, 2, 3, 4); tenz *= 3.0; std::vector actual; @@ -701,8 +717,10 @@ TEST_F(TensorTest, tensorTimesEqualsScalar) { TEMPLATE_WITH_TYPE_T void tensorTimesScalar() { std::vector data = TENSOR_DATA_234A; - std::vector dataTimes3 = {3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12, - 24}; + std::vector dataTimes3 = { + 3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12, + 24 + }; DTensor tenz(data, 2, 3, 4); auto tripleTensor = 3.0 * tenz; std::vector actual; @@ -809,12 +827,16 @@ TEST_F(TensorTest, tensorMinusTensor) { TEMPLATE_WITH_TYPE_T void tensorAddAB() { - std::vector aData = {1, 2, 3, 4, 5, 6, - 7, 8, 9, 10, 11, 12, - 13, 14, 15, 16, 17, 18}; - std::vector bData = {6, 5, 4, 3, 2, 1, - 7, 6, 5, 4, 3, 2, - 1, 2, 1, 5, -6, 8}; + std::vector aData = { + 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18 + }; + std::vector bData = { + 6, 5, 4, 3, 2, 1, + 7, 6, 5, 4, 3, 2, + 1, 2, 1, 5, -6, 8 + }; DTensor A(aData, 2, 3, 3); DTensor B(bData, 3, 2, 3); DTensor C(2, 2, 3, true); @@ -876,12 +898,14 @@ TEST_F(TensorTest, tensorSliceAxis01PtrMatrices) { TEMPLATE_WITH_TYPE_T void tensorGetRows() { - std::vector aData = {10.5, 25.0, 60.0, - -21.0, 720.0, -1.0, - 11.0, -1.0, 30.0, - 5., 6., 7., - 8., 9., 10., - 11., 12., 13}; + std::vector aData = { + 10.5, 25.0, 60.0, + -21.0, 720.0, -1.0, + 11.0, -1.0, 30.0, + 5., 6., 7., + 8., 9., 10., + 11., 12., 13 + }; DTensor A(aData, 3, 3, 2); DTensor Ar0 = A.getRows(1, 1, 0); std::vector expected0 = {25., 720., -1.}; @@ -918,7 +942,6 @@ void tensorTranspose() { std::vector actual; Atranspose.download(actual); EXPECT_EQ(expected, actual); - } TEST_F(TensorTest, tensorTranspose) { @@ -926,14 +949,60 @@ TEST_F(TensorTest, tensorTranspose) { tensorTranspose(); } +/* --------------------------------------- + * Tensor: total bytes allocated + * --------------------------------------- */ +TEMPLATE_WITH_TYPE_T +void tensorBytesAllocated() { + const size_t previouslyAllocatedBytes = Session::getInstance().totalAllocatedBytes(); + size_t m = 10, n = 10, k = 20; + DTensor A(m, n, k); + const size_t allocatedBytes = Session::getInstance().totalAllocatedBytes(); + const size_t currentAllocatedBytes = m * n * k * sizeof(T) + k * sizeof(T *); + const size_t expectedBytes = currentAllocatedBytes + previouslyAllocatedBytes; + EXPECT_EQ(expectedBytes, allocatedBytes); +} + +TEST_F(TensorTest, tensorBytesAllocated) { + tensorBytesAllocated(); + tensorBytesAllocated(); + tensorBytesAllocated(); +} + + +/* --------------------------------------- + * Tensor: total bytes allocated + * --------------------------------------- */ +TEMPLATE_WITH_TYPE_T +void tensorBytesAllocatedDeallocated() { + const size_t previouslyAllocatedBytes = Session::getInstance().totalAllocatedBytes(); + size_t m = 10, n = 10, k = 20; + auto *A = new DTensor(m, n, k); // new allocation (increments session) + const size_t allocatedBytes = Session::getInstance().totalAllocatedBytes(); + const size_t expectedBytes = previouslyAllocatedBytes + m * n * k * sizeof(T) + k * sizeof(T *); + EXPECT_EQ(expectedBytes, allocatedBytes); + delete A; + EXPECT_EQ(previouslyAllocatedBytes, Session::getInstance().totalAllocatedBytes()); +} + +TEST_F(TensorTest, tensorBytesAllocatedDeallocated) { + tensorBytesAllocatedDeallocated(); + tensorBytesAllocatedDeallocated(); + tensorBytesAllocatedDeallocated(); +} + + + /* ================================================================================================ * LEAST SQUARES TESTS * ================================================================================================ */ class LeastSquaresTest : public testing::Test { protected: - LeastSquaresTest() {} + LeastSquaresTest() { + } - virtual ~LeastSquaresTest() {} + virtual ~LeastSquaresTest() { + } }; /* --------------------------------------- @@ -943,12 +1012,14 @@ protected: TEMPLATE_WITH_TYPE_T void tensorLeastSquares1(T epsilon) { // TODO test with tall matrices too - std::vector aData = {1, 2, - 3, 4, - 7, 8, - 9, 10, - 6, 8, - -9, 20}; + std::vector aData = { + 1, 2, + 3, 4, + 7, 8, + 9, 10, + 6, 8, + -9, 20 + }; std::vector bData = {1, 1, -1, 2, 30, -80}; DTensor A0(aData, 2, 2, 3); DTensor A(A0); @@ -973,9 +1044,11 @@ TEST_F(LeastSquaresTest, tensorLS1) { * ================================================================================================ */ class SvdTest : public testing::Test { protected: - SvdTest() {} + SvdTest() { + } - virtual ~SvdTest() {} + virtual ~SvdTest() { + } }; /* --------------------------------------- @@ -983,11 +1056,14 @@ protected: * and matrix rank * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void singularValuesComputation(float epsilon) { - std::vector bData = {1, 6, 6, 6, 6, 6, 6, 6, - 2, 7, 7, 7, 7, 7, 7, 7, - 3, 8, 8, 8, 8, 8, 8, 8,}; + std::vector bData = { + 1, 6, 6, 6, 6, 6, 6, 6, + 2, 7, 7, 7, 7, 7, 7, 7, + 3, 8, 8, 8, 8, 8, 8, 8, + }; DTensor B(bData, 8, 3); Svd svd(B, true, false); svd.factorise(); @@ -1012,11 +1088,14 @@ TEST_F(SvdTest, singularValuesComputation) { * Singular values - memory mgmt * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void singularValuesMemory(float epsilon) { - std::vector bData = {1, 6, 6, 6, 6, 6, 6, 6, - 2, 7, 7, 7, 7, 7, 7, 7, - 3, 8, 8, 8, 8, 8, 8, 8,}; + std::vector bData = { + 1, 6, 6, 6, 6, 6, 6, 6, + 2, 7, 7, 7, 7, 7, 7, 7, + 3, 8, 8, 8, 8, 8, 8, 8, + }; DTensor B(bData, 8, 3); Svd svd(B, true, false); svd.factorise(); @@ -1044,7 +1123,8 @@ TEST_F(SvdTest, singularValuesMemory) { /* --------------------------------------- * SVD with multiple matrices * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void singularValuesMultipleMatrices(float epsilon) { std::vector aData = {1, 2, 3, 4, 5, 6, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1}; DTensor A(aData, 3, 2, 3); @@ -1054,31 +1134,35 @@ void singularValuesMultipleMatrices(float epsilon) { DTensor const &V = svd.rightSingularVectors(); auto Uopt = svd.leftSingularVectors(); auto U = Uopt.value(); - std::vector expected_v = {-0.386317703118612, -0.922365780077058, -0.922365780077058, 0.386317703118612, - -0.447213595499958, -0.894427190999916, 0.894427190999916, -0.447213595499958, - 0, -1, 1, 0}; + std::vector expected_v = { + -0.386317703118612, -0.922365780077058, -0.922365780077058, 0.386317703118612, + -0.447213595499958, -0.894427190999916, 0.894427190999916, -0.447213595499958, + 0, -1, 1, 0 + }; std::vector actual_v(12); V.download(actual_v); - for (size_t i = 0; i < 4; i++) EXPECT_NEAR(expected_v[i], actual_v[i], epsilon); + for (size_t i = 0; i < 4; i++) + EXPECT_NEAR(expected_v[i], actual_v[i], epsilon); std::vector expected_s = {9.508032000695726, 0.772869635673484, 3.872983346207417, 0, 1, 0}; std::vector actual_s(6); S.download(actual_s); - for (size_t i = 0; i < 6; i++) EXPECT_NEAR(expected_s[i], actual_s[i], epsilon); + for (size_t i = 0; i < 6; i++) + EXPECT_NEAR(expected_s[i], actual_s[i], epsilon); std::vector expected_u = { - -0.428667133548626, -0.566306918848035, -0.703946704147444, - 0.805963908589298, 0.112382414096594, -0.581199080396110, - 0.408248290463863, -0.816496580927726, 0.408248290463863, - -0.577350269189626, -0.577350269189626, -0.577350269189626, - 0.816496580927726, -0.408248290463863, -0.408248290463863, - 0.000000000000000, -0.707106781186548, 0.707106781186547, - 0, 0, -1, - 1, 0, 0, - 0, -1, 0, + -0.428667133548626, -0.566306918848035, -0.703946704147444, + 0.805963908589298, 0.112382414096594, -0.581199080396110, + 0.408248290463863, -0.816496580927726, 0.408248290463863, + -0.577350269189626, -0.577350269189626, -0.577350269189626, + 0.816496580927726, -0.408248290463863, -0.408248290463863, + 0.000000000000000, -0.707106781186548, 0.707106781186547, + 0, 0, -1, + 1, 0, 0, + 0, -1, 0, }; std::vector actual_u(27); U->download(actual_u); - for (size_t i = 0; i < 27; i++) EXPECT_NEAR(expected_u[i], actual_u[i], epsilon); - + for (size_t i = 0; i < 27; i++) + EXPECT_NEAR(expected_u[i], actual_u[i], epsilon); } TEST_F(SvdTest, singularValuesMultipleMatrices) { @@ -1091,11 +1175,14 @@ TEST_F(SvdTest, singularValuesMultipleMatrices) { * SVD for rank computation of multiple * matrices * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void singularValuesRankMultipleMatrices(float epsilon) { - std::vector aData = {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 0, - 1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12, - 1, 2, 3, 4, 2, 4, 6, 8, 3, 6, 9, 12}; + std::vector aData = { + 1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 0, + 1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12, + 1, 2, 3, 4, 2, 4, 6, 8, 3, 6, 9, 12 + }; DTensor A(aData, 4, 3, 3); Svd svd(A); @@ -1116,9 +1203,11 @@ TEST_F(SvdTest, singularValuesRankMultipleMatrices) { * ================================================================================================ */ class CholeskyTest : public testing::Test { protected: - CholeskyTest() {} + CholeskyTest() { + } - virtual ~CholeskyTest() {} + virtual ~CholeskyTest() { + } }; @@ -1126,11 +1215,14 @@ protected: * Cholesky factorisation * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void choleskyFactorisation(T epsilon) { - std::vector aData = {10.0, 2.0, 3.0, - 2.0, 20.0, -1.0, - 3.0, -1.0, 30.0}; + std::vector aData = { + 10.0, 2.0, 3.0, + 2.0, 20.0, -1.0, + 3.0, -1.0, 30.0 + }; DTensor A(aData, 3, 3, 1); CholeskyFactoriser chol(A); chol.factorise(); @@ -1149,11 +1241,14 @@ TEST_F(CholeskyTest, choleskyFactorisation) { * Cholesky factorisation: solve system * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void choleskyFactorisationSolution(T epsilon) { - std::vector aData = {10.0, 2.0, 3.0, - 2.0, 20.0, -1.0, - 3.0, -1.0, 30.0}; + std::vector aData = { + 10.0, 2.0, 3.0, + 2.0, 20.0, -1.0, + 3.0, -1.0, 30.0 + }; DTensor A(aData, 3, 3, 1); DTensor L(A); // L = A CholeskyFactoriser chol(L); @@ -1167,12 +1262,12 @@ void choleskyFactorisationSolution(T epsilon) { std::vector expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036}; std::vector actual(3); sol.download(actual); - for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon); + for (size_t i = 0; i < 3; i++) + EXPECT_NEAR(expected[i], actual[i], epsilon); DTensor error = A * sol; error -= rhs; EXPECT_TRUE(error.normF() < epsilon); - } TEST_F(CholeskyTest, choleskyFactorisationSolution) { @@ -1184,11 +1279,14 @@ TEST_F(CholeskyTest, choleskyFactorisationSolution) { * Batched Cholesky factorisation * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void choleskyBatchFactorisation(T epsilon) { - std::vector aData = {10.0, 2.0, 3.0, - 2.0, 20.0, -1.0, - 3.0, -1.0, 30.0}; + std::vector aData = { + 10.0, 2.0, 3.0, + 2.0, 20.0, -1.0, + 3.0, -1.0, 30.0 + }; DTensor A(3, 3, 2); DTensor A0(A, 2, 0, 0); DTensor A1(A, 2, 1, 1); @@ -1217,11 +1315,14 @@ TEST_F(CholeskyTest, choleskyBatchFactorisation) { * Batched Cholesky solve * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void choleskyBatchFactorSolve(T epsilon) { - std::vector aData = {10.0, 2.0, 3.0, - 2.0, 20.0, -1.0, - 3.0, -1.0, 30.0}; + std::vector aData = { + 10.0, 2.0, 3.0, + 2.0, 20.0, -1.0, + 3.0, -1.0, 30.0 + }; DTensor A(3, 3, 2); DTensor A0(A, 2, 0, 0); DTensor A1(A, 2, 1, 1); @@ -1241,8 +1342,10 @@ void choleskyBatchFactorSolve(T epsilon) { std::vector expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036}; std::vector actual(6); sol.download(actual); - for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon); // 0 - for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1 + for (size_t i = 0; i < 3; i++) + EXPECT_NEAR(expected[i], actual[i], epsilon); // 0 + for (size_t i = 0; i < 3; i++) + EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1 DTensor error = A * sol; error -= rhs; EXPECT_TRUE(error.normF() < epsilon); @@ -1258,19 +1361,24 @@ TEST_F(CholeskyTest, choleskyBatchFactorSolve) { * Batched Cholesky solve (factor provided) * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void choleskyBatchSolve(T epsilon) { - std::vector aData = {10.0, 2.0, 3.0, - 2.0, 20.0, -1.0, - 3.0, -1.0, 30.0}; + std::vector aData = { + 10.0, 2.0, 3.0, + 2.0, 20.0, -1.0, + 3.0, -1.0, 30.0 + }; DTensor A(3, 3, 2); DTensor A0(A, 2, 0, 0); DTensor A1(A, 2, 1, 1); A0.upload(aData); A1.upload(aData); - std::vector lowData = {3.162277660168380, 0, 0, - 0.632455532033676, 4.427188724235731, 0, - 0.948683298050514, -0.361403161162101, 5.382321781081287}; // from matlab + std::vector lowData = { + 3.162277660168380, 0, 0, + 0.632455532033676, 4.427188724235731, 0, + 0.948683298050514, -0.361403161162101, 5.382321781081287 + }; // from matlab DTensor low(3, 3, 2); DTensor low0(low, 2, 0, 0); DTensor low1(low, 2, 1, 1); @@ -1289,8 +1397,10 @@ void choleskyBatchSolve(T epsilon) { std::vector expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036}; std::vector actual(6); sol.download(actual); - for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon); // 0 - for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1 + for (size_t i = 0; i < 3; i++) + EXPECT_NEAR(expected[i], actual[i], epsilon); // 0 + for (size_t i = 0; i < 3; i++) + EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1 DTensor error = A * sol; error -= rhs; EXPECT_TRUE(error.normF() < epsilon); @@ -1308,9 +1418,11 @@ TEST_F(CholeskyTest, choleskyBatchSolve) { * ================================================================================================ */ class QRTest : public testing::Test { protected: - QRTest() {} + QRTest() { + } - virtual ~QRTest() {} + virtual ~QRTest() { + } }; @@ -1318,7 +1430,8 @@ protected: * QR factorisation * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void qrFactorisation(T epsilon) { size_t nR = 4; size_t nC = 3; @@ -1349,7 +1462,8 @@ TEST_F(QRTest, qrFactorisation) { * - tall and skinny matrix * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void qrFactorisationTall(T epsilon) { size_t nR = 20; size_t nC = 3; @@ -1379,19 +1493,24 @@ TEST_F(QRTest, qrFactorisationTall) { * QR factorisation: solve least squares * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void qrLeastSquares(T epsilon) { size_t nR = 4; size_t nC = 3; DTensor temp(nR, nC); - std::vector vecA = {85.5638, -59.4001, -80.1992, - 99.9464, 5.51393, 5.17935, - 6.87488, -26.7536, 36.0914, - -44.3857, -32.1268, 54.8915}; // Random matrix - std::vector vecB = {-23.3585, - -48.5744, - 43.4229, - -56.5081}; // Random vector + std::vector vecA = { + 85.5638, -59.4001, -80.1992, + 99.9464, 5.51393, 5.17935, + 6.87488, -26.7536, 36.0914, + -44.3857, -32.1268, 54.8915 + }; // Random matrix + std::vector vecB = { + -23.3585, + -48.5744, + 43.4229, + -56.5081 + }; // Random vector DTensor A(vecA, nR, nC, 1, rowMajor); DTensor b(vecB, nR); DTensor xFull(nR); @@ -1407,7 +1526,7 @@ void qrLeastSquares(T epsilon) { Ax.addAB(A, x); Ax -= b; T nrm = Ax.normF(); - EXPECT_NEAR(nrm, 80.003169364198072, epsilon); // From MatLab + EXPECT_NEAR(nrm, 80.003169364198072, epsilon); // From MatLab } TEST_F(QRTest, qrLeastSquares) { @@ -1421,9 +1540,11 @@ TEST_F(QRTest, qrLeastSquares) { * ================================================================================================ */ class NullspaceTest : public testing::Test { protected: - NullspaceTest() {} + NullspaceTest() { + } - virtual ~NullspaceTest() {} + virtual ~NullspaceTest() { + } }; @@ -1431,13 +1552,16 @@ protected: * Basic nullspace test * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void computeNullspaceTensor(T epsilon) { - std::vector aData = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 7, 8, 9, - 1, 2, 3, 4, 2, 4, 6, 8, 3, 6, 9, 12, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector aData = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 7, 8, 9, + 1, 2, 3, 4, 2, 4, 6, 8, 3, 6, 9, 12, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; DTensor A(aData, 3, 4, 5); Nullspace ns(A); DTensor nA = ns.nullspace(); @@ -1471,14 +1595,17 @@ TEST_F(NullspaceTest, computeNullspaceTensor) { * Nullspace is trivial * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void computeNullspaceTrivial(T epsilon) { - std::vector data{4, 5, 7, - 4, 1, 8, - 4, 5, 0, - 1, 1, 1, - 5, 6, 7, - 9, 0, 3}; + std::vector data{ + 4, 5, 7, + 4, 1, 8, + 4, 5, 0, + 1, 1, 1, + 5, 6, 7, + 9, 0, 3 + }; DTensor A(data, 3, 3, 2, rowMajor); Nullspace nullA(A); DTensor N = nullA.nullspace(); @@ -1494,14 +1621,17 @@ TEST_F(NullspaceTest, computeNullspaceTrivial) { * Project onto nullspace * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void projectOnNullspaceTensor(T epsilon) { // offline size_t m = 3; size_t n = 7; - std::vector mat{1, -2, 3, 4, -1, -1, -1, - 1, 2, -3, 4, -1, -1, -1, - -1, 3, 5, -7, -1, -1, -1}; + std::vector mat{ + 1, -2, 3, 4, -1, -1, -1, + 1, 2, -3, 4, -1, -1, -1, + -1, 3, 5, -7, -1, -1, -1 + }; DTensor A(m, n, 1); A.upload(mat, rowMajor); Nullspace ns = Nullspace(A); @@ -1538,9 +1668,11 @@ TEST_F(NullspaceTest, projectOnNullspaceTensor) { * ================================================================================================ */ class GivensAnnihilatorTest : public testing::Test { protected: - GivensAnnihilatorTest() {} + GivensAnnihilatorTest() { + } - virtual ~GivensAnnihilatorTest() {} + virtual ~GivensAnnihilatorTest() { + } }; @@ -1548,7 +1680,8 @@ protected: * GivensAnnihilator works * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void givensAnnihilateElement(T epsilon) { size_t m = 10; size_t n = 6; @@ -1573,12 +1706,12 @@ TEST_F(GivensAnnihilatorTest, givensAnnihilateElement) { } - /* --------------------------------------- * GivensAnnihilator: correctness * --------------------------------------- */ -TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void givensAnnihilateCorrectness(T epsilon) { size_t m = 10, n = 6; std::vector v(m * n); @@ -1593,13 +1726,9 @@ void givensAnnihilateCorrectness(T epsilon) { EXPECT_NEAR(2.137186834969645, a(0, 0), epsilon); EXPECT_NEAR(44.552125559751822, a(0, 3), epsilon); EXPECT_NEAR(-0.328797974610715, a(1, 3), epsilon); - } TEST_F(GivensAnnihilatorTest, givensAnnihilateCorrectness) { givensAnnihilateCorrectness(1e-14); givensAnnihilateCorrectness(1e-12); } - - -