Change dense eigenvalue kernels to expect row-major data

nbeams · nbeams · commit b0f97642fe4c · 2025-04-28T22:28:53.000Z
diff --git a/common/cuda_hip/eigensolver/lobpcg_kernels.cpp b/common/cuda_hip/eigensolver/lobpcg_kernels.cpp
@@ -6,6 +6,7 @@
 
 #include <limits>
 
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
 #include "common/cuda_hip/base/blas_bindings.hpp"
@@ -28,6 +29,36 @@ constexpr int default_block_size = 512;
 namespace kernel {
 
 
+// Conjugates, in place, the leading n x n entries of the row-major array `a`.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void matrix_conj(
+    const int32 n, ValueType* a, const int32 a_stride)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto row = tidx / n;  // requires n != 0
+    const auto col = tidx % n;  // always < n, so only the row needs a guard
+    if (row < n) {
+        a[row * a_stride + col] = conj(a[row * a_stride + col]);
+    }
+}
+
+
+// Conjugates, in place, the leading n x n entries of row-major `a` and `b`.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void two_matrix_conj(
+    const int32 n, ValueType* a, const int32 a_stride, ValueType* b,
+    const int32 b_stride)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto row = tidx / n;  // requires n != 0
+    const auto col = tidx % n;  // always < n, so only the row needs a guard
+    if (row < n) {
+        a[row * a_stride + col] = conj(a[row * a_stride + col]);
+        b[row * b_stride + col] = conj(b[row * b_stride + col]);
+    }
+}
+
+
 template <typename ValueType>
 __global__ __launch_bounds__(default_block_size) void fill_lower_col_major(
     const int32 n, const ValueType* source, const int32 source_stride,
@@ -64,8 +95,19 @@ void symm_eig(std::shared_ptr<const DefaultExecutor> exec,
         throw OverflowError(__FILE__, __LINE__,
                             name_demangling::get_type_name(typeid(int32)));
     }
-    int32 n = static_cast<int32>(a->get_size()[1]);  // column-major
+    int32 n = static_cast<int32>(a->get_size()[0]);
     int32 lda = static_cast<int32>(a->get_stride());
+    // dev_lapack expects column-major data; for Hermitian A, conj(A) == A^T.
+    // n * n is widened to int64 because it overflows int32 for n > 46340.
+    if constexpr (gko::is_complex_s<ValueType>::value) {
+        const auto grid_dim = ceildiv(int64{n} * n, default_block_size);
+        if (grid_dim > 0) {
+            kernel::matrix_conj<<<grid_dim, default_block_size, 0,
+                                  exec->get_stream()>>>(
+                n, as_device_type(a->get_values()), lda);
+        }
+    }
+
     int32 fp_buffer_num_elems;
     dev_lapack::syevd_buffersize(handle, LAPACK_EIG_VECTOR, LAPACK_FILL_LOWER,
                                  n, a->get_values(), lda, e_vals->get_data(),
@@ -119,9 +161,22 @@ void symm_generalized_eig(std::shared_ptr<const DefaultExecutor> exec,
         throw OverflowError(__FILE__, __LINE__,
                             name_demangling::get_type_name(typeid(int32)));
     }
-    int32 n = static_cast<int32>(a->get_size()[1]);  // column-major
+
+    int32 n = static_cast<int32>(a->get_size()[0]);
     int32 lda = static_cast<int32>(a->get_stride());
     int32 ldb = static_cast<int32>(b->get_stride());
+    // dev_lapack expects column-major data; for Hermitian A, conj(A) == A^T.
+    // n * n is widened to int64 because it overflows int32 for n > 46340.
+    if constexpr (gko::is_complex_s<ValueType>::value) {
+        const auto grid_dim = ceildiv(int64{n} * n, default_block_size);
+        if (grid_dim > 0) {
+            kernel::two_matrix_conj<<<grid_dim, default_block_size, 0,
+                                      exec->get_stream()>>>(
+                n, as_device_type(a->get_values()), lda,
+                as_device_type(b->get_values()), ldb);
+        }
+    }
+
     int32 fp_buffer_num_elems;
     dev_lapack::sygvd_buffersize(handle, LAPACK_EIG_TYPE_1, LAPACK_EIG_VECTOR,
                                  LAPACK_FILL_LOWER, n, a->get_values(), lda,
diff --git a/reference/eigensolver/lobpcg_kernels.cpp b/reference/eigensolver/lobpcg_kernels.cpp
@@ -4,6 +4,7 @@
 
 #include "core/eigensolver/lobpcg_kernels.hpp"
 
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
 #include "reference/base/blas_bindings.hpp"
@@ -65,6 +66,16 @@ void symm_eig(std::shared_ptr<const ReferenceExecutor> exec,
                       e_vals->get_data(), work, &fp_buffer_num_elems, iwork,
                       &int_buffer_num_elems);
     } else {  // Complex data type
+
+        // LAPACK expects column-major data, so we need to take the conjugate
+        // of the input matrix (same as performing A = A^T)
+        ValueType* data = a->get_values();
+        for (int32 row = 0; row < n; ++row) {
+            for (int32 col = 0; col < n; ++col) {
+                data[row * lda + col] = conj(data[row * lda + col]);
+            }
+        }
+
         int32 fp_buffer_num_elems;
         int32 rfp_buffer_num_elems;
         int32 int_buffer_num_elems;
@@ -151,6 +162,18 @@ void symm_generalized_eig(std::shared_ptr<const ReferenceExecutor> exec,
                       b->get_values(), &ldb, e_vals->get_data(), work,
                       &fp_buffer_num_elems, iwork, &int_buffer_num_elems);
     } else {  // Complex data type
+
+        // LAPACK expects column-major data, so conjugate both inputs (same as
+        // A = A^T for Hermitian matrices); B is indexed by its own stride ldb.
+        ValueType* a_data = a->get_values();
+        ValueType* b_data = b->get_values();
+        for (int32 row = 0; row < n; ++row) {
+            for (int32 col = 0; col < n; ++col) {
+                a_data[row * lda + col] = conj(a_data[row * lda + col]);
+                b_data[row * ldb + col] = conj(b_data[row * ldb + col]);
+            }
+        }
+
         int32 fp_buffer_num_elems;
         int32 rfp_buffer_num_elems;
         int32 int_buffer_num_elems;
diff --git a/reference/test/eigensolver/lobpcg_kernels.cpp b/reference/test/eigensolver/lobpcg_kernels.cpp
@@ -90,11 +90,6 @@ TYPED_TEST(Lobpcg, KernelSymmEig)
 
     if constexpr (gko::is_complex_s<value_type>::value) {
         small_a_copy = gko::clone(this->small_a_cmplx);
-        // The kernel expects column-major, so transpose the matrices
-        auto small_a_t =
-            gko::share(gko::as<Mtx>(this->small_a_cmplx->transpose()));
-        this->small_a_cmplx = small_a_t;
-
         small_a = this->small_a_cmplx;
     } else {
         small_a_copy = gko::clone(this->small_a_r);
@@ -104,9 +99,9 @@ TYPED_TEST(Lobpcg, KernelSymmEig)
     gko::kernels::reference::lobpcg::symm_eig(this->exec, small_a.get(),
                                               &(this->small_e_vals), &work);
 
-    // On exit, the eigenvectors will be stored in the A
-    // matrix. We create submatrices for the vectors
-    // to check that A * x = lambda * x for each vector.
+    // On exit, the eigenvectors will be stored in the rows of the A matrix.
+    // We create submatrices for the vectors to check that A * x = lambda * x
+    // for each vector.
     for (gko::size_type i = 0; i < this->small_e_vals.get_size(); i++) {
         auto evec = gko::share(Mtx::create(
             this->exec, gko::dim<2>{this->small_e_vals.get_size(), 1},
@@ -149,21 +144,14 @@ TYPED_TEST(Lobpcg, KernelSymmGeneralizedEig)
     auto work = gko::array<char>(this->exec, 1);
     std::shared_ptr<Mtx> small_a;
     std::shared_ptr<Mtx> small_b;
-
+    // Both A and B will be overwritten by the LAPACK call; store copies for
+    // the final check.
     std::shared_ptr<Mtx> small_a_copy;
     std::shared_ptr<Mtx> small_b_copy;
 
     if constexpr (gko::is_complex_s<value_type>::value) {
         small_a_copy = gko::clone(this->small_a_cmplx);
         small_b_copy = gko::clone(this->small_b_cmplx);
-        // The kernel expects column-major, so transpose the matrices
-        auto small_a_t =
-            gko::share(gko::as<Mtx>(this->small_a_cmplx->transpose()));
-        auto small_b_t =
-            gko::share(gko::as<Mtx>(this->small_b_cmplx->transpose()));
-        this->small_a_cmplx = small_a_t;
-        this->small_b_cmplx = small_b_t;
-
         small_a = this->small_a_cmplx;
         small_b = this->small_b_cmplx;
     } else {
@@ -176,9 +164,9 @@ TYPED_TEST(Lobpcg, KernelSymmGeneralizedEig)
     gko::kernels::reference::lobpcg::symm_generalized_eig(
         this->exec, small_a.get(), small_b.get(), &(this->small_e_vals), &work);
 
-    // On exit, the eigenvectors will be stored in the A
-    // matrix. We create submatrices for the vectors
-    // to check that A * x = lambda * B * x for each vector.
+    // On exit, the eigenvectors will be stored in the rows of the A matrix.
+    // We create submatrices for the vectors to check that
+    // A * x = lambda * B * x for each vector.
     for (gko::size_type i = 0; i < this->small_e_vals.get_size(); i++) {
         auto evec = gko::share(Mtx::create(
             this->exec, gko::dim<2>{this->small_e_vals.get_size(), 1},
@@ -196,7 +184,7 @@ TYPED_TEST(Lobpcg, KernelSymmGeneralizedEig)
         } else {
             lambda = lambda_r;
         }
-        // A*x = lambda * B * x;
+        // A * x = lambda * B * x;
         auto a_x = Mtx::create(this->exec,
                                gko::dim<2>{this->small_e_vals.get_size(), 1});
         auto lambda_b_x = Mtx::create(
diff --git a/test/eigensolver/lobpcg_kernels.cpp b/test/eigensolver/lobpcg_kernels.cpp
@@ -101,21 +101,10 @@ TYPED_TEST(Lobpcg, KernelSymmEigIsEquivalentToRef)
     auto refwork = gko::array<char>(this->ref, 1);
     auto d_work = gko::array<char>(this->exec, 1);
 
-    std::shared_ptr<Mtx> d_small_a_copy;
-
     if constexpr (gko::is_complex_s<value_type>::value) {
-        auto small_a_t =
-            gko::share(gko::as<Mtx>(this->small_a_cmplx->transpose()));
-        this->small_a_cmplx = small_a_t;
         this->small_a = this->small_a_cmplx;
-
-        d_small_a_copy = gko::clone(this->d_small_a_cmplx);
-        auto d_small_a_t =
-            gko::share(gko::as<Mtx>(this->d_small_a_cmplx->transpose()));
-        this->d_small_a_cmplx = d_small_a_t;
         this->d_small_a = this->d_small_a_cmplx;
     } else {
-        d_small_a_copy = gko::clone(this->d_small_a_r);
         this->small_a = this->small_a_r;
         this->d_small_a = this->d_small_a_r;
     }
@@ -171,35 +160,12 @@ TYPED_TEST(Lobpcg, KernelSymmGeneralizedEigIsEquivalentToRef)
     auto refwork = gko::array<char>(this->ref, 1);
     auto d_work = gko::array<char>(this->exec, 1);
 
-    std::shared_ptr<Mtx> d_small_a_copy;
-    std::shared_ptr<Mtx> d_small_b_copy;
-
     if constexpr (gko::is_complex_s<value_type>::value) {
-        auto small_a_t =
-            gko::share(gko::as<Mtx>(this->small_a_cmplx->transpose()));
-        auto small_b_t =
-            gko::share(gko::as<Mtx>(this->small_b_cmplx->transpose()));
-        this->small_a_cmplx = small_a_t;
-        this->small_b_cmplx = small_b_t;
-
         this->small_a = this->small_a_cmplx;
         this->small_b = this->small_b_cmplx;
-
-
-        d_small_a_copy = gko::clone(this->d_small_a_cmplx);
-        d_small_b_copy = gko::clone(this->d_small_b_cmplx);
-        auto d_small_a_t =
-            gko::share(gko::as<Mtx>(this->d_small_a_cmplx->transpose()));
-        auto d_small_b_t =
-            gko::share(gko::as<Mtx>(this->d_small_b_cmplx->transpose()));
-        this->d_small_a_cmplx = d_small_a_t;
-        this->d_small_b_cmplx = d_small_b_t;
-
         this->d_small_a = this->d_small_a_cmplx;
         this->d_small_b = this->d_small_b_cmplx;
     } else {
-        d_small_a_copy = gko::clone(this->d_small_a_r);
-        d_small_b_copy = gko::clone(this->d_small_b_r);
         this->small_a = this->small_a_r;
         this->small_b = this->small_b_r;
         this->d_small_a = this->d_small_a_r;