Skip to content

Commit b0f9764

Browse files
committed
Change dense eigenvalue kernels to expect row-major data
1 parent 3c600f1 commit b0f9764

File tree

4 files changed

+89
-57
lines changed

4 files changed

+89
-57
lines changed

common/cuda_hip/eigensolver/lobpcg_kernels.cpp

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <limits>
88

9+
#include <ginkgo/core/base/math.hpp>
910
#include <ginkgo/core/base/types.hpp>
1011

1112
#include "common/cuda_hip/base/blas_bindings.hpp"
@@ -28,6 +29,36 @@ constexpr int default_block_size = 512;
2829
namespace kernel {
2930

3031

32+
template <typename ValueType>
33+
__global__ __launch_bounds__(default_block_size) void matrix_conj(
34+
const int32 n, ValueType* a, const int32 a_stride)
35+
{
36+
const auto tidx = thread::get_thread_id_flat();
37+
const auto row = tidx / n;
38+
const auto col = tidx % n;
39+
const ValueType zero = gko::zero<ValueType>();
40+
if (row < n && col < n) {
41+
a[row * a_stride + col] = conj(a[row * a_stride + col]);
42+
}
43+
}
44+
45+
46+
template <typename ValueType>
47+
__global__ __launch_bounds__(default_block_size) void two_matrix_conj(
48+
const int32 n, ValueType* a, const int32 a_stride, ValueType* b,
49+
const int32 b_stride)
50+
{
51+
const auto tidx = thread::get_thread_id_flat();
52+
const auto row = tidx / n;
53+
const auto col = tidx % n;
54+
const ValueType zero = gko::zero<ValueType>();
55+
if (row < n && col < n) {
56+
a[row * a_stride + col] = conj(a[row * a_stride + col]);
57+
b[row * b_stride + col] = conj(b[row * b_stride + col]);
58+
}
59+
}
60+
61+
3162
template <typename ValueType>
3263
__global__ __launch_bounds__(default_block_size) void fill_lower_col_major(
3364
const int32 n, const ValueType* source, const int32 source_stride,
@@ -64,8 +95,19 @@ void symm_eig(std::shared_ptr<const DefaultExecutor> exec,
6495
throw OverflowError(__FILE__, __LINE__,
6596
name_demangling::get_type_name(typeid(int32)));
6697
}
67-
int32 n = static_cast<int32>(a->get_size()[1]); // column-major
98+
int32 n = static_cast<int32>(a->get_size()[0]);
6899
int32 lda = static_cast<int32>(a->get_stride());
100+
// The dev_lapack routine expects column-major data, so we take the
101+
// conjugate to perform A = A^T.
102+
if constexpr (gko::is_complex_s<ValueType>::value) {
103+
const auto grid_dim = ceildiv(n * n, default_block_size);
104+
if (grid_dim > 0) {
105+
kernel::matrix_conj<<<grid_dim, default_block_size, 0,
106+
exec->get_stream()>>>(
107+
n, as_device_type(a->get_values()), lda);
108+
}
109+
}
110+
69111
int32 fp_buffer_num_elems;
70112
dev_lapack::syevd_buffersize(handle, LAPACK_EIG_VECTOR, LAPACK_FILL_LOWER,
71113
n, a->get_values(), lda, e_vals->get_data(),
@@ -119,9 +161,22 @@ void symm_generalized_eig(std::shared_ptr<const DefaultExecutor> exec,
119161
throw OverflowError(__FILE__, __LINE__,
120162
name_demangling::get_type_name(typeid(int32)));
121163
}
122-
int32 n = static_cast<int32>(a->get_size()[1]); // column-major
164+
165+
int32 n = static_cast<int32>(a->get_size()[0]);
123166
int32 lda = static_cast<int32>(a->get_stride());
124167
int32 ldb = static_cast<int32>(b->get_stride());
168+
// The dev_lapack routine expects column-major data, so we take the
169+
// conjugate to perform A = A^T.
170+
if constexpr (gko::is_complex_s<ValueType>::value) {
171+
const auto grid_dim = ceildiv(n * n, default_block_size);
172+
if (grid_dim > 0) {
173+
kernel::two_matrix_conj<<<grid_dim, default_block_size, 0,
174+
exec->get_stream()>>>(
175+
n, as_device_type(a->get_values()), lda,
176+
as_device_type(b->get_values()), ldb);
177+
}
178+
}
179+
125180
int32 fp_buffer_num_elems;
126181
dev_lapack::sygvd_buffersize(handle, LAPACK_EIG_TYPE_1, LAPACK_EIG_VECTOR,
127182
LAPACK_FILL_LOWER, n, a->get_values(), lda,

reference/eigensolver/lobpcg_kernels.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#include "core/eigensolver/lobpcg_kernels.hpp"
66

7+
#include <ginkgo/core/base/math.hpp>
78
#include <ginkgo/core/base/types.hpp>
89

910
#include "reference/base/blas_bindings.hpp"
@@ -65,6 +66,16 @@ void symm_eig(std::shared_ptr<const ReferenceExecutor> exec,
6566
e_vals->get_data(), work, &fp_buffer_num_elems, iwork,
6667
&int_buffer_num_elems);
6768
} else { // Complex data type
69+
70+
// LAPACK expects column-major data, so we need to take the conjugate
71+
// of the input matrix (same as performing A = A^T)
72+
ValueType* data = a->get_values();
73+
for (int32 row = 0; row < n; ++row) {
74+
for (int32 col = 0; col < n; ++col) {
75+
data[row * lda + col] = conj(data[row * lda + col]);
76+
}
77+
}
78+
6879
int32 fp_buffer_num_elems;
6980
int32 rfp_buffer_num_elems;
7081
int32 int_buffer_num_elems;
@@ -151,6 +162,18 @@ void symm_generalized_eig(std::shared_ptr<const ReferenceExecutor> exec,
151162
b->get_values(), &ldb, e_vals->get_data(), work,
152163
&fp_buffer_num_elems, iwork, &int_buffer_num_elems);
153164
} else { // Complex data type
165+
166+
// LAPACK expects column-major data, so we need to take the conjugate
167+
// of the input matrices (same as performing A = A^T)
168+
ValueType* a_data = a->get_values();
169+
ValueType* b_data = b->get_values();
170+
for (int32 row = 0; row < n; ++row) {
171+
for (int32 col = 0; col < n; ++col) {
172+
a_data[row * lda + col] = conj(a_data[row * lda + col]);
173+
b_data[row * lda + col] = conj(b_data[row * lda + col]);
174+
}
175+
}
176+
154177
int32 fp_buffer_num_elems;
155178
int32 rfp_buffer_num_elems;
156179
int32 int_buffer_num_elems;

reference/test/eigensolver/lobpcg_kernels.cpp

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,6 @@ TYPED_TEST(Lobpcg, KernelSymmEig)
9090

9191
if constexpr (gko::is_complex_s<value_type>::value) {
9292
small_a_copy = gko::clone(this->small_a_cmplx);
93-
// The kernel expects column-major, so transpose the matrices
94-
auto small_a_t =
95-
gko::share(gko::as<Mtx>(this->small_a_cmplx->transpose()));
96-
this->small_a_cmplx = small_a_t;
97-
9893
small_a = this->small_a_cmplx;
9994
} else {
10095
small_a_copy = gko::clone(this->small_a_r);
@@ -104,9 +99,9 @@ TYPED_TEST(Lobpcg, KernelSymmEig)
10499
gko::kernels::reference::lobpcg::symm_eig(this->exec, small_a.get(),
105100
&(this->small_e_vals), &work);
106101

107-
// On exit, the eigenvectors will be stored in the A
108-
// matrix. We create submatrices for the vectors
109-
// to check that A * x = lambda * x for each vector.
102+
// On exit, the eigenvectors will be stored in the rows of the A matrix.
103+
// We create submatrices for the vectors to check that A * x = lambda * x
104+
// for each vector.
110105
for (gko::size_type i = 0; i < this->small_e_vals.get_size(); i++) {
111106
auto evec = gko::share(Mtx::create(
112107
this->exec, gko::dim<2>{this->small_e_vals.get_size(), 1},
@@ -149,21 +144,14 @@ TYPED_TEST(Lobpcg, KernelSymmGeneralizedEig)
149144
auto work = gko::array<char>(this->exec, 1);
150145
std::shared_ptr<Mtx> small_a;
151146
std::shared_ptr<Mtx> small_b;
152-
147+
// Both A and B will be overwritten by the LAPACK call; store copies for
148+
// the final check.
153149
std::shared_ptr<Mtx> small_a_copy;
154150
std::shared_ptr<Mtx> small_b_copy;
155151

156152
if constexpr (gko::is_complex_s<value_type>::value) {
157153
small_a_copy = gko::clone(this->small_a_cmplx);
158154
small_b_copy = gko::clone(this->small_b_cmplx);
159-
// The kernel expects column-major, so transpose the matrices
160-
auto small_a_t =
161-
gko::share(gko::as<Mtx>(this->small_a_cmplx->transpose()));
162-
auto small_b_t =
163-
gko::share(gko::as<Mtx>(this->small_b_cmplx->transpose()));
164-
this->small_a_cmplx = small_a_t;
165-
this->small_b_cmplx = small_b_t;
166-
167155
small_a = this->small_a_cmplx;
168156
small_b = this->small_b_cmplx;
169157
} else {
@@ -176,9 +164,9 @@ TYPED_TEST(Lobpcg, KernelSymmGeneralizedEig)
176164
gko::kernels::reference::lobpcg::symm_generalized_eig(
177165
this->exec, small_a.get(), small_b.get(), &(this->small_e_vals), &work);
178166

179-
// On exit, the eigenvectors will be stored in the A
180-
// matrix. We create submatrices for the vectors
181-
// to check that A * x = lambda * B * x for each vector.
167+
// On exit, the eigenvectors will be stored in the rows of the A matrix.
168+
// We create submatrices for the vectors to check that
169+
// A * x = lambda * B * x for each vector.
182170
for (gko::size_type i = 0; i < this->small_e_vals.get_size(); i++) {
183171
auto evec = gko::share(Mtx::create(
184172
this->exec, gko::dim<2>{this->small_e_vals.get_size(), 1},
@@ -196,7 +184,7 @@ TYPED_TEST(Lobpcg, KernelSymmGeneralizedEig)
196184
} else {
197185
lambda = lambda_r;
198186
}
199-
// A*x = lambda * B * x;
187+
// A * x = lambda * B * x;
200188
auto a_x = Mtx::create(this->exec,
201189
gko::dim<2>{this->small_e_vals.get_size(), 1});
202190
auto lambda_b_x = Mtx::create(

test/eigensolver/lobpcg_kernels.cpp

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -101,21 +101,10 @@ TYPED_TEST(Lobpcg, KernelSymmEigIsEquivalentToRef)
101101
auto refwork = gko::array<char>(this->ref, 1);
102102
auto d_work = gko::array<char>(this->exec, 1);
103103

104-
std::shared_ptr<Mtx> d_small_a_copy;
105-
106104
if constexpr (gko::is_complex_s<value_type>::value) {
107-
auto small_a_t =
108-
gko::share(gko::as<Mtx>(this->small_a_cmplx->transpose()));
109-
this->small_a_cmplx = small_a_t;
110105
this->small_a = this->small_a_cmplx;
111-
112-
d_small_a_copy = gko::clone(this->d_small_a_cmplx);
113-
auto d_small_a_t =
114-
gko::share(gko::as<Mtx>(this->d_small_a_cmplx->transpose()));
115-
this->d_small_a_cmplx = d_small_a_t;
116106
this->d_small_a = this->d_small_a_cmplx;
117107
} else {
118-
d_small_a_copy = gko::clone(this->d_small_a_r);
119108
this->small_a = this->small_a_r;
120109
this->d_small_a = this->d_small_a_r;
121110
}
@@ -171,35 +160,12 @@ TYPED_TEST(Lobpcg, KernelSymmGeneralizedEigIsEquivalentToRef)
171160
auto refwork = gko::array<char>(this->ref, 1);
172161
auto d_work = gko::array<char>(this->exec, 1);
173162

174-
std::shared_ptr<Mtx> d_small_a_copy;
175-
std::shared_ptr<Mtx> d_small_b_copy;
176-
177163
if constexpr (gko::is_complex_s<value_type>::value) {
178-
auto small_a_t =
179-
gko::share(gko::as<Mtx>(this->small_a_cmplx->transpose()));
180-
auto small_b_t =
181-
gko::share(gko::as<Mtx>(this->small_b_cmplx->transpose()));
182-
this->small_a_cmplx = small_a_t;
183-
this->small_b_cmplx = small_b_t;
184-
185164
this->small_a = this->small_a_cmplx;
186165
this->small_b = this->small_b_cmplx;
187-
188-
189-
d_small_a_copy = gko::clone(this->d_small_a_cmplx);
190-
d_small_b_copy = gko::clone(this->d_small_b_cmplx);
191-
auto d_small_a_t =
192-
gko::share(gko::as<Mtx>(this->d_small_a_cmplx->transpose()));
193-
auto d_small_b_t =
194-
gko::share(gko::as<Mtx>(this->d_small_b_cmplx->transpose()));
195-
this->d_small_a_cmplx = d_small_a_t;
196-
this->d_small_b_cmplx = d_small_b_t;
197-
198166
this->d_small_a = this->d_small_a_cmplx;
199167
this->d_small_b = this->d_small_b_cmplx;
200168
} else {
201-
d_small_a_copy = gko::clone(this->d_small_a_r);
202-
d_small_b_copy = gko::clone(this->d_small_b_r);
203169
this->small_a = this->small_a_r;
204170
this->small_b = this->small_b_r;
205171
this->d_small_a = this->d_small_a_r;

0 commit comments

Comments
 (0)