Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions common/cuda_hip/matrix/dense_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -792,6 +792,34 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);


template <typename ValueType, typename IndexType>
void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* a,
const matrix::Csr<ValueType, IndexType>* b,
matrix::Dense<ValueType>* c)
{
// TODO: implement c = a * b with single thread
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// TODO: implement c = a * b with single thread

sorry for my wrong copy. you can delete this comment here

GKO_NOT_IMPLEMENTED;
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);


template <typename ValueType, typename IndexType>
void mspm(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* alpha,
const matrix::Dense<ValueType>* a,
const matrix::Csr<ValueType, IndexType>* b,
const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
{
// TODO: implement c = alpha * a * b + beta * c with single thread
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// TODO: implement c = alpha * a * b + beta * c with single thread

you can delete this, too

GKO_NOT_IMPLEMENTED;
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);


template <typename ValueType>
void transpose(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* orig,
Expand Down
2 changes: 2 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,8 @@ namespace dense {

GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);
GKO_STUB_VALUE_CONVERSION_OR_COPY(GKO_DECLARE_DENSE_COPY_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL);
GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL);
Expand Down
49 changes: 37 additions & 12 deletions core/matrix/dense.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ namespace {

GKO_REGISTER_OPERATION(simple_apply, dense::simple_apply);
GKO_REGISTER_OPERATION(apply, dense::apply);
GKO_REGISTER_OPERATION(simple_mspm, dense::simple_mspm);
GKO_REGISTER_OPERATION(mspm, dense::mspm);
GKO_REGISTER_OPERATION(copy, dense::copy);
GKO_REGISTER_OPERATION(fill, dense::fill);
GKO_REGISTER_OPERATION(scale, dense::scale);
Expand Down Expand Up @@ -110,25 +112,48 @@ GKO_REGISTER_OPERATION(add_scaled_identity, dense::add_scaled_identity);
template <typename ValueType>
void Dense<ValueType>::apply_impl(const LinOp* b, LinOp* x) const
{
precision_dispatch_real_complex<ValueType>(
[this](auto dense_b, auto dense_x) {
this->get_executor()->run(
dense::make_simple_apply(this, dense_b, dense_x));
},
b, x);
// TODO: it does not consider mixed precision for MSpM
if (auto b_csr =
dynamic_cast<const matrix::Csr<ValueType, gko::int32>*>(b)) {
this->get_executor()->run(
dense::make_simple_mspm(this, b_csr, as<Dense>(x)));
} else if (auto b_csr =
dynamic_cast<const matrix::Csr<ValueType, gko::int64>*>(b)) {
this->get_executor()->run(
dense::make_simple_mspm(this, b_csr, as<Dense>(x)));
} else {
precision_dispatch_real_complex<ValueType>(
[this](auto dense_b, auto dense_x) {
this->get_executor()->run(
dense::make_simple_apply(this, dense_b, dense_x));
},
b, x);
}
}


template <typename ValueType>
void Dense<ValueType>::apply_impl(const LinOp* alpha, const LinOp* b,
const LinOp* beta, LinOp* x) const
{
precision_dispatch_real_complex<ValueType>(
[this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
this->get_executor()->run(dense::make_apply(
dense_alpha, this, dense_b, dense_beta, dense_x));
},
alpha, b, beta, x);
// TODO: it does not consider mixed precision for MSpM
if (auto b_csr =
dynamic_cast<const matrix::Csr<ValueType, gko::int32>*>(b)) {
this->get_executor()->run(dense::make_mspm(
as<Dense>(alpha), this, b_csr, as<Dense>(beta), as<Dense>(x)));
} else if (auto b_csr =
dynamic_cast<const matrix::Csr<ValueType, gko::int64>*>(b)) {
this->get_executor()->run(dense::make_mspm(
as<Dense>(alpha), this, b_csr, as<Dense>(beta), as<Dense>(x)));
} else {
precision_dispatch_real_complex<ValueType>(
[this](auto dense_alpha, auto dense_b, auto dense_beta,
auto dense_x) {
this->get_executor()->run(dense::make_apply(
dense_alpha, this, dense_b, dense_beta, dense_x));
},
alpha, b, beta, x);
}
}


Expand Down
19 changes: 18 additions & 1 deletion core/matrix/dense_kernels.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

Expand Down Expand Up @@ -31,6 +31,19 @@ namespace kernels {
const matrix::Dense<_type>* a, const matrix::Dense<_type>* b, \
const matrix::Dense<_type>* beta, matrix::Dense<_type>* c)

#define GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL(_vtype, _itype) \
void simple_mspm(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_vtype>* a, \
const matrix::Csr<_vtype, _itype>* b, \
matrix::Dense<_vtype>* c)

#define GKO_DECLARE_DENSE_MSPM_KERNEL(_vtype, _itype) \
void mspm(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_vtype>* alpha, \
const matrix::Dense<_vtype>* a, \
const matrix::Csr<_vtype, _itype>* b, \
const matrix::Dense<_vtype>* beta, matrix::Dense<_vtype>* c)

#define GKO_DECLARE_DENSE_COPY_KERNEL(_intype, _outtype) \
void copy(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_intype>* input, \
Expand Down Expand Up @@ -354,6 +367,10 @@ namespace kernels {
GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_APPLY_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_DENSE_MSPM_KERNEL(ValueType, IndexType); \
template <typename InValueType, typename OutValueType> \
GKO_DECLARE_DENSE_COPY_KERNEL(InValueType, OutValueType); \
template <typename ValueType> \
Expand Down
28 changes: 28 additions & 0 deletions dpcpp/matrix/dense_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,34 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);


template <typename ValueType, typename IndexType>
void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* a,
const matrix::Csr<ValueType, IndexType>* b,
matrix::Dense<ValueType>* c)
{
// TODO: implement c = a * b with single thread
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// TODO: implement c = a * b with single thread

GKO_NOT_IMPLEMENTED;
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);


template <typename ValueType, typename IndexType>
void mspm(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* alpha,
const matrix::Dense<ValueType>* a,
const matrix::Csr<ValueType, IndexType>* b,
const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
{
// TODO: implement c = alpha * a * b + beta * c with single thread
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// TODO: implement c = alpha * a * b + beta * c with single thread

GKO_NOT_IMPLEMENTED;
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);


template <typename ValueType, typename IndexType>
void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* source,
Expand Down
86 changes: 86 additions & 0 deletions omp/matrix/dense_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "accessor/block_col_major.hpp"
#include "accessor/range.hpp"
#include "core/components/prefix_sum_kernels.hpp"
#include "core/matrix/csr_accessor_helper.hpp"


namespace gko {
Expand Down Expand Up @@ -138,6 +139,91 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);

template <typename ValueType, typename IndexType, typename InitAcc, typename DefMultOperand>
void mspm_auxiliary(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* a,
const matrix::Csr<ValueType, IndexType>* b,
matrix::Dense<ValueType>* c,
InitAcc initialize_accumulator,
DefMultOperand define_multiplication_operand)
Comment on lines +147 to +148
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
InitAcc initialize_accumulator,
DefMultOperand define_multiplication_operand)
Init init,
MultOperand mult_op)

I might shorten these names.

{
//initialization
const auto b_rowptrs = b->get_const_row_ptrs();
const auto b_cols = b->get_const_col_idxs();
const auto a_vals = acc::helper::build_const_rrm_accessor<ValueType>(a);
const auto b_vals = acc::helper::build_const_rrm_accessor<ValueType>(b);
Comment on lines +153 to +154
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is unnecessary because the kernel is working for uniform precision now.
Does it affect your performance?

const auto c_vals_ptr = c->get_values();
//accumulate partial results of a row
const auto sub_acc_size = b->get_size()[1]; //each accumulator stores a whole row
const size_t nb_th = omp_get_max_threads(); //number of threads
array<ValueType> acc_array(exec, sub_acc_size*nb_th); //one accumulator per row
auto acc_ptr = acc_array.get_data();
//compute the multiplication, 1 thread per row
#pragma omp parallel
{
const auto th_id = omp_get_thread_num();
const auto th_acc_begin_ptr = acc_ptr + th_id*sub_acc_size;
const auto th_acc_end_ptr = acc_ptr + (th_id+1)*sub_acc_size;
#pragma omp for
for(IndexType row=zero<IndexType>(); row<c->get_size()[0]; row++){
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for(IndexType row=zero<IndexType>(); row<c->get_size()[0]; row++){
for(auto row=zero<IndexType>(); row<c->get_size()[0]; row++){

nit

//reinitialize accumulator to 0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wrong comment.

initialize_accumulator(th_acc_begin_ptr, sub_acc_size, row);
//iterate over the whole matrix b
for(IndexType k=zero<IndexType>(); k<b->get_size()[0]; k++){
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for(IndexType k=zero<IndexType>(); k<b->get_size()[0]; k++){
for(auto k=zero<IndexType>(); k<b->get_size()[0]; k++){

nit

const auto val_A = define_multiplication_operand(row, k);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

//iterate over the non-zero values of a row
for(IndexType idx_B=b_rowptrs[k]; idx_B<b_rowptrs[k+1]; idx_B++){
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for(IndexType idx_B=b_rowptrs[k]; idx_B<b_rowptrs[k+1]; idx_B++){
for(auto idx_b=b_rowptrs[k]; idx_b<b_rowptrs[k+1]; idx_b++){

also for the name

const auto col = b_cols[idx_B];
th_acc_begin_ptr[col] += val_A * b_vals(idx_B);
}
}
//move accumulator to result
auto out_ptr = c_vals_ptr + row*c->get_stride();
std::copy(th_acc_begin_ptr, th_acc_end_ptr, out_ptr);
Comment on lines +181 to +182
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: Maybe I missing that. Could you remind me why you need the accumulator array?
From the implementation, one thread handle a row, so you can directly operate on the output data without accumulator, right?

}
}
}

template <typename ValueType, typename IndexType>
void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* a,
const matrix::Csr<ValueType, IndexType>* b,
matrix::Dense<ValueType>* c)
{
auto simple_init_acc = [b](ValueType* acc_begin_ptr, IndexType acc_size, IndexType row){
std::fill(acc_begin_ptr, acc_begin_ptr + acc_size, zero<ValueType>()); //reinitialize accumulator with zeroes
};
auto simple_def_mult_operand = [a](IndexType row, IndexType k){
return a->at(row, k); //no multiplication by alpha, just get value in a
};
mspm_auxiliary(exec, a, b, c, simple_init_acc, simple_def_mult_operand);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);


template <typename ValueType, typename IndexType>
void mspm(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* alpha,
const matrix::Dense<ValueType>* a,
const matrix::Csr<ValueType, IndexType>* b,
const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
{
auto advanced_init_acc = [b, c, beta](ValueType* acc_begin_ptr, IndexType acc_size, IndexType row){
const auto begin_row_c_vals_ptr = c->get_const_values() + c->get_stride()*row;
std::transform( //initialize the accumulator with c + beta
begin_row_c_vals_ptr, begin_row_c_vals_ptr + acc_size,
Comment on lines +215 to +216
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
std::transform( //initialize the accumulator with c + beta
begin_row_c_vals_ptr, begin_row_c_vals_ptr + acc_size,
//initialize the accumulator with c + beta
std::transform(
begin_row_c_vals_ptr, begin_row_c_vals_ptr + acc_size,

I would move comment out of function call to let clang-format do format unless there is a good reason.

acc_begin_ptr, std::bind1st(std::multiplies<ValueType>(), beta->at(0, 0)));
};
auto advanced_def_mult_operand = [a, alpha](IndexType row, IndexType k){
return alpha->at(0, 0) * a->at(row, k); //multiply a(row,k) by alpha
};
mspm_auxiliary(exec, a, b, c, advanced_init_acc, advanced_def_mult_operand);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);


template <typename ValueType, typename IndexType>
void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
Expand Down
79 changes: 79 additions & 0 deletions reference/matrix/dense_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "accessor/range.hpp"
#include "core/base/mixed_precision_types.hpp"
#include "core/components/prefix_sum_kernels.hpp"
#include "core/matrix/csr_accessor_helper.hpp"


namespace gko {
Expand Down Expand Up @@ -91,6 +92,84 @@ void apply(std::shared_ptr<const ReferenceExecutor> exec,

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);

template <typename ValueType, typename IndexType, typename InitAcc, typename DefMultOperand>
void mspm_auxiliary(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* a,
const matrix::Csr<ValueType, IndexType>* b,
matrix::Dense<ValueType>* c,
InitAcc initialize_accumulator,
DefMultOperand define_multiplication_operand)
{
//initialization
const auto b_rowptrs = b->get_const_row_ptrs();
const auto b_cols = b->get_const_col_idxs();
const auto a_vals = acc::helper::build_const_rrm_accessor<ValueType>(a);
const auto b_vals = acc::helper::build_const_rrm_accessor<ValueType>(b);
const auto c_vals_ptr = c->get_values();
//accumulate partial results of a row
const auto acc_size = b->get_size()[1]; //the accumulator stores a whole row
array<ValueType> acc_array(exec, acc_size);
auto acc_begin_ptr = acc_array.get_data();
auto acc_end_ptr = acc_begin_ptr + acc_size;
//compute the multiplication
for(IndexType row=zero<IndexType>(); row<c->get_size()[0]; row++){ //iterate over a's row
//reinitialize accumulator to 0
initialize_accumulator(acc_begin_ptr, acc_size, row);
//iterate over the whole matrix b
for(IndexType k=zero<IndexType>(); k<b->get_size()[0]; k++){
const auto val_A = define_multiplication_operand(row, k);
//iterate over the non-zero values of a row
for(IndexType idx_B=b_rowptrs[k]; idx_B<b_rowptrs[k+1]; idx_B++){
const auto col = b_cols[idx_B];
acc_begin_ptr[col] += val_A * b_vals(idx_B);
}
}
//move accumulator to result
auto out_ptr = c_vals_ptr + row*c->get_stride();
std::copy(acc_begin_ptr, acc_end_ptr, out_ptr);
}
}

template <typename ValueType, typename IndexType>
void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* a,
const matrix::Csr<ValueType, IndexType>* b,
matrix::Dense<ValueType>* c)
{
auto simple_init_acc = [b](ValueType* acc_begin_ptr, IndexType acc_size, IndexType row){
std::fill(acc_begin_ptr, acc_begin_ptr + acc_size, zero<ValueType>()); //reinitialize accumulator with zeroes
};
auto simple_def_mult_operand = [a](IndexType row, IndexType k){
return a->at(row, k); //no multiplication by alpha, just get value in a
};
mspm_auxiliary(exec, a, b, c, simple_init_acc, simple_def_mult_operand);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);


template <typename ValueType, typename IndexType>
void mspm(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* alpha,
const matrix::Dense<ValueType>* a,
const matrix::Csr<ValueType, IndexType>* b,
const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
{
auto advanced_init_acc = [b, c, beta](ValueType* acc_begin_ptr, IndexType acc_size, IndexType row){
const auto begin_row_c_vals_ptr = c->get_const_values() + c->get_stride()*row;
std::transform( //initialize the accumulator with c + beta
begin_row_c_vals_ptr, begin_row_c_vals_ptr + acc_size,
acc_begin_ptr, std::bind1st(std::multiplies<ValueType>(), beta->at(0, 0)));
};
auto advanced_def_mult_operand = [a, alpha](IndexType row, IndexType k){
return alpha->at(0, 0) * a->at(row, k); //multiply a(row,k) by alpha
};
Comment on lines +165 to +167
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
auto advanced_def_mult_operand = [a, alpha](IndexType row, IndexType k){
return alpha->at(0, 0) * a->at(row, k); //multiply a(row,k) by alpha
};
auto advanced_def_mult_operand = [alpha](auto a_val, auto b_val){
return alpha->at(0, 0) * a_val * b_val; //multiply a(row,k) by alpha
};

from the name, I was expecting this form. also this will reduce the unclear data access from the function call, but I do not have strong opinion on this yet.

mspm_auxiliary(exec, a, b, c, advanced_init_acc, advanced_def_mult_operand);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);


template <typename InValueType, typename OutValueType>
void copy(std::shared_ptr<const DefaultExecutor> exec,
Expand Down
Loading
Loading