ginkgo-project · CoxyMielle · Aug 8, 2025 · Aug 13, 2025 · yhmtsai · Aug 13, 2025
diff --git a/common/cuda_hip/matrix/dense_kernels.cpp b/common/cuda_hip/matrix/dense_kernels.cpp
@@ -792,6 +792,34 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
+                 const matrix::Dense<ValueType>* a,
+                 const matrix::Csr<ValueType, IndexType>* b,
+                 matrix::Dense<ValueType>* c)
+{
+    // TODO: implement c = a * b with single thread
-    // TODO: implement c = a * b with single thread
-    // TODO: implement c = a * b with single thread
+    GKO_NOT_IMPLEMENTED;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void mspm(std::shared_ptr<const DefaultExecutor> exec,
+          const matrix::Dense<ValueType>* alpha,
+          const matrix::Dense<ValueType>* a,
+          const matrix::Csr<ValueType, IndexType>* b,
+          const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
+{
+    // TODO: implement c = alpha * a * b + beta * c with single thread
-    // TODO: implement c = alpha * a * b + beta * c with single thread
-    // TODO: implement c = alpha * a * b + beta * c with single thread
+    GKO_NOT_IMPLEMENTED;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);
+
+
 template <typename ValueType>
 void transpose(std::shared_ptr<const DefaultExecutor> exec,
                const matrix::Dense<ValueType>* orig,

diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
@@ -440,6 +440,8 @@ namespace dense {
 
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);
 GKO_STUB_VALUE_CONVERSION_OR_COPY(GKO_DECLARE_DENSE_COPY_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL);
 GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL);

diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
@@ -42,6 +42,8 @@ namespace {
 
 GKO_REGISTER_OPERATION(simple_apply, dense::simple_apply);
 GKO_REGISTER_OPERATION(apply, dense::apply);
+GKO_REGISTER_OPERATION(simple_mspm, dense::simple_mspm);
+GKO_REGISTER_OPERATION(mspm, dense::mspm);
 GKO_REGISTER_OPERATION(copy, dense::copy);
 GKO_REGISTER_OPERATION(fill, dense::fill);
 GKO_REGISTER_OPERATION(scale, dense::scale);
@@ -110,25 +112,48 @@ GKO_REGISTER_OPERATION(add_scaled_identity, dense::add_scaled_identity);
 template <typename ValueType>
 void Dense<ValueType>::apply_impl(const LinOp* b, LinOp* x) const
 {
-    precision_dispatch_real_complex<ValueType>(
-        [this](auto dense_b, auto dense_x) {
-            this->get_executor()->run(
-                dense::make_simple_apply(this, dense_b, dense_x));
-        },
-        b, x);
+    // TODO: it does not consider mixed precision for MSpM
+    if (auto b_csr =
+            dynamic_cast<const matrix::Csr<ValueType, gko::int32>*>(b)) {
+        this->get_executor()->run(
+            dense::make_simple_mspm(this, b_csr, as<Dense>(x)));
+    } else if (auto b_csr =
+                   dynamic_cast<const matrix::Csr<ValueType, gko::int64>*>(b)) {
+        this->get_executor()->run(
+            dense::make_simple_mspm(this, b_csr, as<Dense>(x)));
+    } else {
+        precision_dispatch_real_complex<ValueType>(
+            [this](auto dense_b, auto dense_x) {
+                this->get_executor()->run(
+                    dense::make_simple_apply(this, dense_b, dense_x));
+            },
+            b, x);
+    }
 }
 
 
 template <typename ValueType>
 void Dense<ValueType>::apply_impl(const LinOp* alpha, const LinOp* b,
                                   const LinOp* beta, LinOp* x) const
 {
-    precision_dispatch_real_complex<ValueType>(
-        [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
-            this->get_executor()->run(dense::make_apply(
-                dense_alpha, this, dense_b, dense_beta, dense_x));
-        },
-        alpha, b, beta, x);
+    // TODO: it does not consider mixed precision for MSpM
+    if (auto b_csr =
+            dynamic_cast<const matrix::Csr<ValueType, gko::int32>*>(b)) {
+        this->get_executor()->run(dense::make_mspm(
+            as<Dense>(alpha), this, b_csr, as<Dense>(beta), as<Dense>(x)));
+    } else if (auto b_csr =
+                   dynamic_cast<const matrix::Csr<ValueType, gko::int64>*>(b)) {
+        this->get_executor()->run(dense::make_mspm(
+            as<Dense>(alpha), this, b_csr, as<Dense>(beta), as<Dense>(x)));
+    } else {
+        precision_dispatch_real_complex<ValueType>(
+            [this](auto dense_alpha, auto dense_b, auto dense_beta,
+                   auto dense_x) {
+                this->get_executor()->run(dense::make_apply(
+                    dense_alpha, this, dense_b, dense_beta, dense_x));
+            },
+            alpha, b, beta, x);
+    }
 }
 
 

diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -31,6 +31,19 @@ namespace kernels {
                const matrix::Dense<_type>* a, const matrix::Dense<_type>* b, \
                const matrix::Dense<_type>* beta, matrix::Dense<_type>* c)
 
+#define GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL(_vtype, _itype)      \
+    void simple_mspm(std::shared_ptr<const DefaultExecutor> exec, \
+                     const matrix::Dense<_vtype>* a,              \
+                     const matrix::Csr<_vtype, _itype>* b,        \
+                     matrix::Dense<_vtype>* c)
+
+#define GKO_DECLARE_DENSE_MSPM_KERNEL(_vtype, _itype)      \
+    void mspm(std::shared_ptr<const DefaultExecutor> exec, \
+              const matrix::Dense<_vtype>* alpha,          \
+              const matrix::Dense<_vtype>* a,              \
+              const matrix::Csr<_vtype, _itype>* b,        \
+              const matrix::Dense<_vtype>* beta, matrix::Dense<_vtype>* c)
+
 #define GKO_DECLARE_DENSE_COPY_KERNEL(_intype, _outtype)   \
     void copy(std::shared_ptr<const DefaultExecutor> exec, \
               const matrix::Dense<_intype>* input,         \
@@ -354,6 +367,10 @@ namespace kernels {
     GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL(ValueType);                         \
     template <typename ValueType>                                             \
     GKO_DECLARE_DENSE_APPLY_KERNEL(ValueType);                                \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL(ValueType, IndexType);               \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_MSPM_KERNEL(ValueType, IndexType);                      \
     template <typename InValueType, typename OutValueType>                    \
     GKO_DECLARE_DENSE_COPY_KERNEL(InValueType, OutValueType);                 \
     template <typename ValueType>                                             \

diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -277,6 +277,34 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
+                 const matrix::Dense<ValueType>* a,
+                 const matrix::Csr<ValueType, IndexType>* b,
+                 matrix::Dense<ValueType>* c)
+{
+    // TODO: implement c = a * b with single thread
-    // TODO: implement c = a * b with single thread
-    // TODO: implement c = a * b with single thread
+    GKO_NOT_IMPLEMENTED;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void mspm(std::shared_ptr<const DefaultExecutor> exec,
+          const matrix::Dense<ValueType>* alpha,
+          const matrix::Dense<ValueType>* a,
+          const matrix::Csr<ValueType, IndexType>* b,
+          const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
+{
+    // TODO: implement c = alpha * a * b + beta * c with single thread
-    // TODO: implement c = alpha * a * b + beta * c with single thread
-    // TODO: implement c = alpha * a * b + beta * c with single thread
+    GKO_NOT_IMPLEMENTED;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,

diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp
@@ -23,6 +23,7 @@
 #include "accessor/block_col_major.hpp"
 #include "accessor/range.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/csr_accessor_helper.hpp"
 
 
 namespace gko {
@@ -138,6 +139,91 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
+template <typename ValueType, typename IndexType, typename InitAcc, typename DefMultOperand>
+void mspm_auxiliary(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Dense<ValueType>* a,
+                    const matrix::Csr<ValueType, IndexType>* b,
+                    matrix::Dense<ValueType>* c,
+                    InitAcc initialize_accumulator,
+                    DefMultOperand define_multiplication_operand)
-                    InitAcc initialize_accumulator,
-                    DefMultOperand define_multiplication_operand)
+                    Init init,
+                    MultOperand mult_op)
-                    InitAcc initialize_accumulator,
-                    DefMultOperand define_multiplication_operand)
+                    Init init,
+                    MultOperand mult_op)
+{
+    //initialization
+    const auto b_rowptrs = b->get_const_row_ptrs();
+    const auto b_cols = b->get_const_col_idxs();
+    const auto a_vals = acc::helper::build_const_rrm_accessor<ValueType>(a);
+    const auto b_vals = acc::helper::build_const_rrm_accessor<ValueType>(b);
+    const auto c_vals_ptr = c->get_values();
+    //accumulate partial results of a row
+    const auto sub_acc_size = b->get_size()[1]; //each accumulator stores a whole row
+    const size_t nb_th = omp_get_max_threads(); //number of threads
+    array<ValueType> acc_array(exec, sub_acc_size*nb_th); //one accumulator per row
+    auto acc_ptr = acc_array.get_data();
+    //compute the multiplication, 1 thread per row
+    #pragma omp parallel
+    {
+        const auto th_id = omp_get_thread_num();
+        const auto th_acc_begin_ptr = acc_ptr + th_id*sub_acc_size;
+        const auto th_acc_end_ptr = acc_ptr + (th_id+1)*sub_acc_size;
+        #pragma omp for
+        for(IndexType row=zero<IndexType>(); row<c->get_size()[0]; row++){
-        for(IndexType row=zero<IndexType>(); row<c->get_size()[0]; row++){
+        for(auto row=zero<IndexType>(); row<c->get_size()[0]; row++){
-        for(IndexType row=zero<IndexType>(); row<c->get_size()[0]; row++){
+        for(auto row=zero<IndexType>(); row<c->get_size()[0]; row++){
+            //reinitialize accumulator to 0
+            initialize_accumulator(th_acc_begin_ptr, sub_acc_size, row);
+            //iterate over the whole matrix b
+            for(IndexType k=zero<IndexType>(); k<b->get_size()[0]; k++){
-            for(IndexType k=zero<IndexType>(); k<b->get_size()[0]; k++){
+            for(auto k=zero<IndexType>(); k<b->get_size()[0]; k++){
-            for(IndexType k=zero<IndexType>(); k<b->get_size()[0]; k++){
+            for(auto k=zero<IndexType>(); k<b->get_size()[0]; k++){
+                const auto val_A = define_multiplication_operand(row, k);
+                //iterate over the non-zero values of a row
+                for(IndexType idx_B=b_rowptrs[k]; idx_B<b_rowptrs[k+1]; idx_B++){
-                for(IndexType idx_B=b_rowptrs[k]; idx_B<b_rowptrs[k+1]; idx_B++){
+                for(auto idx_b=b_rowptrs[k]; idx_b<b_rowptrs[k+1]; idx_b++){
-                for(IndexType idx_B=b_rowptrs[k]; idx_B<b_rowptrs[k+1]; idx_B++){
+                for(auto idx_b=b_rowptrs[k]; idx_b<b_rowptrs[k+1]; idx_b++){
+                    const auto col = b_cols[idx_B];
+                    th_acc_begin_ptr[col] += val_A * b_vals(idx_B);
+                }
+            }
+            //move accumulator to result
+            auto out_ptr = c_vals_ptr + row*c->get_stride();
+            std::copy(th_acc_begin_ptr, th_acc_end_ptr, out_ptr);
+        }
+    }
+}
+
+template <typename ValueType, typename IndexType>
+void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
+                 const matrix::Dense<ValueType>* a,
+                 const matrix::Csr<ValueType, IndexType>* b,
+                 matrix::Dense<ValueType>* c)
+{
+    auto simple_init_acc = [b](ValueType* acc_begin_ptr, IndexType acc_size, IndexType row){
+        std::fill(acc_begin_ptr, acc_begin_ptr + acc_size, zero<ValueType>()); //reinitialize accumulator with zeroes
+    };
+    auto simple_def_mult_operand = [a](IndexType row, IndexType k){
+        return a->at(row, k); //no multiplication by alpha, just get value in a
+    };
+    mspm_auxiliary(exec, a, b, c, simple_init_acc, simple_def_mult_operand);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void mspm(std::shared_ptr<const DefaultExecutor> exec,
+          const matrix::Dense<ValueType>* alpha,
+          const matrix::Dense<ValueType>* a,
+          const matrix::Csr<ValueType, IndexType>* b,
+          const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
+{
+    auto advanced_init_acc = [b, c, beta](ValueType* acc_begin_ptr, IndexType acc_size, IndexType row){
+        const auto begin_row_c_vals_ptr = c->get_const_values() + c->get_stride()*row;
+        std::transform( //initialize the accumulator with c + beta
+            begin_row_c_vals_ptr, begin_row_c_vals_ptr + acc_size,
-        std::transform( //initialize the accumulator with c + beta
-            begin_row_c_vals_ptr, begin_row_c_vals_ptr + acc_size,
+        //initialize the accumulator with c + beta
+        std::transform(
+            begin_row_c_vals_ptr, begin_row_c_vals_ptr + acc_size,
-        std::transform( //initialize the accumulator with c + beta
-            begin_row_c_vals_ptr, begin_row_c_vals_ptr + acc_size,
+        //initialize the accumulator with c + beta
+        std::transform(
+            begin_row_c_vals_ptr, begin_row_c_vals_ptr + acc_size,
+            acc_begin_ptr, std::bind1st(std::multiplies<ValueType>(), beta->at(0, 0)));
+    };
+    auto advanced_def_mult_operand = [a, alpha](IndexType row, IndexType k){
+        return alpha->at(0, 0) * a->at(row, k); //multiply a(row,k) by alpha
+    };
+    mspm_auxiliary(exec, a, b, c, advanced_init_acc, advanced_def_mult_operand);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);
+
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,

diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
@@ -22,6 +22,7 @@
 #include "accessor/range.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/csr_accessor_helper.hpp"
 
 
 namespace gko {
@@ -91,6 +92,84 @@ void apply(std::shared_ptr<const ReferenceExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
+template <typename ValueType, typename IndexType, typename InitAcc, typename DefMultOperand>
+void mspm_auxiliary(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Dense<ValueType>* a,
+                    const matrix::Csr<ValueType, IndexType>* b,
+                    matrix::Dense<ValueType>* c,
+                    InitAcc initialize_accumulator,
+                    DefMultOperand define_multiplication_operand)
+{
+    //initialization
+    const auto b_rowptrs = b->get_const_row_ptrs();
+    const auto b_cols = b->get_const_col_idxs();
+    const auto a_vals = acc::helper::build_const_rrm_accessor<ValueType>(a);
+    const auto b_vals = acc::helper::build_const_rrm_accessor<ValueType>(b);
+    const auto c_vals_ptr = c->get_values();
+    //accumulate partial results of a row
+    const auto acc_size = b->get_size()[1]; //the accumulator stores a whole row
+    array<ValueType> acc_array(exec, acc_size);
+    auto acc_begin_ptr = acc_array.get_data();
+    auto acc_end_ptr = acc_begin_ptr + acc_size;
+    //compute the multiplication
+    for(IndexType row=zero<IndexType>(); row<c->get_size()[0]; row++){ //iterate over a's row
+        //reinitialize accumulator to 0
+        initialize_accumulator(acc_begin_ptr, acc_size, row);
+        //iterate over the whole matrix b
+        for(IndexType k=zero<IndexType>(); k<b->get_size()[0]; k++){
+            const auto val_A = define_multiplication_operand(row, k);
+            //iterate over the non-zero values of a row
+            for(IndexType idx_B=b_rowptrs[k]; idx_B<b_rowptrs[k+1]; idx_B++){
+                const auto col = b_cols[idx_B];
+                acc_begin_ptr[col] += val_A * b_vals(idx_B);
+            }
+        }
+        //move accumulator to result
+        auto out_ptr = c_vals_ptr + row*c->get_stride();
+        std::copy(acc_begin_ptr, acc_end_ptr, out_ptr);
+    }
+}
+
+template <typename ValueType, typename IndexType>
+void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
+                 const matrix::Dense<ValueType>* a,
+                 const matrix::Csr<ValueType, IndexType>* b,
+                 matrix::Dense<ValueType>* c)
+{
+    auto simple_init_acc = [b](ValueType* acc_begin_ptr, IndexType acc_size, IndexType row){
+        std::fill(acc_begin_ptr, acc_begin_ptr + acc_size, zero<ValueType>()); //reinitialize accumulator with zeroes
+    };
+    auto simple_def_mult_operand = [a](IndexType row, IndexType k){
+        return a->at(row, k); //no multiplication by alpha, just get value in a
+    };
+    mspm_auxiliary(exec, a, b, c, simple_init_acc, simple_def_mult_operand);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void mspm(std::shared_ptr<const DefaultExecutor> exec,
+          const matrix::Dense<ValueType>* alpha,
+          const matrix::Dense<ValueType>* a,
+          const matrix::Csr<ValueType, IndexType>* b,
+          const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
+{
+    auto advanced_init_acc = [b, c, beta](ValueType* acc_begin_ptr, IndexType acc_size, IndexType row){
+        const auto begin_row_c_vals_ptr = c->get_const_values() + c->get_stride()*row;
+        std::transform( //initialize the accumulator with c + beta
+            begin_row_c_vals_ptr, begin_row_c_vals_ptr + acc_size,
+            acc_begin_ptr, std::bind1st(std::multiplies<ValueType>(), beta->at(0, 0)));
+    };
+    auto advanced_def_mult_operand = [a, alpha](IndexType row, IndexType k){
+        return alpha->at(0, 0) * a->at(row, k); //multiply a(row,k) by alpha
+    };
-    auto advanced_def_mult_operand = [a, alpha](IndexType row, IndexType k){
-        return alpha->at(0, 0) * a->at(row, k); //multiply a(row,k) by alpha
-    };
+    auto advanced_def_mult_operand = [alpha](auto a_val, auto b_val){
+        return alpha->at(0, 0) * a_val * b_val; //multiply a(row,k) by alpha
+    };
-    auto advanced_def_mult_operand = [a, alpha](IndexType row, IndexType k){
-        return alpha->at(0, 0) * a->at(row, k); //multiply a(row,k) by alpha
-    };
+    auto advanced_def_mult_operand = [alpha](auto a_val, auto b_val){
+        return alpha->at(0, 0) * a_val * b_val; //multiply a(row,k) by alpha
+    };
+    mspm_auxiliary(exec, a, b, c, advanced_init_acc, advanced_def_mult_operand);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);
+
 
 template <typename InValueType, typename OutValueType>
 void copy(std::shared_ptr<const DefaultExecutor> exec,