From 7f6c166ce7a98be35a4537a5aface44eaf40a482 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 8 Aug 2025 13:36:52 +0200
Subject: [PATCH 1/2] initialize MSpM kernel

---
 common/cuda_hip/matrix/dense_kernels.cpp | 28 ++++++++++++++
 core/device_hooks/common_kernels.inc.cpp |  2 +
 core/matrix/dense.cpp                    | 49 ++++++++++++++++++------
 core/matrix/dense_kernels.hpp            | 19 ++++++++-
 dpcpp/matrix/dense_kernels.dp.cpp        | 28 ++++++++++++++
 omp/matrix/dense_kernels.cpp             | 28 ++++++++++++++
 reference/matrix/dense_kernels.cpp       | 28 ++++++++++++++
 7 files changed, 169 insertions(+), 13 deletions(-)
diff --git a/common/cuda_hip/matrix/dense_kernels.cpp b/common/cuda_hip/matrix/dense_kernels.cpp
index 5e31215faaa..9d9b1780897 100644
--- a/common/cuda_hip/matrix/dense_kernels.cpp
+++ b/common/cuda_hip/matrix/dense_kernels.cpp
@@ -792,6 +792,34 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
+                 const matrix::Dense<ValueType>* a,
+                 const matrix::Csr<ValueType, IndexType>* b,
+                 matrix::Dense<ValueType>* c)
+{
+    // TODO: c = a * b (a dense, b CSR) is not implemented on this backend yet
+    GKO_NOT_IMPLEMENTED;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void mspm(std::shared_ptr<const DefaultExecutor> exec,
+          const matrix::Dense<ValueType>* alpha,
+          const matrix::Dense<ValueType>* a,
+          const matrix::Csr<ValueType, IndexType>* b,
+          const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
+{
+    // TODO: c = alpha * a * b + beta * c (b CSR) is not implemented here yet
+    GKO_NOT_IMPLEMENTED;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);
+
+
 template <typename ValueType>
 void transpose(std::shared_ptr<const DefaultExecutor> exec,
                const matrix::Dense<ValueType>* orig,
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index d5a0fec4fbd..e7f89786afe 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -440,6 +440,8 @@ namespace dense {
 
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);
 GKO_STUB_VALUE_CONVERSION_OR_COPY(GKO_DECLARE_DENSE_COPY_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL);
 GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL);
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index d3868e7741b..d897271b480 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -42,6 +42,8 @@ namespace {
 
 GKO_REGISTER_OPERATION(simple_apply, dense::simple_apply);
 GKO_REGISTER_OPERATION(apply, dense::apply);
+GKO_REGISTER_OPERATION(simple_mspm, dense::simple_mspm);
+GKO_REGISTER_OPERATION(mspm, dense::mspm);
 GKO_REGISTER_OPERATION(copy, dense::copy);
 GKO_REGISTER_OPERATION(fill, dense::fill);
 GKO_REGISTER_OPERATION(scale, dense::scale);
@@ -110,12 +112,23 @@ GKO_REGISTER_OPERATION(add_scaled_identity, dense::add_scaled_identity);
 template <typename ValueType>
 void Dense<ValueType>::apply_impl(const LinOp* b, LinOp* x) const
 {
-    precision_dispatch_real_complex<ValueType>(
-        [this](auto dense_b, auto dense_x) {
-            this->get_executor()->run(
-                dense::make_simple_apply(this, dense_b, dense_x));
-        },
-        b, x);
+    // TODO: MSpM only matches Csr<ValueType, int32/int64>; no mixed precision
+    if (auto b_csr =
+            dynamic_cast<const matrix::Csr<ValueType, gko::int32>*>(b)) {
+        this->get_executor()->run(
+            dense::make_simple_mspm(this, b_csr, as<Dense>(x)));
+    } else if (auto b_csr =
+                   dynamic_cast<const matrix::Csr<ValueType, gko::int64>*>(b)) {
+        this->get_executor()->run(
+            dense::make_simple_mspm(this, b_csr, as<Dense>(x)));
+    } else {
+        precision_dispatch_real_complex<ValueType>(
+            [this](auto dense_b, auto dense_x) {
+                this->get_executor()->run(
+                    dense::make_simple_apply(this, dense_b, dense_x));
+            },
+            b, x);
+    }
 }
 
 
@@ -123,12 +136,24 @@ template <typename ValueType>
 void Dense<ValueType>::apply_impl(const LinOp* alpha, const LinOp* b,
                                   const LinOp* beta, LinOp* x) const
 {
-    precision_dispatch_real_complex<ValueType>(
-        [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
-            this->get_executor()->run(dense::make_apply(
-                dense_alpha, this, dense_b, dense_beta, dense_x));
-        },
-        alpha, b, beta, x);
+    // TODO: MSpM only matches Csr<ValueType, int32/int64>; no mixed precision
+    if (auto b_csr =
+            dynamic_cast<const matrix::Csr<ValueType, gko::int32>*>(b)) {
+        this->get_executor()->run(dense::make_mspm(
+            as<Dense>(alpha), this, b_csr, as<Dense>(beta), as<Dense>(x)));
+    } else if (auto b_csr =
+                   dynamic_cast<const matrix::Csr<ValueType, gko::int64>*>(b)) {
+        this->get_executor()->run(dense::make_mspm(
+            as<Dense>(alpha), this, b_csr, as<Dense>(beta), as<Dense>(x)));
+    } else {
+        precision_dispatch_real_complex<ValueType>(
+            [this](auto dense_alpha, auto dense_b, auto dense_beta,
+                   auto dense_x) {
+                this->get_executor()->run(dense::make_apply(
+                    dense_alpha, this, dense_b, dense_beta, dense_x));
+            },
+            alpha, b, beta, x);
+    }
 }
 
 
diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp
index 7422b431aa0..a18ca982f9d 100644
--- a/core/matrix/dense_kernels.hpp
+++ b/core/matrix/dense_kernels.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -31,6 +31,19 @@ namespace kernels {
                const matrix::Dense<_type>* a, const matrix::Dense<_type>* b, \
                const matrix::Dense<_type>* beta, matrix::Dense<_type>* c)
 
+#define GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL(_vtype, _itype)      \
+    void simple_mspm(std::shared_ptr<const DefaultExecutor> exec, \
+                     const matrix::Dense<_vtype>* a,              \
+                     const matrix::Csr<_vtype, _itype>* b,        \
+                     matrix::Dense<_vtype>* c)
+
+#define GKO_DECLARE_DENSE_MSPM_KERNEL(_vtype, _itype)      \
+    void mspm(std::shared_ptr<const DefaultExecutor> exec, \
+              const matrix::Dense<_vtype>* alpha,          \
+              const matrix::Dense<_vtype>* a,              \
+              const matrix::Csr<_vtype, _itype>* b,        \
+              const matrix::Dense<_vtype>* beta, matrix::Dense<_vtype>* c)
+
 #define GKO_DECLARE_DENSE_COPY_KERNEL(_intype, _outtype)   \
     void copy(std::shared_ptr<const DefaultExecutor> exec, \
               const matrix::Dense<_intype>* input,         \
@@ -354,6 +367,10 @@ namespace kernels {
     GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL(ValueType);                         \
     template <typename ValueType>                                             \
     GKO_DECLARE_DENSE_APPLY_KERNEL(ValueType);                                \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL(ValueType, IndexType);               \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_MSPM_KERNEL(ValueType, IndexType);                      \
     template <typename InValueType, typename OutValueType>                    \
     GKO_DECLARE_DENSE_COPY_KERNEL(InValueType, OutValueType);                 \
     template <typename ValueType>                                             \
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 99aac7064e5..fa847b86f62 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -277,6 +277,34 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
+                 const matrix::Dense<ValueType>* a,
+                 const matrix::Csr<ValueType, IndexType>* b,
+                 matrix::Dense<ValueType>* c)
+{
+    // TODO: c = a * b (a dense, b CSR) is not implemented on this backend yet
+    GKO_NOT_IMPLEMENTED;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void mspm(std::shared_ptr<const DefaultExecutor> exec,
+          const matrix::Dense<ValueType>* alpha,
+          const matrix::Dense<ValueType>* a,
+          const matrix::Csr<ValueType, IndexType>* b,
+          const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
+{
+    // TODO: c = alpha * a * b + beta * c (b CSR) is not implemented here yet
+    GKO_NOT_IMPLEMENTED;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,
diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp
index fe1f58ef93d..9e0a4f05031 100644
--- a/omp/matrix/dense_kernels.cpp
+++ b/omp/matrix/dense_kernels.cpp
@@ -139,6 +139,34 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
+                 const matrix::Dense<ValueType>* a,
+                 const matrix::Csr<ValueType, IndexType>* b,
+                 matrix::Dense<ValueType>* c)
+{
+    // TODO: implement c = a * b with single thread
+    GKO_NOT_IMPLEMENTED;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void mspm(std::shared_ptr<const DefaultExecutor> exec,
+          const matrix::Dense<ValueType>* alpha,
+          const matrix::Dense<ValueType>* a,
+          const matrix::Csr<ValueType, IndexType>* b,
+          const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
+{
+    // TODO: implement c = alpha * a * b + beta * c with single thread
+    GKO_NOT_IMPLEMENTED;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,
diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index 7c36d9101d5..7d6b0ccdbdf 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -92,6 +92,34 @@ void apply(std::shared_ptr<const ReferenceExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
+                 const matrix::Dense<ValueType>* a,
+                 const matrix::Csr<ValueType, IndexType>* b,
+                 matrix::Dense<ValueType>* c)
+{
+    // TODO: implement c = a * b with single thread
+    GKO_NOT_IMPLEMENTED;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_SIMPLE_MSPM_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void mspm(std::shared_ptr<const DefaultExecutor> exec,
+          const matrix::Dense<ValueType>* alpha,
+          const matrix::Dense<ValueType>* a,
+          const matrix::Csr<ValueType, IndexType>* b,
+          const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
+{
+    // TODO: implement c = alpha * a * b + beta * c with single thread
+    GKO_NOT_IMPLEMENTED;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);
+
+
 template <typename InValueType, typename OutValueType>
 void copy(std::shared_ptr<const DefaultExecutor> exec,
           const matrix::Dense<InValueType>* input,

From 4000577528a3866b5c64de2760669a89a743a601 Mon Sep 17 00:00:00 2001
From: Coxy <coximielle@gmail.com>
Date: Wed, 13 Aug 2025 15:40:52 +0200
Subject: [PATCH 2/2] implementation and tests for simple and advanced MSpM on
 reference and omp executors

---
 omp/matrix/dense_kernels.cpp            |  66 ++++++++++-
 reference/matrix/dense_kernels.cpp      |  59 +++++++++-
 reference/test/matrix/dense_kernels.cpp | 149 +++++++++++++++++++++++-
 test/matrix/dense_kernels.cpp           |  36 +++++-
 4 files changed, 299 insertions(+), 11 deletions(-)

diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp
index 9e0a4f05031..9020d69d5cd 100644
--- a/omp/matrix/dense_kernels.cpp
+++ b/omp/matrix/dense_kernels.cpp
@@ -23,6 +23,7 @@
 #include "accessor/block_col_major.hpp"
 #include "accessor/range.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/csr_accessor_helper.hpp"
 
 
 namespace gko {
@@ -138,6 +139,50 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
+// Shared OpenMP driver for both MSpM kernels (dense a times CSR b into c).
+// initialize_accumulator(acc, size, row) seeds the row-sized buffer for a row
+// of c; define_multiplication_operand(row, k) yields the factor for b(k, :).
+template <typename ValueType, typename IndexType, typename InitAcc,
+          typename DefMultOperand>
+void mspm_auxiliary(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Dense<ValueType>* a,
+                    const matrix::Csr<ValueType, IndexType>* b,
+                    matrix::Dense<ValueType>* c,
+                    InitAcc initialize_accumulator,
+                    DefMultOperand define_multiplication_operand)
+{
+    const auto b_rowptrs = b->get_const_row_ptrs();
+    const auto b_cols = b->get_const_col_idxs();
+    const auto b_vals = acc::helper::build_const_rrm_accessor<ValueType>(b);
+    const auto c_vals_ptr = c->get_values();
+    const auto c_stride = c->get_stride();
+    const auto num_rows = static_cast<IndexType>(c->get_size()[0]);
+    const auto inner_size = static_cast<IndexType>(b->get_size()[0]);
+    // one row-sized accumulator per thread (not per row), stored back to back
+    const auto sub_acc_size = b->get_size()[1];
+    const auto nb_th = static_cast<size_type>(omp_get_max_threads());
+    array<ValueType> acc_array(exec, sub_acc_size * nb_th);
+    const auto acc_ptr = acc_array.get_data();
+#pragma omp parallel
+    {
+        const auto th_acc_begin_ptr =
+            acc_ptr + omp_get_thread_num() * sub_acc_size;
+#pragma omp for
+        for (IndexType row = 0; row < num_rows; row++) {
+            initialize_accumulator(th_acc_begin_ptr, sub_acc_size, row);
+            for (IndexType k = 0; k < inner_size; k++) {
+                const auto val_A = define_multiplication_operand(row, k);
+                // scatter val_A * b(k, :) into this thread's accumulator
+                for (auto idx_B = b_rowptrs[k]; idx_B < b_rowptrs[k + 1];
+                     idx_B++) {
+                    th_acc_begin_ptr[b_cols[idx_B]] += val_A * b_vals(idx_B);
+                }
+            }
+            std::copy(th_acc_begin_ptr, th_acc_begin_ptr + sub_acc_size,
+                      c_vals_ptr + row * c_stride);
+        }
+    }
+}
 
 template <typename ValueType, typename IndexType>
 void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
@@ -145,8 +190,13 @@ void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
                  const matrix::Csr<ValueType, IndexType>* b,
                  matrix::Dense<ValueType>* c)
 {
-    // TODO: implement c = a * b with single thread
-    GKO_NOT_IMPLEMENTED;
+    auto simple_init_acc = [](ValueType* acc, size_type acc_size, IndexType) {
+        std::fill_n(acc, acc_size, zero<ValueType>());  // start from zero
+    };
+    auto simple_def_mult_operand = [a](IndexType row, IndexType k) {
+        return a->at(row, k);  // plain a(row, k), no alpha scaling
+    };
+    mspm_auxiliary(exec, a, b, c, simple_init_acc, simple_def_mult_operand);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -160,8 +210,16 @@ void mspm(std::shared_ptr<const DefaultExecutor> exec,
           const matrix::Csr<ValueType, IndexType>* b,
           const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
 {
-    // TODO: implement c = alpha * a * b + beta * c with single thread
-    GKO_NOT_IMPLEMENTED;
+    // seed each accumulator row with beta * c(row, :)
+    auto init_acc = [c, beta](ValueType* acc, size_type len, IndexType row) {
+        const auto c_row = c->get_const_values() + c->get_stride() * row;
+        std::transform(c_row, c_row + len, acc,
+                       [beta](ValueType v) { return beta->at(0, 0) * v; });
+    };
+    auto scaled_a = [a, alpha](IndexType row, IndexType k) {
+        return alpha->at(0, 0) * a->at(row, k);  // alpha * a(row, k)
+    };
+    mspm_auxiliary(exec, a, b, c, init_acc, scaled_a);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);
diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index 7d6b0ccdbdf..5b476a90ef4 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -22,6 +22,7 @@
 #include "accessor/range.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/csr_accessor_helper.hpp"
 
 
 namespace gko {
@@ -91,6 +92,43 @@ void apply(std::shared_ptr<const ReferenceExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
+// Shared driver for both reference MSpM kernels (dense a times CSR b into c).
+// initialize_accumulator(acc, size, row) seeds the row-sized buffer for a row
+// of c; define_multiplication_operand(row, k) yields the factor for b(k, :).
+template <typename ValueType, typename IndexType, typename InitAcc,
+          typename DefMultOperand>
+void mspm_auxiliary(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Dense<ValueType>* a,
+                    const matrix::Csr<ValueType, IndexType>* b,
+                    matrix::Dense<ValueType>* c,
+                    InitAcc initialize_accumulator,
+                    DefMultOperand define_multiplication_operand)
+{
+    const auto b_rowptrs = b->get_const_row_ptrs();
+    const auto b_cols = b->get_const_col_idxs();
+    const auto b_vals = acc::helper::build_const_rrm_accessor<ValueType>(b);
+    const auto c_vals_ptr = c->get_values();
+    const auto c_stride = c->get_stride();
+    const auto num_rows = static_cast<IndexType>(c->get_size()[0]);
+    const auto inner_size = static_cast<IndexType>(b->get_size()[0]);
+    // a single accumulator holding one whole row of the result
+    const auto acc_size = b->get_size()[1];
+    array<ValueType> acc_array(exec, acc_size);
+    const auto acc_begin_ptr = acc_array.get_data();
+    for (IndexType row = 0; row < num_rows; row++) {
+        initialize_accumulator(acc_begin_ptr, acc_size, row);
+        for (IndexType k = 0; k < inner_size; k++) {
+            const auto val_A = define_multiplication_operand(row, k);
+            // scatter val_A * b(k, :) into the accumulator
+            for (auto idx_B = b_rowptrs[k]; idx_B < b_rowptrs[k + 1];
+                 idx_B++) {
+                acc_begin_ptr[b_cols[idx_B]] += val_A * b_vals(idx_B);
+            }
+        }
+        std::copy(acc_begin_ptr, acc_begin_ptr + acc_size,
+                  c_vals_ptr + row * c_stride);
+    }
+}
 
 template <typename ValueType, typename IndexType>
 void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
@@ -98,8 +136,13 @@ void simple_mspm(std::shared_ptr<const DefaultExecutor> exec,
                  const matrix::Csr<ValueType, IndexType>* b,
                  matrix::Dense<ValueType>* c)
 {
-    // TODO: implement c = a * b with single thread
-    GKO_NOT_IMPLEMENTED;
+    auto simple_init_acc = [](ValueType* acc, size_type acc_size, IndexType) {
+        std::fill_n(acc, acc_size, zero<ValueType>());  // start from zero
+    };
+    auto simple_def_mult_operand = [a](IndexType row, IndexType k) {
+        return a->at(row, k);  // plain a(row, k), no alpha scaling
+    };
+    mspm_auxiliary(exec, a, b, c, simple_init_acc, simple_def_mult_operand);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -113,8 +156,16 @@ void mspm(std::shared_ptr<const DefaultExecutor> exec,
           const matrix::Csr<ValueType, IndexType>* b,
           const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
 {
-    // TODO: implement c = alpha * a * b + beta * c with single thread
-    GKO_NOT_IMPLEMENTED;
+    // seed each accumulator row with beta * c(row, :)
+    auto init_acc = [c, beta](ValueType* acc, size_type len, IndexType row) {
+        const auto c_row = c->get_const_values() + c->get_stride() * row;
+        std::transform(c_row, c_row + len, acc,
+                       [beta](ValueType v) { return beta->at(0, 0) * v; });
+    };
+    auto scaled_a = [a, alpha](IndexType row, IndexType k) {
+        return alpha->at(0, 0) * a->at(row, k);  // alpha * a(row, k)
+    };
+    mspm_auxiliary(exec, a, b, c, init_acc, scaled_a);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_MSPM_KERNEL);
diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp
index 2ce7b023a1c..90304e9e15b 100644
--- a/reference/test/matrix/dense_kernels.cpp
+++ b/reference/test/matrix/dense_kernels.cpp
@@ -55,7 +55,8 @@ class Dense : public ::testing::Test {
           mtx6(gko::initialize<Mtx>({{1.0, 2.0, 0.0}, {0.0, 1.5, 0.0}}, exec)),
           mtx7(gko::initialize<Mtx>({{1.0, 2.0, 3.0}, {0.0, 1.5, 0.0}}, exec)),
           mtx8(gko::initialize<Mtx>(
-              {I<T>({1.0, -1.0}), I<T>({-2.0, 2.0}), I<T>({-3.0, 3.0})}, exec))
+              {I<T>({1.0, -1.0}), I<T>({-2.0, 2.0}), I<T>({-3.0, 3.0})}, exec)),
+          mtx9(gko::initialize<Mtx>({I<T>({1.0}), I<T>({2.0}), I<T>({3.0}), I<T>({4.0})}, exec))
     {}
 
     std::shared_ptr<const gko::Executor> exec;
@@ -67,6 +68,7 @@ class Dense : public ::testing::Test {
     std::unique_ptr<Mtx> mtx6;
     std::unique_ptr<Mtx> mtx7;
     std::unique_ptr<Mtx> mtx8;
+    std::unique_ptr<Mtx> mtx9;
     std::default_random_engine rand_engine;
 
     template <typename MtxType>
@@ -1425,6 +1427,151 @@ class DenseWithIndexType
 TYPED_TEST_SUITE(DenseWithIndexType, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
+TYPED_TEST(DenseWithIndexType, SimpleMspmVectors)
+{ // 4x1 dense column (mtx9) times a 1x5 sparse row: 4x5 outer product
+    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using SMtx = gko::matrix::Csr<T, index_type>;
+    using Mtx = gko::matrix::Dense<T>;
+    //create sparse vector { 0 1 10 100 0 }
+    gko::array<T> arr_val(this->exec, {1.0, 10.0, 100.0});
+    gko::array<index_type> arr_col(this->exec, {1, 2, 3});
+    gko::array<index_type> arr_row(this->exec, {0, 3});
+    std::unique_ptr<SMtx> smtx( SMtx::create(this->exec, gko::dim<2>(1, 5), arr_val, arr_col, arr_row) );
+    //res is prefilled with -1 so entries the kernel fails to write show up
+    std::unique_ptr<Mtx> res(gko::initialize<Mtx>({
+        {-1.0, -1.0, -1.0, -1.0, -1.0},
+        {-1.0, -1.0, -1.0, -1.0, -1.0},
+        {-1.0, -1.0, -1.0, -1.0, -1.0},
+        {-1.0, -1.0, -1.0, -1.0, -1.0}
+    }, this->exec));
+
+    this->mtx9->apply(smtx, res);
+
+    for(index_type row=0; row<res->get_size()[0]; row++){
+        EXPECT_EQ(res->at(row, 0), T{0.0});
+        EXPECT_EQ(res->at(row, 4), T{0.0});
+    }
+    EXPECT_EQ(res->at(0, 1), T{1.0});
+    EXPECT_EQ(res->at(0, 2), T{10.0});
+    EXPECT_EQ(res->at(0, 3), T{100.0});
+    EXPECT_EQ(res->at(1, 1), T{2.0});
+    EXPECT_EQ(res->at(1, 2), T{20.0});
+    EXPECT_EQ(res->at(1, 3), T{200.0});
+    EXPECT_EQ(res->at(2, 1), T{3.0});
+    EXPECT_EQ(res->at(2, 2), T{30.0});
+    EXPECT_EQ(res->at(2, 3), T{300.0});
+    EXPECT_EQ(res->at(3, 1), T{4.0});
+    EXPECT_EQ(res->at(3, 2), T{40.0});
+    EXPECT_EQ(res->at(3, 3), T{400.0});
+}
+
+TYPED_TEST(DenseWithIndexType, SimpleMspmMatrices)
+{ //dense matrix (mtx5) x sparse matrix (mtx3 of sparse test file)
+    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using SMtx = gko::matrix::Csr<T, index_type>;
+    using Mtx = gko::matrix::Dense<T>;
+    //create sparse matrix
+        // 0 2 1
+        // 3 1 8
+        // 0 0 0
+    gko::array<T> arr_val(this->exec, {2.0, 1.0, 3.0, 1.0, 8.0});
+    gko::array<index_type> arr_col(this->exec, {1, 2, 0, 1, 2});
+    gko::array<index_type> arr_row(this->exec, {0, 2, 5, 5});
+    std::unique_ptr<SMtx> smtx( SMtx::create(this->exec, gko::dim<2>(3, 3), arr_val, arr_col, arr_row) );
+    //res is prefilled with -1 so entries the kernel fails to write show up
+    std::unique_ptr<Mtx> res(gko::initialize<Mtx>({
+        {-1.0, -1.0, -1.0},
+        {-1.0, -1.0, -1.0},
+        {-1.0, -1.0, -1.0}
+    }, this->exec));
+
+    this->mtx5->apply(smtx, res);
+
+    std::unique_ptr<Mtx> expected(gko::initialize<Mtx>({
+        {-3.0, 1.0, -7.0},
+        {6.0, -2.0, 14.0},
+        {10.2, 7.6, 29.3}
+    }, this->exec));
+    GKO_ASSERT_MTX_NEAR(res, expected, r<T>::value);
+}
+
+TYPED_TEST(DenseWithIndexType, AdvancedMspmVectors)
+{ // res = alpha * (4x1 mtx9) * (1x5 sparse row) + beta * res
+    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using SMtx = gko::matrix::Csr<T, index_type>;
+    using Mtx = gko::matrix::Dense<T>;
+    //create sparse vector { 0 1 10 100 0 }
+    gko::array<T> arr_val(this->exec, {1.0, 10.0, 100.0});
+    gko::array<index_type> arr_col(this->exec, {1, 2, 3});
+    gko::array<index_type> arr_row(this->exec, {0, 3});
+    std::unique_ptr<SMtx> smtx( SMtx::create(this->exec, gko::dim<2>(1, 5), arr_val, arr_col, arr_row) );
+    //scalars
+    const auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+    const auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    //initial res is all -1, so beta * res adds +1 to every entry
+    std::unique_ptr<Mtx> res(gko::initialize<Mtx>({
+        {-1.0, -1.0, -1.0, -1.0, -1.0},
+        {-1.0, -1.0, -1.0, -1.0, -1.0},
+        {-1.0, -1.0, -1.0, -1.0, -1.0},
+        {-1.0, -1.0, -1.0, -1.0, -1.0}
+    }, this->exec));
+
+    this->mtx9->apply(alpha, smtx, beta, res);
+
+    for(index_type row=0; row<res->get_size()[0]; row++){
+        EXPECT_EQ(res->at(row, 0), T{1.0});
+        EXPECT_EQ(res->at(row, 4), T{1.0});
+    }
+    EXPECT_EQ(res->at(0, 1), T{3.0});
+    EXPECT_EQ(res->at(0, 2), T{21.0});
+    EXPECT_EQ(res->at(0, 3), T{201.0});
+    EXPECT_EQ(res->at(1, 1), T{5.0});
+    EXPECT_EQ(res->at(1, 2), T{41.0});
+    EXPECT_EQ(res->at(1, 3), T{401.0});
+    EXPECT_EQ(res->at(2, 1), T{7.0});
+    EXPECT_EQ(res->at(2, 2), T{61.0});
+    EXPECT_EQ(res->at(2, 3), T{601.0});
+    EXPECT_EQ(res->at(3, 1), T{9.0});
+    EXPECT_EQ(res->at(3, 2), T{81.0});
+    EXPECT_EQ(res->at(3, 3), T{801.0});
+}
+
+TYPED_TEST(DenseWithIndexType, AdvancedMspmMatrices)
+{ //dense matrix (mtx5) x sparse matrix (mtx3 of sparse test file)
+    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using SMtx = gko::matrix::Csr<T, index_type>;
+    using Mtx = gko::matrix::Dense<T>;
+    //create sparse matrix
+        // 0 2 1
+        // 3 1 8
+        // 0 0 0
+    gko::array<T> arr_val(this->exec, {2.0, 1.0, 3.0, 1.0, 8.0});
+    gko::array<index_type> arr_col(this->exec, {1, 2, 0, 1, 2});
+    gko::array<index_type> arr_row(this->exec, {0, 2, 5, 5});
+    std::unique_ptr<SMtx> smtx( SMtx::create(this->exec, gko::dim<2>(3, 3), arr_val, arr_col, arr_row) );
+    //scalars
+    const auto alpha = gko::initialize<Mtx>({2.0}, this->exec);
+    const auto beta = gko::initialize<Mtx>({-1.0}, this->exec);
+    //initial res is all -1, so beta * res adds +1 to every entry
+    std::unique_ptr<Mtx> res(gko::initialize<Mtx>({
+        {-1.0, -1.0, -1.0},
+        {-1.0, -1.0, -1.0},
+        {-1.0, -1.0, -1.0}
+    }, this->exec));
+
+    this->mtx5->apply(alpha, smtx, beta, res);
+
+    std::unique_ptr<Mtx> expected(gko::initialize<Mtx>({
+        {-5.0, 3.0, -13.0},
+        {13.0, -3.0, 29.0},
+        {21.4, 16.2, 59.6}
+    }, this->exec));
+    GKO_ASSERT_MTX_NEAR(res, expected, r<T>::value);
+}
 
 template <typename ValueType, typename IndexType>
 void assert_coo_eq_mtx4(const gko::matrix::Coo<ValueType, IndexType>* coo_mtx)
diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp
index 727d6ee7d2f..f2813e7f969 100644
--- a/test/matrix/dense_kernels.cpp
+++ b/test/matrix/dense_kernels.cpp
@@ -34,6 +34,7 @@ class Dense : public CommonTestFixture {
     // in single mode, mixed_type will be the same as value_type
     using mixed_type = float;
     using Mtx = gko::matrix::Dense<value_type>;
+    using CsrMtx = gko::matrix::Csr<value_type>;
     using MixedMtx = gko::matrix::Dense<mixed_type>;
     using NormVector = gko::matrix::Dense<gko::remove_complex<value_type>>;
     using Arr = gko::array<index_type>;
@@ -98,6 +99,7 @@ class Dense : public CommonTestFixture {
         x = gen_mtx<Mtx>(65, 25);
         y = gen_mtx<Mtx>(25, 35);
         c_x = gen_mtx<ComplexMtx>(65, 25);
+        csr_y = gen_mtx<CsrMtx>(25, 35);
         alpha = gko::initialize<Mtx>({2.0}, ref);
         beta = gko::initialize<Mtx>({-1.0}, ref);
         result = gen_mtx<Mtx>(65, 35);
@@ -105,6 +107,7 @@ class Dense : public CommonTestFixture {
         dx = gko::clone(exec, x);
         dy = gko::clone(exec, y);
         dc_x = gko::clone(exec, c_x);
+        dcsr_y = gko::clone(exec, csr_y);
         dresult = gko::clone(exec, result);
         dalpha = gko::clone(exec, alpha);
         dbeta = gko::clone(exec, beta);
@@ -166,6 +169,7 @@ class Dense : public CommonTestFixture {
     std::unique_ptr<ComplexMtx> c_y;
     std::unique_ptr<ComplexMtx> c_alpha;
     std::unique_ptr<Mtx> y;
+    std::unique_ptr<CsrMtx> csr_y;
     std::unique_ptr<Mtx> alpha;
     std::unique_ptr<Mtx> beta;
     std::unique_ptr<Mtx> result;
@@ -175,6 +179,7 @@ class Dense : public CommonTestFixture {
     std::unique_ptr<ComplexMtx> dc_y;
     std::unique_ptr<ComplexMtx> dc_alpha;
     std::unique_ptr<Mtx> dy;
+    std::unique_ptr<CsrMtx> dcsr_y;
     std::unique_ptr<Mtx> dalpha;
     std::unique_ptr<Mtx> dbeta;
     std::unique_ptr<Mtx> dresult;
@@ -325,9 +330,9 @@ TEST_F(Dense, ApplyToComplexIsEquivalentToRef)
 TEST_F(Dense, ApplyToMixedComplexIsEquivalentToRef)
 {
     set_up_apply_data();
-    auto complex_b = gen_mtx<ComplexMtx>(x->get_size()[1], 1);
+    auto complex_b = gen_mtx<MixedComplexMtx>(x->get_size()[1], 1);
     auto dcomplex_b = gko::clone(exec, complex_b);
-    auto complex_x = gen_mtx<ComplexMtx>(x->get_size()[0], 1);
+    auto complex_x = gen_mtx<MixedComplexMtx>(x->get_size()[0], 1);
     auto dcomplex_x = gko::clone(exec, complex_x);
 
     x->apply(complex_b, complex_x);
@@ -368,6 +373,33 @@ TEST_F(Dense, AdvancedApplyToMixedComplexIsEquivalentToRef)
     GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 2e-7);
 }
 
+TEST_F(Dense, SimpleMspmIsEquivalentToRef)
+{
+    set_up_apply_data();
+    // compare against reference under OMP; otherwise expect NotImplemented
+    #ifdef GKO_COMPILING_OMP
+    x->apply(csr_y, result);
+    dx->apply(dcsr_y, dresult);
+
+    GKO_ASSERT_MTX_NEAR(dresult, result, r<value_type>::value);
+    #else
+    ASSERT_THROW(dx->apply(dcsr_y, dresult), gko::NotImplemented);
+    #endif
+}
+
+TEST_F(Dense, AdvancedMspmIsEquivalentToRef)
+{
+    set_up_apply_data();
+    // compare against reference under OMP; otherwise expect NotImplemented
+    #ifdef GKO_COMPILING_OMP
+    x->apply(alpha, csr_y, beta, result);
+    dx->apply(dalpha, dcsr_y, dbeta, dresult);
+
+    GKO_ASSERT_MTX_NEAR(dresult, result, r<value_type>::value);
+    #else
+    ASSERT_THROW(dx->apply(dalpha, dcsr_y, dbeta, dresult), gko::NotImplemented);
+    #endif
+}
 
 TEST_F(Dense, ComputeDotComplexIsEquivalentToRef)
 {