From 789ef1318e283aa1ed32c27e514e1501358180ca Mon Sep 17 00:00:00 2001
From: Egor Orachev
Date: Sat, 2 Sep 2023 11:07:03 +0300
Subject: [PATCH] gh-224: add matrix transpose

---
 CMakeLists.txt                  |  14 ---
 README.md                       |   8 ++
 include/spla.h                  |   1 +
 include/spla/exec.hpp           |  20 ++++
 include/spla/type.hpp           |   2 +-
 python/example.py               |  11 +-
 python/pyspla/__init__.py       |  28 +++--
 python/pyspla/bridge.py         |   3 +
 python/pyspla/matrix.py         | 112 ++++++++++++++++++
 src/binding/c_exec.cpp          |   3 +
 src/cpu/cpu_algo_registry.cpp   |   6 +
 src/cpu/cpu_m_reduce.hpp        |  10 +-
 src/cpu/cpu_m_transpose.hpp     | 197 ++++++++++++++++++++++++++++++++
 src/exec.cpp                    |  14 +++
 src/schedule/schedule_tasks.cpp |  21 ++++
 src/schedule/schedule_tasks.hpp |  18 +++
 src/type.cpp                    |   2 +-
 tests/test_matrix.cpp           |  27 +++++
 18 files changed, 462 insertions(+), 35 deletions(-)
 create mode 100644 src/cpu/cpu_m_transpose.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 45f9f129d..2d7497490 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -320,20 +320,6 @@ add_library(spla SHARED
         src/cpu/cpu_format_dok_vec.hpp
         src/cpu/cpu_format_lil.hpp
         src/cpu/cpu_formats.hpp
-        src/cpu/cpu_m_reduce.hpp
-        src/cpu/cpu_m_reduce_by_column.hpp
-        src/cpu/cpu_m_reduce_by_row.hpp
-        src/cpu/cpu_mxm.hpp
-        src/cpu/cpu_mxmT_masked.hpp
-        src/cpu/cpu_mxv.hpp
-        src/cpu/cpu_vxm.hpp
-        src/cpu/cpu_v_assign.hpp
-        src/cpu/cpu_v_count_mf.hpp
-        src/cpu/cpu_v_eadd.hpp
-        src/cpu/cpu_v_eadd_fdb.hpp
-        src/cpu/cpu_v_emult.hpp
-        src/cpu/cpu_v_map.hpp
-        src/cpu/cpu_v_reduce.hpp
         src/util/pair_hash.hpp
         src/profiling/time_profiler.cpp
         src/profiling/time_profiler.hpp
diff --git a/README.md b/README.md
index b59d2c33d..e2a80a408 100644
--- a/README.md
+++ b/README.md
@@ -100,6 +100,14 @@ v, c, d = bfs(0, A)
 
 ## Performance
 
+Spla shows great performance compared to the Nvidia CUDA based, optimized GraphBLAST library, processing large graphs
+with up to 1 billion edges in extreme cases, quickly and without memory issues. Spla also shows outstanding performance
+on the PageRank algorithm, outperforming the low-level, highly optimized Nvidia CUDA Gunrock library. Spla scales
+across Intel, Nvidia and AMD GPUs with acceptable performance, and it can run even on integrated GPUs, where it
+still gives a good speedup over `scipy` or `networkx`.
+
+More details and a performance study are given below.
+
 
 ### Comparison on a Nvidia GPU
 
 | ![stats](./docs/stats/rq1_rel.png) |
diff --git a/include/spla.h b/include/spla.h
index bb8d3ed3a..72bff7619 100644
--- a/include/spla.h
+++ b/include/spla.h
@@ -373,6 +373,7 @@ SPLA_API spla_Status spla_Exec_vxm_masked(spla_Vector r, spla_Vector mask, spla_
 SPLA_API spla_Status spla_Exec_m_reduce_by_row(spla_Vector r, spla_Matrix M, spla_OpBinary op_reduce, spla_Scalar init, spla_Descriptor desc, spla_ScheduleTask* task);
 SPLA_API spla_Status spla_Exec_m_reduce_by_column(spla_Vector r, spla_Matrix M, spla_OpBinary op_reduce, spla_Scalar init, spla_Descriptor desc, spla_ScheduleTask* task);
 SPLA_API spla_Status spla_Exec_m_reduce(spla_Scalar r, spla_Scalar s, spla_Matrix M, spla_OpBinary op_reduce, spla_Descriptor desc, spla_ScheduleTask* task);
+SPLA_API spla_Status spla_Exec_m_transpose(spla_Matrix R, spla_Matrix M, spla_OpUnary op_apply, spla_Descriptor desc, spla_ScheduleTask* task);
 SPLA_API spla_Status spla_Exec_v_eadd(spla_Vector r, spla_Vector u, spla_Vector v, spla_OpBinary op, spla_Descriptor desc, spla_ScheduleTask* task);
 SPLA_API spla_Status spla_Exec_v_emult(spla_Vector r, spla_Vector u, spla_Vector v, spla_OpBinary op, spla_Descriptor desc, spla_ScheduleTask* task);
 SPLA_API spla_Status spla_Exec_v_eadd_fdb(spla_Vector r, spla_Vector v, spla_Vector fdb, spla_OpBinary op, spla_Descriptor desc, spla_ScheduleTask* task);
diff --git a/include/spla/exec.hpp b/include/spla/exec.hpp
index efa8a4927..fc62f13fb 100644
--- a/include/spla/exec.hpp
+++ b/include/spla/exec.hpp
@@ -239,6 +239,26 @@ namespace spla {
             ref_ptr<Descriptor>    desc     = ref_ptr<Descriptor>(),
             ref_ptr<ScheduleTask>* task_hnd = nullptr);
 
+    /**
+     * @brief Execute (schedule) matrix transpose operation
+     *
+     * @note Pass valid `task_hnd` to store as a task, rather than execute immediately.
+     *
+     * @param R Matrix to store result
+     * @param M Matrix to transpose
+     * @param op_apply Unary op to transform values
+     * @param desc Scheduled task descriptor; default is null
+     * @param task_hnd Optional task handle; pass a not-null pointer to store the task
+     *
+     * @return Status of task execution or status of task handle creation
+     */
+    SPLA_API Status exec_m_transpose(
+            ref_ptr<Matrix>        R,
+            ref_ptr<Matrix>        M,
+            ref_ptr<OpUnary>       op_apply,
+            ref_ptr<Descriptor>    desc     = ref_ptr<Descriptor>(),
+            ref_ptr<ScheduleTask>* task_hnd = nullptr);
+
     /**
      * @brief Execute (schedule) element-wise addition by structure of two vectors
      *
diff --git a/include/spla/type.hpp b/include/spla/type.hpp
index 47825ff9b..f61adf867 100644
--- a/include/spla/type.hpp
+++ b/include/spla/type.hpp
@@ -54,10 +54,10 @@ namespace spla {
         SPLA_API virtual int get_id() = 0;
     };
 
+    using T_BOOL  = bool;
     using T_INT   = std::int32_t;
     using T_UINT  = std::uint32_t;
     using T_FLOAT = float;
-    using T_BOOL  = bool;
 
     SPLA_API extern ref_ptr<Type> BOOL;
     SPLA_API extern ref_ptr<Type> INT;
diff --git a/python/example.py b/python/example.py
index dee0ee164..6d71057c8 100644
--- a/python/example.py
+++ b/python/example.py
@@ -1,5 +1,10 @@
 from pyspla import *
 
-u = Vector.from_lists([0, 1], [10, 20], 4, INT)
-v = Vector.from_lists([1, 3], [-5, 12], 4, INT)
-print(u.emult(INT.PLUS, v))
+M = Matrix.from_lists([0, 1, 2], [3, 2, 0], [-5, 3, 9], (3, 4), INT)
+print(M)
+
+print(M.transpose())
+
+print(M.transpose(op_apply=INT.UONE))
+
+print(M.transpose(op_apply=INT.AINV))
diff --git a/python/pyspla/__init__.py b/python/pyspla/__init__.py
index 329cd63c1..a3bc4c5d9 100644
--- a/python/pyspla/__init__.py
+++ b/python/pyspla/__init__.py
@@ -129,6 +129,14 @@
 Performance
 -----------
 
+Spla shows great performance compared to the Nvidia CUDA based, optimized GraphBLAST library, processing large graphs
+with up to 1 billion edges in extreme cases, quickly and without memory issues. Spla also shows outstanding performance
+on the PageRank algorithm, outperforming the low-level, highly optimized Nvidia CUDA Gunrock library. Spla scales
+across Intel, Nvidia and AMD GPUs with acceptable performance, and it can run even on integrated GPUs, where it
+still gives a good speedup over `scipy` or `networkx`.
+
+More details and a performance study are given below.
+
 **Comparison on a Nvidia GPU**
 
 ![stats](../../docs/stats/rq1_rel_compact.png)
@@ -174,23 +182,21 @@
 Containers
 ----------
 
-Library provides fundamental generalized linear algebra containers
-for data storage and mathematical computations. These containers
-are generalized, so any of built-in types may be used to parametrize
-type of data. Containers have sparse formats by default, so it is
-possible to create large-dimension but low data containers. Containers
-are storage-invariant, so the best format for the storage is automatically
-managed by container internally. All required format conversion done
-in the context of particular primitive usage.
+Library provides fundamental generalized linear algebra containers for data storage and mathematical computations.
+These containers are generalized, so any of the built-in types may be used to parametrize the type of data. Containers
+have sparse formats by default, so it is possible to create containers with large dimensions but little stored data.
+Containers are storage-invariant, so the best storage format is automatically managed by the container internally.
+All required format conversions are done in the context of particular primitive usage.
 
 Types
 -----
 
 Library provides a set of standard and common built-in data types. Library value types
-differ a bit from a classic type definition. In spla library type is essentially is a
+differ a bit from a classic type definition. In the `spla` library, a type is essentially a
 storage characteristic, which defines count and layout of bytes per element. User can
 interpret stored data as her/she wants. Spla types set is limited due to the nature of
 GPUs accelerations, where arbitrary layout of data causes significant performance penalties.
+Types such as `int`, `float`, `uint` and `bool` are supported. More types can be added on demand.
 
 Ops
 ---
@@ -203,8 +209,8 @@
 Math operations
 ---------------
 
-Library provides as of high-level linera algebra operations over matrices and vectors with
-parametrization by binary, unary and select `ops`. There is avalable implementation for
+Library provides a set of high-level linear algebra operations over matrices and vectors with
+parametrization by binary, unary and select `ops`. There are available implementations for
 general `mxm` matrix-matrix, masked `mxmT` matrix-matrix, masked `mxv` matrix-vector,
 masked `vxm` vector-matrix products, matrix and vector reductions, assignment, and so on.
 Most operations have both CPU and GPU implementation. Thus, you will have GPU performance
diff --git a/python/pyspla/bridge.py b/python/pyspla/bridge.py
index 7e4765770..e8304cc90 100644
--- a/python/pyspla/bridge.py
+++ b/python/pyspla/bridge.py
@@ -554,6 +554,7 @@ def load_library(lib_path):
     _spla.spla_Exec_m_reduce_by_row.restype = _status_t
     _spla.spla_Exec_m_reduce_by_column.restype = _status_t
     _spla.spla_Exec_m_reduce.restype = _status_t
+    _spla.spla_Exec_m_transpose.restype = _status_t
     _spla.spla_Exec_v_eadd.restype = _status_t
     _spla.spla_Exec_v_emult.restype = _status_t
     _spla.spla_Exec_v_eadd_fdb.restype = _status_t
@@ -576,6 +577,8 @@ def load_library(lib_path):
         [_object_t, _object_t, _object_t, _object_t, _object_t, _p_object_t]
     _spla.spla_Exec_m_reduce.argtypes = \
         [_object_t, _object_t, _object_t, _object_t, _object_t, _p_object_t]
+    _spla.spla_Exec_m_transpose.argtypes = \
+        [_object_t, _object_t, _object_t, _object_t, _p_object_t]
     _spla.spla_Exec_v_eadd.argtypes = \
         [_object_t, _object_t, _object_t, _object_t, _object_t, _p_object_t]
     _spla.spla_Exec_v_emult.argtypes = \
diff --git a/python/pyspla/matrix.py b/python/pyspla/matrix.py
index 69d73185d..df1b2b7b5 100644
--- a/python/pyspla/matrix.py
+++ b/python/pyspla/matrix.py
@@ -535,6 +535,42 @@ def dense(cls, shape, dtype=INT, fill_value=0):
 
         return M
 
+    @classmethod
+    def diag(cls, shape, dtype=INT, diag_value=1):
+        """
+        Diagonal matrix of the desired shape with the desired value on the main diagonal.
+
+        >>> M = Matrix.diag((5, 5), INT, -1)
+        >>> print(M)
+        '
+             0 1 2 3 4
+         0|-1 . . . .| 0
+         1| .-1 . . .| 1
+         2| . .-1 . .| 2
+         3| . . .-1 .| 3
+         4| . . . .-1| 4
+             0 1 2 3 4
+        '
+
+        :param shape: 2-tuple.
+            Size of the matrix.
+
+        :param dtype: optional: Type. default: INT.
+            Type of values the matrix will have.
+
+        :param diag_value: optional: any. default: 1.
+            Optional value to fill the diagonal with.
+
+        :return: Matrix with its main diagonal filled with the value.
+        """
+
+        M = Matrix(shape, dtype)
+
+        for i in range(min(shape[0], shape[1])):
+            M.set(i, i, diag_value)
+
+        return M
+
     def mxm(self, M, op_mult, op_add, out=None, init=None, desc=None):
         """
         General sparse-matrix by sparse-matrix product.
@@ -935,6 +971,82 @@ def reduce(self, op_reduce, out=None, init=None, desc=None):
 
         return out
 
+    def transpose(self, out=None, op_apply=None, desc=None):
+        """
+        Transpose matrix.
+
+        Generate a 3x4 matrix with int source data.
+        >>> M = Matrix.from_lists([0, 1, 2], [3, 2, 0], [-5, 3, 9], (3, 4), INT)
+        >>> print(M)
+        '
+             0 1 2 3
+         0| . . .-5| 0
+         1| . . 3 .| 1
+         2| 9 . . .| 2
+             0 1 2 3
+        '
+
+        Transpose matrix `M` as usual and print the result.
+        >>> print(M.transpose())
+        '
+             0 1 2
+         0| . . 9| 0
+         1| . . .| 1
+         2| . 3 .| 2
+         3|-5 . .| 3
+             0 1 2
+        '
+
+        Transpose, mapping each value to `1` and discarding the previous value.
+        >>> print(M.transpose(op_apply=INT.UONE))
+        '
+             0 1 2
+         0| . . 1| 0
+         1| . . .| 1
+         2| . 1 .| 2
+         3| 1 . .| 3
+             0 1 2
+        '
+
+        Transpose and apply the additive inverse to each value, effectively changing the sign of the values.
+        >>> print(M.transpose(op_apply=INT.AINV))
+        '
+             0 1 2
+         0| . .-9| 0
+         1| . . .| 1
+         2| .-3 .| 2
+         3| 5 . .| 3
+             0 1 2
+        '
+
+        :param out: optional: Matrix. default: None.
+            Optional matrix to store result.
+
+        :param op_apply: optional: OpUnary. default: None.
+            Optional unary function to apply on transposition.
+
+        :param desc: optional: Descriptor. default: None.
+            Optional descriptor object to configure the execution.
+
+        :return: Transposed matrix.
+        """
+
+        if out is None:
+            out = Matrix(shape=(self.n_cols, self.n_rows), dtype=self.dtype)
+        if op_apply is None:
+            op_apply = self.dtype.IDENTITY
+
+        assert out
+        assert op_apply
+        assert out.n_rows == self.n_cols
+        assert out.n_cols == self.n_rows
+        assert out.dtype == self.dtype
+
+        check(backend().spla_Exec_m_transpose(out.hnd, self.hnd, op_apply.hnd,
+                                              self._get_desc(desc), self._get_task(None)))
+
+        return out
+
     def __str__(self):
         return self.to_string()
 
diff --git a/src/binding/c_exec.cpp b/src/binding/c_exec.cpp
index 4382d0f06..a2c35430e 100644
--- a/src/binding/c_exec.cpp
+++ b/src/binding/c_exec.cpp
@@ -64,6 +64,9 @@ spla_Status spla_Exec_m_reduce_by_column(spla_Vector r, spla_Matrix M, spla_OpBi
 spla_Status spla_Exec_m_reduce(spla_Scalar r, spla_Scalar s, spla_Matrix M, spla_OpBinary op_reduce, spla_Descriptor desc, spla_ScheduleTask* task) {
     SPLA_WRAP_EXEC(exec_m_reduce, AS_S(r), AS_S(s), AS_M(M), AS_OB(op_reduce));
 }
+spla_Status spla_Exec_m_transpose(spla_Matrix R, spla_Matrix M, spla_OpUnary op_apply, spla_Descriptor desc, spla_ScheduleTask* task) {
+    SPLA_WRAP_EXEC(exec_m_transpose, AS_M(R), AS_M(M), AS_OU(op_apply));
+}
 spla_Status spla_Exec_v_eadd(spla_Vector r, spla_Vector u, spla_Vector v, spla_OpBinary op, spla_Descriptor desc, spla_ScheduleTask* task) {
     SPLA_WRAP_EXEC(exec_v_eadd, AS_V(r), AS_V(u), AS_V(v), AS_OB(op));
 }
diff --git a/src/cpu/cpu_algo_registry.cpp b/src/cpu/cpu_algo_registry.cpp
index c81111a31..6bc4b7ce7 100644
--- a/src/cpu/cpu_algo_registry.cpp
+++ b/src/cpu/cpu_algo_registry.cpp
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -102,6 +103,11 @@
     g_registry->add(MAKE_KEY_CPU_0("m_reduce", UINT), std::make_shared<Algo_m_reduce_cpu<T_UINT>>());
     g_registry->add(MAKE_KEY_CPU_0("m_reduce", FLOAT), std::make_shared<Algo_m_reduce_cpu<T_FLOAT>>());
 
+    // algorthm m_transpose
+    g_registry->add(MAKE_KEY_CPU_0("m_transpose", INT), std::make_shared<Algo_m_transpose_cpu<T_INT>>());
+    g_registry->add(MAKE_KEY_CPU_0("m_transpose", UINT), std::make_shared<Algo_m_transpose_cpu<T_UINT>>());
+    g_registry->add(MAKE_KEY_CPU_0("m_transpose", FLOAT), std::make_shared<Algo_m_transpose_cpu<T_FLOAT>>());
+
     // algorthm mxv_masked
     g_registry->add(MAKE_KEY_CPU_0("mxv_masked", INT), std::make_shared<Algo_mxv_masked_cpu<T_INT>>());
     g_registry->add(MAKE_KEY_CPU_0("mxv_masked", UINT), std::make_shared<Algo_mxv_masked_cpu<T_UINT>>());
diff --git a/src/cpu/cpu_m_reduce.hpp b/src/cpu/cpu_m_reduce.hpp
index 269db8846..4e7446710 100644
--- a/src/cpu/cpu_m_reduce.hpp
+++ b/src/cpu/cpu_m_reduce.hpp
@@ -59,17 +59,17 @@
auto t = ctx.task.template cast_safe(); auto M = t->M.template cast_safe>(); - if (M->is_valid(FormatMatrix::CpuDok)) { - return execute_dok(ctx); + if (M->is_valid(FormatMatrix::CpuCsr)) { + return execute_csr(ctx); } if (M->is_valid(FormatMatrix::CpuLil)) { return execute_lil(ctx); } - if (M->is_valid(FormatMatrix::CpuCsr)) { - return execute_csr(ctx); + if (M->is_valid(FormatMatrix::CpuDok)) { + return execute_dok(ctx); } - return execute_dok(ctx); + return execute_csr(ctx); } private: diff --git a/src/cpu/cpu_m_transpose.hpp b/src/cpu/cpu_m_transpose.hpp new file mode 100644 index 000000000..9a6f05205 --- /dev/null +++ b/src/cpu/cpu_m_transpose.hpp @@ -0,0 +1,197 @@ +/**********************************************************************************/ +/* This file is part of spla project */ +/* https://github.com/JetBrains-Research/spla */ +/**********************************************************************************/ +/* MIT License */ +/* */ +/* Copyright (c) 2023 SparseLinearAlgebra */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining a copy */ +/* of this software and associated documentation files (the "Software"), to deal */ +/* in the Software without restriction, including without limitation the rights */ +/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ +/* copies of the Software, and to permit persons to whom the Software is */ +/* furnished to do so, subject to the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be included in all */ +/* copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ +/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ +/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */ +/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ +/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ +/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ +/* SOFTWARE. 
*/ +/**********************************************************************************/ + +#ifndef SPLA_CPU_M_TRANSPOSE_HPP +#define SPLA_CPU_M_TRANSPOSE_HPP + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace spla { + + template + class Algo_m_transpose_cpu final : public RegistryAlgo { + public: + ~Algo_m_transpose_cpu() override = default; + + std::string get_name() override { + return "m_transpose"; + } + + std::string get_description() override { + return "transpose matrix on cpu sequentially"; + } + + Status execute(const DispatchContext& ctx) override { + auto t = ctx.task.template cast_safe(); + auto M = t->M.template cast_safe>(); + + if (M->is_valid(FormatMatrix::CpuCsr)) { + return execute_csr(ctx); + } + if (M->is_valid(FormatMatrix::CpuLil)) { + return execute_lil(ctx); + } + if (M->is_valid(FormatMatrix::CpuDok)) { + return execute_dok(ctx); + } + + return execute_csr(ctx); + } + + private: + Status execute_dok(const DispatchContext& ctx) { + TIME_PROFILE_SCOPE("cpu/m_transpose_dok"); + + auto t = ctx.task.template cast_safe(); + + ref_ptr> R = t->R.template cast_safe>(); + ref_ptr> M = t->M.template cast_safe>(); + ref_ptr> op_reduce = t->op_apply.template cast_safe>(); + + R->validate_wd(FormatMatrix::CpuDok); + M->validate_rw(FormatMatrix::CpuDok); + + CpuDok* p_dok_R = R->template get>(); + const CpuDok* p_dok_M = M->template get>(); + auto& func_apply = op_reduce->function; + + assert(p_dok_R->Ax.empty()); + + p_dok_R->Ax.reserve(p_dok_M->Ax.size()); + + for (const auto& entry : p_dok_M->Ax) { + p_dok_R->Ax[{entry.first.second, entry.first.first}] = func_apply(entry.second); + } + + p_dok_R->values = p_dok_M->values; + + return Status::Ok; + } + + Status execute_lil(const DispatchContext& ctx) { + TIME_PROFILE_SCOPE("cpu/m_transpose_lil"); + + auto t = ctx.task.template cast_safe(); + + ref_ptr> R = t->R.template cast_safe>(); + ref_ptr> M = t->M.template cast_safe>(); + ref_ptr> op_reduce = t->op_apply.template cast_safe>(); + + R->validate_wd(FormatMatrix::CpuLil); + M->validate_rw(FormatMatrix::CpuLil); + + CpuLil* p_lil_R = R->template get>(); + const CpuLil* p_lil_M = M->template get>(); + auto& func_apply = op_reduce->function; + + const uint DM = M->get_n_rows(); + const uint DN = M->get_n_cols(); + + assert(M->get_n_rows() == R->get_n_cols()); + assert(M->get_n_cols() == R->get_n_rows()); + + assert(p_lil_R->Ar.size() == DN); + + for (uint i = 0; i < DM; i++) { + for (const auto [j, x] : p_lil_M->Ar[i]) { + p_lil_R->Ar[j].emplace_back(i, func_apply(x)); + } + } + + p_lil_R->values = p_lil_M->values; + + return Status::Ok; + } + + Status execute_csr(const DispatchContext& ctx) { + TIME_PROFILE_SCOPE("cpu/m_transpose_csr"); + + auto t = ctx.task.template cast_safe(); + + ref_ptr> R = t->R.template cast_safe>(); + ref_ptr> M = t->M.template cast_safe>(); + ref_ptr> op_reduce = t->op_apply.template cast_safe>(); + + R->validate_wd(FormatMatrix::CpuCsr); + M->validate_rw(FormatMatrix::CpuCsr); + + CpuCsr* p_csr_R = R->template get>(); + const CpuCsr* p_csr_M = M->template get>(); + auto& func_apply = op_reduce->function; + + const uint DM = M->get_n_rows(); + const uint DN = M->get_n_cols(); + + assert(M->get_n_rows() == R->get_n_cols()); + assert(M->get_n_cols() == R->get_n_rows()); + + std::vector sizes(DN + 1, 0); + + for (uint i = 0; i < DM; i++) { + for (uint k = p_csr_M->Ap[i]; k < p_csr_M->Ap[i + 1]; k++) { + uint j = p_csr_M->Aj[k]; + sizes[j] += 1; + } + } + + cpu_csr_resize(DN, 
uint(p_csr_M->Ax.size()), *p_csr_R); + std::exclusive_scan(sizes.begin(), sizes.end(), p_csr_R->Ap.begin(), 0); + + std::vector offsets(DN, 0); + + for (uint i = 0; i < DM; i++) { + for (uint k = p_csr_M->Ap[i]; k < p_csr_M->Ap[i + 1]; k++) { + uint j = p_csr_M->Aj[k]; + T x = p_csr_M->Ax[k]; + + p_csr_R->Aj[p_csr_R->Ap[j] + offsets[j]] = i; + p_csr_R->Ax[p_csr_R->Ap[j] + offsets[j]] = func_apply(x); + + offsets[j] += 1; + } + } + + p_csr_R->values = p_csr_M->values; + + return Status::Ok; + } + }; + +}// namespace spla + +#endif//SPLA_CPU_M_TRANSPOSE_HPP diff --git a/src/exec.cpp b/src/exec.cpp index 60624f0cc..6d4c53f40 100644 --- a/src/exec.cpp +++ b/src/exec.cpp @@ -203,6 +203,20 @@ namespace spla { EXEC_OR_MAKE_TASK } + Status exec_m_transpose( + ref_ptr R, + ref_ptr M, + ref_ptr op_apply, + ref_ptr desc, + ref_ptr* task_hnd) { + auto task = make_ref(); + task->R = std::move(R); + task->M = std::move(M); + task->op_apply = std::move(op_apply); + task->desc = std::move(desc); + EXEC_OR_MAKE_TASK + } + Status exec_v_eadd( ref_ptr r, ref_ptr u, diff --git a/src/schedule/schedule_tasks.cpp b/src/schedule/schedule_tasks.cpp index e2da734fb..05a78fc2c 100644 --- a/src/schedule/schedule_tasks.cpp +++ b/src/schedule/schedule_tasks.cpp @@ -216,6 +216,27 @@ namespace spla { return {r.as(), s.as(), M.as(), op_reduce.as()}; } + std::string ScheduleTask_m_transpose::get_name() { + return "m_transpose"; + } + std::string ScheduleTask_m_transpose::get_key() { + std::stringstream key; + key << get_name() + << TYPE_KEY(R->get_type()); + + return key.str(); + } + std::string ScheduleTask_m_transpose::get_key_full() { + std::stringstream key; + key << get_name() + << OP_KEY(op_apply); + + return key.str(); + } + std::vector> ScheduleTask_m_transpose::get_args() { + return {R.as(), M.as(), op_apply.as()}; + } + std::string ScheduleTask_v_eadd::get_name() { return "v_eadd"; } diff --git a/src/schedule/schedule_tasks.hpp b/src/schedule/schedule_tasks.hpp index 048432806..36d2ec16a 100644 --- a/src/schedule/schedule_tasks.hpp +++ b/src/schedule/schedule_tasks.hpp @@ -220,6 +220,24 @@ namespace spla { ref_ptr op_reduce; }; + /** + * @class ScheduleTask_m_transpose + * @brief Matrix transpose + */ + class ScheduleTask_m_transpose final : public ScheduleTaskBase { + public: + ~ScheduleTask_m_transpose() override = default; + + std::string get_name() override; + std::string get_key() override; + std::string get_key_full() override; + std::vector> get_args() override; + + ref_ptr R; + ref_ptr M; + ref_ptr op_apply; + }; + /** * @class ScheduleTask_v_eadd * @brief Vector ewise add diff --git a/src/type.cpp b/src/type.cpp index 7690beb54..7b75a7154 100644 --- a/src/type.cpp +++ b/src/type.cpp @@ -29,7 +29,7 @@ namespace spla { - ref_ptr BOOL = TType::make_type("BOOL", "B", "bool", "signed 4 byte logical type", 1); + ref_ptr BOOL = TType::make_type("BOOL", "B", "bool", "4 byte logical type", 1); ref_ptr INT = TType::make_type("INT", "I", "int", "signed 4 byte integral type", 2); ref_ptr UINT = TType::make_type("UINT", "U", "uint", "unsigned 4 byte integral type", 3); ref_ptr FLOAT = TType::make_type("FLOAT", "F", "float", "4 byte floating point type", 4); diff --git a/tests/test_matrix.cpp b/tests/test_matrix.cpp index 58f02227a..dfeaa93dc 100644 --- a/tests/test_matrix.cpp +++ b/tests/test_matrix.cpp @@ -188,4 +188,31 @@ TEST(matrix, reduce) { EXPECT_EQ(ir->as_int(), M * K * 2); } +TEST(matrix, transpose) { + const spla::uint M = 100, N = 200, K = 8; + + auto iM = spla::Matrix::make(M, N, spla::INT); + auto iR = 
spla::Matrix::make(N, M, spla::INT); + + for (spla::uint i = 0; i < M; i += 1) { + for (spla::uint j = 0; j < N; j += 1) { + if ((i + j) % 2) iM->set_int(i, j, i * 10 + j); + } + } + + spla::exec_m_transpose(iR, iM, spla::AINV_INT); + + for (spla::uint i = 0; i < M; i += 1) { + for (spla::uint j = 0; j < N; j += 1) { + int v; + iR->get_int(j, i, v); + if ((i + j) % 2) { + EXPECT_EQ(-(i * 10 + j), v); + } else { + EXPECT_EQ(0, v); + } + } + } +} + SPLA_GTEST_MAIN_WITH_FINALIZE \ No newline at end of file
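
As a usage illustration to go with the new `Matrix.transpose` Python method added above (this sketch is not part of the patch), the snippet below builds the same 3x4 matrix as `python/example.py`, transposes it with the default identity op, with `INT.AINV`, and into a caller-provided `out` matrix. It only uses names that already appear in this diff (`Matrix`, `Matrix.from_lists`, the `Matrix(shape, dtype)` constructor, `INT`, `INT.AINV`, `transpose`); the explicit `from pyspla import Matrix, INT` import is assumed to behave like the `from pyspla import *` used in `example.py`.

    from pyspla import Matrix, INT

    # Same 3x4 matrix with three stored values as in python/example.py
    M = Matrix.from_lists([0, 1, 2], [3, 2, 0], [-5, 3, 9], (3, 4), INT)

    # Plain transpose: op_apply defaults to the dtype's IDENTITY op
    print(M.transpose())

    # Transpose and negate every stored value on the fly
    print(M.transpose(op_apply=INT.AINV))

    # Reuse a pre-allocated result; its shape must be the reverse of M's shape
    R = Matrix((4, 3), INT)
    print(M.transpose(out=R, op_apply=INT.AINV))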
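The core of the change is `execute_csr` in `src/cpu/cpu_m_transpose.hpp`: it counts the entries that fall into every column of `M`, turns those counts into the row pointers of the result with an exclusive scan, and then scatters each entry `(i, j, x)` to position `(j, i)` while applying the unary op. The plain-Python model below mirrors that two-pass scheme on bare lists so the index bookkeeping is easy to follow; it is a reference sketch of the algorithm, not code from the patch, and the helper name `csr_transpose` is invented for the illustration.

    def csr_transpose(n_rows, n_cols, Ap, Aj, Ax, op_apply=lambda x: x):
        """Return (Rp, Rj, Rx), the CSR form of the transpose of a CSR matrix (Ap, Aj, Ax)."""
        # Pass 1: count how many entries fall into each column of M,
        # i.e. into each row of the result.
        sizes = [0] * n_cols
        for i in range(n_rows):
            for k in range(Ap[i], Ap[i + 1]):
                sizes[Aj[k]] += 1

        # Exclusive scan of the counts gives the row pointers of the result.
        Rp = [0] * (n_cols + 1)
        for j in range(n_cols):
            Rp[j + 1] = Rp[j] + sizes[j]

        # Pass 2: scatter every entry (i, j, x) to (j, i), applying op_apply,
        # while tracking a running offset inside each result row.
        nnz = Ap[n_rows]
        Rj = [0] * nnz
        Rx = [None] * nnz
        offsets = [0] * n_cols
        for i in range(n_rows):
            for k in range(Ap[i], Ap[i + 1]):
                j = Aj[k]
                dst = Rp[j] + offsets[j]
                Rj[dst] = i
                Rx[dst] = op_apply(Ax[k])
                offsets[j] += 1

        return Rp, Rj, Rx


    # The 3x4 example matrix from python/example.py in CSR form.
    Ap, Aj, Ax = [0, 1, 2, 3], [3, 2, 0], [-5, 3, 9]
    print(csr_transpose(3, 4, Ap, Aj, Ax, op_apply=lambda x: -x))
    # -> ([0, 1, 1, 2, 3], [2, 1, 0], [-9, -3, 5])

The same result-row counting also explains why `execute_csr` can size the output up front: the transpose has exactly as many stored values as the input, only regrouped by column.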
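For the DOK path the operation reduces to re-keying the (row, column) dictionary, which the short model below captures; `dok_transpose` is again an invented name for illustration, with the plain dict playing the role of the `CpuDok` storage used in the patch.

    def dok_transpose(Ax, op_apply=lambda x: x):
        """Model of the DOK branch: swap the (i, j) key of every stored entry."""
        return {(j, i): op_apply(x) for (i, j), x in Ax.items()}


    print(dok_transpose({(0, 3): -5, (1, 2): 3, (2, 0): 9}, op_apply=lambda x: -x))
    # -> {(3, 0): 5, (2, 1): -3, (0, 2): -9}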