From 789ef1318e283aa1ed32c27e514e1501358180ca Mon Sep 17 00:00:00 2001
From: Egor Orachev
Date: Sat, 2 Sep 2023 11:07:03 +0300
Subject: [PATCH] gh-224: add matrix transpose

---
 CMakeLists.txt                  |  14 ---
 README.md                       |   8 ++
 include/spla.h                  |   1 +
 include/spla/exec.hpp           |  20 ++++
 include/spla/type.hpp           |   2 +-
 python/example.py               |  11 +-
 python/pyspla/__init__.py       |  28 +++--
 python/pyspla/bridge.py         |   3 +
 python/pyspla/matrix.py         | 112 ++++++++++++++++++
 src/binding/c_exec.cpp          |   3 +
 src/cpu/cpu_algo_registry.cpp   |   6 +
 src/cpu/cpu_m_reduce.hpp        |  10 +-
 src/cpu/cpu_m_transpose.hpp     | 197 ++++++++++++++++++++++++++++++++
 src/exec.cpp                    |  14 +++
 src/schedule/schedule_tasks.cpp |  21 ++++
 src/schedule/schedule_tasks.hpp |  18 +++
 src/type.cpp                    |   2 +-
 tests/test_matrix.cpp           |  27 +++++
 18 files changed, 462 insertions(+), 35 deletions(-)
 create mode 100644 src/cpu/cpu_m_transpose.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 45f9f129d..2d7497490 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -320,20 +320,6 @@ add_library(spla SHARED
         src/cpu/cpu_format_dok_vec.hpp
         src/cpu/cpu_format_lil.hpp
         src/cpu/cpu_formats.hpp
-        src/cpu/cpu_m_reduce.hpp
-        src/cpu/cpu_m_reduce_by_column.hpp
-        src/cpu/cpu_m_reduce_by_row.hpp
-        src/cpu/cpu_mxm.hpp
-        src/cpu/cpu_mxmT_masked.hpp
-        src/cpu/cpu_mxv.hpp
-        src/cpu/cpu_vxm.hpp
-        src/cpu/cpu_v_assign.hpp
-        src/cpu/cpu_v_count_mf.hpp
-        src/cpu/cpu_v_eadd.hpp
-        src/cpu/cpu_v_eadd_fdb.hpp
-        src/cpu/cpu_v_emult.hpp
-        src/cpu/cpu_v_map.hpp
-        src/cpu/cpu_v_reduce.hpp
         src/util/pair_hash.hpp
         src/profiling/time_profiler.cpp
         src/profiling/time_profiler.hpp
diff --git a/README.md b/README.md
index b59d2c33d..e2a80a408 100644
--- a/README.md
+++ b/README.md
@@ -100,6 +100,14 @@ v, c, d = bfs(0, A)
 
 ## Performance
 
+Spla shows great performance compared to the Nvidia CUDA based, optimized GraphBLAST library, processing large graphs
+with up to 1 billion edges in extreme cases, quickly and without memory issues. Spla also shows outstanding performance
+on the PageRank algorithm, outperforming the low-level, highly optimized Nvidia CUDA Gunrock library. Spla scales
+across Intel, Nvidia and AMD GPUs with acceptable performance, and it can run even on integrated GPUs, where it
+still gives a good speedup over `scipy` or `networkx`.
+
+More details and a performance study are given below.
+
 
 ### Comparison on a Nvidia GPU
 
 | ![stats](./docs/stats/rq1_rel.png) |
diff --git a/include/spla.h b/include/spla.h
index bb8d3ed3a..72bff7619 100644
--- a/include/spla.h
+++ b/include/spla.h
@@ -373,6 +373,7 @@ SPLA_API spla_Status spla_Exec_vxm_masked(spla_Vector r, spla_Vector mask, spla_
 SPLA_API spla_Status spla_Exec_m_reduce_by_row(spla_Vector r, spla_Matrix M, spla_OpBinary op_reduce, spla_Scalar init, spla_Descriptor desc, spla_ScheduleTask* task);
 SPLA_API spla_Status spla_Exec_m_reduce_by_column(spla_Vector r, spla_Matrix M, spla_OpBinary op_reduce, spla_Scalar init, spla_Descriptor desc, spla_ScheduleTask* task);
 SPLA_API spla_Status spla_Exec_m_reduce(spla_Scalar r, spla_Scalar s, spla_Matrix M, spla_OpBinary op_reduce, spla_Descriptor desc, spla_ScheduleTask* task);
+SPLA_API spla_Status spla_Exec_m_transpose(spla_Matrix R, spla_Matrix M, spla_OpUnary op_apply, spla_Descriptor desc, spla_ScheduleTask* task);
 SPLA_API spla_Status spla_Exec_v_eadd(spla_Vector r, spla_Vector u, spla_Vector v, spla_OpBinary op, spla_Descriptor desc, spla_ScheduleTask* task);
 SPLA_API spla_Status spla_Exec_v_emult(spla_Vector r, spla_Vector u, spla_Vector v, spla_OpBinary op, spla_Descriptor desc, spla_ScheduleTask* task);
 SPLA_API spla_Status spla_Exec_v_eadd_fdb(spla_Vector r, spla_Vector v, spla_Vector fdb, spla_OpBinary op, spla_Descriptor desc, spla_ScheduleTask* task);
diff --git a/include/spla/exec.hpp b/include/spla/exec.hpp
index efa8a4927..fc62f13fb 100644
--- a/include/spla/exec.hpp
+++ b/include/spla/exec.hpp
@@ -239,6 +239,26 @@ namespace spla {
             ref_ptr<Descriptor>    desc     = ref_ptr<Descriptor>(),
             ref_ptr<ScheduleTask>* task_hnd = nullptr);
 
+    /**
+     * @brief Execute (schedule) matrix transpose operation
+     *
+     * @note Pass valid `task_hnd` to store as a task, rather than execute immediately.
+     *
+     * @param R Matrix to store result
+     * @param M Matrix to transpose
+     * @param op_apply Unary op to transform values
+     * @param desc Scheduled task descriptor; default is null
+     * @param task_hnd Optional task handle; pass a not-null pointer to store the task
+     *
+     * @return Status of task execution or status of task handle creation
+     */
+    SPLA_API Status exec_m_transpose(
+            ref_ptr<Matrix>        R,
+            ref_ptr<Matrix>        M,
+            ref_ptr<OpUnary>       op_apply,
+            ref_ptr<Descriptor>    desc     = ref_ptr<Descriptor>(),
+            ref_ptr<ScheduleTask>* task_hnd = nullptr);
+
     /**
      * @brief Execute (schedule) element-wise addition by structure of two vectors
      *
diff --git a/include/spla/type.hpp b/include/spla/type.hpp
index 47825ff9b..f61adf867 100644
--- a/include/spla/type.hpp
+++ b/include/spla/type.hpp
@@ -54,10 +54,10 @@ namespace spla {
         SPLA_API virtual int get_id() = 0;
     };
 
+    using T_BOOL  = bool;
     using T_INT   = std::int32_t;
     using T_UINT  = std::uint32_t;
     using T_FLOAT = float;
-    using T_BOOL  = bool;
 
     SPLA_API extern ref_ptr<Type> BOOL;
     SPLA_API extern ref_ptr<Type> INT;
diff --git a/python/example.py b/python/example.py
index dee0ee164..6d71057c8 100644
--- a/python/example.py
+++ b/python/example.py
@@ -1,5 +1,10 @@
 from pyspla import *
 
-u = Vector.from_lists([0, 1], [10, 20], 4, INT)
-v = Vector.from_lists([1, 3], [-5, 12], 4, INT)
-print(u.emult(INT.PLUS, v))
+M = Matrix.from_lists([0, 1, 2], [3, 2, 0], [-5, 3, 9], (3, 4), INT)
+print(M)
+
+print(M.transpose())
+
+print(M.transpose(op_apply=INT.UONE))
+
+print(M.transpose(op_apply=INT.AINV))
diff --git a/python/pyspla/__init__.py b/python/pyspla/__init__.py
index 329cd63c1..a3bc4c5d9 100644
--- a/python/pyspla/__init__.py
+++ b/python/pyspla/__init__.py
@@ -129,6 +129,14 @@
 Performance
 -----------
 
+Spla shows great performance compared to the Nvidia CUDA based, optimized GraphBLAST library, processing large graphs
+with up to 1 billion edges in extreme cases, quickly and without memory issues. Spla also shows outstanding performance
+on the PageRank algorithm, outperforming the low-level, highly optimized Nvidia CUDA Gunrock library. Spla scales
+across Intel, Nvidia and AMD GPUs with acceptable performance, and it can run even on integrated GPUs, where it
+still gives a good speedup over `scipy` or `networkx`.
+
+More details and a performance study are given below.
+
 **Comparison on a Nvidia GPU**
 
 ![stats](../../docs/stats/rq1_rel_compact.png)
@@ -174,23 +182,21 @@
 Containers
 ----------
 
-Library provides fundamental generalized linear algebra containers
-for data storage and mathematical computations. These containers
-are generalized, so any of built-in types may be used to parametrize
-type of data. Containers have sparse formats by default, so it is
-possible to create large-dimension but low data containers. Containers
-are storage-invariant, so the best format for the storage is automatically
-managed by container internally. All required format conversion done
-in the context of particular primitive usage.
+Library provides fundamental generalized linear algebra containers for data storage and mathematical computations.
+These containers are generalized, so any of the built-in types may be used to parametrize the type of data. Containers
+have sparse formats by default, so it is possible to create containers with large dimensions but little stored data.
+Containers are storage-invariant, so the best storage format is automatically managed by the container internally.
+All required format conversions are done in the context of particular primitive usage.
 
 Types
 -----
 
 Library provides a set of standard and common built-in data types. Library value types
-differ a bit from a classic type definition. In spla library type is essentially is a
+differ a bit from a classic type definition. In the `spla` library, a type is essentially a
 storage characteristic, which defines count and layout of bytes per element. User can
 interpret stored data as her/she wants. Spla types set is limited due to the nature of
 GPUs accelerations, where arbitrary layout of data causes significant performance penalties.
+Types such as `int`, `float`, `uint` and `bool` are supported. More types can be added on demand.
 
 Ops
 ---
@@ -203,8 +209,8 @@
 Math operations
 ---------------
 
-Library provides as of high-level linera algebra operations over matrices and vectors with
-parametrization by binary, unary and select `ops`. There is avalable implementation for
+Library provides a set of high-level linear algebra operations over matrices and vectors with
+parametrization by binary, unary and select `ops`. There are available implementations for
 general `mxm` matrix-matrix, masked `mxmT` matrix-matrix, masked `mxv` matrix-vector,
 masked `vxm` vector-matrix products, matrix and vector reductions, assignment, and so on.
 Most operations have both CPU and GPU implementation. Thus, you will have GPU performance
diff --git a/python/pyspla/bridge.py b/python/pyspla/bridge.py
index 7e4765770..e8304cc90 100644
--- a/python/pyspla/bridge.py
+++ b/python/pyspla/bridge.py
@@ -554,6 +554,7 @@ def load_library(lib_path):
     _spla.spla_Exec_m_reduce_by_row.restype = _status_t
     _spla.spla_Exec_m_reduce_by_column.restype = _status_t
     _spla.spla_Exec_m_reduce.restype = _status_t
+    _spla.spla_Exec_m_transpose.restype = _status_t
     _spla.spla_Exec_v_eadd.restype = _status_t
     _spla.spla_Exec_v_emult.restype = _status_t
     _spla.spla_Exec_v_eadd_fdb.restype = _status_t
@@ -576,6 +577,8 @@ def load_library(lib_path):
         [_object_t, _object_t, _object_t, _object_t, _object_t, _p_object_t]
     _spla.spla_Exec_m_reduce.argtypes = \
         [_object_t, _object_t, _object_t, _object_t, _object_t, _p_object_t]
+    _spla.spla_Exec_m_transpose.argtypes = \
+        [_object_t, _object_t, _object_t, _object_t, _p_object_t]
     _spla.spla_Exec_v_eadd.argtypes = \
         [_object_t, _object_t, _object_t, _object_t, _object_t, _p_object_t]
     _spla.spla_Exec_v_emult.argtypes = \
diff --git a/python/pyspla/matrix.py b/python/pyspla/matrix.py
index 69d73185d..df1b2b7b5 100644
--- a/python/pyspla/matrix.py
+++ b/python/pyspla/matrix.py
@@ -535,6 +535,42 @@ def dense(cls, shape, dtype=INT, fill_value=0):
 
         return M
 
+    @classmethod
+    def diag(cls, shape, dtype=INT, diag_value=1):
+        """
+        Diagonal matrix of the desired shape with the desired value on the main diagonal.
+
+        >>> M = Matrix.diag((5, 5), INT, -1)
+        >>> print(M)
+        '
+             0 1 2 3 4
+         0|-1 . . . .| 0
+         1| .-1 . . .| 1
+         2| . .-1 . .| 2
+         3| . . .-1 .| 3
+         4| . . . .-1| 4
+             0 1 2 3 4
+        '
+
+        :param shape: 2-tuple.
+            Size of the matrix.
+
+        :param dtype: optional: Type. default: INT.
+            Type of values the matrix will have.
+
+        :param diag_value: optional: any. default: 1.
+            Optional value to fill the diagonal with.
+
+        :return: Matrix with its main diagonal filled with the value.
+        """
+
+        M = Matrix(shape, dtype)
+
+        for i in range(min(shape[0], shape[1])):
+            M.set(i, i, diag_value)
+
+        return M
+
     def mxm(self, M, op_mult, op_add, out=None, init=None, desc=None):
         """
         General sparse-matrix by sparse-matrix product.
@@ -935,6 +971,82 @@ def reduce(self, op_reduce, out=None, init=None, desc=None):
 
         return out
 
+    def transpose(self, out=None, op_apply=None, desc=None):
+        """
+        Transpose matrix.
+
+        Generate a 3x4 matrix with int source data.
+        >>> M = Matrix.from_lists([0, 1, 2], [3, 2, 0], [-5, 3, 9], (3, 4), INT)
+        >>> print(M)
+        '
+             0 1 2 3
+         0| . . .-5| 0
+         1| . . 3 .| 1
+         2| 9 . . .| 2
+             0 1 2 3
+        '
+
+        Transpose matrix `M` as usual and print the result.
+        >>> print(M.transpose())
+        '
+             0 1 2
+         0| . . 9| 0
+         1| . . .| 1
+         2| . 3 .| 2
+         3|-5 . .| 3
+             0 1 2
+        '
+
+        Transpose, mapping each value to `1` and discarding the previous value.
+        >>> print(M.transpose(op_apply=INT.UONE))
+        '
+             0 1 2
+         0| . . 1| 0
+         1| . . .| 1
+         2| . 1 .| 2
+         3| 1 . .| 3
+             0 1 2
+        '
+
+        Transpose and apply the additive inverse to each value, effectively changing the sign of the values.
+        >>> print(M.transpose(op_apply=INT.AINV))
+        '
+             0 1 2
+         0| . .-9| 0
+         1| . . .| 1
+         2| .-3 .| 2
+         3| 5 . .| 3
+             0 1 2
+        '
+
+        :param out: optional: Matrix. default: None.
+            Optional matrix to store result.
+
+        :param op_apply: optional: OpUnary. default: None.
+            Optional unary function to apply on transposition.
+
+        :param desc: optional: Descriptor. default: None.
+            Optional descriptor object to configure the execution.
+
+        :return: Transposed matrix.
+        """
+
+        if out is None:
+            out = Matrix(shape=(self.n_cols, self.n_rows), dtype=self.dtype)
+        if op_apply is None:
+            op_apply = self.dtype.IDENTITY
+
+        assert out
+        assert op_apply
+        assert out.n_rows == self.n_cols
+        assert out.n_cols == self.n_rows
+        assert out.dtype == self.dtype
+
+        check(backend().spla_Exec_m_transpose(out.hnd, self.hnd, op_apply.hnd,
+                                              self._get_desc(desc), self._get_task(None)))
+
+        return out
+
     def __str__(self):
         return self.to_string()
 
diff --git a/src/binding/c_exec.cpp b/src/binding/c_exec.cpp
index 4382d0f06..a2c35430e 100644
--- a/src/binding/c_exec.cpp
+++ b/src/binding/c_exec.cpp
@@ -64,6 +64,9 @@ spla_Status spla_Exec_m_reduce_by_column(spla_Vector r, spla_Matrix M, spla_OpBi
 spla_Status spla_Exec_m_reduce(spla_Scalar r, spla_Scalar s, spla_Matrix M, spla_OpBinary op_reduce, spla_Descriptor desc, spla_ScheduleTask* task) {
     SPLA_WRAP_EXEC(exec_m_reduce, AS_S(r), AS_S(s), AS_M(M), AS_OB(op_reduce));
 }
+spla_Status spla_Exec_m_transpose(spla_Matrix R, spla_Matrix M, spla_OpUnary op_apply, spla_Descriptor desc, spla_ScheduleTask* task) {
+    SPLA_WRAP_EXEC(exec_m_transpose, AS_M(R), AS_M(M), AS_OU(op_apply));
+}
 spla_Status spla_Exec_v_eadd(spla_Vector r, spla_Vector u, spla_Vector v, spla_OpBinary op, spla_Descriptor desc, spla_ScheduleTask* task) {
     SPLA_WRAP_EXEC(exec_v_eadd, AS_V(r), AS_V(u), AS_V(v), AS_OB(op));
 }
diff --git a/src/cpu/cpu_algo_registry.cpp b/src/cpu/cpu_algo_registry.cpp
index c81111a31..6bc4b7ce7 100644
--- a/src/cpu/cpu_algo_registry.cpp
+++ b/src/cpu/cpu_algo_registry.cpp
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -102,6 +103,11 @@
     g_registry->add(MAKE_KEY_CPU_0("m_reduce", UINT), std::make_shared<Algo_m_reduce_cpu<T_UINT>>());
     g_registry->add(MAKE_KEY_CPU_0("m_reduce", FLOAT), std::make_shared<Algo_m_reduce_cpu<T_FLOAT>>());
 
+    // algorthm m_transpose
+    g_registry->add(MAKE_KEY_CPU_0("m_transpose", INT), std::make_shared<Algo_m_transpose_cpu<T_INT>>());
+    g_registry->add(MAKE_KEY_CPU_0("m_transpose", UINT), std::make_shared<Algo_m_transpose_cpu<T_UINT>>());
+    g_registry->add(MAKE_KEY_CPU_0("m_transpose", FLOAT), std::make_shared<Algo_m_transpose_cpu<T_FLOAT>>());
+
     // algorthm mxv_masked
     g_registry->add(MAKE_KEY_CPU_0("mxv_masked", INT), std::make_shared<Algo_mxv_masked_cpu<T_INT>>());
     g_registry->add(MAKE_KEY_CPU_0("mxv_masked", UINT), std::make_shared<Algo_mxv_masked_cpu<T_UINT>>());
diff --git a/src/cpu/cpu_m_reduce.hpp b/src/cpu/cpu_m_reduce.hpp
index 269db8846..4e7446710 100644
--- a/src/cpu/cpu_m_reduce.hpp
+++ b/src/cpu/cpu_m_reduce.hpp
@@ -59,17 +59,17 @@
auto t = ctx.task.template cast_safe(); auto M = t->M.template cast_safe>(); - if (M->is_valid(FormatMatrix::CpuDok)) { - return execute_dok(ctx); + if (M->is_valid(FormatMatrix::CpuCsr)) { + return execute_csr(ctx); } if (M->is_valid(FormatMatrix::CpuLil)) { return execute_lil(ctx); } - if (M->is_valid(FormatMatrix::CpuCsr)) { - return execute_csr(ctx); + if (M->is_valid(FormatMatrix::CpuDok)) { + return execute_dok(ctx); } - return execute_dok(ctx); + return execute_csr(ctx); } private: diff --git a/src/cpu/cpu_m_transpose.hpp b/src/cpu/cpu_m_transpose.hpp new file mode 100644 index 000000000..9a6f05205 --- /dev/null +++ b/src/cpu/cpu_m_transpose.hpp @@ -0,0 +1,197 @@ +/**********************************************************************************/ +/* This file is part of spla project */ +/* https://github.com/JetBrains-Research/spla */ +/**********************************************************************************/ +/* MIT License */ +/* */ +/* Copyright (c) 2023 SparseLinearAlgebra */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining a copy */ +/* of this software and associated documentation files (the "Software"), to deal */ +/* in the Software without restriction, including without limitation the rights */ +/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ +/* copies of the Software, and to permit persons to whom the Software is */ +/* furnished to do so, subject to the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be included in all */ +/* copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ +/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ +/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */ +/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ +/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ +/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ +/* SOFTWARE. 
*/ +/**********************************************************************************/ + +#ifndef SPLA_CPU_M_TRANSPOSE_HPP +#define SPLA_CPU_M_TRANSPOSE_HPP + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace spla { + + template + class Algo_m_transpose_cpu final : public RegistryAlgo { + public: + ~Algo_m_transpose_cpu() override = default; + + std::string get_name() override { + return "m_transpose"; + } + + std::string get_description() override { + return "transpose matrix on cpu sequentially"; + } + + Status execute(const DispatchContext& ctx) override { + auto t = ctx.task.template cast_safe(); + auto M = t->M.template cast_safe>(); + + if (M->is_valid(FormatMatrix::CpuCsr)) { + return execute_csr(ctx); + } + if (M->is_valid(FormatMatrix::CpuLil)) { + return execute_lil(ctx); + } + if (M->is_valid(FormatMatrix::CpuDok)) { + return execute_dok(ctx); + } + + return execute_csr(ctx); + } + + private: + Status execute_dok(const DispatchContext& ctx) { + TIME_PROFILE_SCOPE("cpu/m_transpose_dok"); + + auto t = ctx.task.template cast_safe(); + + ref_ptr> R = t->R.template cast_safe>(); + ref_ptr> M = t->M.template cast_safe>(); + ref_ptr> op_reduce = t->op_apply.template cast_safe>(); + + R->validate_wd(FormatMatrix::CpuDok); + M->validate_rw(FormatMatrix::CpuDok); + + CpuDok* p_dok_R = R->template get>(); + const CpuDok* p_dok_M = M->template get>(); + auto& func_apply = op_reduce->function; + + assert(p_dok_R->Ax.empty()); + + p_dok_R->Ax.reserve(p_dok_M->Ax.size()); + + for (const auto& entry : p_dok_M->Ax) { + p_dok_R->Ax[{entry.first.second, entry.first.first}] = func_apply(entry.second); + } + + p_dok_R->values = p_dok_M->values; + + return Status::Ok; + } + + Status execute_lil(const DispatchContext& ctx) { + TIME_PROFILE_SCOPE("cpu/m_transpose_lil"); + + auto t = ctx.task.template cast_safe(); + + ref_ptr> R = t->R.template cast_safe>(); + ref_ptr> M = t->M.template cast_safe>(); + ref_ptr> op_reduce = t->op_apply.template cast_safe>(); + + R->validate_wd(FormatMatrix::CpuLil); + M->validate_rw(FormatMatrix::CpuLil); + + CpuLil* p_lil_R = R->template get>(); + const CpuLil* p_lil_M = M->template get>(); + auto& func_apply = op_reduce->function; + + const uint DM = M->get_n_rows(); + const uint DN = M->get_n_cols(); + + assert(M->get_n_rows() == R->get_n_cols()); + assert(M->get_n_cols() == R->get_n_rows()); + + assert(p_lil_R->Ar.size() == DN); + + for (uint i = 0; i < DM; i++) { + for (const auto [j, x] : p_lil_M->Ar[i]) { + p_lil_R->Ar[j].emplace_back(i, func_apply(x)); + } + } + + p_lil_R->values = p_lil_M->values; + + return Status::Ok; + } + + Status execute_csr(const DispatchContext& ctx) { + TIME_PROFILE_SCOPE("cpu/m_transpose_csr"); + + auto t = ctx.task.template cast_safe(); + + ref_ptr> R = t->R.template cast_safe>(); + ref_ptr> M = t->M.template cast_safe>(); + ref_ptr> op_reduce = t->op_apply.template cast_safe>(); + + R->validate_wd(FormatMatrix::CpuCsr); + M->validate_rw(FormatMatrix::CpuCsr); + + CpuCsr* p_csr_R = R->template get>(); + const CpuCsr* p_csr_M = M->template get>(); + auto& func_apply = op_reduce->function; + + const uint DM = M->get_n_rows(); + const uint DN = M->get_n_cols(); + + assert(M->get_n_rows() == R->get_n_cols()); + assert(M->get_n_cols() == R->get_n_rows()); + + std::vector sizes(DN + 1, 0); + + for (uint i = 0; i < DM; i++) { + for (uint k = p_csr_M->Ap[i]; k < p_csr_M->Ap[i + 1]; k++) { + uint j = p_csr_M->Aj[k]; + sizes[j] += 1; + } + } + + cpu_csr_resize(DN, 
uint(p_csr_M->Ax.size()), *p_csr_R); + std::exclusive_scan(sizes.begin(), sizes.end(), p_csr_R->Ap.begin(), 0); + + std::vector offsets(DN, 0); + + for (uint i = 0; i < DM; i++) { + for (uint k = p_csr_M->Ap[i]; k < p_csr_M->Ap[i + 1]; k++) { + uint j = p_csr_M->Aj[k]; + T x = p_csr_M->Ax[k]; + + p_csr_R->Aj[p_csr_R->Ap[j] + offsets[j]] = i; + p_csr_R->Ax[p_csr_R->Ap[j] + offsets[j]] = func_apply(x); + + offsets[j] += 1; + } + } + + p_csr_R->values = p_csr_M->values; + + return Status::Ok; + } + }; + +}// namespace spla + +#endif//SPLA_CPU_M_TRANSPOSE_HPP diff --git a/src/exec.cpp b/src/exec.cpp index 60624f0cc..6d4c53f40 100644 --- a/src/exec.cpp +++ b/src/exec.cpp @@ -203,6 +203,20 @@ namespace spla { EXEC_OR_MAKE_TASK } + Status exec_m_transpose( + ref_ptr R, + ref_ptr M, + ref_ptr op_apply, + ref_ptr desc, + ref_ptr* task_hnd) { + auto task = make_ref(); + task->R = std::move(R); + task->M = std::move(M); + task->op_apply = std::move(op_apply); + task->desc = std::move(desc); + EXEC_OR_MAKE_TASK + } + Status exec_v_eadd( ref_ptr r, ref_ptr u, diff --git a/src/schedule/schedule_tasks.cpp b/src/schedule/schedule_tasks.cpp index e2da734fb..05a78fc2c 100644 --- a/src/schedule/schedule_tasks.cpp +++ b/src/schedule/schedule_tasks.cpp @@ -216,6 +216,27 @@ namespace spla { return {r.as(), s.as(), M.as(), op_reduce.as()}; } + std::string ScheduleTask_m_transpose::get_name() { + return "m_transpose"; + } + std::string ScheduleTask_m_transpose::get_key() { + std::stringstream key; + key << get_name() + << TYPE_KEY(R->get_type()); + + return key.str(); + } + std::string ScheduleTask_m_transpose::get_key_full() { + std::stringstream key; + key << get_name() + << OP_KEY(op_apply); + + return key.str(); + } + std::vector> ScheduleTask_m_transpose::get_args() { + return {R.as(), M.as(), op_apply.as()}; + } + std::string ScheduleTask_v_eadd::get_name() { return "v_eadd"; } diff --git a/src/schedule/schedule_tasks.hpp b/src/schedule/schedule_tasks.hpp index 048432806..36d2ec16a 100644 --- a/src/schedule/schedule_tasks.hpp +++ b/src/schedule/schedule_tasks.hpp @@ -220,6 +220,24 @@ namespace spla { ref_ptr op_reduce; }; + /** + * @class ScheduleTask_m_transpose + * @brief Matrix transpose + */ + class ScheduleTask_m_transpose final : public ScheduleTaskBase { + public: + ~ScheduleTask_m_transpose() override = default; + + std::string get_name() override; + std::string get_key() override; + std::string get_key_full() override; + std::vector> get_args() override; + + ref_ptr R; + ref_ptr M; + ref_ptr op_apply; + }; + /** * @class ScheduleTask_v_eadd * @brief Vector ewise add diff --git a/src/type.cpp b/src/type.cpp index 7690beb54..7b75a7154 100644 --- a/src/type.cpp +++ b/src/type.cpp @@ -29,7 +29,7 @@ namespace spla { - ref_ptr BOOL = TType::make_type("BOOL", "B", "bool", "signed 4 byte logical type", 1); + ref_ptr BOOL = TType::make_type("BOOL", "B", "bool", "4 byte logical type", 1); ref_ptr INT = TType::make_type("INT", "I", "int", "signed 4 byte integral type", 2); ref_ptr UINT = TType::make_type("UINT", "U", "uint", "unsigned 4 byte integral type", 3); ref_ptr FLOAT = TType::make_type("FLOAT", "F", "float", "4 byte floating point type", 4); diff --git a/tests/test_matrix.cpp b/tests/test_matrix.cpp index 58f02227a..dfeaa93dc 100644 --- a/tests/test_matrix.cpp +++ b/tests/test_matrix.cpp @@ -188,4 +188,31 @@ TEST(matrix, reduce) { EXPECT_EQ(ir->as_int(), M * K * 2); } +TEST(matrix, transpose) { + const spla::uint M = 100, N = 200, K = 8; + + auto iM = spla::Matrix::make(M, N, spla::INT); + auto iR = 
spla::Matrix::make(N, M, spla::INT); + + for (spla::uint i = 0; i < M; i += 1) { + for (spla::uint j = 0; j < N; j += 1) { + if ((i + j) % 2) iM->set_int(i, j, i * 10 + j); + } + } + + spla::exec_m_transpose(iR, iM, spla::AINV_INT); + + for (spla::uint i = 0; i < M; i += 1) { + for (spla::uint j = 0; j < N; j += 1) { + int v; + iR->get_int(j, i, v); + if ((i + j) % 2) { + EXPECT_EQ(-(i * 10 + j), v); + } else { + EXPECT_EQ(0, v); + } + } + } +} + SPLA_GTEST_MAIN_WITH_FINALIZE \ No newline at end of file
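
As a usage illustration to go with the new `Matrix.transpose` Python method added above (this sketch is not part of the patch), the snippet below builds the same 3x4 matrix as `python/example.py`, transposes it with the default identity op, with `INT.AINV`, and into a caller-provided `out` matrix. It only uses names that already appear in this diff (`Matrix`, `Matrix.from_lists`, the `Matrix(shape, dtype)` constructor, `INT`, `INT.AINV`, `transpose`); the explicit `from pyspla import Matrix, INT` import is assumed to behave like the `from pyspla import *` used in `example.py`.

    from pyspla import Matrix, INT

    # Same 3x4 matrix with three stored values as in python/example.py
    M = Matrix.from_lists([0, 1, 2], [3, 2, 0], [-5, 3, 9], (3, 4), INT)

    # Plain transpose: op_apply defaults to the dtype's IDENTITY op
    print(M.transpose())

    # Transpose and negate every stored value on the fly
    print(M.transpose(op_apply=INT.AINV))

    # Reuse a pre-allocated result; its shape must be the reverse of M's shape
    R = Matrix((4, 3), INT)
    print(M.transpose(out=R, op_apply=INT.AINV))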
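The core of the change is `execute_csr` in `src/cpu/cpu_m_transpose.hpp`: it counts the entries that fall into every column of `M`, turns those counts into the row pointers of the result with an exclusive scan, and then scatters each entry `(i, j, x)` to position `(j, i)` while applying the unary op. The plain-Python model below mirrors that two-pass scheme on bare lists so the index bookkeeping is easy to follow; it is a reference sketch of the algorithm, not code from the patch, and the helper name `csr_transpose` is invented for the illustration.

    def csr_transpose(n_rows, n_cols, Ap, Aj, Ax, op_apply=lambda x: x):
        """Return (Rp, Rj, Rx), the CSR form of the transpose of a CSR matrix (Ap, Aj, Ax)."""
        # Pass 1: count how many entries fall into each column of M,
        # i.e. into each row of the result.
        sizes = [0] * n_cols
        for i in range(n_rows):
            for k in range(Ap[i], Ap[i + 1]):
                sizes[Aj[k]] += 1

        # Exclusive scan of the counts gives the row pointers of the result.
        Rp = [0] * (n_cols + 1)
        for j in range(n_cols):
            Rp[j + 1] = Rp[j] + sizes[j]

        # Pass 2: scatter every entry (i, j, x) to (j, i), applying op_apply,
        # while tracking a running offset inside each result row.
        nnz = Ap[n_rows]
        Rj = [0] * nnz
        Rx = [None] * nnz
        offsets = [0] * n_cols
        for i in range(n_rows):
            for k in range(Ap[i], Ap[i + 1]):
                j = Aj[k]
                dst = Rp[j] + offsets[j]
                Rj[dst] = i
                Rx[dst] = op_apply(Ax[k])
                offsets[j] += 1

        return Rp, Rj, Rx


    # The 3x4 example matrix from python/example.py in CSR form.
    Ap, Aj, Ax = [0, 1, 2, 3], [3, 2, 0], [-5, 3, 9]
    print(csr_transpose(3, 4, Ap, Aj, Ax, op_apply=lambda x: -x))
    # -> ([0, 1, 1, 2, 3], [2, 1, 0], [-9, -3, 5])

The same result-row counting also explains why `execute_csr` can size the output up front: the transpose has exactly as many stored values as the input, only regrouped by column.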
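For the DOK path the operation reduces to re-keying the (row, column) dictionary, which the short model below captures; `dok_transpose` is again an invented name for illustration, with the plain dict playing the role of the `CpuDok` storage used in the patch.

    def dok_transpose(Ax, op_apply=lambda x: x):
        """Model of the DOK branch: swap the (i, j) key of every stored entry."""
        return {(j, i): op_apply(x) for (i, j), x in Ax.items()}


    print(dok_transpose({(0, 3): -5, (1, 2): 3, (2, 0): 9}, op_apply=lambda x: -x))
    # -> {(3, 0): 5, (2, 1): -3, (0, 2): -9}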