diff --git a/docs/api/python/ndarray/contrib.md b/docs/api/python/ndarray/contrib.md
index b2bfa170b1da..709ddae007c5 100644
--- a/docs/api/python/ndarray/contrib.md
+++ b/docs/api/python/ndarray/contrib.md
@@ -60,6 +60,7 @@ In the rest of this document, we list routines provided by the `ndarray.contrib
     isnan
     index_copy
     getnnz
+    edge_id
 ```
 
 ## API Reference
diff --git a/src/operator/contrib/dgl_graph.cc b/src/operator/contrib/dgl_graph.cc
new file mode 100644
index 000000000000..bf54ed398f93
--- /dev/null
+++ b/src/operator/contrib/dgl_graph.cc
@@ -0,0 +1,464 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <mxnet/io.h>
+#include <mxnet/base.h>
+#include <mxnet/ndarray.h>
+#include <mxnet/operator.h>
+#include <mxnet/operator_util.h>
+#include <dmlc/logging.h>
+#include <dmlc/optional.h>
+#include "../operator_common.h"
+#include "../elemwise_op_common.h"
+#include "../../imperative/imperative_utils.h"
+#include "../subgraph_op_common.h"
+#include "../mshadow_op.h"
+#include "../mxnet_op.h"
+#include "../tensor/init_op.h"
+
+namespace mxnet {
+namespace op {
+
+
+///////////////////////// Create induced subgraph ///////////////////////////
+
+struct DGLSubgraphParam : public dmlc::Parameter<DGLSubgraphParam> {
+  int num_args;
+  bool return_mapping;
+  DMLC_DECLARE_PARAMETER(DGLSubgraphParam) {
+    DMLC_DECLARE_FIELD(num_args).set_lower_bound(2)
+    .describe("Number of input arguments, including all symbol inputs.");
+    DMLC_DECLARE_FIELD(return_mapping)
+    .describe("Return mapping of vid and eid between the subgraph and the parent graph.");
+  }
+};  // struct DGLSubgraphParam
+
+DMLC_REGISTER_PARAMETER(DGLSubgraphParam);
+
+static bool DGLSubgraphStorageType(const nnvm::NodeAttrs& attrs,
+                                   const int dev_mask,
+                                   DispatchMode* dispatch_mode,
+                                   std::vector<int> *in_attrs,
+                                   std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->at(0), kCSRStorage);
+  for (size_t i = 1; i < in_attrs->size(); i++)
+    CHECK_EQ(in_attrs->at(i), kDefaultStorage);
+
+  bool success = true;
+  *dispatch_mode = DispatchMode::kFComputeEx;
+  for (size_t i = 0; i < out_attrs->size(); i++) {
+    if (!type_assign(&(*out_attrs)[i], mxnet::kCSRStorage))
+      success = false;
+  }
+  return success;
+}
+
+static bool DGLSubgraphShape(const nnvm::NodeAttrs& attrs,
+                             std::vector<TShape> *in_attrs,
+                             std::vector<TShape> *out_attrs) {
+  const DGLSubgraphParam& params = nnvm::get<DGLSubgraphParam>(attrs.parsed);
+  CHECK_EQ(in_attrs->at(0).ndim(), 2U);
+  for (size_t i = 1; i < in_attrs->size(); i++)
+    CHECK_EQ(in_attrs->at(i).ndim(), 1U);
+
+  size_t num_g = params.num_args - 1;
+  for (size_t i = 0; i < num_g; i++) {
+    TShape gshape(2);
+    gshape[0] = in_attrs->at(i + 1)[0];
+    gshape[1] = in_attrs->at(i + 1)[0];
+    out_attrs->at(i) = gshape;
+  }
+  for (size_t i = num_g; i < out_attrs->size(); i++) {
+    TShape gshape(2);
+    gshape[0] = in_attrs->at(i - num_g + 1)[0];
+    gshape[1] = in_attrs->at(i - num_g + 1)[0];
+    out_attrs->at(i) = gshape;
+  }
+  return true;
+}
+
+static bool DGLSubgraphType(const nnvm::NodeAttrs& attrs,
+                            std::vector<int> *in_attrs,
+                            std::vector<int> *out_attrs) {
+  const DGLSubgraphParam& params = nnvm::get<DGLSubgraphParam>(attrs.parsed);
+  size_t num_g = params.num_args - 1;
+  for (size_t i = 0; i < num_g; i++) {
+    CHECK_EQ(in_attrs->at(i + 1), mshadow::kInt64);
+  }
+  for (size_t i = 0; i < out_attrs->size(); i++) {
+    out_attrs->at(i) = in_attrs->at(0);
+  }
+  return true;
+}
+
+typedef int64_t dgl_id_t;
+
+class Bitmap {
+  const size_t size = 1024 * 1024 * 4;
+  const size_t mask = size - 1;
+  std::vector<bool> map;
+
+  size_t hash(dgl_id_t id) const {
+    return id & mask;
+  }
+
+ public:
+  Bitmap(const dgl_id_t *vid_data, int64_t len): map(size) {
+    for (int64_t i = 0; i < len; ++i) {
+      map[hash(vid_data[i])] = 1;
+    }
+  }
+
+  bool test(dgl_id_t id) const {
+    return map[hash(id)];
+  }
+};
+
+/*
+ * This uses a hashtable to check if a node is in the given node list.
+ */
+class HashTableChecker {
+  std::unordered_map<dgl_id_t, dgl_id_t> oldv2newv;
+  Bitmap map;
+
+ public:
+  HashTableChecker(const dgl_id_t *vid_data, int64_t len): map(vid_data, len) {
+    oldv2newv.reserve(len);
+    for (int64_t i = 0; i < len; ++i) {
+      oldv2newv[vid_data[i]] = i;
+    }
+  }
+
+  void CollectOnRow(const dgl_id_t col_idx[], const dgl_id_t eids[], size_t row_len,
+                    std::vector<dgl_id_t> *new_col_idx,
+                    std::vector<dgl_id_t> *orig_eids) {
+    // TODO(zhengda) I need to make sure the column index in each row is sorted.
+    for (size_t j = 0; j < row_len; ++j) {
+      const dgl_id_t oldsucc = col_idx[j];
+      const dgl_id_t eid = eids[j];
+      Collect(oldsucc, eid, new_col_idx, orig_eids);
+    }
+  }
+
+  void Collect(const dgl_id_t old_id, const dgl_id_t old_eid,
+               std::vector<dgl_id_t> *col_idx,
+               std::vector<dgl_id_t> *orig_eids) {
+    if (!map.test(old_id))
+      return;
+
+    auto it = oldv2newv.find(old_id);
+    if (it != oldv2newv.end()) {
+      const dgl_id_t new_id = it->second;
+      col_idx->push_back(new_id);
+      if (orig_eids)
+        orig_eids->push_back(old_eid);
+    }
+  }
+};
+
+static void GetSubgraph(const NDArray &csr_arr, const NDArray &varr,
+                        const NDArray &sub_csr, const NDArray *old_eids) {
+  const TBlob &data = varr.data();
+  int64_t num_vertices = csr_arr.shape()[0];
+  const size_t len = varr.shape()[0];
+  const dgl_id_t *vid_data = data.dptr<dgl_id_t>();
+  HashTableChecker def_check(vid_data, len);
+  // check if varr is sorted.
+  CHECK(std::is_sorted(vid_data, vid_data + len)) << "The input vertex list has to be sorted";
+
+  // Collect the non-zero entries from the original graph.
+  std::vector<dgl_id_t> row_idx(len + 1);
+  std::vector<dgl_id_t> col_idx;
+  std::vector<dgl_id_t> orig_eids;
+  col_idx.reserve(len * 50);
+  orig_eids.reserve(len * 50);
+  const dgl_id_t *eids = csr_arr.data().dptr<dgl_id_t>();
+  const dgl_id_t *indptr = csr_arr.aux_data(csr::kIndPtr).dptr<dgl_id_t>();
+  const dgl_id_t *indices = csr_arr.aux_data(csr::kIdx).dptr<dgl_id_t>();
+  for (size_t i = 0; i < len; ++i) {
+    const dgl_id_t oldvid = vid_data[i];
+    CHECK_LT(oldvid, num_vertices) << "Vertex Id " << oldvid << " isn't in a graph of "
+        << num_vertices << " vertices";
+    size_t row_start = indptr[oldvid];
+    size_t row_len = indptr[oldvid + 1] - indptr[oldvid];
+    def_check.CollectOnRow(indices + row_start, eids + row_start, row_len,
+                           &col_idx, old_eids == nullptr ? nullptr : &orig_eids);
+
+    row_idx[i + 1] = col_idx.size();
+  }
+
+  TShape nz_shape(1);
+  nz_shape[0] = col_idx.size();
+  TShape indptr_shape(1);
+  indptr_shape[0] = row_idx.size();
+
+  // Store the non-zeros in a subgraph with edge attributes of new edge ids.
+  sub_csr.CheckAndAllocData(nz_shape);
+  sub_csr.CheckAndAllocAuxData(csr::kIdx, nz_shape);
+  sub_csr.CheckAndAllocAuxData(csr::kIndPtr, indptr_shape);
+  dgl_id_t *indices_out = sub_csr.aux_data(csr::kIdx).dptr<dgl_id_t>();
+  dgl_id_t *indptr_out = sub_csr.aux_data(csr::kIndPtr).dptr<dgl_id_t>();
+  std::copy(col_idx.begin(), col_idx.end(), indices_out);
+  std::copy(row_idx.begin(), row_idx.end(), indptr_out);
+  dgl_id_t *sub_eids = sub_csr.data().dptr<dgl_id_t>();
+  for (int64_t i = 0; i < nz_shape[0]; i++)
+    sub_eids[i] = i;
+
+  // Store the non-zeros in a subgraph with edge attributes of old edge ids.
+  if (old_eids) {
+    old_eids->CheckAndAllocData(nz_shape);
+    old_eids->CheckAndAllocAuxData(csr::kIdx, nz_shape);
+    old_eids->CheckAndAllocAuxData(csr::kIndPtr, indptr_shape);
+    dgl_id_t *indices_out = old_eids->aux_data(csr::kIdx).dptr<dgl_id_t>();
+    dgl_id_t *indptr_out = old_eids->aux_data(csr::kIndPtr).dptr<dgl_id_t>();
+    dgl_id_t *sub_eids = old_eids->data().dptr<dgl_id_t>();
+    std::copy(col_idx.begin(), col_idx.end(), indices_out);
+    std::copy(row_idx.begin(), row_idx.end(), indptr_out);
+    std::copy(orig_eids.begin(), orig_eids.end(), sub_eids);
+  }
+}
+
+static void DGLSubgraphComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                    const OpContext& ctx,
+                                    const std::vector<NDArray>& inputs,
+                                    const std::vector<OpReqType>& req,
+                                    const std::vector<NDArray>& outputs) {
+  const DGLSubgraphParam& params = nnvm::get<DGLSubgraphParam>(attrs.parsed);
+  int num_g = params.num_args - 1;
+#pragma omp parallel for
+  for (int i = 0; i < num_g; i++) {
+    const NDArray *old_eids = params.return_mapping ? &outputs[i + num_g] : nullptr;
+    GetSubgraph(inputs[0], inputs[i + 1], outputs[i], old_eids);
+  }
+}
+
+NNVM_REGISTER_OP(_contrib_dgl_subgraph)
+.describe(R"code(This operator constructs an induced subgraph for
+a given set of vertices from a graph. The operator accepts multiple
+sets of vertices as input. For each set of vertices, it returns a pair
+of CSR matrices if return_mapping is True: the first matrix contains edges
+with new edge Ids, the second matrix contains edges with the original
+edge Ids.
+
+Example::
+
+  x=[[1, 0, 0, 2],
+     [3, 0, 4, 0],
+     [0, 5, 0, 0],
+     [0, 6, 7, 0]]
+
+  v = [0, 1, 2]
+
+  dgl_subgraph(x, v, return_mapping=True) =
+    [[1, 0, 0],
+     [2, 0, 3],
+     [0, 4, 0]],
+    [[1, 0, 0],
+     [3, 0, 4],
+     [0, 5, 0]]
+
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<DGLSubgraphParam>)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const DGLSubgraphParam& params = nnvm::get<DGLSubgraphParam>(attrs.parsed);
+  return params.num_args;
+})
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const DGLSubgraphParam& params = nnvm::get<DGLSubgraphParam>(attrs.parsed);
+  int num_varray = params.num_args - 1;
+  if (params.return_mapping)
+    return num_varray * 2;
+  else
+    return num_varray;
+})
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+  const DGLSubgraphParam& params = nnvm::get<DGLSubgraphParam>(attrs.parsed);
+  std::vector<std::string> names;
+  names.reserve(params.num_args);
+  names.emplace_back("graph");
+  for (int i = 1; i < params.num_args; ++i)
+    names.push_back("varray" + std::to_string(i - 1));
+  return names;
+})
+.set_attr<FInferStorageType>("FInferStorageType", DGLSubgraphStorageType)
+.set_attr<nnvm::FInferShape>("FInferShape", DGLSubgraphShape)
+.set_attr<nnvm::FInferType>("FInferType", DGLSubgraphType)
+.set_attr<FComputeEx>("FComputeEx<cpu>", DGLSubgraphComputeExCPU)
+.set_attr<std::string>("key_var_num_args", "num_args")
+.add_argument("graph", "NDArray-or-Symbol", "Input graph where we sample vertices.")
+.add_argument("data", "NDArray-or-Symbol[]",
+              "The input arrays that include data arrays and states.")
+.add_arguments(DGLSubgraphParam::__FIELDS__());
+
+///////////////////////// Edge Id ///////////////////////////
+
+inline bool EdgeIDShape(const nnvm::NodeAttrs& attrs,
+                        std::vector<TShape>* in_attrs,
+                        std::vector<TShape>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 3U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  CHECK_EQ(in_attrs->at(1).ndim(), 1U);
+  CHECK_EQ(in_attrs->at(2).ndim(), 1U);
+  CHECK_EQ(in_attrs->at(1)[0], in_attrs->at(2)[0]);
+
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(1));
+  SHAPE_ASSIGN_CHECK(*in_attrs, 1, out_attrs->at(0));
+  SHAPE_ASSIGN_CHECK(*in_attrs, 2, out_attrs->at(0));
+  return out_attrs->at(0).ndim() != 0U && out_attrs->at(0).Size() != 0U;
+}
+
+inline bool EdgeIDType(const nnvm::NodeAttrs& attrs,
+                       std::vector<int>* in_attrs,
+                       std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 3U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+  TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+  return out_attrs->at(0) != -1;
+}
+
+inline bool EdgeIDStorageType(const nnvm::NodeAttrs& attrs,
+                              const int dev_mask,
+                              DispatchMode* dispatch_mode,
+                              std::vector<int>* in_attrs,
+                              std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 3U) << "Only works for 2d arrays";
+  CHECK_EQ(out_attrs->size(), 1U);
+  int& in_stype = in_attrs->at(0);
+  int& out_stype = out_attrs->at(0);
+  bool dispatched = false;
+  if (!dispatched && in_stype == kCSRStorage) {
+    // csr -> dns
+    dispatched = storage_type_assign(&out_stype, kDefaultStorage,
+                                     dispatch_mode, DispatchMode::kFComputeEx);
+  }
+  if (!dispatched) {
+    LOG(ERROR) << "Cannot dispatch edge_id storage type, only works for csr matrices";
+  }
+  return dispatched;
+}
+
+struct edge_id_csr_forward {
+  template<typename DType, typename IType, typename CType>
+  MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data,
+                                  const IType* in_indices, const IType* in_indptr,
+                                  const CType* u, const CType* v) {
+    const int64_t target_row_id = static_cast<int64_t>(u[i]);
+    const IType target_col_id = static_cast<IType>(v[i]);
+    auto ptr = std::find(in_indices + in_indptr[target_row_id],
+                         in_indices + in_indptr[target_row_id + 1], target_col_id);
+    if (ptr == in_indices + in_indptr[target_row_id + 1]) {
+      // does not exist in the range
+      out_data[i] = DType(-1);
+    } else {
+      out_data[i] = *(in_data + (ptr - in_indices));
+    }
+  }
+};
+
+template<typename xpu>
+void EdgeIDForwardCsrImpl(const OpContext& ctx,
+                          const std::vector<NDArray>& inputs,
+                          const OpReqType req,
+                          const NDArray& output) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  using namespace csr;
+  if (req == kNullOp) return;
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(req, kWriteTo) << "EdgeID with CSR only supports kWriteTo";
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const NDArray& u = inputs[1];
+  const nnvm::dim_t out_elems = u.shape().Size();
+  if (!inputs[0].storage_initialized()) {
+    MSHADOW_TYPE_SWITCH(output.dtype(), DType, {
+      Kernel<mxnet_op::op_with_req<mshadow_op::identity, kWriteTo>, xpu>::Launch(
+        s, out_elems, output.data().dptr<DType>(), DType(-1));
+    });
+    return;
+  }
+  const NDArray& data = inputs[0];
+  const TBlob& in_data = data.data();
+  const TBlob& in_indices = data.aux_data(kIdx);
+  const TBlob& in_indptr = data.aux_data(kIndPtr);
+  const NDArray& v = inputs[2];
+
+  CHECK_EQ(data.aux_type(kIdx), data.aux_type(kIndPtr))
+    << "The dtypes of indices and indptr don't match";
+  MSHADOW_TYPE_SWITCH(data.dtype(), DType, {
+    MSHADOW_IDX_TYPE_SWITCH(data.aux_type(kIdx), IType, {
+      MSHADOW_TYPE_SWITCH(u.dtype(), CType, {
+        Kernel<edge_id_csr_forward, xpu>::Launch(
+          s, out_elems, output.data().dptr<DType>(), in_data.dptr<DType>(),
+          in_indices.dptr<IType>(), in_indptr.dptr<IType>(),
+          u.data().dptr<CType>(), v.data().dptr<CType>());
+      });
+    });
+  });
+}
+
+template<typename xpu>
+void EdgeIDForwardEx(const nnvm::NodeAttrs& attrs,
+                     const OpContext& ctx,
+                     const std::vector<NDArray>& inputs,
+                     const std::vector<OpReqType>& req,
+                     const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  const auto in_stype = inputs[0].storage_type();
+  const auto out_stype = outputs[0].storage_type();
+  if (in_stype == kCSRStorage && out_stype == kDefaultStorage) {
+    EdgeIDForwardCsrImpl<xpu>(ctx, inputs, req[0], outputs[0]);
+  } else {
+    LogUnimplementedOp(attrs, ctx, inputs, req, outputs);
+  }
+}
+
+NNVM_REGISTER_OP(_contrib_edge_id)
+.describe(R"code(This operator implements the edge_id function for a graph
+stored in a CSR matrix (the value of the CSR stores the edge Id of the graph).
+output[i] = input[u[i], v[i]] if there is an edge between u[i] and v[i],
+otherwise output[i] will be -1. Both u and v should be 1D vectors.
+
+Example::
+
+  x = [[ 1, 0, 0 ],
+       [ 0, 2, 0 ],
+       [ 0, 0, 3 ]]
+  u = [ 0, 0, 1, 1, 2, 2 ]
+  v = [ 0, 1, 1, 2, 0, 2 ]
+  edge_id(x, u, v) = [ 1, -1, 2, -1, -1, 3 ]
+
+The storage type of ``edge_id`` output depends on storage types of inputs
+  - edge_id(csr, default, default) = default
+  - default and rsp inputs are not supported
+
+)code" ADD_FILELINE)
+.set_num_inputs(3)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"data", "u", "v"};
+  })
+.set_attr<nnvm::FInferShape>("FInferShape", EdgeIDShape)
+.set_attr<nnvm::FInferType>("FInferType", EdgeIDType)
+.set_attr<FInferStorageType>("FInferStorageType", EdgeIDStorageType)
+.set_attr<FComputeEx>("FComputeEx<cpu>", EdgeIDForwardEx<cpu>)
+.add_argument("data", "NDArray-or-Symbol", "Input ndarray")
+.add_argument("u", "NDArray-or-Symbol", "u ndarray")
+.add_argument("v", "NDArray-or-Symbol", "v ndarray");
+
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/tests/python/unittest/test_contrib_operator.py b/tests/python/unittest/test_contrib_operator.py
index 58728d8dad68..43d3db648a85 100644
--- a/tests/python/unittest/test_contrib_operator.py
+++ b/tests/python/unittest/test_contrib_operator.py
@@ -261,6 +261,7 @@ def test_multibox_target_op():
     assert_array_equal(loc_mask.asnumpy(), expected_loc_mask)
     assert_array_equal(cls_target.asnumpy(), expected_cls_target)
 
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
diff --git a/tests/python/unittest/test_dgl_graph.py b/tests/python/unittest/test_dgl_graph.py
new file mode 100644
index 000000000000..774f811a0d1f
--- /dev/null
+++ b/tests/python/unittest/test_dgl_graph.py
@@ -0,0 +1,80 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: skip-file
+from __future__ import print_function
+import numpy as np
+import scipy as sp
+import mxnet as mx
+import random
+import itertools
+from numpy.testing import assert_allclose, assert_array_equal
+from mxnet.test_utils import *
+import unittest
+
+def test_edge_id():
+    shape = rand_shape_2d()
+    data = rand_ndarray(shape, stype='csr', density=0.4)
+    ground_truth = np.zeros(shape, dtype=np.float32)
+    ground_truth -= 1.0
+    indptr_np = data.indptr.asnumpy()
+    data_np = data.data.asnumpy()
+    indices_np = data.indices.asnumpy()
+    for i in range(shape[0]):
+        for j in range(indptr_np[i], indptr_np[i+1]):
+            idx = indices_np[j]
+            ground_truth[i, idx] = data_np[j]
+
+    np_u = np.random.randint(0, shape[0], size=(5, ))
+    np_v = np.random.randint(0, shape[1], size=(5, ))
+    mx_u = mx.nd.array(np_u)
+    mx_v = mx.nd.array(np_v)
+    assert_almost_equal(mx.nd.contrib.edge_id(data, mx_u, mx_v).asnumpy(),
+                        ground_truth[np_u, np_v], rtol=1e-5, atol=1e-6)
+
+def generate_graph(n):
+    arr = sp.sparse.random(n, n, density=0.2, format='coo')
+    arr.data = np.arange(0, len(arr.row), dtype=np.float32)
+    return arr.tocsr(), mx.nd.sparse.csr_matrix(arr.tocsr()).astype(np.int64)
+
+def test_subgraph():
+    sp_g, g = generate_graph(100)
+    vertices = np.unique(np.random.randint(0, 100, size=(20)))
+    subgs = mx.nd.contrib.dgl_subgraph(g, mx.nd.array(vertices, dtype=np.int64),
+                                       return_mapping=True)
+    subgs[0].check_format()
+    subgs[1].check_format()
+    assert_array_equal(subgs[0].indptr, subgs[1].indptr)
+    assert_array_equal(subgs[0].indices, subgs[1].indices)
+    sp_subg = subgs[1].asscipy()
+    for i in range(len(subgs[0].indptr) - 1):
+        subv1 = i
+        v1 = vertices[subv1]
+        row_start = int(subgs[0].indptr[subv1].asnumpy()[0])
+        row_end = int(subgs[0].indptr[subv1 + 1].asnumpy()[0])
+        if row_start >= len(subgs[0].indices):
+            remain = subgs[0].indptr[subv1:].asnumpy()
+            assert np.sum(remain == row_start) == len(remain)
+            break
+        row = subgs[0].indices[row_start:row_end]
+        for j, subv2 in enumerate(row.asnumpy()):
+            v2 = vertices[subv2]
+            assert sp_g[v1, v2] == sp_subg[subv1, subv2]
+
+if __name__ == "__main__":
+    import nose
+    nose.runmodule()
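
A quick usage sketch (editor's note, not part of the patch): the snippet below exercises the two operators added here through the Python ndarray API. The toy graph, its size, and the vertex set are illustrative assumptions only; the operator names, the int64 requirement, and the return_mapping keyword follow the registrations and the unit test above.

# Minimal sketch, assuming MXNet is built with this patch applied.
import numpy as np
import scipy.sparse as spsp
import mxnet as mx

# Build a small CSR "graph" whose stored values act as edge ids,
# mirroring generate_graph() in the new unit test.
coo = spsp.random(10, 10, density=0.3, format='coo')
coo.data = np.arange(len(coo.row), dtype=np.float32)
graph = mx.nd.sparse.csr_matrix(coo.tocsr()).astype(np.int64)

# edge_id: look up the edge id stored at (u[i], v[i]); pairs with no edge return -1.
u = mx.nd.array([0, 1, 2], dtype=np.int64)
v = mx.nd.array([1, 2, 3], dtype=np.int64)
print(mx.nd.contrib.edge_id(graph, u, v).asnumpy())

# dgl_subgraph: induced subgraph on a sorted vertex set. With return_mapping=True
# the first output relabels edges with new ids and the second keeps the parent
# graph's original edge ids.
vertices = mx.nd.array([0, 2, 5, 7], dtype=np.int64)
new_eids, old_eids = mx.nd.contrib.dgl_subgraph(graph, vertices, return_mapping=True)
print(new_eids.asnumpy())
print(old_eids.asnumpy())

Both outputs of dgl_subgraph are CSR NDArrays of shape (len(vertices), len(vertices)), which is why the sketch densifies them with asnumpy() only for printing.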