From 624584c6ad471057537f3203aff98421a047dd73 Mon Sep 17 00:00:00 2001 From: yosefe Date: Tue, 4 Nov 2014 13:34:27 +0200 Subject: [PATCH 1/4] IB/RC: Initial implementation of RC transport. + Memory mapping/unmapping. + Remote key pack/unpack. + Implement put_short() for mlx5 device. + Add unit test and performance test for put_short() --- Makefile.am | 2 +- config/m4/ib.m4 | 12 +- configure.ac | 1 + src/uct/Makefile.am | 24 ++- src/uct/api/tl.h | 121 ++++++++++---- src/uct/api/uct.h | 94 ++++++++++- src/uct/api/uct_def.h | 8 +- src/uct/ib/base/ib_context.c | 11 +- src/uct/ib/base/ib_context.h | 7 + src/uct/ib/base/ib_device.c | 93 ++++++++++- src/uct/ib/base/ib_device.h | 1 + src/uct/ib/base/ib_iface.c | 55 ++++++- src/uct/ib/base/ib_iface.h | 18 ++- src/uct/ib/base/ib_verbs.h | 2 + src/uct/ib/mlx5/ib_mlx5.c | 69 ++++++++ src/uct/ib/mlx5/ib_mlx5.h | 66 ++++++++ src/uct/ib/rc/rc_ep.c | 141 +++++++++++++++++ src/uct/ib/rc/rc_ep.h | 37 +++++ src/uct/ib/rc/rc_iface.c | 41 ++++- src/uct/ib/rc/rc_iface.h | 11 +- src/uct/ib/rc/rc_mlx5.c | 187 ++++++++++++++++++++++ src/uct/ib/rc/rc_mlx5.h | 31 ++++ src/uct/ib/rc/rc_tl.c | 27 ---- src/uct/tl/context.c | 8 +- src/uct/tl/tl.c | 9 -- test/gtest/uct/test_uct_context.cc | 119 ++++++++++++++ test/perf/Makefile.am | 14 ++ test/perf/perftest.c | 243 +++++++++++++++++++++++++++++ 28 files changed, 1342 insertions(+), 110 deletions(-) create mode 100644 src/uct/ib/mlx5/ib_mlx5.c create mode 100644 src/uct/ib/mlx5/ib_mlx5.h create mode 100644 src/uct/ib/rc/rc_ep.c create mode 100644 src/uct/ib/rc/rc_ep.h create mode 100644 src/uct/ib/rc/rc_mlx5.c create mode 100644 src/uct/ib/rc/rc_mlx5.h delete mode 100644 src/uct/ib/rc/rc_tl.c delete mode 100644 src/uct/tl/tl.c create mode 100644 test/perf/Makefile.am create mode 100644 test/perf/perftest.c diff --git a/Makefile.am b/Makefile.am index 1bf96cbd6de..5b4ff063449 100644 --- a/Makefile.am +++ b/Makefile.am @@ -10,7 +10,7 @@ ACLOCAL_AMFLAGS = -I config/m4 -SUBDIRS = src/ucs src/uct +SUBDIRS = src/ucs src/uct test/perf if HAVE_GTEST SUBDIRS += test/gtest diff --git a/config/m4/ib.m4 b/config/m4/ib.m4 index d65ab4c351f..da8d9a40e27 100644 --- a/config/m4/ib.m4 +++ b/config/m4/ib.m4 @@ -75,14 +75,24 @@ AC_CHECK_HEADER([infiniband/verbs_exp.h], [verbs_exp=no]) +# +# mlx5 PRM +# +with_mlx5_hw=no +AC_CHECK_HEADERS([infiniband/mlx5_hw.h], + [AC_MSG_NOTICE([Compiling with mlx5 bare-metal support]) + AC_DEFINE([HAVE_MLX5_HW], 1, [mlx5 bare-metal support]) + with_mlx5_hw=yes]) + + # # For automake # AM_CONDITIONAL([HAVE_IB], [test "x$with_ib" != xno]) AM_CONDITIONAL([HAVE_TL_RC], [test "x$with_rc" != xno]) +AM_CONDITIONAL([HAVE_MLX5_HW], [test "x$with_mlx5_hw" != xno]) mlnx_valg_libdir=/usr/lib64/mlnx_ofed/valgrind AS_IF([test -d "$mlnx_valg_libdir"], [AC_MSG_NOTICE([Added $mlnx_valg_libdir to valgrind LD_LIBRARY_PATH]) valgrind_libpath="$mlnx_valg_libdir:$valgrind_libpath"]) - diff --git a/configure.ac b/configure.ac index 3956c1e4472..e91b7b41fbb 100644 --- a/configure.ac +++ b/configure.ac @@ -178,6 +178,7 @@ AC_CONFIG_FILES([ src/uct/Makefile src/uct/api/version.h test/gtest/Makefile + test/perf/Makefile ]) AC_OUTPUT diff --git a/src/uct/Makefile.am b/src/uct/Makefile.am index f0d0aebc3cf..33204fa008c 100644 --- a/src/uct/Makefile.am +++ b/src/uct/Makefile.am @@ -22,8 +22,7 @@ noinst_HEADERS = \ tl/context.h libuct_la_SOURCES = \ - tl/context.c \ - tl/tl.c + tl/context.c if HAVE_IB noinst_HEADERS += \ @@ -36,13 +35,30 @@ libuct_la_SOURCES += \ ib/base/ib_context.c \ ib/base/ib_device.c \ ib/base/ib_iface.c + +if HAVE_MLX5_HW +noinst_HEADERS += \ + ib/mlx5/ib_mlx5.h + +libuct_la_SOURCES += \ + ib/mlx5/ib_mlx5.c +endif + endif if HAVE_TL_RC noinst_HEADERS += \ + ib/rc/rc_ep.h \ ib/rc/rc_iface.h libuct_la_SOURCES += \ - ib/rc/rc_iface.c \ - ib/rc/rc_tl.c + ib/rc/rc_ep.c \ + ib/rc/rc_iface.c + +if HAVE_MLX5_HW +libuct_la_SOURCES += \ + ib/rc/rc_mlx5.c \ + ib/rc/rc_mlx5.h +endif + endif diff --git a/src/uct/api/tl.h b/src/uct/api/tl.h index a4991d86b03..d8f9ce9efa6 100644 --- a/src/uct/api/tl.h +++ b/src/uct/api/tl.h @@ -21,19 +21,25 @@ /** - * Communication interface context + * Communication resource. */ -typedef struct uct_iface { - uct_tl_ops_t *ops; -} uct_iface_t; +typedef struct uct_resource_desc { + char tl_name[UCT_MAX_NAME_LEN]; /* Transport name */ + char hw_name[UCT_MAX_NAME_LEN]; /* Hardware resource name */ + uint64_t latency; /* Latency, nanoseconds */ + size_t bandwidth; /* Bandwidth, bytes/second */ + cpu_set_t local_cpus; /* Mask of CPUs near the resource */ + socklen_t addrlen; /* Size of address */ + struct sockaddr_storage subnet_addr; /* Subnet address. Devices which can + reach each other have same address */ +} uct_resource_desc_t; -/** - * Remote endpoint - */ -typedef struct uct_ep { - uct_tl_ops_t *ops; -} uct_ep_t; +struct uct_iface_addr { +}; + +struct uct_ep_addr { +}; /** @@ -56,24 +62,17 @@ typedef struct uct_iface_attr { /** - * Communication resource. + * Protection domain attributes */ -typedef struct uct_resource_desc { - char tl_name[UCT_MAX_NAME_LEN]; /* Transport name */ - char hw_name[UCT_MAX_NAME_LEN]; /* Hardware resource name */ - uint64_t latency; /* Latency, nanoseconds */ - size_t bandwidth; /* Bandwidth, bytes/second */ - cpu_set_t local_cpus; /* Mask of CPUs near the resource */ - socklen_t addrlen; /* Size of address */ - struct sockaddr_storage subnet_addr; /* Subnet address. Devices which can - reach each other have same address */ -} uct_resource_desc_t; +typedef struct uct_pd_attr { + size_t rkey_packed_size; /* Size of buffer needed for packed rkey */ +} uct_pd_attr_t; /** - * Transport operations. + * Transport "global" operations */ -struct uct_tl_ops { +typedef struct uct_tl_ops { ucs_status_t (*query_resources)(uct_context_h context, uct_resource_desc_t **resources_p, @@ -81,29 +80,83 @@ struct uct_tl_ops { ucs_status_t (*iface_open)(uct_context_h context, const char *hw_name, uct_iface_h *iface_p); - void (*iface_close)(uct_iface_h iface); +} uct_tl_ops_t; + + +/** + * Transport memory operations + */ +typedef struct uct_pd_ops { + ucs_status_t (*query)(uct_pd_h pd, uct_pd_attr_t *pd_attr); + + ucs_status_t (*mem_map)(uct_pd_h pd, void *address, size_t length, + uct_lkey_t *lkey_p); + + ucs_status_t (*mem_unmap)(uct_pd_h pd, uct_lkey_t lkey); + + /* TODO support "mem attach", MPI-3 style */ + + ucs_status_t (*rkey_pack)(uct_pd_h pd, uct_lkey_t lkey, void *rkey_buffer); + + ucs_status_t (*rkey_unpack)(uct_pd_h pd, void *rkey_buffer, uct_rkey_t *rkey_p); + + void (*rkey_release)(uct_pd_h pd, uct_rkey_t rkey); +} uct_pd_ops_t; + + +/** + * Transport iface operations. + */ +typedef struct uct_iface_ops { ucs_status_t (*iface_query)(uct_iface_h iface, uct_iface_attr_t *iface_attr); ucs_status_t (*iface_get_address)(uct_iface_h iface, uct_iface_addr_t *iface_addr); - ucs_status_t (*ep_create)(uct_ep_h *ep_p); + ucs_status_t (*iface_flush)(uct_iface_h iface, uct_req_h *req_p, + uct_completion_cb_t cb); + + void (*iface_close)(uct_iface_h iface); + + ucs_status_t (*ep_create)(uct_iface_h iface, uct_ep_h *ep_p); void (*ep_destroy)(uct_ep_h ep); - ucs_status_t (*ep_get_address)(uct_ep_h *ep, + ucs_status_t (*ep_get_address)(uct_ep_h ep, + uct_ep_addr_t *ep_addr); + ucs_status_t (*ep_connect_to_iface)(uct_ep_h ep, uct_iface_addr_t *iface_addr); + ucs_status_t (*ep_connect_to_ep)(uct_ep_h ep, uct_iface_addr_t *iface_addr, uct_ep_addr_t *ep_addr); - ucs_status_t (*ep_connect_to_iface)(uct_iface_addr_t *iface_addr); - ucs_status_t (*ep_connect_to_ep)(uct_iface_addr_t *iface_addr, - uct_ep_addr_t *ep_addr); ucs_status_t (*ep_put_short)(uct_ep_h ep, void *buffer, unsigned length, - uct_rkey_t rkey, uct_req_h *req_p, - uct_completion_cb_t cb); + uint64_t remote_addr, uct_rkey_t rkey, + uct_req_h *req_p, uct_completion_cb_t cb); +} uct_iface_ops_t; - ucs_status_t (*iface_flush)(uct_iface_h iface, uct_req_h *req_p, - uct_completion_cb_t cb); -}; + +/** + * Protection domain + */ +typedef struct uct_pd { + uct_pd_ops_t *ops; +} uct_pd_t; + + +/** + * Communication interface context + */ +typedef struct uct_iface { + uct_iface_ops_t ops; + uct_pd_h pd; +} uct_iface_t; + + +/** + * Remote endpoint + */ +typedef struct uct_ep { + uct_iface_h iface; +} uct_ep_t; #endif diff --git a/src/uct/api/uct.h b/src/uct/api/uct.h index 0caa7b8703e..a9f71db10b7 100644 --- a/src/uct/api/uct.h +++ b/src/uct/api/uct.h @@ -58,6 +58,7 @@ void uct_release_resource_list(uct_resource_desc_t *resources); /** + * @ingroup CONTEXT * @brief Open a communication interface. * * @param [in] context Handle to context. @@ -71,12 +72,91 @@ ucs_status_t uct_iface_open(uct_context_h context, const char *tl_name, const char *hw_name, uct_iface_h *iface_p); -/** - * @brief Close a communication interface. - * - * @param [in] iface Interface to close. - */ -void uct_iface_close(uct_iface_h iface); - +static inline ucs_status_t uct_pd_query(uct_pd_h pd, uct_pd_attr_t *pd_attr) +{ + return pd->ops->query(pd, pd_attr); +} + +static inline ucs_status_t uct_pd_mem_map(uct_pd_h pd, void *address, + size_t length, uct_lkey_t *lkey_p) +{ + return pd->ops->mem_map(pd, address, length, lkey_p); +} + +static inline ucs_status_t uct_pd_mem_unmap(uct_pd_h pd, uct_lkey_t lkey) +{ + return pd->ops->mem_unmap(pd, lkey); +} + +static inline ucs_status_t uct_pd_rkey_pack(uct_pd_h pd, uct_lkey_t lkey, void *rkey_buffer) +{ + return pd->ops->rkey_pack(pd, lkey, rkey_buffer); +} + +static inline ucs_status_t uct_pd_rkey_unpack(uct_pd_h pd, void *rkey_buffer, uct_rkey_t *rkey_p) +{ + return pd->ops->rkey_unpack(pd, rkey_buffer, rkey_p); +} + +static inline void uct_pd_rkey_release(uct_pd_h pd, uct_rkey_t rkey) +{ + pd->ops->rkey_release(pd, rkey); +} + +static inline ucs_status_t uct_iface_query(uct_iface_h iface, + uct_iface_attr_t *iface_attr) +{ + return iface->ops.iface_query(iface, iface_attr); +} + +static inline ucs_status_t uct_iface_get_address(uct_iface_h iface, + uct_iface_addr_t *iface_addr) +{ + return iface->ops.iface_get_address(iface, iface_addr); +} + +static inline ucs_status_t uct_iface_flush(uct_iface_h iface, uct_req_h *req_p, + uct_completion_cb_t cb) +{ + return iface->ops.iface_flush(iface, req_p, cb); +} + +static inline void uct_iface_close(uct_iface_h iface) +{ + iface->ops.iface_close(iface); +} + +static inline ucs_status_t uct_ep_create(uct_iface_h iface, uct_ep_h *ep_p) +{ + return iface->ops.ep_create(iface, ep_p); +} + +static inline void uct_ep_destroy(uct_ep_h ep) +{ + ep->iface->ops.ep_destroy(ep); +} + +static inline ucs_status_t uct_ep_get_address(uct_ep_h ep, uct_ep_addr_t *ep_addr) +{ + return ep->iface->ops.ep_get_address(ep, ep_addr); +} + +static inline ucs_status_t uct_ep_connect_to_iface(uct_ep_h ep, uct_iface_addr_t *iface_addr) +{ + return ep->iface->ops.ep_connect_to_iface(ep, iface_addr); +} + +static inline ucs_status_t uct_ep_connect_to_ep(uct_ep_h ep, uct_iface_addr_t *iface_addr, + uct_ep_addr_t *ep_addr) +{ + return ep->iface->ops.ep_connect_to_ep(ep, iface_addr, ep_addr); +} + +static inline ucs_status_t uct_ep_put_short(uct_ep_h ep, void *buffer, unsigned length, + uint64_t remote_addr, uct_rkey_t rkey, + uct_req_h *req_p, uct_completion_cb_t cb) +{ + return ep->iface->ops.ep_put_short(ep, buffer, length, remote_addr, rkey, req_p, cb); +} #endif diff --git a/src/uct/api/uct_def.h b/src/uct/api/uct_def.h index fb6ae84557b..17c0497892c 100644 --- a/src/uct/api/uct_def.h +++ b/src/uct/api/uct_def.h @@ -17,10 +17,10 @@ typedef struct uct_iface *uct_iface_h; typedef struct uct_iface_addr uct_iface_addr_t; typedef struct uct_ep *uct_ep_h; typedef struct uct_ep_addr uct_ep_addr_t; -typedef struct uct_tl_ops uct_tl_ops_t; -typedef uint64_t uct_lkey_t; -typedef uint64_t uct_rkey_t; +typedef uintptr_t uct_lkey_t; +typedef uintptr_t uct_rkey_t; typedef struct uct_req *uct_req_h; - +typedef struct uct_memory_region *uct_memory_region_h; +typedef struct uct_pd *uct_pd_h; #endif diff --git a/src/uct/ib/base/ib_context.c b/src/uct/ib/base/ib_context.c index 682d29f7e7a..50610261016 100644 --- a/src/uct/ib/base/ib_context.c +++ b/src/uct/ib/base/ib_context.c @@ -75,9 +75,11 @@ ucs_status_t uct_ib_query_resources(uct_context_h context, unsigned flags, static void uct_ib_register_tls(uct_context_t *context) { +#if HAVE_MLX5_HW #if HAVE_TL_RC - extern uct_tl_ops_t uct_rc_tl_ops; - uct_register_tl(context, "rc", &uct_rc_tl_ops); + extern uct_tl_ops_t uct_rc_mlx5_tl_ops; + uct_register_tl(context, "rc_mlx5", &uct_rc_mlx5_tl_ops); +#endif #endif } @@ -122,11 +124,8 @@ ucs_status_t uct_ib_init(uct_context_h context) if (ibctx->num_devices > 0) { ucs_debug("initialized IB component with %u devices", ibctx->num_devices); uct_ib_register_tls(context); - status = UCS_OK; - } else { - ucs_free(ibctx->devices); - status = UCS_ERR_NO_DEVICE; } + status = UCS_OK; out_free_device_list: ibv_free_device_list(device_list); diff --git a/src/uct/ib/base/ib_context.h b/src/uct/ib/base/ib_context.h index 798b10ae195..f4de715b773 100644 --- a/src/uct/ib/base/ib_context.h +++ b/src/uct/ib/base/ib_context.h @@ -9,6 +9,12 @@ #define UCT_IB_CONTEXT_H_ #include "ib_device.h" +#include + + +#define UCT_IB_RESOURCE_FLAG_MLX4_PRM UCS_BIT(1) /* Device supports mlx4 PRM */ +#define UCT_IB_RESOURCE_FLAG_MLX5_PRM UCS_BIT(2) /* Device supports mlx5 PRM */ +#define UCT_IB_RESOURCE_FLAG_DC UCS_BIT(3) /* Device supports DC */ typedef struct uct_ib_context uct_ib_context_t; @@ -17,6 +23,7 @@ struct uct_ib_context { uct_ib_device_t **devices; /* Array of devices */ }; + /* * Helper function to list IB resources */ diff --git a/src/uct/ib/base/ib_device.c b/src/uct/ib/base/ib_device.c index 3eeab80836d..7ae655ce3fe 100644 --- a/src/uct/ib/base/ib_device.c +++ b/src/uct/ib/base/ib_device.c @@ -7,6 +7,7 @@ #define _GNU_SOURCE /* for CPU_ZERO/CPU_SET in sched.h */ #include "ib_device.h" +#include "ib_context.h" #include #include @@ -51,6 +52,71 @@ static void uct_ib_device_get_affinity(const char *dev_name, cpu_set_t *cpu_mask } } +static ucs_status_t uct_ib_pd_query(uct_pd_h pd, uct_pd_attr_t *pd_attr) +{ + pd_attr->rkey_packed_size = sizeof(uint32_t); + return UCS_OK; +} + +static ucs_status_t uct_ib_mem_map(uct_pd_h pd, void *address, size_t length, + uct_lkey_t *lkey_p) +{ + uct_ib_device_t *dev = ucs_derived_of(pd, uct_ib_device_t); + struct ibv_mr *mr; + + mr = ibv_reg_mr(dev->pd, address, length, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_ATOMIC); + if (mr == NULL) { + ucs_error("ibv_reg_mr() failed: %m"); + return UCS_ERR_IO_ERROR; + } + + *lkey_p = (uintptr_t)mr; + return UCS_OK; +} + +static ucs_status_t uct_ib_mem_unmap(uct_pd_h pd, uct_lkey_t lkey) +{ + struct ibv_mr *mr = (void*)lkey; + int ret; + + ret = ibv_dereg_mr(mr); + if (ret != 0) { + ucs_error("ibv_dereg_mr() failed: %m"); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +static ucs_status_t uct_ib_rkey_pack(uct_pd_h pd, uct_lkey_t lkey, + void *rkey_buffer) +{ + struct ibv_mr *mr = (void*)lkey; + + *(uint32_t*)rkey_buffer = htonl(mr->rkey); /* Use r-keys as big endian */ + return UCS_OK; +} + +static ucs_status_t uct_ib_rkey_unpack(uct_pd_h pd, void *rkey_buffer, + uct_rkey_t *rkey_p) +{ + *rkey_p = *(uint32_t*)rkey_buffer; + return UCS_OK; +} + +uct_pd_ops_t uct_ib_pd_ops = { + .query = uct_ib_pd_query, + .mem_map = uct_ib_mem_map, + .mem_unmap = uct_ib_mem_unmap, + .rkey_pack = uct_ib_rkey_pack, + .rkey_unpack = uct_ib_rkey_unpack, + .rkey_release = (void*)ucs_empty_function +}; + ucs_status_t uct_ib_device_create(struct ibv_device *ibv_device, uct_ib_device_t **dev_p) { struct ibv_context *ibv_context; @@ -99,6 +165,7 @@ ucs_status_t uct_ib_device_create(struct ibv_device *ibv_device, uct_ib_device_t } /* Save device information */ + dev->super.ops = &uct_ib_pd_ops; dev->ibv_context = ibv_context; dev->dev_attr = dev_attr; dev->first_port = first_port; @@ -119,6 +186,14 @@ ucs_status_t uct_ib_device_create(struct ibv_device *ibv_device, uct_ib_device_t } } + /* Allocate protection domain */ + dev->pd = ibv_alloc_pd(dev->ibv_context); + if (dev->pd == NULL) { + ucs_error("ibv_alloc_pd() failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err_free_device; + } + ucs_debug("created device '%s' (%s) with %d ports", uct_ib_device_name(dev), ibv_node_type_str(ibv_device->node_type), dev->num_ports); @@ -136,6 +211,7 @@ ucs_status_t uct_ib_device_create(struct ibv_device *ibv_device, uct_ib_device_t void uct_ib_device_destroy(uct_ib_device_t *dev) { + ibv_dealloc_pd(dev->pd); ibv_close_device(dev->ibv_context); ucs_free(dev); } @@ -150,7 +226,22 @@ int uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num, unsigned fl return 0; } - /* TODO check flags, e.g DC support */ + if (flags & UCT_IB_RESOURCE_FLAG_DC) { + if (!IBV_DEVICE_HAS_DC(&dev->dev_attr)) { + return 0; + } + } + + if (flags & UCT_IB_RESOURCE_FLAG_MLX4_PRM) { + return 0; /* Unsupported yet */ + } + + if (flags & UCT_IB_RESOURCE_FLAG_MLX5_PRM) { + /* TODO list all devices with their flags */ + if (dev->dev_attr.vendor_id != 0x02c9 || dev->dev_attr.vendor_part_id != 4113) { + return 0; + } + } return 1; } diff --git a/src/uct/ib/base/ib_device.h b/src/uct/ib/base/ib_device.h index 6c62d2b72ec..a76d076ee36 100644 --- a/src/uct/ib/base/ib_device.h +++ b/src/uct/ib/base/ib_device.h @@ -16,6 +16,7 @@ typedef struct uct_ib_device uct_ib_device_t; struct uct_ib_device { + uct_pd_t super; struct ibv_context *ibv_context; /* Verbs context */ struct ibv_pd *pd; /* Protection domain */ struct ibv_exp_device_attr dev_attr; /* Cached device attributes */ diff --git a/src/uct/ib/base/ib_iface.c b/src/uct/ib/base/ib_iface.c index ed78d279398..4a78fc4fbbd 100644 --- a/src/uct/ib/base/ib_iface.c +++ b/src/uct/ib/base/ib_iface.c @@ -9,6 +9,7 @@ #include "ib_context.h" #include +#include #include #include @@ -42,7 +43,7 @@ static ucs_status_t uct_ib_iface_find_port(uct_ib_context_t *ibctx, return UCS_ERR_NO_DEVICE; /* Port number out of range */ } - iface->device = dev; + iface->super.pd = &dev->super; iface->port_num = port_num; return UCS_OK; } @@ -56,12 +57,64 @@ ucs_status_t ucs_ib_iface_init(uct_context_h context, uct_ib_iface_t *iface, const char *hw_name) { uct_ib_context_t *ibctx = ucs_component_get(context, ib, uct_ib_context_t); + struct ibv_exp_port_attr *port_attr; + uct_ib_device_t *dev; ucs_status_t status; status = uct_ib_iface_find_port(ibctx, iface, hw_name); + if (status != UCS_OK) { + goto err; + } + + dev = uct_ib_iface_device(iface); + + /* TODO comp_channel */ + /* TODO cqe */ + iface->send_cq = ibv_create_cq(dev->ibv_context, 1024, NULL, NULL, 0); + if (iface->send_cq == NULL) { + ucs_error("Failed to create send cq: %m"); + status = UCS_ERR_IO_ERROR; + goto err; + } + + iface->recv_cq = ibv_create_cq(dev->ibv_context, 1024, NULL, NULL, 0); + if (iface->recv_cq == NULL) { + ucs_error("Failed to create recv cq: %m"); + goto err_destroy_send_cq; + } + + port_attr = uct_ib_device_port_attr(dev, iface->port_num); + switch (port_attr->link_layer) { + case IBV_LINK_LAYER_UNSPECIFIED: + case IBV_LINK_LAYER_INFINIBAND: + iface->addr.lid = port_attr->lid; + break; + default: + ucs_error("Unsupported link layer"); + goto err_destroy_recv_cq; + } + + return UCS_OK; + +err_destroy_recv_cq: + ibv_destroy_cq(iface->recv_cq); +err_destroy_send_cq: + ibv_destroy_cq(iface->send_cq); +err: return status; } void ucs_ib_iface_cleanup(uct_ib_iface_t *iface) { + int ret; + + ret = ibv_destroy_cq(iface->recv_cq); + if (ret != 0) { + ucs_warn("ibv_destroy_cq(recv_cq) returned %d: %m", ret); + } + + ret = ibv_destroy_cq(iface->send_cq); + if (ret != 0) { + ucs_warn("ibv_destroy_cq(send_cq) returned %d: %m", ret); + } } diff --git a/src/uct/ib/base/ib_iface.h b/src/uct/ib/base/ib_iface.h index d37f984d5db..b1df026be0c 100644 --- a/src/uct/ib/base/ib_iface.h +++ b/src/uct/ib/base/ib_iface.h @@ -11,21 +11,27 @@ #include "ib_device.h" #include +#include + + +typedef struct uct_ib_iface_addr { + uct_iface_addr_t super; + uint16_t lid; /* TODO support RoCE/GRH */ +} uct_ib_iface_addr_t; typedef struct uct_ib_iface { uct_iface_t super; - uct_ib_device_t *device; uint8_t port_num; - /* TODO * lmc - * port_addr; * sl * gid_index - * port_num * comp_channel; */ + uct_ib_iface_addr_t addr; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; } uct_ib_iface_t; @@ -33,5 +39,9 @@ ucs_status_t ucs_ib_iface_init(uct_context_h context, uct_ib_iface_t *iface, const char *hw_name); void ucs_ib_iface_cleanup(uct_ib_iface_t *iface); +static inline uct_ib_device_t * uct_ib_iface_device(uct_ib_iface_t *iface) +{ + return ucs_derived_of(iface->super.pd, uct_ib_device_t); +} #endif diff --git a/src/uct/ib/base/ib_verbs.h b/src/uct/ib/base/ib_verbs.h index 8ea8f0aa8d4..9d7a40e76b0 100644 --- a/src/uct/ib/base/ib_verbs.h +++ b/src/uct/ib/base/ib_verbs.h @@ -74,12 +74,14 @@ static inline struct ibv_mr *ibv_exp_reg_mr(struct ibv_exp_reg_mr_in *in) } # define IBV_IS_MPAGES_AVAIL(_attr) ((_attr)->exp_device_cap_flags & IBV_EXP_DEVICE_MR_ALLOCATE) +# define IBV_DEVICE_HAS_DC(_attr) ((_attr)->exp_device_cap_flags & IBV_EXP_DEVICE_DC_TRANSPORT) # define IBV_EXP_REG_MR_FLAGS(_f, _e) ((_f) | (_e)) # define IBV_SHARED_MR_ACCESS_FLAGS(_shared_mr) ((_shared_mr)->exp_access) # define IBV_EXP_DEVICE_ATTR_SET_COMP_MASK(_attr) # define IBV_EXP_PORT_ATTR_SET_COMP_MASK(_attr) #else # define IBV_IS_MPAGES_AVAIL(_attr) ((_attr)->device_cap_flags2 & IBV_EXP_DEVICE_MR_ALLOCATE) +# define IBV_DEVICE_HAS_DC(_attr) 0 # define IBV_EXP_REG_MR_FLAGS(_f, _e) (_f) , (_e) # define IBV_SHARED_MR_ACCESS_FLAGS(_shared_mr) ((_shared_mr)->access) # define IBV_EXP_DEVICE_ATTR_SET_COMP_MASK(_attr) (_attr)->comp_mask = (IBV_EXP_DEVICE_ATTR_RESERVED - 1) diff --git a/src/uct/ib/mlx5/ib_mlx5.c b/src/uct/ib/mlx5/ib_mlx5.c new file mode 100644 index 00000000000..1177a1488ef --- /dev/null +++ b/src/uct/ib/mlx5/ib_mlx5.c @@ -0,0 +1,69 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* +* $COPYRIGHT$ +* $HEADER$ +*/ + +#include "ib_mlx5.h" + +#include +#include + + +ucs_status_t uct_ib_mlx5_get_qp_info(struct ibv_qp *qp, uct_ib_mlx5_qp_info_t *qp_info) +{ + struct mlx5_qp *mqp = ucs_container_of(qp, struct mlx5_qp, verbs_qp.qp); + + if ((mqp->sq.cur_post != 0) || (mqp->rq.head != 0) || mqp->bf->need_lock) { + return UCS_ERR_NO_DEVICE; + } + + qp_info->qpn = qp->qp_num; + qp_info->dbrec = mqp->db; + qp_info->sq.buf = mqp->buf.buf + mqp->sq.offset; + qp_info->sq.wqe_cnt = mqp->sq.wqe_cnt; + qp_info->sq.stride = 1 << mqp->sq.wqe_shift; + qp_info->rq.buf = mqp->buf.buf + mqp->rq.offset; + qp_info->rq.wqe_cnt = mqp->rq.wqe_cnt; + qp_info->rq.stride = 1 << mqp->rq.wqe_shift; + qp_info->bf.reg = mqp->bf->reg; + + if (mqp->bf->uuarn > 0) { + qp_info->bf.size = mqp->bf->buf_size; + } else { + qp_info->bf.size = 0; /* No BF */ + } + + return UCS_OK; +} + +ucs_status_t uct_ib_mlx5_get_cq_info(struct ibv_cq *cq, uct_ib_mlx5_cq_info_t *cq_info) +{ + struct mlx5_cq *mcq = ucs_container_of(cq, struct mlx5_cq, ibv_cq); + + if (mcq->cons_index != 0) { + return UCS_ERR_NO_DEVICE; + } + + cq_info->cqn = mcq->cqn; + cq_info->cqe_cnt = mcq->ibv_cq.cqe + 1; + cq_info->cqe_size = mcq->cqe_sz; + cq_info->buf = mcq->active_buf->buf; + cq_info->dbrec = mcq->dbrec; + + return UCS_OK; +} + +void uct_ib_mlx5_update_cq_ci(struct ibv_cq *cq, unsigned cq_ci) +{ + struct mlx5_cq *mcq = ucs_container_of(cq, struct mlx5_cq, ibv_cq); + + mcq->cons_index = cq_ci; +} + +void uct_ib_mlx5_get_av(struct ibv_ah *ah, struct mlx5_wqe_av *av) +{ + memcpy(av, &ucs_container_of(ah, struct mlx5_ah, ibv_ah)->av, sizeof(*av)); +} + diff --git a/src/uct/ib/mlx5/ib_mlx5.h b/src/uct/ib/mlx5/ib_mlx5.h new file mode 100644 index 00000000000..55ba8141a48 --- /dev/null +++ b/src/uct/ib/mlx5/ib_mlx5.h @@ -0,0 +1,66 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* +* $COPYRIGHT$ +* $HEADER$ +*/ + +#ifndef UCT_IB_MLX5_H_ +#define UCT_IB_MLX5_H_ + + +#include +#include +#include +#include +#include + + +typedef struct uct_ib_mlx5_qp_info { + uint32_t qpn; /* QP number */ + uint32_t *dbrec; /* QP doorbell record in RAM */ + + struct { + void *buf; /* Work queue buffer */ + unsigned wqe_cnt; /* Number of WQEs in the work queue */ + unsigned stride; /* Size of each WQE */ + } sq, rq; + + struct { + void *reg; /* BlueFlame register */ + unsigned size; /* BlueFlame register size (0 - unsupported) */ + } bf; +} uct_ib_mlx5_qp_info_t; + + +typedef struct uct_ib_mlx5_cq_info { + uint32_t cqn; /* CQ number */ + unsigned cqe_cnt; /* Number of CQEs in the queue */ + void *buf; /* CQ buffer */ + uint32_t *dbrec; /* CQ doorbell record */ + unsigned cqe_size; /* Size of a CQE */ +} uct_ib_mlx5_cq_info_t; + + +/** + * Get internal QP information. + */ +ucs_status_t uct_ib_mlx5_get_qp_info(struct ibv_qp *qp, uct_ib_mlx5_qp_info_t *qp_info); + +/** + * Get internal CQ information. + */ +ucs_status_t uct_ib_mlx5_get_cq_info(struct ibv_cq *cq, uct_ib_mlx5_cq_info_t *cq_info); + +/** + * Update CI to support req_notify_cq + */ +void uct_ib_mlx5_update_cq_ci(struct ibv_cq *cq, unsigned cq_ci); + +/** + * Get internal AV information. + */ +void uct_ib_mlx5_get_av(struct ibv_ah *ah, struct mlx5_wqe_av *av); + + +#endif diff --git a/src/uct/ib/rc/rc_ep.c b/src/uct/ib/rc/rc_ep.c new file mode 100644 index 00000000000..a4f774ae081 --- /dev/null +++ b/src/uct/ib/rc/rc_ep.c @@ -0,0 +1,141 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* +* $COPYRIGHT$ +* $HEADER$ +*/ + +#include "rc_ep.h" +#include "rc_iface.h" + +#include +#include +#include + + +ucs_status_t uct_rc_ep_init(uct_rc_ep_t *ep) +{ + uct_rc_iface_t *iface = ucs_derived_of(ep->super.iface, uct_rc_iface_t); + uct_ib_device_t *dev = uct_ib_iface_device(&iface->super); + struct ibv_qp_init_attr qp_init_attr; + ucs_status_t status; + + /* Create QP */ + qp_init_attr.qp_context = NULL; + qp_init_attr.send_cq = iface->super.send_cq; + qp_init_attr.recv_cq = iface->super.recv_cq; + qp_init_attr.srq = NULL; /* TODO */ + qp_init_attr.cap.max_send_wr = 1024; + qp_init_attr.cap.max_recv_wr = 1024; + qp_init_attr.cap.max_send_sge = 2; + qp_init_attr.cap.max_recv_sge = 1; + qp_init_attr.cap.max_inline_data = 0; + qp_init_attr.qp_type = IBV_QPT_RC; + qp_init_attr.sq_sig_all = 0; + qp_init_attr.xrc_domain = NULL; + ep->qp = ibv_create_qp(dev->pd, &qp_init_attr); + if (ep->qp == NULL) { + ucs_error("failed to create qp: %m"); + status = UCS_ERR_IO_ERROR; + goto err; + } + + return UCS_OK; + +err: + return status; +} + +void uct_rc_ep_cleanup(uct_rc_ep_t *ep) +{ + int ret; + + ret = ibv_destroy_qp(ep->qp); + if (ret != 0) { + ucs_warn("ibv_destroy_qp() returned %d: %m", ret); + } +} + +ucs_status_t uct_rc_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *ep_addr) +{ + uct_rc_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_ep_t); + + ((uct_rc_ep_addr_t*)ep_addr)->qp_num = ep->qp->qp_num; + return UCS_OK; +} + +ucs_status_t uct_rc_ep_connect_to_ep(uct_ep_h tl_ep, uct_iface_addr_t *tl_iface_addr, + uct_ep_addr_t *tl_ep_addr) +{ + uct_rc_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_ep_t); + uct_rc_iface_t *iface = ucs_derived_of(ep->super.iface, uct_rc_iface_t); + uct_ib_iface_addr_t *iface_addr = ucs_derived_of(tl_iface_addr, uct_ib_iface_addr_t); + uct_rc_ep_addr_t *ep_addr = ucs_derived_of(tl_ep_addr, uct_rc_ep_addr_t); + struct ibv_qp_attr qp_attr; + int ret; + + memset(&qp_attr, 0, sizeof(qp_attr)); + + qp_attr.qp_state = IBV_QPS_INIT; + qp_attr.pkey_index = 0; + qp_attr.port_num = iface->super.port_num; + qp_attr.qp_access_flags = IBV_ACCESS_LOCAL_WRITE| + IBV_ACCESS_REMOTE_WRITE| + IBV_ACCESS_REMOTE_READ| + IBV_ACCESS_REMOTE_ATOMIC; + ret = ibv_modify_qp(ep->qp, &qp_attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS); + if (ret) { + ucs_error("error modifying QP to INIT: %m"); + return UCS_ERR_IO_ERROR; + } + + qp_attr.qp_state = IBV_QPS_RTR; + qp_attr.ah_attr.dlid = iface_addr->lid; /* TODO LMC */ + qp_attr.ah_attr.sl = 9; /* TODO SL */ + qp_attr.ah_attr.src_path_bits = 0; /* TODO LMC */ + qp_attr.ah_attr.static_rate = 0; + qp_attr.ah_attr.is_global = 0; /* TODO RoCE */ + qp_attr.ah_attr.port_num = iface->super.port_num; + qp_attr.dest_qp_num = ep_addr->qp_num; + qp_attr.rq_psn = 0; + qp_attr.path_mtu = IBV_MTU_2048; /* TODO select by device type */ + qp_attr.max_dest_rd_atomic = 4; /* TODO consider this in sender */ + qp_attr.min_rnr_timer = 14; /* TODO config */ + ret = ibv_modify_qp(ep->qp, &qp_attr, + IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER); + if (ret) { + ucs_error("error modifying QP to RTR: %m"); + return UCS_ERR_IO_ERROR; + } + + qp_attr.qp_state = IBV_QPS_RTS; + qp_attr.sq_psn = 0; + qp_attr.timeout = 14; /* TODO config */ + qp_attr.rnr_retry = 7; /* TODO config */ + qp_attr.retry_cnt = 7; /* TODO config */ + qp_attr.max_rd_atomic = 4; /* TODO consider this in sender */ + ret = ibv_modify_qp(ep->qp, &qp_attr, + IBV_QP_STATE | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC); + if (ret) { + ucs_error("error modifying QP to RTS: %m"); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + diff --git a/src/uct/ib/rc/rc_ep.h b/src/uct/ib/rc/rc_ep.h new file mode 100644 index 00000000000..6d81e632024 --- /dev/null +++ b/src/uct/ib/rc/rc_ep.h @@ -0,0 +1,37 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* +* $COPYRIGHT$ +* $HEADER$ +*/ + +#ifndef UCT_RC_EP_H +#define UCT_RC_EP_H + +#include "rc_iface.h" + +#include + + +typedef struct uct_rc_ep_addr { + uct_ep_addr_t super; + uint32_t qp_num; +} uct_rc_ep_addr_t; + + +typedef struct uct_rc_ep { + uct_ep_t super; + struct ibv_qp *qp; +} uct_rc_ep_t; + + +ucs_status_t uct_rc_ep_init(uct_rc_ep_t *ep); + +void uct_rc_ep_cleanup(uct_rc_ep_t *ep); + +ucs_status_t uct_rc_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *ep_addr); + +ucs_status_t uct_rc_ep_connect_to_ep(uct_ep_h tl_ep, uct_iface_addr_t *tl_iface_addr, + uct_ep_addr_t *tl_ep_addr); + +#endif diff --git a/src/uct/ib/rc/rc_iface.c b/src/uct/ib/rc/rc_iface.c index 44061a916fd..62f7a1896c0 100644 --- a/src/uct/ib/rc/rc_iface.c +++ b/src/uct/ib/rc/rc_iface.c @@ -6,11 +6,11 @@ */ #include "rc_iface.h" +#include "rc_ep.h" #include #include -extern uct_tl_ops_t uct_rc_tl_ops; ucs_status_t uct_rc_iface_open(uct_context_h context, const char *hw_name, uct_iface_h *iface_p) @@ -23,13 +23,20 @@ ucs_status_t uct_rc_iface_open(uct_context_h context, const char *hw_name, return UCS_ERR_NO_MEMORY; } - iface->super.super.ops = &uct_rc_tl_ops; + iface->super.super.ops.iface_close = uct_rc_iface_close; + iface->super.super.ops.iface_get_address = uct_rc_iface_get_address; + iface->super.super.ops.iface_flush = uct_rc_iface_flush; + iface->super.super.ops.ep_get_address = uct_rc_ep_get_address; + iface->super.super.ops.ep_connect_to_iface = NULL; + iface->super.super.ops.ep_connect_to_ep = uct_rc_ep_connect_to_ep; + status = ucs_ib_iface_init(context, &iface->super, hw_name); if (status != UCS_OK) { goto err_free; } - ucs_debug("opened RC dev %s port %d", uct_ib_device_name(iface->super.device), + ucs_debug("opened RC dev %s port %d", + uct_ib_device_name(uct_ib_iface_device(&iface->super)), iface->super.port_num); *iface_p = &iface->super.super; @@ -40,8 +47,34 @@ ucs_status_t uct_rc_iface_open(uct_context_h context, const char *hw_name, return status; } -void uct_rc_iface_close(uct_iface_h iface) +void uct_rc_iface_close(uct_iface_h tl_iface) { + uct_rc_iface_t *iface = ucs_derived_of(tl_iface, uct_rc_iface_t); + + ucs_ib_iface_cleanup(&iface->super); ucs_free(iface); } +void uct_rc_iface_query(uct_rc_iface_t *iface, uct_iface_attr_t *iface_attr) +{ + iface_attr->max_short = 0; + iface_attr->max_bcopy = 0; + iface_attr->max_zcopy = 0; + iface_attr->iface_addr_len = sizeof(uct_ib_iface_addr_t); + iface_attr->ep_addr_len = sizeof(uct_rc_ep_addr_t); + iface_attr->flags = 0; +} + +ucs_status_t uct_rc_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t *iface_addr) +{ + uct_rc_iface_t *iface = ucs_derived_of(tl_iface, uct_rc_iface_t); + + *(uct_ib_iface_addr_t*)iface_addr = iface->super.addr; + return UCS_OK; +} + +ucs_status_t uct_rc_iface_flush(uct_iface_h tl_iface, uct_req_h *req_p, + uct_completion_cb_t cb) +{ + return UCS_OK; +} diff --git a/src/uct/ib/rc/rc_iface.h b/src/uct/ib/rc/rc_iface.h index bd772ab9600..7fe0e5952da 100644 --- a/src/uct/ib/rc/rc_iface.h +++ b/src/uct/ib/rc/rc_iface.h @@ -19,7 +19,16 @@ typedef struct uct_rc_iface { ucs_status_t uct_rc_iface_open(uct_context_h context, const char *hw_name, uct_iface_h *iface_p); -void uct_rc_iface_close(uct_iface_h iface); +void uct_rc_iface_close(uct_iface_h tl_iface); +void uct_rc_iface_query(uct_rc_iface_t *iface, uct_iface_attr_t *iface_attr); + +ucs_status_t uct_rc_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t *iface_addr); + +ucs_status_t uct_rc_iface_flush(uct_iface_h iface, uct_req_h *req_p, + uct_completion_cb_t cb); + +ucs_status_t uct_rc_iface_flush(uct_iface_h iface, uct_req_h *req_p, + uct_completion_cb_t cb); #endif diff --git a/src/uct/ib/rc/rc_mlx5.c b/src/uct/ib/rc/rc_mlx5.c new file mode 100644 index 00000000000..4c5765e9bb2 --- /dev/null +++ b/src/uct/ib/rc/rc_mlx5.c @@ -0,0 +1,187 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* +* $COPYRIGHT$ +* $HEADER$ +*/ + +#include "rc_iface.h" +#include "rc_mlx5.h" + +#include +#include +#include +#include +#include +#include +#include +#include /* For htonl */ + + +typedef struct { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_raddr_seg raddr; + struct mlx5_wqe_inl_data_seg inl; +} UCS_S_PACKED uct_ib_mlx5_wqe_rc_rdma_inl_seg_t; + + +static ucs_status_t uct_rc_mlx5_query_resources(uct_context_h context, + uct_resource_desc_t **resources_p, + unsigned *num_resources_p) +{ + /* TODO take transport overhead into account */ + return uct_ib_query_resources(context, UCT_IB_RESOURCE_FLAG_MLX5_PRM, + resources_p, num_resources_p); +} + +static ucs_status_t uct_rc_mlx5_ep_create(uct_iface_h tl_iface, uct_ep_h *ep_p) +{ + uct_ib_mlx5_qp_info_t qp_info; + uct_rc_mlx5_ep_t *ep; + ucs_status_t status; + + ep = ucs_malloc(sizeof(*ep), "rc mlx5 ep"); + if (ep == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err; + } + + ep->super.super.iface = tl_iface; + + status = uct_rc_ep_init(&ep->super); + if (status != UCS_OK) { + goto err; + } + + status = uct_ib_mlx5_get_qp_info(ep->super.qp, &qp_info); + if (status != UCS_OK) { + goto err_cleanup_rc_ep; + } + + if ((qp_info.bf.size == 0) || !ucs_is_pow2(qp_info.bf.size) || + (qp_info.sq.stride != MLX5_SEND_WQE_BB) || !ucs_is_pow2(qp_info.sq.wqe_cnt)) + { + ucs_error("mlx5 device parameters not suitable for transport"); + goto err_cleanup_rc_ep; + } + + ep->qpn_ds = htonl(ep->super.qp->qp_num << 8); + ep->tx.qstart = qp_info.sq.buf; + ep->tx.qend = qp_info.sq.buf + (MLX5_SEND_WQE_BB * qp_info.sq.wqe_cnt); + ep->tx.seg = ep->tx.qstart; + ep->tx.sw_pi = 0; + ep->tx.hw_ci = 0; + ep->tx.bf_reg = qp_info.bf.reg; + ep->tx.bf_size = qp_info.bf.size; + ep->tx.dbrec = &qp_info.dbrec[MLX5_SND_DBR]; + + memset(ep->tx.qstart, 0, ep->tx.qend - ep->tx.qstart); + + *ep_p = &ep->super.super; + return UCS_OK; + +err_cleanup_rc_ep: + uct_rc_ep_cleanup(&ep->super); +err: + return status; +} + +static void uct_rc_mlx5_ep_destroy(uct_ep_h tl_ep) +{ + uct_rc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_mlx5_ep_t); + + uct_rc_ep_cleanup(&ep->super); + ucs_free(ep); +} + +static ucs_status_t uct_rc_mlx5_ep_put_short(uct_ep_h tl_ep, void *buffer, + unsigned length, + uint64_t remote_addr, + uct_rkey_t rkey, uct_req_h *req_p, + uct_completion_cb_t cb) +{ + uct_rc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_mlx5_ep_t); + uct_ib_mlx5_wqe_rc_rdma_inl_seg_t *seg = ep->tx.seg; + unsigned wqe_size; + uint32_t sw_pi_16_n; + uint64_t *src, *dst; + unsigned i; + + sw_pi_16_n = htonl(ep->tx.sw_pi & 0xffff); + wqe_size = ((length + 15) / 16) + (sizeof(*seg) / 16); + ucs_assert(wqe_size < MLX5_SEND_WQE_BB); + + /* Build WQE */ + seg->ctrl.opmod_idx_opcode = (sw_pi_16_n >> 8) | (MLX5_OPCODE_RDMA_WRITE << 24); + seg->ctrl.qpn_ds = htonl(wqe_size) | ep->qpn_ds; + seg->raddr.raddr = htonll(remote_addr); + seg->raddr.rkey = (uint32_t)rkey; + + /* Data */ + UCS_STATIC_ASSERT(seg + 1 == ((void*)seg + 32 + 4)); + seg->inl.byte_count = htonl(length | MLX5_INLINE_SEG); + memcpy(seg + 1, buffer, length); + + /* Write doorbell record */ + ucs_compiler_fence(); + *ep->tx.dbrec = sw_pi_16_n; + + /* Make sure that doorbell record is written before ringing the doorbell */ + ucs_memory_bus_store_fence(); + + /* BF copy */ + dst = ep->tx.bf_reg; + src = (void*)seg; + for (i = 0; i < MLX5_SEND_WQE_BB / sizeof(*dst); ++i) { + *dst++ = *src++; + } + + ucs_memory_bus_store_fence(); + + /* Flip BF register */ + ep->tx.bf_reg = (void*) ((uintptr_t) ep->tx.bf_reg ^ ep->tx.bf_size); + ++ep->tx.sw_pi; + + /* Advance queue pointer */ + ep->tx.seg += MLX5_SEND_WQE_BB; + if (ep->tx.seg == ep->tx.qend) { + ep->tx.seg = ep->tx.qstart; + } + + return UCS_OK; +} + +static ucs_status_t uct_rc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) +{ + uct_rc_iface_t *iface = ucs_derived_of(tl_iface, uct_rc_iface_t); + + uct_rc_iface_query(iface, iface_attr); + iface_attr->max_short = MLX5_SEND_WQE_BB - sizeof(uct_ib_mlx5_wqe_rc_rdma_inl_seg_t); /* TODO */ + return UCS_OK; +} + +static ucs_status_t uct_rc_mlx5_iface_open(uct_context_h context, + const char *hw_name, + uct_iface_h *iface_p) +{ + ucs_status_t status; + uct_iface_h iface; + + status = uct_rc_iface_open(context, hw_name, &iface); + if (status != UCS_OK) { + return status; + } + + iface->ops.iface_query = uct_rc_mlx5_iface_query; + iface->ops.ep_put_short = uct_rc_mlx5_ep_put_short; + iface->ops.ep_create = uct_rc_mlx5_ep_create; + iface->ops.ep_destroy = uct_rc_mlx5_ep_destroy; + *iface_p = iface; + return UCS_OK; +} + +uct_tl_ops_t uct_rc_mlx5_tl_ops = { + .query_resources = uct_rc_mlx5_query_resources, + .iface_open = uct_rc_mlx5_iface_open, +}; + diff --git a/src/uct/ib/rc/rc_mlx5.h b/src/uct/ib/rc/rc_mlx5.h new file mode 100644 index 00000000000..78699c4457c --- /dev/null +++ b/src/uct/ib/rc/rc_mlx5.h @@ -0,0 +1,31 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* +* $COPYRIGHT$ +* $HEADER$ +*/ + +#ifndef UCT_RC_MLX5 +#define UCT_RC_MLX5 + +#include "rc_ep.h" + +typedef struct { + uct_rc_ep_t super; + unsigned qpn_ds; + + struct { + unsigned sw_pi; + unsigned hw_ci; + void *seg; + void *bf_reg; + unsigned long bf_size; + uint32_t *dbrec; + void *qstart; + void *qend; + } tx; +} uct_rc_mlx5_ep_t; + + + +#endif diff --git a/src/uct/ib/rc/rc_tl.c b/src/uct/ib/rc/rc_tl.c deleted file mode 100644 index 953d12a80e2..00000000000 --- a/src/uct/ib/rc/rc_tl.c +++ /dev/null @@ -1,27 +0,0 @@ -/** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. -* -* $COPYRIGHT$ -* $HEADER$ -*/ - -#include "rc_iface.h" - -#include -#include -#include - - -ucs_status_t uct_rc_query_resources(uct_context_h context, uct_resource_desc_t **resources_p, - unsigned *num_resources_p) -{ - /* TODO take transport overhead into account */ - return uct_ib_query_resources(context, 0, resources_p, num_resources_p); -} - -uct_tl_ops_t uct_rc_tl_ops = { - .query_resources = uct_rc_query_resources, - .iface_open = uct_rc_iface_open, - .iface_close = uct_rc_iface_close, -}; - diff --git a/src/uct/tl/context.c b/src/uct/tl/context.c index ce498bd17df..cb6c6b5770d 100644 --- a/src/uct/tl/context.c +++ b/src/uct/tl/context.c @@ -130,11 +130,7 @@ ucs_status_t uct_iface_open(uct_context_h context, const char *tl_name, } } - /* Invalid transport name */ - return UCS_ERR_INVALID_PARAM; + /* Non-existing transport */ + return UCS_ERR_NO_ELEM; } -void uct_iface_close(uct_iface_h iface) -{ - iface->ops->iface_close(iface); -} diff --git a/src/uct/tl/tl.c b/src/uct/tl/tl.c deleted file mode 100644 index f08617165c2..00000000000 --- a/src/uct/tl/tl.c +++ /dev/null @@ -1,9 +0,0 @@ -/** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. -* -* $COPYRIGHT$ -* $HEADER$ -*/ - -#include - diff --git a/test/gtest/uct/test_uct_context.cc b/test/gtest/uct/test_uct_context.cc index d586a621157..89c6da3446d 100644 --- a/test/gtest/uct/test_uct_context.cc +++ b/test/gtest/uct/test_uct_context.cc @@ -73,3 +73,122 @@ UCS_TEST_F(test_uct, open_iface) { uct_cleanup(ucth); } + +class entity { +public: + entity() { + ucs_status_t status; + + status = uct_init(&m_ucth); + ASSERT_UCS_OK(status); + + status = uct_iface_open(m_ucth, "rc_mlx5", "mlx5_0:1", &m_iface); + ASSERT_UCS_OK(status); + + status = uct_ep_create(m_iface, &m_ep); + ASSERT_UCS_OK(status); + } + + ~entity() { + uct_ep_destroy(m_ep); + uct_iface_close(m_iface); + uct_cleanup(m_ucth); + } + + void connect(const entity& other) { + ucs_status_t status; + + uct_iface_attr_t iface_attr; + status = uct_iface_query(other.m_iface, &iface_attr); + ASSERT_UCS_OK(status); + + uct_iface_addr_t *iface_addr = (uct_iface_addr_t*)malloc(iface_attr.iface_addr_len); + uct_ep_addr_t *ep_addr = (uct_ep_addr_t*)malloc(iface_attr.ep_addr_len); + + status = uct_iface_get_address(other.m_iface, iface_addr); + ASSERT_UCS_OK(status); + + status = uct_ep_get_address(other.m_ep, ep_addr); + ASSERT_UCS_OK(status); + + status = uct_ep_connect_to_ep(m_ep, iface_addr, ep_addr); + ASSERT_UCS_OK(status); + + free(ep_addr); + free(iface_addr); + } + + void mem_map(void *address, size_t length, uct_lkey_t *lkey_p, uct_rkey_t *rkey_p) { + ucs_status_t status; + void *rkey_buffer; + uct_pd_attr_t pd_attr; + + status = uct_pd_mem_map(m_iface->pd, address, length, lkey_p); + ASSERT_UCS_OK(status); + + status = uct_pd_query(m_iface->pd, &pd_attr); + ASSERT_UCS_OK(status); + + rkey_buffer = malloc(pd_attr.rkey_packed_size); + ASSERT_TRUE(rkey_buffer != NULL); + + status = uct_pd_rkey_pack(m_iface->pd, *lkey_p, rkey_buffer); + ASSERT_UCS_OK(status); + + status = uct_pd_rkey_unpack(m_iface->pd, rkey_buffer, rkey_p); + ASSERT_UCS_OK(status); + + free(rkey_buffer); + } + + void mem_unmap(uct_lkey_t lkey, uct_lkey_t rkey) { + ucs_status_t status; + uct_pd_rkey_release(m_iface->pd, rkey); + status = uct_pd_mem_unmap(m_iface->pd, lkey); + ASSERT_UCS_OK(status); + } + + void put8(uint64_t value, uint64_t address, uct_rkey_t rkey) { + ucs_status_t status; + status = uct_ep_put_short(m_ep, &value, sizeof(value), address, rkey, + NULL, NULL); + ASSERT_UCS_OK(status); + } + + void flush() { + ucs_status_t status; + status = uct_iface_flush(m_iface, NULL, NULL); + ASSERT_UCS_OK(status); + } + + uct_context_h m_ucth; + uct_iface_h m_iface; + uct_ep_h m_ep; + +}; + +UCS_TEST_F(test_uct, connect_ep) { + + const uint64_t magic = 0xdeadbeed1ee7a880; + entity e1, e2; + uct_lkey_t lkey; + uct_rkey_t rkey; + uint64_t val8; + + e2.mem_map(&val8, sizeof(val8), &lkey, &rkey); + + e1.connect(e2); + e2.connect(e1); + + val8 = 0; + e1.put8(magic, (uintptr_t)&val8, rkey); + + usleep(100000); + + e1.flush(); + + EXPECT_EQ(magic, val8); + + e2.mem_unmap(lkey, rkey); + +} diff --git a/test/perf/Makefile.am b/test/perf/Makefile.am new file mode 100644 index 00000000000..2c054c61dec --- /dev/null +++ b/test/perf/Makefile.am @@ -0,0 +1,14 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2001-2011. ALL RIGHTS RESERVED. +# +# $COPYRIGHT$ +# $HEADER$ +# + +bin_PROGRAMS = ucx_perftest + +ucx_perftest_SOURCES = perftest.c +ucx_perftest_LDADD = $(abs_top_builddir)/src/uct/libuct.la +ucx_perftest_CPPFLAGS = \ + -I$(abs_top_srcdir)/src \ + -I$(abs_top_builddir)/src diff --git a/test/perf/perftest.c b/test/perf/perftest.c new file mode 100644 index 00000000000..479b38038ce --- /dev/null +++ b/test/perf/perftest.c @@ -0,0 +1,243 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* +* $COPYRIGHT$ +* $HEADER$ +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +int sock_init(int argc, char **argv, int *my_rank) +{ + struct sockaddr_in inaddr; + struct hostent *he; + int sockfd; + int optval; + int ret; + + inaddr.sin_port = htons(12345); + memset(inaddr.sin_zero, 0, sizeof(inaddr.sin_zero)); + + sockfd = socket(AF_INET, SOCK_STREAM, 0); + if (sockfd < 0) { + fprintf(stderr, "socket() failed: %m\n"); + return sockfd; + } + + if (argc == 1) { + optval = 1; + ret = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)); + if (ret < 0) { + return ret; + } + + inaddr.sin_family = AF_INET; + inaddr.sin_addr.s_addr = INADDR_ANY; + ret = bind(sockfd, (struct sockaddr*)&inaddr, sizeof(inaddr)); + if (ret < 0) { + fprintf(stderr, "bind() failed: %m\n"); + return ret; + } + + ret = listen(sockfd, 100); + if (ret < 0) { + fprintf(stderr, "listen() failed: %m\n"); + return ret; + } + + *my_rank = 0; + printf("Waiting for connection...\n"); + return accept(sockfd, NULL, NULL); + } else { + he = gethostbyname(argv[1]); + if (he == NULL || he->h_addr_list == NULL) { + fprintf(stderr, "host %s not found: %s\n", argv[1], hstrerror(h_errno)); + return -1; + } + + inaddr.sin_family = he->h_addrtype; + memcpy(&inaddr.sin_addr, he->h_addr_list[0], he->h_length); + + ret = connect(sockfd, (struct sockaddr*)&inaddr, sizeof(inaddr)); + if (ret < 0) { + fprintf(stderr, "connect() failed: %m\n"); + return -1; + } + + *my_rank = 1; + return sockfd; + } +} + +ssize_t xchg(int sockfd, void *ptr, size_t length) +{ + if (send(sockfd, ptr, length, 0) != length) { + fprintf(stderr, "send() failed: %m\n"); + return -1; + } + + if (recv(sockfd, ptr, length, 0) != length) { + fprintf(stderr, "recv() failed: %m\n"); + return -1; + } + + return length; +} + +int main(int argc, char **argv) +{ + static const uint64_t count = 1000000ul; + static volatile uint64_t shared_val8; + unsigned long vaddr; + uct_rkey_t rkey; + void *rkey_buffer; + uct_lkey_t lkey; + uct_iface_addr_t *iface_addr; + uct_ep_addr_t *ep_addr; + ucs_status_t status; + uct_context_h context; + uct_iface_h iface; + uct_iface_attr_t iface_attr; + uct_pd_attr_t pd_attr; + uct_ep_h ep; + int my_rank; + int sockfd; + uint64_t value; + struct timeval start, end; + double lat; + + status = uct_init(&context); + if (status != UCS_OK) { + fprintf(stderr, "Initialization failed\n"); + return -1; + } + + status = uct_iface_open(context, "rc_mlx5", "mlx5_0:1", &iface); + if (status != UCS_OK) { + fprintf(stderr, "Failed to open interface\n"); + return -1; + } + + status = uct_iface_query(iface, &iface_attr); + if (status != UCS_OK) { + fprintf(stderr, "Failed to query interface\n"); + return -1; + } + + status = uct_ep_create(iface, &ep); + if (status != UCS_OK) { + fprintf(stderr, "Failed to create endpoint\n"); + return -1; + } + + status = uct_pd_query(iface->pd, &pd_attr); + if (status != UCS_OK) { + fprintf(stderr, "Failed to query pd\n"); + return -1; + } + + shared_val8 = -1; + status = uct_pd_mem_map(iface->pd, (void*)&shared_val8, sizeof(shared_val8), &lkey); + if (status != UCS_OK) { + fprintf(stderr, "Failed to register\n"); + return -1; + } + + iface_addr = malloc(iface_attr.iface_addr_len); + ep_addr = malloc(iface_attr.ep_addr_len); + + uct_iface_get_address(iface, iface_addr); + uct_ep_get_address(ep, ep_addr); + + sockfd = sock_init(argc, argv, &my_rank); + if (sockfd < 0) { + return -1; + } + + xchg(sockfd, iface_addr, iface_attr.iface_addr_len); + xchg(sockfd, ep_addr, iface_attr.ep_addr_len); + + status = uct_ep_connect_to_ep(ep, iface_addr, ep_addr); + if (status != UCS_OK) { + fprintf(stderr, "Failed to connect to ep\n"); + return -1; + } + + vaddr = (uintptr_t)&shared_val8; + + rkey_buffer = malloc(pd_attr.rkey_packed_size); + status = uct_pd_rkey_pack(iface->pd, lkey, rkey_buffer); + if (status != UCS_OK) { + fprintf(stderr, "Failed to pack rkey\n"); + return -1; + } + + xchg(sockfd, rkey_buffer, pd_attr.rkey_packed_size); + status = uct_pd_rkey_unpack(iface->pd, rkey_buffer, &rkey); + if (status != UCS_OK) { + fprintf(stderr, "Failed to unpack rkey\n"); + return -1; + } + + shared_val8 = -1; + + xchg(sockfd, &vaddr, sizeof(vaddr)); + + free(rkey_buffer); + free(iface_addr); + free(ep_addr); + close(sockfd); + + printf("Starting test...\n"); + + value = 0; + + if (my_rank == 0) { + for (value = 0; value < 100; ++value) { + uct_ep_put_short(ep, &value, sizeof(value), vaddr, rkey, NULL, NULL); + while (shared_val8 != value); + } + } else if (my_rank == 1) { + for (value = 0; value < 100; ++value) { + while (shared_val8 != value); + uct_ep_put_short(ep, &value, sizeof(value), vaddr, rkey, NULL, NULL); + } + } + + gettimeofday(&start, NULL); + + if (my_rank == 0) { + for (value = 0; value < count; ++value) { + uct_ep_put_short(ep, &value, sizeof(value), vaddr, rkey, NULL, NULL); + while (shared_val8 != value); + } + } else if (my_rank == 1) { + for (value = 0; value < count; ++value) { + while (shared_val8 != value); + uct_ep_put_short(ep, &value, sizeof(value), vaddr, rkey, NULL, NULL); + } + } + + gettimeofday(&end, NULL); + + lat = ((end.tv_sec - start.tv_sec) * 1e6 + (end.tv_usec - start.tv_usec)) / count / 2; + + printf("Test done latency=%.3f usec\n", lat); + + uct_pd_rkey_release(iface->pd, rkey); + uct_pd_mem_unmap(iface->pd, lkey); + uct_ep_destroy(ep); + uct_iface_close(iface); + uct_cleanup(context); + return 0; +} + From ee7b76f51bef373e1639274c5715be6069d26d78 Mon Sep 17 00:00:00 2001 From: yosefe Date: Thu, 6 Nov 2014 18:01:31 +0200 Subject: [PATCH 2/4] IB/DOC: Code review fixes * Add destructor error handling convention to CodeStyle * Change return value of uct_ib_device_port_check() to ucs_status_t --- doc/CodeStyle | 4 ++++ src/uct/ib/base/ib_context.c | 4 ++-- src/uct/ib/base/ib_device.c | 15 ++++++++------- src/uct/ib/base/ib_device.h | 3 ++- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/doc/CodeStyle b/doc/CodeStyle index 39f353b942e..6d2f7fec754 100644 --- a/doc/CodeStyle +++ b/doc/CodeStyle @@ -33,5 +33,9 @@ - the function which prints the log message is the first one which decides which error it is. If a functions returns an error because it's callee returned erroneous ucs_status_t, it does not have to print a log message. + - destructors are not able to propagate error code to the caller because they + return void. also, users are not ready to handle errors during cleanup flow. + therefore a destructor should handle an error by printing a warning or an + error message. * Logging \ No newline at end of file diff --git a/src/uct/ib/base/ib_context.c b/src/uct/ib/base/ib_context.c index 50610261016..50c3b86f3ea 100644 --- a/src/uct/ib/base/ib_context.c +++ b/src/uct/ib/base/ib_context.c @@ -33,7 +33,7 @@ ucs_status_t uct_ib_query_resources(uct_context_h context, unsigned flags, for (port_num = dev->first_port; port_num < dev->first_port + dev->num_ports; ++port_num) { - if (uct_ib_device_port_check(dev, port_num, flags)) { + if (uct_ib_device_port_check(dev, port_num, flags) == UCS_OK) { ++num_resources; } } @@ -53,7 +53,7 @@ ucs_status_t uct_ib_query_resources(uct_context_h context, unsigned flags, for (port_num = dev->first_port; port_num < dev->first_port + dev->num_ports; ++port_num) { - if (uct_ib_device_port_check(dev, port_num, flags)) { + if (uct_ib_device_port_check(dev, port_num, flags) == UCS_OK) { rsc = &resources[resource_index++]; status = uct_ib_device_port_get_resource(dev, port_num, rsc); if (status != UCS_OK) { diff --git a/src/uct/ib/base/ib_device.c b/src/uct/ib/base/ib_device.c index 7ae655ce3fe..18486a850e1 100644 --- a/src/uct/ib/base/ib_device.c +++ b/src/uct/ib/base/ib_device.c @@ -216,34 +216,35 @@ void uct_ib_device_destroy(uct_ib_device_t *dev) ucs_free(dev); } -int uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num, unsigned flags) +ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num, + unsigned flags) { if (port_num < dev->first_port || port_num >= dev->first_port + dev->num_ports) { - return 0; + return UCS_ERR_NO_DEVICE; } if (uct_ib_device_port_attr(dev, port_num)->state != IBV_PORT_ACTIVE) { - return 0; + return UCS_ERR_UNREACHABLE; } if (flags & UCT_IB_RESOURCE_FLAG_DC) { if (!IBV_DEVICE_HAS_DC(&dev->dev_attr)) { - return 0; + return UCS_ERR_UNSUPPORTED; } } if (flags & UCT_IB_RESOURCE_FLAG_MLX4_PRM) { - return 0; /* Unsupported yet */ + return UCS_ERR_UNSUPPORTED; /* Unsupported yet */ } if (flags & UCT_IB_RESOURCE_FLAG_MLX5_PRM) { /* TODO list all devices with their flags */ if (dev->dev_attr.vendor_id != 0x02c9 || dev->dev_attr.vendor_part_id != 4113) { - return 0; + return UCS_ERR_UNSUPPORTED; } } - return 1; + return UCS_OK; } const char *uct_ib_device_name(uct_ib_device_t *dev) diff --git a/src/uct/ib/base/ib_device.h b/src/uct/ib/base/ib_device.h index a76d076ee36..8dc5ccd03fe 100644 --- a/src/uct/ib/base/ib_device.h +++ b/src/uct/ib/base/ib_device.h @@ -31,7 +31,8 @@ ucs_status_t uct_ib_device_create(struct ibv_device *ibv_device, uct_ib_device_t void uct_ib_device_destroy(uct_ib_device_t *dev); -int uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num, unsigned flags); +ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num, + unsigned flags); ucs_status_t uct_ib_device_port_get_resource(uct_ib_device_t *dev, uint8_t port_num, uct_resource_desc_t *resource); From 6e2087175bc0dc11b9cc6ba3dd358ac3d8e7d822 Mon Sep 17 00:00:00 2001 From: yosefe Date: Sun, 9 Nov 2014 10:21:03 +0200 Subject: [PATCH 3/4] API/IB: Return rkey_bundle_t from rkey_unpack(), which has rkey type. --- src/uct/api/tl.h | 29 +++++++++++++++---- src/uct/api/uct.h | 46 ++++++++++++++++++++---------- src/uct/api/uct_def.h | 1 + src/uct/ib/base/ib_device.c | 26 +++++++++++------ src/uct/ib/base/ib_device.h | 3 ++ src/uct/ib/rc/rc_mlx5.c | 1 + src/uct/tl/context.c | 22 ++++++++++++++ test/gtest/uct/test_uct_context.cc | 23 ++++++++------- test/perf/perftest.c | 20 ++++++------- 9 files changed, 121 insertions(+), 50 deletions(-) diff --git a/src/uct/api/tl.h b/src/uct/api/tl.h index d8f9ce9efa6..7c4b9a264f5 100644 --- a/src/uct/api/tl.h +++ b/src/uct/api/tl.h @@ -48,6 +48,11 @@ struct uct_ep_addr { typedef void (*uct_completion_cb_t)(uct_req_h req, ucs_status_t status); +/** + * Remote key release function. + */ +typedef void (*uct_rkey_release_func_t)(uct_context_h context, uct_rkey_t rkey); + /** * Interface attributes: capabilities and limitations. */ @@ -69,6 +74,15 @@ typedef struct uct_pd_attr { } uct_pd_attr_t; +/** + * Remote key with its type + */ +typedef struct uct_rkey_bundle { + uct_rkey_t rkey; /**< Remote key descriptor, passed to RMA functions */ + void *type; /**< Remote key type */ +} uct_rkey_bundle_t; + + /** * Transport "global" operations */ @@ -80,6 +94,10 @@ typedef struct uct_tl_ops { ucs_status_t (*iface_open)(uct_context_h context, const char *hw_name, uct_iface_h *iface_p); + + ucs_status_t (*rkey_unpack)(uct_context_h context, void *rkey_buffer, + uct_rkey_bundle_t *rkey_ob); + } uct_tl_ops_t; @@ -89,18 +107,17 @@ typedef struct uct_tl_ops { typedef struct uct_pd_ops { ucs_status_t (*query)(uct_pd_h pd, uct_pd_attr_t *pd_attr); + /* TODO + * - support "mem attach", MPI-3 style, e.g by passing rkey + * - support allocation, e.g by returning an address + */ ucs_status_t (*mem_map)(uct_pd_h pd, void *address, size_t length, - uct_lkey_t *lkey_p); + unsigned flags, uct_lkey_t *lkey_p); ucs_status_t (*mem_unmap)(uct_pd_h pd, uct_lkey_t lkey); - /* TODO support "mem attach", MPI-3 style */ - ucs_status_t (*rkey_pack)(uct_pd_h pd, uct_lkey_t lkey, void *rkey_buffer); - ucs_status_t (*rkey_unpack)(uct_pd_h pd, void *rkey_buffer, uct_rkey_t *rkey_p); - - void (*rkey_release)(uct_pd_h pd, uct_rkey_t rkey); } uct_pd_ops_t; diff --git a/src/uct/api/uct.h b/src/uct/api/uct.h index a9f71db10b7..c5a4f623f0b 100644 --- a/src/uct/api/uct.h +++ b/src/uct/api/uct.h @@ -72,37 +72,53 @@ ucs_status_t uct_iface_open(uct_context_h context, const char *tl_name, const char *hw_name, uct_iface_h *iface_p); +/** + * @ingroup CONTEXT + * + * @brief Unpack a remote key. + * + * @param [in] context Handle to context. + * @param [in] rkey_buffer Packet remote key buffer. + * @param [out] rkey_ob Filled with the unpacked remote key and its type. + * + * @return Error code. + */ +ucs_status_t uct_rkey_unpack(uct_context_h context, void *rkey_buffer, + uct_rkey_bundle_t *rkey_ob); + + +/** + * @ingroup CONTEXT + * + * @brief Unpack a remote key. + * + * @param [in] context Handle to context. + * @param [in] rkey_ob Remote key to release. + */ +void uct_rkey_release(uct_context_h context, uct_rkey_bundle_t *rkey_ob); + + static inline ucs_status_t uct_pd_query(uct_pd_h pd, uct_pd_attr_t *pd_attr) { return pd->ops->query(pd, pd_attr); } -static inline ucs_status_t uct_pd_mem_map(uct_pd_h pd, void *address, - size_t length, uct_lkey_t *lkey_p) +static inline ucs_status_t uct_mem_map(uct_pd_h pd, void *address, size_t length, + unsigned flags, uct_lkey_t *lkey_p) { - return pd->ops->mem_map(pd, address, length, lkey_p); + return pd->ops->mem_map(pd, address, length, flags, lkey_p); } -static inline ucs_status_t uct_pd_mem_unmap(uct_pd_h pd, uct_lkey_t lkey) +static inline ucs_status_t uct_mem_unmap(uct_pd_h pd, uct_lkey_t lkey) { return pd->ops->mem_unmap(pd, lkey); } -static inline ucs_status_t uct_pd_rkey_pack(uct_pd_h pd, uct_lkey_t lkey, void *rkey_buffer) +static inline ucs_status_t uct_rkey_pack(uct_pd_h pd, uct_lkey_t lkey, void *rkey_buffer) { return pd->ops->rkey_pack(pd, lkey, rkey_buffer); } -static inline ucs_status_t uct_pd_rkey_unpack(uct_pd_h pd, void *rkey_buffer, uct_rkey_t *rkey_p) -{ - return pd->ops->rkey_unpack(pd, rkey_buffer, rkey_p); -} - -static inline void uct_pd_rkey_release(uct_pd_h pd, uct_rkey_t rkey) -{ - pd->ops->rkey_release(pd, rkey); -} - static inline ucs_status_t uct_iface_query(uct_iface_h iface, uct_iface_attr_t *iface_attr) { diff --git a/src/uct/api/uct_def.h b/src/uct/api/uct_def.h index 17c0497892c..d33dd3fd72a 100644 --- a/src/uct/api/uct_def.h +++ b/src/uct/api/uct_def.h @@ -22,5 +22,6 @@ typedef uintptr_t uct_rkey_t; typedef struct uct_req *uct_req_h; typedef struct uct_memory_region *uct_memory_region_h; typedef struct uct_pd *uct_pd_h; +typedef void *uct_rkey_ctx_h; #endif diff --git a/src/uct/ib/base/ib_device.c b/src/uct/ib/base/ib_device.c index 18486a850e1..b6155293fb3 100644 --- a/src/uct/ib/base/ib_device.c +++ b/src/uct/ib/base/ib_device.c @@ -14,6 +14,7 @@ #include #include +#define UCT_IB_RKEY_MAGIC 0x69626962 /* ibib *(const uint32_t*)"ibib" */ static void uct_ib_device_get_affinity(const char *dev_name, cpu_set_t *cpu_mask) { @@ -54,12 +55,12 @@ static void uct_ib_device_get_affinity(const char *dev_name, cpu_set_t *cpu_mask static ucs_status_t uct_ib_pd_query(uct_pd_h pd, uct_pd_attr_t *pd_attr) { - pd_attr->rkey_packed_size = sizeof(uint32_t); + pd_attr->rkey_packed_size = sizeof(uint32_t) * 2; return UCS_OK; } static ucs_status_t uct_ib_mem_map(uct_pd_h pd, void *address, size_t length, - uct_lkey_t *lkey_p) + unsigned flags, uct_lkey_t *lkey_p) { uct_ib_device_t *dev = ucs_derived_of(pd, uct_ib_device_t); struct ibv_mr *mr; @@ -96,15 +97,26 @@ static ucs_status_t uct_ib_rkey_pack(uct_pd_h pd, uct_lkey_t lkey, void *rkey_buffer) { struct ibv_mr *mr = (void*)lkey; + uint32_t *ptr = rkey_buffer; - *(uint32_t*)rkey_buffer = htonl(mr->rkey); /* Use r-keys as big endian */ + *(ptr++) = UCT_IB_RKEY_MAGIC; + *(ptr++) = htonl(mr->rkey); /* Use r-keys as big endian */ return UCS_OK; } -static ucs_status_t uct_ib_rkey_unpack(uct_pd_h pd, void *rkey_buffer, - uct_rkey_t *rkey_p) +ucs_status_t uct_ib_rkey_unpack(uct_context_h context, void *rkey_buffer, + uct_rkey_bundle_t *rkey_ob) { - *rkey_p = *(uint32_t*)rkey_buffer; + uint32_t *ptr = rkey_buffer; + uint32_t magic; + + magic = *(ptr++); + if (magic != UCT_IB_RKEY_MAGIC) { + return UCS_ERR_UNSUPPORTED; + } + + rkey_ob->rkey = *(ptr++); + rkey_ob->type = (void*)ucs_empty_function; return UCS_OK; } @@ -113,8 +125,6 @@ uct_pd_ops_t uct_ib_pd_ops = { .mem_map = uct_ib_mem_map, .mem_unmap = uct_ib_mem_unmap, .rkey_pack = uct_ib_rkey_pack, - .rkey_unpack = uct_ib_rkey_unpack, - .rkey_release = (void*)ucs_empty_function }; ucs_status_t uct_ib_device_create(struct ibv_device *ibv_device, uct_ib_device_t **dev_p) diff --git a/src/uct/ib/base/ib_device.h b/src/uct/ib/base/ib_device.h index 8dc5ccd03fe..d83e5397ffe 100644 --- a/src/uct/ib/base/ib_device.h +++ b/src/uct/ib/base/ib_device.h @@ -39,6 +39,9 @@ ucs_status_t uct_ib_device_port_get_resource(uct_ib_device_t *dev, uint8_t port_ const char *uct_ib_device_name(uct_ib_device_t *dev); +ucs_status_t uct_ib_rkey_unpack(uct_context_h context, void *rkey_buffer, + uct_rkey_bundle_t *rkey_ob); + static inline struct ibv_exp_port_attr* uct_ib_device_port_attr(uct_ib_device_t *dev, uint8_t port_num) { return &dev->port_attr[port_num - dev->first_port]; diff --git a/src/uct/ib/rc/rc_mlx5.c b/src/uct/ib/rc/rc_mlx5.c index 4c5765e9bb2..d2623aa9881 100644 --- a/src/uct/ib/rc/rc_mlx5.c +++ b/src/uct/ib/rc/rc_mlx5.c @@ -183,5 +183,6 @@ static ucs_status_t uct_rc_mlx5_iface_open(uct_context_h context, uct_tl_ops_t uct_rc_mlx5_tl_ops = { .query_resources = uct_rc_mlx5_query_resources, .iface_open = uct_rc_mlx5_iface_open, + .rkey_unpack = uct_ib_rkey_unpack, }; diff --git a/src/uct/tl/context.c b/src/uct/tl/context.c index cb6c6b5770d..ae164f55ea3 100644 --- a/src/uct/tl/context.c +++ b/src/uct/tl/context.c @@ -134,3 +134,25 @@ ucs_status_t uct_iface_open(uct_context_h context, const char *tl_name, return UCS_ERR_NO_ELEM; } +ucs_status_t uct_rkey_unpack(uct_context_h context, void *rkey_buffer, + uct_rkey_bundle_t *rkey_ob) +{ + uct_context_tl_info_t *tl; + ucs_status_t status; + + for (tl = context->tls; tl < context->tls + context->num_tls; ++tl) { + status = tl->ops->rkey_unpack(context, rkey_buffer, rkey_ob); + if (status != UCS_ERR_UNSUPPORTED) { + return status; + } + } + + return UCS_ERR_INVALID_PARAM; +} + +void uct_rkey_release(uct_context_h context, uct_rkey_bundle_t *rkey_ob) +{ + uct_rkey_release_func_t release = rkey_ob->type; + release(context, rkey_ob->rkey); +} + diff --git a/test/gtest/uct/test_uct_context.cc b/test/gtest/uct/test_uct_context.cc index 89c6da3446d..c849a4ec236 100644 --- a/test/gtest/uct/test_uct_context.cc +++ b/test/gtest/uct/test_uct_context.cc @@ -118,12 +118,13 @@ class entity { free(iface_addr); } - void mem_map(void *address, size_t length, uct_lkey_t *lkey_p, uct_rkey_t *rkey_p) { + void mem_map(void *address, size_t length, uct_lkey_t *lkey_p, + uct_rkey_bundle_t *rkey_p) { ucs_status_t status; void *rkey_buffer; uct_pd_attr_t pd_attr; - status = uct_pd_mem_map(m_iface->pd, address, length, lkey_p); + status = uct_mem_map(m_iface->pd, address, length, 0, lkey_p); ASSERT_UCS_OK(status); status = uct_pd_query(m_iface->pd, &pd_attr); @@ -132,19 +133,19 @@ class entity { rkey_buffer = malloc(pd_attr.rkey_packed_size); ASSERT_TRUE(rkey_buffer != NULL); - status = uct_pd_rkey_pack(m_iface->pd, *lkey_p, rkey_buffer); + status = uct_rkey_pack(m_iface->pd, *lkey_p, rkey_buffer); ASSERT_UCS_OK(status); - status = uct_pd_rkey_unpack(m_iface->pd, rkey_buffer, rkey_p); + status = uct_rkey_unpack(m_ucth, rkey_buffer, rkey_p); ASSERT_UCS_OK(status); free(rkey_buffer); } - void mem_unmap(uct_lkey_t lkey, uct_lkey_t rkey) { + void mem_unmap(uct_lkey_t lkey, uct_rkey_bundle_t *rkey) { ucs_status_t status; - uct_pd_rkey_release(m_iface->pd, rkey); - status = uct_pd_mem_unmap(m_iface->pd, lkey); + uct_rkey_release(m_ucth, rkey); + status = uct_mem_unmap(m_iface->pd, lkey); ASSERT_UCS_OK(status); } @@ -170,9 +171,9 @@ class entity { UCS_TEST_F(test_uct, connect_ep) { const uint64_t magic = 0xdeadbeed1ee7a880; - entity e1, e2; + uct_rkey_bundle_t rkey; uct_lkey_t lkey; - uct_rkey_t rkey; + entity e1, e2; uint64_t val8; e2.mem_map(&val8, sizeof(val8), &lkey, &rkey); @@ -181,7 +182,7 @@ UCS_TEST_F(test_uct, connect_ep) { e2.connect(e1); val8 = 0; - e1.put8(magic, (uintptr_t)&val8, rkey); + e1.put8(magic, (uintptr_t)&val8, rkey.rkey); usleep(100000); @@ -189,6 +190,6 @@ UCS_TEST_F(test_uct, connect_ep) { EXPECT_EQ(magic, val8); - e2.mem_unmap(lkey, rkey); + e2.mem_unmap(lkey, &rkey); } diff --git a/test/perf/perftest.c b/test/perf/perftest.c index 479b38038ce..5345341d5a4 100644 --- a/test/perf/perftest.c +++ b/test/perf/perftest.c @@ -98,7 +98,7 @@ int main(int argc, char **argv) static const uint64_t count = 1000000ul; static volatile uint64_t shared_val8; unsigned long vaddr; - uct_rkey_t rkey; + uct_rkey_bundle_t rkey; void *rkey_buffer; uct_lkey_t lkey; uct_iface_addr_t *iface_addr; @@ -146,7 +146,7 @@ int main(int argc, char **argv) } shared_val8 = -1; - status = uct_pd_mem_map(iface->pd, (void*)&shared_val8, sizeof(shared_val8), &lkey); + status = uct_mem_map(iface->pd, (void*)&shared_val8, sizeof(shared_val8), 0, &lkey); if (status != UCS_OK) { fprintf(stderr, "Failed to register\n"); return -1; @@ -175,14 +175,14 @@ int main(int argc, char **argv) vaddr = (uintptr_t)&shared_val8; rkey_buffer = malloc(pd_attr.rkey_packed_size); - status = uct_pd_rkey_pack(iface->pd, lkey, rkey_buffer); + status = uct_rkey_pack(iface->pd, lkey, rkey_buffer); if (status != UCS_OK) { fprintf(stderr, "Failed to pack rkey\n"); return -1; } xchg(sockfd, rkey_buffer, pd_attr.rkey_packed_size); - status = uct_pd_rkey_unpack(iface->pd, rkey_buffer, &rkey); + status = uct_rkey_unpack(context, rkey_buffer, &rkey); if (status != UCS_OK) { fprintf(stderr, "Failed to unpack rkey\n"); return -1; @@ -203,13 +203,13 @@ int main(int argc, char **argv) if (my_rank == 0) { for (value = 0; value < 100; ++value) { - uct_ep_put_short(ep, &value, sizeof(value), vaddr, rkey, NULL, NULL); + uct_ep_put_short(ep, &value, sizeof(value), vaddr, rkey.rkey, NULL, NULL); while (shared_val8 != value); } } else if (my_rank == 1) { for (value = 0; value < 100; ++value) { while (shared_val8 != value); - uct_ep_put_short(ep, &value, sizeof(value), vaddr, rkey, NULL, NULL); + uct_ep_put_short(ep, &value, sizeof(value), vaddr, rkey.rkey, NULL, NULL); } } @@ -217,13 +217,13 @@ int main(int argc, char **argv) if (my_rank == 0) { for (value = 0; value < count; ++value) { - uct_ep_put_short(ep, &value, sizeof(value), vaddr, rkey, NULL, NULL); + uct_ep_put_short(ep, &value, sizeof(value), vaddr, rkey.rkey, NULL, NULL); while (shared_val8 != value); } } else if (my_rank == 1) { for (value = 0; value < count; ++value) { while (shared_val8 != value); - uct_ep_put_short(ep, &value, sizeof(value), vaddr, rkey, NULL, NULL); + uct_ep_put_short(ep, &value, sizeof(value), vaddr, rkey.rkey, NULL, NULL); } } @@ -233,8 +233,8 @@ int main(int argc, char **argv) printf("Test done latency=%.3f usec\n", lat); - uct_pd_rkey_release(iface->pd, rkey); - uct_pd_mem_unmap(iface->pd, lkey); + uct_rkey_release(context, &rkey); + uct_mem_unmap(iface->pd, lkey); uct_ep_destroy(ep); uct_iface_close(iface); uct_cleanup(context); From 1d57bc69f20ac3fa274230dc2bffc8ae743a87b8 Mon Sep 17 00:00:00 2001 From: yosefe Date: Sun, 9 Nov 2014 15:53:40 +0200 Subject: [PATCH 4/4] PERF/RC: Fix coverity errors. --- .gitignore | 1 + src/uct/ib/rc/rc_mlx5.c | 4 +++- test/perf/perftest.c | 9 ++++++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 352d1a71012..bf36f912f35 100644 --- a/.gitignore +++ b/.gitignore @@ -37,4 +37,5 @@ depcomp *.o src/ucs/ucs_stats_parser test/gtest/gtest +test/perf/ucx_perftest build-* diff --git a/src/uct/ib/rc/rc_mlx5.c b/src/uct/ib/rc/rc_mlx5.c index d2623aa9881..e6fd656cd20 100644 --- a/src/uct/ib/rc/rc_mlx5.c +++ b/src/uct/ib/rc/rc_mlx5.c @@ -50,7 +50,7 @@ static ucs_status_t uct_rc_mlx5_ep_create(uct_iface_h tl_iface, uct_ep_h *ep_p) status = uct_rc_ep_init(&ep->super); if (status != UCS_OK) { - goto err; + goto err_free; } status = uct_ib_mlx5_get_qp_info(ep->super.qp, &qp_info); @@ -82,6 +82,8 @@ static ucs_status_t uct_rc_mlx5_ep_create(uct_iface_h tl_iface, uct_ep_h *ep_p) err_cleanup_rc_ep: uct_rc_ep_cleanup(&ep->super); +err_free: + ucs_free(ep); err: return status; } diff --git a/test/perf/perftest.c b/test/perf/perftest.c index 5345341d5a4..47e9f6a639f 100644 --- a/test/perf/perftest.c +++ b/test/perf/perftest.c @@ -37,6 +37,7 @@ int sock_init(int argc, char **argv, int *my_rank) optval = 1; ret = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)); if (ret < 0) { + close(sockfd); return ret; } @@ -45,22 +46,27 @@ int sock_init(int argc, char **argv, int *my_rank) ret = bind(sockfd, (struct sockaddr*)&inaddr, sizeof(inaddr)); if (ret < 0) { fprintf(stderr, "bind() failed: %m\n"); + close(sockfd); return ret; } ret = listen(sockfd, 100); if (ret < 0) { fprintf(stderr, "listen() failed: %m\n"); + close(sockfd); return ret; } *my_rank = 0; printf("Waiting for connection...\n"); - return accept(sockfd, NULL, NULL); + ret = accept(sockfd, NULL, NULL); + close(sockfd); + return ret; } else { he = gethostbyname(argv[1]); if (he == NULL || he->h_addr_list == NULL) { fprintf(stderr, "host %s not found: %s\n", argv[1], hstrerror(h_errno)); + close(sockfd); return -1; } @@ -70,6 +76,7 @@ int sock_init(int argc, char **argv, int *my_rank) ret = connect(sockfd, (struct sockaddr*)&inaddr, sizeof(inaddr)); if (ret < 0) { fprintf(stderr, "connect() failed: %m\n"); + close(sockfd); return -1; }