diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index ba1209350c7..00000000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,14 +0,0 @@ -build: - only: - - tags - script: - - "./autogen.sh" - - "./contrib/configure-release --disable-numa" - - make -j - - make dist - - 'export upload_url=$(curl -s -H "Authorization: token $github_token" "https://api.github.com/repos/openucx/ucx/releases" | python -c "import sys,os,json; d=json.load(sys.stdin); tag=os.environ.get(\"CI_COMMIT_TAG\"); rel = [r for r in d if r[\"tag_name\"] == tag]; url = rel[0][\"upload_url\"] if rel else \"\"; print url" | grep -oP "https\S+assets")' - - echo $upload_url - - 'export tar_name=$(ls *.tar.gz)' - - echo $tar_name - - 'curl -s -H "Authorization: token $github_token" -H "Content-Type: application/zip" --data-binary @"$tar_name" "${upload_url}?name=${tar_name}&label=${tar_name}"' - diff --git a/AUTHORS b/AUTHORS index 77d0527611c..674d95e769c 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,39 +1,71 @@ -Akshay Venkatesh -Alexander Margolin -Alexander Mikheev +Akshay Venkatesh +Alex Margolin +Alex Mikheev Alina Sklarevich -Andrey Maslennikov -Artem Polyakov -Artemy Kovalyov +Andrey Maslennikov +Artem Polyakov +Artemy Kovalyov Aurelien Bouteiller -Devendar Bureddy +Bin Lei +Brad Benton +Corey J. Nolet +Devendar Bureddy +Dmitry Gladkov +Doug Jacobsen Elad Persiko Eugene Voronov Evgeny Leksikov -Gilles Gouaillardet +Gilbert Lee +Gilles Gouaillardet Graham Lopez Guy Shattah +Hiroyuki Sato Howard Pritchard Igor Ivanov -Ilya Nelkenbaum -Jeff Daily -Khaled Hamidouche -Manjunath Gorentla Venkata +Ilya Nelkenbaum +Jakir Kham +Jason Gunthorpe +Jeff Daily +John Snyder +Keisuke Fukuda +Ken Raffenetti +Khaled Hamidouche +Konstantin Belousov +Luis E. 
Pena +Manjunath Gorentla Venkata +Marek Schimara Matthew Baker Mike Dubman -Mikhail Brinskii +Mikhail Brinskiy Nathan Hjelm Netanel Yosephian -Pavel Shamis +Olly Perks +Pak Lui +Pavan Balaji +Pavel Shamis (Pasha) +Peter Andreas Entschev +Peter Rudenko +Qiang Yu +Rohit Zambre Sasha Kotchubievsky -Sergey Oblomov -Sergey Shalnov -Serguei Sagalovitch +Scott Saulters +Sergey Lebedev +Sergey Oblomov +Sergey Shalnov +Serguei Sagalovitch +Sheng Yang +Shuki Zanyovka +Sourav Chakraborty +Srinivasan Subramanian Stephen Richmond Swen Boehm Tony Curtis +Valentin Petrov +Wenbin Lu Xin Zhao Yossi Itigin +Yuriy Shestakov +Zhu Yanjun In addition we would like to acknowledge the following members of UCX community for their participation in annual face-to-face meeting, design discussions, and diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index 140810400e0..00000000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,20 +0,0 @@ - -1. Please sign the [UCX contributors agreement](http://www.openucx.org/license). - -1. Please follow the [code style](https://github.com/openucx/ucx/blob/master/doc/CodeStyle.md) and [logging style](https://github.com/openucx/ucx/blob/master/doc/LoggingStyle.md). - -1. Make sure automatic tests pass. - -1. Request a review by [mentioning](https://github.com/blog/821-mention-somebody-they-re-notified) the relevant reviewer. - -1. PR which is reviewed and currently waiting for a fix and/or response, will be marked with "Waiting for Author Response" by the reviewer, - -1. If you need to fix your PR, there are 2 options: amend (replace) your commit, or add new commit with fixes. - * Before anyone has posted comments on the PR, it's allowed to amend. - Example: Fixing bugs found in automatic tests. - * If some comments have been posted, the fixes should be in a new commit. - Reason: Replacing the commit discards the comments. - -1. After getting :+1: from one or more of UCX maintainers the PR can be merged. 
- -More details [here](http://github.com/openucx/ucx/wiki/Guidance-for-contributors). diff --git a/INSTALL b/INSTALL deleted file mode 100644 index 04516d5b8b5..00000000000 --- a/INSTALL +++ /dev/null @@ -1,92 +0,0 @@ -# -# Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. -# -# See file LICENSE for terms. -# - - -For More Information -==================== - -This file is a *very* short overview of building and installing UCX -and building MPI/SHMEM libraries with it. Much more information is -available on the UCX github wiki and UCX web site. - - https://github.com/openucx/ucx/wiki - - -and- - - http://openucx.org/ - - -Developer Builds -================ - -shell$ ./autogen.sh -shell$ ./contrib/configure-devel --prefix=$PWD/install-debug - -You will need very recent versions of GNU Autoconf, Automake, and -Libtool. - -*** NOTE: Developer's copies of UCX typically include a large -performance penalty at run-time because of extra debugging overhead. - - -User Builds -=========== - -Building UCX is typically a combination of running "configure" -and "make". Execute the following commands to install the UCX -system from within the directory at the top of the tree: - -shell$ ./autogen.sh -shell$ ./contrib/configure-release --prefix=/where/to/install -shell$ make all install - -If you need special access to install, then you can execute "make -all" as a user with write permissions in the build tree, and a -separate "make install" as a user with write permissions to the -install tree. - -Compiling support for various networks or other specific hardware may -require additional command line flags when running configure. - -Parallel builds are also supported (although some versions of "make", -such as GNU make, will only use the first target listed on the command -line when executing parallel builds). 
For example (assume GNU make): - -shell$ make -j 4 all -shell$ make install - -Parallel make is generally only helpful in the build phase (i.e., -"make all"); the installation process (i.e., "make install") is mostly -serial and does not benefit much from parallelization. - -# Build rpm package -shell$ contrib/buildrpm.sh -s -b - -# Build deb package -shell$ dpkg-buildpackage -us -uc - -# Build Doxygen documentation -shell$ make docs - - -Compiling OpenMPI with UCX library -================================== - -The Open MPI package includes two public software layers: MPI and OpenSHMEM. -UCX support in OpenMPI is not upstream yet, it is work in progress. -Current work on OpenMPI/UCX integration is ongoing for OpenSHMEM API and can be found here: - -https://github.com/openucx/ompi-mirror/pull/1 - - -To compile OpenMPI with UCX support: - -# checkout Pull Request -% git clone https://github.com/openucx/ompi-mirror -% cd ompi-mirror -% git fetch origin pull/1/head:pr-1 -% git checkout pr-1 - diff --git a/LICENSE b/LICENSE index 1621dab2bfd..f713af4af38 100644 --- a/LICENSE +++ b/LICENSE @@ -1,12 +1,16 @@ Copyright (c) 2014-2015 UT-Battelle, LLC. All rights reserved. -Copyright (C) 2014-2015 Mellanox Technologies Ltd. All rights reserved. +Copyright (C) 2014-2020 Mellanox Technologies Ltd. All rights reserved. Copyright (C) 2014-2015 The University of Houston System. All rights reserved. Copyright (C) 2015 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. -Copyright (C) 2016 ARM Ltd. All rights reserved. +Copyright (C) 2016-2020 ARM Ltd. All rights reserved. Copyright (c) 2016 Los Alamos National Security, LLC. All rights reserved. -Copyright (C) 2016-2017 Advanced Micro Devices, Inc. All rights reserved. -Copyright (C) 2019-2020 Huawei Technologies Co.,Ltd. All rights reserved. +Copyright (C) 2016-2020 Advanced Micro Devices, Inc. All rights reserved. +Copyright (C) 2019 UChicago Argonne, LLC. All rights reserved. 
+Copyright (c) 2018-2020 NVIDIA CORPORATION. All rights reserved. +Copyright (C) 2020 Huawei Technologies Co., Ltd. All rights reserved. +Copyright (C) 2016-2020 Stony Brook University. All rights reserved. +Copyright (C) 2019-2021 Huawei Technologies Co.,Ltd. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions diff --git a/Makefile.am b/Makefile.am index d1086bf5644..64364aa4fbf 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3,7 +3,6 @@ # Copyright (C) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. # Copyright (C) The University of Tennessee and The University # of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED. -# Copyright (C) Huawei Technologies Co.,Ltd. 2020. ALL RIGHTS RESERVED. # # See file LICENSE for terms. # @@ -22,11 +21,11 @@ doc_dir = $(pkgdatadir)/doc if !DOCS_ONLY perftest_dir = $(pkgdatadir)/perftest dist_perftest__DATA = contrib/ucx_perftest_config/msg_pow2 \ + contrib/ucx_perftest_config/msg_pow2_large \ contrib/ucx_perftest_config/README \ contrib/ucx_perftest_config/test_types_uct \ contrib/ucx_perftest_config/test_types_ucp \ contrib/ucx_perftest_config/transports - SUBDIRS = \ src/ucm \ src/ucs \ @@ -34,15 +33,16 @@ SUBDIRS = \ src/ucp if HAVE_UCG -SUBDIRS += src/ucg +SUBDIRS += $(UCG_SUBDIR) endif SUBDIRS += \ src/tools/info \ src/tools/perf \ src/tools/profile \ + bindings/java \ test/apps \ - test/examples + examples if HAVE_GTEST SUBDIRS += test/gtest @@ -52,11 +52,6 @@ if HAVE_MPICC SUBDIRS += test/mpi endif -if HAVE_JAVA -SUBDIRS += bindings/java/src/main/native -endif - -EXTRA_DIST += bindings/java EXTRA_DIST += contrib/configure-devel EXTRA_DIST += contrib/configure-release EXTRA_DIST += contrib/configure-prof @@ -70,16 +65,16 @@ EXTRA_DIST += debian EXTRA_DIST += ucx.pc.in EXTRA_DIST += LICENSE endif #!DOCS_ONLY -EXTRA_DIST += doc/uml/uct.dot +EXTRA_DIST += docs/uml/uct.dot -include $(srcdir)/doc/doxygen/doxygen.am 
+include $(srcdir)/docs/doxygen/doxygen.am .PHONY: docs docs-clean pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = ucx.pc -DOCLIST = doc/doxygen/doxygen-doc/ucx.tag +DOCLIST = docs/doxygen/doxygen-doc/ucx.tag FORMAT = pdf DOT_CLEANFILES = @@ -90,8 +85,8 @@ gtest: endif if HAVE_DOT -DOCLIST += doc/uml/uml.tag doc/uml/uct.$(FORMAT) doc/uml/ucp.$(FORMAT) -DOT_CLEANFILES += doc/uml/uml.tag doc/uml/uct.$(FORMAT) doc/uml/ucp.$(FORMAT) +DOCLIST += docs/uml/uml.tag docs/uml/uct.$(FORMAT) docs/uml/ucp.$(FORMAT) +DOT_CLEANFILES += docs/uml/uml.tag docs/uml/uct.$(FORMAT) docs/uml/ucp.$(FORMAT) endif docs: $(DOCLIST) @@ -100,10 +95,10 @@ docs-clean: $(RM) $(DX_CLEANFILES) $(RM) $(DOT_CLEANFILES) -doc/doxygen/doxygen-doc/ucx.tag: $(doxygen_doc_files) doxygen-doc +docs/doxygen/doxygen-doc/ucx.tag: $(doxygen_doc_files) doxygen-doc -doc/uml/uml.tag: - mkdir -p doc/uml +docs/uml/uml.tag: + mkdir -p docs/uml echo `date` > $@ .dot.pdf: diff --git a/NEWS b/NEWS index 2d217399006..d1dcb81f25a 100644 --- a/NEWS +++ b/NEWS @@ -1,14 +1,169 @@ # -## Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. -## Copyright (C) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. -## Copyright (C) ARM Ltd. 2017-2018. ALL RIGHTS RESERVED. +## Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. +## Copyright (C) UT-Battelle, LLC. 2014-2019. ALL RIGHTS RESERVED. +## Copyright (C) ARM Ltd. 2017-2020. ALL RIGHTS RESERVED. ## ## See file LICENSE for terms. 
## # +## 1.9.0 (September 19, 2020) +### Features: +#### UCX Core +- Added a new class of communication APIs '*_nbx' that enable API extendability while + preserving ABI backward compatibility +- Added asynchronous event support to UCT/IB/DEVX +- Added support for latest CUDA library version +- Added NAK-based reliability protocol for UCT/IB/UD to optimize resends +- Added new tests for ROCm +- Added new configuration parameters for protocol selection +- Added performance optimization for Fujitsu A64FX with InfiniBand +- Added performance optimization for clear cache code aarch64 +- Added support for relaxed-order PCIe access in IB RDMA transports +- Added new TCP connection manager +- Added support for UCT/IB PKey with partial membership in IB transports +- Added support for RoCE LAG +- Added support for ROCm 3.7 and above +- Added flow control for RDMA read operations +- Improved endpoint flush implementation for UCT/IB +- Improved UD timer to avoid interrupting the main thread when not in use +- Improved latency estimation for network path with CUDA +- Improved error reporting messages +- Improved performance in active message flow (removed malloc call) +- Improved performance in ptr_array flow +- Improved performance in UCT/SM progress engine flow +- Improved I/O demo code +- Improved rendezvous protocol for CUDA +- Updated examples code + +#### UCX Java (API Preview) +- Added support for UCX shared library loading from both classpath and LD_LIBRARY_PATH +- Added configuration map to ucp_params to be able to set UCX properties programmatically + +### Bugfixes: +- Fixes for most resent versions of GCC, CLANG, ARMCLANG, PGI +- Fixes in UCT/IB for strict order keys +- Fixes in memory barrier code for aarch64 +- Fixes in UCT/IB/DEVX for fork system call +- Fixes in UCT/IB for rand() call in rdma-core +- Fixed in group rescheduling for UCT/IB/DC +- Fixes in UCT/CUDA bandwidth reporting +- Fixes in rkey_ptr protocol +- Fixes in lane selection for rendezvous protocol 
based on get-zero-copy flow +- Fixes for ROCm build +- Fixes for XPMEM transport +- Fixes in closing endpoint code +- Fixes in RDMACM code +- Fixes in memcpy selection for AMD +- Fixed in UCT/UD endpoint flush functionality +- Fixes in XPMEM detection +- Fixes in rendezvous staging protocol +- Fixes in ROCEv1 mlx5 UDP source port configuration +- Multiple fixes in RPM spec file +- Multiple fixes in UCP documentation +- Multiple fixes in socket connection manager +- Multiple fixes in gtest +- Multiple fixes in JAVA API implementation + +## 1.8.1 (July 10, 2020) +### Features: +- Added binary release pipeline in Azure CI + +### Bugfixes: +- Multiple fixes in testing environment +- Fixes in InfiniBand DEVX transport +- Fixes in memory management for CUDA IPC transport +- Fixes for binutils 2.34+ +- Fixes in RPM SPEC file and package generation +- Fixes for AMD ROCM build environment + +## 1.8.0 (April 3, 2020) +### Features: +#### UCX Core +- Improved detection for DEVX support +- Improved TCP scalability +- Added support for ROCM to perftest +- Added support for different source and target memory types to perftest +- Added optimized memcpy for ROCM devices +- Added hardware tag-matching for CUDA buffers +- Added support for CUDA and ROCM managed memories +- Added support for client/server disconnect protocol over rdma connection manager +- Added support for striding receive queue for hardware tag-matching +- Added XPMEM-based rendezvous protocol for shared memory +- Added support shared memory communication between containers on same machine +- Added support for multi-threaded RDMA memory registration for large regions +- Added new test cases to Azure CI + +#### UCX Java (API Preview) +- Added APIs for stream send/recv, tag probe, and connect request handle +- Added Java package (automatically published) to Maven central + +### Bugfixes: +- Multiple fixes in JUCX +- Fixes in UCP thread safety +- Fixes for most recent versions GCC, PGI, and ICC +- Fixes for CPU 
affinity on Azure instances +- Fixes in XPMEM support on PPC64 +- Performance fixes in CUDA IPC +- Fixes in RDMA CM flows +- Multiple fixes in TCP transport +- Multiple fixes in documentation +- Fixes in transport lane selection logic +- Fixes in Java jar build +- Fixes in socket connection manager for Nvidia DGX-2 platform + +## 1.7.0 (January 19, 2020) +### Features: +- Added support for multiple listening transports +- Added UCT socket-based connection manager transport +- Updated API for UCT component management +- Added API to retrieve the listening port +- Added UCP active message API +- Removed deprecated API for querying UCT memory domains +- Refactored server/client examples +- Added support for dlopen interception in UCM +- Added support for PCIe atomics +- Updated Java API: added support for most of UCP layer operations +- Updated support for Mellanox DevX API +- Added multiple UCT/TCP transport performance optimizations +- Optimized memcpy() for Intel platforms +- Added protection from non-UCX socket based app connections +- Improved search time for PKEY object +- Enable gtest over IPv6 interfaces +- Updated Mellanox and Bull device IDs +- Added support for CUDA_VISIBLE_DEVICES +- Increased limits for CUDA IPC registration + +### Bugfixes: +- Multiple fixes in UCP, UCT, UCM libraries +- Multiple fixes for BSD and Mac OS systems +- Fixes for Clang compiler +- Fixes for CUDA IPC +- Fix CPU optimization configuration options +- Fix JUCX build on GPU nodes +- Fix in Azure release pipeline flow +- Fix in CUDA memory hooks management +- Fix in GPU memory peer direct gtest +- Fix in TCP connection establishment flow +- Fix in GPU IPC check +- Fix in CUDA Jenkins test flow +- Multiple fixes in CUDA IPC flow +- Fix adding missing header files +- Fix to prevent failures in presence of VPN enabled Ethernet interfaces + +## 1.6.1 (September 23, 2019) +### Features: +- Added Bull Atos HCA device IDs +- Added Azure Pipelines testing + +### Bugfixes: +- Multiple 
static checker fixes +- Remove pkg.m4 dependency +- Multiple clang static checker fixes +- Fix mem type support with generic datatype + ## 1.6.0 (July 17, 2019) -Features: +### Features: - Modular architecture for UCT transports - ROCm transport re-design: support for managed memory, direct copy, ROCm GDR - Random scheduling policy for DC transport @@ -17,9 +172,9 @@ Features: - Support for PCI atomics with IB transports - Reduced UCP address size for homogeneous environments -Bugfixes: +### Bugfixes: - Multiple stability and performance improvements in TCP transport -- Multiple stability fixed in Verbs and MLX5 transports +- Multiple stability fixes in Verbs and MLX5 transports - Multiple stability fixes in UCM memory hooks - Multiple stability fixes in UGNI transport - RPM Spec file cleanup @@ -42,20 +197,20 @@ Bugfixes: - Fix race condition updating fired_events from multiple threads - Fix madvise() hook -Tested configurations: +### Tested configurations: - RDMA: MLNX_OFED 4.5, distribution inbox drivers, rdma-core 22.1 - CUDA: gdrcopy 1.3.2, cuda 9.2, ROCm 2.2 - XPMEM: 2.6.2 - KNEM: 1.1.3 ## 1.5.1 (April 1, 2019) -Bugfixes: +### Bugfixes: - Fix dc_mlx5 transport support check for inbox libmlx5 drivers - issue #3301 - Fix compilation warnings with gcc9 and clang - ROCm - reduce log level of device-not-found message ## 1.5.0 (February 14, 2019) -Features: +### Features: - New emulation mode enabling full UCX functionality (Atomic, Put, Get) over TCP and RDMA-CORE interconnects that don't implement full RDMA semantics - Non-blocking API for all one-sided operations. 
All blocking communication APIs marked @@ -67,7 +222,7 @@ Features: - Statistics for UCT tag API - GPU-to-Infiniband HCA affinity support based on locality/distance (PCIe) -Bugfixes: +### Bugfixes: - Fix overflow in RC/DC flush operations - Update description in SPEC file and README - Fix RoCE source port for dc_mlx5 flow control @@ -75,18 +230,17 @@ Bugfixes: - Fix segfault in UCP, due to int truncation in count_one_bits() - Multiple other bugfixes (full list on github) -Tested configurations: +### Tested configurations: - InfiniBand: MLNX_OFED 4.4-4.5, distribution inbox drivers, rdma-core - CUDA: gdrcopy 1.2, cuda 9.1.85 - XPMEM: 2.6.2 - KNEM: 1.1.2 ## 1.4.0-rc2 (October 23, 2018) - -Features: +### Features: - Improved support for installation with latest ROCm - Improved support for latest rdma-core -- Adding support for CUDA IPC for intra-node GPU +- Added support for CUDA IPC for intra-node GPU - Added support for CUDA memory allocation cache for mem-type detection - Added support for latest Mellanox devices - Added support for Nvidia GPU managed memory @@ -95,7 +249,7 @@ Features: and INADDR_ANY - Added support for bitwise atomics operations -Bugfixes: +### Bugfixes: - Performance fixes for rendezvous protocol - Memory hook fixes - Clang support fixes @@ -106,37 +260,36 @@ Bugfixes: - Segfault fix for a code generated by armclang compiler - UCP memory-domain index fix for zero-copy active messages -Tested configurations: +### Tested configurations: - InfiniBand: MLNX_OFED 4.2-4.4, distribution inbox drivers, rdma-core - CUDA: gdrcopy 1.2, cuda 9.1.85 - XPMEM: 2.6.2 - KNEM: 1.1.2 - Multiple bugfixes (full list on github) -Known issues: - #2919 - Segfault in CUDA support when KNEM not present and CMA is active - intra-node RMA transport. As a workaround user can disable CMA support at - compile time: --disable-cma. Alternatively user can remove CMA from UCX_TLS - list, for example: UCX_TLS=mm,rc,cuda_copy,cuda_ipc,gdr_copy. 
+### Known issues: +#2919 - Segfault in CUDA support when KNEM not present and CMA is active +intra-node RMA transport. As a workaround user can disable CMA support at +compile time: --disable-cma. Alternatively user can remove CMA from UCX_TLS +list, for example: UCX_TLS=mm,rc,cuda_copy,cuda_ipc,gdr_copy. ## 1.3.1 (August 20, 2018) - -Bugfixes: +### Bugfixes: - Prevent potential out-of-order sending in shared memory active messages - CUDA: Include cudamem.h in source tarball, pass cudaFree memory size - Registration cache: fix large range lookup, handle shmat(REMAP)/mmap(FIXED) - Limit IB CQE size for specific ARM boards - RPM: explicitly set gcc-c++ as requirement - Multiple bugfixes (full list on github) -Tested configurations: + +### Tested configurations: - InfiniBand: MLNX_OFED 4.2, inbox OFED drivers. - CUDA: gdrcopy 1.2, cuda 9.1.85 - XPMEM: 2.6.2 - KNEM: 1.1.2 ## 1.3.0 (February 15, 2018) - -Features: +### Features: - Added stream-based communication API to UCP - Added support for GPU platforms: Nvidia CUDA and AMD ROCm software stacks - Added API for client/server based connection establishment @@ -155,30 +308,31 @@ Features: - Add support for external epoll fd and edge-triggered events - Added registration cache for knem - Initial support for Java bindings -Bugfixes: + +### Bugfixes: - Multiple bugfixes (full list on github) -Tested configurations: + +### Tested configurations: - InfiniBand: MLNX_OFED 4.2, inbox OFED drivers. 
- CUDA: gdrcopy 1.2, cuda 9.1.85 - XPMEM: 2.6.2 - KNEM: 1.1.2 -Known issues: - #2047 - UCP: ucp_do_am_bcopy_multi drops data on UCS_ERROR_NO_RESOURCE - #2047 - failure in ud/uct_flush_test.am_zcopy_flush_ep_nb/1 - #1977 - failure in shm/test_ucp_rma.blocking_small/0 - #1926 - Timeout in mpi_test_suite with HW TM - #1920 - transport retry count exceeded in many-to-one tests - #1689 - Segmentation fault on memory hooks test in jenkins +### Known issues: +#2047 - UCP: ucp_do_am_bcopy_multi drops data on UCS_ERROR_NO_RESOURCE +#2047 - failure in ud/uct_flush_test.am_zcopy_flush_ep_nb/1 +#1977 - failure in shm/test_ucp_rma.blocking_small/0 +#1926 - Timeout in mpi_test_suite with HW TM +#1920 - transport retry count exceeded in many-to-one tests +#1689 - Segmentation fault on memory hooks test in jenkins ## 1.2.2 (January 4, 2018) - -Main: +### Main: - Support including UCX API headers from C++ code - UD transport to handle unicast flood on RoCE fabric - Compilation fixes for gcc 7.1.1, clang 3.6, clang 5 -Details: +### Details: - When UD transport is used with RoCE, packets intended for other peers may arrive on different adapters (as a result of unicast flooding). - This change adds packet filtering based on destination GIDs. Now the packet @@ -191,79 +345,73 @@ Details: - [cleanup] Fixup license headers ## 1.2.1 (August 28, 2017) - +### Bugfixes: - Compilation fixes for gcc 7.1 - Spec file cleanups - Versioning cleanups ## 1.2.0 (June 15, 2017) - -Supported platforms - - Shared memory: KNEM, CMA, XPMEM, SYSV, Posix - - VERBs over InfiniBand and RoCE. - VERBS over other RDMA interconnects (iWarp, OmniPath, etc.) 
is available - for community evaluation and has not been tested in context of this release - - Cray Gemini and Aries - - Architectures: x86_64, ARMv8 (64bit), Power64 -Features: - - Added support for InfiniBand DC and UD transports, including accelerated verbs for Mellanox devices - - Full support for PGAS/SHMEM interfaces, blocking and non-blocking APIs - - Support for MPI tag matching, both in software and offload mode - - Zero copy protocols and rendezvous, registration cache - - Handling transport errors - - Flow control for DC/RC - - Dataypes support: contiguous, IOV, generic - - Multi-threading support - - Support for ARMv8 64bit architecture - - A new API for efficient memory polling - - Support for malloc-hooks and memory registration caching -Bugfixes: - - Multiple bugfixes improving overall stability of the library -Known issues: - #1604 - Failure in ud/test_ud_slow_timer.retransmit1/1 with valgrind bug - #1588 - Fix reading cpuinfo timebase for ppc bug portability training - #1579 - Ud/test_ud.ca_md test takes too long too complete bug - #1576 - Failure in ud/test_ud_slow_timer.retransmit1/0 with valgrind bug - #1569 - Send completion with error with dc_verbs bug - #1566 - Segfault in malloc_hook.fork on arm bug - #1565 - Hang in udrc/test_ucp_rma.nonblocking_stream_get_nbi_flush_worker bug - #1534 - Wireup.c:473 Fatal: endpoint reconfiguration not supported yet bug - #1533 - Stack overflow under Valgrind 'rc_mlx5/uct_p2p_err_test.local_access_error/0' bug - #1513 - Hang in MPI_Finalize with UCX_TLS=rc[_x],sm on the bsend2 test bug - #1504 - Failure in cm/uct_p2p_am_test.am_bcopy/1 bug - #1492 - Hang when using polling fd bug - #1489 - Hang on the osu_fop_latency test with RoCE bug - #1005 - ROcE problem with OMPI direct modex - UD assertion +### Supported platforms +- Shared memory: KNEM, CMA, XPMEM, SYSV, Posix +- VERBs over InfiniBand and RoCE. + VERBS over other RDMA interconnects (iWarp, OmniPath, etc.) 
is available + for community evaluation and has not been tested in context of this release +- Cray Gemini and Aries +- Architectures: x86_64, ARMv8 (64bit), Power64 + +### Features: +- Added support for InfiniBand DC and UD transports, including accelerated verbs for Mellanox devices +- Full support for PGAS/SHMEM interfaces, blocking and non-blocking APIs +- Support for MPI tag matching, both in software and offload mode +- Zero copy protocols and rendezvous, registration cache +- Handling transport errors +- Flow control for DC/RC +- Dataypes support: contiguous, IOV, generic +- Multi-threading support +- Support for ARMv8 64bit architecture +- A new API for efficient memory polling +- Support for malloc-hooks and memory registration caching + +### Bugfixes: + - Multiple bugfixes improving overall stability of the library + +### Known issues: +#1604 - Failure in ud/test_ud_slow_timer.retransmit1/1 with valgrind bug +#1588 - Fix reading cpuinfo timebase for ppc bug portability training +#1579 - Ud/test_ud.ca_md test takes too long too complete bug +#1576 - Failure in ud/test_ud_slow_timer.retransmit1/0 with valgrind bug +#1569 - Send completion with error with dc_verbs bug +#1566 - Segfault in malloc_hook.fork on arm bug +#1565 - Hang in udrc/test_ucp_rma.nonblocking_stream_get_nbi_flush_worker bug +#1534 - Wireup.c:473 Fatal: endpoint reconfiguration not supported yet bug +#1533 - Stack overflow under Valgrind 'rc_mlx5/uct_p2p_err_test.local_access_error/0' bug +#1513 - Hang in MPI_Finalize with UCX_TLS=rc[_x],sm on the bsend2 test bug +#1504 - Failure in cm/uct_p2p_am_test.am_bcopy/1 bug +#1492 - Hang when using polling fd bug +#1489 - Hang on the osu_fop_latency test with RoCE bug +#1005 - ROcE problem with OMPI direct modex - UD assertion ## 1.1.0 (September 1, 2015) - -Workarounds: -Features: - - Added support for AM based on FIFO in `mm` shared memory transport - - Added support for UCT `knem` shared memory transport (http://knem.gforge.inria.fr) - - Added 
support for UCT `mm/xpmem` shared memory transport (https://github.com/hjelmn/xpmem) - - -Bugfixes: -Known issues: - +### Workarounds: +### Features: +- Added support for AM based on FIFO in `mm` shared memory transport +- Added support for UCT `knem` shared memory transport (http://knem.gforge.inria.fr) +- Added support for UCT `mm/xpmem` shared memory transport (https://github.com/hjelmn/xpmem) ## 1.0.0 (July 22, 2015) - -Features: - - - Added support for UCT `cma` shared memory transport (Cross-Memory Attatch) - - Added support for UCT `mm` shared memory transport with mmap/sysv APIs - - Added support for UCT `rc` transport based on Infiniband/RC with verbs - - Added support for UCT `mlx5_rc` transport based on Infiniband/RC with accelerated verbs - - Added support for UCT `cm` transport based on Infiniband/SIDR (Service ID Resolution) - - Added support for UCT `ugni` transport based on Cray/UGNI - - Added support for Doxygen based documentation generation - - Added support for UCP basic protocol layer to fit PGAS paradigm (RMA, AMO) - - Added ucx_perftest utility to exercise major UCX flows and provide performance metrics - - Added test script for jenkins (contrib/test_jenkins.sh) - - Added packaging for RPM/DEB based linux distributions (see contrib/buildrpm.sh) - - Added Unit-tests infractucture for UCX functionality based on Google Test framework (see test/gtest/) - - Added initial integration for OpenMPI with UCX for PGAS/SHMEM API - (see: https://github.com/openucx/ompi-mirror/pull/1) - - Added end-to-end testing infrastructure based on MTT (see contrib/mtt/README_MTT) +### Features: +- Added support for UCT `cma` shared memory transport (Cross-Memory Attatch) +- Added support for UCT `mm` shared memory transport with mmap/sysv APIs +- Added support for UCT `rc` transport based on Infiniband/RC with verbs +- Added support for UCT `mlx5_rc` transport based on Infiniband/RC with accelerated verbs +- Added support for UCT `cm` transport based on 
Infiniband/SIDR (Service ID Resolution) +- Added support for UCT `ugni` transport based on Cray/UGNI +- Added support for Doxygen based documentation generation +- Added support for UCP basic protocol layer to fit PGAS paradigm (RMA, AMO) +- Added ucx_perftest utility to exercise major UCX flows and provide performance metrics +- Added test script for jenkins (contrib/test_jenkins.sh) +- Added packaging for RPM/DEB based linux distributions (see contrib/buildrpm.sh) +- Added Unit-tests infractucture for UCX functionality based on Google Test framework (see test/gtest/) +- Added initial integration for OpenMPI with UCX for PGAS/SHMEM API + (see: https://github.com/openucx/ompi-mirror/pull/1) +- Added end-to-end testing infrastructure based on MTT (see contrib/mtt/README_MTT) diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index a542d3f8263..00000000000 --- a/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,10 +0,0 @@ -## What -_Describe what this PR is doing._ - -## Why ? -_Justification for the PR. If there is existing issue/bug please reference. For -bug fixes why and what can be merged in a single item._ - -## How ? -_It is optional but for complex PRs please provide information about the design, -architecture, approach, etc._ diff --git a/README b/README index edffc367074..01e0b721a26 100644 --- a/README +++ b/README @@ -1,9 +1,10 @@
- +
- - - + follow on Twitter + + Documentation Status +
@@ -36,20 +37,63 @@ shared memory mechanisms for efficient intra-node communication. ## Using UCX -### Building and Running Internal Unit Tests +### Release Builds + +Building UCX is typically a combination of running "configure" and "make". +Execute the following commands to install the UCX system from within the +directory at the top of the tree: ```sh $ ./autogen.sh -$ ./contrib/configure-devel -$ make +$ ./contrib/configure-release --prefix=/where/to/install +$ make -j8 +$ make install +``` + +NOTE: Compiling support for various networks or other specific hardware may +require additional command line flags when running configure. + +### Developer Builds + +```bash +$ ./autogen.sh +$ ./contrib/configure-devel --prefix=$PWD/install-debug +``` + +*** NOTE: Developer builds of UCX typically include a large performance +penalty at run-time because of extra debugging code. + +### Running internal unit tests + +```sh $ make -C test/gtest test ``` +### Build RPM package +```bash +$ contrib/buildrpm.sh -s -b +``` + +### Build DEB package +```bash +$ dpkg-buildpackage -us -uc +``` + +### Build Doxygen documentation +```bash +$ make docs +``` + +### OpenMPI and OpenSHMEM installation with UCX +[Wiki page](http://github.com/openucx/ucx/wiki/OpenMPI-and-OpenSHMEM-installation-with-UCX) + +### MPICH installation with UCX +[Wiki page](http://github.com/openucx/ucx/wiki/MPICH-installation-with-UCX) + ### UCX Performance Test Start server: - ```sh $ ./src/tools/perf/ucx_perftest -c 0 ``` @@ -57,12 +101,14 @@ $ ./src/tools/perf/ucx_perftest -c 0 Connect client: ```sh -$ ./src/tools/perf/ucx_perftest -t tag_lat -c 0 +$ ./src/tools/perf/ucx_perftest -t tag_lat -c 1 ``` +Note: the `-c` flag sets CPU affinity. If running both commands on same host, make sure you set the affinity to different CPU cores. 
## Our Community * [Project Website](http://www.openucx.org/) +* [ReadTheDocs](https://openucx.readthedocs.io/en/master/) * [Github](http://www.github.com/openucx/ucx/) * [Software Releases](http://www.github.com/openucx/ucx/releases) * [Mailing List](https://elist.ornl.gov/mailman/listinfo/ucx-group) @@ -80,7 +126,7 @@ In order to contribute to UCX, please sign up with an appropriate [Contributor Agreement](http://www.openucx.org/license/). Follow these -[instructions](https://github.com/openucx/ucx/blob/master/CONTRIBUTING.md) +[instructions](https://github.com/openucx/ucx/wiki/Guidance-for-contributors) when submitting contributions and changes. ## UCX Publications @@ -110,7 +156,7 @@ To reference the UCX website: ## UCX Architecture -![](doc/doxygen/Architecture.png) +![](docs/doxygen/Architecture.png) | Component | Role | Description | | :---: | :---: | --- | @@ -192,3 +238,4 @@ Barrier: | 5 | Topo-aware Recursive + K-nomial (intra)(Socket) | | 6 | Topo-aware K-nomial(Node) | | 7 | Topo-aware K-nomial(Socket) | + diff --git a/bindings/java/Makefile.am b/bindings/java/Makefile.am new file mode 100644 index 00000000000..c849b139298 --- /dev/null +++ b/bindings/java/Makefile.am @@ -0,0 +1,18 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. +# +# See file LICENSE for terms. 
+# + +EXTRA_DIST = \ + src/main/java \ + src/test \ + checkstyle.xml \ + pom.xml.in \ + README.md + +SUBDIRS = \ + src/main/native + +clean-local: + -rm -rf resources \ No newline at end of file diff --git a/bindings/java/checkstyle.xml b/bindings/java/checkstyle.xml index cd0547f2a0d..907d7af9302 100644 --- a/bindings/java/checkstyle.xml +++ b/bindings/java/checkstyle.xml @@ -22,11 +22,12 @@ + + + + + - - - - @@ -38,11 +39,6 @@ - - - - - @@ -53,6 +49,11 @@ + + + + + diff --git a/bindings/java/pom.xml b/bindings/java/pom.xml deleted file mode 100644 index 260af0deef8..00000000000 --- a/bindings/java/pom.xml +++ /dev/null @@ -1,218 +0,0 @@ - - - - 4.0.0 - - org.ucx - jucx - 0.0.1-SNAPSHOT - jar - - jucx - https://github.com/openucx/ucx - Java binding to ucx high performance communication library - - - - BSD 3 Clause License - http://www.openucx.org/license/ - repo - - - - - UTF-8 - ${basedir}/src/main/native - ${basedir}/../../src - ${ucx.src.dir}/ucm/.libs - ${ucx.src.dir}/ucs/.libs - ${ucx.src.dir}/uct/.libs - ${ucx.src.dir}/ucp/.libs - ${env.JUCX_INST}/lib - 4.12 - **/jucx/** - false - - - - - junit - junit - ${junit.version} - test - - - - - - - resources - - **/* - - - - - - - resources - - **/* - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.1 - - 1.8 - 1.8 - - ${sources} - - - - - default-testCompile - test-compile - - testCompile - - - - - - - maven-clean-plugin - 3.0.0 - - - - resources - - **/*.so - - - - - - - - maven-resources-plugin - 2.7 - - - copy-dynamic-libs - generate-resources - - copy-resources - - - true - ${skipCopy} - ${basedir}/resources - - - ${ucx.inst.dir} - - **/*.so - - - - ${native.dir}/.libs - - **/*.so - - - - ${ucm.lib.path} - - **/*.so - - - - ${ucs.lib.path} - - **/*.so - - - - ${uct.lib.path} - - **/*.so - - - - ${ucp.lib.path} - - **/*.so - - - - - - - - - org.apache.maven.plugins - maven-checkstyle-plugin - 3.0.0 - - - com.puppycrawl.tools - checkstyle - 8.18 - - - - - validate - validate - - checkstyle.xml - 
UTF-8 - true - true - false - - - check - - - - - - org.codehaus.mojo - native-maven-plugin - 1.0-alpha-9 - true - - ${basedir}/src/main/native/ - - org.ucx.jucx.ucp.UcpConstants - org.ucx.jucx.ucp.UcpContext - org.ucx.jucx.ucp.UcpWorker - org.ucx.jucx.ucs.UcsConstants - - - - ${basedir}/src/main/native/ - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 3.0.1 - - - - diff --git a/bindings/java/pom.xml.in b/bindings/java/pom.xml.in new file mode 100644 index 00000000000..a530b220d33 --- /dev/null +++ b/bindings/java/pom.xml.in @@ -0,0 +1,393 @@ + + + + 4.0.0 + org.openucx + jucx + @VERSION@ + jar + jucx + https://github.com/openucx/ucx + Java binding to ucx high performance communication library + + + + UCX group + https://elist.ornl.gov/mailman/listinfo/ucx-group + + + + + + BSD 3 Clause License + http://www.openucx.org/license/ + repo + + + + + scm:git:git://github.com/openucx/ucx.git + scm:git:ssh://git@github.com/openucx/ucx.git + HEAD + https://github.com/openucx/ucx.git + + + + false + ${env.GPG_PASSPHRASE} + UTF-8 + @abs_top_srcdir@/src + ${ucx.src.dir}/../bindings/java + @abs_top_builddir@ + ${ucx.build.dir}/bindings/java/src/main/native + ${ucx.build.dir}/src/ucm/.libs + ${ucx.build.dir}/src/ucs/.libs + ${ucx.build.dir}/src/uct/.libs + ${ucx.build.dir}/src/ucp/.libs + 4.12 + **/jucx/** + false + + + + + jdk8 + + 1.8 + + + + + maven-compiler-plugin + + 1.8 + 1.8 + + + + + + + jdk9 + + [1.9,) + + + + + maven-compiler-plugin + + 8 + + + + + + + + + Github + https://github.com/openucx/ucx/issues + + + + + Peter Rudenko + peterr@mellanox.com + Mellanox Technologies + + + Yossi Itigin + yosefe@mellanox.com + Mellanox Technologies + + + + + + ossrh + https://oss.sonatype.org/content/repositories/snapshots + + + ossrh + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + + + + + + gcs-maven-central-mirror + + GCS Maven Central mirror + https://maven-central.storage-download.googleapis.com/maven2/ + + true + + + false + + + + + central + 
Maven Repository + https://repo.maven.apache.org/maven2 + + true + + + false + + + + + + gcs-maven-central-mirror + + GCS Maven Central mirror + https://maven-central.storage-download.googleapis.com/maven2/ + + true + + + false + + + + central + https://repo.maven.apache.org/maven2 + + true + + + false + + + + + + + junit + junit + ${junit.version} + test + + + + + ${jucx.src.dir}/src/main/java + ${jucx.src.dir}/src/test/java + ${native.dir}/build-java + + + resources + + **/* + + + + + + resources + + libjucx.so + + + + + + + + org.apache.maven.plugins + maven-source-plugin + 3.1.0 + + + attach-sources + + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.6 + + + sign-artifacts + deploy + + sign + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + -h + ${native.dir} + + + ${sources} + + + + + default-testCompile + test-compile + + testCompile + + + + + + + maven-clean-plugin + 3.0.0 + + + + resources + + **/*.so + + + + + + + + maven-resources-plugin + 2.7 + + + copy-dynamic-libs + generate-resources + + copy-resources + + + true + ${skipCopy} + ${basedir}/resources + + + ${native.dir}/.libs + + **/*.so + + + + ${ucm.lib.path} + + **/*.so + + + + ${ucs.lib.path} + + **/*.so + + + + ${uct.lib.path} + + **/*.so + + + + ${ucp.lib.path} + + **/*.so + + + + + + + + + + org.apache.maven.plugins + maven-checkstyle-plugin + 3.1.0 + + + com.puppycrawl.tools + checkstyle + 8.29 + + + + + validate + validate + + ${jucx.src.dir}/checkstyle.xml + UTF-8 + true + true + false + true + warning + + + check + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.0.1 + + true + all,-missing + + + + attach-javadocs + + jar + + + + + + + org.sonatype.plugins + nexus-staging-maven-plugin + 1.6.8 + true + + ossrh + https://oss.sonatype.org/ + true + + + + + + diff --git a/bindings/java/src/main/java/org/ucx/jucx/NativeLibs.java b/bindings/java/src/main/java/org/openucx/jucx/NativeLibs.java similarity index 68% rename from 
bindings/java/src/main/java/org/ucx/jucx/NativeLibs.java rename to bindings/java/src/main/java/org/openucx/jucx/NativeLibs.java index d5cb0afc7ae..287ce53d260 100644 --- a/bindings/java/src/main/java/org/ucx/jucx/NativeLibs.java +++ b/bindings/java/src/main/java/org/openucx/jucx/NativeLibs.java @@ -3,25 +3,23 @@ * See file LICENSE for terms. */ -package org.ucx.jucx; +package org.openucx.jucx; import java.io.*; import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; public class NativeLibs { - private static final String UCM = "libucm.so"; - private static final String UCS = "libucs.so"; - private static final String UCT = "libuct.so"; - private static final String UCP = "libucp.so"; - private static final String JUCX = "libjucx.so"; - private static ClassLoader loader = NativeLibs.class.getClassLoader(); + private static final String UCM = "ucm"; + private static final String UCS = "ucs"; + private static final String UCT = "uct"; + private static final String UCP = "ucp"; + private static final String JUCX = "jucx"; + private static final ClassLoader loader = NativeLibs.class.getClassLoader(); private static String errorMessage = null; static { - extractUCTLibs(); // UCT Transport loadLibrary(UCM); // UCM library loadLibrary(UCS); // UCS library loadLibrary(UCT); // UCT library @@ -43,8 +41,17 @@ public static void load() { */ private static void loadLibrary(String resourceName) { // Search shared object on java classpath - URL url = loader.getResource(resourceName); - File file = null; + URL url = loader.getResource(System.mapLibraryName(resourceName)); + + if (url == null) { + // If not found in classpath, try to load from java.library.path + try { + System.loadLibrary(resourceName); + } catch (Throwable ignored) { } + return; + } + + File file; try { // Extract shared object's content to a generated temp file file = extractResource(url); } catch (IOException ex) { @@ -57,47 +64,8 @@ private static void 
loadLibrary(String resourceName) { try { // Load shared object to JVM System.load(filename); } catch (UnsatisfiedLinkError ex) { - errorMessage = "Native code library failed to load: " - + resourceName; - } - - file.deleteOnExit(); - } - } - - /** - * Extracts shared UCT transport. - */ - private static void extractUCTLibs() { - URL ucxResource = loader.getResource("ucx"); - File ucxFolder = new File(ucxResource.getPath()); - Path ucxTempFolder; - try { - createTempDir(); - ucxTempFolder = Files.createDirectory(Paths.get(tempDir.getPath(), "ucx")); - ucxTempFolder.toFile().deleteOnExit(); - } catch (IOException ex) { - errorMessage = "Failed to create temp directory"; - return; - } - for (File uctLib: ucxFolder.listFiles()) { - if (!uctLib.getName().startsWith("libuct_")) { - continue; - } - FileOutputStream os = null; - FileInputStream is = null; - File out = new File(ucxTempFolder.toAbsolutePath().toString(), uctLib.getName()); - out.deleteOnExit(); - try { - is = new FileInputStream(uctLib); - os = new FileOutputStream(out); - copy(is, os); - } catch (IOException ex) { - errorMessage = "Failed to copy UCT lib: " + ex.getLocalizedMessage(); - return; - } finally { - closeQuietly(os); - closeQuietly(is); + errorMessage = "Native code library failed to load: " + file.getName() + + ". 
" + ex.getLocalizedMessage(); } } } @@ -110,6 +78,10 @@ private static void extractUCTLibs() { * @throws IOException if fails to extract resource properly */ private static File extractResource(URL resourceURL) throws IOException { + if (!resourceURL.getProtocol().equals("jar")) { + return new File(resourceURL.getPath()); + } + InputStream is = resourceURL.openStream(); if (is == null) { errorMessage = "Error extracting native library content"; diff --git a/bindings/java/src/main/java/org/openucx/jucx/UcxCallback.java b/bindings/java/src/main/java/org/openucx/jucx/UcxCallback.java new file mode 100644 index 00000000000..a75cb766e8d --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/UcxCallback.java @@ -0,0 +1,20 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +package org.openucx.jucx; + +import org.openucx.jucx.ucp.UcpRequest; + +/** + * Callback wrapper to notify successful or failure events from JNI. + */ + +public class UcxCallback { + public void onSuccess(UcpRequest request) {} + + public void onError(int ucsStatus, String errorMsg) { + throw new UcxException(errorMsg); + } +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/UcxException.java b/bindings/java/src/main/java/org/openucx/jucx/UcxException.java new file mode 100644 index 00000000000..8fb3554473c --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/UcxException.java @@ -0,0 +1,20 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +package org.openucx.jucx; + +/** + * Exception to be thrown from JNI and all UCX routines. 
+ */ +public class UcxException extends RuntimeException { + + public UcxException() { + super(); + } + + public UcxException(String message) { + super(message); + } +} diff --git a/bindings/java/src/main/java/org/ucx/jucx/UcxNativeStruct.java b/bindings/java/src/main/java/org/openucx/jucx/UcxNativeStruct.java similarity index 71% rename from bindings/java/src/main/java/org/ucx/jucx/UcxNativeStruct.java rename to bindings/java/src/main/java/org/openucx/jucx/UcxNativeStruct.java index 680478c57e7..2fd71cbfd54 100644 --- a/bindings/java/src/main/java/org/ucx/jucx/UcxNativeStruct.java +++ b/bindings/java/src/main/java/org/openucx/jucx/UcxNativeStruct.java @@ -3,7 +3,7 @@ * See file LICENSE for terms. */ -package org.ucx.jucx; +package org.openucx.jucx; /** * Wrapper around native ucx struct, that holds pointer address. @@ -20,6 +20,10 @@ public Long getNativeId() { } protected void setNativeId(Long nativeId) { + if (nativeId != null && nativeId < 0) { + throw new UcxException("UcxNativeStruct.setNativeId: invalid native pointer: " + + nativeId); + } this.nativeId = nativeId; } } diff --git a/bindings/java/src/main/java/org/ucx/jucx/UcxParams.java b/bindings/java/src/main/java/org/openucx/jucx/UcxParams.java similarity index 95% rename from bindings/java/src/main/java/org/ucx/jucx/UcxParams.java rename to bindings/java/src/main/java/org/openucx/jucx/UcxParams.java index 8aa04830b94..e72a9bae5a6 100644 --- a/bindings/java/src/main/java/org/ucx/jucx/UcxParams.java +++ b/bindings/java/src/main/java/org/openucx/jucx/UcxParams.java @@ -3,7 +3,7 @@ * See file LICENSE for terms. */ -package org.ucx.jucx; +package org.openucx.jucx; /** * Common interface for representing parameters to instantiate ucx objects. 
diff --git a/bindings/java/src/main/java/org/openucx/jucx/UcxUtils.java b/bindings/java/src/main/java/org/openucx/jucx/UcxUtils.java new file mode 100644 index 00000000000..8f43bf0be82 --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/UcxUtils.java @@ -0,0 +1,42 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +package org.openucx.jucx; + +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; +import java.nio.ByteBuffer; + +public class UcxUtils { + + private static final Constructor directBufferConstructor; + + static { + try { + Class classDirectByteBuffer = Class.forName("java.nio.DirectByteBuffer"); + directBufferConstructor = classDirectByteBuffer.getDeclaredConstructor(long.class, + int.class); + directBufferConstructor.setAccessible(true); + } catch (Exception e) { + throw new UcxException(e.getMessage()); + } + } + + /** + * Returns view of underlying memory region as a ByteBuffer. + * @param address - address of start of memory region + */ + public static ByteBuffer getByteBufferView(long address, int length) + throws IllegalAccessException, InvocationTargetException, InstantiationException { + return (ByteBuffer)directBufferConstructor.newInstance(address, length); + } + + /** + * Returns native address of the current position of a direct byte buffer. + */ + public static long getAddress(ByteBuffer buffer) { + return ((sun.nio.ch.DirectBuffer) buffer).address() + buffer.position(); + } +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/examples/UcxBenchmark.java b/bindings/java/src/main/java/org/openucx/jucx/examples/UcxBenchmark.java new file mode 100644 index 00000000000..26636e7f7aa --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/examples/UcxBenchmark.java @@ -0,0 +1,105 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +package org.openucx.jucx.examples; + +import org.openucx.jucx.ucp.*; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.Map; +import java.util.Stack; + +public abstract class UcxBenchmark { + + protected static Map argsMap = new HashMap<>(); + + // Stack of closable resources (context, worker, etc.) to be closed at the end. + protected static Stack resources = new Stack<>(); + + protected static UcpContext context; + + protected static UcpWorker worker; + + protected static int serverPort; + + protected static int numIterations; + + protected static long totalSize; + + protected static UcpMemMapParams allocationParams; + + private static String DESCRIPTION = "JUCX benchmark.\n" + + "Run: \n" + + "java -cp jucx.jar org.openucx.jucx.examples.UcxReadBWBenchmarkReceiver " + + "[s=host] [p=port] [n=number of iterations]\n" + + "java -cp jucx.jar org.openucx.jucx.examples.UcxReadBWBenchmarkSender " + + "[s=receiver host] [p=receiver port] [t=total size to transfer]\n\n" + + "Parameters:\n" + + "h - print help\n" + + "s - IP address to bind sender listener (default: 0.0.0.0)\n" + + "p - port to bind sender listener (default: 54321)\n" + + "t - total size in bytes to transfer from sender to receiver (default 10000)\n" + + "o - on demand registration (default: false) \n" + + "n - number of iterations (default 5)\n"; + + static { + argsMap.put("s", "0.0.0.0"); + argsMap.put("p", "54321"); + argsMap.put("t", "10000"); + argsMap.put("o", "false"); + argsMap.put("n", "5"); + } + + /** + * Initializes common variables from command line arguments. 
+ */ + protected static boolean initializeArguments(String[] args) { + for (String arg: args) { + if (arg.contains("h")) { + System.out.println(DESCRIPTION); + return false; + } + String[] parts = arg.split("="); + argsMap.put(parts[0], parts[1]); + } + try { + serverPort = Integer.parseInt(argsMap.get("p")); + numIterations = Integer.parseInt(argsMap.get("n")); + totalSize = Long.parseLong(argsMap.get("t")); + allocationParams = new UcpMemMapParams().allocate().setLength(totalSize); + if (argsMap.get("o").compareToIgnoreCase("true") == 0) { + allocationParams.nonBlocking(); + } + } catch (NumberFormatException ex) { + System.out.println(DESCRIPTION); + return false; + } + return true; + } + + protected static void createContextAndWorker() { + context = new UcpContext(new UcpParams().requestWakeupFeature() + .requestRmaFeature().requestTagFeature()); + resources.push(context); + + worker = context.newWorker(new UcpWorkerParams()); + resources.push(worker); + } + + protected static double getBandwithGbits(long nanoTimeDelta, long size) { + double sizeInGigabits = (double)size * 8.0 / 1e9; + double secondsElapsed = nanoTimeDelta / 1e9; + return sizeInGigabits / secondsElapsed; + } + + protected static void closeResources() throws IOException { + while (!resources.empty()) { + resources.pop().close(); + } + } +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/examples/UcxReadBWBenchmarkReceiver.java b/bindings/java/src/main/java/org/openucx/jucx/examples/UcxReadBWBenchmarkReceiver.java new file mode 100644 index 00000000000..cc1b79c9558 --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/examples/UcxReadBWBenchmarkReceiver.java @@ -0,0 +1,111 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +package org.openucx.jucx.examples; + +import org.openucx.jucx.UcxCallback; +import org.openucx.jucx.ucp.UcpRequest; +import org.openucx.jucx.UcxUtils; +import org.openucx.jucx.ucp.*; + + +import java.net.InetSocketAddress; +import java.nio.ByteBuffer; +import java.util.concurrent.atomic.AtomicReference; + +public class UcxReadBWBenchmarkReceiver extends UcxBenchmark { + + public static void main(String[] args) throws Exception { + if (!initializeArguments(args)) { + return; + } + + createContextAndWorker(); + + String serverHost = argsMap.get("s"); + InetSocketAddress sockaddr = new InetSocketAddress(serverHost, serverPort); + AtomicReference connRequest = new AtomicReference<>(null); + UcpListener listener = worker.newListener( + new UcpListenerParams() + .setConnectionHandler(connRequest::set) + .setSockAddr(sockaddr)); + resources.push(listener); + System.out.println("Waiting for connections on " + sockaddr + " ..."); + + while (connRequest.get() == null) { + worker.progress(); + } + + UcpEndpoint endpoint = worker.newEndpoint(new UcpEndpointParams() + .setConnectionRequest(connRequest.get()) + .setPeerErrorHandlingMode()); + + // Temporary workaround until new connection establishment protocol in UCX. + for (int i = 0; i < 10; i++) { + worker.progress(); + try { + Thread.sleep(10); + } catch (Exception ignored) { } + } + + ByteBuffer recvBuffer = ByteBuffer.allocateDirect(4096); + UcpRequest recvRequest = worker.recvTaggedNonBlocking(recvBuffer, null); + + worker.progressRequest(recvRequest); + + long remoteAddress = recvBuffer.getLong(); + long remoteSize = recvBuffer.getLong(); + int remoteKeySize = recvBuffer.getInt(); + int rkeyBufferOffset = recvBuffer.position(); + + recvBuffer.position(rkeyBufferOffset + remoteKeySize); + int remoteHashCode = recvBuffer.getInt(); + System.out.printf("Received connection. 
Will read %d bytes from remote address %d%n", + remoteSize, remoteAddress); + + recvBuffer.position(rkeyBufferOffset); + UcpRemoteKey remoteKey = endpoint.unpackRemoteKey(recvBuffer); + resources.push(remoteKey); + + UcpMemory recvMemory = context.memoryMap(allocationParams); + resources.push(recvMemory); + ByteBuffer data = UcxUtils.getByteBufferView(recvMemory.getAddress(), + (int)Math.min(Integer.MAX_VALUE, totalSize)); + for (int i = 0; i < numIterations; i++) { + final int iterNum = i; + UcpRequest getRequest = endpoint.getNonBlocking(remoteAddress, remoteKey, + recvMemory.getAddress(), totalSize, + new UcxCallback() { + long startTime = System.nanoTime(); + + @Override + public void onSuccess(UcpRequest request) { + long finishTime = System.nanoTime(); + data.clear(); + assert data.hashCode() == remoteHashCode; + double bw = getBandwithGbits(finishTime - startTime, remoteSize); + System.out.printf("Iteration %d, bandwidth: %.4f GB/s%n", iterNum, bw); + } + }); + + worker.progressRequest(getRequest); + // To make sure we receive correct data each time to compare hashCodes + data.put(0, (byte)1); + } + + ByteBuffer sendBuffer = ByteBuffer.allocateDirect(100); + sendBuffer.asCharBuffer().put("DONE"); + + UcpRequest sent = endpoint.sendTaggedNonBlocking(sendBuffer, null); + worker.progressRequest(sent); + + UcpRequest closeRequest = endpoint.closeNonBlockingFlush(); + worker.progressRequest(closeRequest); + // Close request won't be return to pull automatically, since there's no callback. + resources.push(closeRequest); + + closeResources(); + } +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/examples/UcxReadBWBenchmarkSender.java b/bindings/java/src/main/java/org/openucx/jucx/examples/UcxReadBWBenchmarkSender.java new file mode 100644 index 00000000000..9c60206c11a --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/examples/UcxReadBWBenchmarkSender.java @@ -0,0 +1,70 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. 
ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +package org.openucx.jucx.examples; + +import org.openucx.jucx.UcxCallback; +import org.openucx.jucx.ucp.UcpRequest; +import org.openucx.jucx.UcxUtils; +import org.openucx.jucx.ucp.UcpEndpoint; +import org.openucx.jucx.ucp.UcpEndpointParams; +import org.openucx.jucx.ucp.UcpMemory; + +import java.net.InetSocketAddress; +import java.nio.ByteBuffer; + + +public class UcxReadBWBenchmarkSender extends UcxBenchmark { + + public static void main(String[] args) throws Exception { + if (!initializeArguments(args)) { + return; + } + + createContextAndWorker(); + + String serverHost = argsMap.get("s"); + UcpEndpoint endpoint = worker.newEndpoint(new UcpEndpointParams() + .setPeerErrorHandlingMode() + .setSocketAddress(new InetSocketAddress(serverHost, serverPort))); + + UcpMemory memory = context.memoryMap(allocationParams); + resources.push(memory); + ByteBuffer data = UcxUtils.getByteBufferView(memory.getAddress(), + (int)Math.min(Integer.MAX_VALUE, totalSize)); + + // Send worker and memory address and Rkey to receiver. + ByteBuffer rkeyBuffer = memory.getRemoteKeyBuffer(); + + // 24b = 8b buffer address + 8b buffer size + 4b rkeyBuffer size + 4b hashCode + ByteBuffer sendData = ByteBuffer.allocateDirect(24 + rkeyBuffer.capacity()); + sendData.putLong(memory.getAddress()); + sendData.putLong(totalSize); + sendData.putInt(rkeyBuffer.capacity()); + sendData.put(rkeyBuffer); + sendData.putInt(data.hashCode()); + sendData.clear(); + + // Send memory metadata and wait until receiver will finish benchmark. 
+ endpoint.sendTaggedNonBlocking(sendData, null); + ByteBuffer recvBuffer = ByteBuffer.allocateDirect(4096); + UcpRequest recvRequest = worker.recvTaggedNonBlocking(recvBuffer, + new UcxCallback() { + @Override + public void onSuccess(UcpRequest request) { + System.out.println("Received a message:"); + System.out.println(recvBuffer.asCharBuffer().toString().trim()); + } + }); + + worker.progressRequest(recvRequest); + + UcpRequest closeRequest = endpoint.closeNonBlockingFlush(); + worker.progressRequest(closeRequest); + resources.push(closeRequest); + + closeResources(); + } +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpConnectionRequest.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpConnectionRequest.java new file mode 100644 index 00000000000..f0e7529accf --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpConnectionRequest.java @@ -0,0 +1,18 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ +package org.openucx.jucx.ucp; + +import org.openucx.jucx.UcxNativeStruct; + +/** + * A server-side handle to incoming connection request. Can be used to create an + * endpoint which connects back to the client. + */ +public class UcpConnectionRequest extends UcxNativeStruct { + + private UcpConnectionRequest(long nativeId) { + setNativeId(nativeId); + } +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpConstants.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpConstants.java new file mode 100644 index 00000000000..e47a25af70b --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpConstants.java @@ -0,0 +1,123 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +package org.openucx.jucx.ucp; + +import org.openucx.jucx.NativeLibs; +import org.openucx.jucx.UcxCallback; + +public class UcpConstants { + static { + NativeLibs.load(); + loadConstants(); + } + + /** + * UCP context parameters field mask. + * + *

The enumeration allows specifying which fields in {@link UcpParams} are + * present. It is used for the enablement of backward compatibility support. + */ + static long UCP_PARAM_FIELD_FEATURES; + static long UCP_PARAM_FIELD_TAG_SENDER_MASK; + static long UCP_PARAM_FIELD_MT_WORKERS_SHARED; + static long UCP_PARAM_FIELD_ESTIMATED_NUM_EPS; + + /** + * UCP configuration features + * + *

The enumeration list describes the features supported by UCP. + * An application can request the features using "UCP parameters" + * during "UCP initialization" process. + */ + static long UCP_FEATURE_TAG; + static long UCP_FEATURE_RMA; + static long UCP_FEATURE_AMO32; + static long UCP_FEATURE_AMO64; + static long UCP_FEATURE_WAKEUP; + static long UCP_FEATURE_STREAM; + + /** + * UCP worker parameters field mask. + * + *

The enumeration allows specifying which fields in {@link UcpWorker} are + * present. It is used for the enablement of backward compatibility support. + */ + static long UCP_WORKER_PARAM_FIELD_THREAD_MODE; + static long UCP_WORKER_PARAM_FIELD_CPU_MASK; + static long UCP_WORKER_PARAM_FIELD_EVENTS; + static long UCP_WORKER_PARAM_FIELD_USER_DATA; + static long UCP_WORKER_PARAM_FIELD_EVENT_FD; + + /** + * Mask of events which are expected on wakeup. + * If it's not set all types of events will trigger on + * wakeup. + */ + static long UCP_WAKEUP_RMA; + static long UCP_WAKEUP_AMO; + static long UCP_WAKEUP_TAG_SEND; + static long UCP_WAKEUP_TAG_RECV; + static long UCP_WAKEUP_TX; + static long UCP_WAKEUP_RX; + static long UCP_WAKEUP_EDGE; + + /** + * UCP listener parameters field mask. + */ + static long UCP_LISTENER_PARAM_FIELD_SOCK_ADDR; + static long UCP_LISTENER_PARAM_FIELD_ACCEPT_HANDLER; + static long UCP_LISTENER_PARAM_FIELD_CONN_HANDLER; + + /** + * UCP endpoint parameters field mask. + */ + static long UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; + static long UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; + static long UCP_EP_PARAM_FIELD_ERR_HANDLER; + static long UCP_EP_PARAM_FIELD_USER_DATA; + static long UCP_EP_PARAM_FIELD_SOCK_ADDR; + static long UCP_EP_PARAM_FIELD_FLAGS; + static long UCP_EP_PARAM_FIELD_CONN_REQUEST; + + /** + * UCP error handling mode. + */ + static int UCP_ERR_HANDLING_MODE_PEER; + + /** + * The enumeration list describes the endpoint's parameters flags. + */ + static long UCP_EP_PARAMS_FLAGS_CLIENT_SERVER; + static long UCP_EP_PARAMS_FLAGS_NO_LOOPBACK; + + /** + * The enumeration is used to specify the behavior of UcpEndpoint closeNonBlocking. + */ + static int UCP_EP_CLOSE_MODE_FORCE; + static int UCP_EP_CLOSE_MODE_FLUSH; + + /** + * UCP memory mapping parameters field mask. 
+ */ + static long UCP_MEM_MAP_PARAM_FIELD_ADDRESS; + static long UCP_MEM_MAP_PARAM_FIELD_LENGTH; + static long UCP_MEM_MAP_PARAM_FIELD_FLAGS; + + /** + * The enumeration list describes the memory mapping flags. + */ + static long UCP_MEM_MAP_NONBLOCK; + static long UCP_MEM_MAP_ALLOCATE; + static long UCP_MEM_MAP_FIXED; + + /** + * The enumeration defines behavior of + * {@link UcpEndpoint#recvStreamNonBlocking(long, long, long, UcxCallback)} function. + */ + public static long UCP_STREAM_RECV_FLAG_WAITALL; + + private static native void loadConstants(); +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpContext.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpContext.java new file mode 100644 index 00000000000..50cf4de6df5 --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpContext.java @@ -0,0 +1,89 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +package org.openucx.jucx.ucp; + +import java.io.Closeable; +import java.nio.ByteBuffer; + +import org.openucx.jucx.NativeLibs; +import org.openucx.jucx.UcxException; +import org.openucx.jucx.UcxNativeStruct; +import org.openucx.jucx.UcxUtils; + +/** + * UCP application context (or just a context) is an opaque handle that holds a + * UCP communication instance's global information. It represents a single UCP + * communication instance. The communication instance could be an OS process + * (an application) that uses UCP library. This global information includes + * communication resources, endpoints, memory, temporary file storage, and + * other communication information directly associated with a specific UCP + * instance. The context also acts as an isolation mechanism, allowing + * resources associated with the context to manage multiple concurrent + * communication instances. For example, users can isolate their communication + * by allocating and using separate contexts. 
Alternatively, users can share the + * communication resources (memory, network resource context, etc.) between + * them by using the same application context. A message sent or a RMA + * operation performed in one application context cannot be received in any + * other application context. + */ +public class UcpContext extends UcxNativeStruct implements Closeable { + static { + NativeLibs.load(); + } + + public UcpContext(UcpParams params) { + setNativeId(createContextNative(params)); + } + + @Override + public void close() { + cleanupContextNative(getNativeId()); + this.setNativeId(null); + } + + /** + * Creates new UcpWorker on current context. + */ + public UcpWorker newWorker(UcpWorkerParams params) { + return new UcpWorker(this, params); + } + + /** + * Associates memory allocated/mapped region with communication operations. + * The network stack associated with an application context + * can typically send and receive data from the mapped memory without + * CPU intervention; some devices and associated network stacks + * require the memory to be registered to send and receive data. + */ + public UcpMemory registerMemory(ByteBuffer buf) { + if (!buf.isDirect()) { + throw new UcxException("Registered buffer must be direct"); + } + UcpMemMapParams params = new UcpMemMapParams().setAddress(UcxUtils.getAddress(buf)) + .setLength(buf.remaining()); + UcpMemory result = memoryMapNative(getNativeId(), params); + + result.setByteBufferReference(buf); + return result; + } + + /** + * Associates memory allocated/mapped region with communication operations. + * The network stack associated with an application context + * can typically send and receive data from the mapped memory without + * CPU intervention; some devices and associated network stacks + * require the memory to be registered to send and receive data. 
+ */
+    public UcpMemory memoryMap(UcpMemMapParams params) {
+        return memoryMapNative(getNativeId(), params);
+    }
+
+    private static native long createContextNative(UcpParams params);
+
+    private static native void cleanupContextNative(long contextId);
+
+    private native UcpMemory memoryMapNative(long contextId, UcpMemMapParams params);
+}
diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpoint.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpoint.java
new file mode 100644
index 00000000000..f833277deb5
--- /dev/null
+++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpoint.java
@@ -0,0 +1,306 @@
+/*
+ * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
+ */
+package org.openucx.jucx.ucp;
+
+import org.openucx.jucx.*;
+
+import java.io.Closeable;
+import java.nio.ByteBuffer;
+
+public class UcpEndpoint extends UcxNativeStruct implements Closeable {
+    private final String paramsString;
+    // Keep a reference to errorHandler to prevent it from GC and have valid ref
+    // from JNI error handler.
+    private final UcpEndpointErrorHandler errorHandler;
+
+    @Override
+    public String toString() {
+        return "UcpEndpoint(id=" + getNativeId() + ", " + paramsString + ")";
+    }
+
+    public UcpEndpoint(UcpWorker worker, UcpEndpointParams params) {
+        // For backward compatibility and better error tracking always set ep error handler.
+ if (params.errorHandler == null) { + params.setErrorHandler((ep, status, errorMsg) -> { + throw new UcxException("Endpoint " + ep.toString() + + " error: " + errorMsg); + }); + } + this.errorHandler = params.errorHandler; + this.paramsString = params.toString(); + setNativeId(createEndpointNative(params, worker.getNativeId())); + } + + @Override + public void close() { + destroyEndpointNative(getNativeId()); + setNativeId(null); + } + + /** + * This routine unpacks the remote key (RKEY) object into the local memory + * such that it can be accessed and used by UCP routines. + * @param rkeyBuffer - Packed remote key buffer + * (see {@link UcpMemory#getRemoteKeyBuffer()}). + */ + public UcpRemoteKey unpackRemoteKey(ByteBuffer rkeyBuffer) { + return unpackRemoteKey(getNativeId(), + UcxUtils.getAddress(rkeyBuffer)); + } + + private void checkRemoteAccessParams(ByteBuffer buf, UcpRemoteKey remoteKey) { + if (!buf.isDirect()) { + throw new UcxException("Data buffer must be direct."); + } + if (remoteKey.getNativeId() == null) { + throw new UcxException("Remote key is null."); + } + } + + /** + * Non-blocking remote memory put operation. + * This routine initiates a storage of contiguous block of data that is + * described by the local {@code src} buffer, starting of it's {@code src.position()} + * and size {@code src.remaining()} in the remote contiguous memory + * region described by {@code remoteAddress} address and the {@code remoteKey} "memory + * handle". The routine returns immediately and does not + * guarantee re-usability of the source {@code data} buffer. + * {@code callback} is invoked on completion of this operation. 
+ */ + public UcpRequest putNonBlocking(ByteBuffer src, long remoteAddress, UcpRemoteKey remoteKey, + UcxCallback callback) { + + checkRemoteAccessParams(src, remoteKey); + + return putNonBlocking(UcxUtils.getAddress(src), src.remaining(), remoteAddress, + remoteKey, callback); + } + + public UcpRequest putNonBlocking(long localAddress, long size, + long remoteAddress, UcpRemoteKey remoteKey, + UcxCallback callback) { + + return putNonBlockingNative(getNativeId(), localAddress, + size, remoteAddress, remoteKey.getNativeId(), callback); + } + + /** + * This routine initiates a storage of contiguous block of data that is + * described by the local {@code buffer} in the remote contiguous memory + * region described by {@code remoteAddress} and the {@code remoteKey} + * "memory handle". The routine returns immediately and does not + * guarantee re-usability of the source {@code src} buffer. + */ + public void putNonBlockingImplicit(ByteBuffer src, long remoteAddress, + UcpRemoteKey remoteKey) { + checkRemoteAccessParams(src, remoteKey); + + putNonBlockingImplicit(UcxUtils.getAddress(src), src.remaining(), remoteAddress, + remoteKey); + } + + /** + * This routine initiates a storage of contiguous block of data that is + * described by the local {@code localAddress} in the remote contiguous memory + * region described by {@code remoteAddress} and the {@code remoteKey} + * "memory handle". The routine returns immediately and does not + * guarantee re-usability of the source {@code localAddress} address. + */ + public void putNonBlockingImplicit(long localAddress, long size, + long remoteAddress, UcpRemoteKey remoteKey) { + putNonBlockingImplicitNative(getNativeId(), localAddress, size, remoteAddress, + remoteKey.getNativeId()); + } + + /** + * Non-blocking remote memory get operation. + * This routine initiates a load of a contiguous block of data that is + * described by the remote memory address {@code remoteAddress} and the + * {@code remoteKey} "memory handle". 
The routine returns immediately and does
+     * not guarantee that remote data is loaded and stored under the local {@code dst} buffer
+     * starting of its {@code dst.position()} and size {@code dst.remaining()}.
+     * {@code callback} is invoked on completion of this operation.
+     * @return {@link UcpRequest} object that can be monitored for completion.
+     */
+    public UcpRequest getNonBlocking(long remoteAddress, UcpRemoteKey remoteKey,
+                                     ByteBuffer dst, UcxCallback callback) {
+
+        checkRemoteAccessParams(dst, remoteKey);
+
+        return getNonBlocking(remoteAddress, remoteKey, UcxUtils.getAddress(dst),
+            dst.remaining(), callback);
+    }
+
+    public UcpRequest getNonBlocking(long remoteAddress, UcpRemoteKey remoteKey,
+                                     long localAddress, long size, UcxCallback callback) {
+
+        return getNonBlockingNative(getNativeId(), remoteAddress, remoteKey.getNativeId(),
+            localAddress, size, callback);
+    }
+
+    /**
+     * Non-blocking implicit remote memory get operation.
+     * This routine initiates a load of contiguous block of data that is described
+     * by the remote memory address {@code remoteAddress} and the
+     * {@code remoteKey} "memory handle" in the local contiguous memory region described
+     * by {@code dst} buffer. The routine returns immediately and does not guarantee that
+     * remote data is loaded and stored under the local buffer.
+     */
+    public void getNonBlockingImplicit(long remoteAddress, UcpRemoteKey remoteKey,
+                                       ByteBuffer dst) {
+        checkRemoteAccessParams(dst, remoteKey);
+
+        getNonBlockingImplicit(remoteAddress, remoteKey, UcxUtils.getAddress(dst),
+            dst.remaining());
+    }
+
+    /**
+     * Non-blocking implicit remote memory get operation.
+     * This routine initiates a load of contiguous block of data that is described
+     * by the remote memory address {@code remoteAddress} and the
+     * {@code remoteKey} "memory handle" in the local contiguous memory region described
+     * by {@code localAddress} the local address.
The routine returns immediately + * and does not guarantee that remote data is loaded and stored under the local buffer. + */ + public void getNonBlockingImplicit(long remoteAddress, UcpRemoteKey remoteKey, + long localAddress, long size) { + + getNonBlockingImplicitNative(getNativeId(), remoteAddress, remoteKey.getNativeId(), + localAddress, size); + } + + /** + * Non-blocking tagged-send operations + * This routine sends a messages that is described by the local buffer {@code sendBuffer}, + * starting of it's {@code sendBuffer.position()} and size {@code sendBuffer.remaining()}. + * to the destination endpoint. Each message is associated with a {@code tag} value + * that is used for message matching on the + * {@link UcpWorker#recvTaggedNonBlocking(ByteBuffer, long, long, UcxCallback)} + * "receiver". The routine is non-blocking and therefore returns immediately, + * however the actual send operation may be delayed. + * The send operation is considered completed when it is safe to reuse the source + * {@code data} buffer. {@code callback} is invoked on completion of this operation. + */ + public UcpRequest sendTaggedNonBlocking(ByteBuffer sendBuffer, long tag, UcxCallback callback) { + if (!sendBuffer.isDirect()) { + throw new UcxException("Send buffer must be direct."); + } + return sendTaggedNonBlocking(UcxUtils.getAddress(sendBuffer), + sendBuffer.remaining(), tag, callback); + } + + public UcpRequest sendTaggedNonBlocking(long localAddress, long size, + long tag, UcxCallback callback) { + + return sendTaggedNonBlockingNative(getNativeId(), + localAddress, size, tag, callback); + } + + /** + * Non blocking send operation. Invokes + * {@link UcpEndpoint#sendTaggedNonBlocking(ByteBuffer, long, UcxCallback)} with default 0 tag. 
+ */ + public UcpRequest sendTaggedNonBlocking(ByteBuffer sendBuffer, UcxCallback callback) { + return sendTaggedNonBlocking(sendBuffer, 0, callback); + } + + /** + * This routine sends data that is described by the local address to the destination endpoint. + * The routine is non-blocking and therefore returns immediately, however the actual send + * operation may be delayed. The send operation is considered completed when it is safe + * to reuse the source buffer. The UCP library will schedule invocation of the call-back upon + * completion of the send operation. + */ + public UcpRequest sendStreamNonBlocking(long localAddress, long size, UcxCallback callback) { + return sendStreamNonBlockingNative(getNativeId(), localAddress, size, callback); + } + + public UcpRequest sendStreamNonBlocking(ByteBuffer buffer, UcxCallback callback) { + return sendStreamNonBlockingNative(getNativeId(), UcxUtils.getAddress(buffer), + buffer.remaining(), callback); + } + + /** + * This routine receives data that is described by the local address and a size on the endpoint. + * The routine is non-blocking and therefore returns immediately. The receive operation is + * considered complete when the message is delivered to the buffer. + * In order to notify the application about completion of a scheduled receive operation, + * the UCP library will invoke the call-back when data is in the receive buffer + * and ready for application access. + */ + public UcpRequest recvStreamNonBlocking(long localAddress, long size, long flags, + UcxCallback callback) { + return recvStreamNonBlockingNative(getNativeId(), localAddress, size, flags, callback); + } + + public UcpRequest recvStreamNonBlocking(ByteBuffer buffer, long flags, UcxCallback callback) { + return recvStreamNonBlocking(UcxUtils.getAddress(buffer), buffer.remaining(), flags, + callback); + } + + /** + * This routine flushes all outstanding AMO and RMA communications on this endpoint. 
+ * All the AMO and RMA operations issued on this endpoint prior to this call + * are completed both at the origin and at the target. + */ + public UcpRequest flushNonBlocking(UcxCallback callback) { + return flushNonBlockingNative(getNativeId(), callback); + } + + /** + * Releases the endpoint without any confirmation from the peer. All + * outstanding requests will be completed with UCS_ERR_CANCELED error. + * This mode may cause transport level errors on remote side, so it requires set + * {@link UcpEndpointParams#setPeerErrorHandlingMode()} for all endpoints created on + * both (local and remote) sides to avoid undefined behavior. + */ + public UcpRequest closeNonBlockingForce() { + return closeNonBlockingNative(getNativeId(), UcpConstants.UCP_EP_CLOSE_MODE_FORCE); + } + + /** + * Releases the endpoint by scheduling flushes on all outstanding operations. + */ + public UcpRequest closeNonBlockingFlush() { + return closeNonBlockingNative(getNativeId(), UcpConstants.UCP_EP_CLOSE_MODE_FLUSH); + } + + private native long createEndpointNative(UcpEndpointParams params, long workerId); + + private static native void destroyEndpointNative(long epId); + + private static native UcpRemoteKey unpackRemoteKey(long epId, long rkeyAddress); + + private static native UcpRequest putNonBlockingNative(long enpointId, long localAddress, + long size, long remoteAddr, + long ucpRkeyId, UcxCallback callback); + + private static native void putNonBlockingImplicitNative(long enpointId, long localAddress, + long size, long remoteAddr, + long ucpRkeyId); + + private static native UcpRequest getNonBlockingNative(long enpointId, long remoteAddress, + long ucpRkeyId, long localAddress, + long size, UcxCallback callback); + + private static native void getNonBlockingImplicitNative(long enpointId, long remoteAddress, + long ucpRkeyId, long localAddress, + long size); + + private static native UcpRequest sendTaggedNonBlockingNative(long enpointId, long localAddress, + long size, long tag, + 
UcxCallback callback);
+
+    private static native UcpRequest sendStreamNonBlockingNative(long endpointId, long localAddress,
+                                                                 long size, UcxCallback callback);
+
+    private static native UcpRequest recvStreamNonBlockingNative(long endpointId, long localAddress,
+                                                                 long size, long flags,
+                                                                 UcxCallback callback);
+
+    private static native UcpRequest flushNonBlockingNative(long endpointId, UcxCallback callback);
+
+    private static native UcpRequest closeNonBlockingNative(long endpointId, int mode);
+}
diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpointErrorHandler.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpointErrorHandler.java
new file mode 100644
index 00000000000..855e5ef5f46
--- /dev/null
+++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpointErrorHandler.java
@@ -0,0 +1,19 @@
+/*
+ * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
+ */
+package org.openucx.jucx.ucp;
+
+/**
+ * Handler to process transport level failure.
+ */
+public interface UcpEndpointErrorHandler {
+    /**
+     * This callback routine is invoked when a transport level error is detected.
+     * @param ep - Endpoint to handle transport level error. Upon return
+     *             from the callback, this endpoint is no longer usable and
+     *             all subsequent operations on this ep will fail with
+     *             the error code passed in {@code status}.
+     */
+    void onError(UcpEndpoint ep, int status, String errorMsg);
+}
diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpointParams.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpointParams.java
new file mode 100644
index 00000000000..bde0f080216
--- /dev/null
+++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpointParams.java
@@ -0,0 +1,121 @@
+/*
+ * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
+ */ + +package org.openucx.jucx.ucp; + +import org.openucx.jucx.UcxParams; + +import java.net.InetSocketAddress; +import java.nio.ByteBuffer; + +/** + * Tuning parameters for the UCP endpoint. + */ +public class UcpEndpointParams extends UcxParams { + + @Override + public String toString() { + String result = "UcpEndpointParams{"; + if (ucpAddress != null) { + result += "ucpAddress,"; + } + result += "errorHandlingMode=" + + ((errorHandlingMode == 0) ? "UCP_ERR_HANDLING_MODE_NONE," : + "UCP_ERR_HANDLING_MODE_PEER,"); + + if (socketAddress != null) { + result += "socketAddress=" + socketAddress.toString() + ","; + } + + if (connectionRequest != 0) { + result += "connectionRequest,"; + } + return result; + } + + @Override + public UcpEndpointParams clear() { + super.clear(); + ucpAddress = null; + errorHandlingMode = 0; + flags = 0; + socketAddress = null; + connectionRequest = 0; + errorHandler = null; + return this; + } + + private ByteBuffer ucpAddress; + + private int errorHandlingMode; + + private long flags; + + private InetSocketAddress socketAddress; + + private long connectionRequest; + + UcpEndpointErrorHandler errorHandler; + + /** + * Destination address in form of workerAddress. + */ + public UcpEndpointParams setUcpAddress(ByteBuffer ucpAddress) { + this.fieldMask |= UcpConstants.UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; + this.ucpAddress = ucpAddress; + return this; + } + + /** + * Guarantees that send requests are always completed (successfully or error) even in + * case of remote failure, disables protocols and APIs which may cause a hang or undefined + * behavior in case of peer failure, may affect performance and memory footprint. + */ + public UcpEndpointParams setPeerErrorHandlingMode() { + this.fieldMask |= UcpConstants.UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; + this.errorHandlingMode = UcpConstants.UCP_ERR_HANDLING_MODE_PEER; + return this; + } + + /** + * Destination address in form of InetSocketAddress. 
+ */ + public UcpEndpointParams setSocketAddress(InetSocketAddress socketAddress) { + this.fieldMask |= UcpConstants.UCP_EP_PARAM_FIELD_SOCK_ADDR | + UcpConstants.UCP_EP_PARAM_FIELD_FLAGS; + this.flags |= UcpConstants.UCP_EP_PARAMS_FLAGS_CLIENT_SERVER; + this.socketAddress = socketAddress; + return this; + } + + /** + * Avoid connecting the endpoint to itself when connecting the endpoint + * to the same worker it was created on. Affects protocols which send to a particular + * remote endpoint, for example stream. + */ + public UcpEndpointParams setNoLoopbackMode() { + this.fieldMask |= UcpConstants.UCP_EP_PARAM_FIELD_FLAGS; + this.flags |= UcpConstants.UCP_EP_PARAMS_FLAGS_NO_LOOPBACK; + return this; + } + + /** + * Connection request from client. + */ + public UcpEndpointParams setConnectionRequest(UcpConnectionRequest connectionRequest) { + this.fieldMask |= UcpConstants.UCP_EP_PARAM_FIELD_CONN_REQUEST; + this.connectionRequest = connectionRequest.getNativeId(); + return this; + } + + /** + * Handler to process transport level failure. + */ + public UcpEndpointParams setErrorHandler(UcpEndpointErrorHandler errorHandler) { + this.fieldMask |= UcpConstants.UCP_EP_PARAM_FIELD_ERR_HANDLER; + this.errorHandler = errorHandler; + return this; + } +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListener.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListener.java new file mode 100644 index 00000000000..63c0ac003b1 --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListener.java @@ -0,0 +1,45 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ +package org.openucx.jucx.ucp; + +import org.openucx.jucx.UcxException; +import org.openucx.jucx.UcxNativeStruct; + +import java.io.Closeable; +import java.net.InetSocketAddress; + +/** + * The listener handle is an opaque object that is used for listening on a + * specific address and accepting connections from clients. + */ +public class UcpListener extends UcxNativeStruct implements Closeable { + + private InetSocketAddress address; + + public UcpListener(UcpWorker worker, UcpListenerParams params) { + if (params.getSockAddr() == null) { + throw new UcxException("UcpListenerParams.sockAddr must be non-null."); + } + setNativeId(createUcpListener(params, worker.getNativeId())); + address = params.getSockAddr(); + } + + /** + * Returns a socket address of this listener. + */ + public InetSocketAddress getAddress() { + return address; + } + + @Override + public void close() { + destroyUcpListenerNative(getNativeId()); + setNativeId(null); + } + + private static native long createUcpListener(UcpListenerParams params, long workerId); + + private static native void destroyUcpListenerNative(long listenerId); +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListenerConnectionHandler.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListenerConnectionHandler.java new file mode 100644 index 00000000000..4a5d84a1690 --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListenerConnectionHandler.java @@ -0,0 +1,20 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ +package org.openucx.jucx.ucp; + +/** + * A server-side handle to incoming connection request. Can be used to create an + * endpoint which connects back to the client. + */ +public interface UcpListenerConnectionHandler { + /** + * This callback routine is invoked on the server side to handle incoming + * connections from remote clients. 
+ * @param connectionRequest - native pointer to connection request, that could be used + * in {@link UcpEndpointParams#setConnectionRequest( + * UcpConnectionRequest connectionRequest)} + */ + void onConnectionRequest(UcpConnectionRequest connectionRequest); +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListenerParams.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListenerParams.java new file mode 100644 index 00000000000..28153a0772d --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListenerParams.java @@ -0,0 +1,45 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ +package org.openucx.jucx.ucp; + +import java.net.InetSocketAddress; + +import org.openucx.jucx.UcxParams; + +public class UcpListenerParams extends UcxParams { + + @Override + public UcpListenerParams clear() { + super.clear(); + sockAddr = null; + return this; + } + + private InetSocketAddress sockAddr; + + private UcpListenerConnectionHandler connectionHandler; + + /** + * An address, on which {@link UcpListener} would bind. + */ + public UcpListenerParams setSockAddr(InetSocketAddress sockAddr) { + this.sockAddr = sockAddr; + this.fieldMask |= UcpConstants.UCP_LISTENER_PARAM_FIELD_SOCK_ADDR; + return this; + } + + public InetSocketAddress getSockAddr() { + return sockAddr; + } + + /** + * Handler of an incoming connection request in a client-server connection flow. 
+ */ + public UcpListenerParams setConnectionHandler(UcpListenerConnectionHandler handler) { + this.connectionHandler = handler; + this.fieldMask |= UcpConstants.UCP_LISTENER_PARAM_FIELD_CONN_HANDLER; + return this; + } +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpMemMapParams.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpMemMapParams.java new file mode 100644 index 00000000000..9ce96b94089 --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpMemMapParams.java @@ -0,0 +1,72 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ +package org.openucx.jucx.ucp; + +import org.openucx.jucx.UcxParams; + +public class UcpMemMapParams extends UcxParams { + private long flags; + private long address; + private long length; + + @Override + public UcpMemMapParams clear() { + super.clear(); + address = 0; + length = 0; + flags = 0; + return this; + } + + public UcpMemMapParams setAddress(long address) { + this.fieldMask |= UcpConstants.UCP_MEM_MAP_PARAM_FIELD_ADDRESS; + this.address = address; + return this; + } + + public long getAddress() { + return address; + } + + public UcpMemMapParams setLength(long length) { + this.fieldMask |= UcpConstants.UCP_MEM_MAP_PARAM_FIELD_LENGTH; + this.length = length; + return this; + } + + public long getLength() { + return length; + } + + /** + * Identify requirement for allocation, if passed address is not a null-pointer + * then it will be used as a hint or direct address for allocation. + */ + public UcpMemMapParams allocate() { + this.fieldMask |= UcpConstants.UCP_MEM_MAP_PARAM_FIELD_FLAGS; + flags |= UcpConstants.UCP_MEM_MAP_ALLOCATE; + return this; + } + + /** + * Complete the registration faster, possibly by not populating the pages up-front, + * and mapping them later when they are accessed by communication routines. 
+ */ + public UcpMemMapParams nonBlocking() { + this.fieldMask |= UcpConstants.UCP_MEM_MAP_PARAM_FIELD_FLAGS; + flags |= UcpConstants.UCP_MEM_MAP_NONBLOCK; + return this; + } + + /** + * Don't interpret address as a hint: place the mapping at exactly that + * address. The address must be a multiple of the page size. + */ + public UcpMemMapParams fixed() { + this.fieldMask |= UcpConstants.UCP_MEM_MAP_PARAM_FIELD_FLAGS; + flags |= UcpConstants.UCP_MEM_MAP_FIXED; + return this; + } +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpMemory.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpMemory.java new file mode 100644 index 00000000000..360b33f9e3a --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpMemory.java @@ -0,0 +1,109 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +package org.openucx.jucx.ucp; + +import org.openucx.jucx.UcxNativeStruct; + +import java.io.Closeable; +import java.nio.ByteBuffer; + +/** + * Memory handle is an opaque object representing a memory region allocated + * through UCP library, which is optimized for remote memory access + * operations (zero-copy operations). The memory could be registered + * to one or multiple network resources that are supported by UCP, + * such as InfiniBand, Gemini, and others. + */ +public class UcpMemory extends UcxNativeStruct implements Closeable { + + private UcpContext context; + + private ByteBuffer data; + + private long address; + + private long length; + + /** + * To prevent construct outside of JNI. + */ + private UcpMemory(long nativeId) { + setNativeId(nativeId); + } + + /** + * This routine unmaps a user specified memory segment. + * When the function returns, the {@code data} and associated + * "remote key" will be invalid and cannot be used with any UCP routine. 
+     * Another well-known terminology for the "unmap" operation that is typically
+     * used in the context of networking is memory "de-registration". The UCP
+     * library de-registers the memory from the available hardware so it can be returned
+     * back to the operating system.
+     */
+    public void deregister() {
+        unmapMemoryNative(context.getNativeId(), getNativeId());
+        setNativeId(null);
+        data = null;
+    }
+
+    /**
+     * This routine allocates memory buffer and packs into the buffer
+     * a remote access key (RKEY) object. RKEY is an opaque object that provides
+     * the information that is necessary for remote memory access.
+     * This routine packs the RKEY object in a portable format such that the
+     * object can be "unpacked" on any platform supported by the
+     * UCP library.
+     * RKEYs for InfiniBand and Cray Aries networks typically include
+     * InfiniBand and Aries key.
+     * In order to enable remote direct memory access to the memory associated
+     * with the memory handle the application is responsible for sharing the RKEY with
+     * the peers that will initiate the access.
+     */
+    public ByteBuffer getRemoteKeyBuffer() {
+        ByteBuffer rKeyBuffer = getRkeyBufferNative(context.getNativeId(), getNativeId());
+        // 1. Allocating java native ByteBuffer (managed by java's reference count cleaner).
+        ByteBuffer result = ByteBuffer.allocateDirect(rKeyBuffer.capacity());
+        // 2. Copy content of native ucp address to java's buffer.
+        result.put(rKeyBuffer);
+        result.clear();
+        // 3. Release an address of the worker object. Memory allocated in JNI must be freed by JNI.
+        releaseRkeyBufferNative(rKeyBuffer);
+        return result;
+    }
+
+    /**
+     * To keep reference to user's ByteBuffer so it won't be cleaned by refCount cleaner.
+     * @param data
+     */
+    void setByteBufferReference(ByteBuffer data) {
+        this.data = data;
+    }
+
+    /**
+     * Address of registered memory.
+ */ + public long getAddress() { + return address; + } + + /** + * Length of registered memory + */ + public long getLength() { + return length; + } + + private static native void unmapMemoryNative(long contextId, long memoryId); + + private static native ByteBuffer getRkeyBufferNative(long contextId, long memoryId); + + private static native void releaseRkeyBufferNative(ByteBuffer rkey); + + @Override + public void close() { + deregister(); + } +} diff --git a/bindings/java/src/main/java/org/ucx/jucx/ucp/UcpParams.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpParams.java similarity index 91% rename from bindings/java/src/main/java/org/ucx/jucx/ucp/UcpParams.java rename to bindings/java/src/main/java/org/openucx/jucx/ucp/UcpParams.java index 8d56d24cd14..d4ace227ed7 100644 --- a/bindings/java/src/main/java/org/ucx/jucx/ucp/UcpParams.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpParams.java @@ -3,9 +3,12 @@ * See file LICENSE for terms. */ -package org.ucx.jucx.ucp; +package org.openucx.jucx.ucp; -import org.ucx.jucx.UcxParams; +import org.openucx.jucx.UcxParams; + +import java.util.HashMap; +import java.util.Map; /** * Tuning parameters for UCP library. @@ -34,6 +37,8 @@ public class UcpParams extends UcxParams { private long estimatedNumEps; + private Map config; + @Override public UcpParams clear() { super.clear(); @@ -41,6 +46,7 @@ public UcpParams clear() { tagSenderMask = 0L; mtWorkersShared = false; estimatedNumEps = 0L; + config = null; return this; } @@ -140,4 +146,15 @@ public UcpParams requestStreamFeature() { this.features |= UcpConstants.UCP_FEATURE_STREAM; return this; } + + /** + * The routine sets runtime UCP library configuration. 
+ */ + public UcpParams setConfig(String key, String value) { + if (config == null) { + config = new HashMap<>(); + } + config.put(key, value); + return this; + } } diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpRemoteKey.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpRemoteKey.java new file mode 100644 index 00000000000..fb66bf640c7 --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpRemoteKey.java @@ -0,0 +1,38 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ +package org.openucx.jucx.ucp; + +import org.openucx.jucx.UcxNativeStruct; + +import java.io.Closeable; + +/** + * Remote memory handle is an opaque object representing remote memory access + * information. Typically, the handle includes a memory access key and other + * network hardware specific information, which are input to remote memory + * access operations, such as PUT, GET, and ATOMIC. The object is + * communicated to remote peers to enable an access to the memory region. + */ +public class UcpRemoteKey extends UcxNativeStruct implements Closeable { + + /** + * Private constructor to construct from JNI only. + */ + private UcpRemoteKey() { + + } + + private UcpRemoteKey(long nativeRkeyPtr) { + setNativeId(nativeRkeyPtr); + } + + @Override + public void close() { + rkeyDestroy(getNativeId()); + setNativeId(null); + } + + private static native void rkeyDestroy(long ucpRkeyId); +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpRequest.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpRequest.java new file mode 100644 index 00000000000..87abf91f07b --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpRequest.java @@ -0,0 +1,57 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +package org.openucx.jucx.ucp; + +import org.openucx.jucx.UcxCallback; +import org.openucx.jucx.UcxNativeStruct; + +import java.io.Closeable; +import java.nio.ByteBuffer; + +/** + * Request object, that returns by ucp operations (GET, PUT, SEND, etc.). + * Call {@link UcpRequest#isCompleted()} to monitor completion of request. + */ +public class UcpRequest extends UcxNativeStruct implements Closeable { + + private long recvSize; + + private UcpRequest(long nativeId) { + setNativeId(nativeId); + } + + /** + * The size of the received data in bytes, valid only for recv requests, e.g.: + * {@link UcpWorker#recvTaggedNonBlocking(ByteBuffer buffer, UcxCallback clb)} + */ + public long getRecvSize() { + return recvSize; + } + + /** + * @return whether this request is completed. + */ + public boolean isCompleted() { + return (getNativeId() == null) || isCompletedNative(getNativeId()); + } + + /** + * This routine releases the non-blocking request back to the library, regardless + * of its current state. Communications operations associated with this request + * will make progress internally, however no further notifications or callbacks + * will be invoked for this request. + */ + @Override + public void close() { + if (getNativeId() != null) { + closeRequestNative(getNativeId()); + } + } + + private static native boolean isCompletedNative(long ucpRequest); + + private static native void closeRequestNative(long ucpRequest); +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpTagMessage.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpTagMessage.java new file mode 100644 index 00000000000..3d2d668239d --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpTagMessage.java @@ -0,0 +1,38 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +package org.openucx.jucx.ucp; + +import org.openucx.jucx.UcxCallback; +import org.openucx.jucx.UcxNativeStruct; + +/** + * UCP Message descriptor is an opaque handle for a message returned by + * {@link UcpWorker#tagProbeNonBlocking(long, long, boolean)}. + * This handle can be passed to + * {@link UcpWorker#recvTaggedMessageNonBlocking(long, long, UcpTagMessage, UcxCallback)} + * in order to receive the message data to a specific buffer. + */ +public class UcpTagMessage extends UcxNativeStruct { + private long recvLength; + + private long senderTag; + + private UcpTagMessage(long nativeId, long recvLength, long senderTag) { + if (nativeId != 0) { + setNativeId(nativeId); + } + this.recvLength = recvLength; + this.senderTag = senderTag; + } + + public long getRecvLength() { + return recvLength; + } + + public long getSenderTag() { + return senderTag; + } +} diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpWorker.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpWorker.java new file mode 100644 index 00000000000..3a1bdde601b --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpWorker.java @@ -0,0 +1,265 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +package org.openucx.jucx.ucp; + +import java.io.Closeable; +import java.nio.ByteBuffer; + +import org.openucx.jucx.*; + +/** + * UCP worker is an opaque object representing the communication context. The + * worker represents an instance of a local communication resource and the + * progress engine associated with it. The progress engine is a construct that + * is responsible for asynchronous and independent progress of communication + * directives. The progress engine could be implemented in hardware or software. 
+ * The worker object abstracts an instance of network resources such as a host + * channel adapter port, network interface, or multiple resources such as + * multiple network interfaces or communication ports. It could also represent + * virtual communication resources that are defined across multiple devices. + * Although the worker can represent multiple network resources, it is + * associated with a single {@link UcpContext} "UCX application context". + * All communication functions require a context to perform the operation on + * the dedicated hardware resource(s) and an "endpoint" to address the + * destination. + * + *

Worker are parallel "threading points" that an upper layer may use to + * optimize concurrent communications. + */ +public class UcpWorker extends UcxNativeStruct implements Closeable { + + public UcpWorker(UcpContext context, UcpWorkerParams params) { + setNativeId(createWorkerNative(params, context.getNativeId())); + } + + /** + * Creates new UcpEndpoint on current worker. + */ + public UcpEndpoint newEndpoint(UcpEndpointParams params) { + return new UcpEndpoint(this, params); + } + + /** + * Creates new UcpListener on current worker. + */ + public UcpListener newListener(UcpListenerParams params) { + return new UcpListener(this, params); + } + + @Override + public void close() { + releaseWorkerNative(getNativeId()); + setNativeId(null); + } + + /** + * This routine explicitly progresses all communication operations on a worker. + * @return Non-zero if any communication was progressed, zero otherwise. + */ + public int progress() { + return progressWorkerNative(getNativeId()); + } + + /** + * Blocking progress for request until it's not completed. + */ + public void progressRequest(UcpRequest request) { + while (!request.isCompleted()) { + progress(); + } + } + + /** + * This routine flushes all outstanding AMO and RMA communications on the + * this worker. All the AMO and RMA operations issued on this worker prior to this call + * are completed both at the origin and at the target when this call returns. + */ + public UcpRequest flushNonBlocking(UcxCallback callback) { + return flushNonBlockingNative(getNativeId(), callback); + } + + /** + * This routine waits (blocking) until an event has happened, as part of the + * wake-up mechanism. + * + * This function is guaranteed to return only if new communication events occur + * on the worker. Therefore one must drain all existing events before waiting + * on the file descriptor. This can be achieved by calling + * {@link UcpWorker#progress()} repeatedly until it returns 0. 
+ */ + public void waitForEvents() { + waitWorkerNative(getNativeId()); + } + + /** + * This routine signals that the event has happened, as part of the wake-up + * mechanism. This function causes a blocking call to {@link UcpWorker#waitForEvents()} + * to return, even if no event from the underlying interfaces has taken place. + * + * It’s safe to use this routine from any thread, even if UCX is compiled + * without multi-threading support and/or initialized without + * {@link UcpWorkerParams#requestThreadSafety()}. However {@link UcpContext} has to be + * created with {@link UcpParams#requestWakeupFeature()}. + */ + public void signal() { + signalWorkerNative(getNativeId()); + } + + /** + * Non-blocking tagged-receive operation. + * This routine receives a messages that is described by the local {@code recvBuffer} + * buffer on the current worker. The tag value of the receive message has to match + * the {@code tag} of sent message. The routine is a non-blocking and therefore returns + * immediately. The receive operation is considered completed when the message is delivered + * to the {@code recvBuffer} at position {@code recvBuffer.position()} and size + * {@code recvBuffer.remaining()}. + * In order to notify the application about completion of the receive + * operation the UCP library will invoke the call-back {@code callback} when the received + * message is in the receive buffer and ready for application access. + * + * @param tagMask - bit mask that indicates the bits that are used for the matching of the + * incoming tag against the expected tag. 
+ */ + public UcpRequest recvTaggedNonBlocking(ByteBuffer recvBuffer, long tag, long tagMask, + UcxCallback callback) { + if (!recvBuffer.isDirect()) { + throw new UcxException("Recv buffer must be direct."); + } + return recvTaggedNonBlockingNative(getNativeId(), UcxUtils.getAddress(recvBuffer), + recvBuffer.remaining(), tag, tagMask, callback); + } + + public UcpRequest recvTaggedNonBlocking(long localAddress, long size, long tag, long tagMask, + UcxCallback callback) { + return recvTaggedNonBlockingNative(getNativeId(), localAddress, size, + tag, tagMask, callback); + } + + /** + * Non-blocking receive operation. Invokes + * {@link UcpWorker#recvTaggedNonBlocking(ByteBuffer, long, long, UcxCallback)} + * with default tag=0 and tagMask=0. + */ + public UcpRequest recvTaggedNonBlocking(ByteBuffer recvBuffer, UcxCallback callback) { + return recvTaggedNonBlocking(recvBuffer, 0, 0, callback); + } + + /** + * Non-blocking probe and return a message. + * This routine probes (checks) if a messages described by the {@code tag} and + * {@code tagMask} was received (fully or partially) on the worker. The tag + * value of the received message has to match the {@code tag} and {@code tagMask} + * values, where the {@code tagMask} indicates what bits of the tag have to be + * matched. The function returns immediately and if the message is matched it + * returns a handle for the message. + * + * This function does not advance the communication state of the network. + * If this routine is used in busy-poll mode, need to make sure + * {@link UcpWorker#progress()} is called periodically to extract messages from the transport. + * + * @param remove - The flag indicates if the matched message has to be removed from UCP library. 
+ * If true, the message handle is removed from the UCP library + * and the application is responsible to call + * {@link UcpWorker#recvTaggedMessageNonBlocking(long, long, UcpTagMessage, + * UcxCallback)} in order to receive the data and release the resources + * associated with the message handle. + * If false, the return value is merely an indication to whether a matching + * message is present, and it cannot be used in any other way, + * and in particular it cannot be passed to + * {@link UcpWorker#recvTaggedMessageNonBlocking(long, long, UcpTagMessage, + * UcxCallback)} + * @return NULL - No match found. + * Message handle (not NULL) - If message is matched the message handle is returned. + */ + public UcpTagMessage tagProbeNonBlocking(long tag, long tagMask, boolean remove) { + return tagProbeNonBlockingNative(getNativeId(), tag, tagMask, remove); + } + + /** + * Non-blocking receive operation for a probed message. + * This routine receives a messages that is described by the local {@code address}, + * {@code size}, and a {@code message} handle. The {@code message} handle can be obtain + * by calling the {@link UcpWorker#tagProbeNonBlocking(long, long, boolean)}. This routine + * is a non-blocking and therefore returns immediately. The receive operation is considered + * completed when the message is delivered to the buffer, described by {@code address} + * and {@code size}. + * In order to notify the application about completion of the receive operation + * the UCP library will invoke the call-back {@code callback} when the received message + * is in the receive buffer and ready for application access. + * If the receive operation cannot be stated the routine returns an error. 
+ */ + public UcpRequest recvTaggedMessageNonBlocking(long address, long size, UcpTagMessage message, + UcxCallback callback) { + return recvTaggedMessageNonBlockingNative(getNativeId(), address, size, + message.getNativeId(), callback); + } + + public UcpRequest recvTaggedMessageNonBlocking(ByteBuffer buffer, UcpTagMessage message, + UcxCallback callback) { + return recvTaggedMessageNonBlocking(UcxUtils.getAddress(buffer), buffer.remaining(), + message, callback); + } + + /** + * This routine tries to cancels an outstanding communication request. After + * calling this routine, the request will be in completed or canceled (but + * not both) state regardless of the status of the target endpoint associated + * with the communication request. If the request is completed successfully, + * the "send" or the "receive" completion callbacks (based on the type of the request) will be + * called with the status argument of the callback set to UCS_OK, and in a + * case it is canceled the status argument is set to UCS_ERR_CANCELED. + */ + public void cancelRequest(UcpRequest request) { + cancelRequestNative(getNativeId(), request.getNativeId()); + } + + /** + * This routine returns the address of the worker object. This address can be + * passed to remote instances of the UCP library in order to connect to this + * worker. Ucp worker address - is an opaque object that is used as an + * identifier for a {@link UcpWorker} instance. + */ + public ByteBuffer getAddress() { + ByteBuffer nativeUcpAddress = workerGetAddressNative(getNativeId()); + // 1. Allocating java native ByteBuffer (managed by java's reference count cleaner). + ByteBuffer result = ByteBuffer.allocateDirect(nativeUcpAddress.capacity()); + // 2. Copy content of native ucp address to java's buffer. + result.put(nativeUcpAddress); + result.clear(); + // 3. Release an address of the worker object. Memory allocated in JNI must be freed by JNI. 
+ releaseAddressNative(getNativeId(), nativeUcpAddress); + return result; + } + + private static native long createWorkerNative(UcpWorkerParams params, long ucpContextId); + + private static native void releaseWorkerNative(long workerId); + + private static native ByteBuffer workerGetAddressNative(long workerId); + + private static native void releaseAddressNative(long workerId, ByteBuffer addressId); + + private static native int progressWorkerNative(long workerId); + + private static native UcpRequest flushNonBlockingNative(long workerId, UcxCallback callback); + + private static native void waitWorkerNative(long workerId); + + private static native void signalWorkerNative(long workerId); + + private static native UcpRequest recvTaggedNonBlockingNative(long workerId, long localAddress, + long size, long tag, long tagMask, + UcxCallback callback); + + private static native UcpTagMessage tagProbeNonBlockingNative(long workerId, long tag, + long tagMask, boolean remove); + + private static native UcpRequest recvTaggedMessageNonBlockingNative(long workerId, long address, + long size, long tagMsgId, + UcxCallback callback); + + private static native void cancelRequestNative(long workerId, long requestId); +} diff --git a/bindings/java/src/main/java/org/ucx/jucx/ucp/UcpWorkerParams.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpWorkerParams.java similarity index 90% rename from bindings/java/src/main/java/org/ucx/jucx/ucp/UcpWorkerParams.java rename to bindings/java/src/main/java/org/openucx/jucx/ucp/UcpWorkerParams.java index 79f45923450..a1ea7362c35 100644 --- a/bindings/java/src/main/java/org/ucx/jucx/ucp/UcpWorkerParams.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpWorkerParams.java @@ -3,13 +3,14 @@ * See file LICENSE for terms. 
*/ -package org.ucx.jucx.ucp; +package org.openucx.jucx.ucp; import java.nio.ByteBuffer; import java.util.BitSet; -import org.ucx.jucx.UcxParams; -import org.ucx.jucx.ucs.UcsConstants; +import org.openucx.jucx.ucs.UcsConstants; +import org.openucx.jucx.UcxException; +import org.openucx.jucx.UcxParams; public class UcpWorkerParams extends UcxParams { @@ -38,7 +39,7 @@ public UcpWorkerParams clear() { * Requests the thread safety mode which worker and the associated resources * should be created with. * When thread safety requested, the - * {@link org.ucx.jucx.ucp.UcpWorker#UcpWorker(UcpContext, UcpWorkerParams)} + * {@link UcpWorker#UcpWorker(UcpContext, UcpWorkerParams)} * attempts to create worker where multiple threads can access concurrently. * The thread mode with which worker is created can differ from the * suggested mode. @@ -131,6 +132,9 @@ public UcpWorkerParams requestWakeupEdge() { * User data associated with the current worker. */ public UcpWorkerParams setUserData(ByteBuffer userData) { + if (!userData.isDirect()) { + throw new UcxException("User data must be of type DirectByteBuffer."); + } this.fieldMask |= UcpConstants.UCP_WORKER_PARAM_FIELD_USER_DATA; this.userData = userData; return this; @@ -141,9 +145,9 @@ public UcpWorkerParams setUserData(ByteBuffer userData) { * *

Events on the worker will be reported on the provided event file descriptor. * The provided file descriptor must be capable of aggregating notifications - * for arbitrary events, for example @c epoll(7) on Linux systems. + * for arbitrary events, for example epoll(7) on Linux systems. * - *

{@link userData} will be used as the event user-data on systems which + *

{@code userData} will be used as the event user-data on systems which * support it. For example, on Linux, it will be placed in * epoll_data_t::ptr, when returned from epoll_wait(2).

* diff --git a/bindings/java/src/main/java/org/ucx/jucx/ucs/UcsConstants.java b/bindings/java/src/main/java/org/openucx/jucx/ucs/UcsConstants.java similarity index 89% rename from bindings/java/src/main/java/org/ucx/jucx/ucs/UcsConstants.java rename to bindings/java/src/main/java/org/openucx/jucx/ucs/UcsConstants.java index 4b81dfec79f..b22f0b1da60 100644 --- a/bindings/java/src/main/java/org/ucx/jucx/ucs/UcsConstants.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucs/UcsConstants.java @@ -3,9 +3,9 @@ * See file LICENSE for terms. */ -package org.ucx.jucx.ucs; +package org.openucx.jucx.ucs; -import org.ucx.jucx.NativeLibs; +import org.openucx.jucx.NativeLibs; public class UcsConstants { static { diff --git a/bindings/java/src/main/java/org/ucx/jucx/UcxException.java b/bindings/java/src/main/java/org/ucx/jucx/UcxException.java deleted file mode 100644 index c22b1f327f3..00000000000 --- a/bindings/java/src/main/java/org/ucx/jucx/UcxException.java +++ /dev/null @@ -1,11 +0,0 @@ -/* - * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -package org.ucx.jucx; - -/** - * Exception to be thrown from JNI and all UCX routines. - */ -public class UcxException extends RuntimeException{} diff --git a/bindings/java/src/main/java/org/ucx/jucx/ucp/UcpConstants.java b/bindings/java/src/main/java/org/ucx/jucx/ucp/UcpConstants.java deleted file mode 100644 index 9597b6fd420..00000000000 --- a/bindings/java/src/main/java/org/ucx/jucx/ucp/UcpConstants.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -package org.ucx.jucx.ucp; - -import org.ucx.jucx.NativeLibs; - -public class UcpConstants { - static { - NativeLibs.load(); - loadConstants(); - } - - /** - * UCP context parameters field mask. - * - *

The enumeration allows specifying which fields in {@link UcpParams} are - * present. It is used for the enablement of backward compatibility support. - */ - public static long UCP_PARAM_FIELD_FEATURES; - public static long UCP_PARAM_FIELD_TAG_SENDER_MASK; - public static long UCP_PARAM_FIELD_MT_WORKERS_SHARED; - public static long UCP_PARAM_FIELD_ESTIMATED_NUM_EPS; - - /** - * UCP configuration features - * - *

The enumeration list describes the features supported by UCP. - * An application can request the features using "UCP parameters" - * during "UCP initialization" process. - */ - public static long UCP_FEATURE_TAG; - public static long UCP_FEATURE_RMA; - public static long UCP_FEATURE_AMO32; - public static long UCP_FEATURE_AMO64; - public static long UCP_FEATURE_WAKEUP; - public static long UCP_FEATURE_STREAM; - - /** - * UCP worker parameters field mask. - * - *

The enumeration allows specifying which fields in {@link UcpWorker} are - * present. It is used for the enablement of backward compatibility support. - */ - public static long UCP_WORKER_PARAM_FIELD_THREAD_MODE; - public static long UCP_WORKER_PARAM_FIELD_CPU_MASK; - public static long UCP_WORKER_PARAM_FIELD_EVENTS; - public static long UCP_WORKER_PARAM_FIELD_USER_DATA; - public static long UCP_WORKER_PARAM_FIELD_EVENT_FD; - - /** - * Mask of events which are expected on wakeup. - * If it's not set all types of events will trigger on - * wakeup. - */ - public static long UCP_WAKEUP_RMA; - public static long UCP_WAKEUP_AMO; - public static long UCP_WAKEUP_TAG_SEND; - public static long UCP_WAKEUP_TAG_RECV; - public static long UCP_WAKEUP_TX; - public static long UCP_WAKEUP_RX; - public static long UCP_WAKEUP_EDGE; - - private static native void loadConstants(); -} diff --git a/bindings/java/src/main/java/org/ucx/jucx/ucp/UcpContext.java b/bindings/java/src/main/java/org/ucx/jucx/ucp/UcpContext.java deleted file mode 100644 index 88ff8ab7402..00000000000 --- a/bindings/java/src/main/java/org/ucx/jucx/ucp/UcpContext.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -package org.ucx.jucx.ucp; - -import java.io.Closeable; - -import org.ucx.jucx.NativeLibs; -import org.ucx.jucx.UcxNativeStruct; - -/** - * UCP application context (or just a context) is an opaque handle that holds a - * UCP communication instance's global information. It represents a single UCP - * communication instance. The communication instance could be an OS process - * (an application) that uses UCP library. This global information includes - * communication resources, endpoints, memory, temporary file storage, and - * other communication information directly associated with a specific UCP - * instance. 
The context also acts as an isolation mechanism, allowing - * resources associated with the context to manage multiple concurrent - * communication instances. For example, users can isolate their communication - * by allocating and using separate contexts. Alternatively, users can share the - * communication resources (memory, network resource context, etc.) between - * them by using the same application context. A message sent or a RMA - * operation performed in one application context cannot be received in any - * other application context. - */ -public class UcpContext extends UcxNativeStruct implements Closeable { - static { - NativeLibs.load(); - } - - public UcpContext(UcpParams params) { - setNativeId(createContextNative(params)); - } - - @Override - public void close() { - cleanupContextNative(getNativeId()); - this.setNativeId(null); - } - - private static native long createContextNative(UcpParams params); - - private static native void cleanupContextNative(long contextId); - -} diff --git a/bindings/java/src/main/java/org/ucx/jucx/ucp/UcpWorker.java b/bindings/java/src/main/java/org/ucx/jucx/ucp/UcpWorker.java deleted file mode 100644 index b3441bdb7cf..00000000000 --- a/bindings/java/src/main/java/org/ucx/jucx/ucp/UcpWorker.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -package org.ucx.jucx.ucp; - -import java.io.Closeable; - -import org.ucx.jucx.UcxNativeStruct; - -/** - * UCP worker is an opaque object representing the communication context. The - * worker represents an instance of a local communication resource and progress - * engine associated with it. Progress engine is a construct that is - * responsible for asynchronous and independent progress of communication - * directives. The progress engine could be implement in hardware or software. 
- * The worker object abstract an instance of network resources such as a host - * channel adapter port, network interface, or multiple resources such as - * multiple network interfaces or communication ports. It could also represent - * virtual communication resources that are defined across multiple devices. - * Although the worker can represent multiple network resources, it is - * associated with a single {@link UcpContext} "UCX application context". - * All communication functions require a context to perform the operation on - * the dedicated hardware resource(s) and an "endpoint" to address the - * destination. - * - *

Worker are parallel "threading points" that an upper layer may use to - * optimize concurrent communications. - */ -public class UcpWorker extends UcxNativeStruct implements Closeable { - - public UcpWorker(UcpContext context, UcpWorkerParams params) { - setNativeId(createWorkerNative(params, context.getNativeId())); - } - - @Override - public void close() { - releaseWorkerNative(getNativeId()); - setNativeId(null); - } - - private static native long createWorkerNative(UcpWorkerParams params, long ucpContextId); - - private static native void releaseWorkerNative(long workerId); -} diff --git a/bindings/java/src/main/native/Makefile.am b/bindings/java/src/main/native/Makefile.am index c15beb7f6e2..aa9dda9794f 100644 --- a/bindings/java/src/main/native/Makefile.am +++ b/bindings/java/src/main/native/Makefile.am @@ -3,38 +3,66 @@ # See file LICENSE for terms. # -topdir=$(abs_top_builddir) -javadir=$(top_srcdir)/bindings/java -MVNCMD=$(MVN) -B -f $(javadir)/pom.xml -Dmaven.repo.local=$(topdir)/.deps/ - -BUILT_SOURCES = org_ucx_jucx_ucp_UcpConstants.h \ - org_ucx_jucx_ucp_UcpContext.h \ - org_ucx_jucx_ucp_UcpWorker.h \ - org_ucx_jucx_ucs_UcsConstants.h - -DISTCLEANFILES = org_ucx_jucx_ucp_UcpConstants.h \ - org_ucx_jucx_ucp_UcpContext.h \ - org_ucx_jucx_ucp_UcpWorker.h \ - org_ucx_jucx_ucs_UcsConstants.h - -org_ucx_jucx_ucp_UcpConstants.h: -org_ucx_jucx_ucp_UcpWorker.h: -org_ucx_jucx_ucs_UcsConstants.h: -org_ucx_jucx_ucp_UcpContext.h: - $(MVNCMD) compile native:javah +if HAVE_JAVA + +jardir = $(libdir) +topdir = $(abs_top_builddir) +java_build_dir = $(builddir)/build-java +jarfile = $(java_build_dir)/jucx-@VERSION@.jar +javadir = $(top_srcdir)/bindings/java + +MVNCMD = $(MVN) -B -T 1C -f \ + $(topdir)/bindings/java/pom.xml \ + -Dmaven.repo.local=$(java_build_dir)/.deps \ + -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn + +JUCX_GENERATED_H_FILES = org_openucx_jucx_ucp_UcpConstants.h \ + org_openucx_jucx_ucp_UcpContext.h \ + 
org_openucx_jucx_ucp_UcpEndpoint.h \ + org_openucx_jucx_ucp_UcpListener.h \ + org_openucx_jucx_ucp_UcpMemory.h \ + org_openucx_jucx_ucp_UcpRequest.h \ + org_openucx_jucx_ucp_UcpRemoteKey.h \ + org_openucx_jucx_ucp_UcpWorker.h \ + org_openucx_jucx_ucs_UcsConstants_ThreadMode.h \ + org_openucx_jucx_ucs_UcsConstants.h + +BUILT_SOURCES = $(JUCX_GENERATED_H_FILES) + +STAMP_FILE = native_headers.stamp + +MOSTLYCLEANFILES = $(JUCX_GENERATED_H_FILES) $(STAMP_FILE) + +# +# Create a timestamp file to avoid regenerating header files every time +# See https://www.gnu.org/software/automake/manual/html_node/Multiple-Outputs.html +# +$(STAMP_FILE): \ + $(javadir)/src/main/java/org/openucx/jucx/ucs/*.java \ + $(javadir)/src/main/java/org/openucx/jucx/ucp/*.java + $(MVNCMD) compile + touch $(STAMP_FILE) + +$(JUCX_GENERATED_H_FILES): $(STAMP_FILE) lib_LTLIBRARIES = libjucx.la libjucx_la_CPPFLAGS = -I$(JDK)/include -I$(JDK)/include/linux \ -I$(topdir)/src -I$(top_srcdir)/src +noinst_HEADERS = jucx_common_def.h + libjucx_la_SOURCES = context.cc \ + endpoint.cc \ jucx_common_def.cc \ + listener.cc \ + memory.cc \ + request.cc \ ucp_constants.cc \ ucs_constants.cc \ worker.cc -libjucx_la_CXXFLAGS = -fPIC -DPIC -Werror +libjucx_la_CXXFLAGS = -fPIC -DPIC -Werror -std=gnu++98 libjucx_la_LIBADD = $(topdir)/src/ucs/libucs.la \ $(topdir)/src/uct/libuct.la \ @@ -43,19 +71,39 @@ libjucx_la_LIBADD = $(topdir)/src/ucs/libucs.la \ libjucx_la_DEPENDENCIES = Makefile.am Makefile.in Makefile -#Compile Java source code and pack to jar -package: +# Compile Java source code and pack to jar +$(jarfile): $(MVNCMD) package -DskipTests -#Maven install phase -install-data-hook: - $(MVNCMD) install -DskipTests +package : $(jarfile) + +.PHONY: package -#Remove all compiled Java files +# Maven install phase +jar_DATA = $(jarfile) + +# Remove all compiled Java files clean-local: - $(MVNCMD) clean + -rm -rf $(java_build_dir) + +set-version: + $(MVNCMD) versions:set -DnewVersion=${JUCX_VERSION} + +# Publish JUCX 
jar to maven central +publish-snapshot: + @make set-version JUCX_VERSION=@VERSION@-SNAPSHOT + @make publish + +publish-release: + @make set-version JUCX_VERSION=${JUCX_VERSION} + @make publish + +publish: + $(MVNCMD) deploy -DskipTests ${ARGS} test: - $(MVNCMD) test + $(MVNCMD) test -DargLine="-XX:OnError='cat hs_err_pid%p.log'" docs: $(MVNCMD) javadoc:javadoc + +endif diff --git a/bindings/java/src/main/native/context.cc b/bindings/java/src/main/native/context.cc index 5d6f409f46c..e68eee74974 100644 --- a/bindings/java/src/main/native/context.cc +++ b/bindings/java/src/main/native/context.cc @@ -4,17 +4,61 @@ */ #include "jucx_common_def.h" -#include "org_ucx_jucx_ucp_UcpContext.h" +#include "org_openucx_jucx_ucp_UcpContext.h" +extern "C" { +#include +} + +/** + * Iterates through entries of java's hash map and apply + * ucp_config_modify and ucs_global_opts_set_value to each key value pair. + */ +static void jucx_map_apply_config(JNIEnv *env, ucp_config_t *config, + jobject *config_map) +{ + jclass c_map = env->GetObjectClass(*config_map); + jmethodID id_entrySet = + env->GetMethodID(c_map, "entrySet", "()Ljava/util/Set;"); + jclass c_entryset = env->FindClass("java/util/Set"); + jmethodID id_iterator = + env->GetMethodID(c_entryset, "iterator", "()Ljava/util/Iterator;"); + jclass c_iterator = env->FindClass("java/util/Iterator"); + jmethodID id_hasNext = env->GetMethodID(c_iterator, "hasNext", "()Z"); + jmethodID id_next = + env->GetMethodID(c_iterator, "next", "()Ljava/lang/Object;"); + jclass c_entry = env->FindClass("java/util/Map$Entry"); + jmethodID id_getKey = + env->GetMethodID(c_entry, "getKey", "()Ljava/lang/Object;"); + jmethodID id_getValue = + env->GetMethodID(c_entry, "getValue", "()Ljava/lang/Object;"); + jobject obj_entrySet = env->CallObjectMethod(*config_map, id_entrySet); + jobject obj_iterator = env->CallObjectMethod(obj_entrySet, id_iterator); -#include + while (env->CallBooleanMethod(obj_iterator, id_hasNext)) { + jobject entry = 
env->CallObjectMethod(obj_iterator, id_next); + jstring jstrKey = (jstring)env->CallObjectMethod(entry, id_getKey); + jstring jstrValue = (jstring)env->CallObjectMethod(entry, id_getValue); + const char *strKey = env->GetStringUTFChars(jstrKey, 0); + const char *strValue = env->GetStringUTFChars(jstrValue, 0); + ucs_status_t config_modify_status = ucp_config_modify(config, strKey, strValue); + ucs_status_t global_opts_status = ucs_global_opts_set_value(strKey, strValue); + + if ((config_modify_status != UCS_OK) && (global_opts_status != UCS_OK)) { + ucs_warn("JUCX: no such key %s, ignoring", strKey); + } + + env->ReleaseStringUTFChars(jstrKey, strKey); + env->ReleaseStringUTFChars(jstrValue, strValue); + } +} /** * Bridge method for creating ucp_context from java */ JNIEXPORT jlong JNICALL -Java_org_ucx_jucx_ucp_UcpContext_createContextNative(JNIEnv *env, jclass cls, - jobject jucx_ctx_params) +Java_org_openucx_jucx_ucp_UcpContext_createContextNative(JNIEnv *env, jclass cls, + jobject jucx_ctx_params) { ucp_params_t ucp_params = { 0 }; ucp_context_h ucp_context; @@ -47,17 +91,99 @@ Java_org_ucx_jucx_ucp_UcpContext_createContextNative(JNIEnv *env, jclass cls, field); } - ucs_status_t status = ucp_init(&ucp_params, NULL, &ucp_context); + ucp_params.field_mask |= UCP_PARAM_FIELD_REQUEST_INIT | + UCP_PARAM_FIELD_REQUEST_SIZE; + ucp_params.request_size = sizeof(struct jucx_context); + ucp_params.request_init = jucx_request_init; + + ucp_config_t *config = NULL; + ucs_status_t status; + + field = env->GetFieldID(jucx_param_class, "config", "Ljava/util/Map;"); + jobject config_map = env->GetObjectField(jucx_ctx_params, field); + + if (config_map != NULL) { + status = ucp_config_read(NULL, NULL, &config); + if (status != UCS_OK) { + JNU_ThrowExceptionByStatus(env, status); + } + + jucx_map_apply_config(env, config, &config_map); + } + + status = ucp_init(&ucp_params, config, &ucp_context); if (status != UCS_OK) { JNU_ThrowExceptionByStatus(env, status); } + + if (config != 
NULL) { + ucp_config_release(config); + } + return (native_ptr)ucp_context; } JNIEXPORT void JNICALL -Java_org_ucx_jucx_ucp_UcpContext_cleanupContextNative(JNIEnv *env, jclass cls, - jlong ucp_context_ptr) +Java_org_openucx_jucx_ucp_UcpContext_cleanupContextNative(JNIEnv *env, jclass cls, + jlong ucp_context_ptr) { ucp_cleanup((ucp_context_h)ucp_context_ptr); } + + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpContext_memoryMapNative(JNIEnv *env, jobject ctx, + jlong ucp_context_ptr, + jobject jucx_mmap_params) +{ + ucp_mem_map_params_t params = {0}; + ucp_mem_h memh; + jfieldID field; + + jclass jucx_mmap_class = env->GetObjectClass(jucx_mmap_params); + field = env->GetFieldID(jucx_mmap_class, "fieldMask", "J"); + params.field_mask = env->GetLongField(jucx_mmap_params, field); + + if (params.field_mask & UCP_MEM_MAP_PARAM_FIELD_ADDRESS) { + field = env->GetFieldID(jucx_mmap_class, "address", "J"); + params.address = (void *)env->GetLongField(jucx_mmap_params, field);; + } + + if (params.field_mask & UCP_MEM_MAP_PARAM_FIELD_LENGTH) { + field = env->GetFieldID(jucx_mmap_class, "length", "J"); + params.length = env->GetLongField(jucx_mmap_params, field);; + } + + if (params.field_mask & UCP_MEM_MAP_PARAM_FIELD_FLAGS) { + field = env->GetFieldID(jucx_mmap_class, "flags", "J"); + params.flags = env->GetLongField(jucx_mmap_params, field);; + } + + ucs_status_t status = ucp_mem_map((ucp_context_h)ucp_context_ptr, ¶ms, &memh); + if (status != UCS_OK) { + JNU_ThrowExceptionByStatus(env, status); + } + + // Construct UcpMemory class + jclass jucx_mem_cls = env->FindClass("org/openucx/jucx/ucp/UcpMemory"); + jmethodID constructor = env->GetMethodID(jucx_mem_cls, "", "(J)V"); + jobject jucx_mem = env->NewObject(jucx_mem_cls, constructor, (native_ptr)memh); + + // Set UcpContext pointer + field = env->GetFieldID(jucx_mem_cls, "context", "Lorg/openucx/jucx/ucp/UcpContext;"); + env->SetObjectField(jucx_mem, field, ctx); + + // Set address + field = 
env->GetFieldID(jucx_mem_cls, "address", "J"); + env->SetLongField(jucx_mem, field, (native_ptr)memh->address); + + // Set length + field = env->GetFieldID(jucx_mem_cls, "length", "J"); + env->SetLongField(jucx_mem, field, memh->length); + + /* Coverity thinks that memh is a leaked object here, + * but it's stored in a UcpMemory object */ + /* coverity[leaked_storage] */ + return jucx_mem; +} diff --git a/bindings/java/src/main/native/endpoint.cc b/bindings/java/src/main/native/endpoint.cc new file mode 100644 index 00000000000..fada1e4468c --- /dev/null +++ b/bindings/java/src/main/native/endpoint.cc @@ -0,0 +1,248 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#include "jucx_common_def.h" +#include "org_openucx_jucx_ucp_UcpEndpoint.h" + +#include /* memset */ + + +static void error_handler(void *arg, ucp_ep_h ep, ucs_status_t status) +{ + JNIEnv* env = get_jni_env(); + + ucs_debug("JUCX: endpoint %p error handler: %s", ep, ucs_status_string(status)); + jobject jucx_ep = reinterpret_cast(arg); + if (env->IsSameObject(jucx_ep, NULL)) { + ucs_warn("UcpEndpoint was garbage collected. 
Can't call it's error handler."); + return; + } + + jclass jucx_ep_error_hndl_cls = env->FindClass("org/openucx/jucx/ucp/UcpEndpointErrorHandler"); + jclass jucx_ep_class = env->GetObjectClass(jucx_ep); + jfieldID ep_error_hdnl_field = env->GetFieldID(jucx_ep_class, "errorHandler", + "Lorg/openucx/jucx/ucp/UcpEndpointErrorHandler;"); + jobject jucx_ep_error_hndl = env->GetObjectField(jucx_ep, ep_error_hdnl_field); + jmethodID on_error = env->GetMethodID(jucx_ep_error_hndl_cls, "onError", + "(Lorg/openucx/jucx/ucp/UcpEndpoint;ILjava/lang/String;)V"); + env->CallVoidMethod(jucx_ep_error_hndl, on_error, jucx_ep, status, + ucs_status_string(status)); + env->DeleteWeakGlobalRef(jucx_ep); +} + + +JNIEXPORT jlong JNICALL +Java_org_openucx_jucx_ucp_UcpEndpoint_createEndpointNative(JNIEnv *env, jobject jucx_ep, + jobject ucp_ep_params, + jlong worker_ptr) +{ + ucp_ep_params_t ep_params; + jfieldID field; + ucp_worker_h ucp_worker = (ucp_worker_h)worker_ptr; + ucp_ep_h endpoint; + + // Get field mask + jclass ucp_ep_params_class = env->GetObjectClass(ucp_ep_params); + field = env->GetFieldID(ucp_ep_params_class, "fieldMask", "J"); + ep_params.field_mask = env->GetLongField(ucp_ep_params, field); + + if (ep_params.field_mask & UCP_EP_PARAM_FIELD_REMOTE_ADDRESS) { + field = env->GetFieldID(ucp_ep_params_class, "ucpAddress", "Ljava/nio/ByteBuffer;"); + jobject buf = env->GetObjectField(ucp_ep_params, field); + ep_params.address = static_cast(env->GetDirectBufferAddress(buf)); + } + + if (ep_params.field_mask & UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE) { + field = env->GetFieldID(ucp_ep_params_class, "errorHandlingMode", "I"); + ep_params.err_mode = static_cast(env->GetIntField(ucp_ep_params, field)); + } + + if (ep_params.field_mask & UCP_EP_PARAM_FIELD_FLAGS) { + field = env->GetFieldID(ucp_ep_params_class, "flags", "J"); + ep_params.flags = env->GetLongField(ucp_ep_params, field); + } + + if (ep_params.field_mask & UCP_EP_PARAM_FIELD_SOCK_ADDR) { + struct sockaddr_storage 
worker_addr; + socklen_t addrlen; + memset(&worker_addr, 0, sizeof(struct sockaddr_storage)); + + field = env->GetFieldID(ucp_ep_params_class, + "socketAddress", "Ljava/net/InetSocketAddress;"); + jobject sock_addr = env->GetObjectField(ucp_ep_params, field); + + if (j2cInetSockAddr(env, sock_addr, worker_addr, addrlen)) { + ep_params.sockaddr.addr = (const struct sockaddr*)&worker_addr; + ep_params.sockaddr.addrlen = addrlen; + } + } + + if (ep_params.field_mask & UCP_EP_PARAM_FIELD_CONN_REQUEST) { + field = env->GetFieldID(ucp_ep_params_class, "connectionRequest", "J"); + ep_params.conn_request = reinterpret_cast(env->GetLongField(ucp_ep_params, field)); + } + + if (ep_params.field_mask & UCP_EP_PARAM_FIELD_ERR_HANDLER) { + // Important to use weak reference, to allow JUCX endpoint class to be closed and + // garbage collected, as error handler may never be called + ep_params.err_handler.arg = env->NewWeakGlobalRef(jucx_ep); + ep_params.err_handler.cb = error_handler; + } + + ucs_status_t status = ucp_ep_create(ucp_worker, &ep_params, &endpoint); + if (status != UCS_OK) { + JNU_ThrowExceptionByStatus(env, status); + } + + return (native_ptr)endpoint; +} + +JNIEXPORT void JNICALL +Java_org_openucx_jucx_ucp_UcpEndpoint_destroyEndpointNative(JNIEnv *env, jclass cls, + jlong ep_ptr) +{ + ucp_ep_destroy((ucp_ep_h)ep_ptr); +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpEndpoint_closeNonBlockingNative(JNIEnv *env, jclass cls, + jlong ep_ptr, jint mode) +{ + ucs_status_ptr_t request = ucp_ep_close_nb((ucp_ep_h)ep_ptr, mode); + + return process_request(request, NULL); +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpEndpoint_unpackRemoteKey(JNIEnv *env, jclass cls, + jlong ep_ptr, jlong addr) +{ + ucp_rkey_h rkey; + + ucs_status_t status = ucp_ep_rkey_unpack((ucp_ep_h)ep_ptr, (void *)addr, &rkey); + if (status != UCS_OK) { + JNU_ThrowExceptionByStatus(env, status); + } + + jobject result = new_rkey_instance(env, rkey); + + /* Coverity thinks 
that rkey is a leaked object here, + * but it's stored in a UcpRemoteKey object */ + /* coverity[leaked_storage] */ + return result; +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpEndpoint_putNonBlockingNative(JNIEnv *env, jclass cls, + jlong ep_ptr, jlong laddr, + jlong size, jlong raddr, + jlong rkey_ptr, jobject callback) +{ + ucs_status_ptr_t request = ucp_put_nb((ucp_ep_h)ep_ptr, (void *)laddr, size, raddr, + (ucp_rkey_h)rkey_ptr, jucx_request_callback); + + ucs_trace_req("JUCX: put_nb request %p, of size: %zu, raddr: %zu", + request, size, raddr); + return process_request(request, callback); +} + +JNIEXPORT void JNICALL +Java_org_openucx_jucx_ucp_UcpEndpoint_putNonBlockingImplicitNative(JNIEnv *env, jclass cls, + jlong ep_ptr, jlong laddr, + jlong size, jlong raddr, + jlong rkey_ptr) +{ + ucs_status_t status = ucp_put_nbi((ucp_ep_h)ep_ptr, (void *)laddr, size, raddr, + (ucp_rkey_h)rkey_ptr); + + if (UCS_STATUS_IS_ERR(status)) { + JNU_ThrowExceptionByStatus(env, status); + } +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpEndpoint_getNonBlockingNative(JNIEnv *env, jclass cls, + jlong ep_ptr, jlong raddr, + jlong rkey_ptr, jlong laddr, + jlong size, jobject callback) +{ + ucs_status_ptr_t request = ucp_get_nb((ucp_ep_h)ep_ptr, (void *)laddr, size, + raddr, (ucp_rkey_h)rkey_ptr, jucx_request_callback); + + ucs_trace_req("JUCX: get_nb request %p, raddr: %zu, size: %zu, result address: %zu", + request, raddr, size, laddr); + return process_request(request, callback); +} + +JNIEXPORT void JNICALL +Java_org_openucx_jucx_ucp_UcpEndpoint_getNonBlockingImplicitNative(JNIEnv *env, jclass cls, + jlong ep_ptr, jlong raddr, + jlong rkey_ptr, jlong laddr, + jlong size) +{ + ucs_status_t status = ucp_get_nbi((ucp_ep_h)ep_ptr, (void *)laddr, size, raddr, + (ucp_rkey_h)rkey_ptr); + + if (UCS_STATUS_IS_ERR(status)) { + JNU_ThrowExceptionByStatus(env, status); + } +} + +JNIEXPORT jobject JNICALL 
+Java_org_openucx_jucx_ucp_UcpEndpoint_sendTaggedNonBlockingNative(JNIEnv *env, jclass cls, + jlong ep_ptr, jlong addr, + jlong size, jlong tag, + jobject callback) +{ + ucs_status_ptr_t request = ucp_tag_send_nb((ucp_ep_h)ep_ptr, (void *)addr, size, + ucp_dt_make_contig(1), tag, jucx_request_callback); + + ucs_trace_req("JUCX: send_tag_nb request %p, size: %zu, tag: %ld", + request, size, tag); + return process_request(request, callback); +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpEndpoint_sendStreamNonBlockingNative(JNIEnv *env, jclass cls, + jlong ep_ptr, jlong addr, + jlong size, jobject callback) +{ + ucs_status_ptr_t request = ucp_stream_send_nb((ucp_ep_h)ep_ptr, (void *)addr, size, + ucp_dt_make_contig(1), jucx_request_callback, 0); + + ucs_trace_req("JUCX: send_stream_nb request %p, size: %zu", request, size); + return process_request(request, callback); +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpEndpoint_recvStreamNonBlockingNative(JNIEnv *env, jclass cls, + jlong ep_ptr, jlong addr, + jlong size, jlong flags, + jobject callback) +{ + size_t rlength; + ucs_status_ptr_t request = ucp_stream_recv_nb((ucp_ep_h)ep_ptr, (void *)addr, size, + ucp_dt_make_contig(1), stream_recv_callback, + &rlength, flags); + + ucs_trace_req("JUCX: recv_stream_nb request %p, size: %zu", request, size); + + if (request == NULL) { + // If request completed immidiately. 
+ return process_completed_stream_recv(rlength, callback); + } + + return process_request(request, callback); +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpEndpoint_flushNonBlockingNative(JNIEnv *env, jclass cls, + jlong ep_ptr, + jobject callback) +{ + ucs_status_ptr_t request = ucp_ep_flush_nb((ucp_ep_h)ep_ptr, 0, jucx_request_callback); + + return process_request(request, callback); +} diff --git a/bindings/java/src/main/native/jucx_common_def.cc b/bindings/java/src/main/native/jucx_common_def.cc index 1b9c76c0bf9..db89149e4d7 100644 --- a/bindings/java/src/main/native/jucx_common_def.cc +++ b/bindings/java/src/main/native/jucx_common_def.cc @@ -4,26 +4,332 @@ */ #include "jucx_common_def.h" +extern "C" { + #include + #include + #include +} + +#include /* inet_addr */ +#include /* setlocale */ +#include /* memset */ + + +static JavaVM *jvm_global; +static jclass jucx_request_cls; +static jfieldID native_id_field; +static jfieldID recv_size_field; +static jmethodID on_success; +static jmethodID jucx_request_constructor; +static jclass ucp_rkey_cls; +static jmethodID ucp_rkey_cls_constructor; +static jclass ucp_tag_msg_cls; +static jmethodID ucp_tag_msg_cls_constructor; + +extern "C" JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *jvm, void* reserved) { + setlocale(LC_NUMERIC, "C"); + ucs_debug_disable_signals(); + jvm_global = jvm; + JNIEnv* env; + if (jvm->GetEnv(reinterpret_cast(&env), JNI_VERSION_1_1) != JNI_OK) { + return JNI_ERR; + } + + jclass jucx_request_cls_local = env->FindClass("org/openucx/jucx/ucp/UcpRequest"); + jucx_request_cls = (jclass) env->NewGlobalRef(jucx_request_cls_local); + jclass jucx_callback_cls = env->FindClass("org/openucx/jucx/UcxCallback"); + native_id_field = env->GetFieldID(jucx_request_cls, "nativeId", "Ljava/lang/Long;"); + recv_size_field = env->GetFieldID(jucx_request_cls, "recvSize", "J"); + on_success = env->GetMethodID(jucx_callback_cls, "onSuccess", + "(Lorg/openucx/jucx/ucp/UcpRequest;)V"); + 
jucx_request_constructor = env->GetMethodID(jucx_request_cls, "", "(J)V"); + + jclass ucp_rkey_cls_local = env->FindClass("org/openucx/jucx/ucp/UcpRemoteKey"); + ucp_rkey_cls = (jclass) env->NewGlobalRef(ucp_rkey_cls_local); + ucp_rkey_cls_constructor = env->GetMethodID(ucp_rkey_cls, "", "(J)V"); + jclass ucp_tag_msg_cls_local = env->FindClass("org/openucx/jucx/ucp/UcpTagMessage"); + ucp_tag_msg_cls = (jclass) env->NewGlobalRef(ucp_tag_msg_cls_local); + ucp_tag_msg_cls_constructor = env->GetMethodID(ucp_tag_msg_cls, "", "(JJJ)V"); + return JNI_VERSION_1_1; +} + +extern "C" JNIEXPORT void JNICALL JNI_OnUnload(JavaVM *jvm, void *reserved) { + JNIEnv* env; + if (jvm->GetEnv(reinterpret_cast(&env), JNI_VERSION_1_1) != JNI_OK) { + return; + } + + if (jucx_request_cls != NULL) { + env->DeleteGlobalRef(jucx_request_cls); + } +} + +bool j2cInetSockAddr(JNIEnv *env, jobject sock_addr, sockaddr_storage& ss, socklen_t& sa_len) +{ + jfieldID field; + memset(&ss, 0, sizeof(ss)); + sa_len = 0; + + if (sock_addr == NULL) { + JNU_ThrowException(env, "j2cInetSockAddr: InetSocketAddr is null"); + return false; + } + + jclass inetsockaddr_cls = env->GetObjectClass(sock_addr); + + // Get sockAddr->port + jmethodID getPort = env->GetMethodID(inetsockaddr_cls, "getPort", "()I"); + jint port = env->CallIntMethod(sock_addr, getPort); + + // Get sockAddr->getAddress (InetAddress) + jmethodID getAddress = env->GetMethodID(inetsockaddr_cls, "getAddress", + "()Ljava/net/InetAddress;"); + jobject inet_address = env->CallObjectMethod(sock_addr, getAddress); + + if (inet_address == NULL) { + JNU_ThrowException(env, "j2cInetSockAddr: InetSocketAddr.getAddress is null"); + return false; + } + jclass inetaddr_cls = env->GetObjectClass(inet_address); -static inline void log_error(const char* error) + // Get address family. In Java IPv4 has addressFamily = 1, IPv6 = 2. 
+ field = env->GetFieldID(inetaddr_cls, "holder", + "Ljava/net/InetAddress$InetAddressHolder;"); + jobject inet_addr_holder = env->GetObjectField(inet_address, field); + jclass inet_addr_holder_cls = env->GetObjectClass(inet_addr_holder); + field = env->GetFieldID(inet_addr_holder_cls, "family", "I"); + jint family = env->GetIntField(inet_addr_holder, field); + + field = env->GetStaticFieldID(inetaddr_cls, "IPv4", "I"); + const int JAVA_IPV4_FAMILY = env->GetStaticIntField(inetaddr_cls, field); + field = env->GetStaticFieldID(inetaddr_cls, "IPv6", "I"); + const int JAVA_IPV6_FAMILY = env->GetStaticIntField(inetaddr_cls, field); + + // Get the byte array that stores the IP address bytes in the InetAddress. + jmethodID get_addr_bytes = env->GetMethodID(inetaddr_cls, "getAddress", "()[B"); + jobject ip_byte_array = env->CallObjectMethod(inet_address, get_addr_bytes); + + if (ip_byte_array == NULL) { + JNU_ThrowException(env, "j2cInetSockAddr: InetAddr.getAddress.getAddress is null"); + return false; + } + + jbyteArray addressBytes = static_cast(ip_byte_array); + + if (family == JAVA_IPV4_FAMILY) { + // Deal with Inet4Address instances. + // We should represent this Inet4Address as an IPv4 sockaddr_in. + ss.ss_family = AF_INET; + sockaddr_in &sin = reinterpret_cast(ss); + sin.sin_port = htons(port); + jbyte *dst = reinterpret_cast(&sin.sin_addr.s_addr); + env->GetByteArrayRegion(addressBytes, 0, 4, dst); + sa_len = sizeof(sockaddr_in); + return true; + } else if (family == JAVA_IPV6_FAMILY) { + jclass inet6_addr_cls = env->FindClass("java/net/Inet6Address"); + ss.ss_family = AF_INET6; + sockaddr_in6& sin6 = reinterpret_cast(ss); + sin6.sin6_port = htons(port); + // IPv6 address. Copy the bytes... + jbyte *dst = reinterpret_cast(&sin6.sin6_addr.s6_addr); + env->GetByteArrayRegion(addressBytes, 0, 16, dst); + // ...and set the scope id... 
+ jmethodID getScopeId = env->GetMethodID(inet6_addr_cls, "getScopeId", "()I"); + sin6.sin6_scope_id = env->CallIntMethod(inet_address, getScopeId); + sa_len = sizeof(sockaddr_in6); + return true; + } + JNU_ThrowException(env, "Unknown InetAddress family"); + return false; +} + +static inline void jucx_context_reset(struct jucx_context* ctx) { - ucs_error("JUCX - %s: %s \n", __FILE__, error); + ctx->callback = NULL; + ctx->jucx_request = NULL; + ctx->status = UCS_INPROGRESS; + ctx->length = 0; } -/** - * Throw a Java exception by name. Similar to SignalError. - */ -JNIEXPORT void JNICALL JNU_ThrowException(JNIEnv *env, const char *msg) +void jucx_request_init(void *request) +{ + struct jucx_context *ctx = (struct jucx_context *)request; + jucx_context_reset(ctx); + ucs_recursive_spinlock_init(&ctx->lock, 0); +} + +JNIEnv* get_jni_env() +{ + void *env; + jint rs = jvm_global->AttachCurrentThread(&env, NULL); + ucs_assert_always(rs == JNI_OK); + return (JNIEnv*)env; +} + +static inline void set_jucx_request_completed(JNIEnv *env, jobject jucx_request, + struct jucx_context *ctx) +{ + env->SetObjectField(jucx_request, native_id_field, NULL); + if ((ctx != NULL) && (ctx->length > 0)) { + env->SetLongField(jucx_request, recv_size_field, ctx->length); + } +} + +static inline void call_on_success(jobject callback, jobject request) +{ + JNIEnv *env = get_jni_env(); + env->CallVoidMethod(callback, on_success, request); +} + +static inline void call_on_error(jobject callback, ucs_status_t status) +{ + if (status == UCS_ERR_CANCELED) { + ucs_debug("JUCX: Request canceled"); + } else { + ucs_error("JUCX: request error: %s", ucs_status_string(status)); + } + + JNIEnv *env = get_jni_env(); + jclass callback_cls = env->GetObjectClass(callback); + jmethodID on_error = env->GetMethodID(callback_cls, "onError", "(ILjava/lang/String;)V"); + jstring error_msg = env->NewStringUTF(ucs_status_string(status)); + env->CallVoidMethod(callback, on_error, status, error_msg); +} + +static 
inline void jucx_call_callback(jobject callback, jobject jucx_request, + ucs_status_t status) +{ + if (status == UCS_OK) { + UCS_PROFILE_CALL_VOID(call_on_success, callback, jucx_request); + } else { + call_on_error(callback, status); + } +} + +UCS_PROFILE_FUNC_VOID(jucx_request_callback, (request, status), void *request, ucs_status_t status) +{ + struct jucx_context *ctx = (struct jucx_context *)request; + ucs_recursive_spin_lock(&ctx->lock); + if (ctx->jucx_request == NULL) { + // here because 1 of 2 reasons: + // 1. progress is in another thread and got here earlier then process_request happened. + // 2. this callback is inside ucp_tag_recv_nb function. + ctx->status = status; + ucs_recursive_spin_unlock(&ctx->lock); + return; + } + + JNIEnv *env = get_jni_env(); + set_jucx_request_completed(env, ctx->jucx_request, ctx); + + if (ctx->callback != NULL) { + jucx_call_callback(ctx->callback, ctx->jucx_request, status); + env->DeleteGlobalRef(ctx->callback); + } + + env->DeleteGlobalRef(ctx->jucx_request); + jucx_context_reset(ctx); + ucp_request_free(request); + ucs_recursive_spin_unlock(&ctx->lock); +} + +void recv_callback(void *request, ucs_status_t status, ucp_tag_recv_info_t *info) +{ + struct jucx_context *ctx = (struct jucx_context *)request; + ctx->length = info->length; + jucx_request_callback(request, status); +} + +void stream_recv_callback(void *request, ucs_status_t status, size_t length) +{ + struct jucx_context *ctx = (struct jucx_context *)request; + ctx->length = length; + jucx_request_callback(request, status); +} + +UCS_PROFILE_FUNC(jobject, process_request, (request, callback), void *request, jobject callback) +{ + JNIEnv *env = get_jni_env(); + jobject jucx_request = env->NewObject(jucx_request_cls, jucx_request_constructor, + (native_ptr)request); + + if (UCS_PTR_IS_PTR(request)) { + struct jucx_context *ctx = (struct jucx_context *)request; + ucs_recursive_spin_lock(&ctx->lock); + if (ctx->status == UCS_INPROGRESS) { + // request not 
completed yet, install user callback + if (callback != NULL) { + ctx->callback = env->NewGlobalRef(callback); + } + ctx->jucx_request = env->NewGlobalRef(jucx_request); + } else { + // request was completed whether by progress in other thread or inside + // ucp_tag_recv_nb function call. + set_jucx_request_completed(env, jucx_request, ctx); + if (callback != NULL) { + jucx_call_callback(callback, jucx_request, ctx->status); + } + jucx_context_reset(ctx); + ucp_request_free(request); + } + ucs_recursive_spin_unlock(&ctx->lock); + } else { + set_jucx_request_completed(env, jucx_request, NULL); + if (UCS_PTR_IS_ERR(request)) { + JNU_ThrowExceptionByStatus(env, UCS_PTR_STATUS(request)); + if (callback != NULL) { + call_on_error(callback, UCS_PTR_STATUS(request)); + } + } else if (callback != NULL) { + call_on_success(callback, jucx_request); + } + } + return jucx_request; +} + +jobject process_completed_stream_recv(size_t length, jobject callback) +{ + JNIEnv *env = get_jni_env(); + jobject jucx_request = env->NewObject(jucx_request_cls, jucx_request_constructor, NULL); + env->SetObjectField(jucx_request, native_id_field, NULL); + env->SetLongField(jucx_request, recv_size_field, length); + if (callback != NULL) { + jucx_call_callback(callback, jucx_request, UCS_OK); + } + return jucx_request; +} + +void jucx_connection_handler(ucp_conn_request_h conn_request, void *arg) +{ + jobject jucx_conn_handler = reinterpret_cast(arg); + + JNIEnv *env = get_jni_env(); + + // Construct connection request class instance + jclass conn_request_cls = env->FindClass("org/openucx/jucx/ucp/UcpConnectionRequest"); + jmethodID conn_request_constructor = env->GetMethodID(conn_request_cls, "", "(J)V"); + jobject jucx_conn_request = env->NewObject(conn_request_cls, conn_request_constructor, + (native_ptr)conn_request); + + // Call onConnectionRequest method + jclass jucx_conn_hndl_cls = env->FindClass("org/openucx/jucx/ucp/UcpListenerConnectionHandler"); + jmethodID on_conn_request = 
env->GetMethodID(jucx_conn_hndl_cls, "onConnectionRequest", + "(Lorg/openucx/jucx/ucp/UcpConnectionRequest;)V"); + env->CallVoidMethod(jucx_conn_handler, on_conn_request, jucx_conn_request); + env->DeleteGlobalRef(jucx_conn_handler); +} + + +jobject new_rkey_instance(JNIEnv *env, ucp_rkey_h rkey) { - jclass cls = env->FindClass("org/ucx/jucx/UcxException"); - log_error(msg); - if (cls != 0) {/* Otherwise an exception has already been thrown */ - env->ThrowNew(cls, msg); - } + return env->NewObject(ucp_rkey_cls, ucp_rkey_cls_constructor, (native_ptr)rkey); } -void JNU_ThrowExceptionByStatus(JNIEnv *env, ucs_status_t status) +jobject new_tag_msg_instance(JNIEnv *env, ucp_tag_message_h msg_tag, + ucp_tag_recv_info_t *info_tag) { - JNU_ThrowException(env, ucs_status_string(status)); + return env->NewObject(ucp_tag_msg_cls, ucp_tag_msg_cls_constructor, + (native_ptr)msg_tag, info_tag->length, info_tag->sender_tag); } diff --git a/bindings/java/src/main/native/jucx_common_def.h b/bindings/java/src/main/native/jucx_common_def.h index 5b7e2ee9fb6..833cc14f387 100644 --- a/bindings/java/src/main/native/jucx_common_def.h +++ b/bindings/java/src/main/native/jucx_common_def.h @@ -7,21 +7,103 @@ #include #include +#include +#include #include typedef uintptr_t native_ptr; -static void log_error(const char* error); +#define JUCX_DEFINE_LONG_CONSTANT(_name) do { \ + jfieldID field = env->GetStaticFieldID(cls, #_name, "J"); \ + if (field != NULL) { \ + env->SetStaticLongField(cls, field, _name); \ + } \ +} while(0) -JNIEXPORT void JNICALL JNU_ThrowException(JNIEnv *, const char *); +#define JUCX_DEFINE_INT_CONSTANT(_name) do { \ + jfieldID field = env->GetStaticFieldID(cls, #_name, "I"); \ + if (field != NULL) { \ + env->SetStaticIntField(cls, field, _name); \ + } \ +} while(0) -void JNU_ThrowExceptionByStatus(JNIEnv *, ucs_status_t); +/** + * Throw a Java exception by name. Similar to SignalError. 
+ */ +#define JNU_ThrowException(_env, _msg) do { \ + jclass _cls = _env->FindClass("org/openucx/jucx/UcxException"); \ + ucs_error("JUCX: %s", _msg); \ + if (_cls != 0) { /* Otherwise an exception has already been thrown */ \ + _env->ThrowNew(_cls, _msg); \ + } \ +} while(0) -#define JUCX_DEFINE_LONG_CONSTANT(_name) do { \ - jfieldID field = env->GetStaticFieldID(cls, #_name, "J"); \ - env->SetStaticLongField(cls, field, _name); \ +#define JNU_ThrowExceptionByStatus(_env, _status) do { \ + JNU_ThrowException(_env, ucs_status_string(_status)); \ } while(0) +/** + * @brief Utility to convert Java InetSocketAddress class (corresponds to the Network Layer 4 + * and consists of an IP address and a port number) to corresponding sockaddr_storage struct. + * Supports IPv4 and IPv6. + */ +bool j2cInetSockAddr(JNIEnv *env, jobject sock_addr, sockaddr_storage& ss, socklen_t& sa_len); + +struct jucx_context { + jobject callback; + volatile jobject jucx_request; + ucs_status_t status; + ucs_recursive_spinlock_t lock; + size_t length; +}; + +void jucx_request_init(void *request); + +/** + * @brief Get the jni env object. To be able to call java methods from ucx async callbacks. + */ +JNIEnv* get_jni_env(); + +/** + * @brief Send callback used to invoke java callback class on completion of ucp operations. + */ +void jucx_request_callback(void *request, ucs_status_t status); + +/** + * @brief Recv callback used to invoke java callback class on completion of ucp tag_recv_nb operation. + */ +void recv_callback(void *request, ucs_status_t status, ucp_tag_recv_info_t *info); + +/** + * @brief Recv callback used to invoke java callback class on completion of ucp stream_recv_nb operation. + */ +void stream_recv_callback(void *request, ucs_status_t status, size_t length); + +/** + * @brief Utility to process request logic: if request is pointer - set callback to request context. + * If request is status - call callback directly. 
+ * Returns jucx_request object, that could be monitored on completion. + */ +jobject process_request(void *request, jobject callback); + +/** + * @brief Call java callback on completed stream recv operation, that didn't invoke callback. + */ +jobject process_completed_stream_recv(size_t length, jobject callback); + +void jucx_connection_handler(ucp_conn_request_h conn_request, void *arg); + +/** + * @brief Creates new jucx rkey class. + */ +jobject new_rkey_instance(JNIEnv *env, ucp_rkey_h rkey); + +/** + * @brief Creates new jucx tag_msg class. + */ +jobject new_tag_msg_instance(JNIEnv *env, ucp_tag_message_h msg_tag, + ucp_tag_recv_info_t *info_tag); + #endif diff --git a/bindings/java/src/main/native/listener.cc b/bindings/java/src/main/native/listener.cc new file mode 100644 index 00000000000..3114e71488f --- /dev/null +++ b/bindings/java/src/main/native/listener.cc @@ -0,0 +1,65 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#include "jucx_common_def.h" +#include "org_openucx_jucx_ucp_UcpListener.h" + +#include /* memset */ + + +JNIEXPORT jlong JNICALL +Java_org_openucx_jucx_ucp_UcpListener_createUcpListener(JNIEnv *env, jclass cls, + jobject ucp_listener_params, + jlong worker_ptr) +{ + ucp_listener_params_t params; + ucp_listener_h listener; + jfieldID field; + ucp_worker_h ucp_worker = (ucp_worker_h)worker_ptr; + + // Get field mask + jclass jucx_listener_param_class = env->GetObjectClass(ucp_listener_params); + field = env->GetFieldID(jucx_listener_param_class, "fieldMask", "J"); + params.field_mask = env->GetLongField(ucp_listener_params, field); + + // Get sockAddr + field = env->GetFieldID(jucx_listener_param_class, + "sockAddr", "Ljava/net/InetSocketAddress;"); + jobject sock_addr = env->GetObjectField(ucp_listener_params, field); + + struct sockaddr_storage listen_addr; + socklen_t addrlen; + memset(&listen_addr, 0, sizeof(struct sockaddr_storage)); + + if (!j2cInetSockAddr(env, sock_addr, listen_addr, addrlen)) { + return -1; + } + + params.sockaddr.addr = (const struct sockaddr*)&listen_addr; + params.sockaddr.addrlen = addrlen; + + if (params.field_mask & UCP_LISTENER_PARAM_FIELD_CONN_HANDLER) { + field = env->GetFieldID(jucx_listener_param_class, + "connectionHandler", "Lorg/openucx/jucx/ucp/UcpListenerConnectionHandler;"); + jobject jucx_conn_handler = env->GetObjectField(ucp_listener_params, field); + params.conn_handler.arg = env->NewGlobalRef(jucx_conn_handler); + params.conn_handler.cb = jucx_connection_handler; + } + + ucs_status_t status = ucp_listener_create(ucp_worker, ¶ms, &listener); + if (status != UCS_OK) { + JNU_ThrowExceptionByStatus(env, status); + } + + return (native_ptr)listener; +} + +JNIEXPORT void JNICALL +Java_org_openucx_jucx_ucp_UcpListener_destroyUcpListenerNative(JNIEnv *env, + jclass cls, + jlong listener_ptr) +{ + ucp_listener_destroy((ucp_listener_h)listener_ptr); +} diff --git a/bindings/java/src/main/native/memory.cc 
b/bindings/java/src/main/native/memory.cc new file mode 100644 index 00000000000..8627aca89b5 --- /dev/null +++ b/bindings/java/src/main/native/memory.cc @@ -0,0 +1,45 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ +#include "jucx_common_def.h" +#include "org_openucx_jucx_ucp_UcpMemory.h" +#include "org_openucx_jucx_ucp_UcpRemoteKey.h" + + +JNIEXPORT void JNICALL +Java_org_openucx_jucx_ucp_UcpMemory_unmapMemoryNative(JNIEnv *env, jclass cls, + jlong context_ptr, jlong mem_ptr) +{ + ucs_status_t status = ucp_mem_unmap((ucp_context_h)context_ptr, (ucp_mem_h)mem_ptr); + if (status != UCS_OK) { + JNU_ThrowExceptionByStatus(env, status); + } +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpMemory_getRkeyBufferNative(JNIEnv *env, jclass cls, + jlong context_ptr, jlong mem_ptr) +{ + void *rkey_buffer; + size_t rkey_size; + + ucs_status_t status = ucp_rkey_pack((ucp_context_h)context_ptr, (ucp_mem_h)mem_ptr, + &rkey_buffer, &rkey_size); + if (status != UCS_OK) { + JNU_ThrowExceptionByStatus(env, status); + } + return env->NewDirectByteBuffer(rkey_buffer, rkey_size); +} + +JNIEXPORT void JNICALL +Java_org_openucx_jucx_ucp_UcpMemory_releaseRkeyBufferNative(JNIEnv *env, jclass cls, jobject rkey_buf) +{ + ucp_rkey_buffer_release(env->GetDirectBufferAddress(rkey_buf)); +} + +JNIEXPORT void JNICALL +Java_org_openucx_jucx_ucp_UcpRemoteKey_rkeyDestroy(JNIEnv *env, jclass cls, jlong rkey_ptr) +{ + ucp_rkey_destroy((ucp_rkey_h) rkey_ptr); +} diff --git a/bindings/java/src/main/native/request.cc b/bindings/java/src/main/native/request.cc new file mode 100644 index 00000000000..d65619b922e --- /dev/null +++ b/bindings/java/src/main/native/request.cc @@ -0,0 +1,23 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#include "org_openucx_jucx_ucp_UcpRequest.h" + +#include +#include + +JNIEXPORT jboolean JNICALL +Java_org_openucx_jucx_ucp_UcpRequest_isCompletedNative(JNIEnv *env, jclass cls, + jlong ucp_req_ptr) +{ + return ucp_request_check_status((void *)ucp_req_ptr) != UCS_INPROGRESS; +} + +JNIEXPORT void JNICALL +Java_org_openucx_jucx_ucp_UcpRequest_closeRequestNative(JNIEnv *env, jclass cls, + jlong ucp_req_ptr) +{ + ucp_request_free((void *)ucp_req_ptr); +} diff --git a/bindings/java/src/main/native/ucp_constants.cc b/bindings/java/src/main/native/ucp_constants.cc index 05ae14efd0d..c156aae4aea 100644 --- a/bindings/java/src/main/native/ucp_constants.cc +++ b/bindings/java/src/main/native/ucp_constants.cc @@ -3,7 +3,7 @@ * See file LICENSE for terms. */ -#include "org_ucx_jucx_ucp_UcpConstants.h" +#include "org_openucx_jucx_ucp_UcpConstants.h" #include "jucx_common_def.h" #include @@ -14,7 +14,7 @@ * */ JNIEXPORT void JNICALL -Java_org_ucx_jucx_ucp_UcpConstants_loadConstants(JNIEnv *env, jclass cls) +Java_org_openucx_jucx_ucp_UcpConstants_loadConstants(JNIEnv *env, jclass cls) { // UCP context parameters JUCX_DEFINE_LONG_CONSTANT(UCP_PARAM_FIELD_FEATURES); @@ -31,7 +31,7 @@ Java_org_ucx_jucx_ucp_UcpConstants_loadConstants(JNIEnv *env, jclass cls) JUCX_DEFINE_LONG_CONSTANT(UCP_FEATURE_WAKEUP); JUCX_DEFINE_LONG_CONSTANT(UCP_FEATURE_STREAM); - // UCP worker parameters. 
+ // UCP worker parameters JUCX_DEFINE_LONG_CONSTANT(UCP_WORKER_PARAM_FIELD_THREAD_MODE); JUCX_DEFINE_LONG_CONSTANT(UCP_WORKER_PARAM_FIELD_CPU_MASK); JUCX_DEFINE_LONG_CONSTANT(UCP_WORKER_PARAM_FIELD_EVENTS); @@ -46,4 +46,42 @@ Java_org_ucx_jucx_ucp_UcpConstants_loadConstants(JNIEnv *env, jclass cls) JUCX_DEFINE_LONG_CONSTANT(UCP_WAKEUP_TX); JUCX_DEFINE_LONG_CONSTANT(UCP_WAKEUP_RX); JUCX_DEFINE_LONG_CONSTANT(UCP_WAKEUP_EDGE); + + // UCP listener parameters field mask + JUCX_DEFINE_LONG_CONSTANT(UCP_LISTENER_PARAM_FIELD_SOCK_ADDR); + JUCX_DEFINE_LONG_CONSTANT(UCP_LISTENER_PARAM_FIELD_ACCEPT_HANDLER); + JUCX_DEFINE_LONG_CONSTANT(UCP_LISTENER_PARAM_FIELD_CONN_HANDLER); + + // UCP endpoint parameters field mask + JUCX_DEFINE_LONG_CONSTANT(UCP_EP_PARAM_FIELD_REMOTE_ADDRESS); + JUCX_DEFINE_LONG_CONSTANT(UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE); + JUCX_DEFINE_LONG_CONSTANT(UCP_EP_PARAM_FIELD_ERR_HANDLER); + JUCX_DEFINE_LONG_CONSTANT(UCP_EP_PARAM_FIELD_USER_DATA); + JUCX_DEFINE_LONG_CONSTANT(UCP_EP_PARAM_FIELD_SOCK_ADDR); + JUCX_DEFINE_LONG_CONSTANT(UCP_EP_PARAM_FIELD_FLAGS); + JUCX_DEFINE_LONG_CONSTANT(UCP_EP_PARAM_FIELD_CONN_REQUEST); + + // UCP error handling mode + JUCX_DEFINE_INT_CONSTANT(UCP_ERR_HANDLING_MODE_PEER); + + // UCP endpoint close non blocking mode. 
+ JUCX_DEFINE_INT_CONSTANT(UCP_EP_CLOSE_MODE_FORCE); + JUCX_DEFINE_INT_CONSTANT(UCP_EP_CLOSE_MODE_FLUSH); + + // The enumeration list describes the endpoint's parameters flags + JUCX_DEFINE_LONG_CONSTANT(UCP_EP_PARAMS_FLAGS_CLIENT_SERVER); + JUCX_DEFINE_LONG_CONSTANT(UCP_EP_PARAMS_FLAGS_NO_LOOPBACK); + + // UCP memory mapping parameters field mask + JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_PARAM_FIELD_ADDRESS); + JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_PARAM_FIELD_LENGTH); + JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_PARAM_FIELD_FLAGS); + + // The enumeration list describes the memory mapping flags + JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_NONBLOCK); + JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_ALLOCATE); + JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_FIXED); + + // The enumeration defines behavior of @ref ucp_stream_recv_nb function + JUCX_DEFINE_LONG_CONSTANT(UCP_STREAM_RECV_FLAG_WAITALL); } diff --git a/bindings/java/src/main/native/ucs_constants.cc b/bindings/java/src/main/native/ucs_constants.cc index 08e39b8afd9..28507b05c66 100644 --- a/bindings/java/src/main/native/ucs_constants.cc +++ b/bindings/java/src/main/native/ucs_constants.cc @@ -3,15 +3,15 @@ * See file LICENSE for terms. 
*/ -#include "org_ucx_jucx_ucs_UcsConstants.h" +#include "org_openucx_jucx_ucs_UcsConstants.h" #include "jucx_common_def.h" #include JNIEXPORT void JNICALL -Java_org_ucx_jucx_ucs_UcsConstants_loadConstants(JNIEnv *env, jclass cls) +Java_org_openucx_jucx_ucs_UcsConstants_loadConstants(JNIEnv *env, jclass cls) { - jclass thread_mode = env->FindClass("org/ucx/jucx/ucs/UcsConstants$ThreadMode"); + jclass thread_mode = env->FindClass("org/openucx/jucx/ucs/UcsConstants$ThreadMode"); jfieldID field = env->GetStaticFieldID(thread_mode, "UCS_THREAD_MODE_MULTI", "I"); env->SetStaticIntField(thread_mode, field, UCS_THREAD_MODE_MULTI); } diff --git a/bindings/java/src/main/native/worker.cc b/bindings/java/src/main/native/worker.cc index 4ce058f96b6..1ca10eb140b 100644 --- a/bindings/java/src/main/native/worker.cc +++ b/bindings/java/src/main/native/worker.cc @@ -4,15 +4,15 @@ */ #include "jucx_common_def.h" -#include "org_ucx_jucx_ucp_UcpWorker.h" +#include "org_openucx_jucx_ucp_UcpWorker.h" /** * Bridge method for creating ucp_worker from java */ JNIEXPORT jlong JNICALL -Java_org_ucx_jucx_ucp_UcpWorker_createWorkerNative(JNIEnv *env, jclass cls, - jobject jucx_worker_params, - jlong context_ptr) +Java_org_openucx_jucx_ucp_UcpWorker_createWorkerNative(JNIEnv *env, jclass cls, + jobject jucx_worker_params, + jlong context_ptr) { ucp_worker_params_t worker_params = { 0 }; ucp_worker_h ucp_worker; @@ -68,8 +68,136 @@ Java_org_ucx_jucx_ucp_UcpWorker_createWorkerNative(JNIEnv *env, jclass cls, } JNIEXPORT void JNICALL -Java_org_ucx_jucx_ucp_UcpWorker_releaseWorkerNative(JNIEnv *env, jclass cls, - jlong ucp_worker_ptr) +Java_org_openucx_jucx_ucp_UcpWorker_releaseWorkerNative(JNIEnv *env, jclass cls, + jlong ucp_worker_ptr) { ucp_worker_destroy((ucp_worker_h)ucp_worker_ptr); } + + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpWorker_workerGetAddressNative(JNIEnv *env, jclass cls, + jlong ucp_worker_ptr) +{ + ucp_address_t *addr; + size_t len; + ucs_status_t status; + + 
status = ucp_worker_get_address((ucp_worker_h)ucp_worker_ptr, &addr, &len); + + if (status != UCS_OK) { + JNU_ThrowExceptionByStatus(env, status); + return NULL; + } + + return env->NewDirectByteBuffer(addr, len); +} + +JNIEXPORT void JNICALL +Java_org_openucx_jucx_ucp_UcpWorker_releaseAddressNative(JNIEnv *env, jclass cls, + jlong ucp_worker_ptr, + jobject ucp_address) +{ + + ucp_worker_release_address((ucp_worker_h)ucp_worker_ptr, + (ucp_address_t *)env->GetDirectBufferAddress(ucp_address)); +} + +JNIEXPORT jint JNICALL +Java_org_openucx_jucx_ucp_UcpWorker_progressWorkerNative(JNIEnv *env, jclass cls, jlong ucp_worker_ptr) +{ + return ucp_worker_progress((ucp_worker_h)ucp_worker_ptr); +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpWorker_flushNonBlockingNative(JNIEnv *env, jclass cls, + jlong ucp_worker_ptr, + jobject callback) +{ + ucs_status_ptr_t request = ucp_worker_flush_nb((ucp_worker_h)ucp_worker_ptr, 0, + jucx_request_callback); + + return process_request(request, callback); +} + +JNIEXPORT void JNICALL +Java_org_openucx_jucx_ucp_UcpWorker_waitWorkerNative(JNIEnv *env, jclass cls, jlong ucp_worker_ptr) +{ + ucs_status_t status = ucp_worker_wait((ucp_worker_h)ucp_worker_ptr); + + if (status != UCS_OK) { + JNU_ThrowExceptionByStatus(env, status); + } +} + +JNIEXPORT void JNICALL +Java_org_openucx_jucx_ucp_UcpWorker_signalWorkerNative(JNIEnv *env, jclass cls, jlong ucp_worker_ptr) +{ + ucs_status_t status = ucp_worker_signal((ucp_worker_h)ucp_worker_ptr); + + if (status != UCS_OK) { + JNU_ThrowExceptionByStatus(env, status); + } +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpWorker_recvTaggedNonBlockingNative(JNIEnv *env, jclass cls, + jlong ucp_worker_ptr, + jlong laddr, jlong size, + jlong tag, jlong tagMask, + jobject callback) +{ + ucs_status_ptr_t request = ucp_tag_recv_nb((ucp_worker_h)ucp_worker_ptr, + (void *)laddr, size, + ucp_dt_make_contig(1), tag, tagMask, + recv_callback); + + ucs_trace_req("JUCX: tag_recv_nb 
request %p, msg size: %zu, tag: %ld", request, size, tag); + + return process_request(request, callback); +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpWorker_tagProbeNonBlockingNative(JNIEnv *env, jclass cls, + jlong ucp_worker_ptr, + jlong tag, jlong tag_mask, + jboolean remove) +{ + ucp_tag_recv_info_t info_tag; + ucp_tag_message_h msg_tag = ucp_tag_probe_nb((ucp_worker_h)ucp_worker_ptr, tag, tag_mask, + remove, &info_tag); + jobject result = NULL; + + if (msg_tag != NULL) { + result = new_tag_msg_instance(env, msg_tag, &info_tag); + } + + return result; +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpWorker_recvTaggedMessageNonBlockingNative(JNIEnv *env, jclass cls, + jlong ucp_worker_ptr, + jlong laddr, jlong size, + jlong msg_ptr, + jobject callback) +{ + ucs_status_ptr_t request = ucp_tag_msg_recv_nb((ucp_worker_h)ucp_worker_ptr, + (void *)laddr, size, + ucp_dt_make_contig(1), + (ucp_tag_message_h)msg_ptr, + recv_callback); + + ucs_trace_req("JUCX: tag_msg_recv_nb request %p, msg size: %zu, msg: %p", request, size, + (ucp_tag_message_h)msg_ptr); + + return process_request(request, callback); +} + +JNIEXPORT void JNICALL +Java_org_openucx_jucx_ucp_UcpWorker_cancelRequestNative(JNIEnv *env, jclass cls, + jlong ucp_worker_ptr, + jlong ucp_request_ptr) +{ + ucp_request_cancel((ucp_worker_h)ucp_worker_ptr, (void *)ucp_request_ptr); +} diff --git a/bindings/java/src/test/java/org/ucx/jucx/UcpContextTest.java b/bindings/java/src/test/java/org/openucx/jucx/UcpContextTest.java similarity index 65% rename from bindings/java/src/test/java/org/ucx/jucx/UcpContextTest.java rename to bindings/java/src/test/java/org/openucx/jucx/UcpContextTest.java index d59d33a266f..8450604083d 100644 --- a/bindings/java/src/test/java/org/ucx/jucx/UcpContextTest.java +++ b/bindings/java/src/test/java/org/openucx/jucx/UcpContextTest.java @@ -3,15 +3,15 @@ * See file LICENSE for terms. 
*/ -package org.ucx.jucx; +package org.openucx.jucx; import org.junit.Test; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; -import org.ucx.jucx.ucp.UcpContext; -import org.ucx.jucx.ucp.UcpParams; +import org.openucx.jucx.ucp.UcpContext; +import org.openucx.jucx.ucp.UcpParams; public class UcpContextTest { @@ -40,6 +40,25 @@ public void testCreateUcpContextRdma() { UcpContext context = createContext(contextParams); closeContext(context); } + + @Test + public void testConfigMap() { + UcpParams contextParams = new UcpParams().requestTagFeature() + .setConfig("TLS", "abcd").setConfig("NOT_EXISTING_", "234"); + boolean catched = false; + try { + createContext(contextParams); + } catch (UcxException exception) { + assertEquals("No such device", exception.getMessage()); + catched = true; + } + assertTrue(catched); + + // Return back original config + contextParams = new UcpParams().requestTagFeature().setConfig("TLS", "all"); + UcpContext context = createContext(contextParams); + closeContext(context); + } @Test(expected = NullPointerException.class) public void testCatchJVMSignal() { diff --git a/bindings/java/src/test/java/org/openucx/jucx/UcpEndpointTest.java b/bindings/java/src/test/java/org/openucx/jucx/UcpEndpointTest.java new file mode 100644 index 00000000000..ae5d4b080a0 --- /dev/null +++ b/bindings/java/src/test/java/org/openucx/jucx/UcpEndpointTest.java @@ -0,0 +1,499 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +package org.openucx.jucx; + +import org.junit.Test; +import org.openucx.jucx.ucp.*; + +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.HashMap; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.junit.Assert.*; + +public class UcpEndpointTest extends UcxTest { + @Test + public void testConnectToListenerByWorkerAddr() { + UcpContext context = new UcpContext(new UcpParams().requestStreamFeature()); + UcpWorker worker = context.newWorker(new UcpWorkerParams()); + UcpEndpointParams epParams = new UcpEndpointParams().setUcpAddress(worker.getAddress()) + .setPeerErrorHandlingMode().setNoLoopbackMode(); + UcpEndpoint endpoint = worker.newEndpoint(epParams); + assertNotNull(endpoint.getNativeId()); + + Collections.addAll(resources, context, worker, endpoint); + closeResources(); + } + + @Test + public void testGetNB() { + // Crerate 2 contexts + 2 workers + UcpParams params = new UcpParams().requestRmaFeature(); + UcpWorkerParams rdmaWorkerParams = new UcpWorkerParams().requestWakeupRMA(); + UcpContext context1 = new UcpContext(params); + UcpContext context2 = new UcpContext(params); + UcpWorker worker1 = context1.newWorker(rdmaWorkerParams); + UcpWorker worker2 = context2.newWorker(rdmaWorkerParams); + + // Create endpoint worker1 -> worker2 + UcpEndpointParams epParams = new UcpEndpointParams().setPeerErrorHandlingMode() + .setUcpAddress(worker2.getAddress()); + UcpEndpoint endpoint = worker1.newEndpoint(epParams); + + // Allocate 2 source and 2 destination buffers, to perform 2 RDMA Read operations + ByteBuffer src1 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + ByteBuffer src2 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + ByteBuffer dst1 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + ByteBuffer dst2 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + src1.asCharBuffer().put(UcpMemoryTest.RANDOM_TEXT); + 
src2.asCharBuffer().put(UcpMemoryTest.RANDOM_TEXT + UcpMemoryTest.RANDOM_TEXT); + + // Register source buffers on context2 + UcpMemory memory1 = context2.registerMemory(src1); + UcpMemory memory2 = context2.registerMemory(src2); + + UcpRemoteKey rkey1 = endpoint.unpackRemoteKey(memory1.getRemoteKeyBuffer()); + UcpRemoteKey rkey2 = endpoint.unpackRemoteKey(memory2.getRemoteKeyBuffer()); + + AtomicInteger numCompletedRequests = new AtomicInteger(0); + HashMap requestToData = new HashMap<>(); + UcxCallback callback = new UcxCallback() { + @Override + public void onSuccess(UcpRequest request) { + // Here thread safety is guaranteed since worker progress is called after + // request added to map. In multithreaded environment could be an issue that + // callback is called, but request wasn't added yet to map. + if (requestToData.get(request) == dst1) { + assertEquals(UcpMemoryTest.RANDOM_TEXT, dst1.asCharBuffer().toString().trim()); + memory1.deregister(); + } else { + assertEquals(UcpMemoryTest.RANDOM_TEXT + UcpMemoryTest.RANDOM_TEXT, + dst2.asCharBuffer().toString().trim()); + memory2.deregister(); + } + numCompletedRequests.incrementAndGet(); + } + }; + + // Submit 2 get requests + UcpRequest request1 = endpoint.getNonBlocking(memory1.getAddress(), rkey1, dst1, callback); + UcpRequest request2 = endpoint.getNonBlocking(memory2.getAddress(), rkey2, dst2, callback); + + // Map each request to corresponding data buffer. 
+ requestToData.put(request1, dst1); + requestToData.put(request2, dst2); + + // Wait for 2 get operations to complete + while (numCompletedRequests.get() != 2) { + worker1.progress(); + worker2.progress(); + } + + assertTrue(request1.isCompleted() && request2.isCompleted()); + + Collections.addAll(resources, context2, context1, worker2, worker1, endpoint, rkey2, + rkey1); + closeResources(); + } + + @Test + public void testPutNB() { + // Crerate 2 contexts + 2 workers + UcpParams params = new UcpParams().requestRmaFeature(); + UcpWorkerParams rdmaWorkerParams = new UcpWorkerParams().requestWakeupRMA(); + UcpContext context1 = new UcpContext(params); + UcpContext context2 = new UcpContext(params); + UcpWorker worker1 = context1.newWorker(rdmaWorkerParams); + UcpWorker worker2 = context2.newWorker(rdmaWorkerParams); + + ByteBuffer src = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + ByteBuffer dst = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + src.asCharBuffer().put(UcpMemoryTest.RANDOM_TEXT); + + // Register destination buffer on context2 + UcpMemory memory = context2.registerMemory(dst); + UcpEndpoint ep = + worker1.newEndpoint(new UcpEndpointParams().setUcpAddress(worker2.getAddress())); + + UcpRemoteKey rkey = ep.unpackRemoteKey(memory.getRemoteKeyBuffer()); + ep.putNonBlocking(src, memory.getAddress(), rkey, null); + + worker1.progressRequest(worker1.flushNonBlocking(null)); + + assertEquals(UcpMemoryTest.RANDOM_TEXT, dst.asCharBuffer().toString().trim()); + + Collections.addAll(resources, context2, context1, worker2, worker1, rkey, ep, memory); + closeResources(); + } + + @Test + public void testSendRecv() throws Exception { + // Crerate 2 contexts + 2 workers + UcpParams params = new UcpParams().requestRmaFeature().requestTagFeature(); + UcpWorkerParams rdmaWorkerParams = new UcpWorkerParams().requestWakeupRMA(); + UcpContext context1 = new UcpContext(params); + UcpContext context2 = new UcpContext(params); + UcpWorker worker1 = 
context1.newWorker(rdmaWorkerParams); + UcpWorker worker2 = context2.newWorker(rdmaWorkerParams); + + // Allocate 2 source and 2 destination buffers, to perform 2 RDMA Read operations + UcpMemMapParams allocationParams = new UcpMemMapParams().allocate() + .setLength(UcpMemoryTest.MEM_SIZE); + UcpMemory memory1 = context1.memoryMap(allocationParams); + UcpMemory memory2 = context1.memoryMap(allocationParams); + ByteBuffer src1 = UcxUtils.getByteBufferView(memory1.getAddress(), UcpMemoryTest.MEM_SIZE); + ByteBuffer src2 = UcxUtils.getByteBufferView(memory1.getAddress(), UcpMemoryTest.MEM_SIZE); + ByteBuffer dst1 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + ByteBuffer dst2 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + src1.asCharBuffer().put(UcpMemoryTest.RANDOM_TEXT); + src2.asCharBuffer().put(UcpMemoryTest.RANDOM_TEXT + UcpMemoryTest.RANDOM_TEXT); + + AtomicInteger receivedMessages = new AtomicInteger(0); + worker2.recvTaggedNonBlocking(dst1, 0, 0, new UcxCallback() { + @Override + public void onSuccess(UcpRequest request) { + assertEquals(dst1, src1); + receivedMessages.incrementAndGet(); + } + }); + + worker2.recvTaggedNonBlocking(dst2, 1, -1, new UcxCallback() { + @Override + public void onSuccess(UcpRequest request) { + assertEquals(dst2, src2); + receivedMessages.incrementAndGet(); + } + }); + + UcpEndpoint ep = worker1.newEndpoint(new UcpEndpointParams() + .setUcpAddress(worker2.getAddress())); + + ep.sendTaggedNonBlocking(src1, 0, null); + ep.sendTaggedNonBlocking(src2, 1, null); + + while (receivedMessages.get() != 2) { + worker1.progress(); + worker2.progress(); + } + + Collections.addAll(resources, context2, context1, worker2, worker1, memory2, memory1, ep); + closeResources(); + } + + @Test + public void testRecvAfterSend() { + // Crerate 2 contexts + 2 workers + UcpParams params = new UcpParams().requestRmaFeature().requestTagFeature() + .setMtWorkersShared(true); + UcpWorkerParams rdmaWorkerParams = new 
UcpWorkerParams().requestWakeupRMA() + .requestThreadSafety(); + UcpContext context1 = new UcpContext(params); + UcpContext context2 = new UcpContext(params); + UcpWorker worker1 = context1.newWorker(rdmaWorkerParams); + UcpWorker worker2 = context2.newWorker(rdmaWorkerParams); + + UcpEndpoint ep = worker1.newEndpoint(new UcpEndpointParams() + .setPeerErrorHandlingMode() + .setUcpAddress(worker2.getAddress())); + + ByteBuffer src1 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + ByteBuffer dst1 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + + ep.sendTaggedNonBlocking(src1, 0, null); + + Thread progressThread = new Thread() { + @Override + public void run() { + while (!isInterrupted()) { + worker1.progress(); + worker2.progress(); + } + } + }; + + progressThread.setDaemon(true); + progressThread.start(); + + try { + Thread.sleep(5); + } catch (InterruptedException ignored) { } + + AtomicBoolean success = new AtomicBoolean(false); + + worker2.recvTaggedNonBlocking(dst1, 0, -1, new UcxCallback() { + @Override + public void onSuccess(UcpRequest request) { + assertEquals(UcpMemoryTest.MEM_SIZE, request.getRecvSize()); + success.set(true); + } + }); + + try { + int count = 0; + while ((++count < 100) && !success.get()) { + Thread.sleep(50); + } + } catch (InterruptedException ignored) { } + + assertTrue(success.get()); + UcpRequest closeRequest = ep.closeNonBlockingForce(); + + while (!closeRequest.isCompleted()) { + try { + // Wait until progress thread will close the endpoint. 
+ Thread.sleep(10); + } catch (InterruptedException e) { + e.printStackTrace(); + } finally { + closeRequest.close(); + } + } + + progressThread.interrupt(); + try { + progressThread.join(); + } catch (InterruptedException ignored) { } + + Collections.addAll(resources, context1, context2, worker1, worker2); + closeResources(); + } + + @Test + public void testBufferOffset() { + int msgSize = 200; + int offset = 100; + // Crerate 2 contexts + 2 workers + UcpParams params = new UcpParams().requestTagFeature(); + UcpWorkerParams rdmaWorkerParams = new UcpWorkerParams().requestWakeupRMA(); + UcpContext context1 = new UcpContext(params); + UcpContext context2 = new UcpContext(params); + UcpWorker worker1 = context1.newWorker(rdmaWorkerParams); + UcpWorker worker2 = context2.newWorker(rdmaWorkerParams); + + ByteBuffer bigRecvBuffer = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + ByteBuffer bigSendBuffer = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + + bigRecvBuffer.position(offset).limit(offset + msgSize); + UcpRequest recv = worker1.recvTaggedNonBlocking(bigRecvBuffer, 0, + 0, null); + + UcpEndpoint ep = worker2.newEndpoint(new UcpEndpointParams() + .setUcpAddress(worker1.getAddress())); + + byte[] msg = new byte[msgSize]; + for (int i = 0; i < msgSize; i++) { + msg[i] = (byte)i; + } + + bigSendBuffer.position(offset).limit(offset + msgSize); + bigSendBuffer.put(msg); + bigSendBuffer.position(offset); + + UcpRequest sent = ep.sendTaggedNonBlocking(bigSendBuffer, 0, null); + + while (!sent.isCompleted() || !recv.isCompleted()) { + worker1.progress(); + worker2.progress(); + } + + bigSendBuffer.position(offset).limit(offset + msgSize); + bigRecvBuffer.position(offset).limit(offset + msgSize); + final ByteBuffer sendData = bigSendBuffer.slice(); + final ByteBuffer recvData = bigRecvBuffer.slice(); + assertEquals("Send buffer not equals to recv buffer", sendData, recvData); + + Collections.addAll(resources, context2, context1, worker2, worker1, ep); + 
closeResources(); + } + + @Test + public void testFlushEp() { + int numRequests = 10; + // Crerate 2 contexts + 2 workers + UcpParams params = new UcpParams().requestRmaFeature(); + UcpWorkerParams rdmaWorkerParams = new UcpWorkerParams().requestWakeupRMA(); + UcpContext context1 = new UcpContext(params); + UcpContext context2 = new UcpContext(params); + + ByteBuffer src = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + src.asCharBuffer().put(UcpMemoryTest.RANDOM_TEXT); + ByteBuffer dst = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + UcpMemory memory = context2.registerMemory(src); + + UcpWorker worker1 = context1.newWorker(rdmaWorkerParams); + UcpWorker worker2 = context2.newWorker(rdmaWorkerParams); + + UcpEndpoint ep = worker1.newEndpoint(new UcpEndpointParams() + .setUcpAddress(worker2.getAddress()).setPeerErrorHandlingMode()); + UcpRemoteKey rkey = ep.unpackRemoteKey(memory.getRemoteKeyBuffer()); + + int blockSize = UcpMemoryTest.MEM_SIZE / numRequests; + for (int i = 0; i < numRequests; i++) { + ep.getNonBlockingImplicit(memory.getAddress() + i * blockSize, rkey, + UcxUtils.getAddress(dst) + i * blockSize, blockSize); + } + + UcpRequest request = ep.flushNonBlocking(new UcxCallback() { + @Override + public void onSuccess(UcpRequest request) { + rkey.close(); + memory.deregister(); + assertEquals(dst.asCharBuffer().toString().trim(), UcpMemoryTest.RANDOM_TEXT); + } + }); + + while (!request.isCompleted()) { + worker1.progress(); + worker2.progress(); + } + + Collections.addAll(resources, context2, context1, worker2, worker1, ep); + closeResources(); + } + + @Test + public void testRecvSize() { + UcpContext context1 = new UcpContext(new UcpParams().requestTagFeature()); + UcpContext context2 = new UcpContext(new UcpParams().requestTagFeature()); + + UcpWorker worker1 = context1.newWorker(new UcpWorkerParams()); + UcpWorker worker2 = context2.newWorker(new UcpWorkerParams()); + + UcpEndpoint ep = worker1.newEndpoint( + new 
UcpEndpointParams().setUcpAddress(worker2.getAddress())); + + ByteBuffer sendBuffer = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + ByteBuffer recvBuffer = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + + sendBuffer.limit(UcpMemoryTest.MEM_SIZE / 2); + + UcpRequest send = ep.sendTaggedNonBlocking(sendBuffer, null); + UcpRequest recv = worker2.recvTaggedNonBlocking(recvBuffer, null); + + while (!send.isCompleted() || !recv.isCompleted()) { + worker1.progress(); + worker2.progress(); + } + + assertEquals(UcpMemoryTest.MEM_SIZE / 2, recv.getRecvSize()); + + Collections.addAll(resources, context1, context2, worker1, worker2, ep); + closeResources(); + } + + @Test + public void testStreamingAPI() { + UcpParams params = new UcpParams().requestStreamFeature().requestRmaFeature(); + UcpContext context1 = new UcpContext(params); + UcpContext context2 = new UcpContext(params); + + UcpWorker worker1 = context1.newWorker(new UcpWorkerParams()); + UcpWorker worker2 = context2.newWorker(new UcpWorkerParams()); + + UcpEndpoint clientToServer = worker1.newEndpoint( + new UcpEndpointParams().setUcpAddress(worker2.getAddress())); + + UcpEndpoint serverToClient = worker2.newEndpoint( + new UcpEndpointParams().setUcpAddress(worker1.getAddress())); + + ByteBuffer sendBuffer = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + sendBuffer.put(0, (byte)1); + ByteBuffer recvBuffer = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE * 2); + + UcpRequest[] sends = new UcpRequest[2]; + + sends[0] = clientToServer.sendStreamNonBlocking(sendBuffer, new UcxCallback() { + @Override + public void onSuccess(UcpRequest request) { + sendBuffer.put(0, (byte)2); + sends[1] = clientToServer.sendStreamNonBlocking(sendBuffer, null); + } + }); + + while (sends[1] == null || !sends[1].isCompleted()) { + worker1.progress(); + worker2.progress(); + } + + AtomicBoolean received = new AtomicBoolean(false); + serverToClient.recvStreamNonBlocking( + UcxUtils.getAddress(recvBuffer), 
UcpMemoryTest.MEM_SIZE * 2, + UcpConstants.UCP_STREAM_RECV_FLAG_WAITALL, + new UcxCallback() { + @Override + public void onSuccess(UcpRequest request) { + assertEquals(request.getRecvSize(), UcpMemoryTest.MEM_SIZE * 2); + assertEquals((byte)1, recvBuffer.get(0)); + assertEquals((byte)2, recvBuffer.get(UcpMemoryTest.MEM_SIZE)); + received.set(true); + } + }); + + while (!received.get()) { + worker1.progress(); + worker2.progress(); + } + + Collections.addAll(resources, context1, context2, worker1, worker2, clientToServer, + serverToClient); + closeResources(); + } + + @Test + public void testEpErrorHandler() { + // Crerate 2 contexts + 2 workers + UcpParams params = new UcpParams().requestTagFeature(); + UcpWorkerParams workerParams = new UcpWorkerParams(); + UcpContext context1 = new UcpContext(params); + UcpContext context2 = new UcpContext(params); + UcpWorker worker1 = context1.newWorker(workerParams); + UcpWorker worker2 = context2.newWorker(workerParams); + + ByteBuffer src = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + ByteBuffer dst = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + src.asCharBuffer().put(UcpMemoryTest.RANDOM_TEXT); + + AtomicBoolean errorHandlerCalled = new AtomicBoolean(false); + UcpEndpointParams epParams = new UcpEndpointParams() + .setPeerErrorHandlingMode() + .setErrorHandler((ep, status, errorMsg) -> errorHandlerCalled.set(true)) + .setUcpAddress(worker2.getAddress()); + UcpEndpoint ep = + worker1.newEndpoint(epParams); + + UcpRequest recv = worker2.recvTaggedNonBlocking(dst, null); + UcpRequest send = ep.sendTaggedNonBlocking(src, null); + + while (!send.isCompleted() || !recv.isCompleted()) { + worker1.progress(); + worker2.progress(); + } + + // Closing receiver worker & context + worker2.close(); + context2.close(); + assertNull(context2.getNativeId()); + + AtomicBoolean errorCallabackCalled = new AtomicBoolean(false); + + ep.sendTaggedNonBlocking(src, null); + worker1.progressRequest(ep.flushNonBlocking(new 
UcxCallback() { + @Override + public void onError(int ucsStatus, String errorMsg) { + errorCallabackCalled.set(true); + } + })); + + assertTrue(errorHandlerCalled.get()); + assertTrue(errorCallabackCalled.get()); + + ep.close(); + worker1.close(); + context1.close(); + } +} diff --git a/bindings/java/src/test/java/org/openucx/jucx/UcpListenerTest.java b/bindings/java/src/test/java/org/openucx/jucx/UcpListenerTest.java new file mode 100644 index 00000000000..658a6019700 --- /dev/null +++ b/bindings/java/src/test/java/org/openucx/jucx/UcpListenerTest.java @@ -0,0 +1,154 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ +package org.openucx.jucx; + +import org.junit.Test; +import org.openucx.jucx.ucp.*; + +import java.net.InetAddress; +import java.net.InetSocketAddress; +import java.net.NetworkInterface; +import java.net.SocketException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.Assert.*; + +public class UcpListenerTest extends UcxTest { + static final int port = Integer.parseInt( + System.getenv().getOrDefault("JUCX_TEST_PORT", "55321")); + + @Test + public void testCreateUcpListener() { + UcpContext context = new UcpContext(new UcpParams().requestStreamFeature()); + UcpWorker worker = context.newWorker(new UcpWorkerParams()); + InetSocketAddress ipv4 = new InetSocketAddress("0.0.0.0", port); + try { + UcpListener ipv4Listener = worker.newListener( + new UcpListenerParams().setSockAddr(ipv4)); + + assertNotNull(ipv4Listener); + ipv4Listener.close(); + } catch (UcxException ignored) { } + + try { + InetSocketAddress ipv6 = new InetSocketAddress("::", port); + UcpListener ipv6Listener = worker.newListener( + new UcpListenerParams().setSockAddr(ipv6)); + + assertNotNull(ipv6Listener); + ipv6Listener.close(); + } 
catch (UcxException ignored) { } + + worker.close(); + context.close(); + } + + static Stream getInterfaces() { + try { + return Collections.list(NetworkInterface.getNetworkInterfaces()).stream() + .filter(iface -> { + try { + return iface.isUp() && !iface.isLoopback(); + } catch (SocketException e) { + return false; + } + }); + } catch (SocketException e) { + return Stream.empty(); + } + } + + /** + * Iterates over network interfaces and tries to bind and create listener + * on a specific socket address. + */ + static UcpListener tryBindListener(UcpWorker worker, UcpListenerParams params) { + UcpListener result = null; + List addresses = getInterfaces().flatMap(iface -> + Collections.list(iface.getInetAddresses()).stream()) + .collect(Collectors.toList()); + for (InetAddress address : addresses) { + try { + result = worker.newListener( + params.setSockAddr(new InetSocketAddress(address, port))); + break; + } catch (UcxException ignored) { } + } + assertNotNull("Could not find socket address to start UcpListener", result); + return result; + } + + @Test + public void testConnectionHandler() { + UcpContext context1 = new UcpContext(new UcpParams().requestStreamFeature() + .requestRmaFeature()); + UcpContext context2 = new UcpContext(new UcpParams().requestStreamFeature() + .requestRmaFeature()); + UcpWorker serverWorker1 = context1.newWorker(new UcpWorkerParams()); + UcpWorker serverWorker2 = context1.newWorker(new UcpWorkerParams()); + UcpWorker clientWorker = context2.newWorker(new UcpWorkerParams()); + + AtomicReference conRequest = new AtomicReference<>(null); + + // Create listener and set connection handler + UcpListenerParams listenerParams = new UcpListenerParams() + .setConnectionHandler(conRequest::set); + UcpListener listener = tryBindListener(serverWorker1, listenerParams); + + UcpEndpoint clientToServer = clientWorker.newEndpoint(new UcpEndpointParams() + .setSocketAddress(listener.getAddress())); + + while (conRequest.get() == null) { + 
serverWorker1.progress(); + clientWorker.progress(); + } + + // Create endpoint from another worker from pool. + UcpEndpoint serverToClient = serverWorker2.newEndpoint( + new UcpEndpointParams().setConnectionRequest(conRequest.get())); + + // Temporary workaround until new connection establishment protocol in UCX. + for (int i = 0; i < 10; i++) { + serverWorker1.progress(); + serverWorker2.progress(); + clientWorker.progress(); + try { + Thread.sleep(10); + } catch (Exception ignored) { } + } + + UcpRequest sent = serverToClient.sendStreamNonBlocking( + ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE), null); + + // Progress all workers to make sure recv request will complete immediately + for (int i = 0; i < 10; i++) { + serverWorker1.progress(); + serverWorker2.progress(); + clientWorker.progress(); + try { + Thread.sleep(2); + } catch (Exception ignored) { } + } + + UcpRequest recv = clientToServer.recvStreamNonBlocking( + ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE), 0, null); + + while (!sent.isCompleted() || !recv.isCompleted()) { + serverWorker1.progress(); + clientWorker.progress(); + } + + assertEquals(UcpMemoryTest.MEM_SIZE, recv.getRecvSize()); + + Collections.addAll(resources, context2, context1, clientWorker, serverWorker1, + serverWorker2, listener, serverToClient, clientToServer); + closeResources(); + } +} diff --git a/bindings/java/src/test/java/org/openucx/jucx/UcpMemoryTest.java b/bindings/java/src/test/java/org/openucx/jucx/UcpMemoryTest.java new file mode 100644 index 00000000000..01668d003ee --- /dev/null +++ b/bindings/java/src/test/java/org/openucx/jucx/UcpMemoryTest.java @@ -0,0 +1,81 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ +package org.openucx.jucx; + +import org.junit.Test; + +import org.openucx.jucx.ucp.*; + +import java.nio.ByteBuffer; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.UUID; + +import static java.nio.file.StandardOpenOption.*; +import static org.junit.Assert.*; + +public class UcpMemoryTest { + static int MEM_SIZE = 4096; + static String RANDOM_TEXT = UUID.randomUUID().toString(); + + @Test + public void testMmapFile() throws Exception { + UcpContext context = new UcpContext(new UcpParams().requestTagFeature()); + Path tempFile = Files.createTempFile("jucx", "test"); + // 1. Create FileChannel to file in tmp directory. + FileChannel fileChannel = FileChannel.open(tempFile, CREATE, WRITE, READ, DELETE_ON_CLOSE); + MappedByteBuffer buf = fileChannel.map(FileChannel.MapMode.READ_WRITE, 0, MEM_SIZE); + buf.asCharBuffer().put(RANDOM_TEXT); + buf.force(); + // 2. Register mmap buffer with ODP + UcpMemory mmapedMemory = context.memoryMap(new UcpMemMapParams() + .setAddress(UcxUtils.getAddress(buf)).setLength(MEM_SIZE).nonBlocking()); + + assertEquals(mmapedMemory.getAddress(), UcxUtils.getAddress(buf)); + + // 3. 
Test allocation + UcpMemory allocatedMemory = context.memoryMap(new UcpMemMapParams() + .allocate().setLength(MEM_SIZE).nonBlocking()); + assertEquals(allocatedMemory.getLength(), MEM_SIZE); + + allocatedMemory.deregister(); + mmapedMemory.deregister(); + fileChannel.close(); + context.close(); + } + + @Test + public void testGetRkey() { + UcpContext context = new UcpContext(new UcpParams().requestRmaFeature()); + ByteBuffer buf = ByteBuffer.allocateDirect(MEM_SIZE); + UcpMemory mem = context.registerMemory(buf); + ByteBuffer rkeyBuffer = mem.getRemoteKeyBuffer(); + assertTrue(rkeyBuffer.capacity() > 0); + assertTrue(mem.getAddress() > 0); + mem.deregister(); + context.close(); + } + + @Test + public void testRemoteKeyUnpack() { + UcpContext context = new UcpContext(new UcpParams().requestRmaFeature()); + UcpWorker worker1 = new UcpWorker(context, new UcpWorkerParams()); + UcpWorker worker2 = new UcpWorker(context, new UcpWorkerParams()); + UcpEndpoint endpoint = new UcpEndpoint(worker1, + new UcpEndpointParams().setUcpAddress(worker2.getAddress())); + ByteBuffer buf = ByteBuffer.allocateDirect(MEM_SIZE); + UcpMemory mem = context.registerMemory(buf); + UcpRemoteKey rkey = endpoint.unpackRemoteKey(mem.getRemoteKeyBuffer()); + assertNotNull(rkey.getNativeId()); + rkey.close(); + mem.deregister(); + endpoint.close(); + worker1.close(); + worker2.close(); + context.close(); + } +} diff --git a/bindings/java/src/test/java/org/openucx/jucx/UcpRequestTest.java b/bindings/java/src/test/java/org/openucx/jucx/UcpRequestTest.java new file mode 100644 index 00000000000..0ac1fc6327c --- /dev/null +++ b/bindings/java/src/test/java/org/openucx/jucx/UcpRequestTest.java @@ -0,0 +1,31 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ +package org.openucx.jucx; + +import org.junit.Test; +import org.openucx.jucx.ucp.*; + +import java.nio.ByteBuffer; +import static org.junit.Assert.*; + +public class UcpRequestTest { + @Test + public void testCancelRequest() { + UcpContext context = new UcpContext(new UcpParams().requestTagFeature()); + UcpWorker worker = context.newWorker(new UcpWorkerParams()); + UcpRequest recv = worker.recvTaggedNonBlocking(ByteBuffer.allocateDirect(100), null); + worker.cancelRequest(recv); + + while (!recv.isCompleted()) { + worker.progress(); + } + + assertTrue(recv.isCompleted()); + assertNull(recv.getNativeId()); + + worker.close(); + context.close(); + } +} diff --git a/bindings/java/src/test/java/org/openucx/jucx/UcpWorkerTest.java b/bindings/java/src/test/java/org/openucx/jucx/UcpWorkerTest.java new file mode 100644 index 00000000000..d896898a038 --- /dev/null +++ b/bindings/java/src/test/java/org/openucx/jucx/UcpWorkerTest.java @@ -0,0 +1,203 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +package org.openucx.jucx; + +import org.junit.Test; +import org.openucx.jucx.ucp.*; +import org.openucx.jucx.ucs.UcsConstants; + +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.concurrent.atomic.AtomicBoolean; + +import static org.junit.Assert.*; + +public class UcpWorkerTest extends UcxTest { + private static int numWorkers = Runtime.getRuntime().availableProcessors(); + + @Test + public void testSingleWorker() { + UcpContext context = new UcpContext(new UcpParams().requestTagFeature()); + assertEquals(2, UcsConstants.ThreadMode.UCS_THREAD_MODE_MULTI); + assertNotEquals(context.getNativeId(), null); + UcpWorker worker = context.newWorker(new UcpWorkerParams()); + assertNotNull(worker.getNativeId()); + assertEquals(0, worker.progress()); // No communications was submitted. 
+ worker.close(); + assertNull(worker.getNativeId()); + context.close(); + } + + @Test + public void testMultipleWorkersWithinSameContext() { + UcpContext context = new UcpContext(new UcpParams().requestTagFeature()); + assertNotEquals(context.getNativeId(), null); + UcpWorker[] workers = new UcpWorker[numWorkers]; + UcpWorkerParams workerParam = new UcpWorkerParams(); + for (int i = 0; i < numWorkers; i++) { + workerParam.clear().setCpu(i).requestThreadSafety(); + workers[i] = context.newWorker(workerParam); + assertNotNull(workers[i].getNativeId()); + } + for (int i = 0; i < numWorkers; i++) { + workers[i].close(); + } + context.close(); + } + + @Test + public void testMultipleWorkersFromMultipleContexts() { + UcpContext tcpContext = new UcpContext(new UcpParams().requestTagFeature()); + UcpContext rdmaContext = new UcpContext(new UcpParams().requestRmaFeature() + .requestAtomic64BitFeature().requestAtomic32BitFeature()); + UcpWorker[] workers = new UcpWorker[numWorkers]; + UcpWorkerParams workerParams = new UcpWorkerParams(); + for (int i = 0; i < numWorkers; i++) { + ByteBuffer userData = ByteBuffer.allocateDirect(100); + workerParams.clear(); + if (i % 2 == 0) { + userData.asCharBuffer().put("TCPWorker" + i); + workerParams.requestWakeupRX().setUserData(userData); + workers[i] = tcpContext.newWorker(workerParams); + } else { + userData.asCharBuffer().put("RDMAWorker" + i); + workerParams.requestWakeupRMA().setCpu(i).setUserData(userData) + .requestThreadSafety(); + workers[i] = rdmaContext.newWorker(workerParams); + } + } + for (int i = 0; i < numWorkers; i++) { + workers[i].close(); + } + tcpContext.close(); + rdmaContext.close(); + } + + @Test + public void testGetWorkerAddress() { + UcpContext context = new UcpContext(new UcpParams().requestTagFeature()); + UcpWorker worker = context.newWorker(new UcpWorkerParams()); + ByteBuffer workerAddress = worker.getAddress(); + assertNotNull(workerAddress); + assertTrue(workerAddress.capacity() > 0); + 
worker.close(); + context.close(); + } + + @Test + public void testWorkerSleepWakeup() throws InterruptedException { + UcpContext context = new UcpContext(new UcpParams() + .requestRmaFeature().requestWakeupFeature()); + UcpWorker worker = context.newWorker( + new UcpWorkerParams().requestWakeupRMA()); + + AtomicBoolean success = new AtomicBoolean(false); + Thread workerProgressThread = new Thread() { + @Override + public void run() { + while (!isInterrupted()) { + if (worker.progress() == 0) { + worker.waitForEvents(); + } + } + success.set(true); + } + }; + + workerProgressThread.start(); + + workerProgressThread.interrupt(); + worker.signal(); + + workerProgressThread.join(); + assertTrue(success.get()); + + worker.close(); + context.close(); + } + + @Test + public void testFlushWorker() { + int numRequests = 10; + // Create 2 contexts + 2 workers + UcpParams params = new UcpParams().requestRmaFeature(); + UcpWorkerParams rdmaWorkerParams = new UcpWorkerParams().requestWakeupRMA(); + UcpContext context1 = new UcpContext(params); + UcpContext context2 = new UcpContext(params); + + ByteBuffer src = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + ByteBuffer dst = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + dst.asCharBuffer().put(UcpMemoryTest.RANDOM_TEXT); + UcpMemory memory = context2.registerMemory(src); + + UcpWorker worker1 = context1.newWorker(rdmaWorkerParams); + UcpWorker worker2 = context2.newWorker(rdmaWorkerParams); + + UcpEndpoint ep = worker1.newEndpoint( new UcpEndpointParams() + .setUcpAddress(worker2.getAddress()).setPeerErrorHandlingMode()); + UcpRemoteKey rkey = ep.unpackRemoteKey(memory.getRemoteKeyBuffer()); + + int blockSize = UcpMemoryTest.MEM_SIZE / numRequests; + for (int i = 0; i < numRequests; i++) { + ep.putNonBlockingImplicit(UcxUtils.getAddress(dst) + i * blockSize, + blockSize, memory.getAddress() + i * blockSize, rkey); + } + + UcpRequest request = worker1.flushNonBlocking(new UcxCallback() { + @Override + public void 
onSuccess(UcpRequest request) { + rkey.close(); + memory.deregister(); + assertEquals(dst.asCharBuffer().toString().trim(), UcpMemoryTest.RANDOM_TEXT); + } + }); + + while (!request.isCompleted()) { + worker1.progress(); + worker2.progress(); + } + + assertTrue(request.isCompleted()); + Collections.addAll(resources, context1, context2, worker1, worker2, ep); + closeResources(); + } + + @Test + public void testTagProbe() { + UcpParams params = new UcpParams().requestTagFeature(); + UcpContext context1 = new UcpContext(params); + UcpContext context2 = new UcpContext(params); + + UcpWorker worker1 = context1.newWorker(new UcpWorkerParams()); + UcpWorker worker2 = context2.newWorker(new UcpWorkerParams()); + ByteBuffer recvBuffer = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); + + UcpTagMessage message = worker1.tagProbeNonBlocking(0, 0, false); + + assertNull(message); + + UcpEndpoint endpoint = worker2.newEndpoint( + new UcpEndpointParams().setUcpAddress(worker1.getAddress())); + + endpoint.sendTaggedNonBlocking( + ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE), null); + + do { + worker1.progress(); + worker2.progress(); + message = worker1.tagProbeNonBlocking(0, 0, true); + } while (message == null); + + assertEquals(UcpMemoryTest.MEM_SIZE, message.getRecvLength()); + assertEquals(0, message.getSenderTag()); + + UcpRequest recv = worker1.recvTaggedMessageNonBlocking(recvBuffer, message, null); + + worker1.progressRequest(recv); + + Collections.addAll(resources, context1, context2, worker1, worker2, endpoint); + } +} diff --git a/bindings/java/src/test/java/org/openucx/jucx/UcxTest.java b/bindings/java/src/test/java/org/openucx/jucx/UcxTest.java new file mode 100644 index 00000000000..5d40f2da2bd --- /dev/null +++ b/bindings/java/src/test/java/org/openucx/jucx/UcxTest.java @@ -0,0 +1,25 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +package org.openucx.jucx; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Stack; + +abstract class UcxTest { + // Stack of closable resources (context, worker, etc.) to be closed at the end. + protected static Stack resources = new Stack<>(); + + protected void closeResources() { + while (!resources.empty()) { + try { + resources.pop().close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } +} diff --git a/bindings/java/src/test/java/org/ucx/jucx/UcpWorkerTest.java b/bindings/java/src/test/java/org/ucx/jucx/UcpWorkerTest.java deleted file mode 100644 index 7a5de5e79e4..00000000000 --- a/bindings/java/src/test/java/org/ucx/jucx/UcpWorkerTest.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -package org.ucx.jucx; - -import org.junit.Test; -import org.ucx.jucx.ucp.UcpContext; -import org.ucx.jucx.ucp.UcpParams; -import org.ucx.jucx.ucp.UcpWorker; -import org.ucx.jucx.ucp.UcpWorkerParams; -import org.ucx.jucx.ucs.UcsConstants; - -import java.nio.ByteBuffer; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotEquals; - -public class UcpWorkerTest { - static int numWorkers = Runtime.getRuntime().availableProcessors(); - - @Test - public void testSingleWorker() { - UcpContext context = new UcpContext(new UcpParams().requestTagFeature()); - assertEquals(2, UcsConstants.ThreadMode.UCS_THREAD_MODE_MULTI); - assertNotEquals(context.getNativeId(), null); - UcpWorker worker = new UcpWorker(context, new UcpWorkerParams()); - assertNotEquals(worker.getNativeId(), null); - worker.close(); - assertEquals(worker.getNativeId(), null); - context.close(); - } - - @Test - public void testMultipleWorkersWithinSameContext() { - UcpContext context = new UcpContext(new UcpParams().requestTagFeature()); - assertNotEquals(context.getNativeId(), null); - UcpWorker workers[] = new 
UcpWorker[numWorkers]; - UcpWorkerParams workerParam = new UcpWorkerParams(); - for (int i = 0; i < numWorkers; i++) { - workerParam.clear().setCpu(i).requestThreadSafety(); - workers[i] = new UcpWorker(context, workerParam); - assertNotEquals(workers[i].getNativeId(), null); - } - for (int i = 0; i < numWorkers; i++) { - workers[i].close(); - } - context.close(); - } - - @Test - public void testMultipleWorkersFromMultipleContexts() { - UcpContext tcpContext = new UcpContext(new UcpParams().requestTagFeature()); - UcpContext rdmaContext = new UcpContext(new UcpParams().requestRmaFeature() - .requestAtomic64BitFeature().requestAtomic32BitFeature()); - UcpWorker workers[] = new UcpWorker[numWorkers]; - UcpWorkerParams workerParams = new UcpWorkerParams(); - for (int i = 0; i < numWorkers; i++) { - ByteBuffer userData = ByteBuffer.allocateDirect(100); - workerParams.clear(); - if (i % 2 == 0) { - userData.asCharBuffer().put("TCPWorker" + i); - workerParams.requestWakeupRX().setUserData(userData); - workers[i] = new UcpWorker(tcpContext, workerParams); - } else { - userData.asCharBuffer().put("RDMAWorker" + i); - workerParams.requestWakeupRMA().setCpu(i).setUserData(userData) - .requestThreadSafety(); - workers[i] = new UcpWorker(rdmaContext, workerParams); - } - } - for (int i = 0; i < numWorkers; i++) { - workers[i].close(); - } - tcpContext.close(); - rdmaContext.close(); - } -} diff --git a/buildlib/az-distro-release.yml b/buildlib/az-distro-release.yml new file mode 100644 index 00000000000..12162952a0c --- /dev/null +++ b/buildlib/az-distro-release.yml @@ -0,0 +1,94 @@ +jobs: + - job: distro_release + displayName: distro + + pool: + name: MLNX + demands: + - harbor_registry -equals yes + + variables: + - name: MOFED + value: mofed5.0-1.0.0.0 + + timeoutInMinutes: 180 + + strategy: + matrix: + centos7_cuda10_1: + build_container: centos7_cuda10_1 + artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-centos7-${{ variables.MOFED 
}}-cuda10.1.tar.bz2 + centos7_cuda10_2: + build_container: centos7_cuda10_2 + artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-centos7-${{ variables.MOFED }}-cuda10.2.tar.bz2 + centos7_cuda11_0: + build_container: centos7_cuda11_0 + artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-centos7-${{ variables.MOFED }}-cuda11.0.tar.bz2 + ubuntu16_cuda10_1: + build_container: ubuntu16_cuda10_1 + artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-ubuntu16.04-${{ variables.MOFED }}-cuda10.1.deb + ubuntu16_cuda10_2: + build_container: ubuntu16_cuda10_2 + artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-ubuntu16.04-${{ variables.MOFED }}-cuda10.2.deb + ubuntu18_cuda10_1: + build_container: ubuntu18_cuda10_1 + artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-ubuntu18.04-${{ variables.MOFED }}-cuda10.1.deb + ubuntu18_cuda10_2: + build_container: ubuntu18_cuda10_2 + artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-ubuntu18.04-${{ variables.MOFED }}-cuda10.2.deb + ubuntu18_cuda11_0: + build_container: ubuntu18_cuda11_0 + artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-ubuntu18.04-${{ variables.MOFED }}-cuda11.0.deb + + container: $[ variables['build_container'] ] + + steps: + - checkout: self + clean: true + path: "we/need/to/go/deeper" + # ^ Avoid rpmbuild error: Dest dir longer than base dir is not supported + + - bash: | + set -eE + ./autogen.sh + mkdir pkg-build + cd pkg-build + ../contrib/configure-release --with-cuda + displayName: Configure + + - bash: | + set -eE + cd pkg-build + ../contrib/buildrpm.sh -s -t -b --strict-ibverbs-dep + cd rpm-dist/`uname -m` + tar -cjf "../../../${AZ_ARTIFACT_NAME}" *.rpm + cd ../../.. 
+ tar -tjf "${AZ_ARTIFACT_NAME}" + displayName: Build RPM package + condition: and(succeeded(), contains(variables['artifact_name'], 'centos')) + env: + AZ_ARTIFACT_NAME: $(artifact_name) + + - bash: | + set -eE + cd pkg-build + dpkg-buildpackage -us -uc + find .. -name '*.deb' + find .. -name '*.deb' -exec cp {} "../${AZ_ARTIFACT_NAME}" \; + dpkg-deb -I "../${AZ_ARTIFACT_NAME}" + displayName: Build DEB package + condition: and(succeeded(), contains(variables['artifact_name'], 'ubuntu')) + env: + AZ_ARTIFACT_NAME: $(artifact_name) + + - task: GithubRelease@0 + displayName: Upload artifacts to draft release + inputs: + githubConnection: release + repositoryName: openucx/ucx + action: edit + tag: $(Build.SourceBranchName) + isDraft: true + addChangeLog: false + assetUploadMode: replace + assets: "./$(artifact_name)" diff --git a/buildlib/azure-pipelines-pr.yml b/buildlib/azure-pipelines-pr.yml new file mode 100644 index 00000000000..dc5b734be7e --- /dev/null +++ b/buildlib/azure-pipelines-pr.yml @@ -0,0 +1,173 @@ +# See https://aka.ms/yaml +# This pipeline to be run on PRs + +trigger: none +pr: + - master + - v*.*.x + +resources: + containers: + - container: centos7 + image: ucfconsort.azurecr.io/ucx/centos7:1 + endpoint: ucfconsort_registry + - container: fedora + image: ucfconsort.azurecr.io/ucx/fedora:3 + endpoint: ucfconsort_registry + +stages: + - stage: Codestyle + jobs: + # Check that commit title matches code style guidelines + - job: commit_title + displayName: commit title + steps: + - checkout: self + clean: true + + - bash: | + set -eE + range="remotes/origin/$(System.PullRequest.TargetBranch)..$(Build.SourceVersion)" + ok=1 + for sha1 in `git log $range --format="%h"` + do + title=`git log -1 --format="%s" $sha1` + if echo $title | grep -qP '^Merge |^[0-9A-Z/_\-]*: \w' + then + echo "Good commit title: '$title'" + else + echo "Bad commit title: '$title'" + ok=0 + fi + done + if [ $ok -ne 1 ] + then + 
url="https://github.com/openucx/ucx/wiki/Guidance-for-contributors#general-guidelines" + echo "##vso[task.logissue type=error]Bad commit title(s), see $url for more info." + echo "##vso[task.complete result=Failed;]" + fi + condition: eq(variables['Build.Reason'], 'PullRequest') + + - stage: Build + jobs: + - job: static_checks + displayName: Static checks + container: fedora + steps: + - checkout: self + clean: true + + - bash: ./autogen.sh + displayName: Setup autotools + + - bash: | + set -eE + mkdir build && cd build + clang --version + gcc --version + cppcheck --version + ../contrib/configure-release + displayName: Configure + + - bash: | + set -eE + + cd build + + export PATH="`csclng --print-path-to-wrap`:`cscppc --print-path-to-wrap`:`cswrap --print-path-to-wrap`:$PATH" + make -j`nproc` 2>&1 | tee compile.log + displayName: Build + + - bash: | + set -eE + + cd build + + cs_errors="cs.err" + cslinker --quiet compile.log \ + | csgrep --mode=json --path $(dirname $PWD) --strip-path-prefix $(dirname $PWD) \ + | csgrep --mode=json --invert-match --path 'conftest.c' \ + | csgrep --mode=grep --invert-match --event "internal warning" --prune-events=1 \ + > $cs_errors + + if [ -s $cs_errors ]; then + echo "static checkers found errors:" + cat $cs_errors + echo "##vso[task.logissue type=error]static checkers found errors" + echo "##vso[task.complete result=Failed;]" + else + echo "No errors reported by static checkers" + fi + displayName: cstools reports + + # Perform test builds on relevant distributions. 
+ - job: Distros + displayName: Build for + strategy: + matrix: + centos7: + CONTAINER: centos7 + CONFIGURE_OPTS: + container: $[ variables['CONTAINER'] ] + steps: + - checkout: self + clean: true + + - bash: ./autogen.sh + displayName: Setup autotools + + - bash: | + set -eE + mkdir build && cd build + ../configure $(CONFIGURE_OPTS) + displayName: Configure + + - bash: | + set -eE + cd build + gcc -v + make -s -j `nproc` + displayName: Build for $(CONTAINER) + + # Test RPM build + - job: build_rpm + displayName: build tarball and source rpm + container: fedora + steps: + - checkout: self + clean: true + + - bash: ./autogen.sh + displayName: Setup autotools + + - bash: | + set -eE + gcc --version + ./contrib/configure-release + ./contrib/buildrpm.sh -s -t -b + displayName: Build tarball + + - stage: Tests + dependsOn: [Codestyle] + jobs: + - template: tests.yml + parameters: + name: althca + demands: ucx_althca -equals yes + - template: tests.yml + parameters: + name: legacy + demands: ucx_legacy -equals yes + - template: tests.yml + parameters: + name: gpu + demands: ucx_gpu -equals yes + - template: tests.yml + parameters: + name: new + demands: ucx_new -equals yes + - template: tests.yml + parameters: + name: hwi + demands: ucx_hwi -equals yes + diff --git a/buildlib/azure-pipelines-release.yml b/buildlib/azure-pipelines-release.yml new file mode 100644 index 00000000000..ca0601517f5 --- /dev/null +++ b/buildlib/azure-pipelines-release.yml @@ -0,0 +1,74 @@ +# See https://aka.ms/yaml +# This pipeline to be run on tags creation + +pr: none +trigger: + tags: + include: + - v* + +resources: + containers: + - container: centos7 + image: ucfconsort.azurecr.io/ucx/centos7:2 + endpoint: ucfconsort_registry + - container: centos7_cuda10_1 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos7-mofed5.0-cuda10.1:1 + - container: centos7_cuda10_2 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos7-mofed5.0-cuda10.2:1 + - container: centos7_cuda11_0 + image: 
rdmz-harbor.rdmz.labs.mlnx/ucx/centos7-mofed5.0-cuda11.0:1 + - container: ubuntu16_cuda10_1 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu16.04-mofed5.0-cuda10.1:1 + - container: ubuntu16_cuda10_2 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu16.04-mofed5.0-cuda10.2:1 + - container: ubuntu18_cuda10_1 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu18.04-mofed5.0-cuda10.1:1 + - container: ubuntu18_cuda10_2 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu18.04-mofed5.0-cuda10.2:1 + - container: ubuntu18_cuda11_0 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu18.04-mofed5.0-cuda11.0:1 + +stages: + # Create an empty draft to avoid race condition in distro releases + - stage: GitHubDraft + jobs: + - job: DraftRelease + container: centos7 + steps: + - checkout: self + clean: true + path: "we/need/to/go/deeper" + + - bash: ./autogen.sh + displayName: Setup autotools + + - bash: | + set -eE + gcc --version + ./contrib/configure-release + ./contrib/buildrpm.sh -s -t -b + displayName: Build tarball + + - task: GithubRelease@0 + displayName: Create/edit GitHub Draft Release + inputs: + githubConnection: release + repositoryName: openucx/ucx + action: edit + tag: $(Build.SourceBranchName) + isDraft: true + addChangeLog: false + releaseNotesSource: file + releaseNotesFile: NEWS + assetUploadMode: replace + assets: | + ./ucx-*.tar.gz + ./rpm-dist/ucx-*.src.rpm + + - stage: Release + jobs: + - template: az-distro-release.yml + - template: jucx-publish.yml + parameters: + target: publish-release diff --git a/buildlib/azure-pipelines.md b/buildlib/azure-pipelines.md new file mode 100644 index 00000000000..49267ce1422 --- /dev/null +++ b/buildlib/azure-pipelines.md @@ -0,0 +1,91 @@ +# Introduction + +This project uses Azure Pipelines as a GitHub check to validate pull requests +prior to merging. Each time a pull request is updated AZP will spawn VMs and +run compiles and tests based on the instructions in the +buildlib/azure-pipelines.yml file. 
+ +The test console output is linked from the GitHub check integration. + +Azure Pipelines is linked to the UCF Consortium's Azure Tenant: + + https://portal.azure.com + +And runs inside the Azure Dev Ops Organization: + + https://dev.azure.com/ucfconsort + +As the UCX project: + + https://dev.azure.com/ucfconsort/ucx + +# Containers + +Most of the build steps are done inside Docker containers. The container +allows direct control and customization over the operating system environment +to achieve the required test. + +UCF hosts a private docker registry on the Azure Container Registry at +ucfconsort.azurecr.io: + + https://portal.azure.com/#@jgunthorpegmail.onmicrosoft.com/resource/subscriptions/b8ff5e38-a317-4bbd-9831-b73d3887df30/resourceGroups/PipelinesRG/providers/Microsoft.ContainerRegistry/registries/ucfconsort/overview + +The Azure Pipelines VM's have high speed access to this registry and can boot +containers fairly quickly. + +## Dockerfiles + +Each container is described by a docker file in buildlib/. Dockerfiles can be +built locally using the build command at the top of the Dockerfile. Every +container has a unique name and tag reflecting its content. So that builds +continue to work on any stable branches the container version number should be +incremented when a build-incompatible change is made. + +Once built the docker container needs to be pushed to the ACR, using the +following steps: + +```shell +$ az login +$ az acr login --name ucfconsort +$ docker push ucfconsort.azurecr.io/ucx/centos7:1 +``` + +See https://docs.microsoft.com/en-us/cli/azure for details on how to get the +command line tools. 
+ +## Alternate to 'docker push' + +If network connectivity is too poor for push, then the container can be built +on a VM inside Azure using this command: + +```shell +$ az acr build --registry ucfconsort -t ucfconsort.azurecr.io/ucx/centos7:1 -f buildlib/centos7.Dockerfile buildlib/ +``` + +## Testing Containers Locally + +The local container can be entered and checked out using a command sequence +similar to: + +```shell +$ cd ../../ucx +$ docker run --rm -ti -v `pwd`:`pwd` -w `pwd` ucfconsort.azurecr.io/ucx/centos7:1 /bin/bash +# mkdir build-centos7 && cd build-centos7 +# ../configure +# make +``` + +This will duplicate what will happen when running inside AZP. + +# Release images +To build release images there is a `docker-compose` config. Here is how to use it: +```sh +cd buildlib +docker-compose build +``` + +Tag and push release images: +```sh +./buildlib/push-release-images.sh +``` + diff --git a/buildlib/azure-pipelines.yml b/buildlib/azure-pipelines.yml new file mode 100644 index 00000000000..e0d833f24a3 --- /dev/null +++ b/buildlib/azure-pipelines.yml @@ -0,0 +1,20 @@ +# See https://aka.ms/yaml +# This pipeline to be run on direct pushes and merges + +pr: none +trigger: + - master + - v*.*.x + +resources: + containers: + - container: centos7 + image: ucfconsort.azurecr.io/ucx/centos7:1 + endpoint: ucfconsort_registry + +stages: + - stage: Build + jobs: + - template: jucx-publish.yml + parameters: + target: publish-snapshot diff --git a/buildlib/centos7-release.Dockerfile b/buildlib/centos7-release.Dockerfile new file mode 100644 index 00000000000..65ad6cfbac2 --- /dev/null +++ b/buildlib/centos7-release.Dockerfile @@ -0,0 +1,46 @@ +ARG CUDA_VERSION=10.1 +FROM nvidia/cuda:${CUDA_VERSION}-devel-centos7 + +RUN yum install -y \ + autoconf \ + automake \ + doxygen \ + file \ + gcc-c++ \ + git \ + glibc-devel \ + libtool \ + make \ + maven \ + numactl-devel \ + rdma-core-devel \ + rpm-build \ + tcl \ + tcsh \ + tk \ + wget \ + && yum clean all + +# MOFED +ARG 
MOFED_VERSION=5.0-1.0.0.0 +ARG MOFED_OS=rhel7.6 +ENV MOFED_DIR MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64 +ENV MOFED_SITE_PLACE MLNX_OFED-${MOFED_VERSION} +ENV MOFED_IMAGE ${MOFED_DIR}.tgz +RUN wget --no-verbose http://content.mellanox.com/ofed/${MOFED_SITE_PLACE}/${MOFED_IMAGE} && \ + tar -xzf ${MOFED_IMAGE} && \ + ${MOFED_DIR}/mlnxofedinstall --all -q \ + --user-space-only \ + --without-fw-update \ + --skip-distro-check \ + --without-ucx \ + --without-hcoll \ + --without-openmpi \ + --without-sharp \ + && rm -rf ${MOFED_DIR} && rm -rf *.tgz + +ENV CPATH /usr/local/cuda/include:${CPATH} +ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LD_LIBRARY_PATH} +ENV LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LIBRARY_PATH} +ENV PATH /usr/local/cuda/compat:${PATH} + diff --git a/buildlib/centos7.Dockerfile b/buildlib/centos7.Dockerfile new file mode 100644 index 00000000000..8f997b702ae --- /dev/null +++ b/buildlib/centos7.Dockerfile @@ -0,0 +1,18 @@ +# docker build -t ucfconsort.azurecr.io/ucx/centos7:1 -f buildlib/centos7.Dockerfile buildlib/ +FROM centos:7 + +RUN yum install -y \ + autoconf \ + automake \ + doxygen \ + file \ + gcc-c++ \ + git \ + glibc-devel \ + libtool \ + make \ + maven \ + numactl-devel \ + rdma-core-devel \ + rpm-build \ + && yum clean dbcache packages diff --git a/buildlib/docker-compose.yml b/buildlib/docker-compose.yml new file mode 100644 index 00000000000..f3aaf3d679a --- /dev/null +++ b/buildlib/docker-compose.yml @@ -0,0 +1,76 @@ +version: "3" + +services: + centos7-mofed5.0-cuda10.1: + image: centos7-mofed5.0-cuda10.1 + build: + context: . + dockerfile: centos7-release.Dockerfile + args: + MOFED_VERSION: 5.0-1.0.0.0 + MOFED_OS: rhel7.6 + CUDA_VERSION: 10.1 + centos7-mofed5.0-cuda10.2: + image: centos7-mofed5.0-cuda10.2 + build: + context: . 
+ dockerfile: centos7-release.Dockerfile + args: + MOFED_VERSION: 5.0-1.0.0.0 + MOFED_OS: rhel7.6 + CUDA_VERSION: 10.2 + centos7-mofed5.0-cuda11.0: + image: centos7-mofed5.0-cuda11.0 + build: + context: . + dockerfile: centos7-release.Dockerfile + args: + MOFED_VERSION: 5.0-1.0.0.0 + MOFED_OS: rhel7.6 + CUDA_VERSION: 11.0 + ubuntu16.04-mofed5.0-cuda10.1: + image: ubuntu16.04-mofed5.0-cuda10.1 + build: + context: . + dockerfile: ubuntu-release.Dockerfile + args: + MOFED_VERSION: 5.0-1.0.0.0 + UBUNTU_VERSION: 16.04 + CUDA_VERSION: 10.1 + ubuntu16.04-mofed5.0-cuda10.2: + image: ubuntu16.04-mofed5.0-cuda10.2 + build: + context: . + dockerfile: ubuntu-release.Dockerfile + args: + MOFED_VERSION: 5.0-1.0.0.0 + UBUNTU_VERSION: 16.04 + CUDA_VERSION: 10.2 + ubuntu18.04-mofed5.0-cuda10.1: + image: ubuntu18.04-mofed5.0-cuda10.1 + build: + context: . + dockerfile: ubuntu-release.Dockerfile + args: + MOFED_VERSION: 5.0-1.0.0.0 + UBUNTU_VERSION: 18.04 + CUDA_VERSION: 10.1 + ubuntu18.04-mofed5.0-cuda10.2: + image: ubuntu18.04-mofed5.0-cuda10.2 + build: + context: . + dockerfile: ubuntu-release.Dockerfile + args: + MOFED_VERSION: 5.0-1.0.0.0 + UBUNTU_VERSION: 18.04 + CUDA_VERSION: 10.2 + ubuntu18.04-mofed5.0-cuda11.0: + image: ubuntu18.04-mofed5.0-cuda11.0 + build: + context: . 
+ dockerfile: ubuntu-release.Dockerfile + args: + MOFED_VERSION: 5.0-1.0.0.0 + UBUNTU_VERSION: 18.04 + CUDA_VERSION: 11.0 + diff --git a/buildlib/fedora.Dockerfile b/buildlib/fedora.Dockerfile new file mode 100644 index 00000000000..785d84bd445 --- /dev/null +++ b/buildlib/fedora.Dockerfile @@ -0,0 +1,24 @@ +# docker build -t ucfconsort.azurecr.io/ucx/fedora:1 -f buildlib/fedora.Dockerfile buildlib/ +FROM fedora:32 + +RUN dnf install -y \ + autoconf \ + automake \ + clang \ + cppcheck \ + csclng \ + cscppc \ + csmock-common \ + doxygen \ + file \ + gcc-c++ \ + git \ + glibc-devel \ + java-1.8.0-openjdk-devel \ + libtool \ + make \ + maven \ + numactl-devel \ + rdma-core-devel \ + rpm-build \ + && dnf clean dbcache packages diff --git a/buildlib/jucx-publish.yml b/buildlib/jucx-publish.yml new file mode 100644 index 00000000000..de4bef44960 --- /dev/null +++ b/buildlib/jucx-publish.yml @@ -0,0 +1,66 @@ +parameters: + target: publish-snapshot + temp_cfg: $(System.DefaultWorkingDirectory)/bindings/java/src/main/native/build-java/tmp-settings.xml + gpg_dir: $(System.DefaultWorkingDirectory)/bindings/java/src/main/native/build-java/gpg + +jobs: + - job: jucx_release + + container: centos7 + + steps: + - checkout: self + clean: true + + - bash: | + set -eE + gcc --version + ./autogen.sh + ./contrib/configure-release --with-java + displayName: Configure + + - bash: | + set -eE + make -s -j`nproc` + displayName: Build ucx + + - bash: | + set -eE + { + echo -e "" + echo -e "ossrh\${env.SONATYPE_USERNAME}" + echo -e "\${env.SONATYPE_PASSWORD}" + echo -e "" + } > ${{ parameters.temp_cfg }} + displayName: Generate temporary config + + - task: DownloadSecureFile@1 + displayName: Download Secure file + inputs: + secureFile: sparkucx-secret.gpg + name: privateKey + + - task: DownloadSecureFile@1 + displayName: Download Secure file + inputs: + secureFile: sparkucx-public.gpg + name: publicKey + + - bash: | + mkdir ${{ parameters.gpg_dir }} + export GPG_TTY=`tty` + chmod 700 ${{ 
parameters.gpg_dir }} + cp $(publicKey.secureFilePath) ${{ parameters.gpg_dir }}/pubring.gpg + cp $(privateKey.secureFilePath) ${{ parameters.gpg_dir }}/secring.gpg + export GNUPGHOME=${{ parameters.gpg_dir }} + TAG=`git describe --tags` + # Maven requires version to be of form MAJOR_VERSION.MINOR_VERSIOn,... + # ucx tags are of form v1.x.x - need to remove 'v' from the beginning of string + MAVEN_VERSION=${TAG:1} + make -C bindings/java/src/main/native/ ${{ parameters.target }} \ + ARGS="--settings ${{ parameters.temp_cfg }}" JUCX_VERSION=${MAVEN_VERSION} + displayName: Publish JUCX jar to maven central + env: + GPG_PASSPHRASE: $(GPG_PASSPHRASE) + SONATYPE_PASSWORD: $(SONATYPE_PASSWORD) + SONATYPE_USERNAME: $(SONATYPE_USERNAME) diff --git a/buildlib/push-release-images.sh b/buildlib/push-release-images.sh new file mode 100644 index 00000000000..a44c6738c34 --- /dev/null +++ b/buildlib/push-release-images.sh @@ -0,0 +1,14 @@ +#!/bin/bash -eE + +# shellcheck disable=SC2086 +basedir=$(cd "$(dirname $0)" && pwd) + +registry=harbor.mellanox.com/ucx +tag=1 + +images=$(awk '/image:/ {print $2}' "${basedir}/docker-compose.yml") +for img in $images; do + target_name="${registry}/${img}:${tag}" + docker tag ${img}:latest ${target_name} + docker push ${target_name} +done diff --git a/buildlib/tests.yml b/buildlib/tests.yml new file mode 100644 index 00000000000..055eb276aa3 --- /dev/null +++ b/buildlib/tests.yml @@ -0,0 +1,40 @@ +parameters: + worker_ids: [0, 1, 2, 3] + num_workers: 4 + demands: [] + name: subtest + +jobs: + - job: tests_${{ parameters.name }} + pool: + name: MLNX + demands: ${{ parameters.demands }} + displayName: ${{ parameters.name }} on worker + timeoutInMinutes: 300 + strategy: + matrix: + ${{ each wid in parameters.worker_ids }}: + ${{ wid }}: + worker_id: ${{ wid }} + steps: + # address permissions issue when some files created as read-only + - bash: chmod u+rwx ./ -R + + - checkout: self + clean: true + + - bash: | + ./contrib/test_jenkins.sh + 
displayName: Run ./contrib/test_jenkins.sh + env: + nworkers: ${{ parameters.num_workers }} + worker: $(worker_id) + BUILD_NUMBER: "$(Build.BuildId)-$(Build.BuildNumber)" + JOB_URL: "$(System.TeamFoundationCollectionUri)$(System.TeamProject)/_build/results?buildId=$(Build.BuildId)" + # Set $JENKINS_RUN_TESTS to empty value to avoid setting CPU affinity in test_jenkins.sh + JENKINS_RUN_TESTS: "" + # $AZP_AGENT_ID is set for every self-hosted Azure agent (uniq for one host, from 1 to N) + EXECUTOR_NUMBER: $(AZP_AGENT_ID) + RUN_TESTS: yes + JENKINS_TEST_PERF: 0 + diff --git a/buildlib/ubuntu-release.Dockerfile b/buildlib/ubuntu-release.Dockerfile new file mode 100644 index 00000000000..61140f2a82a --- /dev/null +++ b/buildlib/ubuntu-release.Dockerfile @@ -0,0 +1,45 @@ +ARG CUDA_VERSION=10.1 +ARG UBUNTU_VERSION=16.04 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +RUN apt-get update && \ + apt-get install -y \ + automake \ + default-jdk \ + dh-make \ + g++ \ + git \ + openjdk-8-jdk \ + libcap2 \ + libnuma-dev \ + libtool \ + make \ + maven \ + udev \ + wget \ + && apt-get remove -y openjdk-11-* || apt-get autoremove -y \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +# MOFED +ARG MOFED_VERSION=5.0-1.0.0.0 +ARG UBUNTU_VERSION +ARG MOFED_OS=ubuntu${UBUNTU_VERSION} +ENV MOFED_DIR MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64 +ENV MOFED_SITE_PLACE MLNX_OFED-${MOFED_VERSION} +ENV MOFED_IMAGE ${MOFED_DIR}.tgz +RUN wget --no-verbose http://content.mellanox.com/ofed/${MOFED_SITE_PLACE}/${MOFED_IMAGE} && \ + tar -xzf ${MOFED_IMAGE} +RUN ${MOFED_DIR}/mlnxofedinstall --all -q \ + --user-space-only \ + --without-fw-update \ + --skip-distro-check \ + --without-ucx \ + --without-hcoll \ + --without-openmpi \ + --without-sharp && \ + rm -rf ${MOFED_DIR} && rm -rf *.tgz + +ENV CPATH /usr/local/cuda/include:${CPATH} +ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LD_LIBRARY_PATH} +ENV LIBRARY_PATH 
/usr/local/cuda/lib64:/usr/local/cuda/compat:${LIBRARY_PATH} +ENV PATH /usr/local/cuda/compat:${PATH} diff --git a/config/m4/compiler.m4 b/config/m4/compiler.m4 index 477cada71bb..1ec255e996b 100644 --- a/config/m4/compiler.m4 +++ b/config/m4/compiler.m4 @@ -1,16 +1,40 @@ # # Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. # Copyright (c) UT-Battelle, LLC. 2017. ALL RIGHTS RESERVED. -# Copyright (C) ARM Ltd. 2016-2018. ALL RIGHTS RESERVED. -# Copyright (C) Huawei Technologies Co.,Ltd. 2020. ALL RIGHTS RESERVED. +# Copyright (C) ARM Ltd. 2016-2020. ALL RIGHTS RESERVED. +# Copyright (C) Huawei Technologies Co.,Ltd. 2020-2021. ALL RIGHTS RESERVED. # See file LICENSE for terms. # + # # Initialize CFLAGS # BASE_CFLAGS="-g -Wall -Werror" + +# +# Check that C++ is functional. +# +# AC_PROG_CXX never fails but falls back on g++ as a default CXX compiler that +# always present. If g++ isn't installed, the macro doesn't detect this and +# compilation fails later on. CHECK_CXX_COMP compiles simple C++ code to +# verify that compiler is present and functional. +# +AC_DEFUN([CHECK_CXX_COMP], + [AC_MSG_CHECKING(if $CXX works) + AC_LANG_PUSH([C++]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + #ifndef __cplusplus + #error "No C++ support, AC_PROG_CXX failed" + #endif + ]])], + [AC_MSG_RESULT([yes])], + [AC_MSG_ERROR([Cannot continue. 
Please install C++ compiler.])]) + AC_LANG_POP([C++]) + ]) + + # # Debug mode # @@ -19,9 +43,11 @@ AC_ARG_ENABLE(debug, [], [enable_debug=no]) AS_IF([test "x$enable_debug" = xyes], - [BASE_CFLAGS="-D_DEBUG $BASE_CFLAGS"], + [BASE_CFLAGS="-D_DEBUG $BASE_CFLAGS" + BASE_CXXFLAGS="-D_DEBUG" $BASE_CXXFLAGS], []) + # # Optimization level # @@ -32,8 +58,10 @@ AC_ARG_ENABLE(compiler-opt, AS_IF([test "x$enable_compiler_opt" = "xyes"], [BASE_CFLAGS="-O3 $BASE_CFLAGS"], [test "x$enable_compiler_opt" = "xnone"], [AS_IF([test "x$enable_debug" = xyes], - [BASE_CFLAGS="-O0 $BASE_CFLAGS"], - [BASE_CFLAGS="-O3 $BASE_CFLAGS"])], + [BASE_CFLAGS="-O0 $BASE_CFLAGS" + BASE_CXXFLAGS="-O0 $BASE_CXXFLAGS"], + [BASE_CFLAGS="-O3 $BASE_CFLAGS" + BASE_CXXFLAGS="-O0 $BASE_CXXFLAGS"])], [test "x$enable_compiler_opt" = "xno"], [], [BASE_CFLAGS="-O$enable_compiler_opt $BASE_CFLAGS"]) @@ -51,6 +79,7 @@ AC_DEFUN([CHECK_CROSS_COMP], [ [AC_LINK_IFELSE([$1], [$2], [$3])]) ]) + # # Check for one specific attribute by compiling with C # Usage: CHECK_SPECIFIC_ATTRIBUTE([name], [doc], [program]) @@ -72,28 +101,42 @@ AC_DEFUN([CHECK_SPECIFIC_ATTRIBUTE], [ AC_DEFINE_UNQUOTED([HAVE_ATTRIBUTE_[$2]], [$ucx_cv_attribute_[$1]], [Check attribute [$1]]) ]) + # -# Check if compiler supports a given feaure -# Usage: COMPILER_OPTION([name], [doc], [flag], [default: yes|no], [program]) +# Enable/disable turning on machine-specific optimizations # -AC_DEFUN([COMPILER_OPTION], +AC_ARG_ENABLE(optimizations, + AC_HELP_STRING([--enable-optimizations], + [Enable non-portable machine-specific CPU optimizations, default: NO]), + [], + [enable_optimizations=no]) + + +# +# Check if compiler supports a given CPU optimization flag, and if yes - add it +# to BASE_CFLAGS substitution, and OPT_CFLAGS C define. 
+# +# Usage: COMPILER_CPU_OPTIMIZATION([name], [doc], [flag], [program]) +# +AC_DEFUN([COMPILER_CPU_OPTIMIZATION], [ AC_ARG_WITH([$1], [AC_HELP_STRING([--with-$1], [Use $2 compiler option.])], [], - [with_$1=$4]) + [with_$1=$enable_optimizations]) AS_IF([test "x$with_$1" != "xno"], [SAVE_CFLAGS="$CFLAGS" CFLAGS="$BASE_CFLAGS $CFLAGS $3" AC_MSG_CHECKING([$3]) - CHECK_CROSS_COMP([AC_LANG_SOURCE([$5])], + CHECK_CROSS_COMP([AC_LANG_SOURCE([$4])], [AC_MSG_RESULT([yes]) - # TODO: Add CPU UARCH detector and validator in UCX init. - # As for now we will avoid passing this information to - # library. - AS_IF([test "x$1" != "xmcpu" -a "x$1" != "xmarch"], - [OPT_CFLAGS="$OPT_CFLAGS|$1"],[])], + # TODO: Add CPU UARCH detector and validator in UCX init. + # As for now we will avoid passing this information to + # library. + BASE_CFLAGS="$BASE_CFLAGS $3" + AS_IF([test "x$1" != "xmcpu" -a "x$1" != "xmarch"], + [OPT_CFLAGS="$OPT_CFLAGS|$1"])], [AC_MSG_RESULT([no])]) CFLAGS="$SAVE_CFLAGS"]) ]) @@ -143,9 +186,19 @@ AC_DEFUN([DETECT_UARCH], ax_cpu="thunderxt88" ;; esac ;; - *) ax_cpu="native" - ;; + 0x48) case $cpupart in + 0xd01 | 0x0d01) + AC_DEFINE([HAVE_AARCH64_HI1620], 1, [Huawei Kunpeng 920]) + ax_cpu="tsv110" + ax_arch="armv8.2-a" ;; + esac + ;; + *) + ;; esac + AM_CONDITIONAL([HAVE_AARCH64_THUNDERX2], [test x$ax_cpu = xthunderx2t99]) + AM_CONDITIONAL([HAVE_AARCH64_THUNDERX1], [test x$ax_cpu = xthunderxt88]) + AM_CONDITIONAL([HAVE_AARCH64_HI1620], [test x$ax_cpu = xtsv110]) ]) @@ -157,33 +210,23 @@ AC_DEFUN([DETECT_UARCH], # AC_DEFUN([CHECK_COMPILER_FLAG], [ -# -# Force ICC treat command line warnings as errors. 
-# This evaluation should be called prior to all other compiler flags evals -# - AS_IF([test "x$icc_cmd_diag_to_error" = "x"], - [icc_cmd_diag_to_error=1 - AC_MSG_CHECKING([compiler flag -diag-error 10006]) - SAVE_CFLAGS="$CFLAGS" - CFLAGS="$BASE_CFLAGS $CFLAGS -diag-error 10006" - AC_COMPILE_IFELSE([AC_LANG_SOURCE([[int main(){return 0;}]])], - [BASE_CFLAGS="$BASE_CFLAGS -diag-error 10006" - AC_MSG_RESULT([yes])], - [AC_MSG_RESULT([no])]) - CFLAGS="$SAVE_CFLAGS" - ], - []) AC_MSG_CHECKING([compiler flag $1]) SAVE_CFLAGS="$CFLAGS" + SAVE_CXXFLAGS="$CFLAGS" CFLAGS="$BASE_CFLAGS $CFLAGS $2" + CXXFLAGS="$BASE_CXXFLAGS $CXXFLAGS $2" AC_COMPILE_IFELSE([$3], [AC_MSG_RESULT([yes]) + CFLAGS="$SAVE_CFLAGS" + CXXFLAGS="$SAVE_CXXFLAGS" $4], [AC_MSG_RESULT([no]) + CFLAGS="$SAVE_CFLAGS" + CXXFLAGS="$SAVE_CXXFLAGS" $5]) - CFLAGS="$SAVE_CFLAGS" ]) + # # ADD_COMPILER_FLAG_IF_SUPPORTED # Usage: ADD_COMPILER_FLAG_IF_SUPPORTED([name], [flag], [program], [if-true], [if-false]) @@ -194,10 +237,25 @@ AC_DEFUN([CHECK_COMPILER_FLAG], AC_DEFUN([ADD_COMPILER_FLAG_IF_SUPPORTED], [ CHECK_COMPILER_FLAG([$1], [$2], [$3], - [BASE_CFLAGS="$BASE_CFLAGS $2" $4], + [BASE_CFLAGS="$BASE_CFLAGS $2" + $4], [$5]) ]) + +# +# ADD_COMPILER_FLAGS_IF_SUPPORTED +# Usage: ADD_COMPILER_FLAGS_IF_SUPPORTED([[flag1], [flag2], [flag3]], [program]) +# +# The macro checks multiple flags supported by compiler +# +AC_DEFUN([ADD_COMPILER_FLAGS_IF_SUPPORTED], +[ + m4_foreach([_flag], [$1], + [ADD_COMPILER_FLAG_IF_SUPPORTED([_flag], [_flag], [$2], [], [])]) +]) + + # # CHECK_DEPRECATED_DECL_FLAG (flag, variable) # @@ -211,7 +269,7 @@ AC_DEFUN([CHECK_DEPRECATED_DECL_FLAG], CFLAGS="$BASE_CFLAGS $CFLAGS $1" AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ int __attribute__ ((__deprecated__)) f() { return 0; } - int main() { return f(); } + int main(int argc, char** argv) { return f(); } ]])], [AC_MSG_RESULT([yes]) $2="${$2} $1"], @@ -220,6 +278,17 @@ AC_DEFUN([CHECK_DEPRECATED_DECL_FLAG], ]) +# +# Force ICC treat command line warnings as 
errors. +# This evaluation should be called prior to all other compiler flags evals +# +CHECK_COMPILER_FLAG([-diag-error 10006], [-diag-error 10006], + [AC_LANG_SOURCE([[int main(int argc, char** argv){return 0;}]])], + [BASE_CFLAGS="$BASE_CFLAGS -diag-error 10006" + BASE_CXXFLAGS="$BASE_CXXFLAGS -diag-error 10006"], + []) + + CHECK_DEPRECATED_DECL_FLAG([-diag-disable 1478], CFLAGS_NO_DEPRECATED) # icc CHECK_DEPRECATED_DECL_FLAG([-Wno-deprecated-declarations], CFLAGS_NO_DEPRECATED) # gcc AC_SUBST([CFLAGS_NO_DEPRECATED], [$CFLAGS_NO_DEPRECATED]) @@ -232,7 +301,7 @@ ADD_COMPILER_FLAG_IF_SUPPORTED([-diag-disable 269], [-diag-disable 269], [AC_LANG_SOURCE([[#include #include - int main() { + int main(int argc, char** argv) { char *p = NULL; scanf("%m[^.]", &p); free(p); @@ -253,33 +322,31 @@ ADD_COMPILER_FLAG_IF_SUPPORTED([-diag-disable 269], UCX_ALLOC_ALIGN=16 ADD_COMPILER_FLAG_IF_SUPPORTED([-fmax-type-align=$UCX_ALLOC_ALIGN], [-fmax-type-align=$UCX_ALLOC_ALIGN], - [AC_LANG_SOURCE([[int main(){return 0;}]])], + [AC_LANG_SOURCE([[int main(int argc, char** argv){return 0;}]])], [AC_DEFINE_UNQUOTED([UCX_ALLOC_ALIGN], $UCX_ALLOC_ALIGN, [Set aligment assumption for compiler])], []) -# -# Enable/disable turning on machine-specific optimizations -# -AC_ARG_ENABLE(optimizations, - AC_HELP_STRING([--enable-optimizations], [Enable machine-specific optimizations, default: NO]), - [], - [enable_optimizations=no]) - - # # SSE/AVX # -COMPILER_OPTION([avx], [AVX], [-mavx], [$enable_optimizations], - [#include - int main() { return _mm256_testz_si256(_mm256_set1_epi32(1), _mm256_set1_epi32(3)); }]) +COMPILER_CPU_OPTIMIZATION([avx], [AVX], [-mavx], + [#include + int main(int argc, char** argv) { + return _mm256_testz_si256(_mm256_set1_epi32(1), _mm256_set1_epi32(3)); + } + ]) AS_IF([test "x$with_avx" != xyes], - [COMPILER_OPTION([sse41], [SSE 4.1], [-msse4.1], [$enable_optimizations], - [#include - int main() { return _mm_testz_si128(_mm_set1_epi32(1), _mm_set1_epi32(3)); }]) - 
COMPILER_OPTION([sse42], [SSE 4.2], [-msse4.2], [$enable_optimizations], - [#include - int main() { return _mm_popcnt_u32(0x101) - 2; }]) + [COMPILER_CPU_OPTIMIZATION([sse41], [SSE 4.1], [-msse4.1], + [#include + int main(int argc, char** argv) { + return _mm_testz_si128(_mm_set1_epi32(1), _mm_set1_epi32(3)); + } + ]) + COMPILER_CPU_OPTIMIZATION([sse42], [SSE 4.2], [-msse4.2], + [#include + int main(int argc, char** argv) { return _mm_popcnt_u32(0x101) - 2; + }]) ]) @@ -292,20 +359,22 @@ AS_IF([test "x$with_avx" != xyes], DETECT_UARCH() + # # CPU tuning # AS_IF([test "x$ax_cpu" != "x"], - [COMPILER_OPTION([mcpu], [CPU Model], [-mcpu=$ax_cpu], [$enable_optimizations], - [int main() { return 0;}]) + [COMPILER_CPU_OPTIMIZATION([mcpu], [CPU Model], [-mcpu=$ax_cpu], + [int main(int argc, char** argv) { return 0;}]) ]) + # # Architecture tuning # AS_IF([test "x$ax_arch" != "x"], - [COMPILER_OPTION([march], [architecture tuning], [-march=$ax_arch], [$enable_optimizations], - [int main() { return 0;}]) + [COMPILER_CPU_OPTIMIZATION([march], [architecture tuning], [-march=$ax_arch], + [int main(int argc, char** argv) { return 0;}]) ]) @@ -316,6 +385,37 @@ CHECK_SPECIFIC_ATTRIBUTE([optimize], [NOOPTIMIZE], [int foo (int arg) __attribute__ ((optimize("O0")));]) +# +# Compile code with frame pointer. Optimizations usually omit the frame pointer, +# but if we are profiling the code with callgraph we need it. +# This option may affect perofrmance so it is off by default. 
+# +AC_ARG_ENABLE([frame-pointer], + AS_HELP_STRING([--enable-frame-pointer], + [Compile with frame pointer, useful for profiling, default: NO]), + [], + [enable_frame_pointer=no]) +AS_IF([test "x$enable_frame_pointer" = xyes], + [ADD_COMPILER_FLAG_IF_SUPPORTED([-fno-omit-frame-pointer], + [-fno-omit-frame-pointer], + [AC_LANG_SOURCE([[int main(int argc, char** argv){return 0;}]])], + [AS_MESSAGE([compiling with frame pointer])], + [AS_MESSAGE([compiling with frame pointer is not supported])])], + [:]) + +ADD_COMPILER_FLAG_IF_SUPPORTED([-funwind-tables], + [-funwind-tables], + [AC_LANG_SOURCE([[int main(int argc, char** argv){return 0;}]])], + [AS_MESSAGE([compiling with unwind tables])], + [AS_MESSAGE([compiling without unwind tables])]) + + +# +# Check for C++ support +# +CHECK_CXX_COMP() + + # # Check for C++11 support # @@ -326,7 +426,7 @@ CXX11FLAGS="-std=c++11" CXXFLAGS="$CXXFLAGS $CXX11FLAGS" AC_COMPILE_IFELSE([AC_LANG_SOURCE([[#include #include - int main() { + int main(int argc, char** argv) { std::to_string(1); return 0; } ]])], @@ -340,19 +440,103 @@ AC_LANG_POP AM_CONDITIONAL([HAVE_CXX11], [test "x$cxx11_happy" != xno]) +# +# Check for GNU++11 support +# +AC_MSG_CHECKING([gnu++11 support]) +AC_LANG_PUSH([C++]) + +SAVE_CXXFLAGS="$CXXFLAGS" +CXX11FLAGS="-std=gnu++11" +CXXFLAGS="$CXXFLAGS $CXX11FLAGS" +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[#include + #include + int main(int argc, char** argv) { + int a; + typeof(a) b = 0; + std::to_string(1); + return 0; + } ]])], + [AC_MSG_RESULT([yes]) + AC_SUBST([CXX11FLAGS]) + gnuxx11_happy=yes], + [AC_MSG_RESULT([no]) + gnuxx11_happy=no]) +CXXFLAGS="$SAVE_CXXFLAGS" +AM_CONDITIONAL([HAVE_GNUXX11], [test "x$gnuxx11_happy" != xno]) + +AC_CHECK_DECL(_GLIBCXX_NOTHROW, have_glibcxx_nothrow=yes, + have_glibcxx_nothrow=no, [[#include ]]) +AM_CONDITIONAL([HAVE_GLIBCXX_NOTHROW], [test "x$have_glibcxx_nothrow" = xyes]) + +AC_LANG_POP + + +# +# PGI specific switches +# +# --diag_suppress 181 - Suppress incorrect printf format for 
PGI18 compiler. TODO: remove it after compiler fix +# --diag_suppress 1215 - Suppress deprecated API warning for PGI18 compiler +# --diag_suppress 1901 - Use of a const variable in a constant expression is nonstandard in C +ADD_COMPILER_FLAGS_IF_SUPPORTED([[--display_error_number], + [--diag_suppress 181], + [--diag_suppress 1215], + [--diag_suppress 1901]], + [AC_LANG_SOURCE([[int main(int argc, char **argv){return 0;}]])]) + + +# +# Check if "-pedantic" flag is supported +# +CHECK_COMPILER_FLAG([-pedantic], [-pedantic], + [AC_LANG_SOURCE([[int main(int argc, char** argv){return 0;}]])], + [CFLAGS_PEDANTIC="$CFLAGS_PEDANTIC -pedantic"], + []) + + +# +# Add strict compilation flags +# +ADD_COMPILER_FLAGS_IF_SUPPORTED([[-Wno-missing-field-initializers], + [-Wno-unused-parameter], + [-Wno-unused-label], + [-Wno-long-long], + [-Wno-endif-labels], + [-Wno-sign-compare], + [-Wno-multichar], + [-Wno-deprecated-declarations], + [-Winvalid-pch]], + [AC_LANG_SOURCE([[int main(int argc, char **argv){return 0;}]])]) + + # # Set C++ optimization/debug flags to be the same as for C # BASE_CXXFLAGS="$BASE_CFLAGS" -AC_SUBST([BASE_CFLAGS], [$BASE_CFLAGS]) -AC_SUBST([BASE_CXXFLAGS], [$BASE_CXXFLAGS]) + + +# +# Add strict flags supported by C compiler only +# NOTE: This must be done after setting BASE_CXXFLAGS +# +ADD_COMPILER_FLAGS_IF_SUPPORTED([[-Wno-pointer-sign], + [-Werror-implicit-function-declaration], + [-Wno-format-zero-length], + [-Wnested-externs], + [-Wshadow]], + [AC_LANG_SOURCE([[int main(int argc, char **argv){return 0;}]])]) + + +AC_SUBST([BASE_CFLAGS]) +AC_SUBST([BASE_CXXFLAGS]) +AC_SUBST([CFLAGS_PEDANTIC]) + # -# Set common preprocessor flags +# Set common C preprocessor flags # BASE_CPPFLAGS="-DCPU_FLAGS=\"$OPT_CFLAGS\"" BASE_CPPFLAGS="$BASE_CPPFLAGS -I\${abs_top_srcdir}/src" BASE_CPPFLAGS="$BASE_CPPFLAGS -I\${abs_top_builddir}" BASE_CPPFLAGS="$BASE_CPPFLAGS -I\${abs_top_builddir}/src" -AC_MSG_NOTICE([Common preprocessor flags: ${BASE_CPPFLAGS}]) 
AC_SUBST([BASE_CPPFLAGS], [$BASE_CPPFLAGS]) diff --git a/config/m4/cuda.m4 b/config/m4/cuda.m4 index ba7d2e211b5..1862eb6148c 100644 --- a/config/m4/cuda.m4 +++ b/config/m4/cuda.m4 @@ -25,14 +25,14 @@ AS_IF([test "x$cuda_checked" != "xyes"], AS_IF([test -d "$with_cuda/lib64"], [libsuff="64"], [libsuff=""]) ucx_check_cuda_libdir="$with_cuda/lib$libsuff" CUDA_CPPFLAGS="-I$with_cuda/include" - CUDA_LDFLAGS="-L$ucx_check_cuda_libdir"]) + CUDA_LDFLAGS="-L$ucx_check_cuda_libdir -L$ucx_check_cuda_libdir/stubs"]) AS_IF([test ! -z "$with_cuda_libdir" -a "x$with_cuda_libdir" != "xyes"], [ucx_check_cuda_libdir="$with_cuda_libdir" - CUDA_LDFLAGS="-L$ucx_check_cuda_libdir"]) + CUDA_LDFLAGS="-L$ucx_check_cuda_libdir -L$ucx_check_cuda_libdir/stubs"]) - CPPFLAGS+=" $CUDA_CPPFLAGS" - LDFLAGS+=" $CUDA_LDFLAGS" + CPPFLAGS="$CPPFLAGS $CUDA_CPPFLAGS" + LDFLAGS="$LDFLAGS $CUDA_LDFLAGS" # Check cuda header files AC_CHECK_HEADERS([cuda.h cuda_runtime.h], @@ -40,11 +40,11 @@ AS_IF([test "x$cuda_checked" != "xyes"], # Check cuda libraries AS_IF([test "x$cuda_happy" = "xyes"], - [AC_CHECK_LIB([cuda], [cuPointerGetAttribute], - [CUDA_LDFLAGS+=" -lcuda"], [cuda_happy="no"])]) + [AC_CHECK_LIB([cuda], [cuDeviceGetUuid], + [CUDA_LDFLAGS="$CUDA_LDFLAGS -lcuda"], [cuda_happy="no"])]) AS_IF([test "x$cuda_happy" = "xyes"], [AC_CHECK_LIB([cudart], [cudaGetDeviceCount], - [CUDA_LDFLAGS+=" -lcudart"], [cuda_happy="no"])]) + [CUDA_LDFLAGS="$CUDA_LDFLAGS -lcudart"], [cuda_happy="no"])]) CPPFLAGS="$save_CPPFLAGS" LDFLAGS="$save_LDFLAGS" diff --git a/config/m4/gdrcopy.m4 b/config/m4/gdrcopy.m4 index 40ea489785f..537338c6ddb 100644 --- a/config/m4/gdrcopy.m4 +++ b/config/m4/gdrcopy.m4 @@ -38,6 +38,9 @@ AS_IF([test "x$with_gdrcopy" != "xno"], gdrcopy_happy="no"]) ], [gdrcopy_happy="no"]) + AS_IF([test "x$gdrcopy_happy" = "xyes"], + [AC_CHECK_DECLS([gdr_copy_to_mapping], [], [], [#include "gdrapi.h"])]) + CFLAGS="$save_CFLAGS" CPPFLAGS="$save_CPPFLAGS" LDFLAGS="$save_LDFLAGS" @@ -52,6 +55,7 @@ AS_IF([test 
"x$with_gdrcopy" != "xno"], [AC_MSG_ERROR([gdrcopy support is requested but gdrcopy packages cannot be found])], [AC_MSG_WARN([GDR_COPY not found])]) ]) + ], [AC_MSG_WARN([GDR_COPY was explicitly disabled])]) diff --git a/config/m4/java.m4 b/config/m4/java.m4 index 56c3f1397d6..e624519f481 100644 --- a/config/m4/java.m4 +++ b/config/m4/java.m4 @@ -11,8 +11,8 @@ java_happy="no" AC_ARG_WITH([java], [AC_HELP_STRING([--with-java=(PATH)], - [Compile Java UCX (default is no).]) - ], [], [with_java=no]) + [Compile Java UCX (default is guess).]) + ], [], [with_java=guess]) AS_IF([test "x$with_java" != xno], [ @@ -29,9 +29,9 @@ AS_IF([test "x$with_java" != xno], AC_CHECK_PROG(READLINK, readlink, yes) AS_IF([test "x${READLINK}" = xyes], [ - AC_SUBST([JAVA], [$(readlink -f $(type -P java))]) - AC_SUBST([JAVA_HOME], [${JAVA%*/jre*}]) - AC_MSG_WARN([Please set JAVA_HOME=$JAVA_HOME]) + JAVA_BIN_FOLDER=`AS_DIRNAME([$(readlink -f $(type -P javac))])` + JAVA_HOME=`AS_DIRNAME([$JAVA_BIN_FOLDER])` + AC_MSG_NOTICE([Setting JAVA_HOME=$JAVA_HOME]) ], [ AS_IF( @@ -77,6 +77,6 @@ AC_SUBST([JDK], [${java_dir}]) AM_CONDITIONAL([HAVE_JAVA], [test "x$java_happy" != "xno"]) #Set MVN according to whether user has Java and Maven or not AM_COND_IF([HAVE_JAVA], - [AC_SUBST([MVN], ["mvn"])], - [AC_SUBST([MVN], ["true"])] + [AC_SUBST([MVN], ["mvn"]) + build_bindings="${build_bindings}:java"] ) diff --git a/config/m4/mpi.m4 b/config/m4/mpi.m4 index 1b002be9ac3..61a5f8bae90 100644 --- a/config/m4/mpi.m4 +++ b/config/m4/mpi.m4 @@ -3,7 +3,7 @@ # # Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. # -# Copyright (C) Huawei Technologies Co.,Ltd. 2020. ALL RIGHTS RESERVED. +# Copyright (C) Huawei Technologies Co.,Ltd. 2020-2021. ALL RIGHTS RESERVED. # See file LICENSE for terms. 
# @@ -38,8 +38,8 @@ AS_IF([test "x$with_mpi" = xyes], AS_IF([test -n "$MPICC"], [AC_DEFINE([HAVE_MPI], [1], [MPI support]) - mpi_enable=Disabled], - [mpi_enable=Enabled]) + mpi_enable=enabled], + [mpi_enable=disabled]) AM_CONDITIONAL([HAVE_MPI], [test -n "$MPIRUN"]) AM_CONDITIONAL([HAVE_MPICC], [test -n "$MPICC"]) AM_CONDITIONAL([HAVE_MPIRUN], [test -n "$MPIRUN"]) diff --git a/config/m4/rocm.m4 b/config/m4/rocm.m4 index 567e50ed095..60ed5df8f66 100644 --- a/config/m4/rocm.m4 +++ b/config/m4/rocm.m4 @@ -23,6 +23,27 @@ AC_DEFUN([ROCM_PARSE_FLAGS], [AC_MSG_WARN([$arg of $1 not parsed])]) done]) +# ROCM_BUILD_FLAGS(ARG, VAR_LIBS, VAR_LDFLAGS, VAR_CPPFLAGS, VAR_ROOT) +# ---------------------------------------------------------- +# Parse value of ARG into appropriate LIBS, LDFLAGS, and +# CPPFLAGS variables. +AC_DEFUN([ROCM_BUILD_FLAGS], + $4="-I$1/include/hsa -I$1/include" + $3="-L$1/lib -L$1/lib64 -L$1/hsa/lib" + $2="-lhsa-runtime64 -lhsakmt" + $5="$1" +) + +# HIP_BUILD_FLAGS(ARG, VAR_LIBS, VAR_LDFLAGS, VAR_CPPFLAGS) +# ---------------------------------------------------------- +# Parse value of ARG into appropriate LIBS, LDFLAGS, and +# CPPFLAGS variables. +AC_DEFUN([HIP_BUILD_FLAGS], + $4="-D__HIP_PLATFORM_HCC__ -I$1/include/hip -I$1/include" + $3="-L$1/hip/lib -L$1/lib" + $2="-lamdhip64" +) + # # Check for ROCm support # @@ -37,26 +58,26 @@ AC_ARG_WITH([rocm], [with_rocm=guess]) rocm_happy=no +hip_happy=no AS_IF([test "x$with_rocm" != "xno"], [AS_CASE(["x$with_rocm"], [x|xguess|xyes], [AC_MSG_NOTICE([ROCm path was not specified. 
Guessing ...]) - with_rocm=/opt/rocm - ROCM_CPPFLAGS="-I$with_rocm/include/hsa -I$with_rocm/include" - ROCM_LDFLAGS="-L$with_rocm/hsa/lib -L$with_rocm/lib" - ROCM_LIBS="-lhsa-runtime64"], + with_rocm="/opt/rocm" + ROCM_BUILD_FLAGS([$with_rocm], + [ROCM_LIBS], [ROCM_LDFLAGS], [ROCM_CPPFLAGS], [ROCM_ROOT])], [x/*], [AC_MSG_NOTICE([ROCm path given as $with_rocm ...]) - ROCM_CPPFLAGS="-I$with_rocm/include/hsa -I$with_rocm/include" - ROCM_LDFLAGS="-L$with_rocm/hsa/lib -L$with_rocm/lib" - ROCM_LIBS="-lhsa-runtime64"], + ROCM_BUILD_FLAGS([$with_rocm], + [ROCM_LIBS], [ROCM_LDFLAGS], [ROCM_CPPFLAGS], [ROCM_ROOT])], [AC_MSG_NOTICE([ROCm flags given ...]) - ROCM_PARSE_FLAGS([with_rocm], + ROCM_PARSE_FLAGS([$with_rocm], [ROCM_LIBS], [ROCM_LDFLAGS], [ROCM_CPPFLAGS])]) SAVE_CPPFLAGS="$CPPFLAGS" SAVE_LDFLAGS="$LDFLAGS" SAVE_LIBS="$LIBS" + CPPFLAGS="$ROCM_CPPFLAGS $CPPFLAGS" LDFLAGS="$ROCM_LDFLAGS $LDFLAGS" LIBS="$ROCM_LIBS $LIBS" @@ -67,27 +88,53 @@ AS_IF([test "x$with_rocm" != "xno"], AS_IF([test "x$rocm_happy" = xyes], [AC_CHECK_HEADERS([hsa_ext_amd.h], [rocm_happy=yes], [rocm_happy=no])]) AS_IF([test "x$rocm_happy" = xyes], - [AC_SEARCH_LIBS([hsa_init], [hsa-runtime64]) - AS_CASE(["x$ac_cv_search_hsa_init"], - [xnone*], [], - [xno], [rocm_happy=no], - [x-l*], [ROCM_LIBS="$ac_cv_search_hsa_init $ROCM_LIBS"])]) + [AC_CHECK_LIB([hsa-runtime64], [hsa_init], [rocm_happy=yes], [rocm_happy=no])]) + + AS_IF([test "x$rocm_happy" = "xyes"], + [AC_DEFINE([HAVE_ROCM], 1, [Enable ROCM support]) + AC_SUBST([ROCM_CPPFLAGS]) + AC_SUBST([ROCM_LDFLAGS]) + AC_SUBST([ROCM_LIBS]) + AC_SUBST([ROCM_ROOT])], + [AC_MSG_WARN([ROCm not found])]) CPPFLAGS="$SAVE_CPPFLAGS" LDFLAGS="$SAVE_LDFLAGS" LIBS="$SAVE_LIBS" - AS_IF([test "x$rocm_happy" = "xyes"], - [AC_SUBST([ROCM_CPPFLAGS]) - AC_SUBST([ROCM_LDFLAGS]) - AC_SUBST([ROCM_LIBS])], - [AC_MSG_WARN([ROCm not found])]) + HIP_BUILD_FLAGS([$with_rocm], [HIP_LIBS], [HIP_LDFLAGS], [HIP_CPPFLAGS]) + + CPPFLAGS="$HIP_CPPFLAGS $CPPFLAGS" + LDFLAGS="$HIP_LDFLAGS 
$LDFLAGS" + LIBS="$HIP_LIBS $LIBS" + + hip_happy=no + AC_CHECK_LIB([hip_hcc], [hipFree], [AC_MSG_WARN([Please install ROCm-3.7.0 or above])], [hip_happy=yes]) + AS_IF([test "x$hip_happy" = xyes], + [AC_CHECK_HEADERS([hip_runtime.h], [hip_happy=yes], [hip_happy=no])]) + AS_IF([test "x$hip_happy" = xyes], + [AC_CHECK_LIB([amdhip64], [hipFree], [hip_happy=yes], [hip_happy=no])]) + AS_IF([test "x$hip_happy" = xyes], [HIP_CXXFLAGS="--std=gnu++11"], []) + + CPPFLAGS="$SAVE_CPPFLAGS" + LDFLAGS="$SAVE_LDFLAGS" + LIBS="$SAVE_LIBS" + + AS_IF([test "x$hip_happy" = "xyes"], + [AC_DEFINE([HAVE_HIP], 1, [Enable HIP support]) + AC_SUBST([HIP_CPPFLAGS]) + AC_SUBST([HIP_CXXFLAGS]) + AC_SUBST([HIP_LDFLAGS]) + AC_SUBST([HIP_LIBS])], + [AC_MSG_WARN([HIP Runtime not found])]) + ], [AC_MSG_WARN([ROCm was explicitly disabled])] ) rocm_checked=yes AM_CONDITIONAL([HAVE_ROCM], [test "x$rocm_happy" != xno]) +AM_CONDITIONAL([HAVE_HIP], [test "x$hip_happy" != xno]) ]) diff --git a/config/m4/sysdep.m4 b/config/m4/sysdep.m4 index b1135af123b..9a8d5d8f4c6 100644 --- a/config/m4/sysdep.m4 +++ b/config/m4/sysdep.m4 @@ -5,6 +5,9 @@ # +AC_FUNC_ALLOCA + + # # SystemV shared memory # @@ -12,32 +15,72 @@ AC_CHECK_LIB([rt], [shm_open], [], AC_MSG_ERROR([librt not found])) AC_CHECK_LIB([rt], [timer_create], [], AC_MSG_ERROR([librt not found])) + # # Extended string functions # -AC_CHECK_DECLS([asprintf, strdupa, basename, fmemopen], [], +AC_CHECK_HEADERS([libgen.h]) +AC_CHECK_DECLS([asprintf, basename, fmemopen], [], AC_MSG_ERROR([GNU string extensions not found]), [#define _GNU_SOURCE 1 #include - #include ]) + #include + #ifdef HAVE_LIBGEN_H + #include + #endif + ]) # # CPU-sets # +AC_CHECK_HEADERS([sys/cpuset.h]) AC_CHECK_DECLS([CPU_ZERO, CPU_ISSET], [], AC_MSG_ERROR([CPU_ZERO/CPU_ISSET not found]), [#define _GNU_SOURCE 1 - #include ]) + #include + #include + #ifdef HAVE_SYS_CPUSET_H + #include + #endif + ]) +AC_CHECK_TYPES([cpu_set_t, cpuset_t], [], [], + [#define _GNU_SOURCE 1 + #include + #include + 
#ifdef HAVE_SYS_CPUSET_H + #include + #endif]) + + +# +# Type for sighandler +# +AC_CHECK_TYPES([sighandler_t, __sighandler_t], [], [], + [#define _GNU_SOURCE 1 + #include ]) # # pthread # +AC_CHECK_HEADERS([pthread_np.h]) AC_SEARCH_LIBS(pthread_create, pthread) AC_SEARCH_LIBS(pthread_atfork, pthread) +# +# Misc. Linux-specific functions +# +AC_CHECK_FUNCS([clearenv]) +AC_CHECK_FUNCS([malloc_trim]) +AC_CHECK_FUNCS([memalign]) +AC_CHECK_FUNCS([posix_memalign]) +AC_CHECK_FUNCS([mremap]) +AC_CHECK_FUNCS([sched_setaffinity sched_getaffinity]) +AC_CHECK_FUNCS([cpuset_setaffinity cpuset_getaffinity]) + + # # Route file descriptor signal to specific thread # @@ -107,6 +150,7 @@ AC_ARG_ENABLE([numa], ] ) + # # Malloc hooks # @@ -120,7 +164,7 @@ CHECK_CROSS_COMP([AC_LANG_SOURCE([#include rc = 0; return NULL; } - int main() { + int main(int argc, char** argv) { __malloc_hook = myhook; ptr = malloc(1); return rc; @@ -143,8 +187,105 @@ AC_CHECK_HEADERS([sys/capability.h], [[#include ]])] ) - # # Check for PR_SET_PTRACER # AC_CHECK_DECLS([PR_SET_PTRACER], [], [], [#include ]) + + +# +# ipv6 s6_addr32/__u6_addr32 shortcuts for in6_addr +# ip header structure layout name +# +AC_CHECK_MEMBER(struct in6_addr.s6_addr32, + [AC_DEFINE([HAVE_IN6_ADDR_S6_ADDR32], [1], + [struct in6_addr has s6_addr32 member])], + [], + [#include ]) +AC_CHECK_MEMBER(struct in6_addr.__u6_addr.__u6_addr32, + [AC_DEFINE([HAVE_IN6_ADDR_U6_ADDR32], [1], + [struct in6_addr is BSD-style])], + [], + [#include ]) +AC_CHECK_MEMBER(struct iphdr.daddr.s_addr, + [AC_DEFINE([HAVE_IPHDR_DADDR], [1], + [struct iphdr has daddr member])], + [], + [#include ]) +AC_CHECK_MEMBER(struct ip.ip_dst.s_addr, + [AC_DEFINE([HAVE_IP_IP_DST], [1], + [struct ip has ip_dst member])], + [], + [#include + #include + #include ]) + + +# +# struct sigevent reporting thread id +# +AC_CHECK_MEMBER(struct sigevent._sigev_un._tid, + [AC_DEFINE([HAVE_SIGEVENT_SIGEV_UN_TID], [1], + [struct sigevent has _sigev_un._tid])], + [], + [#include ]) 
+AC_CHECK_MEMBER(struct sigevent.sigev_notify_thread_id, + [AC_DEFINE([HAVE_SIGEVENT_SIGEV_NOTIFY_THREAD_ID], [1], + [struct sigevent has sigev_notify_thread_id])], + [], + [#include ]) + + +# +# sa_restorer is something that only Linux has +# +AC_CHECK_MEMBER(struct sigaction.sa_restorer, + [AC_DEFINE([HAVE_SIGACTION_SA_RESTORER], [1], + [struct sigaction has sa_restorer member])], + [], + [#include ]) + + +# +# epoll vs. kqueue +# +AC_CHECK_HEADERS([sys/epoll.h]) +AC_CHECK_HEADERS([sys/eventfd.h]) +AC_CHECK_HEADERS([sys/event.h]) + + +# +# FreeBSD-specific threading functions +# +AC_CHECK_HEADERS([sys/thr.h]) + + +# +# malloc headers are Linux-specific +# +AC_CHECK_HEADERS([malloc.h]) +AC_CHECK_HEADERS([malloc_np.h]) + + +# +# endianess +# +AC_CHECK_HEADERS([endian.h, sys/endian.h]) + + +# +# Linux-only headers +# +AC_CHECK_HEADERS([linux/mman.h]) +AC_CHECK_HEADERS([linux/ip.h]) +AC_CHECK_HEADERS([linux/futex.h]) + + +# +# Networking headers +# +AC_CHECK_HEADERS([net/ethernet.h], [], [], + [#include ]) +AC_CHECK_HEADERS([netinet/ip.h], [], [], + [#include + #include ]) diff --git a/config/m4/ucg.m4 b/config/m4/ucg.m4 new file mode 100644 index 00000000000..14d2455f015 --- /dev/null +++ b/config/m4/ucg.m4 @@ -0,0 +1,28 @@ +# +# Copyright (C) Huawei Technologies Co., Ltd. 2019. ALL RIGHTS RESERVED. +# Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. +# See file LICENSE for terms. 
+# + +# +# Enable UCG - Group collective operations component +# + +ucg_modules="" + +AC_ARG_ENABLE([ucg], + AS_HELP_STRING([--enable-ucg], + [Enable the group collective operations (experimental component), default: YES]), + [ucg=yes], + [ucg=no]) +AS_IF([test "x$enable_ucg" != xno], + [ucg_modules=":builtin" + m4_sinclude([src/ucg/configure.m4]) # m4_sinclude() silently ignores errors + AC_DEFINE([ENABLE_UCG], [1], + [Enable Groups and collective operations support (UCG)]) + AC_MSG_NOTICE([Building with Groups and collective operations support (UCG)]) + ]) +AS_IF([test -f ${ac_confdir}/src/ucg/Makefile.am], + [AC_SUBST([UCG_SUBDIR], [src/ucg])]) + +AM_CONDITIONAL([HAVE_UCG], [test "x$enable_ucg" != xno]) diff --git a/config/m4/ucs.m4 b/config/m4/ucs.m4 index 60f4bae132f..6722a70d5eb 100644 --- a/config/m4/ucs.m4 +++ b/config/m4/ucs.m4 @@ -43,6 +43,29 @@ AS_IF([test "x$enable_backtrace_detail" = xyes], AC_CHECK_TYPES([struct dl_phdr_info], [], [AC_MSG_WARN([struct dl_phdr_info not defined])];BT=0, [#define _GNU_SOURCE 1 #include ]) + AC_CHECK_DECLS([bfd_get_section_flags, bfd_section_flags, bfd_get_section_vma, bfd_section_vma], + [], [], [#include ]) + + AC_MSG_CHECKING([bfd_section_size API version]) + AC_LANG_PUSH([C]) + SAVE_CFLAGS="$CFLAGS" + AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ + #include + int main(int argc, char** argv) { + asection *sec = malloc(sizeof(*sec)); + bfd_section_size(sec); + free(sec); + return 0; + } ]])], + [AC_MSG_RESULT([1-arg API]) + AC_DEFINE([HAVE_1_ARG_BFD_SECTION_SIZE], [1], + [bfd_section_size 1-arg version])], + [AC_MSG_RESULT([2-args API]) + AC_DEFINE([HAVE_1_ARG_BFD_SECTION_SIZE], [0], + [bfd_section_size 2-args version])]) + CFLAGS="$SAVE_CFLAGS" + AC_LANG_POP([C]) + if test "x$BT" = "x1"; then AC_CHECK_FUNCS([cplus_demangle]) AC_DEFINE([HAVE_DETAILED_BACKTRACE], 1, [Enable detailed backtrace]) @@ -123,7 +146,6 @@ AS_IF([test "x$enable_logging" != xno], [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_INFO], [Highest log level])] ) 
- # # Disable assertions # @@ -143,7 +165,7 @@ AC_MSG_CHECKING([__attribute__((constructor))]) CHECK_CROSS_COMP([AC_LANG_SOURCE([static int rc = 1; static void constructor_test() __attribute__((constructor)); static void constructor_test() { rc = 0; } - int main() { return rc; }])], + int main(int argc, char** argv) { return rc; }])], [AC_MSG_RESULT([yes])], [AC_MSG_ERROR([Cannot continue. Please use compiler that supports __attribute__((constructor))])] @@ -156,7 +178,7 @@ CHECK_CROSS_COMP([AC_LANG_SOURCE([static int rc = 1; AC_ARG_WITH([cache-line-size], [AC_HELP_STRING([--with-cache-line-size=SIZE], [Build UCX with cache line size defined by user. This parameter - overwrites defaulf cache line sizes defines in + overwrites default cache line sizes defines in UCX (x86-64: 64, Power: 128, ARMv8: 64/128). The supported values are: 64, 128])], [], [with_cache_line_size=no]) @@ -201,5 +223,20 @@ case ${host} in AC_DEFINE([HAVE_HW_TIMER], [1], [high-resolution hardware timer disabled]) esac +# +# Enable built-in memcpy +# +AC_ARG_ENABLE([builtin-memcpy], + AS_HELP_STRING([--enable-builtin-memcpy], + [Enable builtin memcpy routine, default: YES]), + [], + [enable_builtin_memcpy=yes]) + +AS_IF([test "x$enable_builtin_memcpy" != xno], + [AS_MESSAGE([enabling builtin memcpy]) + AC_DEFINE([ENABLE_BUILTIN_MEMCPY], [1], [Enable builtin memcpy])], + [AC_DEFINE([ENABLE_BUILTIN_MEMCPY], [0], [Enable builtin memcpy])] + ) + AC_CHECK_FUNCS([__clear_cache], [], []) AC_CHECK_FUNCS([__aarch64_sync_cache_range], [], []) diff --git a/configure.ac b/configure.ac index 5fd1c8de417..28d5c5f8f24 100644 --- a/configure.ac +++ b/configure.ac @@ -4,13 +4,13 @@ # Copyright (C) The University of Tennessee and The University # of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED. # Copyright (C) ARM Ltd. 2016-2019. ALL RIGHTS RESERVED. -# Copyright (C) Huawei Technologies Co.,Ltd. 2020. ALL RIGHTS RESERVED. +# Copyright (C) Huawei Technologies Co.,Ltd. 2020-2021. ALL RIGHTS RESERVED. 
# See file LICENSE for terms. # AC_PREREQ([2.63]) define([ucx_ver_major], 1) -define([ucx_ver_minor], 6) +define([ucx_ver_minor], 9) define([ucx_ver_patch], 0) define([ts], esyscmd([sh -c "date +%Y%m%d%H%M%S"])) @@ -30,10 +30,13 @@ AC_USE_SYSTEM_EXTENSIONS AC_GNU_SOURCE AC_CONFIG_HEADERS([config.h]) -AC_CHECK_PROG(GITBIN,git,yes) +AC_CHECK_PROG(GITBIN, git, yes) AS_IF([test x"${GITBIN}" = x"yes"], - [AC_SUBST(SCM_VERSION, esyscmd([sh -c 'git rev-parse --short=7 HEAD']))], - [AC_SUBST(SCM_VERSION, "0000000")]) + [# remove preceding "refs/heads/" (11 characters) for symbolic ref + AC_SUBST(SCM_BRANCH, esyscmd([sh -c 'git symbolic-ref --quiet HEAD | sed "s/^.\{11\}//"'])) + AC_SUBST(SCM_VERSION, esyscmd([sh -c 'git rev-parse --short=7 HEAD']))], + [AC_SUBST(SCM_BRANCH, "") + AC_SUBST(SCM_VERSION, "0000000")]) AH_TOP([ #ifndef UCX_CONFIG_H @@ -80,7 +83,14 @@ LT_LIB_M AC_C_RESTRICT AC_FUNC_STRERROR_R -PKG_PROG_PKG_CONFIG +AC_PATH_TOOL([PKG_CONFIG], [pkg-config], [pkg-config]) + + +# +# Force link_all_deplibs=yes for libtool, otherwise it will not +# link against dependency libs +# +link_all_deplibs=yes # @@ -126,6 +136,8 @@ AC_DEFINE_UNQUOTED([UCX_MODULE_SUBDIR], ["${modulesubdir}"], [UCX module sub-dir # m4_include([config/m4/ax_prog_doxygen.m4]) m4_include([config/m4/graphviz.m4]) +m4_include([config/m4/ucg.m4]) + AC_ARG_WITH([docs_only], AS_HELP_STRING([--with-docs-only], [Compile only the docs and not the rest of UCX. 
[default=NO]]), @@ -145,7 +157,7 @@ DX_PS_FEATURE(OFF) DX_HTML_FEATURE(ON) DX_MAN_FEATURE(ON) DX_PDF_FEATURE(ON) -DX_INIT_DOXYGEN([UCX],[doc/doxygen/ucxdox],[doc/doxygen-doc]) +DX_INIT_DOXYGEN([UCX],[docs/doxygen/ucxdox],[docs/doxygen-doc]) AS_IF([test "x$with_docs_only" = xyes], [AS_MESSAGE([Documents only requested]) @@ -164,22 +176,25 @@ AS_IF([test "x$with_docs_only" = xyes], AM_CONDITIONAL([HAVE_MLX5_HW], [false]) AM_CONDITIONAL([HAVE_MLX5_HW_UD], [false]) AM_CONDITIONAL([HAVE_MLX5_DV], [false]) + AM_CONDITIONAL([HAVE_HNS_ROCE], [false]) + AM_CONDITIONAL([HAVE_DEVX], [false]) + AM_CONDITIONAL([HAVE_EXP], [false]) AM_CONDITIONAL([HAVE_TL_RC], [false]) AM_CONDITIONAL([HAVE_TL_DC], [false]) AM_CONDITIONAL([HAVE_DC_DV], [false]) AM_CONDITIONAL([HAVE_DC_EXP], [false]) AM_CONDITIONAL([HAVE_TL_UD], [false]) AM_CONDITIONAL([HAVE_TL_CM], [false]) - AM_CONDITIONAL([HAVE_HNS_ROCE], [false]) - AM_CONDITIONAL([HAVE_IBV_EX_HW_TM], [false]) AM_CONDITIONAL([HAVE_CRAY_UGNI], [false]) AM_CONDITIONAL([HAVE_CUDA], [false]) AM_CONDITIONAL([HAVE_GDR_COPY], [false]) AM_CONDITIONAL([HAVE_ROCM], [false]) + AM_CONDITIONAL([HAVE_HIP], [false]) AM_CONDITIONAL([HAVE_XPMEM], [false]) AM_CONDITIONAL([HAVE_CMA], [false]) AM_CONDITIONAL([HAVE_KNEM], [false]) AM_CONDITIONAL([HAVE_RDMACM], [false]) + AM_CONDITIONAL([HAVE_RDMACM_QP_LESS], [false]) AM_CONDITIONAL([HAVE_MPI], [false]) AM_CONDITIONAL([HAVE_MPIRUN], [false]) AM_CONDITIONAL([HAVE_MPICC], [false]) @@ -187,15 +202,18 @@ AS_IF([test "x$with_docs_only" = xyes], AM_CONDITIONAL([HAVE_UCM_PTMALLOC286], [false]) AM_CONDITIONAL([HAVE_JAVA], [false]) AM_CONDITIONAL([HAVE_CXX11], [false]) + AM_CONDITIONAL([HAVE_GNUXX11], [false]) + AM_CONDITIONAL([HAVE_GLIBCXX_NOTHROW], [false]) AM_CONDITIONAL([HAVE_TCMALLOC], [false]) AM_CONDITIONAL([ENABLE_EXPERIMENTAL_API], [false]) AM_CONDITIONAL([INSTALL_DEVEL_HEADERS], [false]) AM_CONDITIONAL([HAVE_EXAMPLES], [false]) - AM_CONDITIONAL([HAVE_UCG], [true]) # docs include UCG headers + 
AM_CONDITIONAL([HAVE_AARCH64_THUNDERX2], [false]) + AM_CONDITIONAL([HAVE_AARCH64_THUNDERX1], [false]) + AM_CONDITIONAL([HAVE_AARCH64_HI1620], [false]) ], [ AM_CONDITIONAL([DOCS_ONLY], [false]) - AM_CONDITIONAL([HAVE_UCG], [true]) m4_include([config/m4/compiler.m4]) m4_include([config/m4/sysdep.m4]) m4_include([config/m4/ucs.m4]) @@ -208,25 +226,9 @@ AS_IF([test "x$with_docs_only" = xyes], m4_include([config/m4/gdrcopy.m4]) m4_include([src/ucm/configure.m4]) m4_include([src/uct/configure.m4]) - m4_include([src/ucg/configure.m4]) m4_include([src/tools/perf/configure.m4]) m4_include([test/gtest/configure.m4]) - # - # Compile code with frame pointer. Optimizations usually omit the frame pointer, - # but if we are profiling the code with callgraph we need it. - # This option may affect perofrmance so it is off by default. - # - AC_ARG_ENABLE([frame-pointer], - AS_HELP_STRING([--enable-frame-pointer], - [Compile with frame pointer, useful for profiling, default: NO]), - [], - [enable_frame_pointer=no]) - AS_IF([test "x$enable_frame_pointer" = xyes], - [AS_MESSAGE([compiling with frame pointer]) - BASE_CFLAGS="$BASE_CFLAGS -fno-omit-frame-pointer"], - [:]) - # # Enable fault injection code @@ -248,7 +250,7 @@ AS_IF([test "x$with_docs_only" = xyes], AC_ARG_ENABLE([params-check], AS_HELP_STRING([--disable-params-check], [Disable checking user parameters passed to API, default: NO]), - [], + [], [enable_params_check=yes]) AS_IF([test "x$enable_params_check" = xyes], [AC_DEFINE([ENABLE_PARAMS_CHECK], [1], [Enable checking user parameters])], @@ -260,7 +262,7 @@ AS_IF([test "x$with_docs_only" = xyes], AC_ARG_ENABLE([debug-data], AS_HELP_STRING([--enable-debug-data], [Enable collecting data to ease debugging, default: NO]), - [], + [], [enable_debug_data=no]) AS_IF([test "x$enable_debug_data" = xyes], [AC_DEFINE([ENABLE_DEBUG_DATA], [1], [Enable collecting data]) @@ -280,9 +282,9 @@ AS_IF([test "x$with_docs_only" = xyes], [enable_mt=no]) AS_IF([test "x$enable_mt" = xyes], 
[AC_DEFINE([ENABLE_MT], [1], [Enable thread support in UCP and UCT]) - mt_enable=Enabled], + mt_enable=enabled], [AC_DEFINE([ENABLE_MT], [0]) - mt_enable=Disabled]) + mt_enable=disabled]) # @@ -301,9 +303,9 @@ AS_IF([test "x$with_docs_only" = xyes], # AC_ARG_ENABLE([devel-headers], AS_HELP_STRING([--enable-devel-headers], - [Enable installing development headers, default: NO]) + [Enable installing development headers, default: NO]), [], - [enable_debug_headers=no]) + [enable_devel_headers=no]) AM_CONDITIONAL([INSTALL_DEVEL_HEADERS], [test "x$enable_devel_headers" = "xyes"]) @@ -322,32 +324,26 @@ AS_IF([test "x$with_docs_only" = xyes], [], [enable_examples=no]) AM_CONDITIONAL([HAVE_EXAMPLES], [test "x$enable_examples" = "xyes"]) - - # - # UCG - Group collective operations component (Always enabled in this version) - # - AC_DEFINE([ENABLE_UCG], [1], [Enable Groups and collective operations support (UCG)]) ]) # Docs only # -# Print which transports and group topologies are built +# Print which transports are built # build_modules="${uct_modules}" -build_modules+="${uct_ib_modules}" -build_modules+="${uct_cuda_modules}" -build_modules+="${ucm_modules}" -build_modules+="${ucx_perftest_modules}" -build_modules+="${uct_rocm_modules}" -AS_IF([test "x$enable_ucg" != "xno"], - [AC_MSG_NOTICE([Supported group modules: $ucg_modules]) - build_modules+="${ucg_modules}"],[]) +build_modules="${build_modules}${uct_ib_modules}" +build_modules="${build_modules}${uct_cuda_modules}" +build_modules="${build_modules}${ucm_modules}" +build_modules="${build_modules}${ucx_perftest_modules}" +build_modules="${build_modules}${uct_rocm_modules}" +build_modules="${build_modules}${ucg_modules}" AC_SUBST([build_modules], [${build_modules}]) +AC_SUBST([build_bindings], [${build_bindings}]) # # Final output # AC_CONFIG_FILES([Makefile - doc/doxygen/header.tex + docs/doxygen/header.tex src/uct/api/version.h ]) AS_IF([test "x$with_docs_only" = xyes], [], [ @@ -367,15 +363,17 @@ AC_CONFIG_FILES([ 
src/ucp/Makefile src/ucp/api/ucp_version.h src/ucp/core/ucp_version.c - src/ucg/api/ucg_version.h - src/ucg/base/ucg_version.c src/tools/info/Makefile src/tools/profile/Makefile test/apps/Makefile + test/apps/iodemo/Makefile test/apps/sockaddr/Makefile - test/examples/Makefile + test/apps/profiling/Makefile test/mpi/Makefile + bindings/java/Makefile + bindings/java/pom.xml bindings/java/src/main/native/Makefile + examples/Makefile ]) AC_CONFIG_FILES([test/mpi/run_mpi.sh], [chmod a+x test/mpi/run_mpi.sh]) @@ -395,14 +393,21 @@ AC_MSG_NOTICE([Building documents only]) ], [ AC_MSG_NOTICE([UCX build configuration:]) +AC_MSG_NOTICE([ Build prefix: ${prefix}]) +AC_MSG_NOTICE([Preprocessor flags: ${BASE_CPPFLAGS}]) +AC_MSG_NOTICE([ C compiler: ${CC} ${BASE_CFLAGS}]) +AC_MSG_NOTICE([ C++ compiler: ${CXX} ${BASE_CXXFLAGS}]) AC_MSG_NOTICE([ Multi-thread: ${mt_enable}]) AC_MSG_NOTICE([ MPI tests: ${mpi_enable}]) AC_MSG_NOTICE([ Devel headers: ${enable_devel_headers}]) -AC_MSG_NOTICE([ UCT modules: <$(echo $uct_modules|tr ':' ' ') >]) -AC_MSG_NOTICE([ CUDA modules: <$(echo $uct_cuda_modules|tr ':' ' ') >]) -AC_MSG_NOTICE([ ROCM modules: <$(echo $uct_rocm_modules|tr ':' ' ') >]) -AC_MSG_NOTICE([ IB modules: <$(echo $uct_ib_modules|tr ':' ' ') >]) -AC_MSG_NOTICE([ UCM modules: <$(echo $ucm_modules|tr ':' ' ') >]) -AC_MSG_NOTICE([ Perf modules: <$(echo $ucx_perftest_modules|tr ':' ' ') >]) -AC_MSG_NOTICE([ UCG modules: <$(echo $ucg_modules|tr ':' ' ') >])], []) +AC_MSG_NOTICE([ Bindings: <$(echo ${build_bindings}|tr ':' ' ') >]) +AC_MSG_NOTICE([ UCT modules: <$(echo ${uct_modules}|tr ':' ' ') >]) +AC_MSG_NOTICE([ CUDA modules: <$(echo ${uct_cuda_modules}|tr ':' ' ') >]) +AC_MSG_NOTICE([ ROCM modules: <$(echo ${uct_rocm_modules}|tr ':' ' ') >]) +AC_MSG_NOTICE([ IB modules: <$(echo ${uct_ib_modules}|tr ':' ' ') >]) +AC_MSG_NOTICE([ UCM modules: <$(echo ${ucm_modules}|tr ':' ' ') >]) +AC_MSG_NOTICE([ Perf modules: <$(echo ${ucx_perftest_modules}|tr ':' ' ') >]) +AS_IF([test 
"x$enable_ucg" != "xno"], [ + AC_MSG_NOTICE([ UCG modules: <$(echo ${ucg_modules}|tr ':' ' ') >])]) +]) AC_MSG_NOTICE([=========================================================]) diff --git a/contrib/buildrpm.sh b/contrib/buildrpm.sh index 815af795c64..46d4187b8fb 100755 --- a/contrib/buildrpm.sh +++ b/contrib/buildrpm.sh @@ -4,15 +4,17 @@ PACKAGE=ucx WS=$PWD rpmspec=${PACKAGE}.spec -rpmmacros="--define='_rpmdir ${WS}/rpm-dist' --define='_srcrpmdir ${WS}/rpm-dist' --define='_sourcedir ${WS}' --define='_specdir ${WS}' --define='_builddir ${WS}'" -rpmopts="--nodeps --buildroot='${WS}/_rpm'" +rpmmacros="--define='_rpmdir ${WS}/rpm-dist' --define='_srcrpmdir ${WS}/rpm-dist' --define='_sourcedir ${WS}' --define='_specdir ${WS}' --define='_builddir ${WS}'" +rpmopts="--buildroot='${WS}/_rpm'" opt_tarball=0 opt_srcrpm=0 opt_binrpm=0 opt_no_dist=0 +opt_no_deps=0 +opt_strict_ibverb_dep=0 defines="" while test "$1" != ""; do @@ -21,7 +23,9 @@ while test "$1" != ""; do --srcrpm|-s) opt_srcrpm=1 ;; --binrpm|-b) opt_binrpm=1 ;; --no-dist) opt_no_dist=1 ;; + --nodeps) opt_no_deps=1 ;; --define|-d) defines="$defines --define '$2'"; shift ;; + --strict-ibverbs-dep) opt_strict_ibverb_dep=1 ;; *) cat < Add a define to rpmbuild - +--strict-ibverbs-dep Add RPM "Requires: libibverbs == VER-RELEASE" (libibverbs has to be installed) EOF exit 1 @@ -46,6 +51,15 @@ if [ $opt_no_dist -eq 1 ]; then rpmmacros="$rpmmacros '--undefine=dist'" fi +if [ $opt_strict_ibverb_dep -eq 1 ]; then + libibverbs_ver=$(rpm -q libibverbs --qf '%{version}-%{release}') + rpmmacros="${rpmmacros} --define='extra_deps libibverbs == ${libibverbs_ver}'" +fi + +if [ $opt_no_deps -eq 1 ]; then + rpmopts="$rpmopts --nodeps" +fi + mkdir -p rpm-dist if [ $opt_tarball -eq 1 ]; then @@ -72,7 +86,8 @@ if [ $opt_binrpm -eq 1 ]; then with_arg() { module=$1 with_arg=${2:-$module} - if echo ${build_modules} | tr ':' '\n' | grep -q "^${module}$" + if (echo ${build_modules} | tr ':' '\n' | grep -q "^${module}$") || + (echo 
${build_bindings} | tr ':' '\n' | grep -q "^${module}$") then echo "--with ${with_arg}" else @@ -91,7 +106,7 @@ if [ $opt_binrpm -eq 1 ]; then with_args+=" $(with_arg rocm)" with_args+=" $(with_arg ugni)" with_args+=" $(with_arg xpmem)" + with_args+=" $(with_arg java)" echo rpmbuild -bb $rpmmacros $rpmopts $rpmspec $defines $with_args | bash -eEx fi - diff --git a/contrib/configure-devel b/contrib/configure-devel index f741ab135ce..7525e93d7da 100755 --- a/contrib/configure-devel +++ b/contrib/configure-devel @@ -21,6 +21,5 @@ $basedir/../configure \ --enable-memtrack \ --enable-fault-injection \ --enable-debug-data \ - --enable-devel-headers \ --enable-mt \ "$@" diff --git a/contrib/configure-opt b/contrib/configure-opt index a46893e2e9a..7c752555348 100755 --- a/contrib/configure-opt +++ b/contrib/configure-opt @@ -1,19 +1,16 @@ #!/bin/sh # -# Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. -# Copyright (C) Huawei Technologies Co.,Ltd. 2020. ALL RIGHTS RESERVED. +# Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. # # See file LICENSE for terms. # # -# UCX build for maximal performance using specific CPU. +# UCX build for maximal performance, use non-portable CPU optimizations. # No extra debugging or profiling code. # basedir=$(cd $(dirname $0) && pwd) $basedir/configure-release \ - --with-avx \ - --with-clwb \ - --with-sse41 \ + --enable-optimizations \ "$@" diff --git a/contrib/configure-release b/contrib/configure-release index 070e14f3e44..c0f44408056 100755 --- a/contrib/configure-release +++ b/contrib/configure-release @@ -2,19 +2,18 @@ # # Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. # +# Copyright (C) Huawei Technologies Co.,Ltd. 2020-2021. ALL RIGHTS RESERVED. # See file LICENSE for terms. # # -# UCX build for maximal performance. +# UCX build for maximal performance while maintaining portability. # No extra debugging or profiling code. 
# basedir=$(cd $(dirname $0) && pwd) $basedir/../configure \ - --enable-optimizations \ --disable-logging \ --disable-debug \ --disable-assertions \ - --disable-params-check \ "$@" diff --git a/contrib/rpmdef.sh.in b/contrib/rpmdef.sh.in index d0d6a3bc58e..50925e5b797 100644 --- a/contrib/rpmdef.sh.in +++ b/contrib/rpmdef.sh.in @@ -1 +1,2 @@ build_modules="@build_modules@" +build_bindings="@build_bindings@" diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh index c9520e6161a..23fbfb1e173 100755 --- a/contrib/test_jenkins.sh +++ b/contrib/test_jenkins.sh @@ -9,12 +9,14 @@ # # # Environment variables set by Jenkins CI: -# - WORKSPACE : path to work dir -# - BUILD_NUMBER : jenkins build number -# - JOB_URL : jenkins job url -# - EXECUTOR_NUMBER : number of executor within the test machine -# - JENKINS_RUN_TESTS : whether to run unit tests -# - JENKINS_TEST_PERF : whether to validate performance +# - WORKSPACE : path to work dir +# - BUILD_NUMBER : jenkins build number +# - JOB_URL : jenkins job url +# - EXECUTOR_NUMBER : number of executor within the test machine +# - JENKINS_RUN_TESTS : whether to run unit tests +# - RUN_TESTS : same as JENKINS_RUN_TESTS, but for Azure +# - JENKINS_TEST_PERF : whether to validate performance +# - JENKINS_NO_VALGRIND : set this to disable valgrind tests # # Optional environment variables (could be set by job configuration): # - nworkers : number of parallel executors @@ -24,6 +26,8 @@ WORKSPACE=${WORKSPACE:=$PWD} ucx_inst=${WORKSPACE}/install +CUDA_MODULE="dev/cuda11.0" +GDRCOPY_MODULE="dev/gdrcopy2.0_cuda11.0" if [ -z "$BUILD_NUMBER" ]; then echo "Running interactive" @@ -42,7 +46,9 @@ fi # -# Set affinity to 2 cores according to Jenkins executor number +# Set affinity to 2 cores according to Jenkins executor number. +# Affinity is inherited from agent in Azure CI. +# TODO: remove or rename after CI migration. 
# if [ -n "$EXECUTOR_NUMBER" ] && [ -n "$JENKINS_RUN_TESTS" ] then @@ -52,10 +58,19 @@ else fi # -# Build command runs with 10 tasks +# Parallel build command runs with 4 tasks, or number of cores on the system, +# whichever is lowest # +num_cpus=$(lscpu -p | grep -v '^#' | wc -l) +[ -z $num_cpus ] && num_cpus=1 +parallel_jobs=4 +[ $parallel_jobs -gt $num_cpus ] && parallel_jobs=$num_cpus +num_pinned_threads=$(nproc) +[ $parallel_jobs -gt $num_pinned_threads ] && parallel_jobs=$num_pinned_threads + MAKE="make" -MAKEP="make -j10" +MAKEP="make -j${parallel_jobs}" +export AUTOMAKE_JOBS=$parallel_jobs # @@ -68,6 +83,13 @@ then fi echo "==== Running on $(hostname), worker $worker / $nworkers ====" +# +# cleanup ucx +# +make_clean() { + rm -rf ${ucx_inst} + $MAKEP ${1:-clean} +} # # Test if an environment module exists and load it if yes. @@ -89,23 +111,33 @@ module_load() { fi } +# +# Safe unload for env modules (even if it doesn't exist) +# +module_unload() { + module=$1 + module unload "${module}" || true +} + # # try load cuda modules if nvidia driver is installed # try_load_cuda_env() { num_gpus=0 have_cuda=no + have_gdrcopy=no if [ -f "/proc/driver/nvidia/version" ]; then have_cuda=yes - module_load dev/cuda || have_cuda=no - module_load dev/gdrcopy || have_cuda=no + have_gdrcopy=yes + module_load $CUDA_MODULE || have_cuda=no + module_load $GDRCOPY_MODULE || have_gdrcopy=no num_gpus=$(nvidia-smi -L | wc -l) fi } unload_cuda_env() { - module unload dev/cuda - module unload dev/gdrcopy + module_unload $CUDA_MODULE + module_unload $GDRCOPY_MODULE } # @@ -174,6 +206,59 @@ get_active_ib_devices() { done } +# +# Get list of active IP interfaces +# +get_active_ip_ifaces() { + device_list=$(ip addr | awk '/state UP/ {print $2}' | sed s/://) + for netdev in ${device_list} + do + (ip addr show ${netdev} | grep -q 'inet ') && echo ${netdev} || true + done +} + +# +# Get IP addr for a given IP iface +# Argument is the IP iface +# +get_ifaddr() { + iface=$1 + echo $(ip addr 
show ${iface} | awk '/inet /{print $2}' | awk -F '/' '{print $1}') +} + +get_rdma_device_ip_addr() { + if [ ! -r /dev/infiniband/rdma_cm ] + then + return + fi + + if ! which ibdev2netdev >&/dev/null + then + return + fi + + iface=`ibdev2netdev | grep Up | awk '{print $5}' | head -1` + if [ -n "$iface" ] + then + ipaddr=$(get_ifaddr ${iface}) + fi + + if [ -z "$ipaddr" ] + then + # if there is no inet (IPv4) address, escape + return + fi + + ibdev=`ibdev2netdev | grep $iface | awk '{print $1}'` + node_guid=`cat /sys/class/infiniband/$ibdev/node_guid` + if [ $node_guid == "0000:0000:0000:0000" ] + then + return + fi + + echo $ipaddr +} + # # Prepare build environment # @@ -181,8 +266,12 @@ prepare() { echo " ==== Prepare ====" env cd ${WORKSPACE} + if [ -d build-test ] + then + chmod u+rwx build-test -R + rm -rf build-test + fi ./autogen.sh - rm -rf build-test mkdir -p build-test cd build-test } @@ -212,9 +301,9 @@ build_docs() { then echo " ==== Build docs only ====" ../configure --prefix=$ucx_inst --with-docs-only - $MAKEP clean + make_clean $MAKE docs - $MAKEP clean # FIXME distclean does not work with docs-only + make_clean # FIXME distclean does not work with docs-only fi } @@ -240,9 +329,9 @@ build_java_docs() { build_no_verbs() { echo "==== Build without IB verbs ====" ../contrib/configure-release --prefix=$ucx_inst --without-verbs - $MAKEP clean + make_clean $MAKEP - $MAKEP distclean + make_clean distclean } # @@ -251,9 +340,9 @@ build_no_verbs() { build_disable_numa() { echo "==== Check --disable-numa compilation option ====" ../contrib/configure-release --prefix=$ucx_inst --disable-numa - $MAKEP clean + make_clean $MAKEP - $MAKEP distclean + make_clean distclean } # @@ -262,12 +351,12 @@ build_disable_numa() { build_release_pkg() { echo "==== Build release ====" ../contrib/configure-release - $MAKEP clean + make_clean $MAKEP $MAKEP distcheck # Show UCX info - ./src/tools/info/ucx_info -s -f -c -v -y -d -b -p -w -e -uart + ./src/tools/info/ucx_info -s -f -c 
-v -y -d -b -p -w -e -uart -m 20M if [ -f /etc/redhat-release -o -f /etc/fedora-release ]; then rpm_based=yes @@ -290,7 +379,7 @@ build_release_pkg() { dpkg-buildpackage -us -uc else echo "==== Build RPM ====" - ../contrib/buildrpm.sh -s -b + ../contrib/buildrpm.sh -s -b --nodeps --define "_topdir $PWD" fi # check that UCX version is present in spec file @@ -303,7 +392,7 @@ build_release_pkg() { fi cd - - $MAKEP distclean + make_clean distclean } # @@ -315,15 +404,49 @@ build_icc() { then echo "==== Build with Intel compiler ====" ../contrib/configure-devel --prefix=$ucx_inst CC=icc CXX=icpc - $MAKEP clean + make_clean + $MAKEP + make_clean distclean + echo "==== Build with Intel compiler (clang) ====" + ../contrib/configure-devel --prefix=$ucx_inst CC=clang CXX=clang++ + make_clean $MAKEP - $MAKEP distclean + make_clean distclean echo "ok 1 - build successful " >> build_icc.tap else echo "==== Not building with Intel compiler ====" - echo "ok 1 - # SKIP because Coverity not installed" >> build_icc.tap + echo "ok 1 - # SKIP because Intel compiler not installed" >> build_icc.tap fi - module unload intel/ics + module_unload intel/ics +} + +# +# Build with PGI compiler +# +build_pgi() { + echo 1..1 > build_pgi.tap + pgi_test_file=$(mktemp ./XXXXXX).c + echo "int main() {return 0;}" > ${pgi_test_file} + + if module_load pgi/latest && pgcc18 --version && pgcc18 ${pgi_test_file} -o ${pgi_test_file}.out + then + echo "==== Build with PGI compiler ====" + # PGI failed to build valgrind headers, disable it for now + # TODO: Using non-default PGI compiler - pgcc18 which is going to be default + # in next versions. 
+ # Switch to default CC compiler after pgcc18 is default for pgi module + ../contrib/configure-devel --prefix=$ucx_inst CC=pgcc18 --without-valgrind + make_clean + $MAKEP + make_clean distclean + echo "ok 1 - build successful " >> build_pgi.tap + else + echo "==== Not building with PGI compiler ====" + echo "ok 1 - # SKIP because PGI compiler not installed" >> build_pgi.tap + fi + + rm -rf ${pgi_test_file} ${pgi_test_file}.out + module_unload pgi/latest } # @@ -332,77 +455,89 @@ build_icc() { build_debug() { echo "==== Build with --enable-debug option ====" ../contrib/configure-devel --prefix=$ucx_inst --enable-debug --enable-examples - $MAKEP clean + make_clean + $MAKEP + make_clean distclean +} + +# +# Build prof +# +build_prof() { + echo "==== Build configure-prof ====" + ../contrib/configure-prof --prefix=$ucx_inst + make_clean $MAKEP - $MAKEP distclean + make_clean distclean } # # Build UGNI # build_ugni() { - echo 1..1 > build_ugni.tap - - echo "==== Build with cray-ugni ====" - # - # Point pkg-config to contrib/cray-ugni-mock, and replace - # PKG_CONFIG_TOP_BUILD_DIR with source dir, since the mock .pc files contain - # relative paths. - # - ../contrib/configure-devel --prefix=$ucx_inst --with-ugni \ - PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$PWD/../contrib/cray-ugni-mock \ - PKG_CONFIG_TOP_BUILD_DIR=$PWD/.. - $MAKEP clean - $MAKEP - - # make sure UGNI transport is enabled - grep '#define HAVE_TL_UGNI 1' config.h - - $MAKE distcheck - $MAKEP distclean - - module unload dev/cray-ugni - echo "ok 1 - build successful " >> build_ugni.tap + echo 1..1 > build_ugni.tap + + echo "==== Build with cray-ugni ====" + # + # Point pkg-config to contrib/cray-ugni-mock, and replace + # PKG_CONFIG_TOP_BUILD_DIR with source dir, since the mock .pc files contain + # relative paths. + # + ../contrib/configure-devel --prefix=$ucx_inst --with-ugni \ + PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$PWD/../contrib/cray-ugni-mock \ + PKG_CONFIG_TOP_BUILD_DIR=$PWD/.. 
+ make_clean + $MAKEP + + # make sure UGNI transport is enabled + grep '#define HAVE_TL_UGNI 1' config.h + + $MAKE distcheck + make_clean distclean + + module_unload dev/cray-ugni + echo "ok 1 - build successful " >> build_ugni.tap } # # Build CUDA # build_cuda() { - echo 1..1 > build_cuda.tap - if module_load dev/cuda - then - if module_load dev/gdrcopy - then - echo "==== Build with enable cuda, gdr_copy ====" - ../contrib/configure-devel --prefix=$ucx_inst --with-cuda --with-gdrcopy - $MAKEP clean - $MAKEP - $MAKEP distclean - - ../contrib/configure-release --prefix=$ucx_inst --with-cuda --with-gdrcopy - $MAKEP clean - $MAKEP - $MAKEP distclean - module unload dev/gdrcopy - fi - - echo "==== Build with enable cuda, w/o gdr_copy ====" - ../contrib/configure-devel --prefix=$ucx_inst --with-cuda --without-gdrcopy - $MAKEP clean - $MAKEP - - module unload dev/cuda - - echo "==== Running test_link_map with cuda build but no cuda module ====" - env UCX_HANDLE_ERRORS=bt ./test/apps/test_link_map - - $MAKEP distclean - echo "ok 1 - build successful " >> build_cuda.tap - else - echo "==== Not building with cuda flags ====" - echo "ok 1 - # SKIP because cuda not installed" >> build_cuda.tap - fi + echo 1..1 > build_cuda.tap + if module_load $CUDA_MODULE + then + if module_load $GDRCOPY_MODULE + then + echo "==== Build with enable cuda, gdr_copy ====" + ../contrib/configure-devel --prefix=$ucx_inst --with-cuda --with-gdrcopy + make_clean + $MAKEP + make_clean distclean + + ../contrib/configure-release --prefix=$ucx_inst --with-cuda --with-gdrcopy + make_clean + $MAKEP + make_clean distclean + module unload $GDRCOPY_MODULE + fi + + echo "==== Build with enable cuda, w/o gdr_copy ====" + ../contrib/configure-devel --prefix=$ucx_inst --with-cuda --without-gdrcopy + make_clean + $MAKEP + + module unload $CUDA_MODULE + + echo "==== Running test_link_map with cuda build but no cuda module ====" + env UCX_HANDLE_ERRORS=bt ./test/apps/test_link_map + + make_clean distclean + echo 
"ok 1 - build successful " >> build_cuda.tap + else + echo "==== Not building with cuda flags ====" + echo "ok 1 - # SKIP because cuda not installed" >> build_cuda.tap + fi + unload_cuda_env } # @@ -414,11 +549,11 @@ build_clang() { then echo "==== Build with clang compiler ====" ../contrib/configure-devel --prefix=$ucx_inst CC=clang CXX=clang++ - $MAKEP clean + make_clean $MAKEP $MAKEP install UCX_HANDLE_ERRORS=bt,freeze UCX_LOG_LEVEL_TRIGGER=ERROR $ucx_inst/bin/ucx_info -d - $MAKEP distclean + make_clean distclean echo "ok 1 - build successful " >> build_clang.tap else echo "==== Not building with clang compiler ====" @@ -443,12 +578,13 @@ build_gcc_latest() { then echo "==== Build with GCC compiler ($(gcc --version|head -1)) ====" ../contrib/configure-devel --prefix=$ucx_inst - $MAKEP clean + make_clean $MAKEP $MAKEP install UCX_HANDLE_ERRORS=bt,freeze UCX_LOG_LEVEL_TRIGGER=ERROR $ucx_inst/bin/ucx_info -d - $MAKEP distclean + make_clean distclean echo "ok 1 - build successful " >> build_gcc_latest.tap + module unload dev/gcc-latest else echo "==== Not building with latest gcc compiler ====" echo "ok 1 - # SKIP because dev/gcc-latest module is not available" >> build_gcc_latest.tap @@ -467,14 +603,14 @@ build_experimental_api() { # Experimental header file should not be installed by regular build echo "==== Install WITHOUT experimental API ====" ../contrib/configure-release --prefix=$ucx_inst - $MAKEP clean + make_clean $MAKEP install ! 
test -e $ucx_inst/include/ucp/api/ucpx.h # Experimental header file should be installed by --enable-experimental-api echo "==== Install WITH experimental API ====" ../contrib/configure-release --prefix=$ucx_inst --enable-experimental-api - $MAKEP clean + make_clean $MAKEP install test -e $ucx_inst/include/ucp/api/ucpx.h } @@ -488,10 +624,10 @@ build_jucx() { then echo "==== Building JUCX bindings (java api for ucx) ====" ../contrib/configure-release --prefix=$ucx_inst --with-java - $MAKEP clean + make_clean $MAKEP $MAKEP install - $MAKEP distclean + make_clean distclean echo "ok 1 - build successful " >> build_jucx.tap module unload dev/jdk module unload dev/mvn @@ -505,22 +641,26 @@ build_jucx() { # Build with armclang compiler # build_armclang() { - echo 1..1 > build_armclang.tap - if module_load arm-compiler/latest - then - echo "==== Build with armclang compiler ====" - ../contrib/configure-devel --prefix=$ucx_inst CC=armclang CXX=armclang++ - $MAKEP clean - $MAKEP - $MAKEP install - UCX_HANDLE_ERRORS=bt,freeze UCX_LOG_LEVEL_TRIGGER=ERROR $ucx_inst/bin/ucx_info -d - $MAKEP distclean - echo "ok 1 - build successful " >> build_armclang.tap - module unload arm-compiler/latest - else - echo "==== Not building with armclang compiler ====" - echo "ok 1 - # SKIP because armclang not installed" >> build_armclang.tap - fi + echo 1..1 > build_armclang.tap + armclang_test_file=$(mktemp ./XXXXXX).c + echo "int main() {return 0;}" > ${armclang_test_file} + if module_load arm-compiler/latest && armclang --version && armclang ${armclang_test_file} -o ${armclang_test_file}.out + then + echo "==== Build with armclang compiler ====" + ../contrib/configure-devel --prefix=$ucx_inst CC=armclang CXX=armclang++ + make_clean + $MAKEP + $MAKEP install + UCX_HANDLE_ERRORS=bt,freeze UCX_LOG_LEVEL_TRIGGER=ERROR $ucx_inst/bin/ucx_info -d + make_clean distclean + echo "ok 1 - build successful " >> build_armclang.tap + else + echo "==== Not building with armclang compiler ====" + echo "ok 1 
- # SKIP because armclang not installed" >> build_armclang.tap + fi + + rm -rf ${armclang_test_file} ${armclang_test_file}.out + module_unload arm-compiler/latest } check_inst_headers() { @@ -528,14 +668,139 @@ check_inst_headers() { echo "==== Testing installed headers ====" ../contrib/configure-release --prefix=$PWD/install - $MAKEP clean + make_clean $MAKEP install ../contrib/check_inst_headers.sh $PWD/install/include - $MAKEP distclean + make_clean distclean echo "ok 1 - build successful " >> inst_headers.tap } +check_make_distcheck() { + echo 1..1 > make_distcheck.tap + + # If the gcc version on the host is older than 4.8.5, don't run + # due to a compiler bug that reproduces when building with gtest + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61886 + if (echo "4.8.5"; gcc --version | head -1 | awk '{print $3}') | sort -CV + then + echo "==== Testing make distcheck ====" + make_clean && make_clean distclean + ../contrib/configure-release --prefix=$PWD/install + $MAKEP DISTCHECK_CONFIGURE_FLAGS="--enable-gtest" distcheck + else + echo "Not testing make distcheck: GCC version is too old ($(gcc --version|head -1))" + fi +} + +check_config_h() { + echo 1..1 > check_config_h.tap + + srcdir=$PWD/../src + + # Check if all .c files include config.h + echo "==== Checking for config.h files in directory $srcdir ====" + + missing=`find $srcdir -name \*.c -o -name \*.cc | xargs grep -LP '\#\s*include\s+"config.h"'` + + if [ `echo $missing | wc -w` -eq 0 ] + then + echo "ok 1 - check successful " >> check_config_h.tap + else + echo "Error: missing include config.h in files: $missing" + exit 1 + fi +} + +# +# Expands a CPU list such as "0-3,17" to "0 1 2 3 17" (each cpu in a new line) +# +expand_cpulist() { + cpulist=$1 + tokens=$(echo ${cpulist} | tr ',' ' ') + for token in ${tokens} + do + # if there is no '-', first and last would be equal + first=$(echo ${token} | cut -d'-' -f1) + last=$( echo ${token} | cut -d'-' -f2) + + for ((cpu=${first};cpu<=${last};++cpu)) + 
do + echo ${cpu} + done + done +} + +# +# Get the N'th CPU that the current process can run on +# +slice_affinity() { + n=$1 + + # get affinity mask of the current process + compact_cpulist=$($AFFINITY bash -c 'taskset -cp $$' | cut -d: -f2) + cpulist=$(expand_cpulist ${compact_cpulist}) + + echo "${cpulist}" | head -n $((n + 1)) | tail -1 +} + +# +# `rename` has a binary and Perl flavors. Ubuntu comes with Perl one and +# requires different usage. +# +rename_files() { + expr=$1; shift + replacement=$1; shift + files=$* + if rename --version | grep 'util-linux'; then + rename "${expr}" "${replacement}" $files + return + fi + + rename "s/\\${expr}\$/${replacement}/" "${files}" +} + +run_client_server_app() { + test_name=$1 + test_args=$2 + server_addr_arg=$3 + kill_server=$4 + error_emulation=$5 + + server_port=$((10000 + EXECUTOR_NUMBER)) + server_port_arg="-p $server_port" + + affinity_server=$(slice_affinity 0) + affinity_client=$(slice_affinity 1) + + taskset -c $affinity_server ${test_name} ${test_args} ${server_port_arg} & + server_pid=$! + + sleep 15 + + if [ $error_emulation -eq 1 ] + then + set +Ee + fi + + taskset -c $affinity_client ${test_name} ${test_args} ${server_addr_arg} ${server_port_arg} & + client_pid=$! + + wait ${client_pid} + + if [ $error_emulation -eq 1 ] + then + set -eE + fi + + if [ $kill_server -eq 1 ] + then + kill -9 ${server_pid} + else + wait ${server_pid} + fi +} + run_hello() { api=$1 shift @@ -544,49 +809,31 @@ run_hello() { if [ ! 
-x ${test_name} ] then - gcc -o ${test_name} ${ucx_inst}/share/ucx/examples/${test_name}.c \ - -l${api} -lucs -I${ucx_inst}/include -L${ucx_inst}/lib \ - -Wl,-rpath=${ucx_inst}/lib + $MAKEP -C examples ${test_name} fi # set smaller timeouts so the test will complete faster if [[ ${test_args} == *"-e"* ]] then - export UCX_UD_TIMEOUT=1s + export UCX_UD_TIMEOUT=15s export UCX_RC_TIMEOUT=1ms export UCX_RC_RETRY_COUNT=4 fi - - # hello-world example - tcp_port=$((10000 + EXECUTOR_NUMBER)) - - ./${test_name} ${test_args} -p ${tcp_port} & - hw_server_pid=$! - - sleep 15 - # temporary disable if [[ ${test_args} == *"-e"* ]] then - set +Ee + error_emulation=1 + else + error_emulation=0 fi - # need to be ran in background to reflect application PID in $! - ./${test_name} ${test_args} -n $(hostname) -p ${tcp_port} & - hw_client_pid=$! + run_client_server_app "./examples/${test_name}" "${test_args}" "-n $(hostname)" 0 $error_emulation - # make sure server process is not running if [[ ${test_args} == *"-e"* ]] then unset UCX_UD_TIMEOUT unset UCX_RC_TIMEOUT unset UCX_RC_RETRY_COUNT - wait ${hw_client_pid} - # return default - set -Ee - wait ${hw_server_pid} - else - wait ${hw_client_pid} ${hw_server_pid} fi } @@ -599,10 +846,20 @@ run_ucp_hello() { return # skip if cannot create ucp ep fi + mem_types_list="host " + + if [ "X$have_cuda" == "Xyes" ] + then + mem_types_list+="cuda cuda-managed " + fi + for test_mode in -w -f -b -e do - echo "==== Running UCP hello world with mode ${test_mode} ====" - run_hello ucp ${test_mode} + for mem_type in $mem_types_list + do + echo "==== Running UCP hello world with mode ${test_mode} and \"${mem_type}\" memory type ====" + run_hello ucp ${test_mode} -m ${mem_type} + done done rm -f ./ucp_hello_world } @@ -611,14 +868,29 @@ run_ucp_hello() { # Compile and run UCT hello world example # run_uct_hello() { + mem_types_list="host " + + if [ "X$have_cuda" == "Xyes" ] && [ -f "/sys/kernel/mm/memory_peers/nv_mem/version" ] + then + 
mem_types_list+="cuda-managed " + if [ -f "/sys/kernel/mm/memory_peers/nv_mem/version" ] + then + # test RDMA GPUDirect + mem_types_list+="cuda " + fi + fi + for send_func in -i -b -z do for ucx_dev in $(get_active_ib_devices) do - echo "==== Running UCT hello world server on rc/${ucx_dev} with sending ${send_func} ====" - run_hello uct -d ${ucx_dev} -t "rc" ${send_func} + for mem_type in $mem_types_list + do + echo "==== Running UCT hello world server on rc/${ucx_dev} with sending ${send_func} and \"${mem_type}\" memory type ====" + run_hello uct -d ${ucx_dev} -t "rc_verbs" ${send_func} -m ${mem_type} + done done - for ucx_dev in $(ip addr | awk '/state UP/ {print $2}' | sed s/://) + for ucx_dev in $(get_active_ip_iface) do echo "==== Running UCT hello world server on tcp/${ucx_dev} with sending ${send_func} ====" run_hello uct -d ${ucx_dev} -t "tcp" ${send_func} @@ -628,92 +900,87 @@ run_uct_hello() { } run_client_server() { + test_name=ucp_client_server - test_name=ucp_client_server - - if [ ! -x ${test_name} ] - then - gcc -o ${test_name} ${ucx_inst}/share/ucx/examples/${test_name}.c \ - -lucp -lucs -I${ucx_inst}/include -L${ucx_inst}/lib \ - -Wl,-rpath=${ucx_inst}/lib - fi - - iface=`ibdev2netdev | grep Up | awk '{print $5}' | head -1` - if [ -n "$iface" ] - then - server_ip=`ip addr show ${iface} | awk '/inet /{print $2}' | awk -F '/' '{print $1}'` - fi - - if [ -z "$server_ip" ] - then - # if there is no inet (IPv4) address, bail - return - fi - - ibdev=`ibdev2netdev | grep $iface | awk '{print $1}'` - node_guid=`cat /sys/class/infiniband/$ibdev/node_guid` - if [ $node_guid == "0000:0000:0000:0000" ] - then - return - fi - - server_port=$((10000 + EXECUTOR_NUMBER)) + if [ ! -x ${test_name} ] + then + gcc -o ${test_name} ${ucx_inst}/share/ucx/examples/${test_name}.c \ + -lucp -lucs -I${ucx_inst}/include -L${ucx_inst}/lib \ + -Wl,-rpath=${ucx_inst}/lib + fi - # run server side - ./${test_name} -p ${server_port} & - hw_server_pid=$! 
+ server_ip=$(get_rdma_device_ip_addr) + if [ "$server_ip" == "" ] + then + return + fi - sleep 15 + run_client_server_app "./${test_name}" "" "-a ${server_ip}" 1 0 +} - # need to be ran in background to reflect application PID in $! - ./${test_name} -a ${server_ip} -p ${server_port} & - hw_client_pid=$! +run_ucp_client_server() { + echo "==== Running UCP client-server ====" + run_client_server - wait ${hw_client_pid} - kill -9 ${hw_server_pid} + rm -f ./ucp_client_server } -run_ucp_client_server() { +run_io_demo() { + server_ip=$(get_rdma_device_ip_addr) + if [ "$server_ip" == "" ] + then + return + fi + + echo "==== Running UCP IO demo ====" - if [ ! -r /dev/infiniband/rdma_cm ] - then - return - fi + test_args="$@ -o write,read -d 128:4194304 -i 10000 -w 10" + test_name=io_demo - ret=`which ibdev2netdev` - if [ -z "$ret" ] - then - return - fi + if [ ! -x ${test_name} ] + then + $MAKEP -C test/apps/iodemo ${test_name} + fi - echo "==== Running UCP client-server ====" - run_client_server + export UCX_SOCKADDR_CM_ENABLE=y + run_client_server_app "./test/apps/iodemo/${test_name}" "${test_args}" "${server_ip}" 1 0 - rm -f ./ucp_client_server + unset UCX_SOCKADDR_CM_ENABLE + make_clean } # -# Run UCX performance test with MPI +# Run UCX performance test +# Note: If requested running with MPI, MPI has to be initialized before +# The function accepts 0 (default value) or 1 that means launching w/ or w/o MPI # -run_ucx_perftest_mpi() { +run_ucx_perftest() { + if [ $# -eq 0 ] + then + with_mpi=0 + else + with_mpi=$1 + fi ucx_inst_ptest=$ucx_inst/share/ucx/perftest # hack for perftest, no way to override params used in batch # todo: fix in perftest - sed -s 's,-n [0-9]*,-n 1000,g' $ucx_inst_ptest/msg_pow2 | sort -R > $ucx_inst_ptest/msg_pow2_short + sed -s 's,-n [0-9]*,-n 100,g' $ucx_inst_ptest/msg_pow2 | sort -R > $ucx_inst_ptest/msg_pow2_short cat $ucx_inst_ptest/test_types_uct | sort -R > $ucx_inst_ptest/test_types_short_uct cat $ucx_inst_ptest/test_types_ucp | grep -v 
cuda | sort -R > $ucx_inst_ptest/test_types_short_ucp - UCT_PERFTEST="$ucx_inst/bin/ucx_perftest \ - -b $ucx_inst_ptest/test_types_short_uct \ - -b $ucx_inst_ptest/msg_pow2_short -w 1" + ucx_perftest="$ucx_inst/bin/ucx_perftest" + uct_test_args="-b $ucx_inst_ptest/test_types_short_uct \ + -b $ucx_inst_ptest/msg_pow2_short -w 1" - UCP_PERFTEST="$ucx_inst/bin/ucx_perftest \ - -b $ucx_inst_ptest/test_types_short_ucp \ - -b $ucx_inst_ptest/msg_pow2_short -w 1" + ucp_test_args="-b $ucx_inst_ptest/test_types_short_ucp \ + -b $ucx_inst_ptest/msg_pow2_short -w 1" - # shared memory, IB - devices="posix $(get_active_ib_devices)" + # IP ifaces + ip_ifaces=$(get_active_ip_ifaces) + + # shared memory, IB devices, IP ifaces + devices="memory $(get_active_ib_devices) ${ip_ifaces}" # Run on all devices my_devices=$(get_my_tasks $devices) @@ -722,31 +989,136 @@ run_ucx_perftest_mpi() { if [[ $ucx_dev =~ .*mlx5.* ]]; then opt_transports="-b $ucx_inst_ptest/transports" tls=`awk '{print $3 }' $ucx_inst_ptest/transports | tr '\n' ',' | sed -r 's/,$//; s/mlx5/x/g'` - ucx_env_vars="-x UCX_NET_DEVICES=$ucx_dev -x UCX_TLS=$tls" - elif [[ $ucx_dev =~ posix ]]; then - opt_transports="-x mm" - ucx_env_vars="-x UCX_TLS=mm" + dev=$ucx_dev + elif [[ $ucx_dev =~ memory ]]; then + opt_transports="-x posix" + tls="shm" + dev="all" + elif [[ " ${ip_ifaces[*]} " == *" ${ucx_dev} "* ]]; then + opt_transports="-x tcp" + tls="tcp" + dev=$ucx_dev else - opt_transports="-x rc" - ucx_env_vars="-x UCX_NET_DEVICES=$ucx_dev -x UCX_TLS=rc" + opt_transports="-x rc_verbs" + tls="rc_v" + dev=$ucx_dev fi echo "==== Running ucx_perf kit on $ucx_dev ====" - $MPIRUN -np 2 $AFFINITY $UCT_PERFTEST -d $ucx_dev $opt_transports - $MPIRUN -np 2 $ucx_env_vars $AFFINITY $UCP_PERFTEST + if [ $with_mpi -eq 1 ] + then + # Run UCT performance test + $MPIRUN -np 2 $AFFINITY $ucx_perftest $uct_test_args -d $ucx_dev $opt_transports + + # Run UCP performance test + $MPIRUN -np 2 -x UCX_NET_DEVICES=$dev -x UCX_TLS=$tls $AFFINITY 
$ucx_perftest $ucp_test_args + + # Run UCP performance test with 2 threads + $MPIRUN -np 2 -x UCX_NET_DEVICES=$dev -x UCX_TLS=$tls $AFFINITY $ucx_perftest $ucp_test_args -T 2 + else + export UCX_NET_DEVICES=$dev + export UCX_TLS=$tls + + # Run UCT performance test + run_client_server_app "$ucx_perftest" "$uct_test_args -d ${ucx_dev} ${opt_transports}" \ + "$(hostname)" 0 0 + + # Run UCP performance test + run_client_server_app "$ucx_perftest" "$ucp_test_args" "$(hostname)" 0 0 + + # Run UCP performance test with 2 threads + run_client_server_app "$ucx_perftest" "$ucp_test_args -T 2" "$(hostname)" 0 0 + + unset UCX_NET_DEVICES + unset UCX_TLS + fi done # run cuda tests if cuda module was loaded and GPU is found - if [ "X$have_cuda" == "Xyes" ] && (lsmod | grep -q "nv_peer_mem") && (lsmod | grep -q "gdrdrv") + if [ "X$have_cuda" == "Xyes" ] then - export CUDA_VISIBLE_DEVICES=$(($worker%$num_gpus)),$(($(($worker+1))%$num_gpus)) + tls_list="all " + gdr_options="n " + if (lsmod | grep -q "nv_peer_mem") + then + echo "GPUDirectRDMA module (nv_peer_mem) is present.." + tls_list+="rc,cuda_copy " + gdr_options+="y " + fi + + if [ "X$have_gdrcopy" == "Xyes" ] && (lsmod | grep -q "gdrdrv") + then + echo "GDRCopy module (gdrdrv) is present..." 
+ tls_list+="rc,cuda_copy,gdr_copy " + fi + + if [ $num_gpus -gt 1 ]; then + export CUDA_VISIBLE_DEVICES=$(($worker%$num_gpus)),$(($(($worker+1))%$num_gpus)) + fi + cat $ucx_inst_ptest/test_types_ucp | grep cuda | sort -R > $ucx_inst_ptest/test_types_short_ucp + sed -s 's,-n [0-9]*,-n 10 -w 1,g' $ucx_inst_ptest/msg_pow2 | sort -R > $ucx_inst_ptest/msg_pow2_short + echo "==== Running ucx_perf with cuda memory====" - $MPIRUN -np 2 -x UCX_TLS=rc,cuda_copy,gdr_copy -x UCX_MEMTYPE_CACHE=y $AFFINITY $UCP_PERFTEST - $MPIRUN -np 2 -x UCX_TLS=rc,cuda_copy,gdr_copy -x UCX_MEMTYPE_CACHE=n $AFFINITY $UCP_PERFTEST - $MPIRUN -np 2 -x UCX_TLS=rc,cuda_copy $AFFINITY $UCP_PERFTEST - $MPIRUN -np 2 -x UCX_TLS=self,mm,cma,cuda_copy $AFFINITY $UCP_PERFTEST - $MPIRUN -np 2 $AFFINITY $UCP_PERFTEST + + for tls in $tls_list + do + for memtype_cache in y n + do + for gdr in $gdr_options + do + if [ $with_mpi -eq 1 ] + then + $MPIRUN -np 2 -x UCX_TLS=$tls -x UCX_MEMTYPE_CACHE=$memtype_cache \ + -x UCX_IB_GPU_DIRECT_RDMA=$gdr $AFFINITY $ucx_perftest $ucp_test_args + else + export UCX_TLS=$tls + export UCX_MEMTYPE_CACHE=$memtype_cache + export UCX_IB_GPU_DIRECT_RDMA=$gdr + run_client_server_app "$ucx_perftest" "$ucp_test_args" "$(hostname)" 0 0 + unset UCX_TLS + unset UCX_MEMTYPE_CACHE + unset UCX_IB_GPU_DIRECT_RDMA + fi + done + done + done + + if [ $with_mpi -eq 1 ] + then + $MPIRUN -np 2 -x UCX_TLS=self,shm,cma,cuda_copy $AFFINITY $ucx_perftest $ucp_test_args + $MPIRUN -np 2 -x UCX_TLS=self,sm,cuda_ipc,cuda_copy $AFFINITY $ucx_perftest $ucp_test_args + $MPIRUN -np 2 $AFFINITY $ucx_perftest $ucp_test_args + else + export UCX_TLS=self,shm,cma,cuda_copy + run_client_server_app "$ucx_perftest" "$ucp_test_args" "$(hostname)" 0 0 + unset UCX_TLS + + run_client_server_app "$ucx_perftest" "$ucp_test_args" "$(hostname)" 0 0 + fi + + # Specifically test cuda_ipc for large message sizes + cat $ucx_inst_ptest/test_types_ucp | grep -v cuda | sort -R > $ucx_inst_ptest/test_types_cuda_ucp + 
ucp_test_args_large="-b $ucx_inst_ptest/test_types_cuda_ucp \ + -b $ucx_inst_ptest/msg_pow2_large -w 1" + if [ $with_mpi -eq 1 ] + then + for ipc_cache in y n + do + $MPIRUN -np 2 -x UCX_TLS=self,sm,cuda_copy,cuda_ipc \ + -x UCX_CUDA_IPC_CACHE=$ipc_cache $AFFINITY $ucx_perftest $ucp_test_args_large + done + else + for ipc_cache in y n + do + export UCX_TLS=self,sm,cuda_copy,cuda_ipc + export UCX_CUDA_IPC_CACHE=$ipc_cache + run_client_server_app "$ucx_perftest" "$ucp_test_args_large" "$(hostname)" 0 0 + unset UCX_TLS + unset UCX_CUDA_IPC_CACHE + done + fi + unset CUDA_VISIBLE_DEVICES fi } @@ -779,25 +1151,27 @@ run_mpi_tests() { export LD_LIBRARY_PATH=${ucx_inst}/lib:$LD_LIBRARY_PATH ../contrib/configure-release --prefix=$ucx_inst --with-mpi # TODO check in -devel mode as well - $MAKEP clean + make_clean $MAKEP install $MAKEP installcheck # check whether installation is valid (it compiles examples at least) MPIRUN="mpirun \ + --bind-to none \ -x UCX_ERROR_SIGNALS \ -x UCX_HANDLE_ERRORS \ -mca pml ob1 \ -mca btl tcp,self \ -mca btl_tcp_if_include lo \ + -mca orte_allowed_exit_without_sync 1 \ -mca coll ^hcoll,ml" - run_ucx_perftest_mpi + run_ucx_perftest 1 echo "ok 1 - ucx perftest" >> mpi_tests.tap test_malloc_hooks_mpi echo "ok 2 - malloc hooks" >> mpi_tests.tap - $MAKEP distclean + make_clean distclean module unload hpcx-gcc else @@ -807,6 +1181,12 @@ run_mpi_tests() { fi } +build_ucx_profiling() { + # compile the profiling example code + gcc -o ucx_profiling ../test/apps/profiling/ucx_profiling.c \ + -lm -lucs -I${ucx_inst}/include -L${ucx_inst}/lib -Wl,-rpath=${ucx_inst}/lib +} + # # Test profiling infrastructure # @@ -815,12 +1195,11 @@ test_profiling() { # configure release mode, application profiling should work ../contrib/configure-release --prefix=$ucx_inst - $MAKEP clean + make_clean $MAKEP + $MAKEP install - # compile the profiling example code - gcc -o ucx_profiling ${ucx_inst}/share/ucx/examples/ucx_profiling.c \ - -lm -lucs -I${ucx_inst}/include 
-L${ucx_inst}/lib -Wl,-rpath=${ucx_inst}/lib + build_ucx_profiling UCX_PROFILE_MODE=log UCX_PROFILE_FILE=ucx_jenkins.prof ./ucx_profiling @@ -832,8 +1211,11 @@ test_profiling() { test_ucs_load() { ../contrib/configure-release --prefix=$ucx_inst - $MAKEP clean + make_clean $MAKEP + $MAKEP install + + build_ucx_profiling # Make sure UCS library constructor does not call socket() echo "==== Running UCS library loading test ====" @@ -855,8 +1237,9 @@ test_ucs_dlopen() { test_ucp_dlopen() { ../contrib/configure-release --prefix=$ucx_inst - $MAKEP clean + make_clean $MAKEP + $MAKEP install # Make sure UCP library, when opened with dlopen(), loads CMA module LIB_CMA=`find ${ucx_inst} -name libuct_cma.so.0` @@ -864,7 +1247,7 @@ test_ucp_dlopen() { then echo "==== Running UCP library loading test ====" ./test/apps/test_ucp_dlopen # just to save output to log - ./test/apps/test_ucp_dlopen | grep 'cma/cma' + ./test/apps/test_ucp_dlopen | grep 'cma/memory' else echo "==== Not running UCP library loading test ====" fi @@ -872,7 +1255,7 @@ test_ucp_dlopen() { test_memtrack() { ../contrib/configure-devel --prefix=$ucx_inst - $MAKEP clean + make_clean $MAKEP echo "==== Running memtrack test ====" @@ -933,11 +1316,53 @@ test_malloc_hook() { test_jucx() { echo "==== Running jucx test ====" echo "1..2" > jucx_tests.tap - if module_load dev/jdk && module_load dev/mvn + iface=`ibdev2netdev | grep Up | awk '{print $5}' | head -1` + if [ -z "$iface" ] + then + echo "Failed to find active ib devices." 
>> jucx_tests.tap + return + elif module_load dev/jdk && module_load dev/mvn then - export UCX_ERROR_SIGNALS="" - JUCX_INST=$ucx_inst $MAKE -C bindings/java/src/main/native test - unset UCX_ERROR_SIGNALS + jucx_port=$((20000 + EXECUTOR_NUMBER)) + export JUCX_TEST_PORT=$jucx_port + export UCX_MEM_EVENTS=no + $MAKE -C bindings/java/src/main/native test + ifaces=`ibdev2netdev | grep Up | awk '{print $5}'` + if [ -n "$ifaces" ] + then + $MAKE -C bindings/java/src/main/native package + fi + for iface in $ifaces + do + if [ -n "$iface" ] + then + server_ip=$(get_ifaddr ${iface}) + fi + + if [ -z "$server_ip" ] + then + echo "Interface $iface has no IPv4" + continue + fi + echo "Running standalone benchamrk on $iface" + + java -XX:ErrorFile=$WORKSPACE/hs_err_${BUILD_NUMBER}_%p.log \ + -XX:OnError="cat $WORKSPACE/hs_err_${BUILD_NUMBER}_%p.log" \ + -cp "bindings/java/resources/:bindings/java/src/main/native/build-java/*" \ + org.openucx.jucx.examples.UcxReadBWBenchmarkReceiver \ + s=$server_ip p=$JUCX_TEST_PORT & + java_pid=$! 
+ sleep 10 + java -XX:ErrorFile=$WORKSPACE/hs_err_${BUILD_NUMBER}_%p.log \ + -XX:OnError="cat $WORKSPACE/hs_err_${BUILD_NUMBER}_%p.log" \ + -cp "bindings/java/resources/:bindings/java/src/main/native/build-java/*" \ + org.openucx.jucx.examples.UcxReadBWBenchmarkSender \ + s=$server_ip p=$JUCX_TEST_PORT t=10000000 + wait $java_pid + done + + unset JUCX_TEST_PORT + unset UCX_MEM_EVENTS module unload dev/jdk module unload dev/mvn echo "ok 1 - jucx test" >> jucx_tests.tap @@ -948,18 +1373,22 @@ test_jucx() { # # Run Coverity and report errors +# The argument is a UCX build type: devel or release # run_coverity() { echo 1..1 > coverity.tap if module_load tools/cov then + ucx_build_type=$1 + echo "==== Running coverity ====" - $MAKEP clean - cov_build_id="cov_build_${BUILD_NUMBER}" + ../contrib/configure-$ucx_build_type --prefix=$ucx_inst + make_clean + cov_build_id="cov_build_${ucx_build_type}_${BUILD_NUMBER}" cov_build="$WORKSPACE/$cov_build_id" rm -rf $cov_build - cov-build --dir $cov_build $MAKEP all - cov-analyze $COV_OPT --dir $cov_build + cov-build --dir $cov_build $MAKEP all + cov-analyze --jobs $parallel_jobs $COV_OPT --security --concurrency --dir $cov_build nerrors=$(cov-format-errors --dir $cov_build | awk '/Processing [0-9]+ errors?/ { print $2 }') rc=$(($rc+$nerrors)) @@ -974,7 +1403,8 @@ run_coverity() { cov-format-errors --dir $cov_build --emacs-style echo "not ok 1 Coverity Detected $nerrors failures # $cov_url" >> coverity.tap else - echo "ok 1 Coverity found no issues" >> coverity.tap + echo "ok 1 Coverity found no issues" >> coverity.tap + rm -rf $cov_build fi echo Coverity report: $cov_url @@ -1035,7 +1465,7 @@ run_gtest() { compiler_name=$1 shift ../contrib/configure-devel --prefix=$ucx_inst $@ - $MAKEP clean + make_clean $MAKEP echo "==== Running watchdog timeout test, $compiler_name compiler ====" @@ -1047,6 +1477,11 @@ run_gtest() { export GTEST_SHUFFLE=1 export GTEST_TAP=2 export GTEST_REPORT_DIR=$WORKSPACE/reports/tap + # Run UCT tests for TCP 
over fastest device only + export GTEST_UCT_TCP_FASTEST_DEV=1 + # Report TOP-20 longest test at the end of testing + export GTEST_REPORT_LONGEST_TESTS=20 + export OMP_NUM_THREADS=4 if [ $num_gpus -gt 0 ]; then export CUDA_VISIBLE_DEVICES=$(($worker%$num_gpus)) @@ -1064,7 +1499,7 @@ run_gtest() { echo "==== Running unit tests, $compiler_name compiler ====" $AFFINITY $TIMEOUT make -C test/gtest test - (cd test/gtest && rename .tap _gtest.tap *.tap && mv *.tap $GTEST_REPORT_DIR) + (cd test/gtest && rename_files .tap _gtest.tap *.tap && mv *.tap $GTEST_REPORT_DIR) echo "==== Running malloc hooks mallopt() test, $compiler_name compiler ====" # gtest returns with non zero exit code if there were no @@ -1078,7 +1513,7 @@ run_gtest() { GTEST_TOTAL_SHARDS=1 \ GTEST_FILTER=malloc_hook_cplusplus.mallopt \ make -C test/gtest test - (cd test/gtest && rename .tap _mallopt_gtest.tap malloc_hook_cplusplus.tap && mv *.tap $GTEST_REPORT_DIR) + (cd test/gtest && rename_files .tap _mallopt_gtest.tap malloc_hook_cplusplus.tap && mv *.tap $GTEST_REPORT_DIR) echo "==== Running malloc hooks mmap_ptrs test with MMAP_THRESHOLD=16384, $compiler_name compiler ====" $AFFINITY $TIMEOUT \ @@ -1087,9 +1522,10 @@ run_gtest() { GTEST_TOTAL_SHARDS=1 \ GTEST_FILTER=malloc_hook_cplusplus.mmap_ptrs \ make -C test/gtest test - (cd test/gtest && rename .tap _mmap_ptrs_gtest.tap malloc_hook_cplusplus.tap && mv *.tap $GTEST_REPORT_DIR) + (cd test/gtest && rename_files .tap _mmap_ptrs_gtest.tap malloc_hook_cplusplus.tap && mv *.tap $GTEST_REPORT_DIR) - if ! [[ $(uname -m) =~ "aarch" ]] && ! [[ $(uname -m) =~ "ppc" ]] + if ! [[ $(uname -m) =~ "aarch" ]] && ! [[ $(uname -m) =~ "ppc" ]] && \ + ! 
[[ -n "${JENKINS_NO_VALGRIND}" ]] then echo "==== Running valgrind tests, $compiler_name compiler ====" @@ -1099,9 +1535,8 @@ run_gtest() { module load tools/valgrind-latest fi - export VALGRIND_EXTRA_ARGS="--xml=yes --xml-file=valgrind.xml --child-silent-after-fork=yes --gen-suppressions=all" $AFFINITY $TIMEOUT_VALGRIND make -C test/gtest test_valgrind - (cd test/gtest && rename .tap _vg.tap *.tap && mv *.tap $GTEST_REPORT_DIR) + (cd test/gtest && rename_files .tap _vg.tap *.tap && mv *.tap $GTEST_REPORT_DIR) module unload tools/valgrind-latest else echo "==== Not running valgrind tests with $compiler_name compiler ====" @@ -1109,6 +1544,8 @@ run_gtest() { echo "ok 1 - # SKIP because running on $(uname -m)" >> vg_skipped.tap fi + unset OMP_NUM_THREADS + unset GTEST_UCT_TCP_FASTEST_DEV unset GTEST_SHARD_INDEX unset GTEST_TOTAL_SHARDS unset GTEST_RANDOM_SEED @@ -1116,7 +1553,6 @@ run_gtest() { unset GTEST_TAP unset GTEST_REPORT_DIR unset GTEST_EXTRA_ARGS - unset VALGRIND_EXTRA_ARGS unset CUDA_VISIBLE_DEVICES } @@ -1145,7 +1581,7 @@ run_gtest_release() { echo "1..1" > gtest_release.tap ../contrib/configure-release --prefix=$ucx_inst --enable-gtest - $MAKEP clean + make_clean $MAKEP export GTEST_SHARD_INDEX=0 @@ -1154,11 +1590,18 @@ run_gtest_release() { export GTEST_SHUFFLE=1 export GTEST_TAP=2 export GTEST_REPORT_DIR=$WORKSPACE/reports/tap + export OMP_NUM_THREADS=4 echo "==== Running unit tests (release configuration) ====" - env GTEST_FILTER=\*test_obj_size\* $AFFINITY $TIMEOUT make -C test/gtest test + # Check: + # - Important object sizes + # - Unexpected RNDV test, to cover rkey handling in tag offload flow + # (see GH #3827 for details) + env GTEST_FILTER=\*test_obj_size\*:\*test_ucp_tag_match.rndv_rts_unexp\* \ + $AFFINITY $TIMEOUT make -C test/gtest test echo "ok 1" >> gtest_release.tap + unset OMP_NUM_THREADS unset GTEST_SHARD_INDEX unset GTEST_TOTAL_SHARDS unset GTEST_RANDOM_SEED @@ -1171,7 +1614,7 @@ run_ucx_tl_check() { echo "1..1" > ucx_tl_check.tap - 
../test/apps/test_ucx_tls.py $ucx_inst + ../test/apps/test_ucx_tls.py -p $ucx_inst if [ $? -ne 0 ]; then echo "not ok 1" >> ucx_tl_check.tap @@ -1189,10 +1632,17 @@ run_tests() { export UCX_ERROR_MAIL_TO=$ghprbActualCommitAuthorEmail export UCX_ERROR_MAIL_FOOTER=$JOB_URL/$BUILD_NUMBER/console + # test cuda build if cuda modules available + do_distributed_task 2 4 build_cuda + + # load cuda env only if GPU available for remaining tests + try_load_cuda_env + do_distributed_task 0 4 build_icc + do_distributed_task 0 4 build_pgi do_distributed_task 1 4 build_debug + do_distributed_task 1 4 build_prof do_distributed_task 1 4 build_ugni - do_distributed_task 2 4 build_cuda do_distributed_task 3 4 build_clang do_distributed_task 0 4 build_armclang do_distributed_task 1 4 build_gcc_latest @@ -1216,22 +1666,25 @@ run_tests() { do_distributed_task 1 4 run_ucp_hello do_distributed_task 2 4 run_uct_hello do_distributed_task 1 4 run_ucp_client_server + do_distributed_task 2 4 run_ucx_perftest + do_distributed_task 1 4 run_io_demo do_distributed_task 3 4 test_profiling - do_distributed_task 0 4 test_ucp_dlopen + do_distributed_task 0 3 test_jucx do_distributed_task 1 4 test_ucs_dlopen do_distributed_task 3 4 test_ucs_load do_distributed_task 3 4 test_memtrack do_distributed_task 0 4 test_unused_env_var do_distributed_task 2 4 test_env_var_aliases do_distributed_task 1 3 test_malloc_hook - do_distributed_task 0 3 test_jucx + do_distributed_task 0 4 test_ucp_dlopen # all are running gtest run_gtest_default run_gtest_armclang - do_distributed_task 3 4 run_coverity - do_distributed_task 0 4 run_gtest_release + do_distributed_task 3 4 run_coverity release + do_distributed_task 0 4 run_coverity devel + do_distributed_task 1 4 run_gtest_release } prepare @@ -1242,8 +1695,9 @@ do_distributed_task 0 4 build_disable_numa do_distributed_task 1 4 build_no_verbs do_distributed_task 2 4 build_release_pkg do_distributed_task 3 4 check_inst_headers - -if [ -n "$JENKINS_RUN_TESTS" ] 
+do_distributed_task 1 4 check_make_distcheck +do_distributed_task 2 4 check_config_h +if [ -n "$JENKINS_RUN_TESTS" ] || [ -n "$RUN_TESTS" ] then run_tests fi diff --git a/contrib/ucx_perftest_config/msg_pow2_large b/contrib/ucx_perftest_config/msg_pow2_large new file mode 100644 index 00000000000..114dba8b515 --- /dev/null +++ b/contrib/ucx_perftest_config/msg_pow2_large @@ -0,0 +1,10 @@ +4194304 -s 4194304 -n 100 +8388608 -s 8388608 -n 100 +16777216 -s 16777216 -n 100 +33554432 -s 33554432 -n 100 +67108864 -s 67108864 -n 10 +134217728 -s 134217728 -n 10 +268435456 -s 268435456 -n 10 +536870912 -s 536870912 -n 10 +1073741824 -s 1073741824 -n 10 +2147483648 -s 2147483648 -n 10 diff --git a/contrib/ucx_perftest_config/test_types_ucp b/contrib/ucx_perftest_config/test_types_ucp index bdf090fa1be..2a9ecfa44d6 100644 --- a/contrib/ucx_perftest_config/test_types_ucp +++ b/contrib/ucx_perftest_config/test_types_ucp @@ -17,8 +17,12 @@ ucp_contig_stream_lat -t stream_lat -r recv_data ucp_contig_stream_bw -t stream_bw -r recv ucp_contig_stream_lat -t stream_lat -r recv #CUDA -ucp_contig_contig_cuda_tag_lat -t tag_lat -D contig,contig -m cuda -ucp_contig_contig_cuda_tag_bw -t tag_bw -D contig,contig -m cuda +ucp_contig_contig_cuda_tag_lat -t tag_lat -D contig,contig -m cuda,cuda +ucp_contig_contig_cuda_tag_lat -t tag_lat -D contig,contig -m cuda,host +ucp_contig_contig_cuda_tag_lat -t tag_lat -D contig,contig -m host,cuda +ucp_contig_contig_cuda_tag_bw -t tag_bw -D contig,contig -m cuda,cuda +ucp_contig_contig_cuda_tag_bw -t tag_bw -D contig,contig -m cuda,host +ucp_contig_contig_cuda_tag_bw -t tag_bw -D contig,contig -m host,cuda ucp_contig_cuda_stream_bw -t stream_bw -r recv_data -m cuda ucp_contig_cuda_stream_lat -t stream_lat -r recv_data -m cuda ucp_contig_cuda_stream_bw -t stream_bw -r recv -m cuda diff --git a/contrib/ucx_perftest_config/transports b/contrib/ucx_perftest_config/transports index 6108f005285..5fbbd7da72e 100644 --- 
a/contrib/ucx_perftest_config/transports +++ b/contrib/ucx_perftest_config/transports @@ -1,2 +1,2 @@ -regular_verbs -x rc +regular_verbs -x rc_verbs accel_verbs -x rc_mlx5 diff --git a/contrib/upload_docs.sh b/contrib/upload_docs.sh index 60a7d802f4d..557a46f4931 100755 --- a/contrib/upload_docs.sh +++ b/contrib/upload_docs.sh @@ -16,6 +16,6 @@ git fetch --all git checkout -t origin/master -f git pull git submodule update --init --recursive --remote -cp -f ../doc/doxygen-doc/ucx.pdf ./ +cp -f ../docs/doxygen-doc/ucx.pdf ./ git commit ucx.pdf -m "update ucx.pdf for $rev" git push diff --git a/contrib/valgrind.supp b/contrib/valgrind.supp index 45f0ed05595..5c0d0e0dce4 100644 --- a/contrib/valgrind.supp +++ b/contrib/valgrind.supp @@ -54,6 +54,13 @@ ... obj:*libcuda*.so* } +{ + fun:gdr_open + Memcheck:Param + ioctl(generic) + fun:ioctl + fun:gdr_open +} { gdr_get_info_ioctl Memcheck:Param @@ -149,3 +156,124 @@ ... fun:ib_cm_open_device } +{ + + Memcheck:Leak + match-leak-kinds: possible + fun:calloc + obj:*/libhfi1verbs-rdmav22.so* + obj:*/libibverbs.so* + ... + fun:ibv_get_device_list +} +{ + rdma_core_ibv_get_device_list + Memcheck:Leak + match-leak-kinds: possible + fun:calloc + ... + obj:*/libibverbs.so* + ... + fun:ibv_get_device_list +} +{ + verbs_open_device + Memcheck:Leak + match-leak-kinds: possible + ... 
+ fun:verbs_open_device +} +{ + uct_ib_iface_prepare_rx_wrs_val + Memcheck:Value8 + fun:uct_ib_iface_prepare_rx_wrs +} +{ + uct_ib_iface_prepare_rx_wrs_cond + Memcheck:Cond + fun:uct_ib_iface_prepare_rx_wrs +} +{ + uct_ib_iface_recv_desc_hdr + Memcheck:Value8 + fun:uct_ib_iface_recv_desc_hdr +} +{ + mlx4_ibv_post_srq_recv_val + Memcheck:Value8 + obj:*/libmlx4.so* + fun:ibv_post_srq_recv +} +{ + mlx4_ibv_post_srq_recv_cond + Memcheck:Cond + obj:*/libmlx4.so* + fun:ibv_post_srq_recv +} +{ + uct_rc_verbs_iface_common_prepost_recvs + Memcheck:Cond + fun:uct_rc_verbs_iface_common_prepost_recvs +} +{ + uct_rc_verbs_iface_post_recv_always_val + Memcheck:Value8 + fun:uct_rc_verbs_iface_post_recv_always +} +{ + uct_rc_verbs_iface_post_recv_always_cond + Memcheck:Cond + fun:uct_rc_verbs_iface_post_recv_always +} +{ + uct_rc_verbs_iface_post_recv_common_val + Memcheck:Value8 + ... + fun:uct_rc_verbs_iface_post_recv_common +} +{ + uct_rc_verbs_iface_post_recv_common_cond + Memcheck:Cond + fun:uct_rc_verbs_iface_post_recv_common +} +{ + gdrcopy_ioctl + Memcheck:Param + ioctl(generic) + fun:ioctl + fun:gdr_pin_buffer +} +{ + gdr_map + Memcheck:Cond + fun:gdr_map +} +{ + gdr_copy_to_mapping_cond + Memcheck:Cond + ... + fun:gdr_copy_to_mapping +} +{ + fun:gdr_copy_from_mapping_cond + Memcheck:Cond + ... + fun:gdr_copy_from_mapping +} +{ + gdr_copy_to_mapping_value8 + Memcheck:Value8 + ... + fun:gdr_copy_to_mapping +} +{ + gdr_copy_from_mapping_value8 + Memcheck:Value8 + ... 
+ fun:gdr_copy_from_mapping +} +{ + gdr_unmap + Memcheck:Cond + fun:gdr_unmap +} diff --git a/debian/rules.in b/debian/rules.in index 03f867090b3..a2e812d695b 100755 --- a/debian/rules.in +++ b/debian/rules.in @@ -13,7 +13,7 @@ dh $@ override_dh_auto_configure: - @top_top_srcdir@/contrib/configure-release --prefix=/usr + @top_top_srcdir@/contrib/configure-release --prefix=/usr --enable-examples chmod +x debian/rules override_dh_shlibdeps: diff --git a/doc/CodeStyle.md b/doc/CodeStyle.md deleted file mode 100644 index e1cacd2ede3..00000000000 --- a/doc/CodeStyle.md +++ /dev/null @@ -1,65 +0,0 @@ -# The UCX code style - -* ## Style - * 4 spaces, no tabs - * up to 80 columns - * single space around operators - * no spaces in the end-of-line - * indent function arguments on column - * indent structure fields on column - * scope: open on same line, except function body, which is on a new line. - * indent multiple consecutive assignments on the column - * 2 space lines between types and prototypes (header files) - * 1 space line between functions (source files) - - -* ## Naming convention: - * lower case, underscores - * names must begin with ucp_/uct_/ucs_/ucm_ - * macro names must begin with UCP_/UCT_/UCS_/UCM_ - * an output argument which is a pointer to a user variable has _p suffix - * value types (e.g struct types, integer types) have _t suffix - * pointer to structs, which are used as API handles, have _h suffix - * macro arguments begin with _ (e.g _value) to avoid confusion with variables - * no leading underscores in function names - * ### Header file name suffixes: - * _fwd.h for a files with a types/function forward declarations - * _types.h if contains a type declarations - * .inl for inline functions - * _def.h with a preprocessor macros - - -* ## C++ - * used only for unit testing - * lower-case class names (same as stl/boost) - - -* ## Include order: - 1. config.h - 2. specific internal header - 3. UCX headers - 4. 
system headers - - -* ## Doxygen - * all interface H/C files should have doxygen documentation. - - -* ## Error handling - * all internal error codes must be ucs_status_t - * a function which returns error should print a log message - * the function which prints the log message is the first one which decides which - error it is. If a functions returns an error because it's callee returned - erroneous ucs_status_t, it does not have to print a log message. - * destructors are not able to propagate error code to the caller because they - return void. also, users are not ready to handle errors during cleanup flow. - therefore a destructor should handle an error by printing a warning or an - error message. - - -* ## Testing - * every major feature or bugfix must be accompanied with a unit test. In case - of a fix, the test should fail without the fix. - - -* ## Logging diff --git a/docs/CodeStyle.md b/docs/CodeStyle.md new file mode 100644 index 00000000000..cc08bc1723b --- /dev/null +++ b/docs/CodeStyle.md @@ -0,0 +1,201 @@ +# The UCX code style + +## Style + * 4 spaces, no tabs + * up to 80 columns + * single space around operators + * no spaces in the end-of-line + * indent function arguments on column + * indent structure fields on column + * scope: open on same line, except function body, which is on a new line. 
+ * indent multiple consecutive assignments on the column + * 2 space lines between types and prototypes (header files) + * 1 space line between functions (source files) + + +## Naming convention: + * lower case, underscores + * names must begin with ucp_/uct_/ucs_/ucm_ + * macro names must begin with UCP_/UCT_/UCS_/UCM_ + * an output argument which is a pointer to a user variable has _p suffix + * value types (e.g struct types, integer types) have _t suffix + * pointer to structs, which are used as API handles, have _h suffix + * macro arguments begin with _ (e.g _value) to avoid confusion with variables + * no leading underscores in function names + * ### Header file name suffixes: + * _fwd.h for a files with a types/function forward declarations + * _types.h if contains a type declarations + * .inl for inline functions + * _def.h with a preprocessor macros + + +## C++ + * used only for unit testing + * lower-case class names (same as stl/boost) + + +## Include order: + 1. config.h + 2. specific internal header + 3. UCX headers + 4. system headers + + +## Doxygen + * all interface H/C files should have doxygen documentation. + + +## Error handling + * all internal error codes must be ucs_status_t + * a function which returns error should print a log message + * the function which prints the log message is the first one which decides which + error it is. If a functions returns an error because it's callee returned + erroneous ucs_status_t, it does not have to print a log message. + * destructors are not able to propagate error code to the caller because they + return void. also, users are not ready to handle errors during cleanup flow. + therefore a destructor should handle an error by printing a warning or an + error message. + + +## Testing + * every major feature or bugfix must be accompanied with a unit test. In case + of a fix, the test should fail without the fix. 
+ + +## Examples + +### if style + +Good +```C + if (val != XXX) { + /* snip */ + } else if (val == YYY) { + /* code here */ + } else { + /* code here */ + } +``` + +Bad +```C + if(val != XXX) { /* Require space after if */ + if (val != XXX){ /* Require space after ) */ + if ( val != XXX) { /* Remove space after ( */ +``` + +### goto style + +Good +```C +err_free: + ucs_free(thread); +err: + --ucs_async_thread_global_context.use_count; +out_unlock: + ucs_assert_always(ucs_async_thread_global_context.thread != NULL); + *thread_p = ucs_async_thread_global_context.thread; +``` + +Bad +```C +err_free: + ucs_free(thread); +/* !!!Remove this line!!! */ +err: + --ucs_async_thread_global_context.use_count; +``` + +### structure assignment + +Good + +```C + event.events = events; + event.data.fd = event_fd; + event.data.ptr = udata; + +``` + +Bad +```C + /* Align = position */ + event.events = events; + event.data.fd = event_fd; + event.data.ptr = udata; +``` + +### comment in C file + +Good +```C +/* run-time CPU detection */ +``` + +Bad: require C style `/* .. */` comment. 
+ +```C +// run-time CPU detection +``` + +### no spaces in the end-of-line + +Good +```C + int fd; +``` + +Bad +``` + int fd; + /* ^^ Remove trailing space */ +``` + +### macro definition + +Good +```C + #define UCS_MACRO_SHORT(_obj, _field, _val) \ + (_obj)->_field = (_val) + + #define UCS_MACRO_LONG(_obj, _field1, _field2, _val1, _val2) \ + { \ + typeof((_obj)->_field1) sum = (_val1) + (_val2); \ + \ + (_obj)->_field1 = sum; \ + (_obj)->_field2 = sum; \ + } + + #define UCS_MACRO_LONG_RET_VAL(_obj, _field, _val, _func) \ + ({ \ + ucs_status_t status; \ + \ + (_obj)->_field = (_val); \ + \ + status = _func(_obj); \ + status; \ + }) +``` + +Bad +```C + #define UCS_MACRO_SHORT(_obj, _field, _val) \ + _obj->_field = _val /* need to wrap macro arguments by () */ + + #define UCS_MACRO_LONG(_obj, _field1, _field2, _val1, _val2) \ + /* possible mixing declarations and code */ \ + typeof((_obj)->_field1) sum = (_val1) + (_val2); \ + \ + (_obj)->_field1 = sum; \ + (_obj)->_field2 = sum; + + #define UCS_MACRO_LONG_RET_VAL(_obj, _field, _val, _func) \ + ({ \ + ucs_status_t status; \ + \ + (_obj)->_field = (_val); \ + \ + status = _func(_obj); \ + status; \ + }) /* wrong alignment of "\" */ +``` diff --git a/doc/LoggingStyle.md b/docs/LoggingStyle.md similarity index 100% rename from doc/LoggingStyle.md rename to docs/LoggingStyle.md diff --git a/doc/OptimizationStyle.md b/docs/OptimizationStyle.md similarity index 100% rename from doc/OptimizationStyle.md rename to docs/OptimizationStyle.md diff --git a/doc/doxygen/Architecture.graffle b/docs/doxygen/Architecture.graffle similarity index 100% rename from doc/doxygen/Architecture.graffle rename to docs/doxygen/Architecture.graffle diff --git a/doc/doxygen/Architecture.pdf b/docs/doxygen/Architecture.pdf similarity index 100% rename from doc/doxygen/Architecture.pdf rename to docs/doxygen/Architecture.pdf diff --git a/doc/doxygen/Architecture.png b/docs/doxygen/Architecture.png similarity index 100% rename from 
doc/doxygen/Architecture.png rename to docs/doxygen/Architecture.png diff --git a/doc/doxygen/UCX_Logo_80x80.png b/docs/doxygen/UCX_Logo_80x80.png similarity index 100% rename from doc/doxygen/UCX_Logo_80x80.png rename to docs/doxygen/UCX_Logo_80x80.png diff --git a/doc/doxygen/UCX_Logo_930x933.png b/docs/doxygen/UCX_Logo_930x933.png similarity index 100% rename from doc/doxygen/UCX_Logo_930x933.png rename to docs/doxygen/UCX_Logo_930x933.png diff --git a/doc/doxygen/api.svg b/docs/doxygen/api.svg similarity index 100% rename from doc/doxygen/api.svg rename to docs/doxygen/api.svg diff --git a/doc/doxygen/conventions.md b/docs/doxygen/conventions.md similarity index 84% rename from doc/doxygen/conventions.md rename to docs/doxygen/conventions.md index 55324b51a46..057705e8af3 100644 --- a/doc/doxygen/conventions.md +++ b/docs/doxygen/conventions.md @@ -4,7 +4,7 @@ Conventions and Notations This section describes the conventions and notations in the UCX specification. \section Blocking Blocking Behavior -The blocking UCX routines return only when an UCX operation is complete. +The blocking UCX routines return only when a UCX operation is complete. After the return, the resources used in the UCX routine are available for reuse. @@ -17,6 +17,6 @@ necessarily available for reuse. UCX routines do not guarantee fairness. However, the routines enable UCX consumers to write efficient and fair programs. -\section Interaction with Signal Handler Functions +\section Interaction Interaction with Signal Handler Functions If UCX routines are invoked from a signal handler function, the behavior of the program is undefined. 
diff --git a/doc/doxygen/design.md b/docs/doxygen/design.md similarity index 100% rename from doc/doxygen/design.md rename to docs/doxygen/design.md diff --git a/doc/doxygen/doxygen.am b/docs/doxygen/doxygen.am similarity index 100% rename from doc/doxygen/doxygen.am rename to docs/doxygen/doxygen.am diff --git a/doc/doxygen/header.tex.in b/docs/doxygen/header.tex.in similarity index 100% rename from doc/doxygen/header.tex.in rename to docs/doxygen/header.tex.in diff --git a/doc/doxygen/intro.md b/docs/doxygen/intro.md similarity index 100% rename from doc/doxygen/intro.md rename to docs/doxygen/intro.md diff --git a/doc/doxygen/preface.md b/docs/doxygen/preface.md similarity index 100% rename from doc/doxygen/preface.md rename to docs/doxygen/preface.md diff --git a/doc/doxygen/release.svg b/docs/doxygen/release.svg similarity index 100% rename from doc/doxygen/release.svg rename to docs/doxygen/release.svg diff --git a/doc/doxygen/twitter.svg b/docs/doxygen/twitter.svg similarity index 100% rename from doc/doxygen/twitter.svg rename to docs/doxygen/twitter.svg diff --git a/doc/doxygen/ucx.bib b/docs/doxygen/ucx.bib similarity index 100% rename from doc/doxygen/ucx.bib rename to docs/doxygen/ucx.bib diff --git a/doc/doxygen/ucxdox b/docs/doxygen/ucxdox similarity index 99% rename from doc/doxygen/ucxdox rename to docs/doxygen/ucxdox index 07a1822b349..4e2aef2f493 100644 --- a/doc/doxygen/ucxdox +++ b/docs/doxygen/ucxdox @@ -55,7 +55,7 @@ PROJECT_BRIEF = "Unified Communication X" # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. -PROJECT_LOGO = $(SRCDIR)/doc/doxygen/UCX_Logo_80x80.png +PROJECT_LOGO = $(SRCDIR)/docs/doxygen/UCX_Logo_80x80.png # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. 
If a relative path is @@ -681,7 +681,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = $(SRCDIR)/doc/doxygen/ucxlayout.xml +LAYOUT_FILE = $(SRCDIR)/docs/doxygen/ucxlayout.xml # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -691,7 +691,7 @@ LAYOUT_FILE = $(SRCDIR)/doc/doxygen/ucxlayout.xml # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. -CITE_BIB_FILES = $(SRCDIR)/doc/doxygen/ucx.bib +CITE_BIB_FILES = $(SRCDIR)/docs/doxygen/ucx.bib #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages @@ -770,10 +770,10 @@ INPUT = $(SRCDIR)/src/ucp/api/ \ $(SRCDIR)/src/ucs/time/ \ $(SRCDIR)/src/ucs/type/ \ $(SRCDIR)/src/ucs/sys/ \ - $(SRCDIR)/doc/doxygen/preface.md \ - $(SRCDIR)/doc/doxygen/intro.md \ - $(SRCDIR)/doc/doxygen/design.md \ - $(SRCDIR)/doc/doxygen/conventions.md + $(SRCDIR)/docs/doxygen/preface.md \ + $(SRCDIR)/docs/doxygen/intro.md \ + $(SRCDIR)/docs/doxygen/design.md \ + $(SRCDIR)/docs/doxygen/conventions.md # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -852,7 +852,7 @@ EXCLUDE_SYMBOLS = *UCS_CONFIG_STRING_ARRAY_FIELD* # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = $(SRCDIR)/test/examples/ +EXAMPLE_PATH = $(SRCDIR)/examples/ # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -872,8 +872,8 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). 
-IMAGE_PATH = $(SRCDIR)/doc/figures \ - $(SRCDIR)/doc/doxygen +IMAGE_PATH = $(SRCDIR)/docs/figures \ + $(SRCDIR)/docs/doxygen # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -1653,7 +1653,7 @@ EXTRA_PACKAGES = times # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = doc/doxygen/header.tex +LATEX_HEADER = docs/doxygen/header.tex # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1683,7 +1683,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = $(SRCDIR)/doc/doxygen/UCX_Logo_930x933.png +LATEX_EXTRA_FILES = $(SRCDIR)/docs/doxygen/UCX_Logo_930x933.png # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will diff --git a/doc/doxygen/ucxlayout.xml b/docs/doxygen/ucxlayout.xml similarity index 100% rename from doc/doxygen/ucxlayout.xml rename to docs/doxygen/ucxlayout.xml diff --git a/docs/source/_static/UCX_Layers.png b/docs/source/_static/UCX_Layers.png new file mode 100644 index 00000000000..6d8581a1444 Binary files /dev/null and b/docs/source/_static/UCX_Layers.png differ diff --git a/docs/source/_static/ucxlogo.png b/docs/source/_static/ucxlogo.png new file mode 100644 index 00000000000..6ed5c4e516f Binary files /dev/null and b/docs/source/_static/ucxlogo.png differ diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 00000000000..cabcb8f1af6 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +# +# See file LICENSE for terms. 
+# + +# +# Project information +# + +project = u'OpenUCX' +copyright = u'2019, UCF' +author = u'UCF' + +# +# General options +# + +extensions = ['recommonmark'] # For processing Markdown pages +templates_path = ['_templates'] +source_suffix = ['.rst', '.md'] +master_doc = 'index' +language = None +exclude_patterns = [u'_build'] +pygments_style = None + + +# +# HTML options +# + +html_theme = 'sphinx_rtd_theme' +html_logo = '_static/ucxlogo.png' +html_theme_options = { + 'style_external_links': True +} +html_static_path = ['_static'] +htmlhelp_basename = 'OpenUCXdoc' + + +# +# UCX custom configuration +# + +def getLatestVersion(): + import requests + request = requests.get('https://api.github.com/repos/openucx/ucx/releases/latest') + return request.json()["name"] + +def substituteVersion(app, docname, source): + # + # Updating these 2 variables will automatically update all download and API + # documentation links. + # We don't use the normal RST substitution because it cannot substitute text + # inside code blocks and URL links. + # + version_name = getLatestVersion() + clean_version = version_name.lstrip('v') # remove leading 'v' for tag name + api_version = clean_version.rsplit('.', 1)[0] # take only MAJOR.MINOR + result = source[0].replace("{VERSION}", api_version) \ + .replace("{RELEASE}", clean_version) + source[0] = result + +def setup(app): + app.connect('source-read', substituteVersion) diff --git a/docs/source/download.md b/docs/source/download.md new file mode 100644 index 00000000000..da8e3c1c608 --- /dev/null +++ b/docs/source/download.md @@ -0,0 +1,13 @@ +# Download + +## v{RELEASE} release + +* Download [TGZ](https://github.com/openucx/ucx/releases/download/v{RELEASE}/ucx-{RELEASE}.tar.gz) [SRPM](https://github.com/openucx/ucx/releases/download/v{RELEASE}/ucx-{RELEASE}-1.fc30.src.rpm) +* [Release page](https://github.com/openucx/ucx/releases/tag/v{RELEASE}) +* [Running](running) + +
+ +## Previous releases + +[GitHub release page](https://github.com/openucx/ucx/releases) diff --git a/docs/source/faq.md b/docs/source/faq.md new file mode 100644 index 00000000000..9a0d2a70b17 --- /dev/null +++ b/docs/source/faq.md @@ -0,0 +1,312 @@ +# Frequently Asked Questions + +## General + +### Overview + +#### What is UCX? +UCX is a framework (collection of libraries and interfaces) that provides efficient +and relatively easy way to construct widely used HPC protocols: MPI tag matching, +RMA operations, rendezvous protocols, stream, fragmentation, remote atomic operations, etc. + +#### What is UCP, UCT, UCS? +* **UCT** is a transport layer that abstracts the differences across various hardware architectures and provides a low-level API that enables the implementation of communication protocols. The primary goal of the layer is to provide direct and efficient access to hardware network resources with minimal software overhead. For this purpose UCT relies on low-level drivers provided by vendors such as InfiniBand Verbs, Cray's uGNI, libfabrics, etc. In addition, the layer provides constructs for communication context management (thread-based and ap- plication level), and allocation and management of device- specific memories including those found in accelerators. In terms of communication APIs, UCT defines interfaces for immediate (short), buffered copy-and-send (bcopy), and zero- copy (zcopy) communication operations. The short operations are optimized for small messages that can be posted and completed in place. The bcopy operations are optimized for medium size messages that are typically sent through a so- called bouncing-buffer. Finally, the zcopy operations expose zero-copy memory-to-memory communication semantics. + +* **UCP** implements higher-level protocols that are typically used by message passing (MPI) and PGAS programming models by using lower-level capabilities exposed through the UCT layer. 
+UCP is responsible for the following functionality: initialization of the library, selection of transports for communication, message fragmentation, and multi-rail communication. Currently, the API has the following classes of interfaces: Initialization, Remote Memory Access (RMA) communication, Atomic Memory Operations (AMO), Active Message, Tag-Matching, and Collectives. + +* **UCS** is a service layer that provides the necessary func- tionality for implementing portable and efficient utilities. + +#### How can I contribute? +1. Fork +2. Fix bug or implement a new feature +3. Open Pull Request + +#### How do I get in touch with UCX developers? +Please join our mailing list: https://elist.ornl.gov/mailman/listinfo/ucx-group or +submit issues on github: https://github.com/openucx/ucx/issues + +
+ +### UCX mission + +#### What are the key features of UCX? +* **Open source framework supported by vendors** +The UCX framework is maintained and supported by hardware vendors in addition to the open source community. Every pull-request is tested and multiple hardware platforms supported by vendors community. + +* **Performance, performance, performance!** +The framework design, data structures, and components are design to provide highly optimized access to the network hardware. + +* **High level API for a broad range HPC programming models.** +UCX provides a high level API implemented in software 'UCP' to fill in the gaps across interconnects. This allows to use a single set of APIs in a library to implement multiple interconnects. This reduces the level of complexities when implementing libraries such as Open MPI or OpenSHMEM. Because of this, UCX performance portable because a single implementation (in Open MPI or OpenSHMEM) will work efficiently on multiple interconnects. (e.g. uGNI, Verbs, libfabrics, etc). + +* **Support for interaction between multiple transports (or providers) to deliver messages.** +For example, UCX has the logic (in UCP) to make 'GPUDirect', IB' and share memory work together efficiently to deliver the data where is needed without the user dealing with this. + +* **Cross-transport multi-rail capabilities.** UCX protocol layer can utilize multiple transports, + event on different types of hardware, to deliver messages faster, without the need for + any special tuning. + +* **Utilizing hardware offloads for optimized performance**, such as RDMA, Hardware tag-matching + hardware atomic operations, etc. + +#### What protocols are supported by UCX? +UCP implements RMA put/get, send/receive with tag matching, Active messages, atomic operations. In near future we plan to add support for commonly used collective operations. + +#### Is UCX replacement for GASNET? +No. 
GASNET exposes high level API for PGAS programming management that provides symmetric memory management capabilities and build in runtime environments. These capabilities are out of scope of UCX project. +Instead, GASNET can leverage UCX framework for fast end efficient implementation of GASNET for the network technologies support by UCX. + +#### What is the relation between UCX and network drivers? +UCX framework does not provide drivers, instead it relies on the drivers provided by vendors. Currently we use: OFA VERBs, Cray's UGNI, NVIDIA CUDA. + +#### What is the relation between UCX and OFA Verbs or Libfabrics? +UCX, is a middleware communication layer that relies on vendors provided user level drivers including OFA Verbs or libfabrics (or any other drivers provided by another communities or vendors) to implement high-level protocols which can be used to close functionality gaps between various vendors drivers including various libfabrics providers: coordination across various drivers, multi-rail capabilities, software based RMA, AMOs, tag-matching for transports and drivers that do not support such capabilities natively. + +#### Is UCX a user level driver? +No. Typically, Drivers aim to expose fine-grain access to the network architecture specific features. +UCX abstracts the differences across various drivers and fill-in the gaps using software protocols for some of the architectures that don't provide hardware level support for all the operations. + +
+ +### Dependencies + +#### What stuff should I have on my machine to use UCX? + +UCX detects the exiting libraries on the build machine and enables/disables support +for various features accordingly. +If some of the modules UCX was built with are not found during runtime, they will +be silently disabled. + +* **Basic shared memory and TCP support** - always enabled +* **Optimized shared memory** - requires knem or xpmem drivers. On modern kernels also CMA (cross-memory-attach) mechanism will be used. +* **RDMA support** - requires rdma-core or libibverbs library. +* **NVIDIA GPU support** - requires Cuda drives +* **AMD GPU support** - requires ROCm drivers + + +#### Does UCX depend on an external runtime environment? +UCX does not depend on an external runtime environment. + +`ucx_perftest` (UCX based application/benchmark) can be linked with an external runtime environment that can be used for remote `ucx_perftest` launch, but this an optional configuration which is only used for environments that do not provide direct access to compute nodes. By default this option is disabled. + +
+ + +### Configuration and tuning + +#### How can I specify special configuration and tunings for UCX? + +UCX takes parameters from specific **environment variables**, which start with the +prefix `UCX_`. +> **IMPORTANT NOTE:** Changing the values of UCX environment variables to non-default +may lead to undefined behavior. The environment variables are mostly indented for + dvanced users, or for specific tunings or workarounds recommended by UCX community. + +#### 2. Where can I see all UCX environment variables? + +* Running `ucx_info -c` prints all environment variables and their default values. +* Running `ucx_info -cf` prints the documentation for all environment variables. + + +
+ +--- +
+ +## Network capabilities + +### Selecting networks and transports + +#### Which network devices does UCX use? + +By default, UCX tries to use all available devices on the machine, and selects +best ones based on performance characteristics (bandwidth, latency, NUMA locality, etc). +Setting `UCX_NET_DEVICES=,,...` would restrict UCX to using **only** +the specified devices. +For example: +* `UCX_NET_DEVICES=eth2` - Use the Ethernet device eth2 for TCP sockets transport. +* `UCX_NET_DEVICES=mlx5_2:1` - Use the RDMA device mlx5_2, port 1 + +Running `ucx_info -d` would show all available devices on the system that UCX can utilize. + +#### Which transports does UCX use? + +By default, UCX tries to use all available transports, and select best ones +according to their performance capabilities and scale (passed as estimated number +of endpoints to *ucp_init()* API). +For example: +* On machines with Ethernet devices only, shared memory will be used for intra-node +communication and TCP sockets for inter-node communication. +* On machines with RDMA devices, RC transport will be used for small scale, and + DC transport (available with Connect-IB devices and above) will be used for large + scale. If DC is not available, UD will be used for large scale. +* If GPUs are present on the machine, GPU transports will be enabled for detecting + memory pointer type and copying to/from GPU memory. + +It's possible to restrict the transports in use by setting `UCX_TLS=,,...`. +The list of all transports supported by UCX on the current machine can be generated +by `ucx_info -d` command. +> **IMPORTANT NOTE** +> In some cases restricting the transports can lead to unexpected and undefined behavior: +> * Using *rc_verbs* or *rc_mlx5* also requires *ud_verbs* or *ud_mlx5* transport for bootstrap. +> * Applications using GPU memory must also specify GPU transports for detecting and +> handling non-host memory. 
+ +In addition to the built-in transports it's possible to use aliases which specify multiple transports. + +##### List of main transports and aliases + + + + + + + + + + + + + + + +
alluse all the available transports.
sm or shmall shared memory transports.
ugniugni_rdma and ugni_udt.
rcRC (=reliable connection), "accelerated" transports are used if possible.
udUD (=unreliable datagram), "accelerated" is used if possible.
dcDC - Mellanox scalable offloaded dynamic connection transport
rc_xSame as "rc", but using accelerated transports only
rc_vSame as "rc", but using Verbs-based transports only
ud_xSame as "ud", but using accelerated transports only
ud_vSame as "ud", but using Verbs-based transports only
cudaCUDA (NVIDIA GPU) memory support: cuda_copy, cuda_ipc, gdr_copy
rocmROCm (AMD GPU) memory support: rocm_copy, rocm_ipc, rocm_gdr
tcpTCP over SOCK_STREAM sockets
selfLoopback transport to communicate within the same process
+ +For example: +- `UCX_TLS=rc` will select RC, UD for bootstrap, and prefer accelerated transports +- `UCX_TLS=rc,cuda` will select RC along with Cuda memory transports. + + +
+ + +### Multi-rail + +#### Does UCX support multi-rail? + +Yes. + +#### What is the default behavior in a multi-rail environment? + +By default UCX would pick the 2 best network devices, and split large +messages between the rails. For example, in a 100MB message - the 1st 50MB +would be sent on the 1st device, and the 2nd 50MB would be sent on the 2nd device. +If the device network speeds are not the same, the split will be proportional to +their speed ratio. + +The devices to use are selected according to best network speed, PCI bandwidth, +and NUMA locality. + +#### Is it possible to use more than 2 rails? + +Yes, by setting `UCX_MAX_RNDV_RAILS=`. Currently up to 4 are supported. + +#### Is it possible that each process would just use the closest device? + +Yes, by `UCX_MAX_RNDV_RAILS=1` each process would use a single network device +according to NUMA locality. + +#### Can I disable multi-rail? + +Yes, by setting `UCX_NET_DEVICES=` to the single device that should be used. + +
+ +### Adaptive routing + +#### Does UCX support adaptive routing fabrics? + +Yes. + +#### What do I need to do to run UCX with adaptive routing? + +When adaptive routing is configured on an Infiniband fabric, it is enabled per SL +(IB Service Layer). +Setting `UCX_IB_SL=` will make UCX run on the given +service level and utilize adaptive routing. + +
+ +### RoCE + +#### How to specify service level with UCX? + +Setting `UCX_IB_SL=` will make UCX run on the given service level. + +#### How to specify DSCP priority? + +Setting `UCX_IB_TRAFFIC_CLASS=`. + +#### How to specify which address to use? + +Setting `UCX_IB_GID_INDEX=` would make UCX use the specified GID index on +the RoCE port. The system command `show_gids` would print all available addresses +and their indexes. + +--- +
+ +## Working with GPU + +### GPU support + +#### How does UCX support GPU? + +UCX protocol operations can work with GPU memory pointers the same way as with Host +memory pointers. For example, the 'buffer' argument passed to `ucp_tag_send_nb()` can +be either host or GPU memory. + + +#### Which GPUs are supported? + +Currently UCX supports NVIDIA GPUs by Cuda library, and AMD GPUs by ROCm library. + + +#### Which UCX APIs support GPU memory? + +Currently only UCX tagged APIs (ucp_tag_send_XX/ucp_tag_recv_XX) and stream APIs +(ucp_stream_send/ucp_stream_recv_XX) support GPU memory. + +#### How to run UCX with GPU support? + +In order to run UCX with GPU support, you will need an application which allocates +GPU memory (for example, +[MPI OSU benchmarks with Cuda support](https://mvapich.cse.ohio-state.edu/benchmarks)), +and UCX compiled with GPU support. Then you can run the application as usual (for +example, with MPI) and whenever GPU memory is passed to UCX, it will either use GPU-direct +for zero copy operations, or copy the data to/from host memory. +> NOTE When specifying UCX_TLS explicitly, you must also specify cuda/rocm for GPU memory +> support, otherwise the GPU memory will not be recognized. +> For example: `UCX_TLS=rc,cuda` or `UCX_TLS=dc,rocm` + +#### I'm running UCX with GPU memory and getting a segfault, why? + +Most likely UCX does not detect that the pointer is GPU memory and tries to +access it from CPU. It can happen if UCX is not compiled with GPU support, or fails +to load CUDA or ROCm modules due to missing library paths or version mismatch. +Please run `ucx_info -d | grep cuda` or `ucx_info -d | grep rocm` to check for +UCX GPU support. + +#### What are the current limitations of using GPU memory? + +* **Static compilation** - programs which are statically compiled with Cuda libraries + must disable memory detection cache by setting `UCX_MEMTYPE_CACHE=n`. The reason + is that memory allocation hooks do not work with static compilation. 
Disabling this + cache could have a negative effect on performance, especially for small messages. + +
+ +### Performance considerations + +#### Does UCX support zero-copy for GPU memory over RDMA? + +Yes. For large messages UCX can transfer GPU memory using zero-copy RDMA with the +rendezvous protocol. It requires the peer memory driver for the relevant GPU type +to be loaded on the system. +> **NOTE:** In some cases if the RDMA network device and the GPU are not on +the same NUMA node, such zero-copy transfer is inefficient. + + + +
+ diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 00000000000..15f5646572d --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,88 @@ +.. +.. Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +.. +.. See file LICENSE for terms. +.. + +******* +OpenUCX +******* + +Unified Communication X (UCX) is an `award winning `_, +optimized production proven communication framework for modern, high-bandwidth +and low-latency networks. + +UCX exposes a set of abstract communication primitives which utilize the best of +available hardware resources and offloads. These include RDMA (InfiniBand and RoCE), +TCP, GPUs, shared Memory, and network atomic operations. + +UCX facilitates rapid development by providing a high-level API, masking the +low-level details, while maintaining high-performance and scalability. + +UCX implements best practices for transfer of messages of all sizes, based on +accumulated experience gained from applications running on the world's largest +datacenters and supercomputers. The full list of features and capabilities can +be found :ref:`here`. + +UCX is a member of `UCF consortium `_. + + +.. image:: _static/UCX_Layers.png + :alt: UCX layer diagram + :align: center + +.. toctree:: + :maxdepth: 3 + :hidden: + + ucx_features + download + running + faq + + +Quick start +*********** + +The following commands will download the latest UCX v{RELEASE} release, build the code, +and run a simple client/server example: + +.. code-block:: console + + $ wget https://github.com/openucx/ucx/releases/download/v{RELEASE}/ucx-{RELEASE}.tar.gz + $ tar xzf ucx-{RELEASE}.tar.gz + $ cd ucx-{RELEASE} + $ ./contrib/configure-release --prefix=$PWD/install + $ make -j8 install + + $ gcc examples/ucp_client_server.c -lucp -lucs -o ucp_client_server \ + -Iinstall/include -Linstall/lib + $ export LD_LIBRARY_PATH=$PWD/install/lib + $ ./ucp_client_server & + $ ./ucp_client_server -a # : IP address of a local RoCE or IPoIB interface + ... 
+ ----- UCP TEST SUCCESS ------- + + UCX Client-Server Hello World + + ------------------------------ + + +Documentation +************* + +* API doc: `HTML `_ `PDF `_ +* `Examples `_ + + +Projects using UCX +****************** + +* `UCX-PY `_ +* `Dask `_ +* `SparkUCX `_ +* `NCCL `_ +* `OpenMPI `_ +* `MPICH `_ +* `Charm++ `_ +* `OSSS shmem `_ diff --git a/docs/source/running.md b/docs/source/running.md new file mode 100644 index 00000000000..76e773ed900 --- /dev/null +++ b/docs/source/running.md @@ -0,0 +1,158 @@ + + +# Running UCX + +## UCX build and install + +#### Getting the source + + +* Download latest code from github: +``` +$ git clone https://github.com/openucx/ucx.git ucx +$ cd ucx +$ ./autogen.sh +``` + +* Alternatively, download and extract one of UCX pre-configured [releases](download): +``` +$ wget https://github.com/openucx/ucx/releases/download/v{RELEASE}/ucx-{RELEASE}.tar.gz +$ tar xzf ucx-{RELEASE}.tar.gz +$ cd ucx-{RELEASE} +``` + +* (This step is only required for OpenPOWER platforms) + On Ubuntu platform the config.guess file is a bit outdated and does not have + support for power. In order to resolve the issue you have to download an updated config.guess. + From the root of the project: + ``` + $ wget https://github.com/shamisp/ucx/raw/topic/power8-config/config.guess + ``` + +#### Building + +1. Configure: + ``` + $ mkdir build + $ cd build + $ ../configure --prefix= + ``` + > **NOTE**: For best performance configuration, use **../contrib/configure-release** + > instead of **../configure**. + > This will strip all debugging and profiling code. + + +2. Build and install: + ``` + $ make -j4 + $ make install + ``` + +
+ +--- +
+ +## OpenMPI with UCX + +[OpenMPI](https://www.open-mpi.org) supports UCX starting from version 3.0, but +it's recommended to use version 4.0 or higher due to stability and performance +improvements. + +### Building + +1. Get latest-and-greatest OpenMPI version: + ``` + $ git clone https://github.com/open-mpi/ompi.git + $ cd ompi + $ ./autogen.pl + ``` + +2. Configure with UCX: + ``` + $ mkdir build-ucx + $ cd build-ucx + $ ../configure --prefix= --with-ucx= + ``` +> **NOTE**: With OpenMPI 4.0 and above, there could be compilation errors from "btl_uct" component. +> This component is not critical for using UCX; so it could be disabled this way: +> ``` +> $ ./configure ... --enable-mca-no-build=btl-uct ... +> ``` + +3. Build: + ```bash + $ make + $ make install + ``` + +
+ +### Running MPI + +Example of the command line (with optional flag to select IB device mlx5_0 port 1): +``` +$ mpirun -np 2 -mca pml ucx -x UCX_NET_DEVICES=mlx5_0:1 ./app +``` +> **IMPORTANT NOTE**: Recent OpenMPI versions contain a BTL component called 'uct', +> which can cause data corruption when enabled, due to conflict on malloc hooks +> between OPAL and UCM. +> In order to work around this, use one of the following alternatives: +> +> Alternative 1: Disable btl/uct in OpenMPI build configuration: +> ``` +> $ ./configure ... --enable-mca-no-build=btl-uct ... +> ``` +> +> Alternative 2: Disable btl/uct at runtime +> ``` +> $ mpirun -np 2 -mca pml ucx -mca btl ^uct -x UCX_NET_DEVICES=mlx5_0:1 ./app +> ``` + +
+ +### Runtime tunings +By default OpenMPI enables built-in transports (BTLs), which may result in additional +software overheads in the OpenMPI progress function. In order to work around this issue +you may try to disable certain BTLs. +``` +$ mpirun -np 2 -mca pml ucx --mca btl ^vader,tcp,openib,uct -x UCX_NET_DEVICES=mlx5_0:1 ./app +``` +
+ +--- +
+ +## MPICH with UCX +UCX is supported in MPICH 3.3 and higher versions. +UCX is already embedded in the MPICH tarball, so you do not need to separately download UCX. + +### Building + +1. Download mpich-3.3 or higher from https://www.mpich.org + +2. Configure with UCX: +``` +$ mkdir build +$ cd build +$ ../configure --prefix= --with-device=ch4:ucx +``` + +3. Build MPICH: +``` +$ make -j4 +$ make install +``` + +
+ +### Running MPI +Example of the command line (with optional flag to select IB device mlx5_0 port 1): +``` +$ mpirun -np 2 -env UCX_NET_DEVICES=mlx5_0:1 ./executable +``` + diff --git a/docs/source/ucx_features.rst b/docs/source/ucx_features.rst new file mode 100644 index 00000000000..a7bfdbd52a4 --- /dev/null +++ b/docs/source/ucx_features.rst @@ -0,0 +1,62 @@ +.. +.. Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +.. +.. See file LICENSE for terms. +.. + +.. _ucx_features: + +***************** +UCX main features +***************** + +High-level API features +*********************** +- Select either a client/server connection establishment (similar to TCP), or + connect directly by passing remote address blob. +- Support sharing resources between threads, or allocating dedicated resources + per thread. +- Event-driven or polling-driven progress. +- Java and Python bindings. +- Seamless handling of GPU memory. + +Main APIs +--------- +- Stream-oriented send/receive operations. +- Tag-matched send/receive. +- Remote memory access. +- Remote atomic operations. + +Fabrics support +*************** +- RoCE +- InfiniBand +- TCP sockets +- Shared memory (CMA, knem, xpmem, SysV, mmap) +- Cray Gemini / Aries (ugni) + +Platforms support +***************** +- Supported architectures: x86_64, Arm v8, Power. +- Runs on virtual machines (using SRIOV) and containers (docker, singularity). +- Can utilize either MLNX_OFED or Inbox RDMA drivers. +- Tested on major Linux distributions (RedHat/Ubuntu/SLES). + +GPU support +*********** +- Cuda (for NVIDIA GPUs) +- ROCm (for AMD GPUs) + +Protocols, Optimizations and Advanced Features +********************************************** +- Automatic selection of best transports and devices. +- Zero-copy with registration cache. +- Scalable flow control algorithms. +- Optimized memory pools. +- Accelerated direct-verbs transport for Mellanox devices. 
+- Pipeline protocols for GPU memory +- QoS and traffic isolation for RDMA transports +- Platform (micro-architecture) specific optimizations (such as memcpy, memory barriers, etc.) +- Multi-rail support +- Bare-metal, containers and cloud environments support +- Advanced protocols for transfer messages of different sizes diff --git a/doc/uml/ucp.dot b/docs/uml/ucp.dot similarity index 100% rename from doc/uml/ucp.dot rename to docs/uml/ucp.dot diff --git a/doc/uml/uct.dot b/docs/uml/uct.dot similarity index 100% rename from doc/uml/uct.dot rename to docs/uml/uct.dot diff --git a/examples/Makefile.am b/examples/Makefile.am new file mode 100644 index 00000000000..05cde2765dd --- /dev/null +++ b/examples/Makefile.am @@ -0,0 +1,63 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. +# +# Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. +# See file LICENSE for terms. +# + +examplesdir = $(pkgdatadir)/examples +dist_examples_DATA = \ + hello_world_util.h \ + ucp_hello_world.c \ + uct_hello_world.c \ + ucp_client_server.c + +if HAVE_CUDA +EXAMPLE_CUDA_LDFLAGS = $(CUDA_LDFLAGS) +# cuda.h couldn't be compiled with -pedantic flag +EXAMPLE_CUDA_CFLAGS = +EXAMPLE_CUDA_CPPFLAGS = $(CUDA_CPPFLAGS) -DHAVE_CUDA +else +EXAMPLE_CUDA_LDFLAGS = +EXAMPLE_CUDA_CFLAGS = $(CFLAGS_PEDANTIC) +EXAMPLE_CUDA_CPPFLAGS = +endif + +EXAMPLE_CCLD_FLAGS = -lucs -I$(includedir) -L$(libdir) -Wall -Werror -Wl,-rpath,$(libdir) \ + $(EXAMPLE_CUDA_LDFLAGS) $(EXAMPLE_CUDA_CPPFLAGS) + +installcheck-local: + @echo "INSTALLCHECK: Compiling examples with installed library" + $(CC) -o uct_hello_world $(examplesdir)/uct_hello_world.c -luct $(EXAMPLE_CCLD_FLAGS) + $(CC) -o ucp_hello_world $(examplesdir)/ucp_hello_world.c -lucp $(EXAMPLE_CCLD_FLAGS) + $(CC) -o ucp_client_server $(examplesdir)/ucp_client_server.c -lucp $(EXAMPLE_CCLD_FLAGS) + $(RM) *.o uct_hello_world ucp_hello_world ucp_client_server + +if HAVE_EXAMPLES + +bin_PROGRAMS = \ + ucp_hello_world \ + 
uct_hello_world \ + ucp_client_server + +ucp_hello_world_SOURCES = ucp_hello_world.c +ucp_hello_world_CFLAGS = $(BASE_CFLAGS) $(EXAMPLE_CUDA_CFLAGS) +ucp_hello_world_CPPFLAGS = $(BASE_CPPFLAGS) $(EXAMPLE_CUDA_CPPFLAGS) +ucp_hello_world_LDADD = $(top_builddir)/src/ucs/libucs.la \ + $(top_builddir)/src/ucp/libucp.la \ + $(EXAMPLE_CUDA_LDFLAGS) + +uct_hello_world_SOURCES = uct_hello_world.c +uct_hello_world_CFLAGS = $(BASE_CFLAGS) $(EXAMPLE_CUDA_CFLAGS) +uct_hello_world_CPPFLAGS = $(BASE_CPPFLAGS) $(EXAMPLE_CUDA_CPPFLAGS) +uct_hello_world_LDADD = $(top_builddir)/src/ucs/libucs.la \ + $(top_builddir)/src/uct/libuct.la \ + $(EXAMPLE_CUDA_LDFLAGS) + +ucp_client_server_SOURCES = ucp_client_server.c +ucp_client_server_CFLAGS = $(BASE_CFLAGS) $(CFLAGS_PEDANTIC) +ucp_client_server_CPPFLAGS = $(BASE_CPPFLAGS) +ucp_client_server_LDADD = $(top_builddir)/src/ucs/libucs.la \ + $(top_builddir)/src/ucp/libucp.la + +endif diff --git a/examples/hello_world_util.h b/examples/hello_world_util.h new file mode 100644 index 00000000000..c51134e1683 --- /dev/null +++ b/examples/hello_world_util.h @@ -0,0 +1,305 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2016. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifndef UCX_HELLO_WORLD_H +#define UCX_HELLO_WORLD_H + +#include + +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_CUDA +# include +# include +#endif + + +#define CHKERR_ACTION(_cond, _msg, _action) \ + do { \ + if (_cond) { \ + fprintf(stderr, "Failed to %s\n", _msg); \ + _action; \ + } \ + } while (0) + + +#define CHKERR_JUMP(_cond, _msg, _label) \ + CHKERR_ACTION(_cond, _msg, goto _label) + + +#define CHKERR_JUMP_RETVAL(_cond, _msg, _label, _retval) \ + do { \ + if (_cond) { \ + fprintf(stderr, "Failed to %s, return value %d\n", _msg, _retval); \ + goto _label; \ + } \ + } while (0) + + +static ucs_memory_type_t test_mem_type = UCS_MEMORY_TYPE_HOST; + + +#define CUDA_FUNC(_func) \ + do { \ + cudaError_t _result = (_func); \ + if (cudaSuccess != _result) { \ + fprintf(stderr, "%s failed: %s\n", \ + #_func, cudaGetErrorString(_result)); \ + } \ + } while(0) + + +void print_common_help(void); + +void *mem_type_malloc(size_t length) +{ + void *ptr; + + switch (test_mem_type) { + case UCS_MEMORY_TYPE_HOST: + ptr = malloc(length); + break; +#ifdef HAVE_CUDA + case UCS_MEMORY_TYPE_CUDA: + CUDA_FUNC(cudaMalloc(&ptr, length)); + break; + case UCS_MEMORY_TYPE_CUDA_MANAGED: + CUDA_FUNC(cudaMallocManaged(&ptr, length, cudaMemAttachGlobal)); + break; +#endif + default: + fprintf(stderr, "Unsupported memory type: %d\n", test_mem_type); + ptr = NULL; + break; + } + + return ptr; +} + +void mem_type_free(void *address) +{ + switch (test_mem_type) { + case UCS_MEMORY_TYPE_HOST: + free(address); + break; +#ifdef HAVE_CUDA + case UCS_MEMORY_TYPE_CUDA: + case UCS_MEMORY_TYPE_CUDA_MANAGED: + CUDA_FUNC(cudaFree(address)); + break; +#endif + default: + fprintf(stderr, "Unsupported memory type: %d\n", test_mem_type); + break; + } +} + +void *mem_type_memcpy(void *dst, const void *src, size_t count) +{ + switch (test_mem_type) { + case UCS_MEMORY_TYPE_HOST: + memcpy(dst, src, count); + break; +#ifdef HAVE_CUDA + case UCS_MEMORY_TYPE_CUDA: + case 
UCS_MEMORY_TYPE_CUDA_MANAGED: + CUDA_FUNC(cudaMemcpy(dst, src, count, cudaMemcpyDefault)); + break; +#endif + default: + fprintf(stderr, "Unsupported memory type: %d\n", test_mem_type); + break; + } + + return dst; +} + +void *mem_type_memset(void *dst, int value, size_t count) +{ + switch (test_mem_type) { + case UCS_MEMORY_TYPE_HOST: + memset(dst, value, count); + break; +#ifdef HAVE_CUDA + case UCS_MEMORY_TYPE_CUDA: + case UCS_MEMORY_TYPE_CUDA_MANAGED: + CUDA_FUNC(cudaMemset(dst, value, count)); + break; +#endif + default: + fprintf(stderr, "Unsupported memory type: %d", test_mem_type); + break; + } + + return dst; +} + +int check_mem_type_support(ucs_memory_type_t mem_type) +{ + switch (test_mem_type) { + case UCS_MEMORY_TYPE_HOST: + return 1; + case UCS_MEMORY_TYPE_CUDA: + case UCS_MEMORY_TYPE_CUDA_MANAGED: +#ifdef HAVE_CUDA + return 1; +#else + return 0; +#endif + default: + fprintf(stderr, "Unsupported memory type: %d", test_mem_type); + break; + } + + return 0; +} + +ucs_memory_type_t parse_mem_type(const char *opt_arg) +{ + if (!strcmp(opt_arg, "host")) { + return UCS_MEMORY_TYPE_HOST; + } else if (!strcmp(opt_arg, "cuda") && + check_mem_type_support(UCS_MEMORY_TYPE_CUDA)) { + return UCS_MEMORY_TYPE_CUDA; + } else if (!strcmp(opt_arg, "cuda-managed") && + check_mem_type_support(UCS_MEMORY_TYPE_CUDA_MANAGED)) { + return UCS_MEMORY_TYPE_CUDA_MANAGED; + } else { + fprintf(stderr, "Unsupported memory type: \"%s\".\n", opt_arg); + } + + return UCS_MEMORY_TYPE_LAST; +} + +void print_common_help() +{ + fprintf(stderr, " -n name Set node name or IP address " + "of the server (required for client and should be ignored " + "for server)\n"); + fprintf(stderr, " -p port Set alternative server port (default:13337)\n"); + fprintf(stderr, " -s size Set test string length (default:16)\n"); + fprintf(stderr, " -m memory type of messages\n"); + fprintf(stderr, " host - system memory (default)\n"); + if (check_mem_type_support(UCS_MEMORY_TYPE_CUDA)) { + fprintf(stderr, " 
cuda - NVIDIA GPU memory\n"); + } + if (check_mem_type_support(UCS_MEMORY_TYPE_CUDA_MANAGED)) { + fprintf(stderr, " cuda-managed - NVIDIA GPU managed/unified memory\n"); + } +} + +int server_connect(uint16_t server_port) +{ + struct sockaddr_in inaddr; + int lsock = -1; + int dsock = -1; + int optval = 1; + int ret; + + lsock = socket(AF_INET, SOCK_STREAM, 0); + CHKERR_JUMP(lsock < 0, "open server socket", err); + + optval = 1; + ret = setsockopt(lsock, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)); + CHKERR_JUMP(ret < 0, "server setsockopt()", err_sock); + + inaddr.sin_family = AF_INET; + inaddr.sin_port = htons(server_port); + inaddr.sin_addr.s_addr = INADDR_ANY; + memset(inaddr.sin_zero, 0, sizeof(inaddr.sin_zero)); + ret = bind(lsock, (struct sockaddr*)&inaddr, sizeof(inaddr)); + CHKERR_JUMP(ret < 0, "bind server", err_sock); + + ret = listen(lsock, 0); + CHKERR_JUMP(ret < 0, "listen server", err_sock); + + fprintf(stdout, "Waiting for connection...\n"); + + /* Accept next connection */ + dsock = accept(lsock, NULL, NULL); + CHKERR_JUMP(dsock < 0, "accept server", err_sock); + + close(lsock); + + return dsock; + +err_sock: + close(lsock); + +err: + return -1; +} + +int client_connect(const char *server, uint16_t server_port) +{ + struct sockaddr_in conn_addr; + struct hostent *he; + int connfd; + int ret; + + connfd = socket(AF_INET, SOCK_STREAM, 0); + CHKERR_JUMP(connfd < 0, "open client socket", err); + + he = gethostbyname(server); + CHKERR_JUMP((he == NULL || he->h_addr_list == NULL), "found a host", err_conn); + + conn_addr.sin_family = he->h_addrtype; + conn_addr.sin_port = htons(server_port); + + memcpy(&conn_addr.sin_addr, he->h_addr_list[0], he->h_length); + memset(conn_addr.sin_zero, 0, sizeof(conn_addr.sin_zero)); + + ret = connect(connfd, (struct sockaddr*)&conn_addr, sizeof(conn_addr)); + CHKERR_JUMP(ret < 0, "connect client", err_conn); + + return connfd; + +err_conn: + close(connfd); +err: + return -1; +} + +static int barrier(int oob_sock) 
+{ + int dummy = 0; + ssize_t res; + + res = send(oob_sock, &dummy, sizeof(dummy), 0); + if (res < 0) { + return res; + } + + res = recv(oob_sock, &dummy, sizeof(dummy), MSG_WAITALL); + + /* number of received bytes should be the same as sent */ + return !(res == sizeof(dummy)); +} + +static int generate_test_string(char *str, int size) +{ + char *tmp_str; + int i; + + tmp_str = calloc(1, size); + CHKERR_ACTION(tmp_str == NULL, "allocate memory\n", return -1); + + for (i = 0; i < (size - 1); ++i) { + tmp_str[i] = 'A' + (i % 26); + } + + mem_type_memcpy(str, tmp_str, size); + + free(tmp_str); + return 0; +} + +#endif /* UCX_HELLO_WORLD_H */ diff --git a/examples/ucp_client_server.c b/examples/ucp_client_server.c new file mode 100644 index 00000000000..8081064a279 --- /dev/null +++ b/examples/ucp_client_server.c @@ -0,0 +1,836 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2018. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +/* + * UCP client - server example utility + * ----------------------------------------------- + * + * Server side: + * + * ./ucp_client_server + * + * Client side: + * + * ./ucp_client_server -a + * + * Notes: + * + * - The server will listen to incoming connection requests on INADDR_ANY. + * - The client needs to pass the IP address of the server side to connect to + * as an argument to the test. + * - Currently, the passed IP needs to be an IPoIB or a RoCE address. + * - The port which the server side would listen on can be modified with the + * '-p' option and should be used on both sides. The default port to use is + * 13337. 
+ */ + +#include + +#include /* memset */ +#include /* inet_addr */ +#include /* getopt */ +#include /* atoi */ + +#define TEST_STRING_LEN sizeof(test_message) +#define DEFAULT_PORT 13337 +#define IP_STRING_LEN 50 +#define PORT_STRING_LEN 8 +#define TAG 0xCAFE +#define COMM_TYPE_DEFAULT "STREAM" +#define PRINT_INTERVAL 2000 +#define DEFAULT_NUM_ITERATIONS 1 + +const char test_message[] = "UCX Client-Server Hello World"; +static uint16_t server_port = DEFAULT_PORT; +static int num_iterations = DEFAULT_NUM_ITERATIONS; + + +typedef enum { + CLIENT_SERVER_SEND_RECV_STREAM = UCS_BIT(0), + CLIENT_SERVER_SEND_RECV_TAG = UCS_BIT(1), + CLIENT_SERVER_SEND_RECV_DEFAULT = CLIENT_SERVER_SEND_RECV_STREAM +} send_recv_type_t; + + +/** + * Server's application context to be used in the user's connection request + * callback. + * It holds the server's listener and the handle to an incoming connection request. + */ +typedef struct ucx_server_ctx { + volatile ucp_conn_request_h conn_request; + ucp_listener_h listener; +} ucx_server_ctx_t; + + +/** + * Stream request context. Holds a value to indicate whether or not the + * request is completed. + */ +typedef struct test_req { + int complete; +} test_req_t; + + +/** + * Print this application's usage help message. + */ +static void usage(void); + +static void tag_recv_cb(void *request, ucs_status_t status, + const ucp_tag_recv_info_t *info, void *user_data) +{ + test_req_t *ctx = user_data; + + ctx->complete = 1; +} + +/** + * The callback on the receiving side, which is invoked upon receiving the + * stream message. + */ +static void +stream_recv_cb(void *request, ucs_status_t status, size_t length, + void *user_data) +{ + test_req_t *ctx = user_data; + + ctx->complete = 1; +} + +/** + * The callback on the sending side, which is invoked after finishing sending + * the message. 
+ */ +static void send_cb(void *request, ucs_status_t status, void *user_data) +{ + test_req_t *ctx = user_data; + + ctx->complete = 1; +} + +/** + * Error handling callback. + */ +static void err_cb(void *arg, ucp_ep_h ep, ucs_status_t status) +{ + printf("error handling callback was invoked with status %d (%s)\n", + status, ucs_status_string(status)); +} + +/** + * Set an address for the server to listen on - INADDR_ANY on a well known port. + */ +void set_listen_addr(const char *address_str, struct sockaddr_in *listen_addr) +{ + /* The server will listen on INADDR_ANY */ + memset(listen_addr, 0, sizeof(struct sockaddr_in)); + listen_addr->sin_family = AF_INET; + listen_addr->sin_addr.s_addr = (address_str) ? inet_addr(address_str) : INADDR_ANY; + listen_addr->sin_port = htons(server_port); +} + +/** + * Set an address to connect to. A given IP address on a well known port. + */ +void set_connect_addr(const char *address_str, struct sockaddr_in *connect_addr) +{ + memset(connect_addr, 0, sizeof(struct sockaddr_in)); + connect_addr->sin_family = AF_INET; + connect_addr->sin_addr.s_addr = inet_addr(address_str); + connect_addr->sin_port = htons(server_port); +} + +/** + * Initialize the client side. Create an endpoint from the client side to be + * connected to the remote server (to the given IP). + */ +static ucs_status_t start_client(ucp_worker_h ucp_worker, const char *ip, + ucp_ep_h *client_ep) +{ + ucp_ep_params_t ep_params; + struct sockaddr_in connect_addr; + ucs_status_t status; + + set_connect_addr(ip, &connect_addr); + + /* + * Endpoint field mask bits: + * UCP_EP_PARAM_FIELD_FLAGS - Use the value of the 'flags' field. + * UCP_EP_PARAM_FIELD_SOCK_ADDR - Use a remote sockaddr to connect + * to the remote peer. + * UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE - Error handling mode - this flag + * is temporarily required since the + * endpoint will be closed with + * UCP_EP_CLOSE_MODE_FORCE which + * requires this mode. 
+ * Once UCP_EP_CLOSE_MODE_FORCE is + * removed, the error handling mode + * will be removed. + */ + ep_params.field_mask = UCP_EP_PARAM_FIELD_FLAGS | + UCP_EP_PARAM_FIELD_SOCK_ADDR | + UCP_EP_PARAM_FIELD_ERR_HANDLER | + UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; + ep_params.err_mode = UCP_ERR_HANDLING_MODE_PEER; + ep_params.err_handler.cb = err_cb; + ep_params.err_handler.arg = NULL; + ep_params.flags = UCP_EP_PARAMS_FLAGS_CLIENT_SERVER; + ep_params.sockaddr.addr = (struct sockaddr*)&connect_addr; + ep_params.sockaddr.addrlen = sizeof(connect_addr); + + status = ucp_ep_create(ucp_worker, &ep_params, client_ep); + if (status != UCS_OK) { + fprintf(stderr, "failed to connect to %s (%s)\n", ip, ucs_status_string(status)); + } + + return status; +} + +/** + * Print the received message on the server side or the sent data on the client + * side. + */ +static void print_result(int is_server, char *recv_message, int current_iter) +{ + if (is_server) { + printf("Server: iteration #%d\n", (current_iter + 1)); + printf("UCX data message was received\n"); + printf("\n\n----- UCP TEST SUCCESS -------\n\n"); + printf("%s", recv_message); + printf("\n\n------------------------------\n\n"); + } else { + printf("Client: iteration #%d\n", (current_iter + 1)); + printf("\n\n-----------------------------------------\n\n"); + printf("Client sent message: \n%s.\nlength: %ld\n", + test_message, TEST_STRING_LEN); + printf("\n-----------------------------------------\n\n"); + } +} + +/** + * Progress the request until it completes. 
+ */ +static ucs_status_t request_wait(ucp_worker_h ucp_worker, void *request, + test_req_t *ctx) +{ + ucs_status_t status; + + /* if operation was completed immediately */ + if (request == NULL) { + return UCS_OK; + } + + if (UCS_PTR_IS_ERR(request)) { + return UCS_PTR_STATUS(request); + } + + while (ctx->complete == 0) { + ucp_worker_progress(ucp_worker); + } + status = ucp_request_check_status(request); + + ucp_request_free(request); + + return status; +} + +static int request_finalize(ucp_worker_h ucp_worker, test_req_t *request, + test_req_t *ctx, int is_server, + char *recv_message, int current_iter) +{ + ucs_status_t status; + int ret = 0; + + status = request_wait(ucp_worker, request, ctx); + if (status != UCS_OK) { + fprintf(stderr, "unable to %s UCX message (%s)\n", + is_server ? "receive": "send", ucs_status_string(status)); + return -1; + } + + /* Print the output of the first, last and every PRINT_INTERVAL iteration */ + if ((current_iter == 0) || (current_iter == (num_iterations - 1)) || + !((current_iter + 1) % (PRINT_INTERVAL))) { + print_result(is_server, recv_message, current_iter); + } + + return ret; +} + +/** + * Send and receive a message using the Stream API. + * The client sends a message to the server and waits until the send it completed. + * The server receives a message from the client and waits for its completion. 
+ */ +static int send_recv_stream(ucp_worker_h ucp_worker, ucp_ep_h ep, int is_server, + int current_iter) +{ + char recv_message[TEST_STRING_LEN]= ""; + ucp_request_param_t param; + test_req_t *request; + size_t length; + test_req_t ctx; + + ctx.complete = 0; + param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_USER_DATA; + param.user_data = &ctx; + if (!is_server) { + /* Client sends a message to the server using the stream API */ + param.cb.send = send_cb; + request = ucp_stream_send_nbx(ep, test_message, TEST_STRING_LEN, + ¶m); + } else { + /* Server receives a message from the client using the stream API */ + param.op_attr_mask |= UCP_OP_ATTR_FIELD_FLAGS; + param.flags = UCP_STREAM_RECV_FLAG_WAITALL; + param.cb.recv_stream = stream_recv_cb; + request = ucp_stream_recv_nbx(ep, &recv_message, + TEST_STRING_LEN, + &length, ¶m); + } + + return request_finalize(ucp_worker, request, &ctx, is_server, + recv_message, current_iter); +} + +/** + * Send and receive a message using the Tag-Matching API. + * The client sends a message to the server and waits until the send it completed. + * The server receives a message from the client and waits for its completion. 
+ */ +static int send_recv_tag(ucp_worker_h ucp_worker, ucp_ep_h ep, int is_server, + int current_iter) +{ + char recv_message[TEST_STRING_LEN]= ""; + ucp_request_param_t param; + void *request; + test_req_t ctx; + + ctx.complete = 0; + param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_USER_DATA; + param.user_data = &ctx; + if (!is_server) { + /* Client sends a message to the server using the Tag-Matching API */ + param.cb.send = send_cb; + request = ucp_tag_send_nbx(ep, test_message, TEST_STRING_LEN, + TAG, ¶m); + } else { + /* Server receives a message from the client using the Tag-Matching API */ + param.cb.recv = tag_recv_cb; + request = ucp_tag_recv_nbx(ucp_worker, &recv_message, + TEST_STRING_LEN, TAG, 0, ¶m); + } + + return request_finalize(ucp_worker, request, &ctx, is_server, recv_message, + current_iter); +} + +/** + * Close the given endpoint. + * Currently closing the endpoint with UCP_EP_CLOSE_MODE_FORCE since we currently + * cannot rely on the client side to be present during the server's endpoint + * closing process. + */ +static void ep_close(ucp_worker_h ucp_worker, ucp_ep_h ep) +{ + ucp_request_param_t param; + ucs_status_t status; + void *close_req; + + param.op_attr_mask = UCP_OP_ATTR_FIELD_FLAGS; + param.flags = UCP_EP_CLOSE_FLAG_FORCE; + close_req = ucp_ep_close_nbx(ep, ¶m); + if (UCS_PTR_IS_PTR(close_req)) { + do { + ucp_worker_progress(ucp_worker); + status = ucp_request_check_status(close_req); + } while (status == UCS_INPROGRESS); + + ucp_request_free(close_req); + } else if (UCS_PTR_STATUS(close_req) != UCS_OK) { + fprintf(stderr, "failed to close ep %p\n", (void*)ep); + } +} + +/** + * Print this application's usage help message. 
+ */ +static void usage() +{ + fprintf(stderr, "Usage: ucp_client_server [parameters]\n"); + fprintf(stderr, "UCP client-server example utility\n"); + fprintf(stderr, "\nParameters are:\n"); + fprintf(stderr, " -a Set IP address of the server " + "(required for client and should not be specified " + "for the server)\n"); + fprintf(stderr, " -l Set IP address where server listens " + "(If not specified, server uses INADDR_ANY; " + "Irrelevant at client)\n"); + fprintf(stderr, " -p Port number to listen/connect to (default = %d). " + "0 on the server side means select a random port and print it\n", + DEFAULT_PORT); + fprintf(stderr, " -c Communication type for the client and server. " + " Valid values are:\n" + " 'stream' : Stream API\n" + " 'tag' : Tag API\n" + " If not specified, %s API will be used.\n", COMM_TYPE_DEFAULT); + fprintf(stderr, " -i Number of iterations to run. Client and server must " + "have the same value. (default = %d).\n", + num_iterations); + fprintf(stderr, "\n"); +} + +/** + * Parse the command line arguments. + */ +static int parse_cmd(int argc, char *const argv[], char **server_addr, + char **listen_addr, send_recv_type_t *send_recv_type) +{ + int c = 0; + int port; + + opterr = 0; + + while ((c = getopt(argc, argv, "a:l:p:c:i:")) != -1) { + switch (c) { + case 'a': + *server_addr = optarg; + break; + case 'c': + if (!strcasecmp(optarg, "stream")) { + *send_recv_type = CLIENT_SERVER_SEND_RECV_STREAM; + } else if (!strcasecmp(optarg, "tag")) { + *send_recv_type = CLIENT_SERVER_SEND_RECV_TAG; + } else { + fprintf(stderr, "Wrong communication type %s. 
" + "Using %s as default\n", optarg, COMM_TYPE_DEFAULT); + *send_recv_type = CLIENT_SERVER_SEND_RECV_DEFAULT; + } + break; + case 'l': + *listen_addr = optarg; + break; + case 'p': + port = atoi(optarg); + if ((port < 0) || (port > UINT16_MAX)) { + fprintf(stderr, "Wrong server port number %d\n", port); + return -1; + } + server_port = port; + break; + case 'i': + num_iterations = atoi(optarg); + break; + default: + usage(); + return -1; + } + } + + return 0; +} + +static char* sockaddr_get_ip_str(const struct sockaddr_storage *sock_addr, + char *ip_str, size_t max_size) +{ + struct sockaddr_in addr_in; + struct sockaddr_in6 addr_in6; + + switch (sock_addr->ss_family) { + case AF_INET: + memcpy(&addr_in, sock_addr, sizeof(struct sockaddr_in)); + inet_ntop(AF_INET, &addr_in.sin_addr, ip_str, max_size); + return ip_str; + case AF_INET6: + memcpy(&addr_in6, sock_addr, sizeof(struct sockaddr_in6)); + inet_ntop(AF_INET6, &addr_in6.sin6_addr, ip_str, max_size); + return ip_str; + default: + return "Invalid address family"; + } +} + +static char* sockaddr_get_port_str(const struct sockaddr_storage *sock_addr, + char *port_str, size_t max_size) +{ + struct sockaddr_in addr_in; + struct sockaddr_in6 addr_in6; + + switch (sock_addr->ss_family) { + case AF_INET: + memcpy(&addr_in, sock_addr, sizeof(struct sockaddr_in)); + snprintf(port_str, max_size, "%d", ntohs(addr_in.sin_port)); + return port_str; + case AF_INET6: + memcpy(&addr_in6, sock_addr, sizeof(struct sockaddr_in6)); + snprintf(port_str, max_size, "%d", ntohs(addr_in6.sin6_port)); + return port_str; + default: + return "Invalid address family"; + } +} + +static int client_server_communication(ucp_worker_h worker, ucp_ep_h ep, + send_recv_type_t send_recv_type, + int is_server, int current_iter) +{ + int ret; + + switch (send_recv_type) { + case CLIENT_SERVER_SEND_RECV_STREAM: + /* Client-Server communication via Stream API */ + ret = send_recv_stream(worker, ep, is_server, current_iter); + break; + case 
CLIENT_SERVER_SEND_RECV_TAG: + /* Client-Server communication via Tag-Matching API */ + ret = send_recv_tag(worker, ep, is_server, current_iter); + break; + default: + fprintf(stderr, "unknown send-recv type %d\n", send_recv_type); + return -1; + } + + return ret; +} + +/** + * Create a ucp worker on the given ucp context. + */ +static int init_worker(ucp_context_h ucp_context, ucp_worker_h *ucp_worker) +{ + ucp_worker_params_t worker_params; + ucs_status_t status; + int ret = 0; + + memset(&worker_params, 0, sizeof(worker_params)); + + worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; + worker_params.thread_mode = UCS_THREAD_MODE_SINGLE; + + status = ucp_worker_create(ucp_context, &worker_params, ucp_worker); + if (status != UCS_OK) { + fprintf(stderr, "failed to ucp_worker_create (%s)\n", ucs_status_string(status)); + ret = -1; + } + + return ret; +} + +/** + * The callback on the server side which is invoked upon receiving a connection + * request from the client. + */ +static void server_conn_handle_cb(ucp_conn_request_h conn_request, void *arg) +{ + ucx_server_ctx_t *context = arg; + ucp_conn_request_attr_t attr; + char ip_str[IP_STRING_LEN]; + char port_str[PORT_STRING_LEN]; + ucs_status_t status; + + attr.field_mask = UCP_CONN_REQUEST_ATTR_FIELD_CLIENT_ADDR; + status = ucp_conn_request_query(conn_request, &attr); + if (status == UCS_OK) { + printf("Server received a connection request from client at address %s:%s\n", + sockaddr_get_ip_str(&attr.client_address, ip_str, sizeof(ip_str)), + sockaddr_get_port_str(&attr.client_address, port_str, sizeof(port_str))); + } else if (status != UCS_ERR_UNSUPPORTED) { + fprintf(stderr, "failed to query the connection request (%s)\n", + ucs_status_string(status)); + } + + if (context->conn_request == NULL) { + context->conn_request = conn_request; + } else { + /* The server is already handling a connection request from a client, + * reject this new one */ + printf("Rejecting a connection request. 
" + "Only one client at a time is supported.\n"); + status = ucp_listener_reject(context->listener, conn_request); + if (status != UCS_OK) { + fprintf(stderr, "server failed to reject a connection request: (%s)\n", + ucs_status_string(status)); + } + } +} + +static ucs_status_t server_create_ep(ucp_worker_h data_worker, + ucp_conn_request_h conn_request, + ucp_ep_h *server_ep) +{ + ucp_ep_params_t ep_params; + ucs_status_t status; + + /* Server creates an ep to the client on the data worker. + * This is not the worker the listener was created on. + * The client side should have initiated the connection, leading + * to this ep's creation */ + ep_params.field_mask = UCP_EP_PARAM_FIELD_ERR_HANDLER | + UCP_EP_PARAM_FIELD_CONN_REQUEST; + ep_params.conn_request = conn_request; + ep_params.err_handler.cb = err_cb; + ep_params.err_handler.arg = NULL; + + status = ucp_ep_create(data_worker, &ep_params, server_ep); + if (status != UCS_OK) { + fprintf(stderr, "failed to create an endpoint on the server: (%s)\n", + ucs_status_string(status)); + } + + return status; +} + +/** + * Initialize the server side. The server starts listening on the set address. 
+ */ +static ucs_status_t start_server(ucp_worker_h ucp_worker, + ucx_server_ctx_t *context, + ucp_listener_h *listener_p, const char *ip) +{ + struct sockaddr_in listen_addr; + ucp_listener_params_t params; + ucp_listener_attr_t attr; + ucs_status_t status; + char ip_str[IP_STRING_LEN]; + char port_str[PORT_STRING_LEN]; + + set_listen_addr(ip, &listen_addr); + + params.field_mask = UCP_LISTENER_PARAM_FIELD_SOCK_ADDR | + UCP_LISTENER_PARAM_FIELD_CONN_HANDLER; + params.sockaddr.addr = (const struct sockaddr*)&listen_addr; + params.sockaddr.addrlen = sizeof(listen_addr); + params.conn_handler.cb = server_conn_handle_cb; + params.conn_handler.arg = context; + + /* Create a listener on the server side to listen on the given address.*/ + status = ucp_listener_create(ucp_worker, ¶ms, listener_p); + if (status != UCS_OK) { + fprintf(stderr, "failed to listen (%s)\n", ucs_status_string(status)); + goto out; + } + + /* Query the created listener to get the port it is listening on. */ + attr.field_mask = UCP_LISTENER_ATTR_FIELD_SOCKADDR; + status = ucp_listener_query(*listener_p, &attr); + if (status != UCS_OK) { + fprintf(stderr, "failed to query the listener (%s)\n", + ucs_status_string(status)); + ucp_listener_destroy(*listener_p); + goto out; + } + + fprintf(stderr, "server is listening on IP %s port %s\n", + sockaddr_get_ip_str(&attr.sockaddr, ip_str, IP_STRING_LEN), + sockaddr_get_port_str(&attr.sockaddr, port_str, PORT_STRING_LEN)); + + printf("Waiting for connection...\n"); + +out: + return status; +} + +static int client_server_do_work(ucp_worker_h ucp_worker, ucp_ep_h ep, + send_recv_type_t send_recv_type, int is_server) +{ + int i, ret = 0; + + for (i = 0; i < num_iterations; i++) { + ret = client_server_communication(ucp_worker, ep, send_recv_type, + is_server, i); + if (ret != 0) { + fprintf(stderr, "%s failed on iteration #%d\n", + (is_server ? 
"server": "client"), i + 1); + goto out; + } + } + +out: + return ret; +} + +static int run_server(ucp_context_h ucp_context, ucp_worker_h ucp_worker, + char *listen_addr, send_recv_type_t send_recv_type) +{ + ucx_server_ctx_t context; + ucp_worker_h ucp_data_worker; + ucp_ep_h server_ep; + ucs_status_t status; + int ret; + + /* Create a data worker (to be used for data exchange between the server + * and the client after the connection between them was established) */ + ret = init_worker(ucp_context, &ucp_data_worker); + if (ret != 0) { + goto err; + } + + /* Initialize the server's context. */ + context.conn_request = NULL; + + /* Create a listener on the worker created at first. The 'connection + * worker' - used for connection establishment between client and server. + * This listener will stay open for listening to incoming connection + * requests from the client */ + status = start_server(ucp_worker, &context, &context.listener, listen_addr); + if (status != UCS_OK) { + ret = -1; + goto err_worker; + } + + /* Server is always up listening */ + while (1) { + /* Wait for the server to receive a connection request from the client. + * If there are multiple clients for which the server's connection request + * callback is invoked, i.e. several clients are trying to connect in + * parallel, the server will handle only the first one and reject the rest */ + while (context.conn_request == NULL) { + ucp_worker_progress(ucp_worker); + } + + /* Server creates an ep to the client on the data worker. + * This is not the worker the listener was created on. 
+ * The client side should have initiated the connection, leading + * to this ep's creation */ + status = server_create_ep(ucp_data_worker, context.conn_request, + &server_ep); + if (status != UCS_OK) { + ret = -1; + goto err_listener; + } + + /* The server waits for all the iterations to complete before moving on + * to the next client */ + ret = client_server_do_work(ucp_data_worker, server_ep, send_recv_type, + 1); + if (ret != 0) { + goto err_ep; + } + + /* Close the endpoint to the client */ + ep_close(ucp_data_worker, server_ep); + + /* Reinitialize the server's context to be used for the next client */ + context.conn_request = NULL; + + printf("Waiting for connection...\n"); + } + +err_ep: + ep_close(ucp_data_worker, server_ep); +err_listener: + ucp_listener_destroy(context.listener); +err_worker: + ucp_worker_destroy(ucp_data_worker); +err: + return ret; +} + +static int run_client(ucp_worker_h ucp_worker, char *server_addr, + send_recv_type_t send_recv_type) +{ + ucp_ep_h client_ep; + ucs_status_t status; + int ret; + + status = start_client(ucp_worker, server_addr, &client_ep); + if (status != UCS_OK) { + fprintf(stderr, "failed to start client (%s)\n", ucs_status_string(status)); + ret = -1; + goto out; + } + + ret = client_server_do_work(ucp_worker, client_ep, send_recv_type, 0); + + /* Close the endpoint to the server */ + ep_close(ucp_worker, client_ep); + +out: + return ret; +} + +/** + * Initialize the UCP context and worker. 
+ */ +static int init_context(ucp_context_h *ucp_context, ucp_worker_h *ucp_worker, + send_recv_type_t send_recv_type) +{ + /* UCP objects */ + ucp_params_t ucp_params; + ucs_status_t status; + int ret = 0; + + memset(&ucp_params, 0, sizeof(ucp_params)); + + /* UCP initialization */ + ucp_params.field_mask = UCP_PARAM_FIELD_FEATURES; + + if (send_recv_type == CLIENT_SERVER_SEND_RECV_STREAM) { + ucp_params.features = UCP_FEATURE_STREAM; + } else { + ucp_params.features = UCP_FEATURE_TAG; + } + + status = ucp_init(&ucp_params, NULL, ucp_context); + if (status != UCS_OK) { + fprintf(stderr, "failed to ucp_init (%s)\n", ucs_status_string(status)); + ret = -1; + goto err; + } + + ret = init_worker(*ucp_context, ucp_worker); + if (ret != 0) { + goto err_cleanup; + } + + return ret; + +err_cleanup: + ucp_cleanup(*ucp_context); +err: + return ret; +} + + +int main(int argc, char **argv) +{ + send_recv_type_t send_recv_type = CLIENT_SERVER_SEND_RECV_DEFAULT; + char *server_addr = NULL; + char *listen_addr = NULL; + int ret; + + /* UCP objects */ + ucp_context_h ucp_context; + ucp_worker_h ucp_worker; + + ret = parse_cmd(argc, argv, &server_addr, &listen_addr, &send_recv_type); + if (ret != 0) { + goto err; + } + + /* Initialize the UCX required objects */ + ret = init_context(&ucp_context, &ucp_worker, send_recv_type); + if (ret != 0) { + goto err; + } + + /* Client-Server initialization */ + if (server_addr == NULL) { + /* Server side */ + ret = run_server(ucp_context, ucp_worker, listen_addr, send_recv_type); + } else { + /* Client side */ + ret = run_client(ucp_worker, server_addr, send_recv_type); + } + + ucp_worker_destroy(ucp_worker); + ucp_cleanup(ucp_context); +err: + return ret; +} diff --git a/test/examples/ucp_hello_world.c b/examples/ucp_hello_world.c similarity index 76% rename from test/examples/ucp_hello_world.c rename to examples/ucp_hello_world.c index eae33c71d64..f5b3f374e03 100644 --- a/test/examples/ucp_hello_world.c +++ b/examples/ucp_hello_world.c @@ 
-33,7 +33,7 @@ * Sergey Shalnov 7-June-2016 */ -#include "ucx_hello_world.h" +#include "hello_world_util.h" #include @@ -76,14 +76,19 @@ static ucs_status_t client_status = UCS_OK; static uint16_t server_port = 13337; static long test_string_length = 16; static const ucp_tag_t tag = 0x1337a880u; -static const ucp_tag_t tag_mask = -1; +static const ucp_tag_t tag_mask = UINT64_MAX; static ucp_address_t *local_addr; static ucp_address_t *peer_addr; static size_t local_addr_len; static size_t peer_addr_len; -static int parse_cmd(int argc, char * const argv[], char **server_name); +static ucs_status_t parse_cmd(int argc, char * const argv[], char **server_name); + +static void set_msg_data_len(struct msg *msg, uint64_t data_len) +{ + mem_type_memcpy(&msg->data_len, &data_len, sizeof(data_len)); +} static void request_init(void *request) { @@ -91,9 +96,9 @@ static void request_init(void *request) ctx->completed = 0; } -static void send_handler(void *request, ucs_status_t status) +static void send_handler(void *request, ucs_status_t status, void *ctx) { - struct ucx_context *context = (struct ucx_context *) request; + struct ucx_context *context = (struct ucx_context *) ctx; context->completed = 1; @@ -123,7 +128,7 @@ static void recv_handler(void *request, ucs_status_t status, info->length); } -static void wait(ucp_worker_h ucp_worker, struct ucx_context *context) +static void ucx_wait(ucp_worker_h ucp_worker, struct ucx_context *context) { while (context->completed == 0) { ucp_worker_progress(ucp_worker); @@ -132,11 +137,13 @@ static void wait(ucp_worker_h ucp_worker, struct ucx_context *context) static ucs_status_t test_poll_wait(ucp_worker_h ucp_worker) { - int ret = -1, err = 0; + int err = 0; + ucs_status_t ret = UCS_ERR_NO_MESSAGE; + int epoll_fd_local = 0; + int epoll_fd = 0; ucs_status_t status; - int epoll_fd_local = 0, epoll_fd = 0; struct epoll_event ev; - ev.data.u64 = 0; + ev.data.u64 = 0; status = ucp_worker_get_efd(ucp_worker, &epoll_fd); 
CHKERR_JUMP(UCS_OK != status, "ucp_worker_get_efd", err); @@ -158,8 +165,8 @@ static ucs_status_t test_poll_wait(ucp_worker_h ucp_worker) CHKERR_JUMP(status != UCS_OK, "ucp_worker_arm\n", err_fd); do { - ret = epoll_wait(epoll_fd_local, &ev, 1, -1); - } while ((ret == -1) && (errno == EINTR)); + err = epoll_wait(epoll_fd_local, &ev, 1, -1); + } while ((err == -1) && (errno == EINTR)); ret = UCS_OK; @@ -172,15 +179,18 @@ static ucs_status_t test_poll_wait(ucp_worker_h ucp_worker) static int run_ucx_client(ucp_worker_h ucp_worker) { + ucp_request_param_t send_param; ucp_tag_recv_info_t info_tag; ucp_tag_message_h msg_tag; ucs_status_t status; ucp_ep_h server_ep; ucp_ep_params_t ep_params; struct msg *msg = 0; - struct ucx_context *request = 0; + struct ucx_context *request; + struct ucx_context ctx; size_t msg_len = 0; int ret = -1; + char *str; /* Send client UCX address to server */ ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS | @@ -192,26 +202,30 @@ static int run_ucx_client(ucp_worker_h ucp_worker) CHKERR_JUMP(status != UCS_OK, "ucp_ep_create\n", err); msg_len = sizeof(*msg) + local_addr_len; - msg = calloc(1, msg_len); - CHKERR_JUMP(!msg, "allocate memory\n", err_ep); + msg = malloc(msg_len); + CHKERR_JUMP(msg == NULL, "allocate memory\n", err_ep); + memset(msg, 0, msg_len); msg->data_len = local_addr_len; memcpy(msg + 1, local_addr, local_addr_len); - request = ucp_tag_send_nb(server_ep, msg, msg_len, - ucp_dt_make_contig(1), tag, - send_handler); + send_param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_USER_DATA; + send_param.cb.send = send_handler; + send_param.user_data = &ctx; + ctx.completed = 0; + request = ucp_tag_send_nbx(server_ep, msg, msg_len, tag, + &send_param); if (UCS_PTR_IS_ERR(request)) { fprintf(stderr, "unable to send UCX address message\n"); free(msg); goto err_ep; - } else if (UCS_PTR_STATUS(request) != UCS_OK) { - wait(ucp_worker, request); - request->completed = 0; /* Reset request state before recycling it 
*/ + } else if (UCS_PTR_IS_PTR(request)) { + ucx_wait(ucp_worker, &ctx); ucp_request_release(request); } - free (msg); + free(msg); if (err_handling_opt.failure) { fprintf(stderr, "Emulating unexpected failure on client side\n"); @@ -245,8 +259,8 @@ static int run_ucx_client(ucp_worker_h ucp_worker) } } - msg = malloc(info_tag.length); - CHKERR_JUMP(!msg, "allocate memory\n", err_ep); + msg = mem_type_malloc(info_tag.length); + CHKERR_JUMP(msg == NULL, "allocate memory\n", err_ep); request = ucp_tag_msg_recv_nb(ucp_worker, msg, info_tag.length, ucp_dt_make_contig(1), msg_tag, @@ -258,17 +272,27 @@ static int run_ucx_client(ucp_worker_h ucp_worker) free(msg); goto err_ep; } else { - wait(ucp_worker, request); + /* ucp_tag_msg_recv_nb() cannot return NULL */ + assert(UCS_PTR_IS_PTR(request)); + ucx_wait(ucp_worker, request); request->completed = 0; ucp_request_release(request); printf("UCX data message was received\n"); } - printf("\n\n----- UCP TEST SUCCESS ----\n\n"); - printf("%s", (char *)(msg + 1)); - printf("\n\n---------------------------\n\n"); + str = calloc(1, test_string_length); + if (str != NULL) { + mem_type_memcpy(str, msg + 1, test_string_length); + printf("\n\n----- UCP TEST SUCCESS ----\n\n"); + printf("%s", str); + printf("\n\n---------------------------\n\n"); + free(str); + } else { + fprintf(stderr, "Memory allocation failed\n"); + goto err_ep; + } - free(msg); + mem_type_free(msg); ret = 0; @@ -279,15 +303,18 @@ static int run_ucx_client(ucp_worker_h ucp_worker) return ret; } -static void flush_callback(void *request, ucs_status_t status) +static void flush_callback(void *request, ucs_status_t status, void *user_data) { } static ucs_status_t flush_ep(ucp_worker_h worker, ucp_ep_h ep) { + ucp_request_param_t param; void *request; - request = ucp_ep_flush_nb(ep, 0, flush_callback); + param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK; + param.cb.send = flush_callback; + request = ucp_ep_flush_nbx(ep, ¶m); if (request == NULL) { return UCS_OK; } else 
if (UCS_PTR_IS_ERR(request)) { @@ -305,6 +332,7 @@ static ucs_status_t flush_ep(ucp_worker_h worker, ucp_ep_h ep) static int run_ucx_server(ucp_worker_h ucp_worker) { + ucp_request_param_t send_param; ucp_tag_recv_info_t info_tag; ucp_tag_message_h msg_tag; ucs_status_t status; @@ -312,8 +340,9 @@ static int run_ucx_server(ucp_worker_h ucp_worker) ucp_ep_params_t ep_params; struct msg *msg = 0; struct ucx_context *request = 0; + struct ucx_context ctx; size_t msg_len = 0; - int ret = -1; + int ret; /* Receive client UCX address */ do { @@ -325,7 +354,7 @@ static int run_ucx_server(ucp_worker_h ucp_worker) } while (msg_tag == NULL); msg = malloc(info_tag.length); - CHKERR_JUMP(!msg, "allocate memory\n", err); + CHKERR_ACTION(msg == NULL, "allocate memory\n", ret = -1; goto err); request = ucp_tag_msg_recv_nb(ucp_worker, msg, info_tag.length, ucp_dt_make_contig(1), msg_tag, recv_handler); @@ -333,22 +362,26 @@ static int run_ucx_server(ucp_worker_h ucp_worker) fprintf(stderr, "unable to receive UCX address message (%s)\n", ucs_status_string(UCS_PTR_STATUS(request))); free(msg); + ret = -1; goto err; } else { - wait(ucp_worker, request); + /* ucp_tag_msg_recv_nb() cannot return NULL */ + assert(UCS_PTR_IS_PTR(request)); + ucx_wait(ucp_worker, request); request->completed = 0; ucp_request_release(request); printf("UCX address message was received\n"); } - peer_addr = malloc(msg->data_len); - if (!peer_addr) { + peer_addr_len = msg->data_len; + peer_addr = malloc(peer_addr_len); + if (peer_addr == NULL) { fprintf(stderr, "unable to allocate memory for peer address\n"); free(msg); + ret = -1; goto err; } - peer_addr_len = msg->data_len; memcpy(peer_addr, msg + 1, peer_addr_len); free(msg); @@ -365,26 +398,31 @@ static int run_ucx_server(ucp_worker_h ucp_worker) ep_params.user_data = &client_status; status = ucp_ep_create(ucp_worker, &ep_params, &client_ep); - CHKERR_JUMP(status != UCS_OK, "ucp_ep_create\n", err); + CHKERR_ACTION(status != UCS_OK, "ucp_ep_create\n", ret = 
-1; goto err); msg_len = sizeof(*msg) + test_string_length; - msg = calloc(1, msg_len); - CHKERR_JUMP(!msg, "allocate memory\n", err_ep); - - msg->data_len = msg_len - sizeof(*msg); - generate_test_string((char *)(msg + 1), test_string_length); - - request = ucp_tag_send_nb(client_ep, msg, msg_len, - ucp_dt_make_contig(1), tag, - send_handler); + msg = mem_type_malloc(msg_len); + CHKERR_ACTION(msg == NULL, "allocate memory\n", ret = -1; goto err_ep); + mem_type_memset(msg, 0, msg_len); + + set_msg_data_len(msg, msg_len - sizeof(*msg)); + ret = generate_test_string((char *)(msg + 1), test_string_length); + CHKERR_JUMP(ret < 0, "generate test string", err_free_mem_type_msg); + + send_param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_USER_DATA; + send_param.cb.send = send_handler; + send_param.user_data = &ctx; + ctx.completed = 0; + request = ucp_tag_send_nbx(client_ep, msg, msg_len, tag, + &send_param); if (UCS_PTR_IS_ERR(request)) { fprintf(stderr, "unable to send UCX data message\n"); - free(msg); - goto err_ep; - } else if (UCS_PTR_STATUS(request) != UCS_OK) { + ret = -1; + goto err_free_mem_type_msg; + } else if (UCS_PTR_IS_PTR(request)) { printf("UCX data message was scheduled for send\n"); - wait(ucp_worker, request); - request->completed = 0; + ucx_wait(ucp_worker, &ctx); ucp_request_release(request); } @@ -393,11 +431,11 @@ static int run_ucx_server(ucp_worker_h ucp_worker) status, ucs_status_string(status)); ret = 0; - free(msg); +err_free_mem_type_msg: + mem_type_free(msg); err_ep: ucp_ep_destroy(client_ep); - err: return ret; } @@ -476,27 +514,29 @@ int main(int argc, char **argv) oob_sock = client_connect(client_target_name, server_port); CHKERR_JUMP(oob_sock < 0, "client_connect\n", err_addr); - ret = recv(oob_sock, &addr_len, sizeof(addr_len), 0); - CHKERR_JUMP(ret < 0, "receive address length\n", err_addr); + ret = recv(oob_sock, &addr_len, sizeof(addr_len), MSG_WAITALL); + CHKERR_JUMP_RETVAL(ret != (int)sizeof(addr_len), + 
"receive address length\n", err_addr, ret); peer_addr_len = addr_len; peer_addr = malloc(peer_addr_len); CHKERR_JUMP(!peer_addr, "allocate memory\n", err_addr); - ret = recv(oob_sock, peer_addr, peer_addr_len, 0); - CHKERR_JUMP(ret < 0, "receive address\n", err_peer_addr); + ret = recv(oob_sock, peer_addr, peer_addr_len, MSG_WAITALL); + CHKERR_JUMP_RETVAL(ret != (int)peer_addr_len, + "receive address\n", err_peer_addr, ret); } else { oob_sock = server_connect(server_port); CHKERR_JUMP(oob_sock < 0, "server_connect\n", err_peer_addr); addr_len = local_addr_len; ret = send(oob_sock, &addr_len, sizeof(addr_len), 0); - CHKERR_JUMP((ret < 0 || ret != sizeof(addr_len)), - "send address length\n", err_peer_addr); + CHKERR_JUMP_RETVAL(ret != (int)sizeof(addr_len), + "send address length\n", err_peer_addr, ret); ret = send(oob_sock, local_addr, local_addr_len, 0); - CHKERR_JUMP((ret < 0 || ret != local_addr_len), - "send address\n", err_peer_addr); + CHKERR_JUMP_RETVAL(ret != (int)local_addr_len, "send address\n", + err_peer_addr, ret); } ret = run_test(client_target_name, ucp_worker); @@ -523,15 +563,15 @@ int main(int argc, char **argv) return ret; } -int parse_cmd(int argc, char * const argv[], char **server_name) +ucs_status_t parse_cmd(int argc, char * const argv[], char **server_name) { - int c = 0, index = 0; + int c = 0, idx = 0; opterr = 0; err_handling_opt.ucp_err_mode = UCP_ERR_HANDLING_MODE_NONE; err_handling_opt.failure = 0; - while ((c = getopt(argc, argv, "wfben:p:s:h")) != -1) { + while ((c = getopt(argc, argv, "wfben:p:s:m:h")) != -1) { switch (c) { case 'w': ucp_test_mode = TEST_MODE_WAIT; @@ -563,6 +603,12 @@ int parse_cmd(int argc, char * const argv[], char **server_name) return UCS_ERR_UNSUPPORTED; } break; + case 'm': + test_mem_type = parse_mem_type(optarg); + if (test_mem_type == UCS_MEMORY_TYPE_LAST) { + return UCS_ERR_UNSUPPORTED; + } + break; case '?': if (optopt == 's') { fprintf(stderr, "Option -%c requires an argument.\n", optopt); @@ -586,11 
+632,7 @@ int parse_cmd(int argc, char * const argv[], char **server_name) fprintf(stderr, " -e Emulate unexpected failure on server side" "and handle an error on client side with enabled " "UCP_ERR_HANDLING_MODE_PEER\n"); - fprintf(stderr, " -n name Set node name or IP address " - "of the server (required for client and should be ignored " - "for server)\n"); - fprintf(stderr, " -p port Set alternative server port (default:13337)\n"); - fprintf(stderr, " -s size Set test string length (default:16)\n"); + print_common_help(); fprintf(stderr, "\n"); return UCS_ERR_UNSUPPORTED; } @@ -598,8 +640,8 @@ int parse_cmd(int argc, char * const argv[], char **server_name) fprintf(stderr, "INFO: UCP_HELLO_WORLD mode = %d server = %s port = %d\n", ucp_test_mode, *server_name, server_port); - for (index = optind; index < argc; index++) { - fprintf(stderr, "WARNING: Non-option argument %s\n", argv[index]); + for (idx = optind; idx < argc; idx++) { + fprintf(stderr, "WARNING: Non-option argument %s\n", argv[idx]); } return UCS_OK; } diff --git a/test/examples/uct_hello_world.c b/examples/uct_hello_world.c similarity index 64% rename from test/examples/uct_hello_world.c rename to examples/uct_hello_world.c index 376ed2a30c9..9488a6bf044 100644 --- a/test/examples/uct_hello_world.c +++ b/examples/uct_hello_world.c @@ -4,7 +4,7 @@ * See file LICENSE for terms. 
*/ -#include "ucx_hello_world.h" +#include "hello_world_util.h" #include #include @@ -32,10 +32,11 @@ typedef struct { } cmd_args_t; typedef struct { - uct_iface_attr_t attr; /* Interface attributes: capabilities and limitations */ - uct_iface_h iface; /* Communication interface context */ - uct_md_h md; /* Memory domain */ - uct_worker_h worker; /* Workers represent allocated resources in a communication thread */ + uct_iface_attr_t iface_attr; /* Interface attributes: capabilities and limitations */ + uct_iface_h iface; /* Communication interface context */ + uct_md_attr_t md_attr; /* Memory domain attributes: capabilities and limitations */ + uct_md_h md; /* Memory domain */ + uct_worker_h worker; /* Workers represent allocated resources in a communication thread */ } iface_info_t; /* Helper data type for am_short */ @@ -60,6 +61,8 @@ typedef struct { static void* desc_holder = NULL; +int print_err_usage(void); + static char *func_am_t_str(func_am_t func_am_type) { switch (func_am_type) { @@ -122,7 +125,7 @@ ucs_status_t do_am_short(iface_info_t *if_info, uct_ep_h ep, uint8_t id, size_t am_bcopy_data_pack_cb(void *dest, void *arg) { am_bcopy_args_t *bc_args = arg; - memcpy(dest, bc_args->data, bc_args->len); + mem_type_memcpy(dest, bc_args->data, bc_args->len); return bc_args->len; } @@ -141,7 +144,7 @@ ucs_status_t do_am_bcopy(iface_info_t *if_info, uct_ep_h ep, uint8_t id, uct_worker_progress(if_info->worker); } while (len == UCS_ERR_NO_RESOURCE); /* Negative len is an error code */ - return (len >= 0) ? UCS_OK : len; + return (len >= 0) ? 
UCS_OK : (ucs_status_t)len; } /* Completion callback for am_zcopy */ @@ -149,19 +152,27 @@ void zcopy_completion_cb(uct_completion_t *self, ucs_status_t status) { zcopy_comp_t *comp = (zcopy_comp_t *)self; assert((comp->uct_comp.count == 0) && (status == UCS_OK)); - uct_md_mem_dereg(comp->md, comp->memh); + if (comp->memh != UCT_MEM_HANDLE_NULL) { + uct_md_mem_dereg(comp->md, comp->memh); + } desc_holder = (void *)0xDEADBEEF; } ucs_status_t do_am_zcopy(iface_info_t *if_info, uct_ep_h ep, uint8_t id, const cmd_args_t *cmd_args, char *buf) { + ucs_status_t status = UCS_OK; uct_mem_h memh; uct_iov_t iov; zcopy_comp_t comp; - ucs_status_t status = uct_md_mem_reg(if_info->md, buf, cmd_args->test_strlen, - UCT_MD_MEM_ACCESS_RMA, &memh); + if (if_info->md_attr.cap.flags & UCT_MD_FLAG_NEED_MEMH) { + status = uct_md_mem_reg(if_info->md, buf, cmd_args->test_strlen, + UCT_MD_MEM_ACCESS_RMA, &memh); + } else { + memh = UCT_MEM_HANDLE_NULL; + } + iov.buffer = buf; iov.length = cmd_args->test_strlen; iov.memh = memh; @@ -191,7 +202,7 @@ ucs_status_t do_am_zcopy(iface_info_t *if_info, uct_ep_h ep, uint8_t id, return status; } static void print_strings(const char *label, const char *local_str, - const char *remote_str) + const char *remote_str, size_t length) { fprintf(stdout, "\n\n----- UCT TEST SUCCESS ----\n\n"); fprintf(stdout, "[%s] %s sent %s", label, local_str, remote_str); @@ -200,11 +211,13 @@ static void print_strings(const char *label, const char *local_str, } /* Callback to handle receive active message */ -static ucs_status_t hello_world(void *arg, void *data, size_t length, unsigned flags) +static ucs_status_t hello_world(void *arg, void *data, size_t length, + unsigned flags) { - recv_desc_t *rdesc; func_am_t func_am_type = *(func_am_t *)arg; - print_strings("callback", func_am_t_str(func_am_type), data); + recv_desc_t *rdesc; + + print_strings("callback", func_am_t_str(func_am_type), data, length); if (flags & UCT_CB_PARAM_FLAG_DESC) { rdesc = (recv_desc_t *)data - 
1; @@ -217,6 +230,7 @@ static ucs_status_t hello_world(void *arg, void *data, size_t length, unsigned f /* We need to copy-out data and return UCS_OK if want to use the data * outside the callback */ rdesc = malloc(sizeof(*rdesc) + length); + CHKERR_ACTION(rdesc == NULL, "allocate memory\n", return UCS_ERR_NO_MEMORY); rdesc->is_uct_desc = 0; memcpy(rdesc + 1, data, length); desc_holder = rdesc; @@ -260,22 +274,26 @@ static ucs_status_t init_iface(char *dev_name, char *tl_name, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); /* Get interface attributes */ - status = uct_iface_query(iface_p->iface, &iface_p->attr); + status = uct_iface_query(iface_p->iface, &iface_p->iface_attr); CHKERR_JUMP(UCS_OK != status, "query iface", error_iface); /* Check if current device and transport support required active messages */ if ((func_am_type == FUNC_AM_SHORT) && - (iface_p->attr.cap.flags & UCT_IFACE_FLAG_AM_SHORT)) { - return UCS_OK; + (iface_p->iface_attr.cap.flags & UCT_IFACE_FLAG_AM_SHORT)) { + if (test_mem_type != UCS_MEMORY_TYPE_CUDA) { + return UCS_OK; + } else { + fprintf(stderr, "AM short protocol doesn't support CUDA memory"); + } } if ((func_am_type == FUNC_AM_BCOPY) && - (iface_p->attr.cap.flags & UCT_IFACE_FLAG_AM_BCOPY)) { + (iface_p->iface_attr.cap.flags & UCT_IFACE_FLAG_AM_BCOPY)) { return UCS_OK; } if ((func_am_type == FUNC_AM_ZCOPY) && - (iface_p->attr.cap.flags & UCT_IFACE_FLAG_AM_ZCOPY)) { + (iface_p->iface_attr.cap.flags & UCT_IFACE_FLAG_AM_ZCOPY)) { return UCS_OK; } @@ -290,64 +308,109 @@ static ucs_status_t init_iface(char *dev_name, char *tl_name, static ucs_status_t dev_tl_lookup(const cmd_args_t *cmd_args, iface_info_t *iface_p) { - uct_md_resource_desc_t *md_resources; /* Memory domain resource descriptor */ - uct_tl_resource_desc_t *tl_resources; /* Communication resource descriptor */ - unsigned num_md_resources; /* Number of memory domains */ - unsigned num_tl_resources; /* Number of transport resources resource objects created */ - uct_md_config_t 
*md_config; - ucs_status_t status; - int i; - int j; - - status = uct_query_md_resources(&md_resources, &num_md_resources); - CHKERR_JUMP(UCS_OK != status, "query for memory domain resources", error_ret); - - iface_p->iface = NULL; - - /* Iterate through memory domain resources */ - for (i = 0; i < num_md_resources; ++i) { - status = uct_md_config_read(md_resources[i].md_name, NULL, NULL, &md_config); - CHKERR_JUMP(UCS_OK != status, "read PD config", release_md); - - status = uct_md_open(md_resources[i].md_name, md_config, &iface_p->md); - uct_config_release(md_config); - CHKERR_JUMP(UCS_OK != status, "open memory domains", release_md); - - status = uct_md_query_tl_resources(iface_p->md, &tl_resources, &num_tl_resources); - CHKERR_JUMP(UCS_OK != status, "query transport resources", close_md); - - /* Go through each available transport and find the proper name */ - for (j = 0; j < num_tl_resources; ++j) { - if (!strcmp(cmd_args->dev_name, tl_resources[j].dev_name) && - !strcmp(cmd_args->tl_name, tl_resources[j].tl_name)) { - status = init_iface(tl_resources[j].dev_name, - tl_resources[j].tl_name, - cmd_args->func_am_type, iface_p); - if (UCS_OK == status) { - fprintf(stdout, "Using %s with %s.\n", - tl_resources[j].dev_name, - tl_resources[j].tl_name); - fflush(stdout); - uct_release_tl_resource_list(tl_resources); - goto release_md; + uct_tl_resource_desc_t *tl_resources = NULL; /* Communication resource descriptor */ + unsigned num_tl_resources = 0; /* Number of transport resources resource objects created */ + uct_component_h *components; + unsigned num_components; + unsigned cmpt_index; + uct_component_attr_t component_attr; + unsigned md_index; + unsigned tl_index; + uct_md_config_t *md_config; + ucs_status_t status; + + status = uct_query_components(&components, &num_components); + CHKERR_JUMP(UCS_OK != status, "query for components", error_ret); + + for (cmpt_index = 0; cmpt_index < num_components; ++cmpt_index) { + + component_attr.field_mask = 
UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT; + status = uct_component_query(components[cmpt_index], &component_attr); + CHKERR_JUMP(UCS_OK != status, "query component attributes", + release_component_list); + + component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES; + component_attr.md_resources = alloca(sizeof(*component_attr.md_resources) * + component_attr.md_resource_count); + status = uct_component_query(components[cmpt_index], &component_attr); + CHKERR_JUMP(UCS_OK != status, "query for memory domain resources", + release_component_list); + + iface_p->iface = NULL; + + /* Iterate through memory domain resources */ + for (md_index = 0; md_index < component_attr.md_resource_count; ++md_index) { + status = uct_md_config_read(components[cmpt_index], NULL, NULL, + &md_config); + CHKERR_JUMP(UCS_OK != status, "read MD config", + release_component_list); + + status = uct_md_open(components[cmpt_index], + component_attr.md_resources[md_index].md_name, + md_config, &iface_p->md); + uct_config_release(md_config); + CHKERR_JUMP(UCS_OK != status, "open memory domains", + release_component_list); + + status = uct_md_query(iface_p->md, &iface_p->md_attr); + CHKERR_JUMP(UCS_OK != status, "query iface", + close_md); + + status = uct_md_query_tl_resources(iface_p->md, &tl_resources, + &num_tl_resources); + CHKERR_JUMP(UCS_OK != status, "query transport resources", close_md); + + /* Go through each available transport and find the proper name */ + for (tl_index = 0; tl_index < num_tl_resources; ++tl_index) { + if (!strcmp(cmd_args->dev_name, tl_resources[tl_index].dev_name) && + !strcmp(cmd_args->tl_name, tl_resources[tl_index].tl_name)) { + if (!(iface_p->md_attr.cap.reg_mem_types & UCS_BIT(test_mem_type))) { + fprintf(stderr, "Unsupported memory type %s by " + UCT_TL_RESOURCE_DESC_FMT" on %s MD\n", + ucs_memory_type_names[test_mem_type], + UCT_TL_RESOURCE_DESC_ARG(&tl_resources[tl_index]), + component_attr.md_resources[md_index].md_name); + status = 
UCS_ERR_UNSUPPORTED; + break; + } + + status = init_iface(tl_resources[tl_index].dev_name, + tl_resources[tl_index].tl_name, + cmd_args->func_am_type, iface_p); + if (status != UCS_OK) { + break; + } + + fprintf(stdout, "Using "UCT_TL_RESOURCE_DESC_FMT"\n", + UCT_TL_RESOURCE_DESC_ARG(&tl_resources[tl_index])); + goto release_tl_resources; } } + +release_tl_resources: + uct_release_tl_resource_list(tl_resources); + if ((status == UCS_OK) && + (tl_index < num_tl_resources)) { + goto release_component_list; + } + + tl_resources = NULL; + num_tl_resources = 0; + uct_md_close(iface_p->md); } - uct_release_tl_resource_list(tl_resources); - uct_md_close(iface_p->md); } fprintf(stderr, "No supported (dev/tl) found (%s/%s)\n", cmd_args->dev_name, cmd_args->tl_name); status = UCS_ERR_UNSUPPORTED; -release_md: - uct_release_md_resource_list(md_resources); +release_component_list: + uct_release_component_list(components); error_ret: return status; close_md: uct_md_close(iface_p->md); - goto release_md; + goto release_component_list; } int print_err_usage() @@ -362,18 +425,17 @@ int print_err_usage() fprintf(stderr, func_template, 'z', func_am_t_str(FUNC_AM_ZCOPY), ""); fprintf(stderr, " -d Select device name\n"); fprintf(stderr, " -t Select transport layer\n"); - fprintf(stderr, " -n name Set node name or IP address " - "of the server (required for client and should be ignored " - "for server)\n"); - fprintf(stderr, " -p port Set alternative server port (default:13337)\n"); - fprintf(stderr, " -s size Set test string length (default:16)\n"); - fprintf(stderr, "\n"); + print_common_help(); + fprintf(stderr, "\nExample:\n"); + fprintf(stderr, " Server: uct_hello_world -d eth0 -t tcp\n"); + fprintf(stderr, " Client: uct_hello_world -d eth0 -t tcp -n localhost\n"); + return UCS_ERR_UNSUPPORTED; } int parse_cmd(int argc, char * const argv[], cmd_args_t *args) { - int c = 0, index = 0; + int c = 0, idx = 0; assert(args); memset(args, 0, sizeof(*args)); @@ -384,7 +446,7 @@ int 
parse_cmd(int argc, char * const argv[], cmd_args_t *args) args->test_strlen = 16; opterr = 0; - while ((c = getopt(argc, argv, "ibzd:t:n:p:s:h")) != -1) { + while ((c = getopt(argc, argv, "ibzd:t:n:p:s:m:h")) != -1) { switch (c) { case 'i': args->func_am_type = FUNC_AM_SHORT; @@ -419,6 +481,12 @@ int parse_cmd(int argc, char * const argv[], cmd_args_t *args) return UCS_ERR_UNSUPPORTED; } break; + case 'm': + test_mem_type = parse_mem_type(optarg); + if (test_mem_type == UCS_MEMORY_TYPE_LAST) { + return UCS_ERR_UNSUPPORTED; + } + break; case '?': if (optopt == 's') { fprintf(stderr, "Option -%c requires an argument.\n", optopt); @@ -436,8 +504,8 @@ int parse_cmd(int argc, char * const argv[], cmd_args_t *args) func_am_t_str(args->func_am_type), args->server_name, args->server_port); - for (index = optind; index < argc; index++) { - fprintf(stderr, "WARNING: Non-option argument %s\n", argv[index]); + for (idx = optind; idx < argc; idx++) { + fprintf(stderr, "WARNING: Non-option argument %s\n", argv[idx]); } if (args->dev_name == NULL) { @@ -467,14 +535,16 @@ int sendrecv(int sock, const void *sbuf, size_t slen, void **rbuf) } ret = send(sock, sbuf, slen, 0); - if ((ret < 0) || (ret != slen)) { - fprintf(stderr, "failed to send buffer\n"); + if (ret != (int)slen) { + fprintf(stderr, "failed to send buffer, return value %d\n", ret); return -1; } - ret = recv(sock, &rlen, sizeof(rlen), 0); + ret = recv(sock, &rlen, sizeof(rlen), MSG_WAITALL); if ((ret != sizeof(rlen)) || (rlen > (SIZE_MAX / 2))) { - fprintf(stderr, "failed to receive device address length\n"); + fprintf(stderr, + "failed to receive device address length, return value %d\n", + ret); return -1; } @@ -484,9 +554,10 @@ int sendrecv(int sock, const void *sbuf, size_t slen, void **rbuf) return -1; } - ret = recv(sock, *rbuf, rlen, 0); - if (ret < 0) { - fprintf(stderr, "failed to receive device address\n"); + ret = recv(sock, *rbuf, rlen, MSG_WAITALL); + if (ret != (int)rlen) { + fprintf(stderr, "failed to 
receive device address, return value %d\n", + ret); return -1; } @@ -510,6 +581,7 @@ int main(int argc, char **argv) cmd_args_t cmd_args; iface_info_t if_info; uct_ep_params_t ep_params; + int res; /* Parse the command line */ if (parse_cmd(argc, argv, &cmd_args)) { @@ -531,11 +603,11 @@ int main(int argc, char **argv) CHKERR_JUMP(UCS_OK != status, "find supported device and transport", out_destroy_worker); - own_dev = (uct_device_addr_t*)calloc(1, if_info.attr.device_addr_len); + own_dev = (uct_device_addr_t*)calloc(1, if_info.iface_attr.device_addr_len); CHKERR_JUMP(NULL == own_dev, "allocate memory for dev addr", out_destroy_iface); - own_iface = (uct_iface_addr_t*)calloc(1, if_info.attr.iface_addr_len); + own_iface = (uct_iface_addr_t*)calloc(1, if_info.iface_attr.iface_addr_len); CHKERR_JUMP(NULL == own_iface, "allocate memory for if addr", out_free_dev_addrs); @@ -545,38 +617,37 @@ int main(int argc, char **argv) if (cmd_args.server_name) { oob_sock = client_connect(cmd_args.server_name, cmd_args.server_port); - if (oob_sock < 0) { - goto out_free_if_addrs; - } } else { oob_sock = server_connect(cmd_args.server_port); - if (oob_sock < 0) { - goto out_free_if_addrs; - } } + CHKERR_ACTION(oob_sock < 0, "OOB connect", + status = UCS_ERR_IO_ERROR; goto out_close_oob_sock); - status = sendrecv(oob_sock, own_dev, if_info.attr.device_addr_len, - (void **)&peer_dev); - CHKERR_JUMP(0 != status, "device exchange", out_free_dev_addrs); + res = sendrecv(oob_sock, own_dev, if_info.iface_attr.device_addr_len, + (void **)&peer_dev); + CHKERR_ACTION(0 != res, "device exchange", + status = UCS_ERR_NO_MESSAGE; goto out_close_oob_sock); - status = uct_iface_is_reachable(if_info.iface, peer_dev, NULL); - CHKERR_JUMP(0 == status, "reach the peer", out_free_if_addrs); + status = (ucs_status_t)uct_iface_is_reachable(if_info.iface, peer_dev, NULL); + CHKERR_JUMP(0 == status, "reach the peer", out_close_oob_sock); /* Get interface address */ - if (if_info.attr.cap.flags & 
UCT_IFACE_FLAG_CONNECT_TO_IFACE) { + if (if_info.iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { status = uct_iface_get_address(if_info.iface, own_iface); - CHKERR_JUMP(UCS_OK != status, "get interface address", out_free_if_addrs); + CHKERR_JUMP(UCS_OK != status, "get interface address", + out_close_oob_sock); - status = sendrecv(oob_sock, own_iface, if_info.attr.iface_addr_len, - (void **)&peer_iface); - CHKERR_JUMP(0 != status, "ifaces exchange", out_free_if_addrs); + status = (ucs_status_t)sendrecv(oob_sock, own_iface, if_info.iface_attr.iface_addr_len, + (void **)&peer_iface); + CHKERR_JUMP(0 != status, "ifaces exchange", out_close_oob_sock); } ep_params.field_mask = UCT_EP_PARAM_FIELD_IFACE; ep_params.iface = if_info.iface; - if (if_info.attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) { - own_ep = (uct_ep_addr_t*)calloc(1, if_info.attr.ep_addr_len); - CHKERR_JUMP(NULL == own_ep, "allocate memory for ep addrs", out_free_if_addrs); + if (if_info.iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) { + own_ep = (uct_ep_addr_t*)calloc(1, if_info.iface_attr.ep_addr_len); + CHKERR_ACTION(NULL == own_ep, "allocate memory for ep addrs", + status = UCS_ERR_NO_MEMORY; goto out_close_oob_sock); /* Create new endpoint */ status = uct_ep_create(&ep_params, &ep); @@ -586,8 +657,8 @@ int main(int argc, char **argv) status = uct_ep_get_address(ep, own_ep); CHKERR_JUMP(UCS_OK != status, "get endpoint address", out_free_ep); - status = sendrecv(oob_sock, own_ep, if_info.attr.ep_addr_len, - (void **)&peer_ep); + status = (ucs_status_t)sendrecv(oob_sock, own_ep, if_info.iface_attr.ep_addr_len, + (void **)&peer_ep); CHKERR_JUMP(0 != status, "EPs exchange", out_free_ep); /* Connect endpoint to a remote endpoint */ @@ -596,23 +667,24 @@ int main(int argc, char **argv) status = UCS_ERR_IO_ERROR; goto out_free_ep; } - } else if (if_info.attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { + } else if (if_info.iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { /* Create 
an endpoint which is connected to a remote interface */ ep_params.field_mask |= UCT_EP_PARAM_FIELD_DEV_ADDR | UCT_EP_PARAM_FIELD_IFACE_ADDR; ep_params.dev_addr = peer_dev; ep_params.iface_addr = peer_iface; status = uct_ep_create(&ep_params, &ep); + CHKERR_JUMP(UCS_OK != status, "create endpoint", out_free_ep_addrs); } else { status = UCS_ERR_UNSUPPORTED; + goto out_free_ep_addrs; } - CHKERR_JUMP(UCS_OK != status, "connect endpoint", out_free_ep); - if (cmd_args.test_strlen > func_am_max_size(cmd_args.func_am_type, &if_info.attr)) { + if (cmd_args.test_strlen > func_am_max_size(cmd_args.func_am_type, &if_info.iface_attr)) { status = UCS_ERR_UNSUPPORTED; fprintf(stderr, "Test string is too long: %ld, max supported: %lu\n", cmd_args.test_strlen, - func_am_max_size(cmd_args.func_am_type, &if_info.attr)); + func_am_max_size(cmd_args.func_am_type, &if_info.iface_attr)); goto out_free_ep; } @@ -622,8 +694,12 @@ int main(int argc, char **argv) CHKERR_JUMP(UCS_OK != status, "set callback", out_free_ep); if (cmd_args.server_name) { - char *str = (char *)malloc(cmd_args.test_strlen); - generate_test_string(str, cmd_args.test_strlen); + char *str = (char *)mem_type_malloc(cmd_args.test_strlen); + CHKERR_ACTION(str == NULL, "allocate memory", + status = UCS_ERR_NO_MEMORY; goto out_free_ep); + res = generate_test_string(str, cmd_args.test_strlen); + CHKERR_ACTION(res < 0, "generate test string", + status = UCS_ERR_NO_MEMORY; goto out_free_ep); /* Send active message to remote endpoint */ if (cmd_args.func_am_type == FUNC_AM_SHORT) { @@ -634,19 +710,20 @@ int main(int argc, char **argv) status = do_am_zcopy(&if_info, ep, id, &cmd_args, str); } - free(str); + mem_type_free(str); CHKERR_JUMP(UCS_OK != status, "send active msg", out_free_ep); } else { recv_desc_t *rdesc; - while (!desc_holder) { + while (desc_holder == NULL) { /* Explicitly progress any outstanding active message requests */ uct_worker_progress(if_info.worker); } rdesc = desc_holder; print_strings("main", 
func_am_t_str(cmd_args.func_am_type), - (char *)(rdesc + 1)); + (char *)(rdesc + 1), cmd_args.test_strlen); + if (rdesc->is_uct_desc) { /* Release descriptor because callback returns UCS_INPROGRESS */ uct_iface_release_desc(rdesc); @@ -658,13 +735,14 @@ int main(int argc, char **argv) if (barrier(oob_sock)) { status = UCS_ERR_IO_ERROR; } - close(oob_sock); out_free_ep: uct_ep_destroy(ep); out_free_ep_addrs: free(own_ep); free(peer_ep); +out_close_oob_sock: + close(oob_sock); out_free_if_addrs: free(own_iface); free(peer_iface); @@ -679,5 +757,5 @@ int main(int argc, char **argv) out_cleanup_async: ucs_async_context_destroy(async); out: - return status == UCS_ERR_UNSUPPORTED ? UCS_OK : status; + return (status == UCS_ERR_UNSUPPORTED) ? UCS_OK : status; } diff --git a/src/tools/info/Makefile.am b/src/tools/info/Makefile.am index a8db417b71f..7f200e1ca41 100644 --- a/src/tools/info/Makefile.am +++ b/src/tools/info/Makefile.am @@ -1,7 +1,7 @@ # # Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. # Copyright (C) The University of Tennessee and the University of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED. -# Copyright (C) Huawei Technologies Co.,Ltd. 2020. ALL RIGHTS RESERVED. +# Copyright (C) Huawei Technologies Co.,Ltd. 2020-2021. ALL RIGHTS RESERVED. # # See file LICENSE for terms. # diff --git a/src/tools/info/proto_info.c b/src/tools/info/proto_info.c index 65d5419222f..d2f0c1f1b72 100644 --- a/src/tools/info/proto_info.c +++ b/src/tools/info/proto_info.c @@ -1,14 +1,19 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2016. ALL RIGHTS RESERVED. - * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. * + * Copyright (C) Huawei Technologies Co., Ltd. 2020-2021. ALL RIGHTS RESERVED. * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ucx_info.h" #include #include +#include #include #include #include @@ -91,7 +96,8 @@ static void print_resource_usage(const resource_usage_t *usage_before, void print_ucp_info(int print_opts, ucs_config_print_flags_t print_flags, uint64_t ctx_features, const ucp_ep_params_t *base_ep_params, - size_t estimated_num_eps, unsigned dev_type_bitmap) + size_t estimated_num_eps, size_t estimated_num_ppn, + unsigned dev_type_bitmap, const char *mem_size) { ucp_config_t *config; ucs_status_t status; @@ -113,9 +119,11 @@ void print_ucp_info(int print_opts, ucs_config_print_flags_t print_flags, memset(¶ms, 0, sizeof(params)); params.field_mask = UCP_PARAM_FIELD_FEATURES | - UCP_PARAM_FIELD_ESTIMATED_NUM_EPS; + UCP_PARAM_FIELD_ESTIMATED_NUM_EPS | + UCP_PARAM_FIELD_ESTIMATED_NUM_PPN; params.features = ctx_features; params.estimated_num_eps = estimated_num_eps; + params.estimated_num_ppn = estimated_num_ppn; get_resource_usage(&usage); @@ -135,12 +143,16 @@ void print_ucp_info(int print_opts, ucs_config_print_flags_t print_flags, goto out_release_config; } + if ((print_opts & PRINT_MEM_MAP) && (mem_size != NULL)) { + ucp_mem_print_info(mem_size, context, stdout); + } + if (print_opts & PRINT_UCP_CONTEXT) { ucp_context_print_info(context, stdout); print_resource_usage(&usage, "UCP context"); } - if (!(print_opts & (PRINT_UCP_WORKER | PRINT_UCP_EP | PRINT_UCG | PRINT_UCG_TOPO))) { + if (!(print_opts & (PRINT_UCP_WORKER|PRINT_UCP_EP|PRINT_UCG|PRINT_UCG_TOPO))) { goto out_cleanup_context; } diff --git a/src/tools/info/sys_info.c b/src/tools/info/sys_info.c index b2294ac0696..df7f05b7870 100644 --- a/src/tools/info/sys_info.c +++ b/src/tools/info/sys_info.c @@ -1,7 +1,7 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. * +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. 
* See file LICENSE for terms. */ @@ -13,6 +13,8 @@ #include #include +#include +#include #include #include @@ -26,7 +28,18 @@ static const char* cpu_model_names[] = { [UCS_CPU_MODEL_INTEL_HASWELL] = "Haswell", [UCS_CPU_MODEL_INTEL_BROADWELL] = "Broadwell", [UCS_CPU_MODEL_INTEL_SKYLAKE] = "Skylake", - [UCS_CPU_MODEL_ARM_AARCH64] = "ARM 64-bit" + [UCS_CPU_MODEL_ARM_AARCH64] = "ARM 64-bit", + [UCS_CPU_MODEL_AMD_NAPLES] = "Naples", + [UCS_CPU_MODEL_AMD_ROME] = "Rome" +}; + +static const char* cpu_vendor_names[] = { + [UCS_CPU_VENDOR_UNKNOWN] = "unknown", + [UCS_CPU_VENDOR_INTEL] = "Intel", + [UCS_CPU_VENDOR_AMD] = "AMD", + [UCS_CPU_VENDOR_GENERIC_ARM] = "Generic ARM", + [UCS_CPU_VENDOR_GENERIC_PPC] = "Generic PPC", + [UCS_CPU_VENDOR_FUJITSU_ARM] = "Fujitsu ARM" }; static double measure_memcpy_bandwidth(size_t size) @@ -53,7 +66,7 @@ static double measure_memcpy_bandwidth(size_t size) iter = 0; start_time = ucs_get_time(); do { - memcpy(dst, src, size); + ucs_memcpy_relaxed(dst, src, size); end_time = ucs_get_time(); ++iter; } while (end_time < start_time + ucs_time_from_sec(0.5)); @@ -72,9 +85,10 @@ void print_sys_info() size_t size; printf("# Timer frequency: %.3f MHz\n", ucs_get_cpu_clocks_per_sec() / 1e6); + printf("# CPU vendor: %s\n", cpu_vendor_names[ucs_arch_get_cpu_vendor()]); printf("# CPU model: %s\n", cpu_model_names[ucs_arch_get_cpu_model()]); printf("# CPU flags: 0x%08X\n", ucs_arch_get_cpu_flag()); - + ucs_arch_print_memcpy_limits(&ucs_global_opts.arch); printf("# Memcpy bandwidth:\n"); for (size = 4096; size <= 256 * UCS_MBYTE; size *= 2) { printf("# %10zu bytes: %.3f MB/s\n", size, diff --git a/src/tools/info/tl_info.c b/src/tools/info/tl_info.c index b3c97d61470..70e2403537c 100644 --- a/src/tools/info/tl_info.c +++ b/src/tools/info/tl_info.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ucx_info.h" #include @@ -42,6 +46,16 @@ print_atomic_info(UCT_ATOMIC_OP_##_name, #_name, _suffix, \ _cap.atomic32.fop_flags, _cap.atomic64.fop_flags); +#define PRINT_LINEAR_FUNC_NS(_func) \ + { \ + printf("%.0f", (_func)->c * 1e9); \ + if ((_func)->m * 1e9 > 1e-3) { \ + printf(" + %.3f * N", (_func)->m * 1e9); \ + } \ + printf(" nsec\n"); \ + } + + static char *strduplower(const char *str) { char *s, *p; @@ -104,11 +118,7 @@ static const char *size_limit_to_str(size_t min_size, size_t max_size) static void print_iface_info(uct_worker_h worker, uct_md_h md, uct_tl_resource_desc_t *resource) { - uct_iface_config_t *iface_config; - uct_iface_attr_t iface_attr; - ucs_status_t status; - uct_iface_h iface; - char buf[200] = {0}; + char buf[200] = {0}; uct_iface_params_t iface_params = { .field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE | UCT_IFACE_PARAM_FIELD_DEVICE | @@ -121,6 +131,11 @@ static void print_iface_info(uct_worker_h worker, uct_md_h md, .stats_root = ucs_stats_get_root(), .rx_headroom = 0 }; + uct_iface_config_t *iface_config; + uct_iface_attr_t iface_attr; + char max_eps_str[32]; + ucs_status_t status; + uct_iface_h iface; UCS_CPU_ZERO(&iface_params.cpu_mask); status = uct_md_iface_config_read(md, resource->tl_name, NULL, NULL, &iface_config); @@ -128,13 +143,15 @@ static void print_iface_info(uct_worker_h worker, uct_md_h md, return; } - printf("# Device: %s\n", resource->dev_name); + printf("# Transport: %s\n", resource->tl_name); + printf("# Device: %s\n", resource->dev_name); status = uct_iface_open(md, worker, &iface_params, iface_config, &iface); uct_config_release(iface_config); if (status != UCS_OK) { printf("# < failed to open interface >\n"); + /* coverity[leaked_storage] */ return; } @@ -144,13 +161,11 @@ static void print_iface_info(uct_worker_h worker, uct_md_h md, if (status != UCS_OK) { printf("# < failed to query interface >\n"); } else { - printf("# bandwidth: %-.2f MB/sec\n", 
iface_attr.bandwidth / UCS_MBYTE); - printf("# latency: %-.0f nsec", iface_attr.latency.overhead * 1e9); - if (iface_attr.latency.growth > 0) { - printf(" + %.0f * N\n", iface_attr.latency.growth * 1e9); - } else { - printf("\n"); - } + printf("# bandwidth: %-.2f/ppn + %-.2f MB/sec\n", + iface_attr.bandwidth.shared / UCS_MBYTE, + iface_attr.bandwidth.dedicated / UCS_MBYTE); + printf("# latency: "); + PRINT_LINEAR_FUNC_NS(&iface_attr.latency); printf("# overhead: %-.0f nsec\n", iface_attr.overhead * 1e9); PRINT_CAP(PUT_SHORT, iface_attr.cap.flags, iface_attr.cap.put.max_short); @@ -244,18 +259,22 @@ static void print_iface_info(uct_worker_h worker, uct_md_h md, if (iface_attr.cap.flags & (UCT_IFACE_FLAG_CONNECT_TO_EP | UCT_IFACE_FLAG_CONNECT_TO_IFACE)) { if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) { - strncat(buf, " to ep,", sizeof(buf) - 1); + strncat(buf, " to ep,", sizeof(buf) - strlen(buf) - 1); } if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { - strncat(buf, " to iface,", sizeof(buf) - 1); + strncat(buf, " to iface,", sizeof(buf) - strlen(buf) - 1); } buf[strlen(buf) - 1] = '\0'; } else { - strncat(buf, " none", sizeof(buf) - 1); + strncat(buf, " none", sizeof(buf) - strlen(buf) - 1); } printf("# connection:%s\n", buf); - printf("# priority: %d\n", iface_attr.priority); + printf("# device priority: %d\n", iface_attr.priority); + printf("# device num paths: %d\n", iface_attr.dev_num_paths); + printf("# max eps: %s\n", + ucs_memunits_to_str(iface_attr.max_num_eps, max_eps_str, + sizeof(max_eps_str))); printf("# device address: %zu bytes\n", iface_attr.device_addr_len); if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { @@ -276,31 +295,31 @@ static void print_iface_info(uct_worker_h worker, uct_md_h md, if (iface_attr.cap.flags & (UCT_IFACE_FLAG_ERRHANDLE_SHORT_BUF | UCT_IFACE_FLAG_ERRHANDLE_BCOPY_BUF | UCT_IFACE_FLAG_ERRHANDLE_ZCOPY_BUF)) { - strncat(buf, " buffer (", sizeof(buf) - 1); + strncat(buf, " buffer (", sizeof(buf) 
- strlen(buf) - 1); if (iface_attr.cap.flags & UCT_IFACE_FLAG_ERRHANDLE_SHORT_BUF) { - strncat(buf, "short,", sizeof(buf) - 1); + strncat(buf, "short,", sizeof(buf) - strlen(buf) - 1); } if (iface_attr.cap.flags & UCT_IFACE_FLAG_ERRHANDLE_BCOPY_BUF) { - strncat(buf, "bcopy,", sizeof(buf) - 1); + strncat(buf, "bcopy,", sizeof(buf) - strlen(buf) - 1); } if (iface_attr.cap.flags & UCT_IFACE_FLAG_ERRHANDLE_ZCOPY_BUF) { - strncat(buf, "zcopy,", sizeof(buf) - 1); + strncat(buf, "zcopy,", sizeof(buf) - strlen(buf) - 1); } buf[strlen(buf) - 1] = '\0'; - strncat(buf, "),", sizeof(buf) - 1); + strncat(buf, "),", sizeof(buf) - strlen(buf) - 1); } if (iface_attr.cap.flags & UCT_IFACE_FLAG_ERRHANDLE_AM_ID) { - strncat(buf, " active-message id,", sizeof(buf) - 1); + strncat(buf, " active-message id,", sizeof(buf) - strlen(buf) - 1); } if (iface_attr.cap.flags & UCT_IFACE_FLAG_ERRHANDLE_REMOTE_MEM) { - strncat(buf, " remote access,", sizeof(buf) - 1); + strncat(buf, " remote access,", sizeof(buf) - strlen(buf) - 1); } if (iface_attr.cap.flags & UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE) { - strncat(buf, " peer failure,", sizeof(buf) - 1); + strncat(buf, " peer failure,", sizeof(buf) - strlen(buf) - 1); } buf[strlen(buf) - 1] = '\0'; } else { - strncat(buf, " none", sizeof(buf) - 1); + strncat(buf, " none", sizeof(buf) - strlen(buf) - 1); } printf("# error handling:%s\n", buf); } @@ -331,8 +350,6 @@ static ucs_status_t print_tl_info(uct_md_h md, const char *tl_name, goto out; } - printf("#\n"); - printf("# Transport: %s\n", tl_name); printf("#\n"); if (num_resources == 0) { @@ -349,7 +366,9 @@ static ucs_status_t print_tl_info(uct_md_h md, const char *tl_name, return status; } -static void print_md_info(const char *md_name, int print_opts, +static void print_md_info(uct_component_h component, + const uct_component_attr_t *component_attr, + const char *md_name, int print_opts, ucs_config_print_flags_t print_flags, const char *req_tl_name) { @@ -361,12 +380,12 @@ static void 
print_md_info(const char *md_name, int print_opts, uct_md_attr_t md_attr; uct_md_h md; - status = uct_md_config_read(md_name, NULL, NULL, &md_config); + status = uct_md_config_read(component, NULL, NULL, &md_config); if (status != UCS_OK) { goto out; } - status = uct_md_open(md_name, md_config, &md); + status = uct_md_open(component, md_name, md_config, &md); uct_config_release(md_config); if (status != UCS_OK) { printf("# < failed to open memory domain %s >\n", md_name); @@ -400,19 +419,15 @@ static void print_md_info(const char *md_name, int print_opts, } else { printf("#\n"); printf("# Memory domain: %s\n", md_name); - printf("# component: %s\n", md_attr.component_name); + printf("# Component: %s\n", component_attr->name); if (md_attr.cap.flags & UCT_MD_FLAG_ALLOC) { printf("# allocate: %s\n", size_limit_to_str(0, md_attr.cap.max_alloc)); } if (md_attr.cap.flags & UCT_MD_FLAG_REG) { - printf("# register: %s, cost: %.0f", - size_limit_to_str(0, md_attr.cap.max_reg), - md_attr.reg_cost.overhead * 1e9); - if (md_attr.reg_cost.growth * 1e9 > 1e-3) { - printf("+(%.3f*)", md_attr.reg_cost.growth * 1e9); - } - printf(" nsec\n"); + printf("# register: %s, cost: ", + size_limit_to_str(0, md_attr.cap.max_reg)); + PRINT_LINEAR_FUNC_NS(&md_attr.reg_cost); } if (md_attr.cap.flags & UCT_MD_FLAG_NEED_RKEY) { printf("# remote key: %zu bytes\n", md_attr.rkey_packed_size); @@ -420,6 +435,9 @@ static void print_md_info(const char *md_name, int print_opts, if (md_attr.cap.flags & UCT_MD_FLAG_NEED_MEMH) { printf("# local memory handle is required for zcopy\n"); } + if (md_attr.cap.flags & UCT_MD_FLAG_RKEY_PTR) { + printf("# rkey_ptr is supported\n"); + } if (md_attr.cap.flags & UCT_MD_FLAG_SOCKADDR) { printf("# supports client-server connection establishment via sockaddr\n"); } @@ -460,25 +478,127 @@ static void print_md_info(const char *md_name, int print_opts, ; } +static void print_cm_attr(uct_worker_h worker, uct_component_h component, + const char *comp_name) +{ + 
uct_cm_config_t *cm_config; + uct_cm_attr_t cm_attr; + ucs_status_t status; + uct_cm_h cm; + + status = uct_cm_config_read(component, NULL, NULL, &cm_config); + if (status != UCS_OK) { + printf("# < failed to read the %s connection manager configuration >\n", + comp_name); + return; + } + + status = uct_cm_open(component, worker, cm_config, &cm); + uct_config_release(cm_config); + if (status != UCS_OK) { + printf("# < failed to open connection manager %s >\n", comp_name); + /* coverity[leaked_storage] */ + return; + } + + cm_attr.field_mask = UCT_CM_ATTR_FIELD_MAX_CONN_PRIV; + status = uct_cm_query(cm, &cm_attr); + if (status != UCS_OK) { + printf("# < failed to query connection manager >\n"); + } else { + printf("#\n"); + printf("# Connection manager: %s\n", comp_name); + printf("# max_conn_priv: %zu bytes\n", cm_attr.max_conn_priv); + } + + uct_cm_close(cm); +} + +static void print_cm_info(uct_component_h component, + const uct_component_attr_t *component_attr) +{ + ucs_async_context_t *async; + uct_worker_h worker; + ucs_status_t status; + + status = ucs_async_context_create(UCS_ASYNC_MODE_THREAD_SPINLOCK, &async); + if (status != UCS_OK) { + printf("# < failed to create asynchronous context >\n"); + return; + } + + status = uct_worker_create(async, UCS_THREAD_MODE_SINGLE, &worker); + if (status != UCS_OK) { + printf("# < failed to create uct worker >\n"); + goto out_async_ctx_destroy; + } + + print_cm_attr(worker, component, component_attr->name); + + uct_worker_destroy(worker); + +out_async_ctx_destroy: + ucs_async_context_destroy(async); +} + +static void print_uct_component_info(uct_component_h component, + int print_opts, + ucs_config_print_flags_t print_flags, + const char *req_tl_name) +{ + uct_component_attr_t component_attr; + ucs_status_t status; + unsigned i; + + component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME | + UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT | + UCT_COMPONENT_ATTR_FIELD_FLAGS; + status = uct_component_query(component, 
&component_attr); + if (status != UCS_OK) { + printf("# < failed to query component >\n"); + return; + } + + component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES; + component_attr.md_resources = alloca(sizeof(*component_attr.md_resources) * + component_attr.md_resource_count); + status = uct_component_query(component, &component_attr); + if (status != UCS_OK) { + printf("# < failed to query component md resources >\n"); + return; + } + + for (i = 0; i < component_attr.md_resource_count; ++i) { + print_md_info(component, &component_attr, + component_attr.md_resources[i].md_name, + print_opts, print_flags, req_tl_name); + } + + if (component_attr.flags & UCT_COMPONENT_FLAG_CM) { + print_cm_info(component, &component_attr); + } +} + void print_uct_info(int print_opts, ucs_config_print_flags_t print_flags, const char *req_tl_name) { - uct_md_resource_desc_t *resources; - unsigned i, num_resources; + uct_component_h *components; + unsigned i, num_components; ucs_status_t status; - status = uct_query_md_resources(&resources, &num_resources); + status = uct_query_components(&components, &num_components); if (status != UCS_OK) { - printf("# < failed to query MD resources >\n"); + printf("# < failed to query UCT components >\n"); return; } if (print_opts & PRINT_DEVICES) { - for (i = 0; i < num_resources; ++i) { - print_md_info(resources[i].md_name, print_opts, print_flags, req_tl_name); + for (i = 0; i < num_components; ++i) { + print_uct_component_info(components[i], print_opts, print_flags, + req_tl_name); } } - uct_release_md_resource_list(resources); + uct_release_component_list(components); } diff --git a/src/tools/info/type_info.c b/src/tools/info/type_info.c index e6870ac4803..a28137aa627 100644 --- a/src/tools/info/type_info.c +++ b/src/tools/info/type_info.c @@ -4,12 +4,12 @@ * See file LICENSE for terms. 
*/ -#include "ucx_info.h" - #ifdef HAVE_CONFIG_H # include "config.h" #endif +#include "ucx_info.h" + #include #include #include @@ -42,7 +42,7 @@ # include # include # include -# if HAVE_MLX5_HW +# ifdef HAVE_MLX5_HW # include # endif #endif @@ -55,13 +55,13 @@ #if HAVE_TL_UD # include # include -# if HAVE_MLX5_HW_UD +# ifdef HAVE_MLX5_HW_UD # include # endif #endif -#if HAVE_TL_UGNI +#ifdef HAVE_TL_UGNI # include # include # include @@ -110,7 +110,7 @@ void print_type_info(const char * tl_name) PRINT_SIZE(ucs_ptr_array_t); PRINT_SIZE(ucs_queue_elem_t); PRINT_SIZE(ucs_queue_head_t); - PRINT_SIZE(ucs_spinlock_t); + PRINT_SIZE(ucs_recursive_spinlock_t); PRINT_SIZE(ucs_timer_t); PRINT_SIZE(ucs_timer_queue_t); PRINT_SIZE(ucs_twheel_t); @@ -146,7 +146,7 @@ void print_type_info(const char * tl_name) PRINT_SIZE(uct_tcp_ep_t); PRINT_SIZE(uct_self_ep_t); -#if HAVE_TL_UGNI +#ifdef HAVE_TL_UGNI PRINT_SIZE(uct_sockaddr_ugni_t); PRINT_SIZE(uct_sockaddr_smsg_ugni_t); PRINT_SIZE(uct_devaddr_ugni_t); @@ -168,7 +168,7 @@ void print_type_info(const char * tl_name) } #if HAVE_TL_RC - if (tl_name == NULL || !strcasecmp(tl_name, "rc") || + if (tl_name == NULL || !strcasecmp(tl_name, "rc_verbs") || !strcasecmp(tl_name, "rc_mlx5")) { printf("RC:\n"); @@ -181,13 +181,13 @@ void print_type_info(const char * tl_name) PRINT_SIZE(uct_rc_iface_send_desc_t); PRINT_SIZE(uct_rc_iface_send_desc_t); - if (tl_name == NULL || !strcasecmp(tl_name, "rc")) { + if (tl_name == NULL || !strcasecmp(tl_name, "rc_verbs")) { PRINT_SIZE(uct_rc_verbs_ep_t); PRINT_SIZE(uct_rc_verbs_iface_config_t); PRINT_SIZE(uct_rc_verbs_iface_t); } -#if HAVE_MLX5_HW +#ifdef HAVE_MLX5_HW if (tl_name == NULL || !strcasecmp(tl_name, "rc_mlx5")) { PRINT_SIZE(uct_rc_mlx5_am_short_hdr_t); PRINT_SIZE(uct_rc_mlx5_ep_t); @@ -212,7 +212,7 @@ void print_type_info(const char * tl_name) #endif #if HAVE_TL_UD - if (tl_name == NULL || !strcasecmp(tl_name, "ud") || + if (tl_name == NULL || !strcasecmp(tl_name, "ud_verbs") || 
!strcasecmp(tl_name, "ud_mlx5")) { printf("UD:\n"); @@ -222,16 +222,15 @@ void print_type_info(const char * tl_name) PRINT_SIZE(uct_ud_iface_config_t); PRINT_SIZE(uct_ud_ep_pending_op_t); PRINT_SIZE(uct_ud_send_skb_t); - PRINT_SIZE(uct_ud_send_skb_inl_t); PRINT_SIZE(uct_ud_recv_skb_t); PRINT_SIZE(uct_rc_iface_send_desc_t); - if (tl_name == NULL || !strcasecmp(tl_name, "ud")) { + if (tl_name == NULL || !strcasecmp(tl_name, "ud_verbs")) { PRINT_SIZE(uct_ud_verbs_ep_t); PRINT_SIZE(uct_ud_verbs_iface_t); } -#if HAVE_MLX5_HW_UD +#ifdef HAVE_MLX5_HW_UD if (tl_name == NULL || !strcasecmp(tl_name, "ud_mlx5")) { PRINT_SIZE(uct_ud_mlx5_ep_t); PRINT_SIZE(uct_ud_mlx5_iface_t); @@ -241,7 +240,7 @@ void print_type_info(const char * tl_name) } #endif -#if HAVE_TL_UGNI +#ifdef HAVE_TL_UGNI if (tl_name == NULL || !strcasecmp(tl_name, "ugni")) { printf("UGNI:\n"); PRINT_SIZE(uct_ugni_device_t); diff --git a/src/tools/info/ucx_info.c b/src/tools/info/ucx_info.c index b3b668a1788..15ec3c348f5 100644 --- a/src/tools/info/ucx_info.c +++ b/src/tools/info/ucx_info.c @@ -1,10 +1,14 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. * +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ucx_info.h" #include @@ -13,6 +17,7 @@ #include #include #include +#include static void usage() { @@ -30,6 +35,7 @@ static void usage() { printf(" -p Show UCP context information\n"); printf(" -w Show UCP worker information\n"); printf(" -e Show UCP endpoint configuration\n"); + printf(" -m Show UCP memory allocation method for a given size\n"); printf(" -u UCP context features to use. 
String of one or more of:\n"); printf(" 'a' : atomic operations\n"); printf(" 'r' : remote memory access\n"); @@ -40,6 +46,7 @@ static void usage() { printf("\nOther settings:\n"); printf(" -t Filter devices information using specified transport (requires -d)\n"); printf(" -n Estimated UCP endpoint count (for ucp_init)\n"); + printf(" -N Estimated UCP endpoint count per node (for ucp_init)\n"); printf(" -D Set which device types to use when creating UCP context:\n"); printf(" 'all' : all possible devices (default)\n"); printf(" 'shm' : shared memory devices only\n"); @@ -56,19 +63,22 @@ int main(int argc, char **argv) unsigned dev_type_bitmap; uint64_t ucp_features; size_t ucp_num_eps; + size_t ucp_num_ppn; unsigned print_opts; - char *tl_name; + char *tl_name, *mem_size; const char *f; int c; print_opts = 0; - print_flags = 0; + print_flags = (ucs_config_print_flags_t)0; tl_name = NULL; ucp_features = 0; ucp_num_eps = 1; - dev_type_bitmap = -1; + ucp_num_ppn = 1; + mem_size = NULL; + dev_type_bitmap = UINT_MAX; ucp_ep_params.field_mask = 0; - while ((c = getopt(argc, argv, "fahvcydbswpegt:n:u:D:P:T:C:I:R:")) != -1) { + while ((c = getopt(argc, argv, "fahvcydbswpegt:n:u:D:m:N:P:T:C:I:R:")) != -1) { switch (c) { case 'f': print_flags |= UCS_CONFIG_PRINT_CONFIG | UCS_CONFIG_PRINT_HEADER | UCS_CONFIG_PRINT_DOC; @@ -103,12 +113,19 @@ int main(int argc, char **argv) case 'e': print_opts |= PRINT_UCP_EP; break; + case 'm': + print_opts |= PRINT_MEM_MAP; + mem_size = optarg; + break; case 't': tl_name = optarg; break; case 'n': ucp_num_eps = atol(optarg); break; + case 'N': + ucp_num_ppn = atol(optarg); + break; case 'u': for (f = optarg; *f; ++f) { switch (*f) { @@ -124,9 +141,9 @@ int main(int argc, char **argv) case 'w': ucp_features |= UCP_FEATURE_WAKEUP; break; + break; case 'g': ucp_features |= UCP_FEATURE_GROUPS | UCP_FEATURE_TAG; - break; case 'e': ucp_ep_params.field_mask |= UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; ucp_ep_params.err_mode = 
UCP_ERR_HANDLING_MODE_PEER; @@ -145,7 +162,7 @@ int main(int argc, char **argv) } else if (!strcasecmp(optarg, "self")) { dev_type_bitmap = UCS_BIT(UCT_DEVICE_TYPE_SELF); } else if (!strcasecmp(optarg, "all")) { - dev_type_bitmap = -1; + dev_type_bitmap = UINT_MAX; } else { usage(); return -1; @@ -183,16 +200,17 @@ int main(int argc, char **argv) if ((print_opts & PRINT_DEVICES) || (print_flags & UCS_CONFIG_PRINT_CONFIG)) { /* if UCS_CONFIG_PRINT_CONFIG is ON, trigger loading UCT modules by - * calling print_uct_info()->uct_query_md_resources() + * calling print_uct_info()->uct_component_query() */ print_uct_info(print_opts, print_flags, tl_name); } if (print_flags & UCS_CONFIG_PRINT_CONFIG) { - ucs_config_parser_print_all_opts(stdout, print_flags); + ucs_config_parser_print_all_opts(stdout, UCS_DEFAULT_ENV_PREFIX, + print_flags); } - if (print_opts & (PRINT_UCP_CONTEXT | PRINT_UCP_WORKER | PRINT_UCP_EP | + if (print_opts & (PRINT_UCP_CONTEXT|PRINT_UCP_WORKER|PRINT_UCP_EP|PRINT_MEM_MAP| PRINT_UCG | PRINT_UCG_TOPO)) { if (ucp_features == 0) { printf("Please select UCP features using -u switch: a|r|t|w|g\n"); @@ -200,7 +218,7 @@ int main(int argc, char **argv) return -1; } print_ucp_info(print_opts, print_flags, ucp_features, &ucp_ep_params, - ucp_num_eps, dev_type_bitmap); + ucp_num_eps, ucp_num_ppn, dev_type_bitmap, mem_size); } return 0; diff --git a/src/tools/info/ucx_info.h b/src/tools/info/ucx_info.h index b3f7022593c..38418ba0293 100644 --- a/src/tools/info/ucx_info.h +++ b/src/tools/info/ucx_info.h @@ -1,6 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -21,8 +21,9 @@ enum { PRINT_UCP_CONTEXT = UCS_BIT(5), PRINT_UCP_WORKER = UCS_BIT(6), PRINT_UCP_EP = UCS_BIT(7), - PRINT_UCG = UCS_BIT(8), - PRINT_UCG_TOPO = UCS_BIT(9) + PRINT_MEM_MAP = UCS_BIT(8), + PRINT_UCG = UCS_BIT(9), + PRINT_UCG_TOPO = UCS_BIT(10) }; @@ -39,6 +40,7 @@ void print_type_info(const char * tl_name); void print_ucp_info(int print_opts, ucs_config_print_flags_t print_flags, uint64_t ctx_features, const ucp_ep_params_t *base_ep_params, - size_t estimated_num_eps, unsigned dev_type_bitmap); + size_t estimated_num_eps, size_t estimated_num_ppn, + unsigned dev_type_bitmap, const char *mem_size); #endif diff --git a/src/tools/perf/Makefile.am b/src/tools/perf/Makefile.am index 5cd2153802b..bc68f81c440 100644 --- a/src/tools/perf/Makefile.am +++ b/src/tools/perf/Makefile.am @@ -8,7 +8,7 @@ # See file LICENSE for terms. # -SUBDIRS = cuda lib +SUBDIRS = cuda rocm lib CC = $(UCX_PERFTEST_CC) noinst_HEADERS = api/libperf.h @@ -26,6 +26,7 @@ ucx_perftest_LDADD = \ perftestdir = $(pkgdatadir)/perftest dist_perftest_DATA = \ $(top_srcdir)/contrib/ucx_perftest_config/msg_pow2 \ + $(top_srcdir)/contrib/ucx_perftest_config/msg_pow2_large \ $(top_srcdir)/contrib/ucx_perftest_config/README \ $(top_srcdir)/contrib/ucx_perftest_config/test_types_uct \ $(top_srcdir)/contrib/ucx_perftest_config/test_types_ucp \ diff --git a/src/tools/perf/api/libperf.h b/src/tools/perf/api/libperf.h index ab8efb656ea..23a93f65e4b 100644 --- a/src/tools/perf/api/libperf.h +++ b/src/tools/perf/api/libperf.h @@ -19,6 +19,7 @@ BEGIN_C_DECLS #include #include #include +#include #include @@ -82,7 +83,8 @@ enum ucx_perf_test_flags { UCX_PERF_TEST_FLAG_TAG_WILDCARD = UCS_BIT(4), /* For tag tests, use wildcard mask */ UCX_PERF_TEST_FLAG_TAG_UNEXP_PROBE = UCS_BIT(5), /* For tag tests, use probe to get unexpected receive */ UCX_PERF_TEST_FLAG_VERBOSE = UCS_BIT(7), /* Print error messages */ - UCX_PERF_TEST_FLAG_STREAM_RECV_DATA = UCS_BIT(8) /* For stream tests, use recv data API */ + 
UCX_PERF_TEST_FLAG_STREAM_RECV_DATA = UCS_BIT(8), /* For stream tests, use recv data API */ + UCX_PERF_TEST_FLAG_FLUSH_EP = UCS_BIT(9) /* Issue flush on endpoint instead of worker */ }; @@ -90,6 +92,12 @@ enum { UCT_PERF_TEST_MAX_FC_WINDOW = 127 /* Maximal flow-control window */ }; + +#define UCT_PERF_TEST_PARAMS_FMT "%s/%s" +#define UCT_PERF_TEST_PARAMS_ARG(_params) (_params)->uct.tl_name, \ + (_params)->uct.dev_name + + /** * Performance counter type. */ @@ -115,30 +123,44 @@ typedef struct ucx_perf_result { } ucx_perf_result_t; +typedef void (*ucx_perf_rte_progress_cb_t)(void *arg); + +typedef unsigned (*ucx_perf_rte_group_size_func_t)(void *rte_group); +typedef unsigned (*ucx_perf_rte_group_index_func_t)(void *rte_group); +typedef void (*ucx_perf_rte_barrier_func_t)(void *rte_group, + ucx_perf_rte_progress_cb_t progress, + void *arg); +typedef void (*ucx_perf_rte_post_vec_func_t)(void *rte_group, + const struct iovec *iovec, + int iovcnt, void **req); +typedef void (*ucx_perf_rte_recv_func_t)(void *rte_group, unsigned src, + void *buffer, size_t max, void *req); +typedef void (*ucx_perf_rte_exchange_vec_func_t)(void *rte_group, void *req); +typedef void (*ucx_perf_rte_report_func_t)(void *rte_group, + const ucx_perf_result_t *result, + void *arg, int is_final, + int is_multi_thread); + /** * RTE used to bring-up the test */ typedef struct ucx_perf_rte { /* @return Group size */ - unsigned (*group_size)(void *rte_group); + ucx_perf_rte_group_size_func_t group_size; /* @return My index within the group */ - unsigned (*group_index)(void *rte_group); + ucx_perf_rte_group_index_func_t group_index; /* Barrier */ - void (*barrier)(void *rte_group, void (*progress)(void *arg), - void *arg); + ucx_perf_rte_barrier_func_t barrier; /* Direct modex */ - void (*post_vec)(void *rte_group, const struct iovec *iovec, - int iovcnt, void **req); - void (*recv)(void *rte_group, unsigned src, void *buffer, size_t max, - void *req); - void (*exchange_vec)(void *rte_group, void 
*req); + ucx_perf_rte_post_vec_func_t post_vec; + ucx_perf_rte_recv_func_t recv; + ucx_perf_rte_exchange_vec_func_t exchange_vec; /* Handle results */ - void (*report)(void *rte_group, const ucx_perf_result_t *result, - void *arg, int is_final); + ucx_perf_rte_report_func_t report; } ucx_perf_rte_t; @@ -154,7 +176,8 @@ typedef struct ucx_perf_params { unsigned thread_count; /* Number of threads in the test program */ ucs_async_mode_t async_mode; /* how async progress and locking is done */ ucx_perf_wait_mode_t wait_mode; /* How to wait */ - uct_memory_type_t mem_type; /* memory type */ + ucs_memory_type_t send_mem_type; /* Send memory type */ + ucs_memory_type_t recv_mem_type; /* Recv memory type */ unsigned flags; /* See ucx_perf_test_flags. */ size_t *msg_size_list; /* Test message sizes list. The size @@ -179,6 +202,7 @@ typedef struct ucx_perf_params { struct { char dev_name[UCT_DEVICE_NAME_MAX]; /* Device name to use */ char tl_name[UCT_TL_NAME_MAX]; /* Transport to use */ + char md_name[UCT_MD_NAME_MAX]; /* Memory domain name to use */ uct_perf_data_layout_t data_layout; /* Data layout to use */ unsigned fc_window; /* Window size for flow control <= UCX_PERF_TEST_MAX_FC_WINDOW */ } uct; @@ -206,7 +230,8 @@ void ucx_perf_global_init(); /** * Run a UCT performance test. 
*/ -ucs_status_t ucx_perf_run(ucx_perf_params_t *params, ucx_perf_result_t *result); +ucs_status_t ucx_perf_run(const ucx_perf_params_t *params, + ucx_perf_result_t *result); END_C_DECLS diff --git a/src/tools/perf/configure.m4 b/src/tools/perf/configure.m4 index 181006bd330..509c38331f3 100644 --- a/src/tools/perf/configure.m4 +++ b/src/tools/perf/configure.m4 @@ -7,6 +7,7 @@ ucx_perftest_modules="" m4_include([src/tools/perf/lib/configure.m4]) m4_include([src/tools/perf/cuda/configure.m4]) +m4_include([src/tools/perf/rocm/configure.m4]) AC_DEFINE_UNQUOTED([ucx_perftest_MODULES], ["${ucx_perftest_modules}"], [Perftest loadable modules]) diff --git a/src/tools/perf/cuda/configure.m4 b/src/tools/perf/cuda/configure.m4 index 05f1d5db5cb..f2e5cfebf46 100644 --- a/src/tools/perf/cuda/configure.m4 +++ b/src/tools/perf/cuda/configure.m4 @@ -6,6 +6,6 @@ UCX_CHECK_CUDA -AS_IF([test "x$cuda_happy" = "xyes"], [ucx_perftest_modules+=":cuda"]) +AS_IF([test "x$cuda_happy" = "xyes"], [ucx_perftest_modules="${ucx_perftest_modules}:cuda"]) AC_CONFIG_FILES([src/tools/perf/cuda/Makefile]) diff --git a/src/tools/perf/cuda/cuda_alloc.c b/src/tools/perf/cuda/cuda_alloc.c index e98b690b66c..58bade987bf 100644 --- a/src/tools/perf/cuda/cuda_alloc.c +++ b/src/tools/perf/cuda/cuda_alloc.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include @@ -35,62 +39,156 @@ static ucs_status_t ucx_perf_cuda_init(ucx_perf_context_t *perf) return UCS_OK; } -static ucs_status_t ucp_perf_cuda_alloc(ucx_perf_context_t *perf, size_t length, - void **address_p, ucp_mem_h *memh_p, - int non_blk_flag) +static inline ucs_status_t ucx_perf_cuda_alloc(size_t length, + ucs_memory_type_t mem_type, + void **address_p) { cudaError_t cerr; - cerr = cudaMalloc(address_p, length); + ucs_assert((mem_type == UCS_MEMORY_TYPE_CUDA) || + (mem_type == UCS_MEMORY_TYPE_CUDA_MANAGED)); + + cerr = ((mem_type == UCS_MEMORY_TYPE_CUDA) ? 
+ cudaMalloc(address_p, length) : + cudaMallocManaged(address_p, length, cudaMemAttachGlobal)); if (cerr != cudaSuccess) { + ucs_error("failed to allocate memory"); return UCS_ERR_NO_MEMORY; } return UCS_OK; } -static ucs_status_t ucp_perf_cuda_alloc_managed(ucx_perf_context_t *perf, +static ucs_status_t ucp_perf_cuda_alloc(const ucx_perf_context_t *perf, size_t length, + void **address_p, ucp_mem_h *memh_p, + int non_blk_flag) +{ + return ucx_perf_cuda_alloc(length, UCS_MEMORY_TYPE_CUDA, address_p); +} + +static ucs_status_t ucp_perf_cuda_alloc_managed(const ucx_perf_context_t *perf, size_t length, void **address_p, ucp_mem_h *memh_p, int non_blk_flag) { - cudaError_t cerr; + return ucx_perf_cuda_alloc(length, UCS_MEMORY_TYPE_CUDA_MANAGED, address_p); +} - cerr = cudaMallocManaged(address_p, length, cudaMemAttachGlobal); - if (cerr != cudaSuccess) { - return UCS_ERR_NO_MEMORY; +static void ucp_perf_cuda_free(const ucx_perf_context_t *perf, + void *address, ucp_mem_h memh) +{ + cudaFree(address); +} + +static inline ucs_status_t +uct_perf_cuda_alloc_reg_mem(const ucx_perf_context_t *perf, + size_t length, + ucs_memory_type_t mem_type, + unsigned flags, + uct_allocated_memory_t *alloc_mem) +{ + ucs_status_t status; + + status = ucx_perf_cuda_alloc(length, mem_type, &alloc_mem->address); + if (status != UCS_OK) { + return status; + } + + status = uct_md_mem_reg(perf->uct.md, alloc_mem->address, + length, flags, &alloc_mem->memh); + if (status != UCS_OK) { + cudaFree(alloc_mem->address); + ucs_error("failed to register memory"); + return status; } + alloc_mem->mem_type = mem_type; + alloc_mem->md = perf->uct.md; + return UCS_OK; } -static void ucp_perf_cuda_free(ucx_perf_context_t *perf, void *address, - ucp_mem_h memh) +static ucs_status_t uct_perf_cuda_alloc(const ucx_perf_context_t *perf, + size_t length, unsigned flags, + uct_allocated_memory_t *alloc_mem) { - cudaFree(address); + return uct_perf_cuda_alloc_reg_mem(perf, length, UCS_MEMORY_TYPE_CUDA, + flags, 
alloc_mem); +} + +static ucs_status_t uct_perf_cuda_managed_alloc(const ucx_perf_context_t *perf, + size_t length, unsigned flags, + uct_allocated_memory_t *alloc_mem) +{ + return uct_perf_cuda_alloc_reg_mem(perf, length, UCS_MEMORY_TYPE_CUDA_MANAGED, + flags, alloc_mem); +} + +static void uct_perf_cuda_free(const ucx_perf_context_t *perf, + uct_allocated_memory_t *alloc_mem) +{ + ucs_status_t status; + + ucs_assert(alloc_mem->md == perf->uct.md); + + status = uct_md_mem_dereg(perf->uct.md, alloc_mem->memh); + if (status != UCS_OK) { + ucs_error("failed to deregister memory"); + } + + cudaFree(alloc_mem->address); +} + +static void ucx_perf_cuda_memcpy(void *dst, ucs_memory_type_t dst_mem_type, + const void *src, ucs_memory_type_t src_mem_type, + size_t count) +{ + cudaError_t cerr; + + cerr = cudaMemcpy(dst, src, count, cudaMemcpyDefault); + if (cerr != cudaSuccess) { + ucs_error("failed to copy memory: %s", cudaGetErrorString(cerr)); + } } -static void* ucp_perf_cuda_memset(void *s, int c, size_t len) +static void* ucx_perf_cuda_memset(void *dst, int value, size_t count) { - /* NOTE: This memset is needed onl for one-sided tests (e.g ucp_put_lat) but - * they don't work with CUDA anyway. So for now it's mostly for completeness. 
- */ - cudaMemset(s, c, len); - return s; + cudaError_t cerr; + + cerr = cudaMemset(dst, value, count); + if (cerr != cudaSuccess) { + ucs_error("failed to set memory: %s", cudaGetErrorString(cerr)); + } + + return dst; } UCS_STATIC_INIT { static ucx_perf_allocator_t cuda_allocator = { + .mem_type = UCS_MEMORY_TYPE_CUDA, .init = ucx_perf_cuda_init, .ucp_alloc = ucp_perf_cuda_alloc, .ucp_free = ucp_perf_cuda_free, - .memset = ucp_perf_cuda_memset + .uct_alloc = uct_perf_cuda_alloc, + .uct_free = uct_perf_cuda_free, + .memcpy = ucx_perf_cuda_memcpy, + .memset = ucx_perf_cuda_memset }; static ucx_perf_allocator_t cuda_managed_allocator = { + .mem_type = UCS_MEMORY_TYPE_CUDA_MANAGED, .init = ucx_perf_cuda_init, .ucp_alloc = ucp_perf_cuda_alloc_managed, .ucp_free = ucp_perf_cuda_free, - .memset = ucp_perf_cuda_memset + .uct_alloc = uct_perf_cuda_managed_alloc, + .uct_free = uct_perf_cuda_free, + .memcpy = ucx_perf_cuda_memcpy, + .memset = ucx_perf_cuda_memset }; - ucx_perf_mem_type_allocators[UCT_MD_MEM_TYPE_CUDA] = &cuda_allocator; - ucx_perf_mem_type_allocators[UCT_MD_MEM_TYPE_CUDA_MANAGED] = &cuda_managed_allocator; + + ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_CUDA] = &cuda_allocator; + ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_CUDA_MANAGED] = &cuda_managed_allocator; +} +UCS_STATIC_CLEANUP { + ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_CUDA] = NULL; + ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_CUDA_MANAGED] = NULL; + } diff --git a/src/tools/perf/lib/Makefile.am b/src/tools/perf/lib/Makefile.am index e2ac8fa139e..9844c7f6119 100644 --- a/src/tools/perf/lib/Makefile.am +++ b/src/tools/perf/lib/Makefile.am @@ -19,7 +19,7 @@ libucxperf_la_LIBADD = \ $(abs_top_builddir)/src/ucs/libucs.la # C-linkable C++ code - must override any inherited CXXFLAGS -CXXFLAGS += -nostdlib -fno-exceptions -fno-rtti +CXXFLAGS += -nostdlib $(PERF_LIB_CXXFLAGS) noinst_HEADERS = \ libperf_int.h diff --git a/src/tools/perf/lib/configure.m4 b/src/tools/perf/lib/configure.m4 index 
ebae0d2b324..09ec6a8d5a2 100644 --- a/src/tools/perf/lib/configure.m4 +++ b/src/tools/perf/lib/configure.m4 @@ -4,4 +4,25 @@ # See file LICENSE for terms. # +AC_LANG_PUSH([C++]) + +CHECK_COMPILER_FLAG([-fno-exceptions], [-fno-exceptions], + [AC_LANG_SOURCE([[int main(int argc, char** argv){return 0;}]])], + [PERF_LIB_CXXFLAGS="$PERF_LIB_CXXFLAGS -fno-exceptions"], + []) + +CHECK_COMPILER_FLAG([-fno-rtti], [-fno-rtti], + [AC_LANG_SOURCE([[int main(int argc, char** argv){return 0;}]])], + [PERF_LIB_CXXFLAGS="$PERF_LIB_CXXFLAGS -fno-rtti"], + []) + +CHECK_COMPILER_FLAG([--no_exceptions], [--no_exceptions], + [AC_LANG_SOURCE([[int main(int argc, char** argv){return 0;}]])], + [PERF_LIB_CXXFLAGS="$PERF_LIB_CXXFLAGS --no_exceptions"], + []) + +AC_LANG_POP([C++]) + +AC_SUBST([PERF_LIB_CXXFLAGS], [$PERF_LIB_CXXFLAGS]) + AC_CONFIG_FILES([src/tools/perf/lib/Makefile]) diff --git a/src/tools/perf/lib/libperf.c b/src/tools/perf/lib/libperf.c index 53a5f4ff022..1a3542be960 100644 --- a/src/tools/perf/lib/libperf.c +++ b/src/tools/perf/lib/libperf.c @@ -7,11 +7,15 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include #include +#include #include -#include #include #include @@ -19,23 +23,23 @@ #include #endif /* _OPENMP */ -#define ATOMIC_OP_CONFIG(_size, _op32, _op64, _op, _msg, _params, _status) \ - _status = __get_atomic_flag((_size), (_op32), (_op64), (_op)); \ - if (_status != UCS_OK) { \ - ucs_error("%s/%s does not support atomic %s for message size %zu bytes", \ - (_params)->uct.tl_name, (_params)->uct.dev_name, \ - (_msg)[_op], (_size)); \ - return _status; \ +#define ATOMIC_OP_CONFIG(_size, _op32, _op64, _op, _msg, _params, _status) \ + _status = __get_atomic_flag((_size), (_op32), (_op64), (_op)); \ + if (_status != UCS_OK) { \ + ucs_error(UCT_PERF_TEST_PARAMS_FMT" does not support atomic %s for " \ + "message size %zu bytes", UCT_PERF_TEST_PARAMS_ARG(_params), \ + (_msg)[_op], (_size)); \ + return _status; \ } -#define ATOMIC_OP_CHECK(_size, _attr, _required, _params, _msg) \ - if (!ucs_test_all_flags(_attr, _required)) { \ - if ((_params)->flags & UCX_PERF_TEST_FLAG_VERBOSE) { \ - ucs_error("%s/%s does not support required "#_size"-bit atomic: %s", \ - (_params)->uct.tl_name, (_params)->uct.dev_name, \ - (_msg)[ucs_ffs64(~(_attr) & (_required))]); \ - } \ - return UCS_ERR_UNSUPPORTED; \ +#define ATOMIC_OP_CHECK(_size, _attr, _required, _params, _msg) \ + if (!ucs_test_all_flags(_attr, _required)) { \ + if ((_params)->flags & UCX_PERF_TEST_FLAG_VERBOSE) { \ + ucs_error(UCT_PERF_TEST_PARAMS_FMT" does not support required " \ + #_size"-bit atomic: %s", UCT_PERF_TEST_PARAMS_ARG(_params), \ + (_msg)[ucs_ffs64(~(_attr) & (_required))]); \ + } \ + return UCS_ERR_UNSUPPORTED; \ } typedef struct { @@ -46,7 +50,8 @@ typedef struct { size_t ep_addr_len; } uct; struct { - size_t addr_len; + size_t worker_addr_len; + size_t total_wireup_len; } ucp; }; size_t rkey_size; @@ -54,7 +59,7 @@ typedef struct { } ucx_perf_ep_info_t; -const ucx_perf_allocator_t* ucx_perf_mem_type_allocators[UCT_MD_MEM_TYPE_LAST]; 
+const ucx_perf_allocator_t* ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_LAST]; static const char *perf_iface_ops[] = { [ucs_ilog2(UCT_IFACE_FLAG_AM_SHORT)] = "am short", @@ -72,9 +77,6 @@ static const char *perf_iface_ops[] = { [ucs_ilog2(UCT_IFACE_FLAG_AM_DUP)] = "full reliability", [ucs_ilog2(UCT_IFACE_FLAG_CB_SYNC)] = "sync callback", [ucs_ilog2(UCT_IFACE_FLAG_CB_ASYNC)] = "async callback", - [ucs_ilog2(UCT_IFACE_FLAG_EVENT_SEND_COMP)] = "send completion event", - [ucs_ilog2(UCT_IFACE_FLAG_EVENT_RECV)] = "tag or active message event", - [ucs_ilog2(UCT_IFACE_FLAG_EVENT_RECV_SIG)] = "signaled message event", [ucs_ilog2(UCT_IFACE_FLAG_PENDING)] = "pending", [ucs_ilog2(UCT_IFACE_FLAG_TAG_EAGER_SHORT)] = "tag eager short", [ucs_ilog2(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)] = "tag eager bcopy", @@ -156,6 +158,44 @@ static ucs_time_t __find_median_quick_select(ucs_time_t arr[], int n) } } +static ucs_status_t +uct_perf_test_alloc_host(const ucx_perf_context_t *perf, size_t length, + unsigned flags, uct_allocated_memory_t *alloc_mem) +{ + ucs_status_t status; + + status = uct_iface_mem_alloc(perf->uct.iface, length, + flags, "perftest", alloc_mem); + if (status != UCS_OK) { + ucs_free(alloc_mem); + ucs_error("failed to allocate memory: %s", ucs_status_string(status)); + return status; + } + + ucs_assert(alloc_mem->md == perf->uct.md); + + return UCS_OK; +} + +static void uct_perf_test_free_host(const ucx_perf_context_t *perf, + uct_allocated_memory_t *alloc_mem) +{ + uct_iface_mem_free(alloc_mem); +} + +static void ucx_perf_test_memcpy_host(void *dst, ucs_memory_type_t dst_mem_type, + const void *src, ucs_memory_type_t src_mem_type, + size_t count) +{ + if ((dst_mem_type != UCS_MEMORY_TYPE_HOST) || + (src_mem_type != UCS_MEMORY_TYPE_HOST)) { + ucs_error("wrong memory type passed src - %d, dst - %d", + src_mem_type, dst_mem_type); + } else { + memcpy(dst, src, count); + } +} + static ucs_status_t uct_perf_test_alloc_mem(ucx_perf_context_t *perf) { ucx_perf_params_t *params = 
&perf->params; @@ -172,31 +212,25 @@ static ucs_status_t uct_perf_test_alloc_mem(ucx_perf_context_t *perf) /* TODO use params->alignment */ flags = (params->flags & UCX_PERF_TEST_FLAG_MAP_NONBLOCK) ? - UCT_MD_MEM_FLAG_NONBLOCK : 0; + UCT_MD_MEM_FLAG_NONBLOCK : 0; flags |= UCT_MD_MEM_ACCESS_ALL; /* Allocate send buffer memory */ - status = uct_iface_mem_alloc(perf->uct.iface, - buffer_size * params->thread_count, - flags, "perftest", &perf->uct.send_mem); + status = perf->allocator->uct_alloc(perf, buffer_size * params->thread_count, + flags, &perf->uct.send_mem); if (status != UCS_OK) { - ucs_error("Failed allocate send buffer: %s", ucs_status_string(status)); goto err; } - ucs_assert(perf->uct.send_mem.md == perf->uct.md); perf->send_buffer = perf->uct.send_mem.address; /* Allocate receive buffer memory */ - status = uct_iface_mem_alloc(perf->uct.iface, - buffer_size * params->thread_count, - flags, "perftest", &perf->uct.recv_mem); + status = perf->allocator->uct_alloc(perf, buffer_size * params->thread_count, + flags, &perf->uct.recv_mem); if (status != UCS_OK) { - ucs_error("Failed allocate receive buffer: %s", ucs_status_string(status)); goto err_free_send; } - ucs_assert(perf->uct.recv_mem.md == perf->uct.md); perf->recv_buffer = perf->uct.recv_mem.address; /* Allocate IOV datatype memory */ @@ -208,25 +242,25 @@ static ucs_status_t uct_perf_test_alloc_mem(ucx_perf_context_t *perf) status = UCS_ERR_NO_MEMORY; ucs_error("Failed allocate send IOV(%lu) buffer: %s", perf->params.msg_size_cnt, ucs_status_string(status)); - goto err_free_send; + goto err_free_recv; } - perf->offset = 0; - ucs_debug("allocated memory. 
Send buffer %p, Recv buffer %p", perf->send_buffer, perf->recv_buffer); return UCS_OK; +err_free_recv: + perf->allocator->uct_free(perf, &perf->uct.recv_mem); err_free_send: - uct_iface_mem_free(&perf->uct.send_mem); + perf->allocator->uct_free(perf, &perf->uct.send_mem); err: return status; } static void uct_perf_test_free_mem(ucx_perf_context_t *perf) { - uct_iface_mem_free(&perf->uct.send_mem); - uct_iface_mem_free(&perf->uct.recv_mem); + perf->allocator->uct_free(perf, &perf->uct.send_mem); + perf->allocator->uct_free(perf, &perf->uct.recv_mem); free(perf->uct.iov); } @@ -245,7 +279,7 @@ void ucx_perf_test_start_clock(ucx_perf_context_t *perf) /* Initialize/reset all parameters that could be modified by the warm-up run */ static void ucx_perf_test_prepare_new_run(ucx_perf_context_t *perf, - ucx_perf_params_t *params) + const ucx_perf_params_t *params) { unsigned i; @@ -268,11 +302,18 @@ static void ucx_perf_test_prepare_new_run(ucx_perf_context_t *perf, } static void ucx_perf_test_init(ucx_perf_context_t *perf, - ucx_perf_params_t *params) + const ucx_perf_params_t *params) { - perf->params = *params; - perf->offset = 0; - perf->allocator = ucx_perf_mem_type_allocators[params->mem_type]; + unsigned group_index; + + perf->params = *params; + group_index = rte_call(perf, group_index); + + if (0 == group_index) { + perf->allocator = ucx_perf_mem_type_allocators[params->send_mem_type]; + } else { + perf->allocator = ucx_perf_mem_type_allocators[params->recv_mem_type]; + } ucx_perf_test_prepare_new_run(perf, params); } @@ -338,13 +379,41 @@ static ucs_status_t ucx_perf_test_check_params(ucx_perf_params_t *params) { size_t it; - if (ucx_perf_get_message_size(params) < 1) { + /* check if zero-size messages are requested and supported */ + if ((/* they are not supported by: */ + /* - UCT tests, except UCT AM Short/Bcopy */ + (params->api == UCX_PERF_API_UCT) || + (/* - UCP RMA and AMO tests */ + (params->api == UCX_PERF_API_UCP) && + (params->command != 
UCX_PERF_CMD_AM) && + (params->command != UCX_PERF_CMD_TAG) && + (params->command != UCX_PERF_CMD_TAG_SYNC) && + (params->command != UCX_PERF_CMD_STREAM))) && + ucx_perf_get_message_size(params) < 1) { if (params->flags & UCX_PERF_TEST_FLAG_VERBOSE) { ucs_error("Message size too small, need to be at least 1"); } return UCS_ERR_INVALID_PARAM; } + if ((params->api == UCX_PERF_API_UCP) && + ((params->send_mem_type != UCS_MEMORY_TYPE_HOST) || + (params->recv_mem_type != UCS_MEMORY_TYPE_HOST)) && + ((params->command == UCX_PERF_CMD_PUT) || + (params->command == UCX_PERF_CMD_GET) || + (params->command == UCX_PERF_CMD_ADD) || + (params->command == UCX_PERF_CMD_FADD) || + (params->command == UCX_PERF_CMD_SWAP) || + (params->command == UCX_PERF_CMD_CSWAP))) { + /* TODO: remove when support for non-HOST memory types will be added */ + if (params->flags & UCX_PERF_TEST_FLAG_VERBOSE) { + ucs_error("UCP doesn't support RMA/AMO for \"%s\"<->\"%s\" memory types", + ucs_memory_type_names[params->send_mem_type], + ucs_memory_type_names[params->recv_mem_type]); + } + return UCS_ERR_INVALID_PARAM; + } + if (params->max_outstanding < 1) { if (params->flags & UCX_PERF_TEST_FLAG_VERBOSE) { ucs_error("max_outstanding, need to be at least 1"); @@ -368,6 +437,32 @@ static ucs_status_t ucx_perf_test_check_params(ucx_perf_params_t *params) return UCS_OK; } +void uct_perf_ep_flush_b(ucx_perf_context_t *perf, int peer_index) +{ + uct_ep_h ep = perf->uct.peers[peer_index].ep; + uct_completion_t comp; + ucs_status_t status; + int started; + + started = 0; + comp.func = NULL; + comp.count = 2; + do { + if (!started) { + status = uct_ep_flush(ep, 0, &comp); + if (status == UCS_OK) { + --comp.count; + } else if (status == UCS_INPROGRESS) { + started = 1; + } else if (status != UCS_ERR_NO_RESOURCE) { + ucs_error("uct_ep_flush() failed: %s", ucs_status_string(status)); + return; + } + } + uct_worker_progress(perf->uct.worker); + } while (comp.count > 1); +} + void 
uct_perf_iface_flush_b(ucx_perf_context_t *perf) { ucs_status_t status; @@ -376,6 +471,9 @@ void uct_perf_iface_flush_b(ucx_perf_context_t *perf) status = uct_iface_flush(perf->uct.iface, 0, NULL); uct_worker_progress(perf->uct.worker); } while (status == UCS_INPROGRESS); + if (status != UCS_OK) { + ucs_error("uct_iface_flush() failed: %s", ucs_status_string(status)); + } } static inline uint64_t __get_flag(uct_perf_data_layout_t layout, uint64_t short_f, @@ -409,20 +507,47 @@ static inline size_t __get_max_size(uct_perf_data_layout_t layout, size_t short_ 0; } +static ucs_status_t uct_perf_test_check_md_support(ucx_perf_params_t *params, + ucs_memory_type_t mem_type, + uct_md_attr_t *md_attr) +{ + if (!(md_attr->cap.access_mem_type == mem_type) && + !(md_attr->cap.reg_mem_types & UCS_BIT(mem_type))) { + if (params->flags & UCX_PERF_TEST_FLAG_VERBOSE) { + ucs_error("Unsupported memory type %s by "UCT_PERF_TEST_PARAMS_FMT, + ucs_memory_type_names[mem_type], + UCT_PERF_TEST_PARAMS_ARG(params)); + return UCS_ERR_INVALID_PARAM; + } + } + return UCS_OK; +} + static ucs_status_t uct_perf_test_check_capabilities(ucx_perf_params_t *params, - uct_iface_h iface) + uct_iface_h iface, uct_md_h md) { uint64_t required_flags = 0; uint64_t atomic_op32 = 0; uint64_t atomic_op64 = 0; uint64_t atomic_fop32 = 0; uint64_t atomic_fop64 = 0; + uct_md_attr_t md_attr; uct_iface_attr_t attr; ucs_status_t status; size_t min_size, max_size, max_iov, message_size; + status = uct_md_query(md, &md_attr); + if (status != UCS_OK) { + ucs_error("uct_md_query(%s) failed: %s", + params->uct.md_name, ucs_status_string(status)); + return status; + } + status = uct_iface_query(iface, &attr); if (status != UCS_OK) { + ucs_error("uct_iface_query("UCT_PERF_TEST_PARAMS_FMT") failed: %s", + UCT_PERF_TEST_PARAMS_ARG(params), + ucs_status_string(status)); return status; } @@ -500,8 +625,8 @@ static ucs_status_t uct_perf_test_check_capabilities(ucx_perf_params_t *params, if (!(atomic_op32 | atomic_op64 | 
atomic_fop32 | atomic_fop64) && (!ucs_test_all_flags(attr.cap.flags, required_flags) || !required_flags)) { if (params->flags & UCX_PERF_TEST_FLAG_VERBOSE) { - ucs_error("%s/%s does not support operation %s", - params->uct.tl_name, params->uct.dev_name, + ucs_error(UCT_PERF_TEST_PARAMS_FMT" does not support operation %s", + UCT_PERF_TEST_PARAMS_ARG(params), perf_iface_ops[ucs_ffs64(~attr.cap.flags & required_flags)]); } return UCS_ERR_UNSUPPORTED; @@ -589,12 +714,22 @@ static ucs_status_t uct_perf_test_check_capabilities(ucx_perf_params_t *params, } } + status = uct_perf_test_check_md_support(params, params->send_mem_type, &md_attr); + if (status != UCS_OK) { + return status; + } + + status = uct_perf_test_check_md_support(params, params->recv_mem_type, &md_attr); + if (status != UCS_OK) { + return status; + } + return UCS_OK; } static ucs_status_t uct_perf_test_setup_endpoints(ucx_perf_context_t *perf) { - const size_t buffer_size = 2048; + const size_t buffer_size = ADDR_BUF_SIZE; ucx_perf_ep_info_t info, *remote_info; unsigned group_size, i, group_index; uct_device_addr_t *dev_addr; @@ -639,10 +774,11 @@ static ucs_status_t uct_perf_test_setup_endpoints(ucx_perf_context_t *perf) info.recv_buffer = (uintptr_t)perf->recv_buffer; rkey_buffer = buffer; - dev_addr = (void*)rkey_buffer + info.rkey_size; - iface_addr = (void*)dev_addr + info.uct.dev_addr_len; - ep_addr = (void*)iface_addr + info.uct.iface_addr_len; - ucs_assert_always((void*)ep_addr + info.uct.ep_addr_len <= buffer + buffer_size); + dev_addr = UCS_PTR_BYTE_OFFSET(rkey_buffer, info.rkey_size); + iface_addr = UCS_PTR_BYTE_OFFSET(dev_addr, info.uct.dev_addr_len); + ep_addr = UCS_PTR_BYTE_OFFSET(iface_addr, info.uct.iface_addr_len); + ucs_assert_always(UCS_PTR_BYTE_OFFSET(ep_addr, info.uct.ep_addr_len) <= + UCS_PTR_BYTE_OFFSET(buffer, buffer_size)); status = uct_iface_get_device_address(perf->uct.iface, dev_addr); if (status != UCS_OK) { @@ -716,9 +852,9 @@ static ucs_status_t 
uct_perf_test_setup_endpoints(ucx_perf_context_t *perf) remote_info = buffer; rkey_buffer = remote_info + 1; - dev_addr = (void*)rkey_buffer + remote_info->rkey_size; - iface_addr = (void*)dev_addr + remote_info->uct.dev_addr_len; - ep_addr = (void*)iface_addr + remote_info->uct.iface_addr_len; + dev_addr = UCS_PTR_BYTE_OFFSET(rkey_buffer, remote_info->rkey_size); + iface_addr = UCS_PTR_BYTE_OFFSET(dev_addr, remote_info->uct.dev_addr_len); + ep_addr = UCS_PTR_BYTE_OFFSET(iface_addr, remote_info->uct.iface_addr_len); perf->uct.peers[i].remote_addr = remote_info->recv_buffer; if (!uct_iface_is_reachable(perf->uct.iface, dev_addr, @@ -730,14 +866,14 @@ static ucs_status_t uct_perf_test_setup_endpoints(ucx_perf_context_t *perf) } if (remote_info->rkey_size > 0) { - status = uct_rkey_unpack(rkey_buffer, &perf->uct.peers[i].rkey); + status = uct_rkey_unpack(perf->uct.cmpt, rkey_buffer, + &perf->uct.peers[i].rkey); if (status != UCS_OK) { ucs_error("Failed to uct_rkey_unpack: %s", ucs_status_string(status)); goto err_destroy_eps; } } else { perf->uct.peers[i].rkey.handle = NULL; - perf->uct.peers[i].rkey.type = NULL; perf->uct.peers[i].rkey.rkey = UCT_INVALID_RKEY; } @@ -763,8 +899,8 @@ static ucs_status_t uct_perf_test_setup_endpoints(ucx_perf_context_t *perf) err_destroy_eps: for (i = 0; i < group_size; ++i) { - if (perf->uct.peers[i].rkey.type != NULL) { - uct_rkey_release(&perf->uct.peers[i].rkey); + if (perf->uct.peers[i].rkey.rkey != UCT_INVALID_RKEY) { + uct_rkey_release(perf->uct.cmpt, &perf->uct.peers[i].rkey); } if (perf->uct.peers[i].ep != NULL) { uct_ep_destroy(perf->uct.peers[i].ep); @@ -791,7 +927,7 @@ static void uct_perf_test_cleanup_endpoints(ucx_perf_context_t *perf) for (i = 0; i < group_size; ++i) { if (i != group_index) { if (perf->uct.peers[i].rkey.rkey != UCT_INVALID_RKEY) { - uct_rkey_release(&perf->uct.peers[i].rkey); + uct_rkey_release(perf->uct.cmpt, &perf->uct.peers[i].rkey); } if (perf->uct.peers[i].ep) { uct_ep_destroy(perf->uct.peers[i].ep); 
@@ -804,7 +940,8 @@ static void uct_perf_test_cleanup_endpoints(ucx_perf_context_t *perf) static ucs_status_t ucp_perf_test_fill_params(ucx_perf_params_t *params, ucp_params_t *ucp_params) { - ucs_status_t status, message_size; + ucs_status_t status; + size_t message_size; message_size = ucx_perf_get_message_size(params); switch (params->command) { @@ -830,14 +967,10 @@ static ucs_status_t ucp_perf_test_fill_params(ucx_perf_params_t *params, break; case UCX_PERF_CMD_TAG: case UCX_PERF_CMD_TAG_SYNC: - ucp_params->features |= UCP_FEATURE_TAG; - ucp_params->field_mask |= UCP_PARAM_FIELD_REQUEST_SIZE; - ucp_params->request_size = sizeof(ucp_perf_request_t); + ucp_params->features |= UCP_FEATURE_TAG; break; case UCX_PERF_CMD_STREAM: - ucp_params->features |= UCP_FEATURE_STREAM; - ucp_params->field_mask |= UCP_PARAM_FIELD_REQUEST_SIZE; - ucp_params->request_size = sizeof(ucp_perf_request_t); + ucp_params->features |= UCP_FEATURE_STREAM; break; default: if (params->flags & UCX_PERF_TEST_FLAG_VERBOSE) { @@ -873,7 +1006,7 @@ static ucs_status_t ucp_perf_test_alloc_iov_mem(ucp_perf_datatype_t datatype, } static ucs_status_t -ucp_perf_test_alloc_host(ucx_perf_context_t *perf, size_t length, +ucp_perf_test_alloc_host(const ucx_perf_context_t *perf, size_t length, void **address_p, ucp_mem_h *memh, int non_blk_flag) { ucp_mem_map_params_t mem_map_params; @@ -908,8 +1041,8 @@ ucp_perf_test_alloc_host(ucx_perf_context_t *perf, size_t length, return status; } -static void ucp_perf_test_free_host(ucx_perf_context_t *perf, void *address, - ucp_mem_h memh) +static void ucp_perf_test_free_host(const ucx_perf_context_t *perf, + void *address, ucp_mem_h memh) { ucs_status_t status; @@ -988,39 +1121,35 @@ static void ucp_perf_test_free_mem(ucx_perf_context_t *perf) perf->allocator->ucp_free(perf, perf->send_buffer, perf->ucp.send_memh); } -static void ucp_perf_test_destroy_eps(ucx_perf_context_t* perf, - unsigned group_size) +static void ucp_perf_test_destroy_eps(ucx_perf_context_t* 
perf) { - ucs_status_ptr_t *reqs; - ucp_tag_recv_info_t info; + unsigned i, thread_count = perf->params.thread_count; + ucs_status_ptr_t *req; ucs_status_t status; - unsigned i; - - reqs = calloc(sizeof(*reqs), group_size); - for (i = 0; i < group_size; ++i) { - if (perf->ucp.peers[i].rkey != NULL) { - ucp_rkey_destroy(perf->ucp.peers[i].rkey); - } - if (perf->ucp.peers[i].ep != NULL) { - reqs[i] = ucp_disconnect_nb(perf->ucp.peers[i].ep); + for (i = 0; i < thread_count; ++i) { + if (perf->ucp.tctx[i].perf.ucp.rkey != NULL) { + ucp_rkey_destroy(perf->ucp.tctx[i].perf.ucp.rkey); } - } - for (i = 0; i < group_size; ++i) { - if (!UCS_PTR_IS_PTR(reqs[i])) { - continue; + if (perf->ucp.tctx[i].perf.ucp.ep != NULL) { + req = ucp_ep_close_nb(perf->ucp.tctx[i].perf.ucp.ep, + UCP_EP_CLOSE_MODE_FLUSH); + + if (UCS_PTR_IS_PTR(req)) { + do { + ucp_worker_progress(perf->ucp.tctx[i].perf.ucp.worker); + status = ucp_request_check_status(req); + } while (status == UCS_INPROGRESS); + + ucp_request_release(req); + } else if (UCS_PTR_STATUS(req) != UCS_OK) { + ucs_warn("failed to close ep %p on thread %d: %s\n", + perf->ucp.tctx[i].perf.ucp.ep, i, + ucs_status_string(UCS_PTR_STATUS(req))); + } } - - do { - ucp_worker_progress(perf->ucp.worker); - status = ucp_request_test(reqs[i], &info); - } while (status == UCS_INPROGRESS); - ucp_request_release(reqs[i]); } - - free(reqs); - free(perf->ucp.peers); } static ucs_status_t ucp_perf_test_exchange_status(ucx_perf_context_t *perf, @@ -1046,131 +1175,232 @@ static ucs_status_t ucp_perf_test_exchange_status(ucx_perf_context_t *perf, return collective_status; } -static ucs_status_t ucp_perf_test_setup_endpoints(ucx_perf_context_t *perf, - uint64_t features) +static ucs_status_t ucp_perf_test_receive_remote_data(ucx_perf_context_t *perf) { - const size_t buffer_size = 2048; - ucx_perf_ep_info_t info, *remote_info; - unsigned group_size, i, group_index; - ucp_address_t *address; - size_t address_length = 0; + unsigned thread_count = 
perf->params.thread_count; + void *rkey_buffer = NULL; + void *req = NULL; + unsigned group_size, group_index, i; + ucx_perf_ep_info_t *remote_info; ucp_ep_params_t ep_params; + ucp_address_t *address; ucs_status_t status; - struct iovec vec[3]; - void *rkey_buffer; - void *req = NULL; + size_t buffer_size; void *buffer; group_size = rte_call(perf, group_size); group_index = rte_call(perf, group_index); - status = ucp_worker_get_address(perf->ucp.worker, &address, &address_length); - if (status != UCS_OK) { - if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) { - ucs_error("ucp_worker_get_address() failed: %s", ucs_status_string(status)); - } - goto err; - } - - info.ucp.addr_len = address_length; - info.recv_buffer = (uintptr_t)perf->recv_buffer; - - vec[0].iov_base = &info; - vec[0].iov_len = sizeof(info); - vec[1].iov_base = address; - vec[1].iov_len = address_length; - - if (features & (UCP_FEATURE_RMA|UCP_FEATURE_AMO32|UCP_FEATURE_AMO64)) { - status = ucp_rkey_pack(perf->ucp.context, perf->ucp.recv_memh, - &rkey_buffer, &info.rkey_size); - if (status != UCS_OK) { - if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) { - ucs_error("ucp_rkey_pack() failed: %s", ucs_status_string(status)); - } - ucp_worker_release_address(perf->ucp.worker, address); - goto err; - } - - vec[2].iov_base = rkey_buffer; - vec[2].iov_len = info.rkey_size; - rte_call(perf, post_vec, vec, 3, &req); - ucp_rkey_buffer_release(rkey_buffer); - } else { - info.rkey_size = 0; - rte_call(perf, post_vec, vec, 2, &req); + if (group_size != 2) { + ucs_error("perftest requires group size to be exactly 2 " + "(actual group size: %u)", group_size); + return UCS_ERR_UNSUPPORTED; } - ucp_worker_release_address(perf->ucp.worker, address); - rte_call(perf, exchange_vec, req); - - perf->ucp.peers = calloc(group_size, sizeof(*perf->uct.peers)); - if (perf->ucp.peers == NULL) { - goto err; - } + buffer_size = ADDR_BUF_SIZE * thread_count; buffer = malloc(buffer_size); if (buffer == NULL) { - 
ucs_error("Failed to allocate RTE receive buffer"); + ucs_error("failed to allocate RTE receive buffer"); status = UCS_ERR_NO_MEMORY; - goto err_destroy_eps; + goto err; } - for (i = 0; i < group_size; ++i) { - if (i == group_index) { - continue; - } + /* Initialize all endpoints and rkeys to NULL to handle error flow */ + for (i = 0; i < thread_count; i++) { + perf->ucp.tctx[i].perf.ucp.ep = NULL; + perf->ucp.tctx[i].perf.ucp.rkey = NULL; + } - rte_call(perf, recv, i, buffer, buffer_size, req); + /* receive the data from the remote peer, extract the address from it + * (along with additional wireup info) and create an endpoint to the peer */ + rte_call(perf, recv, 1 - group_index, buffer, buffer_size, req); - remote_info = buffer; - address = (void*)(remote_info + 1); - rkey_buffer = (void*)address + remote_info->ucp.addr_len; - perf->ucp.peers[i].remote_addr = remote_info->recv_buffer; + remote_info = buffer; + for (i = 0; i < thread_count; i++) { + address = (ucp_address_t*)(remote_info + 1); + rkey_buffer = UCS_PTR_BYTE_OFFSET(address, + remote_info->ucp.worker_addr_len); + perf->ucp.tctx[i].perf.ucp.remote_addr = remote_info->recv_buffer; ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; ep_params.address = address; - status = ucp_ep_create(perf->ucp.worker, &ep_params, &perf->ucp.peers[i].ep); + status = ucp_ep_create(perf->ucp.tctx[i].perf.ucp.worker, &ep_params, + &perf->ucp.tctx[i].perf.ucp.ep); if (status != UCS_OK) { if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) { ucs_error("ucp_ep_create() failed: %s", ucs_status_string(status)); } - goto err_free_buffer; + goto err_free_eps_buffer; } if (remote_info->rkey_size > 0) { - status = ucp_ep_rkey_unpack(perf->ucp.peers[i].ep, rkey_buffer, - &perf->ucp.peers[i].rkey); + status = ucp_ep_rkey_unpack(perf->ucp.tctx[i].perf.ucp.ep, rkey_buffer, + &perf->ucp.tctx[i].perf.ucp.rkey); if (status != UCS_OK) { if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) { ucs_fatal("ucp_rkey_unpack() failed: %s", 
ucs_status_string(status)); } - goto err_free_buffer; + goto err_free_eps_buffer; } } else { - perf->ucp.peers[i].rkey = NULL; + perf->ucp.tctx[i].perf.ucp.rkey = NULL; } + + remote_info = UCS_PTR_BYTE_OFFSET(remote_info, + remote_info->ucp.total_wireup_len); } free(buffer); + return UCS_OK; +err_free_eps_buffer: + ucp_perf_test_destroy_eps(perf); + free(buffer); +err: + return status; +} + +static ucs_status_t ucp_perf_test_send_local_data(ucx_perf_context_t *perf, + uint64_t features) +{ + unsigned i, j, thread_count = perf->params.thread_count; + size_t address_length = 0; + void *rkey_buffer = NULL; + void *req = NULL; + ucx_perf_ep_info_t *info; + ucp_address_t *address; + ucs_status_t status; + struct iovec *vec; + size_t rkey_size; + + if (features & (UCP_FEATURE_RMA|UCP_FEATURE_AMO32|UCP_FEATURE_AMO64)) { + status = ucp_rkey_pack(perf->ucp.context, perf->ucp.recv_memh, + &rkey_buffer, &rkey_size); + if (status != UCS_OK) { + if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) { + ucs_error("ucp_rkey_pack() failed: %s", ucs_status_string(status)); + } + goto err; + } + } else { + rkey_size = 0; + } + + /* each thread has an iovec with 3 entries to send to the remote peer: + * ep_info, worker_address and rkey buffer */ + vec = calloc(3 * thread_count, sizeof(struct iovec)); + if (vec == NULL) { + ucs_error("failed to allocate iovec"); + status = UCS_ERR_NO_MEMORY; + goto err_rkey_release; + } + + /* get the worker address created for every thread and send it to the remote + * peer */ + for (i = 0; i < thread_count; i++) { + status = ucp_worker_get_address(perf->ucp.tctx[i].perf.ucp.worker, + &address, &address_length); + if (status != UCS_OK) { + if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) { + ucs_error("ucp_worker_get_address() failed: %s", + ucs_status_string(status)); + } + goto err_free_workers_vec; + } + + vec[i * 3].iov_base = malloc(sizeof(*info)); + if (vec[i * 3].iov_base == NULL) { + ucs_error("failed to allocate vec entry for info"); + 
status = UCS_ERR_NO_MEMORY; + ucp_worker_destroy(perf->ucp.tctx[i].perf.ucp.worker); + goto err_free_workers_vec; + } + + info = vec[i * 3].iov_base; + info->ucp.worker_addr_len = address_length; + info->ucp.total_wireup_len = sizeof(*info) + address_length + rkey_size; + info->rkey_size = rkey_size; + info->recv_buffer = (uintptr_t)perf->ucp.tctx[i].perf.recv_buffer; + + vec[(i * 3) + 0].iov_len = sizeof(*info); + vec[(i * 3) + 1].iov_base = address; + vec[(i * 3) + 1].iov_len = address_length; + vec[(i * 3) + 2].iov_base = rkey_buffer; + vec[(i * 3) + 2].iov_len = info->rkey_size; + + address_length = 0; + } + + /* send to the remote peer */ + rte_call(perf, post_vec, vec, 3 * thread_count, &req); + rte_call(perf, exchange_vec, req); + + if (features & (UCP_FEATURE_RMA|UCP_FEATURE_AMO32|UCP_FEATURE_AMO64)) { + ucp_rkey_buffer_release(rkey_buffer); + } + + for (i = 0; i < thread_count; i++) { + free(vec[i * 3].iov_base); + ucp_worker_release_address(perf->ucp.tctx[i].perf.ucp.worker, + vec[(i * 3) + 1].iov_base); + } + + free(vec); + + return UCS_OK; + +err_free_workers_vec: + for (j = 0; j < i; j++) { + ucp_worker_destroy(perf->ucp.tctx[i].perf.ucp.worker); + } + free(vec); +err_rkey_release: + if (features & (UCP_FEATURE_RMA|UCP_FEATURE_AMO32|UCP_FEATURE_AMO64)) { + ucp_rkey_buffer_release(rkey_buffer); + } +err: + return status; +} + +static ucs_status_t ucp_perf_test_setup_endpoints(ucx_perf_context_t *perf, + uint64_t features) +{ + ucs_status_t status; + unsigned i; + + /* pack the local endpoints data and send to the remote peer */ + status = ucp_perf_test_send_local_data(perf, features); + if (status != UCS_OK) { + goto err; + } + + /* receive remote peer's endpoints' data and connect to them */ + status = ucp_perf_test_receive_remote_data(perf); + if (status != UCS_OK) { + goto err; + } + + /* sync status across all processes */ status = ucp_perf_test_exchange_status(perf, UCS_OK); if (status != UCS_OK) { - ucp_perf_test_destroy_eps(perf, group_size); + 
goto err_destroy_eps; } /* force wireup completion */ - status = ucp_worker_flush(perf->ucp.worker); - if (status != UCS_OK) { - ucs_warn("ucp_worker_flush() failed: %s", ucs_status_string(status)); + for (i = 0; i < perf->params.thread_count; i++) { + status = ucp_worker_flush(perf->ucp.tctx[i].perf.ucp.worker); + if (status != UCS_OK) { + ucs_warn("ucp_worker_flush() failed on theread %d: %s", + i, ucs_status_string(status)); + } } return status; -err_free_buffer: - free(buffer); err_destroy_eps: - ucp_perf_test_destroy_eps(perf, group_size); + ucp_perf_test_destroy_eps(perf); err: (void)ucp_perf_test_exchange_status(perf, status); return status; @@ -1178,75 +1408,111 @@ static ucs_status_t ucp_perf_test_setup_endpoints(ucx_perf_context_t *perf, static void ucp_perf_test_cleanup_endpoints(ucx_perf_context_t *perf) { - unsigned group_size; - ucp_perf_barrier(perf); + ucp_perf_test_destroy_eps(perf); +} - group_size = rte_call(perf, group_size); +static void ucp_perf_test_destroy_workers(ucx_perf_context_t *perf) +{ + unsigned i; - ucp_perf_test_destroy_eps(perf, group_size); + for (i = 0; i < perf->params.thread_count; i++) { + if (perf->ucp.tctx[i].perf.ucp.worker != NULL) { + ucp_worker_destroy(perf->ucp.tctx[i].perf.ucp.worker); + } + } } -static void ucx_perf_set_warmup(ucx_perf_context_t* perf, ucx_perf_params_t* params) +static void ucx_perf_set_warmup(ucx_perf_context_t* perf, + const ucx_perf_params_t* params) { - perf->max_iter = ucs_min(params->warmup_iter, ucs_div_round_up(params->max_iter, 10)); - perf->report_interval = -1; + perf->max_iter = ucs_min(params->warmup_iter, + ucs_div_round_up(params->max_iter, 10)); + perf->report_interval = ULONG_MAX; } static ucs_status_t uct_perf_create_md(ucx_perf_context_t *perf) { - uct_md_resource_desc_t *md_resources; + uct_component_h *uct_components; + uct_component_attr_t component_attr; uct_tl_resource_desc_t *tl_resources; - unsigned i, num_md_resources; - unsigned j, num_tl_resources; + unsigned md_index, 
num_components; + unsigned tl_index, num_tl_resources; + unsigned cmpt_index; ucs_status_t status; uct_md_h md; uct_md_config_t *md_config; - status = uct_query_md_resources(&md_resources, &num_md_resources); + + status = uct_query_components(&uct_components, &num_components); if (status != UCS_OK) { goto out; } - for (i = 0; i < num_md_resources; ++i) { - status = uct_md_config_read(md_resources[i].md_name, NULL, NULL, &md_config); - if (status != UCS_OK) { - goto out_release_md_resources; - } + for (cmpt_index = 0; cmpt_index < num_components; ++cmpt_index) { - status = uct_md_open(md_resources[i].md_name, md_config, &md); - uct_config_release(md_config); + component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT; + status = uct_component_query(uct_components[cmpt_index], &component_attr); if (status != UCS_OK) { - goto out_release_md_resources; + goto out_release_components_list; } - status = uct_md_query_tl_resources(md, &tl_resources, &num_tl_resources); + component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES; + component_attr.md_resources = alloca(sizeof(*component_attr.md_resources) * + component_attr.md_resource_count); + status = uct_component_query(uct_components[cmpt_index], &component_attr); if (status != UCS_OK) { - uct_md_close(md); - goto out_release_md_resources; + goto out_release_components_list; } - for (j = 0; j < num_tl_resources; ++j) { - if (!strcmp(perf->params.uct.tl_name, tl_resources[j].tl_name) && - !strcmp(perf->params.uct.dev_name, tl_resources[j].dev_name)) - { - uct_release_tl_resource_list(tl_resources); - perf->uct.md = md; - status = UCS_OK; - goto out_release_md_resources; + for (md_index = 0; md_index < component_attr.md_resource_count; ++md_index) { + status = uct_md_config_read(uct_components[cmpt_index], NULL, NULL, + &md_config); + if (status != UCS_OK) { + goto out_release_components_list; } - } - uct_md_close(md); - uct_release_tl_resource_list(tl_resources); + 
ucs_strncpy_zero(perf->params.uct.md_name, + component_attr.md_resources[md_index].md_name, + UCT_MD_NAME_MAX); + + status = uct_md_open(uct_components[cmpt_index], + component_attr.md_resources[md_index].md_name, + md_config, &md); + uct_config_release(md_config); + if (status != UCS_OK) { + goto out_release_components_list; + } + + status = uct_md_query_tl_resources(md, &tl_resources, &num_tl_resources); + if (status != UCS_OK) { + uct_md_close(md); + goto out_release_components_list; + } + + for (tl_index = 0; tl_index < num_tl_resources; ++tl_index) { + if (!strcmp(perf->params.uct.tl_name, tl_resources[tl_index].tl_name) && + !strcmp(perf->params.uct.dev_name, tl_resources[tl_index].dev_name)) + { + uct_release_tl_resource_list(tl_resources); + perf->uct.cmpt = uct_components[cmpt_index]; + perf->uct.md = md; + status = UCS_OK; + goto out_release_components_list; + } + } + + uct_md_close(md); + uct_release_tl_resource_list(tl_resources); + } } - ucs_error("Cannot use transport %s on device %s", perf->params.uct.tl_name, - perf->params.uct.dev_name); + ucs_error("Cannot use "UCT_PERF_TEST_PARAMS_FMT, + UCT_PERF_TEST_PARAMS_ARG(&perf->params)); status = UCS_ERR_NO_DEVICE; -out_release_md_resources: - uct_release_md_resource_list(md_resources); +out_release_components_list: + uct_release_component_list(uct_components); out: return status; } @@ -1260,7 +1526,11 @@ void uct_perf_barrier(ucx_perf_context_t *perf) void ucp_perf_barrier(ucx_perf_context_t *perf) { rte_call(perf, barrier, (void(*)(void*))ucp_worker_progress, - (void*)perf->ucp.worker); +#if _OPENMP + (void*)perf->ucp.tctx[omp_get_thread_num()].perf.ucp.worker); +#else + (void*)perf->ucp.tctx[0].perf.ucp.worker); +#endif } static ucs_status_t uct_perf_setup(ucx_perf_context_t *perf) @@ -1311,7 +1581,8 @@ static ucs_status_t uct_perf_setup(ucx_perf_context_t *perf) goto out_destroy_md; } - status = uct_perf_test_check_capabilities(params, perf->uct.iface); + status = 
uct_perf_test_check_capabilities(params, perf->uct.iface, + perf->uct.md); /* sync status across all processes */ status = ucp_perf_test_exchange_status(perf, status); if (status != UCS_OK) { @@ -1362,15 +1633,35 @@ static void uct_perf_cleanup(ucx_perf_context_t *perf) ucs_async_context_cleanup(&perf->uct.async); } +static void ucp_perf_request_init(void *req) +{ + ucp_perf_request_t *request = req; + + request->context = NULL; +} + static ucs_status_t ucp_perf_setup(ucx_perf_context_t *perf) { ucp_params_t ucp_params; ucp_worker_params_t worker_params; ucp_config_t *config; ucs_status_t status; + unsigned i, thread_count; + size_t message_size; + + ucp_params.field_mask = UCP_PARAM_FIELD_FEATURES | + UCP_PARAM_FIELD_REQUEST_SIZE | + UCP_PARAM_FIELD_REQUEST_INIT; + ucp_params.features = 0; + ucp_params.request_size = sizeof(ucp_perf_request_t); + ucp_params.request_init = ucp_perf_request_init; - ucp_params.field_mask = UCP_PARAM_FIELD_FEATURES; - ucp_params.features = 0; + if (perf->params.thread_count > 1) { + /* when there is more than one thread, a ucp_worker would be created for + * each. 
all of them will share the same ucp_context */ + ucp_params.features |= UCP_PARAM_FIELD_MT_WORKERS_SHARED; + ucp_params.mt_workers_shared = 1; + } status = ucp_perf_test_fill_params(&perf->params, &ucp_params); if (status != UCS_OK) { @@ -1388,19 +1679,38 @@ static ucs_status_t ucp_perf_setup(ucx_perf_context_t *perf) goto err; } - worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; - worker_params.thread_mode = perf->params.thread_mode; + thread_count = perf->params.thread_count; + message_size = ucx_perf_get_message_size(&perf->params); - status = ucp_worker_create(perf->ucp.context, &worker_params, - &perf->ucp.worker); + status = ucp_perf_test_alloc_mem(perf); if (status != UCS_OK) { + ucs_warn("ucp test failed to allocate memory"); goto err_cleanup; } - status = ucp_perf_test_alloc_mem(perf); - if (status != UCS_OK) { - ucs_warn("ucp test failed to alocate memory"); - goto err_destroy_worker; + perf->ucp.tctx = calloc(thread_count, sizeof(ucx_perf_thread_context_t)); + if (perf->ucp.tctx == NULL) { + ucs_warn("ucp test failed to allocate memory for thread contexts"); + goto err_free_mem; + } + + worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; + worker_params.thread_mode = perf->params.thread_mode; + + for (i = 0; i < thread_count; i++) { + perf->ucp.tctx[i].tid = i; + perf->ucp.tctx[i].perf = *perf; + /* Doctor the src and dst buffers to make them thread specific */ + perf->ucp.tctx[i].perf.send_buffer = + UCS_PTR_BYTE_OFFSET(perf->send_buffer, i * message_size); + perf->ucp.tctx[i].perf.recv_buffer = + UCS_PTR_BYTE_OFFSET(perf->recv_buffer, i * message_size); + + status = ucp_worker_create(perf->ucp.context, &worker_params, + &perf->ucp.tctx[i].perf.ucp.worker); + if (status != UCS_OK) { + goto err_free_tctx_destroy_workers; + } } status = ucp_perf_test_setup_endpoints(perf, ucp_params.features); @@ -1408,15 +1718,16 @@ static ucs_status_t ucp_perf_setup(ucx_perf_context_t *perf) if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) { 
ucs_error("Failed to setup endpoints: %s", ucs_status_string(status)); } - goto err_free_mem; + goto err_free_tctx_destroy_workers; } return UCS_OK; +err_free_tctx_destroy_workers: + ucp_perf_test_destroy_workers(perf); + free(perf->ucp.tctx); err_free_mem: ucp_perf_test_free_mem(perf); -err_destroy_worker: - ucp_worker_destroy(perf->ucp.worker); err_cleanup: ucp_cleanup(perf->ucp.context); err: @@ -1428,7 +1739,8 @@ static void ucp_perf_cleanup(ucx_perf_context_t *perf) ucp_perf_test_cleanup_endpoints(perf); ucp_perf_barrier(perf); ucp_perf_test_free_mem(perf); - ucp_worker_destroy(perf->ucp.worker); + ucp_perf_test_destroy_workers(perf); + free(perf->ucp.tctx); ucp_cleanup(perf->ucp.context); } @@ -1444,10 +1756,11 @@ static struct { ucp_perf_test_dispatch, ucp_perf_barrier} }; -static int ucx_perf_thread_spawn(ucx_perf_context_t *perf, - ucx_perf_result_t* result); +static ucs_status_t ucx_perf_thread_spawn(ucx_perf_context_t *perf, + ucx_perf_result_t* result); -ucs_status_t ucx_perf_run(ucx_perf_params_t *params, ucx_perf_result_t *result) +ucs_status_t ucx_perf_run(const ucx_perf_params_t *params, + ucx_perf_result_t *result) { ucx_perf_context_t *perf; ucs_status_t status; @@ -1475,11 +1788,21 @@ ucs_status_t ucx_perf_run(ucx_perf_params_t *params, ucx_perf_result_t *result) ucx_perf_test_init(perf, params); if (perf->allocator == NULL) { - ucs_error("Unsupported memory type"); + ucs_error("Unsupported memory types %s<->%s", + ucs_memory_type_names[params->send_mem_type], + ucs_memory_type_names[params->recv_mem_type]); status = UCS_ERR_UNSUPPORTED; goto out_free; } + if ((params->api == UCX_PERF_API_UCT) && + (perf->allocator->mem_type != UCS_MEMORY_TYPE_HOST)) { + ucs_warn("UCT tests also copy 2-byte values from %s memory to " + "%s memory, which may impact performance results", + ucs_memory_type_names[perf->allocator->mem_type], + ucs_memory_type_names[UCS_MEMORY_TYPE_HOST]); + } + status = perf->allocator->init(perf); if (status != UCS_OK) { goto 
out_free; @@ -1490,7 +1813,14 @@ ucs_status_t ucx_perf_run(ucx_perf_params_t *params, ucx_perf_result_t *result) goto out_free; } - if (UCS_THREAD_MODE_SINGLE == params->thread_mode) { + if (params->thread_count == 1) { + if (params->api == UCX_PERF_API_UCP) { + perf->ucp.worker = perf->ucp.tctx[0].perf.ucp.worker; + perf->ucp.ep = perf->ucp.tctx[0].perf.ucp.ep; + perf->ucp.remote_addr = perf->ucp.tctx[0].perf.ucp.remote_addr; + perf->ucp.rkey = perf->ucp.tctx[0].perf.ucp.rkey; + } + if (params->warmup_iter > 0) { ucx_perf_set_warmup(perf, params); status = ucx_perf_funcs[params->api].run(perf); @@ -1507,7 +1837,7 @@ ucs_status_t ucx_perf_run(ucx_perf_params_t *params, ucx_perf_result_t *result) ucx_perf_funcs[params->api].barrier(perf); if (status == UCS_OK) { ucx_perf_calc_result(perf, result); - rte_call(perf, report, result, perf->params.report_arg, 1); + rte_call(perf, report, result, perf->params.report_arg, 1, 0); } } else { status = ucx_perf_thread_spawn(perf, result); @@ -1522,112 +1852,121 @@ ucs_status_t ucx_perf_run(ucx_perf_params_t *params, ucx_perf_result_t *result) } #if _OPENMP -/* multiple threads sharing the same worker/iface */ - -typedef struct { - pthread_t pt; - int tid; - int ntid; - ucs_status_t* statuses; - ucx_perf_context_t perf; - ucx_perf_result_t result; -} ucx_perf_thread_context_t; - -static void* ucx_perf_thread_run_test(void* arg) +static ucs_status_t ucx_perf_thread_run_test(void* arg) { - ucx_perf_thread_context_t* tctx = (ucx_perf_thread_context_t*) arg; - ucx_perf_result_t* result = &tctx->result; - ucx_perf_context_t* perf = &tctx->perf; - ucx_perf_params_t* params = &perf->params; - ucs_status_t* statuses = tctx->statuses; - int tid = tctx->tid; - int i; + ucx_perf_thread_context_t* tctx = (ucx_perf_thread_context_t*) arg; /* a single thread context */ + ucx_perf_result_t* result = &tctx->result; + ucx_perf_context_t* perf = &tctx->perf; + ucx_perf_params_t* params = &perf->params; + ucs_status_t status; if 
(params->warmup_iter > 0) { ucx_perf_set_warmup(perf, params); - statuses[tid] = ucx_perf_funcs[params->api].run(perf); + status = ucx_perf_funcs[params->api].run(perf); ucx_perf_funcs[params->api].barrier(perf); - for (i = 0; i < tctx->ntid; i++) { - if (UCS_OK != statuses[i]) { - goto out; - } + if (UCS_OK != status) { + goto out; } ucx_perf_test_prepare_new_run(perf, params); } /* Run test */ #pragma omp barrier - statuses[tid] = ucx_perf_funcs[params->api].run(perf); + status = ucx_perf_funcs[params->api].run(perf); ucx_perf_funcs[params->api].barrier(perf); - for (i = 0; i < tctx->ntid; i++) { - if (UCS_OK != statuses[i]) { - goto out; - } - } -#pragma omp master - { - /* Assuming all threads are fairly treated, reporting only tid==0 - TODO: aggregate reports */ - ucx_perf_calc_result(perf, result); - rte_call(perf, report, result, perf->params.report_arg, 1); + if (UCS_OK != status) { + goto out; } + ucx_perf_calc_result(perf, result); + out: - return &statuses[tid]; + return status; } -static int ucx_perf_thread_spawn(ucx_perf_context_t *perf, - ucx_perf_result_t* result) +static void ucx_perf_thread_report_aggregated_results(ucx_perf_context_t *perf) { - ucx_perf_thread_context_t* tctx; + ucx_perf_thread_context_t* tctx = perf->ucp.tctx; /* all the thread contexts on perf */ + unsigned i, thread_count = perf->params.thread_count; + double lat_sum_total_avegare = 0.0; + ucx_perf_result_t agg_result; + + agg_result.iters = tctx[0].result.iters; + agg_result.bytes = tctx[0].result.bytes; + agg_result.elapsed_time = tctx[0].result.elapsed_time; + + agg_result.bandwidth.total_average = 0.0; + agg_result.bandwidth.typical = 0.0; /* Undefined since used only for latency calculations */ + agg_result.latency.total_average = 0.0; + agg_result.msgrate.total_average = 0.0; + agg_result.msgrate.typical = 0.0; /* Undefined since used only for latency calculations */ + + /* when running with multiple threads, the moment average value is + * undefined since we don't 
capture the values of the last iteration */ + agg_result.msgrate.moment_average = 0.0; + agg_result.bandwidth.moment_average = 0.0; + agg_result.latency.moment_average = 0.0; + agg_result.latency.typical = 0.0; + + /* in case of multiple threads, we have to aggregate the results so that the + * final output of the result would show the performance numbers that were + * collected from all the threads. + * BW and message rate values will be the sum of their values from all + * the threads, while the latency value is the average latency from the + * threads. */ + + for (i = 0; i < thread_count; i++) { + agg_result.bandwidth.total_average += tctx[i].result.bandwidth.total_average; + agg_result.msgrate.total_average += tctx[i].result.msgrate.total_average; + lat_sum_total_avegare += tctx[i].result.latency.total_average; + } + + agg_result.latency.total_average = lat_sum_total_avegare / thread_count; + + rte_call(perf, report, &agg_result, perf->params.report_arg, 1, 1); +} + +static ucs_status_t ucx_perf_thread_spawn(ucx_perf_context_t *perf, + ucx_perf_result_t* result) +{ + ucx_perf_thread_context_t* tctx = perf->ucp.tctx; /* all the thread contexts on perf */ + int ti, thread_count = perf->params.thread_count; ucs_status_t* statuses; - size_t message_size; ucs_status_t status; - int ti, nti; - message_size = ucx_perf_get_message_size(&perf->params); - omp_set_num_threads(perf->params.thread_count); - nti = perf->params.thread_count; + omp_set_num_threads(thread_count); - tctx = calloc(nti, sizeof(ucx_perf_thread_context_t)); - statuses = calloc(nti, sizeof(ucs_status_t)); - if ((tctx == NULL) || (statuses == NULL)) { + statuses = calloc(thread_count, sizeof(ucs_status_t)); + if (statuses == NULL) { status = UCS_ERR_NO_MEMORY; - goto out_free; + goto out; } #pragma omp parallel private(ti) { - ti = omp_get_thread_num(); - tctx[ti].tid = ti; - tctx[ti].ntid = nti; - tctx[ti].statuses = statuses; - tctx[ti].perf = *perf; - /* Doctor the src and dst buffers to make them 
thread specific */ - tctx[ti].perf.send_buffer += ti * message_size; - tctx[ti].perf.recv_buffer += ti * message_size; - tctx[ti].perf.offset = ti * message_size; - ucx_perf_thread_run_test((void*)&tctx[ti]); + ti = omp_get_thread_num(); + tctx[ti].status = ucx_perf_thread_run_test((void*)&tctx[ti]); } status = UCS_OK; - for (ti = 0; ti < nti; ti++) { - if (UCS_OK != statuses[ti]) { + for (ti = 0; ti < thread_count; ti++) { + if (UCS_OK != tctx[ti].status) { ucs_error("Thread %d failed to run test: %s", tctx[ti].tid, - ucs_status_string(statuses[ti])); - status = statuses[ti]; + ucs_status_string(tctx[ti].status)); + status = tctx[ti].status; } } -out_free: + ucx_perf_thread_report_aggregated_results(perf); + free(statuses); - free(tctx); +out: return status; } #else -static int ucx_perf_thread_spawn(ucx_perf_context_t *perf, - ucx_perf_result_t* result) { +static ucs_status_t ucx_perf_thread_spawn(ucx_perf_context_t *perf, + ucx_perf_result_t* result) { ucs_error("Invalid test parameter (thread mode requested without OpenMP capabilities)"); return UCS_ERR_INVALID_PARAM; } @@ -1636,14 +1975,18 @@ static int ucx_perf_thread_spawn(ucx_perf_context_t *perf, void ucx_perf_global_init() { static ucx_perf_allocator_t host_allocator = { + .mem_type = UCS_MEMORY_TYPE_HOST, .init = ucs_empty_function_return_success, .ucp_alloc = ucp_perf_test_alloc_host, .ucp_free = ucp_perf_test_free_host, + .uct_alloc = uct_perf_test_alloc_host, + .uct_free = uct_perf_test_free_host, + .memcpy = ucx_perf_test_memcpy_host, .memset = memset }; UCS_MODULE_FRAMEWORK_DECLARE(ucx_perftest); - ucx_perf_mem_type_allocators[UCT_MD_MEM_TYPE_HOST] = &host_allocator; + ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_HOST] = &host_allocator; /* FIXME Memtype allocator modules must be loaded to global scope, otherwise * alloc hooks, which are using dlsym() to get pointer to original function, diff --git a/src/tools/perf/lib/libperf_int.h b/src/tools/perf/lib/libperf_int.h index 96b803fd231..74592000db0 
100644 --- a/src/tools/perf/lib/libperf_int.h +++ b/src/tools/perf/lib/libperf_int.h @@ -25,31 +25,38 @@ BEGIN_C_DECLS #define TIMING_QUEUE_SIZE 2048 #define UCT_PERF_TEST_AM_ID 5 +#define ADDR_BUF_SIZE 2048 -typedef struct ucx_perf_context ucx_perf_context_t; -typedef struct uct_peer uct_peer_t; -typedef struct ucp_peer ucp_peer_t; -typedef struct ucp_perf_request ucp_perf_request_t; +typedef struct ucx_perf_context ucx_perf_context_t; +typedef struct uct_peer uct_peer_t; +typedef struct ucp_perf_request ucp_perf_request_t; +typedef struct ucx_perf_thread_context ucx_perf_thread_context_t; struct ucx_perf_allocator { + ucs_memory_type_t mem_type; ucs_status_t (*init)(ucx_perf_context_t *perf); - ucs_status_t (*ucp_alloc)(ucx_perf_context_t *perf, size_t length, + ucs_status_t (*ucp_alloc)(const ucx_perf_context_t *perf, size_t length, void **address_p, ucp_mem_h *memh, int non_blk_flag); - void (*ucp_free)(ucx_perf_context_t *perf, void *address, + void (*ucp_free)(const ucx_perf_context_t *perf, void *address, ucp_mem_h memh); - void* (*memset)(void *s, int c, size_t len); + ucs_status_t (*uct_alloc)(const ucx_perf_context_t *perf, size_t length, + unsigned flags, uct_allocated_memory_t *alloc_mem); + void (*uct_free)(const ucx_perf_context_t *perf, + uct_allocated_memory_t *alloc_mem); + void (*memcpy)(void *dst, ucs_memory_type_t dst_mem_type, + const void *src, ucs_memory_type_t src_mem_type, + size_t count); + void* (*memset)(void *dst, int value, size_t count); }; - struct ucx_perf_context { ucx_perf_params_t params; /* Buffers */ void *send_buffer; void *recv_buffer; - ptrdiff_t offset; /* Measurements */ double start_time_acc; /* accurate start time */ @@ -73,40 +80,46 @@ struct ucx_perf_context { union { struct { - ucs_async_context_t async; - uct_md_h md; - uct_worker_h worker; - uct_iface_h iface; - uct_peer_t *peers; + ucs_async_context_t async; + uct_component_h cmpt; + uct_md_h md; + uct_worker_h worker; + uct_iface_h iface; + uct_peer_t *peers; 
uct_allocated_memory_t send_mem; uct_allocated_memory_t recv_mem; - uct_iov_t *iov; + uct_iov_t *iov; } uct; struct { - ucp_context_h context; - ucp_worker_h worker; - ucp_peer_t *peers; - ucp_mem_h send_memh; - ucp_mem_h recv_memh; - ucp_dt_iov_t *send_iov; - ucp_dt_iov_t *recv_iov; + ucp_context_h context; + ucx_perf_thread_context_t* tctx; + ucp_worker_h worker; + ucp_ep_h ep; + ucp_rkey_h rkey; + unsigned long remote_addr; + ucp_mem_h send_memh; + ucp_mem_h recv_memh; + ucp_dt_iov_t *send_iov; + ucp_dt_iov_t *recv_iov; } ucp; }; }; -struct uct_peer { - uct_ep_h ep; - unsigned long remote_addr; - uct_rkey_bundle_t rkey; +struct ucx_perf_thread_context { + pthread_t pt; + int tid; + ucs_status_t status; + ucx_perf_context_t perf; + ucx_perf_result_t result; }; -struct ucp_peer { - ucp_ep_h ep; +struct uct_peer { + uct_ep_h ep; unsigned long remote_addr; - ucp_rkey_h rkey; + uct_rkey_bundle_t rkey; }; @@ -125,6 +138,9 @@ struct ucp_perf_request { void ucx_perf_test_start_clock(ucx_perf_context_t *perf); +void uct_perf_ep_flush_b(ucx_perf_context_t *perf, int peer_index); + + void uct_perf_iface_flush_b(ucx_perf_context_t *perf); @@ -155,6 +171,15 @@ static inline void ucx_perf_get_time(ucx_perf_context_t *perf) perf->current.time_acc = ucs_get_accurate_time(); } +static inline void ucx_perf_omp_barrier(ucx_perf_context_t *perf) +{ +#if _OPENMP + if (perf->params.thread_count > 1) { +#pragma omp barrier + } +#endif +} + static inline void ucx_perf_update(ucx_perf_context_t *perf, ucx_perf_counter_t iters, size_t bytes) { @@ -177,18 +202,8 @@ static inline void ucx_perf_update(ucx_perf_context_t *perf, if (perf->current.time - perf->prev.time >= perf->report_interval) { ucx_perf_get_time(perf); - /* Disable all other threads' report generation and output. - * The master clause cannot be used here as the unit test - * uct_test_perf runs on single pthreads with no parallel region, - * using that clause will result in undefined behavior. 
- */ -#if _OPENMP - if (omp_get_thread_num() == 0) -#endif /* _OPENMP */ - { - ucx_perf_calc_result(perf, &result); - rte_call(perf, report, &result, perf->params.report_arg, 0); - } + ucx_perf_calc_result(perf, &result); + rte_call(perf, report, &result, perf->params.report_arg, 0, 0); perf->prev = perf->current; } @@ -213,7 +228,6 @@ size_t ucx_perf_get_message_size(const ucx_perf_params_t *params) return length; } - END_C_DECLS #endif diff --git a/src/tools/perf/lib/ucp_tests.cc b/src/tools/perf/lib/ucp_tests.cc index 8738aca7153..febd43d2713 100644 --- a/src/tools/perf/lib/ucp_tests.cc +++ b/src/tools/perf/lib/ucp_tests.cc @@ -6,6 +6,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include extern "C" { @@ -15,12 +19,15 @@ extern "C" { } #include +#include + template class ucp_perf_test_runner { public: static const ucp_tag_t TAG = 0x1337a880u; - static const ucp_tag_t TAG_MASK = (FLAGS & UCX_PERF_TEST_FLAG_TAG_WILDCARD) ? 0 : -1; + static const ucp_tag_t TAG_MASK = (FLAGS & UCX_PERF_TEST_FLAG_TAG_WILDCARD) ? 
+ 0 : (ucp_tag_t)-1; typedef uint8_t psn_t; @@ -91,23 +98,6 @@ class ucp_perf_test_runner { ucp_worker_progress(m_perf.ucp.worker); } - ucs_status_t UCS_F_ALWAYS_INLINE wait(void *request, bool is_requestor) - { - if (ucs_likely(!UCS_PTR_IS_PTR(request))) { - return UCS_PTR_STATUS(request); - } - - while (!ucp_request_is_completed(request)) { - if (is_requestor) { - progress_requestor(); - } else { - progress_responder(); - } - } - ucp_request_release(request); - return UCS_OK; - } - ssize_t UCS_F_ALWAYS_INLINE wait_stream_recv(void *request) { size_t length; @@ -125,18 +115,42 @@ class ucp_perf_test_runner { } static void send_cb(void *request, ucs_status_t status) + { + ucp_perf_request_t *r = reinterpret_cast( + request); + ucp_perf_test_runner *test = (ucp_perf_test_runner*)r->context; + + test->op_completed(); + r->context = NULL; + ucp_request_free(request); + } + + static void tag_recv_cb(void *request, ucs_status_t status, + ucp_tag_recv_info_t *info) { ucp_perf_request_t *r = reinterpret_cast(request); - ucp_perf_test_runner *sender = (ucp_perf_test_runner*)r->context; + ucp_perf_test_runner *test; - sender->send_completed(); - ucp_request_release(request); + /* if the request is completed during tag_recv_nb(), the context is + * still NULL */ + if (r->context == NULL) { + return; + } + + test = (ucp_perf_test_runner*)r->context; + test->op_completed(); + r->context = NULL; + ucp_request_free(request); } - void UCS_F_ALWAYS_INLINE wait_window(unsigned n) + void UCS_F_ALWAYS_INLINE wait_window(unsigned n, bool is_requestor) { while (m_outstanding >= (m_max_outstanding - n + 1)) { - progress_requestor(); + if (is_requestor) { + progress_requestor(); + } else { + progress_responder(); + } } } @@ -151,7 +165,7 @@ class ucp_perf_test_runner { case UCX_PERF_CMD_TAG: case UCX_PERF_CMD_TAG_SYNC: case UCX_PERF_CMD_STREAM: - wait_window(1); + wait_window(1, true); /* coverity[switch_selector_expr_is_constant] */ switch (CMD) { case UCX_PERF_CMD_TAG: @@ -174,7 +188,7 
@@ class ucp_perf_test_runner { return UCS_PTR_STATUS(request); } reinterpret_cast(request)->context = this; - send_started(); + op_started(); return UCS_OK; case UCX_PERF_CMD_PUT: *((uint8_t*)buffer + length - 1) = sn; @@ -229,6 +243,7 @@ class ucp_perf_test_runner { switch (CMD) { case UCX_PERF_CMD_TAG: case UCX_PERF_CMD_TAG_SYNC: + wait_window(1, false); if (FLAGS & UCX_PERF_TEST_FLAG_TAG_UNEXP_PROBE) { ucp_tag_recv_info_t tag_info; while (ucp_tag_probe_nb(worker, TAG, TAG_MASK, 0, &tag_info) == NULL) { @@ -236,8 +251,18 @@ class ucp_perf_test_runner { } } request = ucp_tag_recv_nb(worker, buffer, length, datatype, TAG, TAG_MASK, - (ucp_tag_recv_callback_t)ucs_empty_function); - return wait(request, false); + tag_recv_cb); + if (ucs_likely(!UCS_PTR_IS_PTR(request))) { + return UCS_PTR_STATUS(request); + } + if (ucp_request_is_completed(request)) { + /* request is already completed and callback was called */ + ucp_request_free(request); + return UCS_OK; + } + reinterpret_cast(request)->context = this; + op_started(); + return UCS_OK; case UCX_PERF_CMD_PUT: /* coverity[switch_selector_expr_is_constant] */ switch (TYPE) { @@ -267,17 +292,27 @@ class ucp_perf_test_runner { } case UCX_PERF_CMD_STREAM: if (FLAGS & UCX_PERF_TEST_FLAG_STREAM_RECV_DATA) { - return recv_stream_data(ep, length, datatype, sn); + return recv_stream_data(ep, length, datatype); } else { - return recv_stream(ep, buffer, length, datatype, sn); + return recv_stream(ep, buffer, length, datatype); } default: return UCS_ERR_INVALID_PARAM; } } + void flush() + { + if (m_perf.params.flags & UCX_PERF_TEST_FLAG_FLUSH_EP) { + ucp_ep_flush(m_perf.ucp.ep); + } else { + ucp_worker_flush(m_perf.ucp.worker); + } + } + ucs_status_t run_pingpong() { + const psn_t unknown_psn = std::numeric_limits::max(); unsigned my_index; ucp_worker_h worker; ucp_ep_h ep; @@ -293,7 +328,12 @@ class ucp_perf_test_runner { ucp_perf_test_prepare_iov_buffers(); - m_perf.allocator->memset((char*)m_perf.recv_buffer + length - 1, -1, 
1); + if (CMD == UCX_PERF_CMD_PUT) { + m_perf.allocator->memcpy((psn_t*)m_perf.recv_buffer + length - 1, + m_perf.allocator->mem_type, + &unknown_psn, UCS_MEMORY_TYPE_HOST, + sizeof(unknown_psn)); + } ucp_perf_barrier(&m_perf); @@ -301,12 +341,14 @@ class ucp_perf_test_runner { ucx_perf_test_start_clock(&m_perf); + ucx_perf_omp_barrier(&m_perf); + send_buffer = m_perf.send_buffer; recv_buffer = m_perf.recv_buffer; worker = m_perf.ucp.worker; - ep = m_perf.ucp.peers[1 - my_index].ep; - remote_addr = m_perf.ucp.peers[1 - my_index].remote_addr + m_perf.offset; - rkey = m_perf.ucp.peers[1 - my_index].rkey; + ep = m_perf.ucp.ep; + remote_addr = m_perf.ucp.remote_addr; + rkey = m_perf.ucp.rkey; sn = 0; send_length = length; recv_length = length; @@ -333,8 +375,11 @@ class ucp_perf_test_runner { } } - wait_window(m_max_outstanding); - ucp_worker_flush(m_perf.ucp.worker); + wait_window(m_max_outstanding, true); + flush(); + + ucx_perf_omp_barrier(&m_perf); + ucx_perf_get_time(&m_perf); ucp_perf_barrier(&m_perf); return UCS_OK; @@ -363,12 +408,14 @@ class ucp_perf_test_runner { ucx_perf_test_start_clock(&m_perf); + ucx_perf_omp_barrier(&m_perf); + send_buffer = m_perf.send_buffer; recv_buffer = m_perf.recv_buffer; worker = m_perf.ucp.worker; - ep = m_perf.ucp.peers[1 - my_index].ep; - remote_addr = m_perf.ucp.peers[1 - my_index].remote_addr + m_perf.offset; - rkey = m_perf.ucp.peers[1 - my_index].rkey; + ep = m_perf.ucp.ep; + remote_addr = m_perf.ucp.remote_addr; + rkey = m_perf.ucp.rkey; sn = 0; send_length = length; recv_length = length; @@ -394,8 +441,11 @@ class ucp_perf_test_runner { } } - wait_window(m_max_outstanding); - ucp_worker_flush(m_perf.ucp.worker); + wait_window(m_max_outstanding, true); + flush(); + + ucx_perf_omp_barrier(&m_perf); + ucx_perf_get_time(&m_perf); ucp_perf_barrier(&m_perf); @@ -418,8 +468,7 @@ class ucp_perf_test_runner { private: ucs_status_t UCS_F_ALWAYS_INLINE - recv_stream_data(ucp_ep_h ep, unsigned length, ucp_datatype_t datatype, - 
uint8_t sn) + recv_stream_data(ucp_ep_h ep, unsigned length, ucp_datatype_t datatype) { void *data; size_t data_length; @@ -438,8 +487,7 @@ class ucp_perf_test_runner { } ucs_status_t UCS_F_ALWAYS_INLINE - recv_stream(ucp_ep_h ep, void *buf, unsigned length, ucp_datatype_t datatype, - uint8_t sn) + recv_stream(ucp_ep_h ep, void *buf, unsigned length, ucp_datatype_t datatype) { ssize_t total = 0; void *rreq; @@ -467,12 +515,12 @@ class ucp_perf_test_runner { return UCS_OK; } - void UCS_F_ALWAYS_INLINE send_started() + void UCS_F_ALWAYS_INLINE op_started() { ++m_outstanding; } - void UCS_F_ALWAYS_INLINE send_completed() + void UCS_F_ALWAYS_INLINE op_completed() { --m_outstanding; } diff --git a/src/tools/perf/lib/uct_tests.cc b/src/tools/perf/lib/uct_tests.cc index edfbd8ad757..3dcab19d90e 100644 --- a/src/tools/perf/lib/uct_tests.cc +++ b/src/tools/perf/lib/uct_tests.cc @@ -6,6 +6,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include extern "C" { @@ -15,6 +19,7 @@ extern "C" { #include } +#include template class uct_perf_test_runner { @@ -32,6 +37,7 @@ class uct_perf_test_runner { m_completion.count = 1; m_completion.func = NULL; + m_last_recvd_sn = 0; ucs_status_t status; uct_iface_attr_t attr; @@ -42,7 +48,7 @@ class uct_perf_test_runner { UCT_IFACE_FLAG_AM_ZCOPY)) { status = uct_iface_set_am_handler(m_perf.uct.iface, UCT_PERF_TEST_AM_ID, am_hander, - m_perf.recv_buffer, 0); + (void*)&m_last_recvd_sn, 0); ucs_assert_always(status == UCS_OK); } } @@ -96,7 +102,8 @@ class uct_perf_test_runner { start_iov_buffer_size = m_perf.params.am_hdr_size; } uct_perf_get_buffer_iov(m_perf.uct.iov, m_perf.send_buffer, - start_iov_buffer_size, m_perf.uct.send_mem.memh, + start_iov_buffer_size, + m_perf.uct.send_mem.memh, &m_perf); } } @@ -119,6 +126,54 @@ class uct_perf_test_runner { return length; } + inline void set_sn(void *dst_sn, + ucs_memory_type_t dst_mem_type, + const void *src_sn) const { + if 
(ucs_likely(m_perf.allocator->mem_type == UCS_MEMORY_TYPE_HOST)) { + ucs_assert(dst_mem_type == UCS_MEMORY_TYPE_HOST); + *reinterpret_cast(dst_sn) = *reinterpret_cast(src_sn); + } + + m_perf.allocator->memcpy(dst_sn, dst_mem_type, + src_sn, UCS_MEMORY_TYPE_HOST, + sizeof(psn_t)); + } + + inline psn_t get_sn(const volatile void *sn, + ucs_memory_type_t mem_type) const { + if (ucs_likely(mem_type == UCS_MEMORY_TYPE_HOST)) { + return *reinterpret_cast(sn); + } + + psn_t host_sn; + m_perf.allocator->memcpy(&host_sn, UCS_MEMORY_TYPE_HOST, + const_cast(sn), + mem_type, sizeof(psn_t)); + return host_sn; + } + + inline void set_recv_sn(void *recv_sn, + ucs_memory_type_t recv_mem_type, + const void *src_sn) const { + if (CMD == UCX_PERF_CMD_AM) { + ucs_assert(&m_last_recvd_sn == recv_sn); + *(psn_t*)recv_sn = *(const psn_t*)src_sn; + } else { + set_sn(recv_sn, recv_mem_type, src_sn); + } + } + + inline psn_t get_recv_sn(const volatile void *recv_sn, + ucs_memory_type_t recv_mem_type) const { + if (CMD == UCX_PERF_CMD_AM) { + /* it has to be updated after AM completion */ + ucs_assert(&m_last_recvd_sn == recv_sn); + return m_last_recvd_sn; + } else { + return get_sn(recv_sn, recv_mem_type); + } + } + void UCS_F_ALWAYS_INLINE progress_responder() { if (!ONESIDED) { uct_worker_progress(m_perf.uct.worker); @@ -139,6 +194,7 @@ class uct_perf_test_runner { static ucs_status_t am_hander(void *arg, void *data, size_t length, unsigned flags) { + /* we always assume that buffers provided by TLs are host memory */ ucs_assert(UCS_CIRCULAR_COMPARE8(*(psn_t*)arg, <=, *(psn_t*)data)); *(psn_t*)arg = *(psn_t*)data; return UCS_OK; @@ -149,10 +205,28 @@ class uct_perf_test_runner { uct_perf_test_runner *self = (uct_perf_test_runner *)arg; size_t length = ucx_perf_get_message_size(&self->m_perf.params); - memcpy(dest, self->m_perf.send_buffer, length); + self->m_perf.allocator->memcpy(/* we always assume that buffers + * provided by TLs are host memory */ + dest, UCS_MEMORY_TYPE_HOST, + 
self->m_perf.send_buffer, + self->m_perf.uct.send_mem.mem_type, + length); + return length; } + static void unpack_cb(void *arg, const void *data, size_t length) + { + uct_perf_test_runner *self = (uct_perf_test_runner *)arg; + + self->m_perf.allocator->memcpy(self->m_perf.send_buffer, + self->m_perf.uct.send_mem.mem_type, + /* we always assume that buffers + * provided by TLs are host memory */ + data, UCS_MEMORY_TYPE_HOST, + length); + } + ucs_status_t UCS_F_ALWAYS_INLINE send(uct_ep_h ep, psn_t sn, psn_t prev_sn, void *buffer, unsigned length, uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp) @@ -172,12 +246,12 @@ class uct_perf_test_runner { (char*)buffer + sizeof(am_short_hdr), length - sizeof(am_short_hdr)); case UCT_PERF_DATA_LAYOUT_BCOPY: - *(psn_t*)buffer = sn; + set_sn(buffer, m_perf.uct.send_mem.mem_type, &sn); packed_len = uct_ep_am_bcopy(ep, UCT_PERF_TEST_AM_ID, pack_cb, (void*)this, 0); return (packed_len >= 0) ? UCS_OK : (ucs_status_t)packed_len; case UCT_PERF_DATA_LAYOUT_ZCOPY: - *(psn_t*)buffer = sn; + set_sn(buffer, m_perf.uct.send_mem.mem_type, &sn); header_size = m_perf.params.am_hdr_size; return uct_ep_am_zcopy(ep, UCT_PERF_TEST_AM_ID, buffer, header_size, m_perf.uct.iov, m_perf.params.msg_size_cnt, @@ -188,7 +262,9 @@ class uct_perf_test_runner { case UCX_PERF_CMD_PUT: if (TYPE == UCX_PERF_TEST_TYPE_PINGPONG) { /* Put the control word at the latest byte of the IOV message */ - *((psn_t*)buffer + uct_perf_get_buffer_extent(&m_perf.params) - 1) = sn; + set_sn(UCS_PTR_BYTE_OFFSET(buffer, + uct_perf_get_buffer_extent(&m_perf.params) - 1), + m_perf.uct.send_mem.mem_type, &sn); } /* coverity[switch_selector_expr_is_constant] */ switch (DATA) { @@ -207,8 +283,8 @@ class uct_perf_test_runner { /* coverity[switch_selector_expr_is_constant] */ switch (DATA) { case UCT_PERF_DATA_LAYOUT_BCOPY: - return uct_ep_get_bcopy(ep, (uct_unpack_callback_t)memcpy, - buffer, length, remote_addr, rkey, comp); + return uct_ep_get_bcopy(ep, unpack_cb, 
(void*)this, + length, remote_addr, rkey, comp); case UCT_PERF_DATA_LAYOUT_ZCOPY: return uct_ep_get_zcopy(ep, m_perf.uct.iov, m_perf.params.msg_size_cnt, remote_addr, rkey, comp); @@ -285,9 +361,18 @@ class uct_perf_test_runner { }; } + void flush(int peer_index) + { + if (m_perf.params.flags & UCX_PERF_TEST_FLAG_FLUSH_EP) { + uct_perf_ep_flush_b(&m_perf, peer_index); + } else { + uct_perf_iface_flush_b(&m_perf); + } + } + ucs_status_t run_pingpong() { - psn_t send_sn, *recv_sn; + psn_t send_sn, *recv_sn, sn; unsigned my_index; uct_ep_h ep; uint64_t remote_addr; @@ -301,6 +386,8 @@ class uct_perf_test_runner { /* coverity[switch_selector_expr_is_constant] */ switch (CMD) { case UCX_PERF_CMD_AM: + recv_sn = &m_last_recvd_sn; + break; case UCX_PERF_CMD_ADD: recv_sn = (psn_t*)m_perf.recv_buffer; break; @@ -315,7 +402,9 @@ class uct_perf_test_runner { uct_perf_test_prepare_iov_buffer(); - *recv_sn = -1; + sn = std::numeric_limits::max(); + set_recv_sn(recv_sn, m_perf.uct.recv_mem.mem_type, &sn); + uct_perf_barrier(&m_perf); my_index = rte_call(&m_perf, group_index); @@ -323,7 +412,7 @@ class uct_perf_test_runner { ucx_perf_test_start_clock(&m_perf); buffer = m_perf.send_buffer; - remote_addr = m_perf.uct.peers[1 - my_index].remote_addr + m_perf.offset; + remote_addr = m_perf.uct.peers[1 - my_index].remote_addr; rkey = m_perf.uct.peers[1 - my_index].rkey.rkey; ep = m_perf.uct.peers[1 - my_index].ep; @@ -333,16 +422,21 @@ class uct_perf_test_runner { send_b(ep, send_sn, send_sn - 1, buffer, length, remote_addr, rkey, NULL); ucx_perf_update(&m_perf, 1, length); - while (*recv_sn != send_sn) { + + do { progress_responder(); - } + sn = get_recv_sn(recv_sn, m_perf.uct.recv_mem.mem_type); + } while (sn != send_sn); + ++send_sn; } } else if (my_index == 1) { UCX_PERF_TEST_FOREACH(&m_perf) { - while (*recv_sn != send_sn) { + do { progress_responder(); - } + sn = get_recv_sn(recv_sn, m_perf.uct.recv_mem.mem_type); + } while (sn != send_sn); + send_b(ep, send_sn, send_sn - 1, 
buffer, length, remote_addr, rkey, NULL); ucx_perf_update(&m_perf, 1, length); @@ -350,7 +444,7 @@ class uct_perf_test_runner { } } - uct_perf_iface_flush_b(&m_perf); + flush(1 - my_index); ucx_perf_get_time(&m_perf); return UCS_OK; } @@ -372,13 +466,16 @@ class uct_perf_test_runner { ucs_assert(length >= sizeof(psn_t)); ucs_assert(m_perf.params.uct.fc_window <= ((psn_t)-1) / 2); - memset(m_perf.send_buffer, 0, length); - memset(m_perf.recv_buffer, 0, length); + m_perf.allocator->memset(m_perf.send_buffer, 0, length); + m_perf.allocator->memset(m_perf.recv_buffer, 0, length); uct_perf_test_prepare_iov_buffer(); - recv_sn = direction_to_responder ? (psn_t*)m_perf.recv_buffer : - (psn_t*)m_perf.send_buffer; + recv_sn = (direction_to_responder ? + ((CMD == UCX_PERF_CMD_AM) ? + &m_last_recvd_sn : + (psn_t*)m_perf.recv_buffer) : + (psn_t*)m_perf.send_buffer); my_index = rte_call(&m_perf, group_index); uct_perf_barrier(&m_perf); @@ -387,7 +484,7 @@ class uct_perf_test_runner { ep = m_perf.uct.peers[1 - my_index].ep; buffer = m_perf.send_buffer; - remote_addr = m_perf.uct.peers[1 - my_index].remote_addr + m_perf.offset; + remote_addr = m_perf.uct.peers[1 - my_index].remote_addr; rkey = m_perf.uct.peers[1 - my_index].rkey.rkey; fc_window = m_perf.params.uct.fc_window; @@ -398,15 +495,19 @@ class uct_perf_test_runner { } else{ send_sn = 0; /* Remote buffer will remain 0 throughout the test */ } - *(psn_t*)buffer = send_sn; + + set_sn(buffer, m_perf.uct.send_mem.mem_type, &send_sn); UCX_PERF_TEST_FOREACH(&m_perf) { if (flow_control) { /* Wait until getting ACK from responder */ - ucs_assertv(UCS_CIRCULAR_COMPARE8(send_sn - 1, >=, *recv_sn), - "recv_sn=%d iters=%ld", *recv_sn, m_perf.current.iters); - while (UCS_CIRCULAR_COMPARE8(send_sn, >, (psn_t)(*recv_sn + fc_window))) { + sn = get_recv_sn(recv_sn, m_perf.uct.recv_mem.mem_type); + ucs_assertv(UCS_CIRCULAR_COMPARE8(send_sn - 1, >=, sn), + "recv_sn=%d iters=%ld", sn, m_perf.current.iters); + + while 
(UCS_CIRCULAR_COMPARE8(send_sn, >, sn + fc_window)) { progress_responder(); + sn = get_recv_sn(recv_sn, m_perf.uct.recv_mem.mem_type); } } @@ -427,21 +528,26 @@ class uct_perf_test_runner { } if (!flow_control) { + sn = 2; /* Send "sentinel" value */ if (direction_to_responder) { wait_for_window(send_window); - *(psn_t*)buffer = 2; + set_sn(buffer, m_perf.uct.send_mem.mem_type, &sn); send_b(ep, 2, send_sn, buffer, length, remote_addr, rkey, &m_completion); } else { - *(psn_t*)m_perf.recv_buffer = 2; + set_sn(m_perf.recv_buffer, + m_perf.uct.recv_mem.mem_type, + &sn); } } else { /* Wait for last ACK, to make sure no more messages will arrive. */ ucs_assert(direction_to_responder); - while (UCS_CIRCULAR_COMPARE8((psn_t)(send_sn - 1), >, *recv_sn)) { + + do { progress_responder(); - } + sn = get_recv_sn(recv_sn, m_perf.uct.recv_mem.mem_type); + } while (UCS_CIRCULAR_COMPARE8((psn_t)(send_sn - 1), >, sn)); } } else if (my_index == 0) { if (flow_control) { @@ -451,8 +557,9 @@ class uct_perf_test_runner { send_sn = (psn_t)-1; /* Last SN we have sent (as acknowledgment) */ ucs_assert(direction_to_responder); UCX_PERF_TEST_FOREACH(&m_perf) { - sn = *recv_sn; progress_responder(); + sn = get_recv_sn(recv_sn, m_perf.uct.recv_mem.mem_type); + if (UCS_CIRCULAR_COMPARE8(sn, >, (psn_t)(send_sn + (fc_window / 2)))) { /* Send ACK every half-window */ wait_for_window(send_window); @@ -467,16 +574,20 @@ class uct_perf_test_runner { } /* Send ACK for last packet */ - if (UCS_CIRCULAR_COMPARE8(*recv_sn, >, send_sn)) { + sn = get_recv_sn(recv_sn, m_perf.uct.recv_mem.mem_type); + if (UCS_CIRCULAR_COMPARE8(sn, >, send_sn)) { wait_for_window(send_window); - send_b(ep, *recv_sn, send_sn, buffer, length, remote_addr, + sn = get_recv_sn(recv_sn, m_perf.uct.recv_mem.mem_type); + send_b(ep, sn, send_sn, buffer, length, remote_addr, rkey, &m_completion); } } else { /* Wait for "sentinel" value */ ucs_time_t poll_time = ucs_get_time(); - while (*recv_sn != 2) { + + do { progress_responder(); + 
sn = get_recv_sn(recv_sn, m_perf.uct.recv_mem.mem_type); if (!direction_to_responder) { if (ucs_get_time() > poll_time + ucs_time_from_msec(1.0)) { wait_for_window(send_window); @@ -485,11 +596,11 @@ class uct_perf_test_runner { poll_time = ucs_get_time(); } } - } + } while (sn != 2); } } - uct_perf_iface_flush_b(&m_perf); + flush(1 - my_index); ucx_perf_get_time(&m_perf); ucs_assert(outstanding() == 0); if (my_index == 1) { @@ -552,6 +663,8 @@ class uct_perf_test_runner { const unsigned m_max_outstanding; uct_completion_t m_completion; int m_send_b_count; + /* this is only valid for UCT AM tests */ + psn_t m_last_recvd_sn; const static int N_SEND_B_PER_PROGRESS = 16; }; diff --git a/src/tools/perf/perftest.c b/src/tools/perf/perftest.c index 987c113aa93..f55206b5cbc 100644 --- a/src/tools/perf/perftest.c +++ b/src/tools/perf/perftest.c @@ -31,16 +31,17 @@ #include #include #include -#if HAVE_MPI +#if defined (HAVE_MPI) # include -#elif HAVE_RTE +#elif defined (HAVE_RTE) # include #endif #define MAX_BATCH_FILES 32 +#define MAX_CPUS 1024 #define TL_RESOURCE_NAME_NONE "" #define TEST_PARAMS_ARGS "t:n:s:W:O:w:D:i:H:oSCqM:r:T:d:x:A:BUm:" - +#define TEST_ID_UNDEFINED -1 enum { TEST_FLAG_PRINT_RESULTS = UCS_BIT(0), @@ -63,15 +64,24 @@ typedef struct test_type { ucx_perf_cmd_t command; ucx_perf_test_type_t test_type; const char *desc; + const char *overhead_lat; + unsigned window_size; } test_type_t; +typedef struct perftest_params { + ucx_perf_params_t super; + int test_id; +} perftest_params_t; + + struct perftest_context { - ucx_perf_params_t params; + perftest_params_t params; const char *server_addr; int port; int mpi; - unsigned cpu; + unsigned num_cpus; + unsigned cpus[MAX_CPUS]; unsigned flags; unsigned num_batch_files; @@ -84,73 +94,73 @@ struct perftest_context { test_type_t tests[] = { {"am_lat", UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG, - "active message latency"}, + "active message latency", "latency", 1}, {"put_lat", UCX_PERF_API_UCT, 
UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG, - "put latency"}, + "put latency", "latency", 1}, {"add_lat", UCX_PERF_API_UCT, UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_PINGPONG, - "atomic add latency"}, + "atomic add latency", "latency", 1}, {"get", UCX_PERF_API_UCT, UCX_PERF_CMD_GET, UCX_PERF_TEST_TYPE_STREAM_UNI, - "get latency / bandwidth / message rate"}, + "get latency / bandwidth / message rate", "latency", 1}, {"fadd", UCX_PERF_API_UCT, UCX_PERF_CMD_FADD, UCX_PERF_TEST_TYPE_STREAM_UNI, - "atomic fetch-and-add latency / rate"}, + "atomic fetch-and-add latency / rate", "latency", 1}, {"swap", UCX_PERF_API_UCT, UCX_PERF_CMD_SWAP, UCX_PERF_TEST_TYPE_STREAM_UNI, - "atomic swap latency / rate"}, + "atomic swap latency / rate", "latency", 1}, {"cswap", UCX_PERF_API_UCT, UCX_PERF_CMD_CSWAP, UCX_PERF_TEST_TYPE_STREAM_UNI, - "atomic compare-and-swap latency / rate"}, + "atomic compare-and-swap latency / rate", "latency", 1}, {"am_bw", UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, - "active message bandwidth / message rate"}, + "active message bandwidth / message rate", "overhead", 1}, {"put_bw", UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI, - "put bandwidth / message rate"}, + "put bandwidth / message rate", "overhead", 1}, {"add_mr", UCX_PERF_API_UCT, UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_STREAM_UNI, - "atomic add message rate"}, + "atomic add message rate", "overhead", 1}, {"tag_lat", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_PINGPONG, - "tag match latency"}, + "tag match latency", "latency", 1}, {"tag_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_STREAM_UNI, - "tag match bandwidth"}, + "tag match bandwidth", "overhead", 32}, {"tag_sync_lat", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG_SYNC, UCX_PERF_TEST_TYPE_PINGPONG, - "tag sync match latency"}, + "tag sync match latency", "latency", 1}, {"tag_sync_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG_SYNC, UCX_PERF_TEST_TYPE_STREAM_UNI, - "tag sync match bandwidth"}, + 
"tag sync match bandwidth", "overhead", 32}, {"ucp_put_lat", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG, - "put latency"}, + "put latency", "latency", 1}, {"ucp_put_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI, - "put bandwidth"}, + "put bandwidth", "overhead", 32}, {"ucp_get", UCX_PERF_API_UCP, UCX_PERF_CMD_GET, UCX_PERF_TEST_TYPE_STREAM_UNI, - "get latency / bandwidth / message rate"}, + "get latency / bandwidth / message rate", "latency", 1}, {"ucp_add", UCX_PERF_API_UCP, UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_STREAM_UNI, - "atomic add bandwidth / message rate"}, + "atomic add bandwidth / message rate", "overhead", 1}, {"ucp_fadd", UCX_PERF_API_UCP, UCX_PERF_CMD_FADD, UCX_PERF_TEST_TYPE_STREAM_UNI, - "atomic fetch-and-add latency / bandwidth / rate"}, + "atomic fetch-and-add latency / bandwidth / rate", "latency", 1}, {"ucp_swap", UCX_PERF_API_UCP, UCX_PERF_CMD_SWAP, UCX_PERF_TEST_TYPE_STREAM_UNI, - "atomic swap latency / bandwidth / rate"}, + "atomic swap latency / bandwidth / rate", "latency", 1}, {"ucp_cswap", UCX_PERF_API_UCP, UCX_PERF_CMD_CSWAP, UCX_PERF_TEST_TYPE_STREAM_UNI, - "atomic compare-and-swap latency / bandwidth / rate"}, + "atomic compare-and-swap latency / bandwidth / rate", "latency", 1}, {"stream_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_STREAM, UCX_PERF_TEST_TYPE_STREAM_UNI, - "stream bandwidth"}, + "stream bandwidth", "overhead", 1}, {"stream_lat", UCX_PERF_API_UCP, UCX_PERF_CMD_STREAM, UCX_PERF_TEST_TYPE_PINGPONG, - "stream latency"}, + "stream latency", "latency", 1}, {NULL} }; @@ -195,7 +205,9 @@ static int sock_io(int sock, ssize_t (*sock_call)(int, void *, size_t, int), static int safe_send(int sock, void *data, size_t size, void (*progress)(void *arg), void *arg) { - return sock_io(sock, (void*)send, POLLOUT, data, size, progress, arg, "send"); + typedef ssize_t (*sock_call)(int, void *, size_t, int); + + return sock_io(sock, (sock_call)send, POLLOUT, data, size, progress, arg, "send"); } static 
int safe_recv(int sock, void *data, size_t size, @@ -206,11 +218,11 @@ static int safe_recv(int sock, void *data, size_t size, static void print_progress(char **test_names, unsigned num_names, const ucx_perf_result_t *result, unsigned flags, - int final) + int final, int is_server, int is_multi_thread) { - static const char *fmt_csv = "%.0f,%.3f,%.3f,%.3f,%.2f,%.2f,%.0f,%.0f\n"; - static const char *fmt_numeric = "%'14.0f %9.3f %9.3f %9.3f %10.2f %10.2f %'11.0f %'11.0f\n"; - static const char *fmt_plain = "%14.0f %9.3f %9.3f %9.3f %10.2f %10.2f %11.0f %11.0f\n"; + static const char *fmt_csv; + static const char *fmt_numeric; + static const char *fmt_plain; unsigned i; if (!(flags & TEST_FLAG_PRINT_RESULTS) || @@ -225,63 +237,89 @@ static void print_progress(char **test_names, unsigned num_names, } } - printf((flags & TEST_FLAG_PRINT_CSV) ? fmt_csv : - (flags & TEST_FLAG_NUMERIC_FMT) ? fmt_numeric : - fmt_plain, - (double)result->iters, - result->latency.typical * 1000000.0, - result->latency.moment_average * 1000000.0, - result->latency.total_average * 1000000.0, - result->bandwidth.moment_average / (1024.0 * 1024.0), - result->bandwidth.total_average / (1024.0 * 1024.0), - result->msgrate.moment_average, - result->msgrate.total_average); +#if _OPENMP + if (!final) { + printf("[thread %d]", omp_get_thread_num()); + } else if (flags & TEST_FLAG_PRINT_RESULTS) { + printf("Final: "); + } +#endif + + if (is_multi_thread && final) { + fmt_csv = "%4.0f,%.3f,%.2f,%.0f\n"; + fmt_numeric = "%'18.0f %29.3f %22.2f %'24.0f\n"; + fmt_plain = "%18.0f %29.3f %22.2f %23.0f\n"; + + printf((flags & TEST_FLAG_PRINT_CSV) ? fmt_csv : + (flags & TEST_FLAG_NUMERIC_FMT) ? 
fmt_numeric : + fmt_plain, + (double)result->iters, + result->latency.total_average * 1000000.0, + result->bandwidth.total_average / (1024.0 * 1024.0), + result->msgrate.total_average); + } else { + fmt_csv = "%4.0f,%.3f,%.3f,%.3f,%.2f,%.2f,%.0f,%.0f\n"; + fmt_numeric = "%'18.0f %9.3f %9.3f %9.3f %11.2f %10.2f %'11.0f %'11.0f\n"; + fmt_plain = "%18.0f %9.3f %9.3f %9.3f %11.2f %10.2f %11.0f %11.0f\n"; + + printf((flags & TEST_FLAG_PRINT_CSV) ? fmt_csv : + (flags & TEST_FLAG_NUMERIC_FMT) ? fmt_numeric : + fmt_plain, + (double)result->iters, + result->latency.typical * 1000000.0, + result->latency.moment_average * 1000000.0, + result->latency.total_average * 1000000.0, + result->bandwidth.moment_average / (1024.0 * 1024.0), + result->bandwidth.total_average / (1024.0 * 1024.0), + result->msgrate.moment_average, + result->msgrate.total_average); + } + fflush(stdout); } static void print_header(struct perftest_context *ctx) { - const char *test_api_str; + const char *overhead_lat_str; const char *test_data_str; + const char *test_api_str; test_type_t *test; unsigned i; - if (ctx->flags & TEST_FLAG_PRINT_TEST) { - for (test = tests; test->name; ++test) { - if ((test->command == ctx->params.command) && (test->test_type == ctx->params.test_type)) { + test = (ctx->params.test_id == TEST_ID_UNDEFINED) ? 
NULL : + &tests[ctx->params.test_id]; + + if ((ctx->flags & TEST_FLAG_PRINT_TEST) && (test != NULL)) { + if (test->api == UCX_PERF_API_UCT) { + test_api_str = "transport layer"; + switch (ctx->params.super.uct.data_layout) { + case UCT_PERF_DATA_LAYOUT_SHORT: + test_data_str = "short"; + break; + case UCT_PERF_DATA_LAYOUT_BCOPY: + test_data_str = "bcopy"; + break; + case UCT_PERF_DATA_LAYOUT_ZCOPY: + test_data_str = "zcopy"; + break; + default: + test_data_str = "(undefined)"; break; } + } else if (test->api == UCX_PERF_API_UCP) { + test_api_str = "protocol layer"; + test_data_str = "(automatic)"; /* TODO contig/stride/stream */ + } else { + return; } - if (test->name != NULL) { - if (test->api == UCX_PERF_API_UCT) { - test_api_str = "transport layer"; - switch (ctx->params.uct.data_layout) { - case UCT_PERF_DATA_LAYOUT_SHORT: - test_data_str = "short"; - break; - case UCT_PERF_DATA_LAYOUT_BCOPY: - test_data_str = "bcopy"; - break; - case UCT_PERF_DATA_LAYOUT_ZCOPY: - test_data_str = "zcopy"; - break; - default: - test_data_str = "(undefined)"; - break; - } - } else if (test->api == UCX_PERF_API_UCP) { - test_api_str = "protocol layer"; - test_data_str = "(automatic)"; /* TODO contig/stride/stream */ - } else { - return; - } - printf("+------------------------------------------------------------------------------------------+\n"); - printf("| API: %-60s |\n", test_api_str); - printf("| Test: %-60s |\n", test->desc); - printf("| Data layout: %-60s |\n", test_data_str); - printf("| Message size: %-60zu |\n", ucx_perf_get_message_size(&ctx->params)); - } + printf("+------------------------------------------------------------------------------------------+\n"); + printf("| API: %-60s |\n", test_api_str); + printf("| Test: %-60s |\n", test->desc); + printf("| Data layout: %-60s |\n", test_data_str); + printf("| Send memory: %-60s |\n", ucs_memory_type_names[ctx->params.super.send_mem_type]); + printf("| Recv memory: %-60s |\n", 
ucs_memory_type_names[ctx->params.super.recv_mem_type]); + printf("| Message size: %-60zu |\n", ucx_perf_get_message_size(&ctx->params.super)); } if (ctx->flags & TEST_FLAG_PRINT_CSV) { @@ -293,11 +331,13 @@ static void print_header(struct perftest_context *ctx) } } else { if (ctx->flags & TEST_FLAG_PRINT_RESULTS) { - printf("+--------------+-----------------------------+---------------------+-----------------------+\n"); - printf("| | latency (usec) | bandwidth (MB/s) | message rate (msg/s) |\n"); - printf("+--------------+---------+---------+---------+----------+----------+-----------+-----------+\n"); - printf("| # iterations | typical | average | overall | average | overall | average | overall |\n"); - printf("+--------------+---------+---------+---------+----------+----------+-----------+-----------+\n"); + overhead_lat_str = (test == NULL) ? "overhead" : test->overhead_lat; + + printf("+--------------+--------------+-----------------------------+---------------------+-----------------------+\n"); + printf("| | | %8s (usec) | bandwidth (MB/s) | message rate (msg/s) |\n", overhead_lat_str); + printf("+--------------+--------------+---------+---------+---------+----------+----------+-----------+-----------+\n"); + printf("| Stage | # iterations | typical | average | overall | average | overall | average | overall |\n"); + printf("+--------------+--------------+---------+---------+---------+----------+----------+-----------+-----------+\n"); } else if (ctx->flags & TEST_FLAG_PRINT_TEST) { printf("+------------------------------------------------------------------------------------------+\n"); } @@ -328,6 +368,18 @@ static void print_test_name(struct perftest_context *ctx) } } +static void print_memory_type_usage(void) +{ + ucs_memory_type_t it; + for (it = UCS_MEMORY_TYPE_HOST; it < UCS_MEMORY_TYPE_LAST; it++) { + if (ucx_perf_mem_type_allocators[it] != NULL) { + printf(" %s - %s\n", + ucs_memory_type_names[it], + ucs_memory_type_descs[it]); + } + } +} + static 
void usage(const struct perftest_context *ctx, const char *program) { static const char* api_names[] = { @@ -337,17 +389,17 @@ static void usage(const struct perftest_context *ctx, const char *program) test_type_t *test; int UCS_V_UNUSED rank; -#if HAVE_MPI +#ifdef HAVE_MPI MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (ctx->mpi && (rank != 0)) { return; } #endif -#if HAVE_MPI +#if defined (HAVE_MPI) printf(" Note: test can be also launched as an MPI application\n"); printf("\n"); -#elif HAVE_RTE +#elif defined (HAVE_RTE) printf(" Note: this test can be also launched as an libRTE application\n"); printf("\n"); #endif @@ -361,24 +413,26 @@ static void usage(const struct perftest_context *ctx, const char *program) } printf("\n"); printf(" -s list of scatter-gather sizes for single message (%zu)\n", - ctx->params.msg_size_list[0]); + ctx->params.super.msg_size_list[0]); printf(" for example: \"-s 16,48,8192,8192,14\"\n"); - printf(" -n number of iterations to run (%ld)\n", ctx->params.max_iter); + printf(" -m [,]\n"); + printf(" memory type of message for sender and receiver (host)\n"); + print_memory_type_usage(); + printf(" -n number of iterations to run (%ld)\n", ctx->params.super.max_iter); printf(" -w number of warm-up iterations (%zu)\n", - ctx->params.warmup_iter); - printf(" -c set affinity to this CPU (off)\n"); - printf(" -O maximal number of uncompleted outstanding sends (%u)\n", - ctx->params.max_outstanding); + ctx->params.super.warmup_iter); + printf(" -c set affinity to this CPU list (separated by comma) (off)\n"); + printf(" -O maximal number of uncompleted outstanding sends\n"); printf(" -i distance between consecutive scatter-gather entries (%zu)\n", - ctx->params.iov_stride); - printf(" -T number of threads in the test (%d), if >1 implies \"-M multi\"\n", - ctx->params.thread_count); + ctx->params.super.iov_stride); + printf(" -T number of threads in the test (%d)\n", + ctx->params.super.thread_count); printf(" -B register memory with NONBLOCK flag\n"); 
printf(" -b read and execute tests from a batch file: every line in the\n"); printf(" file is a test to run, first word is test name, the rest of\n"); printf(" the line is command-line arguments for the test.\n"); printf(" -p TCP port to use for data exchange (%d)\n", ctx->port); -#if HAVE_MPI +#ifdef HAVE_MPI printf(" -P <0|1> disable/enable MPI mode (%d)\n", ctx->mpi); #endif printf(" -h show this help message\n"); @@ -397,9 +451,9 @@ static void usage(const struct perftest_context *ctx, const char *program) printf(" zcopy - zero-copy (cannot be used for atomics)\n"); printf(" iov - scatter-gather list (iovec)\n"); printf(" -W flow control window size, for active messages (%u)\n", - ctx->params.uct.fc_window); + ctx->params.super.uct.fc_window); printf(" -H active message header size (%zu)\n", - ctx->params.am_hdr_size); + ctx->params.super.am_hdr_size); printf(" -A asynchronous progress mode (thread_spinlock)\n"); printf(" thread_spinlock - separate progress thread with spin locking\n"); printf(" thread_mutex - separate progress thread with mutex locking\n"); @@ -419,27 +473,13 @@ static void usage(const struct perftest_context *ctx, const char *program) printf(" -r receive mode for stream tests (recv)\n"); printf(" recv : Use ucp_stream_recv_nb\n"); printf(" recv_data : Use ucp_stream_recv_data_nb\n"); - printf(" -m memory type of messages\n"); - printf(" host - system memory(default)\n"); - if (ucx_perf_mem_type_allocators[UCT_MD_MEM_TYPE_CUDA] != NULL) { - printf(" cuda - NVIDIA GPU memory\n"); - } - if (ucx_perf_mem_type_allocators[UCT_MD_MEM_TYPE_CUDA_MANAGED] != NULL) { - printf(" cuda-managed - NVIDIA cuda managed/unified memory\n"); - } printf("\n"); printf(" NOTE: When running UCP tests, transport and device should be specified by\n"); printf(" environment variables: UCX_TLS and UCX_[SELF|SHM|NET]_DEVICES.\n"); printf("\n"); } -static const char *__basename(const char *path) -{ - const char *p = strrchr(path, '/'); - return (p == NULL) ? 
path : (p + 1); -} - -static ucs_status_t parse_ucp_datatype_params(const char *optarg, +static ucs_status_t parse_ucp_datatype_params(const char *opt_arg, ucp_perf_datatype_t *datatype) { const char *iov_type = "iov"; @@ -447,9 +487,9 @@ static ucs_status_t parse_ucp_datatype_params(const char *optarg, const char *contig_type = "contig"; const size_t contig_type_size = strlen("contig"); - if (0 == strncmp(optarg, iov_type, iov_type_size)) { + if (0 == strncmp(opt_arg, iov_type, iov_type_size)) { *datatype = UCP_PERF_DATATYPE_IOV; - } else if (0 == strncmp(optarg, contig_type, contig_type_size)) { + } else if (0 == strncmp(opt_arg, contig_type, contig_type_size)) { *datatype = UCP_PERF_DATATYPE_CONTIG; } else { return UCS_ERR_INVALID_PARAM; @@ -458,14 +498,49 @@ static ucs_status_t parse_ucp_datatype_params(const char *optarg, return UCS_OK; } -static ucs_status_t parse_message_sizes_params(const char *optarg, +static ucs_status_t parse_mem_type(const char *opt_arg, + ucs_memory_type_t *mem_type) +{ + ucs_memory_type_t it; + for (it = UCS_MEMORY_TYPE_HOST; it < UCS_MEMORY_TYPE_LAST; it++) { + if(!strcmp(opt_arg, ucs_memory_type_names[it]) && + (ucx_perf_mem_type_allocators[it] != NULL)) { + *mem_type = it; + return UCS_OK; + } + } + ucs_error("Unsupported memory type: \"%s\"", opt_arg); + return UCS_ERR_INVALID_PARAM; +} + +static ucs_status_t parse_mem_type_params(const char *opt_arg, + ucs_memory_type_t *send_mem_type, + ucs_memory_type_t *recv_mem_type) +{ + const char *delim = ","; + char *token = strtok((char*)opt_arg, delim); + + if (UCS_OK != parse_mem_type(token, send_mem_type)) { + return UCS_ERR_INVALID_PARAM; + } + + token = strtok(NULL, delim); + if (NULL == token) { + *recv_mem_type = *send_mem_type; + return UCS_OK; + } else { + return parse_mem_type(token, recv_mem_type); + } +} + +static ucs_status_t parse_message_sizes_params(const char *opt_arg, ucx_perf_params_t *params) { - char *optarg_ptr, *optarg_ptr2; - size_t token_num, token_it; const char 
delim = ','; + size_t *msg_size_list, token_num, token_it; + char *optarg_ptr, *optarg_ptr2; - optarg_ptr = (char *)optarg; + optarg_ptr = (char *)opt_arg; token_num = 0; /* count the number of given message sizes */ while ((optarg_ptr = strchr(optarg_ptr, delim)) != NULL) { @@ -474,13 +549,15 @@ static ucs_status_t parse_message_sizes_params(const char *optarg, } ++token_num; - params->msg_size_list = realloc(params->msg_size_list, - sizeof(*params->msg_size_list) * token_num); - if (NULL == params->msg_size_list) { + msg_size_list = realloc(params->msg_size_list, + sizeof(*params->msg_size_list) * token_num); + if (NULL == msg_size_list) { return UCS_ERR_NO_MEMORY; } - optarg_ptr = (char *)optarg; + params->msg_size_list = msg_size_list; + + optarg_ptr = (char *)opt_arg; errno = 0; for (token_it = 0; token_it < token_num; ++token_it) { params->msg_size_list[token_it] = strtoul(optarg_ptr, &optarg_ptr2, 10); @@ -499,191 +576,215 @@ static ucs_status_t parse_message_sizes_params(const char *optarg, return UCS_OK; } -static void init_test_params(ucx_perf_params_t *params) +static ucs_status_t init_test_params(perftest_params_t *params) { memset(params, 0, sizeof(*params)); - params->api = UCX_PERF_API_LAST; - params->command = UCX_PERF_CMD_LAST; - params->test_type = UCX_PERF_TEST_TYPE_LAST; - params->thread_mode = UCS_THREAD_MODE_SINGLE; - params->thread_count = 1; - params->async_mode = UCS_ASYNC_THREAD_LOCK_TYPE; - params->wait_mode = UCX_PERF_WAIT_MODE_LAST; - params->max_outstanding = 1; - params->warmup_iter = 10000; - params->am_hdr_size = 8; - params->alignment = ucs_get_page_size(); - params->max_iter = 1000000l; - params->max_time = 0.0; - params->report_interval = 1.0; - params->flags = UCX_PERF_TEST_FLAG_VERBOSE; - params->uct.fc_window = UCT_PERF_TEST_MAX_FC_WINDOW; - params->uct.data_layout = UCT_PERF_DATA_LAYOUT_SHORT; - params->mem_type = UCT_MD_MEM_TYPE_HOST; - params->msg_size_cnt = 1; - params->iov_stride = 0; - params->ucp.send_datatype = 
UCP_PERF_DATATYPE_CONTIG; - params->ucp.recv_datatype = UCP_PERF_DATATYPE_CONTIG; - strcpy(params->uct.dev_name, TL_RESOURCE_NAME_NONE); - strcpy(params->uct.tl_name, TL_RESOURCE_NAME_NONE); - - params->msg_size_list = malloc(sizeof(*params->msg_size_list) * - params->msg_size_cnt); - params->msg_size_list[0] = 8; + params->super.api = UCX_PERF_API_LAST; + params->super.command = UCX_PERF_CMD_LAST; + params->super.test_type = UCX_PERF_TEST_TYPE_LAST; + params->super.thread_mode = UCS_THREAD_MODE_SINGLE; + params->super.thread_count = 1; + params->super.async_mode = UCS_ASYNC_THREAD_LOCK_TYPE; + params->super.wait_mode = UCX_PERF_WAIT_MODE_LAST; + params->super.max_outstanding = 0; + params->super.warmup_iter = 10000; + params->super.am_hdr_size = 8; + params->super.alignment = ucs_get_page_size(); + params->super.max_iter = 1000000l; + params->super.max_time = 0.0; + params->super.report_interval = 1.0; + params->super.flags = UCX_PERF_TEST_FLAG_VERBOSE; + params->super.uct.fc_window = UCT_PERF_TEST_MAX_FC_WINDOW; + params->super.uct.data_layout = UCT_PERF_DATA_LAYOUT_SHORT; + params->super.send_mem_type = UCS_MEMORY_TYPE_HOST; + params->super.recv_mem_type = UCS_MEMORY_TYPE_HOST; + params->super.msg_size_cnt = 1; + params->super.iov_stride = 0; + params->super.ucp.send_datatype = UCP_PERF_DATATYPE_CONTIG; + params->super.ucp.recv_datatype = UCP_PERF_DATATYPE_CONTIG; + strcpy(params->super.uct.dev_name, TL_RESOURCE_NAME_NONE); + strcpy(params->super.uct.tl_name, TL_RESOURCE_NAME_NONE); + + params->super.msg_size_list = calloc(params->super.msg_size_cnt, + sizeof(*params->super.msg_size_list)); + if (params->super.msg_size_list == NULL) { + return UCS_ERR_NO_MEMORY; + } + + params->super.msg_size_list[0] = 8; + params->test_id = TEST_ID_UNDEFINED; + + return UCS_OK; } -static ucs_status_t parse_test_params(ucx_perf_params_t *params, char opt, const char *optarg) +static ucs_status_t parse_test_params(perftest_params_t *params, char opt, + const char *opt_arg) { - 
test_type_t *test; char *optarg2 = NULL; + test_type_t *test; + unsigned i; switch (opt) { case 'd': - ucs_snprintf_zero(params->uct.dev_name, sizeof(params->uct.dev_name), - "%s", optarg); + ucs_snprintf_zero(params->super.uct.dev_name, + sizeof(params->super.uct.dev_name), "%s", opt_arg); return UCS_OK; case 'x': - ucs_snprintf_zero(params->uct.tl_name, sizeof(params->uct.tl_name), - "%s", optarg); + ucs_snprintf_zero(params->super.uct.tl_name, + sizeof(params->super.uct.tl_name), "%s", opt_arg); return UCS_OK; case 't': - for (test = tests; test->name; ++test) { - if (!strcmp(optarg, test->name)) { - params->api = test->api; - params->command = test->command; - params->test_type = test->test_type; + for (i = 0; tests[i].name != NULL; ++i) { + test = &tests[i]; + if (!strcmp(opt_arg, test->name)) { + params->super.api = test->api; + params->super.command = test->command; + params->super.test_type = test->test_type; + params->test_id = i; break; } } - if (test->name == NULL) { + if (params->test_id == TEST_ID_UNDEFINED) { ucs_error("Invalid option argument for -t"); return UCS_ERR_INVALID_PARAM; } return UCS_OK; case 'D': - if (!strcmp(optarg, "short")) { - params->uct.data_layout = UCT_PERF_DATA_LAYOUT_SHORT; - } else if (!strcmp(optarg, "bcopy")) { - params->uct.data_layout = UCT_PERF_DATA_LAYOUT_BCOPY; - } else if (!strcmp(optarg, "zcopy")) { - params->uct.data_layout = UCT_PERF_DATA_LAYOUT_ZCOPY; - } else if (UCS_OK == parse_ucp_datatype_params(optarg, - ¶ms->ucp.send_datatype)) { - optarg2 = strchr(optarg, ','); + if (!strcmp(opt_arg, "short")) { + params->super.uct.data_layout = UCT_PERF_DATA_LAYOUT_SHORT; + } else if (!strcmp(opt_arg, "bcopy")) { + params->super.uct.data_layout = UCT_PERF_DATA_LAYOUT_BCOPY; + } else if (!strcmp(opt_arg, "zcopy")) { + params->super.uct.data_layout = UCT_PERF_DATA_LAYOUT_ZCOPY; + } else if (UCS_OK == parse_ucp_datatype_params(opt_arg, + ¶ms->super.ucp.send_datatype)) { + optarg2 = strchr(opt_arg, ','); if (optarg2) { if 
(UCS_OK != parse_ucp_datatype_params(optarg2 + 1, - ¶ms->ucp.recv_datatype)) { - return -1; + ¶ms->super.ucp.recv_datatype)) { + return UCS_ERR_INVALID_PARAM; } } } else { ucs_error("Invalid option argument for -D"); - return -1; + return UCS_ERR_INVALID_PARAM; } return UCS_OK; case 'i': - params->iov_stride = atol(optarg); + params->super.iov_stride = atol(opt_arg); return UCS_OK; case 'n': - params->max_iter = atol(optarg); + params->super.max_iter = atol(opt_arg); return UCS_OK; case 's': - return parse_message_sizes_params(optarg, params); + return parse_message_sizes_params(opt_arg, ¶ms->super); case 'H': - params->am_hdr_size = atol(optarg); + params->super.am_hdr_size = atol(opt_arg); return UCS_OK; case 'W': - params->uct.fc_window = atoi(optarg); + params->super.uct.fc_window = atoi(opt_arg); return UCS_OK; case 'O': - params->max_outstanding = atoi(optarg); + params->super.max_outstanding = atoi(opt_arg); return UCS_OK; case 'w': - params->warmup_iter = atol(optarg); + params->super.warmup_iter = atol(opt_arg); return UCS_OK; case 'o': - params->flags |= UCX_PERF_TEST_FLAG_ONE_SIDED; + params->super.flags |= UCX_PERF_TEST_FLAG_ONE_SIDED; return UCS_OK; case 'B': - params->flags |= UCX_PERF_TEST_FLAG_MAP_NONBLOCK; + params->super.flags |= UCX_PERF_TEST_FLAG_MAP_NONBLOCK; return UCS_OK; case 'q': - params->flags &= ~UCX_PERF_TEST_FLAG_VERBOSE; + params->super.flags &= ~UCX_PERF_TEST_FLAG_VERBOSE; return UCS_OK; case 'C': - params->flags |= UCX_PERF_TEST_FLAG_TAG_WILDCARD; + params->super.flags |= UCX_PERF_TEST_FLAG_TAG_WILDCARD; return UCS_OK; case 'U': - params->flags |= UCX_PERF_TEST_FLAG_TAG_UNEXP_PROBE; + params->super.flags |= UCX_PERF_TEST_FLAG_TAG_UNEXP_PROBE; return UCS_OK; case 'M': - if (!strcmp(optarg, "single")) { - params->thread_mode = UCS_THREAD_MODE_SINGLE; + if (!strcmp(opt_arg, "single")) { + params->super.thread_mode = UCS_THREAD_MODE_SINGLE; return UCS_OK; - } else if (!strcmp(optarg, "serialized")) { - params->thread_mode = 
UCS_THREAD_MODE_SERIALIZED; + } else if (!strcmp(opt_arg, "serialized")) { + params->super.thread_mode = UCS_THREAD_MODE_SERIALIZED; return UCS_OK; - } else if (!strcmp(optarg, "multi")) { - params->thread_mode = UCS_THREAD_MODE_MULTI; + } else if (!strcmp(opt_arg, "multi")) { + params->super.thread_mode = UCS_THREAD_MODE_MULTI; return UCS_OK; } else { ucs_error("Invalid option argument for -M"); return UCS_ERR_INVALID_PARAM; } case 'T': - params->thread_count = atoi(optarg); - params->thread_mode = UCS_THREAD_MODE_MULTI; + params->super.thread_count = atoi(opt_arg); return UCS_OK; case 'A': - if (!strcmp(optarg, "thread") || !strcmp(optarg, "thread_spinlock")) { - params->async_mode = UCS_ASYNC_MODE_THREAD_SPINLOCK; + if (!strcmp(opt_arg, "thread") || !strcmp(opt_arg, "thread_spinlock")) { + params->super.async_mode = UCS_ASYNC_MODE_THREAD_SPINLOCK; return UCS_OK; - } else if (!strcmp(optarg, "thread_mutex")) { - params->async_mode = UCS_ASYNC_MODE_THREAD_MUTEX; + } else if (!strcmp(opt_arg, "thread_mutex")) { + params->super.async_mode = UCS_ASYNC_MODE_THREAD_MUTEX; return UCS_OK; - } else if (!strcmp(optarg, "signal")) { - params->async_mode = UCS_ASYNC_MODE_SIGNAL; + } else if (!strcmp(opt_arg, "signal")) { + params->super.async_mode = UCS_ASYNC_MODE_SIGNAL; return UCS_OK; } else { ucs_error("Invalid option argument for -A"); return UCS_ERR_INVALID_PARAM; } case 'r': - if (!strcmp(optarg, "recv_data")) { - params->flags |= UCX_PERF_TEST_FLAG_STREAM_RECV_DATA; + if (!strcmp(opt_arg, "recv_data")) { + params->super.flags |= UCX_PERF_TEST_FLAG_STREAM_RECV_DATA; return UCS_OK; - } else if (!strcmp(optarg, "recv")) { - params->flags &= ~UCX_PERF_TEST_FLAG_STREAM_RECV_DATA; + } else if (!strcmp(opt_arg, "recv")) { + params->super.flags &= ~UCX_PERF_TEST_FLAG_STREAM_RECV_DATA; return UCS_OK; } return UCS_ERR_INVALID_PARAM; case 'm': - if (!strcmp(optarg, "host")) { - params->mem_type = UCT_MD_MEM_TYPE_HOST; - return UCS_OK; - } else if (!strcmp(optarg, "cuda") && - 
(ucx_perf_mem_type_allocators[UCT_MD_MEM_TYPE_CUDA] != NULL)) { - params->mem_type = UCT_MD_MEM_TYPE_CUDA; - return UCS_OK; - } else if (!strcmp(optarg, "cuda-managed") && - (ucx_perf_mem_type_allocators[UCT_MD_MEM_TYPE_CUDA_MANAGED] != NULL)) { - params->mem_type = UCT_MD_MEM_TYPE_CUDA_MANAGED; - return UCS_OK; + if (UCS_OK != parse_mem_type_params(opt_arg, + ¶ms->super.send_mem_type, + ¶ms->super.recv_mem_type)) { + return UCS_ERR_INVALID_PARAM; } - return UCS_ERR_INVALID_PARAM; + return UCS_OK; default: return UCS_ERR_INVALID_PARAM; } } +static ucs_status_t adjust_test_params(perftest_params_t *params, + const char *error_prefix) +{ + test_type_t *test; + + if (params->test_id == TEST_ID_UNDEFINED) { + ucs_error("%smissing test name", error_prefix); + return UCS_ERR_INVALID_PARAM; + } + + test = &tests[params->test_id]; + + if (params->super.max_outstanding == 0) { + params->super.max_outstanding = test->window_size; + } + + return UCS_OK; +} + static ucs_status_t read_batch_file(FILE *batch_file, const char *file_name, - int *line_num, ucx_perf_params_t *params, + int *line_num, perftest_params_t *params, char** test_name_p) { #define MAX_SIZE 256 #define MAX_ARG_SIZE 2048 ucs_status_t status; char buf[MAX_ARG_SIZE]; + char error_prefix[MAX_ARG_SIZE]; int argc; char *argv[MAX_SIZE + 1]; int c; @@ -704,20 +805,61 @@ static ucs_status_t read_batch_file(FILE *batch_file, const char *file_name, argv[argc] = NULL; } while ((argc == 0) || (argv[0][0] == '#')); + ucs_snprintf_safe(error_prefix, sizeof(error_prefix), + "in batch file '%s' line %d: ", file_name, *line_num); + optind = 1; while ((c = getopt (argc, argv, TEST_PARAMS_ARGS)) != -1) { status = parse_test_params(params, c, optarg); if (status != UCS_OK) { - ucs_error("in batch file '%s' line %d: -%c %s: %s", - file_name, *line_num, c, optarg, ucs_status_string(status)); + ucs_error("%s-%c %s: %s", error_prefix, c, optarg, + ucs_status_string(status)); return status; } } + status = adjust_test_params(params, 
error_prefix); + if (status != UCS_OK) { + return status; + } + *test_name_p = strdup(argv[0]); return UCS_OK; } +static ucs_status_t parse_cpus(char *opt_arg, struct perftest_context *ctx) +{ + char *endptr, *cpu_list = opt_arg; + int cpu; + + ctx->num_cpus = 0; + cpu = strtol(cpu_list, &endptr, 10); + + while (((*endptr == ',') || (*endptr == '\0')) && (ctx->num_cpus < MAX_CPUS)) { + if (cpu < 0) { + ucs_error("invalid cpu number detected: (%d)", cpu); + return UCS_ERR_INVALID_PARAM; + } + + ctx->cpus[ctx->num_cpus++] = cpu; + + if (*endptr == '\0') { + break; + } + + cpu_list = endptr + 1; /* skip the comma */ + cpu = strtol(cpu_list, &endptr, 10); + } + + if (*endptr == ',') { + ucs_error("number of listed cpus exceeds the maximum supported value (%d)", + MAX_CPUS); + return UCS_ERR_INVALID_PARAM; + } + + return UCS_OK; +} + static ucs_status_t parse_opts(struct perftest_context *ctx, int mpi_initialized, int argc, char **argv) { @@ -728,7 +870,11 @@ static ucs_status_t parse_opts(struct perftest_context *ctx, int mpi_initialized ucx_perf_global_init(); /* initialize memory types */ - init_test_params(&ctx->params); + status = init_test_params(&ctx->params); + if (status != UCS_OK) { + return status; + } + ctx->server_addr = NULL; ctx->num_batch_files = 0; ctx->port = 13337; @@ -757,20 +903,23 @@ static ucs_status_t parse_opts(struct perftest_context *ctx, int mpi_initialized break; case 'c': ctx->flags |= TEST_FLAG_SET_AFFINITY; - ctx->cpu = atoi(optarg); + status = parse_cpus(optarg, ctx); + if (status != UCS_OK) { + return status; + } break; case 'P': -#if HAVE_MPI +#ifdef HAVE_MPI ctx->mpi = atoi(optarg) && mpi_initialized; break; #endif case 'h': - usage(ctx, __basename(argv[0])); + usage(ctx, ucs_basename(argv[0])); return UCS_ERR_CANCELED; default: status = parse_test_params(&ctx->params, c, optarg); if (status != UCS_OK) { - usage(ctx, __basename(argv[0])); + usage(ctx, ucs_basename(argv[0])); return status; } break; @@ -778,7 +927,7 @@ static 
ucs_status_t parse_opts(struct perftest_context *ctx, int mpi_initialized } if (optind < argc) { - ctx->server_addr = argv[optind]; + ctx->server_addr = argv[optind]; } return UCS_OK; @@ -804,15 +953,15 @@ static void sock_rte_barrier(void *rte_group, void (*progress)(void *arg), { sock_rte_group_t *group = rte_group; const unsigned magic = 0xdeadbeef; - unsigned sync; + unsigned snc; - sync = magic; - safe_send(group->connfd, &sync, sizeof(unsigned), progress, arg); + snc = magic; + safe_send(group->connfd, &snc, sizeof(unsigned), progress, arg); - sync = 0; - safe_recv(group->connfd, &sync, sizeof(unsigned), progress, arg); + snc = 0; + safe_recv(group->connfd, &snc, sizeof(unsigned), progress, arg); - ucs_assert(sync == magic); + ucs_assert(snc == magic); } #pragma omp barrier } @@ -855,11 +1004,11 @@ static void sock_rte_recv(void *rte_group, unsigned src, void *buffer, } static void sock_rte_report(void *rte_group, const ucx_perf_result_t *result, - void *arg, int is_final) + void *arg, int is_final, int is_multi_thread) { struct perftest_context *ctx = arg; print_progress(ctx->test_names, ctx->num_batch_files, result, ctx->flags, - is_final); + is_final, ctx->server_addr == NULL, is_multi_thread); } static ucx_perf_rte_t sock_rte = { @@ -868,7 +1017,7 @@ static ucx_perf_rte_t sock_rte = { .barrier = sock_rte_barrier, .post_vec = sock_rte_post_vec, .recv = sock_rte_recv, - .exchange_vec = (void*)ucs_empty_function, + .exchange_vec = (ucx_perf_rte_exchange_vec_func_t)ucs_empty_function, .report = sock_rte_report, }; @@ -925,17 +1074,30 @@ static ucs_status_t setup_sock_rte(struct perftest_context *ctx) } close(sockfd); - safe_recv(connfd, &ctx->params, sizeof(ctx->params), NULL, NULL); - if (ctx->params.msg_size_cnt) { - ctx->params.msg_size_list = malloc(sizeof(*ctx->params.msg_size_list) * - ctx->params.msg_size_cnt); - if (NULL == ctx->params.msg_size_list) { + + ret = safe_recv(connfd, &ctx->params, sizeof(ctx->params), NULL, NULL); + if (ret) { + status = 
UCS_ERR_IO_ERROR; + goto err_close_connfd; + } + + if (ctx->params.super.msg_size_cnt) { + ctx->params.super.msg_size_list = + calloc(ctx->params.super.msg_size_cnt, + sizeof(*ctx->params.super.msg_size_list)); + if (NULL == ctx->params.super.msg_size_list) { status = UCS_ERR_NO_MEMORY; goto err_close_connfd; } - safe_recv(connfd, ctx->params.msg_size_list, - sizeof(*ctx->params.msg_size_list) * ctx->params.msg_size_cnt, - NULL, NULL); + + ret = safe_recv(connfd, ctx->params.super.msg_size_list, + sizeof(*ctx->params.super.msg_size_list) * + ctx->params.super.msg_size_cnt, + NULL, NULL); + if (ret) { + status = UCS_ERR_IO_ERROR; + goto err_close_connfd; + } } ctx->sock_rte_group.connfd = connfd; @@ -963,9 +1125,10 @@ static ucs_status_t setup_sock_rte(struct perftest_context *ctx) } safe_send(sockfd, &ctx->params, sizeof(ctx->params), NULL, NULL); - if (ctx->params.msg_size_cnt) { - safe_send(sockfd, ctx->params.msg_size_list, - sizeof(*ctx->params.msg_size_list) * ctx->params.msg_size_cnt, + if (ctx->params.super.msg_size_cnt) { + safe_send(sockfd, ctx->params.super.msg_size_list, + sizeof(*ctx->params.super.msg_size_list) * + ctx->params.super.msg_size_cnt, NULL, NULL); } @@ -979,9 +1142,9 @@ static ucs_status_t setup_sock_rte(struct perftest_context *ctx) ctx->flags |= TEST_FLAG_PRINT_RESULTS; } - ctx->params.rte_group = &ctx->sock_rte_group; - ctx->params.rte = &sock_rte; - ctx->params.report_arg = ctx; + ctx->params.super.rte_group = &ctx->sock_rte_group; + ctx->params.super.rte = &sock_rte; + ctx->params.super.report_arg = ctx; return UCS_OK; err_close_connfd: @@ -999,7 +1162,7 @@ static ucs_status_t cleanup_sock_rte(struct perftest_context *ctx) return UCS_OK; } -#if HAVE_MPI +#if defined (HAVE_MPI) static unsigned mpi_rte_group_size(void *rte_group) { int size; @@ -1026,7 +1189,7 @@ static void mpi_rte_barrier(void *rte_group, void (*progress)(void *arg), #pragma omp barrier #pragma omp master - + { /* * Naive non-blocking barrier implementation over 
send/recv, to call user * progress while waiting for completion. @@ -1076,7 +1239,7 @@ static void mpi_rte_barrier(void *rte_group, void (*progress)(void *arg), MPI_COMM_WORLD); } } - + } #pragma omp barrier } @@ -1129,23 +1292,13 @@ static void mpi_rte_recv(void *rte_group, unsigned src, void *buffer, size_t max } static void mpi_rte_report(void *rte_group, const ucx_perf_result_t *result, - void *arg, int is_final) + void *arg, int is_final, int is_multi_thread) { struct perftest_context *ctx = arg; print_progress(ctx->test_names, ctx->num_batch_files, result, ctx->flags, - is_final); + is_final, ctx->server_addr == NULL, is_multi_thread); } - -static ucx_perf_rte_t mpi_rte = { - .group_size = mpi_rte_group_size, - .group_index = mpi_rte_group_index, - .barrier = mpi_rte_barrier, - .post_vec = mpi_rte_post_vec, - .recv = mpi_rte_recv, - .exchange_vec = (void*)ucs_empty_function, - .report = mpi_rte_report, -}; -#elif HAVE_RTE +#elif defined (HAVE_RTE) static unsigned ext_rte_group_size(void *rte_group) { rte_group_t group = (rte_group_t)rte_group; @@ -1250,11 +1403,11 @@ static void ext_rte_exchange_vec(void *rte_group, void * req) } static void ext_rte_report(void *rte_group, const ucx_perf_result_t *result, - void *arg, int is_final) + void *arg, int is_final, int is_multi_thread) { struct perftest_context *ctx = arg; print_progress(ctx->test_names, ctx->num_batch_files, result, ctx->flags, - is_final); + is_final, ctx->server_addr == NULL, is_multi_thread); } static ucx_perf_rte_t ext_rte = { @@ -1272,7 +1425,17 @@ static ucs_status_t setup_mpi_rte(struct perftest_context *ctx) { ucs_trace_func(""); -#if HAVE_MPI +#if defined (HAVE_MPI) + static ucx_perf_rte_t mpi_rte = { + .group_size = mpi_rte_group_size, + .group_index = mpi_rte_group_index, + .barrier = mpi_rte_barrier, + .post_vec = mpi_rte_post_vec, + .recv = mpi_rte_recv, + .exchange_vec = (void*)ucs_empty_function, + .report = mpi_rte_report, + }; + int size, rank; MPI_Comm_size(MPI_COMM_WORLD, &size); 
@@ -1286,10 +1449,13 @@ static ucs_status_t setup_mpi_rte(struct perftest_context *ctx) ctx->flags |= TEST_FLAG_PRINT_RESULTS; } + ctx->params.super.rte_group = NULL; + ctx->params.super.rte = &mpi_rte; + ctx->params.super.report_arg = ctx; +#elif defined (HAVE_RTE) ctx->params.rte_group = NULL; ctx->params.rte = &mpi_rte; ctx->params.report_arg = ctx; -#elif HAVE_RTE rte_group_t group; rte_init(NULL, NULL, &group); @@ -1297,16 +1463,16 @@ static ucs_status_t setup_mpi_rte(struct perftest_context *ctx) ctx->flags |= TEST_FLAG_PRINT_RESULTS; } - ctx->params.rte_group = group; - ctx->params.rte = &ext_rte; - ctx->params.report_arg = ctx; + ctx->params.super.rte_group = group; + ctx->params.super.rte = &ext_rte; + ctx->params.super.report_arg = ctx; #endif return UCS_OK; } static ucs_status_t cleanup_mpi_rte(struct perftest_context *ctx) { -#if HAVE_RTE +#ifdef HAVE_RTE rte_finalize(); #endif return UCS_OK; @@ -1314,7 +1480,7 @@ static ucs_status_t cleanup_mpi_rte(struct perftest_context *ctx) static ucs_status_t check_system(struct perftest_context *ctx) { - cpu_set_t cpuset; + ucs_sys_cpuset_t cpuset; unsigned i, count, nr_cpus; int ret; @@ -1329,19 +1495,24 @@ static ucs_status_t check_system(struct perftest_context *ctx) memset(&cpuset, 0, sizeof(cpuset)); if (ctx->flags & TEST_FLAG_SET_AFFINITY) { - if (ctx->cpu >= nr_cpus) { - ucs_error("cpu (%u) ot of range (0..%u)", ctx->cpu, nr_cpus - 1); - return UCS_ERR_INVALID_PARAM; + for (i = 0; i < ctx->num_cpus; i++) { + if (ctx->cpus[i] >= nr_cpus) { + ucs_error("cpu (%u) out of range (0..%u)", ctx->cpus[i], nr_cpus - 1); + return UCS_ERR_INVALID_PARAM; + } + } + + for (i = 0; i < ctx->num_cpus; i++) { + CPU_SET(ctx->cpus[i], &cpuset); } - CPU_SET(ctx->cpu, &cpuset); - ret = sched_setaffinity(0, sizeof(cpuset), &cpuset); + ret = ucs_sys_setaffinity(&cpuset); if (ret) { ucs_warn("sched_setaffinity() failed: %m"); return UCS_ERR_INVALID_PARAM; } } else { - ret = sched_getaffinity(0, sizeof(cpuset), &cpuset); + ret = 
ucs_sys_getaffinity(&cpuset); if (ret) { ucs_warn("sched_getaffinity() failed: %m"); return UCS_ERR_INVALID_PARAM; @@ -1355,28 +1526,36 @@ static ucs_status_t check_system(struct perftest_context *ctx) } if (count > 2) { ucs_warn("CPU affinity is not set (bound to %u cpus)." - " Performance may be impacted.", count); + " Performance may be impacted.", count); } } return UCS_OK; } -static void clone_params(ucx_perf_params_t *dest, const ucx_perf_params_t *src) +static ucs_status_t clone_params(perftest_params_t *dest, + const perftest_params_t *src) { size_t msg_size_list_size; - *dest = *src; - msg_size_list_size = dest->msg_size_cnt * sizeof(*dest->msg_size_list); - dest->msg_size_list = malloc(msg_size_list_size); - memcpy(dest->msg_size_list, src->msg_size_list, msg_size_list_size); + *dest = *src; + msg_size_list_size = dest->super.msg_size_cnt * + sizeof(*dest->super.msg_size_list); + dest->super.msg_size_list = malloc(msg_size_list_size); + if (dest->super.msg_size_list == NULL) { + return ((msg_size_list_size != 0) ? 
UCS_ERR_NO_MEMORY : UCS_OK); + } + + memcpy(dest->super.msg_size_list, src->super.msg_size_list, + msg_size_list_size); + return UCS_OK; } static ucs_status_t run_test_recurs(struct perftest_context *ctx, - ucx_perf_params_t *parent_params, + const perftest_params_t *parent_params, unsigned depth) { - ucx_perf_params_t params; + perftest_params_t params; ucx_perf_result_t result; ucs_status_t status; FILE *batch_file; @@ -1384,20 +1563,20 @@ static ucs_status_t run_test_recurs(struct perftest_context *ctx, ucs_trace_func("depth=%u, num_files=%u", depth, ctx->num_batch_files); - if (parent_params->api == UCX_PERF_API_UCP) { - if (strcmp(parent_params->uct.dev_name, TL_RESOURCE_NAME_NONE)) { + if (parent_params->super.api == UCX_PERF_API_UCP) { + if (strcmp(parent_params->super.uct.dev_name, TL_RESOURCE_NAME_NONE)) { ucs_warn("-d '%s' ignored for UCP test; see NOTES section in help message", - parent_params->uct.dev_name); + parent_params->super.uct.dev_name); } - if (strcmp(parent_params->uct.tl_name, TL_RESOURCE_NAME_NONE)) { + if (strcmp(parent_params->super.uct.tl_name, TL_RESOURCE_NAME_NONE)) { ucs_warn("-x '%s' ignored for UCP test; see NOTES section in help message", - parent_params->uct.tl_name); + parent_params->super.uct.tl_name); } } if (depth >= ctx->num_batch_files) { print_test_name(ctx); - return ucx_perf_run(parent_params, &result); + return ucx_perf_run(&parent_params->super, &result); } batch_file = fopen(ctx->batch_files[depth], "r"); @@ -1406,32 +1585,55 @@ static ucs_status_t run_test_recurs(struct perftest_context *ctx, return UCS_ERR_IO_ERROR; } - clone_params(¶ms, parent_params); + status = clone_params(¶ms, parent_params); + if (status != UCS_OK) { + goto out; + } + line_num = 0; while ((status = read_batch_file(batch_file, ctx->batch_files[depth], &line_num, ¶ms, &ctx->test_names[depth])) == UCS_OK) { - status = run_test_recurs(ctx, ¶ms, depth + 1); - free(params.msg_size_list); + run_test_recurs(ctx, ¶ms, depth + 1); + 
free(params.super.msg_size_list); free(ctx->test_names[depth]); ctx->test_names[depth] = NULL; - clone_params(¶ms, parent_params); + status = clone_params(¶ms, parent_params); + if (status != UCS_OK) { + goto out; + } + } + + if (status == UCS_ERR_NO_ELEM) { + status = UCS_OK; } - free(params.msg_size_list); + free(params.super.msg_size_list); +out: fclose(batch_file); - return UCS_OK; + return status; } static ucs_status_t run_test(struct perftest_context *ctx) { + const char *error_prefix; ucs_status_t status; ucs_trace_func(""); setlocale(LC_ALL, "en_US"); + /* no batch files, only command line params */ + if (ctx->num_batch_files == 0) { + error_prefix = (ctx->flags & TEST_FLAG_PRINT_RESULTS) ? + "command line: " : ""; + status = adjust_test_params(&ctx->params, error_prefix); + if (status != UCS_OK) { + return status; + } + } + print_header(ctx); status = run_test_recurs(ctx, &ctx->params, 0); @@ -1450,8 +1652,26 @@ int main(int argc, char **argv) int mpi_rte; int ret; -#if HAVE_MPI - mpi_initialized = !isatty(0) && (MPI_Init(&argc, &argv) == 0); +#ifdef HAVE_MPI + int provided; + + mpi_initialized = !isatty(0) && + /* Using MPI_THREAD_FUNNELED since ucx_perftest supports + * using multiple threads when only the main one makes + * MPI calls (which is also suitable for a single threaded + * run). + * MPI_THREAD_FUNNELED: + * The process may be multi-threaded, but only the main + * thread will make MPI calls (all MPI calls are funneled + * to the main thread). */ + (MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided) == 0); + + if (mpi_initialized && (provided != MPI_THREAD_FUNNELED)) { + printf("MPI_Init_thread failed to set MPI_THREAD_FUNNELED. (provided = %d)\n", + provided); + ret = -1; + goto out; + } #else mpi_initialized = 0; #endif @@ -1460,7 +1680,7 @@ int main(int argc, char **argv) status = parse_opts(&ctx, mpi_initialized, argc, argv); if (status != UCS_OK) { ret = (status == UCS_ERR_CANCELED) ? 
0 : -127; - goto out; + goto out_msg_size_list; } #ifdef __COVERITY__ @@ -1471,7 +1691,7 @@ int main(int argc, char **argv) if (ctx.mpi) { mpi_rte = 1; } else { -#if HAVE_RTE +#ifdef HAVE_RTE mpi_rte = 1; #else mpi_rte = 0; @@ -1481,14 +1701,14 @@ int main(int argc, char **argv) status = check_system(&ctx); if (status != UCS_OK) { ret = -1; - goto out; + goto out_msg_size_list; } /* Create RTE */ status = (mpi_rte) ? setup_mpi_rte(&ctx) : setup_sock_rte(&ctx); if (status != UCS_OK) { ret = -1; - goto out; + goto out_msg_size_list; } /* Run the test */ @@ -1502,12 +1722,15 @@ int main(int argc, char **argv) out_cleanup_rte: (mpi_rte) ? cleanup_mpi_rte(&ctx) : cleanup_sock_rte(&ctx); -out: - if (ctx.params.msg_size_list) { - free(ctx.params.msg_size_list); +out_msg_size_list: + if (ctx.params.super.msg_size_list) { + free(ctx.params.super.msg_size_list); } - if (mpi_initialized) { #if HAVE_MPI +out: +#endif + if (mpi_initialized) { +#ifdef HAVE_MPI MPI_Finalize(); #endif } diff --git a/src/tools/perf/rocm/Makefile.am b/src/tools/perf/rocm/Makefile.am new file mode 100644 index 00000000000..889f5139a18 --- /dev/null +++ b/src/tools/perf/rocm/Makefile.am @@ -0,0 +1,20 @@ +# +# Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. +# +# See file LICENSE for terms. 
+# + +if HAVE_HIP + +module_LTLIBRARIES = libucx_perftest_rocm.la +libucx_perftest_rocm_la_CPPFLAGS = $(BASE_CPPFLAGS) $(HIP_CPPFLAGS) +libucx_perftest_rocm_la_CFLAGS = $(BASE_CFLAGS) $(HIP_CFLAGS) +libucx_perftest_rocm_la_LDFLAGS = $(HIP_LDFLAGS) $(HIP_LIBS) -version-info $(SOVERSION) \ + $(patsubst %, -Xlinker %, -L$(ROCM_ROOT)/lib -rpath $(ROCM_ROOT)/hip/lib -rpath $(ROCM_ROOT)/lib) \ + $(patsubst %, -Xlinker %, --enable-new-dtags) \ + $(patsubst %, -Xlinker %, -rpath $(ROCM_ROOT)/lib64) +libucx_perftest_rocm_la_SOURCES = rocm_alloc.c + +include $(top_srcdir)/config/module.am + +endif diff --git a/src/tools/perf/rocm/configure.m4 b/src/tools/perf/rocm/configure.m4 new file mode 100644 index 00000000000..cb662a4d540 --- /dev/null +++ b/src/tools/perf/rocm/configure.m4 @@ -0,0 +1,11 @@ +# +# Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. +# +# See file LICENSE for terms. +# + +UCX_CHECK_ROCM + +AS_IF([test "x$rocm_happy" = "xyes"], [ucx_perftest_modules="${ucx_perftest_modules}:rocm"]) + +AC_CONFIG_FILES([src/tools/perf/rocm/Makefile]) diff --git a/src/tools/perf/rocm/rocm_alloc.c b/src/tools/perf/rocm/rocm_alloc.c new file mode 100644 index 00000000000..630df233c89 --- /dev/null +++ b/src/tools/perf/rocm/rocm_alloc.c @@ -0,0 +1,194 @@ +/** + * Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include + +#include +#include + + +static ucs_status_t ucx_perf_rocm_init(ucx_perf_context_t *perf) +{ + hipError_t ret; + unsigned group_index; + int num_gpus; + int gpu_index; + + group_index = rte_call(perf, group_index); + + ret = hipGetDeviceCount(&num_gpus); + if (ret != hipSuccess) { + return UCS_ERR_NO_DEVICE; + } + + gpu_index = group_index % num_gpus; + + ret = hipSetDevice(gpu_index); + if (ret != hipSuccess) { + return UCS_ERR_NO_DEVICE; + } + + return UCS_OK; +} + +static inline ucs_status_t ucx_perf_rocm_alloc(size_t length, + ucs_memory_type_t mem_type, + void **address_p) +{ + hipError_t ret; + + ucs_assert((mem_type == UCS_MEMORY_TYPE_ROCM) || + (mem_type == UCS_MEMORY_TYPE_ROCM_MANAGED)); + + ret = ((mem_type == UCS_MEMORY_TYPE_ROCM) ? + hipMalloc(address_p, length) : + hipMallocManaged(address_p, length, hipMemAttachGlobal)); + if (ret != hipSuccess) { + ucs_error("failed to allocate memory"); + return UCS_ERR_NO_MEMORY; + } + + return UCS_OK; +} + +static ucs_status_t ucp_perf_rocm_alloc(const ucx_perf_context_t *perf, size_t length, + void **address_p, ucp_mem_h *memh_p, + int non_blk_flag) +{ + return ucx_perf_rocm_alloc(length, UCS_MEMORY_TYPE_ROCM, address_p); +} + +static ucs_status_t ucp_perf_rocm_alloc_managed(const ucx_perf_context_t *perf, + size_t length, void **address_p, + ucp_mem_h *memh_p, int non_blk_flag) +{ + return ucx_perf_rocm_alloc(length, UCS_MEMORY_TYPE_ROCM_MANAGED, address_p); +} + +static void ucp_perf_rocm_free(const ucx_perf_context_t *perf, + void *address, ucp_mem_h memh) +{ + hipFree(address); +} + +static inline ucs_status_t +uct_perf_rocm_alloc_reg_mem(const ucx_perf_context_t *perf, + size_t length, + ucs_memory_type_t mem_type, + unsigned flags, + uct_allocated_memory_t *alloc_mem) +{ + ucs_status_t status; + + status = ucx_perf_rocm_alloc(length, mem_type, &alloc_mem->address); + if (status != UCS_OK) { + return status; + } + + status = 
uct_md_mem_reg(perf->uct.md, alloc_mem->address, + length, flags, &alloc_mem->memh); + if (status != UCS_OK) { + hipFree(alloc_mem->address); + ucs_error("failed to register memory"); + return status; + } + + alloc_mem->mem_type = mem_type; + alloc_mem->md = perf->uct.md; + + return UCS_OK; +} + +static ucs_status_t uct_perf_rocm_alloc(const ucx_perf_context_t *perf, + size_t length, unsigned flags, + uct_allocated_memory_t *alloc_mem) +{ + return uct_perf_rocm_alloc_reg_mem(perf, length, UCS_MEMORY_TYPE_ROCM, + flags, alloc_mem); +} + +static ucs_status_t uct_perf_rocm_managed_alloc(const ucx_perf_context_t *perf, + size_t length, unsigned flags, + uct_allocated_memory_t *alloc_mem) +{ + return uct_perf_rocm_alloc_reg_mem(perf, length, UCS_MEMORY_TYPE_ROCM_MANAGED, + flags, alloc_mem); +} + +static void uct_perf_rocm_free(const ucx_perf_context_t *perf, + uct_allocated_memory_t *alloc_mem) +{ + ucs_status_t status; + + ucs_assert(alloc_mem->md == perf->uct.md); + + status = uct_md_mem_dereg(perf->uct.md, alloc_mem->memh); + if (status != UCS_OK) { + ucs_error("failed to deregister memory"); + } + + hipFree(alloc_mem->address); +} + +static void ucx_perf_rocm_memcpy(void *dst, ucs_memory_type_t dst_mem_type, + const void *src, ucs_memory_type_t src_mem_type, + size_t count) +{ + hipError_t ret; + + ret = hipMemcpy(dst, src, count, hipMemcpyDefault); + if (ret != hipSuccess) { + ucs_error("failed to copy memory: %s", hipGetErrorString(ret)); + } +} + +static void* ucx_perf_rocm_memset(void *dst, int value, size_t count) +{ + hipError_t ret; + + ret = hipMemset(dst, value, count); + if (ret != hipSuccess) { + ucs_error("failed to set memory: %s", hipGetErrorString(ret)); + } + + return dst; +} + +UCS_STATIC_INIT { + static ucx_perf_allocator_t rocm_allocator = { + .mem_type = UCS_MEMORY_TYPE_ROCM, + .init = ucx_perf_rocm_init, + .ucp_alloc = ucp_perf_rocm_alloc, + .ucp_free = ucp_perf_rocm_free, + .uct_alloc = uct_perf_rocm_alloc, + .uct_free = uct_perf_rocm_free, + 
.memcpy = ucx_perf_rocm_memcpy, + .memset = ucx_perf_rocm_memset + }; + static ucx_perf_allocator_t rocm_managed_allocator = { + .mem_type = UCS_MEMORY_TYPE_ROCM_MANAGED, + .init = ucx_perf_rocm_init, + .ucp_alloc = ucp_perf_rocm_alloc_managed, + .ucp_free = ucp_perf_rocm_free, + .uct_alloc = uct_perf_rocm_managed_alloc, + .uct_free = uct_perf_rocm_free, + .memcpy = ucx_perf_rocm_memcpy, + .memset = ucx_perf_rocm_memset + }; + + ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_ROCM] = &rocm_allocator; + ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_ROCM_MANAGED] = &rocm_managed_allocator; +} +UCS_STATIC_CLEANUP { + ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_ROCM] = NULL; + ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_ROCM_MANAGED] = NULL; + +} diff --git a/src/tools/profile/read_profile.c b/src/tools/profile/read_profile.c index 53e142c706e..14204086930 100644 --- a/src/tools/profile/read_profile.c +++ b/src/tools/profile/read_profile.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include @@ -16,11 +20,16 @@ #include #include #include +#include #include +#include #define INDENT 4 -#define LESS_COMMAND "less" +#define PAGER_LESS "less" +#define PAGER_LESS_CMD PAGER_LESS " -R" +#define FUNC_NAME_MAX_LEN 35 +#define MAX_THREADS 256 #define TERM_COLOR_CLEAR "\x1B[0m" #define TERM_COLOR_RED "\x1B[31m" @@ -32,6 +41,16 @@ #define TERM_COLOR_WHITE "\x1B[37m" #define TERM_COLOR_GRAY "\x1B[90m" +#define NAME_COLOR (opts->raw ? "" : TERM_COLOR_CYAN) +#define HEAD_COLOR (opts->raw ? "" : TERM_COLOR_RED) +#define TS_COLOR (opts->raw ? "" : TERM_COLOR_WHITE) +#define LOC_COLOR (opts->raw ? "" : TERM_COLOR_GRAY) +#define REQ_COLOR (opts->raw ? "" : TERM_COLOR_YELLOW) +#define CLEAR_COLOR (opts->raw ? "" : TERM_COLOR_CLEAR) + +#define print_error(_fmt, ...) 
\ + fprintf(stderr, "Error: " _fmt "\n", ## __VA_ARGS__) + typedef enum { TIME_UNITS_NSEC, @@ -46,61 +65,104 @@ typedef struct options { const char *filename; int raw; time_units_t time_units; + int thread_list[MAX_THREADS + 1]; } options_t; +typedef struct { + const ucs_profile_thread_header_t *header; + const ucs_profile_thread_location_t *locations; + const ucs_profile_record_t *records; +} profile_thread_data_t; + + typedef struct { void *mem; size_t length; const ucs_profile_header_t *header; const ucs_profile_location_t *locations; - const ucs_profile_record_t *records; + profile_thread_data_t *threads; } profile_data_t; +typedef struct { + uint64_t total_time; + size_t count; + unsigned location_idx; +} profile_sorted_location_t; + + /* Used to redirect output to a "less" command */ static int output_pipefds[2] = {-1, -1}; static const char* time_units_str[] = { - [TIME_UNITS_NSEC] = "nsec", - [TIME_UNITS_USEC] = "usec", - [TIME_UNITS_MSEC] = "msec", - [TIME_UNITS_SEC] = "sec", + [TIME_UNITS_NSEC] = "(nsec)", + [TIME_UNITS_USEC] = "(usec)", + [TIME_UNITS_MSEC] = "(msec)", + [TIME_UNITS_SEC] = "(sec)", [TIME_UNITS_LAST] = NULL }; static int read_profile_data(const char *file_name, profile_data_t *data) { - struct stat stat; + uint32_t thread_idx; + struct stat stt; + const void *ptr; int ret, fd; fd = open(file_name, O_RDONLY); if (fd < 0) { - fprintf(stderr, "Failed to open %s: %m\n", file_name); + print_error("failed to open %s: %m", file_name); ret = fd; goto out; } - ret = fstat(fd, &stat); + ret = fstat(fd, &stt); if (ret < 0) { - fprintf(stderr, "fstat(%s) failed: %m\n", file_name); + print_error("fstat(%s) failed: %m", file_name); goto out_close; } - data->length = stat.st_size; - data->mem = mmap(NULL, stat.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + data->length = stt.st_size; + data->mem = mmap(NULL, stt.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (data->mem == MAP_FAILED) { - fprintf(stderr, "mmap(%s, length=%zd) failed: %m\n", file_name, - 
data->length); + print_error("mmap(%s, length=%zd) failed: %m", file_name, + data->length); ret = -1; goto out_close; } - data->header = data->mem; - data->locations = (const void*)(data->header + 1); - data->records = (const void*)(data->locations + data->header->num_locations); + ptr = data->mem; + data->header = ptr; + ptr = data->header + 1; + + if (data->header->version != UCS_PROFILE_FILE_VERSION) { + print_error("invalid file version, expected: %u, actual: %u", + UCS_PROFILE_FILE_VERSION, data->header->version); + ret = -EINVAL; + goto err_munmap; + } + + data->locations = ptr; + ptr = data->locations + data->header->num_locations; + + data->threads = calloc(data->header->num_threads, sizeof(*data->threads)); + if (data->threads == NULL) { + print_error("failed to allocate threads array"); + goto err_munmap; + } + + for (thread_idx = 0; thread_idx < data->header->num_threads; ++thread_idx) { + profile_thread_data_t *thread = &data->threads[thread_idx]; + thread->header = ptr; + ptr = thread->header + 1; + thread->locations = ptr; + ptr = thread->locations + data->header->num_locations; + thread->records = ptr; + ptr = thread->records + thread->header->num_records; + } ret = 0; @@ -108,14 +170,108 @@ static int read_profile_data(const char *file_name, profile_data_t *data) close(fd); out: return ret; +err_munmap: + munmap(data->mem, data->length); + goto out_close; } static void release_profile_data(profile_data_t *data) { + free(data->threads); munmap(data->mem, data->length); } -static double time_to_usec(profile_data_t *data, options_t *opts, uint64_t time) +static int parse_thread_list(int *thread_list, const char *str) +{ + char *str_dup, *p, *saveptr, *tailptr; + int thread_idx; + unsigned idx; + int ret; + + str_dup = strdup(str); + if (str_dup == NULL) { + ret = -ENOMEM; + print_error("failed to duplicate thread list string"); + goto out; + } + + idx = 0; + + /* the special value 'all' will create an empty thread list, which means + * use all threads 
+ */ + if (!strcasecmp(str_dup, "all")) { + goto out_terminate; + } + + p = strtok_r(str_dup, ",", &saveptr); + while (p != NULL) { + if (idx >= MAX_THREADS) { + ret = -EINVAL; + print_error("up to %d threads are supported", MAX_THREADS); + goto out; + } + + thread_idx = strtol(p, &tailptr, 0); + if (*tailptr != '\0') { + ret = -ENOMEM; + print_error("failed to parse thread number '%s'", p); + goto out; + } + + if (thread_idx <= 0) { + ret = -EINVAL; + print_error("invalid thread index %d", thread_idx); + goto out; + } + + thread_list[idx++] = thread_idx; + p = strtok_r(NULL, ",", &saveptr); + } + + if (idx == 0) { + ret = -EINVAL; + print_error("empty thread list"); + goto out; + } + +out_terminate: + ret = 0; + thread_list[idx] = -1; /* terminator */ +out: + free(str_dup); + return ret; +} + +static const char* thread_list_str(const int *thread_list, char *buf, size_t max) +{ + char *p, *endp; + const int *t; + int ret; + + p = buf; + endp = buf + max - 4; /* leave room for "...\0" */ + + for (t = thread_list; *t != -1; ++t) { + ret = snprintf(p, endp - p, "%d,", *t); + if (ret >= endp - p) { + /* truncated */ + strcat(p, "..."); + return buf; + } + + p += strlen(p); + } + + if (p > buf) { + *(p - 1) = '\0'; + } else { + *buf = '\0'; + } + return buf; +} + +static double time_to_units(profile_data_t *data, options_t *opts, uint64_t time) { static const double time_units_val[] = { [TIME_UNITS_NSEC] = 1e9, @@ -129,70 +285,155 @@ static double time_to_usec(profile_data_t *data, options_t *opts, uint64_t time) static int compare_locations(const void *l1, const void *l2) { - const ucs_profile_location_t *loc1 = l1; - const ucs_profile_location_t *loc2 = l2; + const ucs_profile_thread_location_t *loc1 = l1; + const ucs_profile_thread_location_t *loc2 = l2; return (loc1->total_time > loc2->total_time) ? -1 : (loc1->total_time < loc2->total_time) ? 
+1 : 0; } -static void show_profile_data_accum(profile_data_t *data, options_t *opts) +static int show_profile_data_accum(profile_data_t *data, options_t *opts) { - uint32_t num_locations = data->header->num_locations; - ucs_profile_location_t *sorted_locations; - ucs_profile_location_t *loc; + typedef struct { + long overall_time; /* overall threads runtime */ + int thread_list[MAX_THREADS + 1]; + int *last; + } location_thread_info_t; + + const uint32_t num_locations = data->header->num_locations; + profile_sorted_location_t *sorted_locations = NULL; + location_thread_info_t *locations_thread_info = NULL; + const ucs_profile_thread_location_t *thread_location; + location_thread_info_t *loc_thread_info; + profile_sorted_location_t *sorted_loc; + const profile_thread_data_t *thread; + const ucs_profile_location_t *loc; + unsigned location_idx, thread_idx; + char avg_buf[20], total_buf[20], overall_buf[20]; + char thread_list_buf[20]; + char *avg_str, *total_str, *overall_str; + int ret; + int *t; - sorted_locations = malloc(sizeof(*sorted_locations) * num_locations); - if (sorted_locations == NULL) { - return; + sorted_locations = calloc(num_locations, sizeof(*sorted_locations)); + locations_thread_info = calloc(num_locations, sizeof(*locations_thread_info)); + if ((sorted_locations == NULL) || (locations_thread_info == NULL)) { + print_error("failed to allocate locations info"); + ret = -ENOMEM; + goto out; + } + + /* Go over the list of threads provided by the user and accumulate the times + * and counts from all threads. In addition, track which calls were made from + * which threads. 
+ */ + for (location_idx = 0; location_idx < num_locations; ++location_idx) { + sorted_loc = &sorted_locations[location_idx]; + loc_thread_info = &locations_thread_info[location_idx]; + sorted_loc->location_idx = location_idx; + loc_thread_info->thread_list[0] = -1; + loc_thread_info->last = loc_thread_info->thread_list; + loc_thread_info->overall_time = 0; + + for (t = opts->thread_list; *t != -1; ++t) { + thread_idx = *t - 1; + thread = &data->threads[thread_idx]; + thread_location = &thread->locations[location_idx]; + sorted_loc->count += thread_location->count; + sorted_loc->total_time += thread_location->total_time; + + if (thread_location->count > 0) { + loc_thread_info->overall_time += thread->header->end_time - + thread->header->start_time; + *(loc_thread_info->last++) = thread_idx + 1; + } + } + + *loc_thread_info->last = -1; } /* Sort locations */ - memcpy(sorted_locations, data->locations, sizeof(*sorted_locations) * num_locations); - qsort(sorted_locations, num_locations, sizeof(*sorted_locations), compare_locations); + qsort(sorted_locations, num_locations, sizeof(*sorted_locations), + compare_locations); /* Print locations */ - printf("%30s %13s %13s %10s FILE FUNCTION\n", - "NAME", "AVG", "TOTAL", "COUNT"); - for (loc = sorted_locations; loc < sorted_locations + num_locations; ++loc) { + printf("%s%*s %6s %-6s %6s %-6s %13s %12s %18s%-6s %-*s %s%s\n", + HEAD_COLOR, + FUNC_NAME_MAX_LEN, + "NAME", + "AVG", time_units_str[opts->time_units], + "TOTAL", time_units_str[opts->time_units], + "%OVERALL", + "COUNT", + "FILE", + ":LINE", + FUNC_NAME_MAX_LEN, + "FUNCTION", + "THREADS", + CLEAR_COLOR); + + for (sorted_loc = sorted_locations; + sorted_loc < (sorted_locations + num_locations); ++sorted_loc) { + + if (sorted_loc->count == 0) { + continue; + } + + loc = &data->locations[sorted_loc->location_idx]; + loc_thread_info = &locations_thread_info[sorted_loc->location_idx]; + switch (loc->type) { - case UCS_PROFILE_TYPE_SAMPLE: - printf("%30s %13s %13s %10ld 
%18s:%-4d %s()\n", - loc->name, - "-", - "-", - (long)loc->count, - loc->file, loc->line, loc->function); - break; case UCS_PROFILE_TYPE_SCOPE_END: - printf("%30s %13.3f %13.0f %10ld %18s:%-4d %s()\n", - loc->name, - time_to_usec(data, opts, loc->total_time) / loc->count, - time_to_usec(data, opts, loc->total_time), - (long)loc->count, - loc->file, loc->line, loc->function); + snprintf(avg_buf, sizeof(avg_buf) - 1, "%.3f", + time_to_units(data, opts, + sorted_loc->total_time / sorted_loc->count)); + snprintf(total_buf, sizeof(total_buf) - 1, "%.2f", + time_to_units(data, opts, sorted_loc->total_time)); + snprintf(overall_buf, sizeof(overall_buf) - 1, "%.3f", + sorted_loc->total_time * 100.0 / loc_thread_info->overall_time); + + avg_str = avg_buf; + total_str = total_buf; + overall_str = overall_buf; break; + case UCS_PROFILE_TYPE_SAMPLE: case UCS_PROFILE_TYPE_REQUEST_EVENT: - printf("%30s %13s %13s %10ld %18s:%-4d %s()\n", - loc->name, - "n/a", - "n/a", - (long)loc->count, - loc->file, loc->line, loc->function); + avg_str = total_str = overall_str = "n/a"; break; default: - break; + continue; } - } + printf("%s%*.*s%s %13s %13s %13s %12zu %s%18s:%-6d %-*s %-13s%s\n", + NAME_COLOR, FUNC_NAME_MAX_LEN, FUNC_NAME_MAX_LEN, loc->name, CLEAR_COLOR, + avg_str, + total_str, + overall_str, + sorted_loc->count, + LOC_COLOR, + loc->file, loc->line, + FUNC_NAME_MAX_LEN, loc->function, + thread_list_str(loc_thread_info->thread_list, thread_list_buf, + sizeof(thread_list_buf)), + CLEAR_COLOR); + } + + ret = 0; + +out: + free(locations_thread_info); free(sorted_locations); + return ret; } -KHASH_MAP_INIT_INT64(request_ids, int) +KHASH_MAP_INIT_INT64(request_ids, size_t) -static void show_profile_data_log(profile_data_t *data, options_t *opts) +static void show_profile_data_log(profile_data_t *data, options_t *opts, + int thread_idx) { - size_t num_recods = data->header->num_records; + profile_thread_data_t *thread = &data->threads[thread_idx]; + size_t num_records = 
thread->header->num_records; + size_t reqid_ctr = 1; const ucs_profile_record_t **stack[UCS_PROFILE_STACK_MAX * 2]; const ucs_profile_record_t **scope_ends; const ucs_profile_location_t *loc; @@ -204,15 +445,10 @@ static void show_profile_data_log(profile_data_t *data, options_t *opts) khash_t(request_ids) reqids; int hash_extra_status; khiter_t hash_it; - int reqid, reqid_ctr = 1; + size_t reqid; -#define NAME_COLOR (opts->raw ? "" : TERM_COLOR_CYAN) -#define TS_COLOR (opts->raw ? "" : TERM_COLOR_WHITE) -#define LOC_COLOR (opts->raw ? "" : TERM_COLOR_GRAY) -#define REQ_COLOR (opts->raw ? "" : TERM_COLOR_YELLOW) -#define CLEAR_COLOR (opts->raw ? "" : TERM_COLOR_CLEAR) #define RECORD_FMT "%s%10.3f%s%*s" -#define RECORD_ARG(_ts) TS_COLOR, time_to_usec(data, opts, (_ts)), CLEAR_COLOR, \ +#define RECORD_ARG(_ts) TS_COLOR, time_to_units(data, opts, (_ts)), CLEAR_COLOR, \ INDENT * nesting, "" #define PRINT_RECORD() printf("%-*s %s%15s:%-4d %s()%s\n", \ (int)(60 + strlen(NAME_COLOR) + \ @@ -223,22 +459,29 @@ static void show_profile_data_log(profile_data_t *data, options_t *opts) basename(loc->file), loc->line, loc->function, \ CLEAR_COLOR) - scope_ends = calloc(1, sizeof(*scope_ends) * num_recods); + scope_ends = calloc(1, sizeof(*scope_ends) * num_records); if (scope_ends == NULL) { - fprintf(stderr, "Failed to allocate memory\n"); + print_error("failed to allocate memory for scope ends"); return; } + printf("\n"); + printf("%sThread %d (tid %d%s)%s\n", HEAD_COLOR, thread_idx + 1, + thread->header->tid, + (thread->header->tid == data->header->pid) ? 
", main" : "", + CLEAR_COLOR); + printf("\n"); + memset(stack, 0, sizeof(stack)); /* Find the first record with minimal nesting level, which is the base of call stack */ nesting = 0; min_nesting = 0; - for (rec = data->records; rec < data->records + num_recods; ++rec) { + for (rec = thread->records; rec < thread->records + num_records; ++rec) { loc = &data->locations[rec->location]; switch (loc->type) { case UCS_PROFILE_TYPE_SCOPE_BEGIN: - stack[nesting + UCS_PROFILE_STACK_MAX] = &scope_ends[rec - data->records]; + stack[nesting + UCS_PROFILE_STACK_MAX] = &scope_ends[rec - thread->records]; ++nesting; break; case UCS_PROFILE_TYPE_SCOPE_END: @@ -256,8 +499,8 @@ static void show_profile_data_log(profile_data_t *data, options_t *opts) } } - if (num_recods > 0) { - prev_time = data->records[0].timestamp; + if (num_records > 0) { + prev_time = thread->records[0].timestamp; } else { prev_time = 0; } @@ -266,16 +509,17 @@ static void show_profile_data_log(profile_data_t *data, options_t *opts) /* Display records */ nesting = -min_nesting; - for (rec = data->records; rec < data->records + num_recods; ++rec) { + for (rec = thread->records; rec < thread->records + num_records; ++rec) { loc = &data->locations[rec->location]; switch (loc->type) { case UCS_PROFILE_TYPE_SCOPE_BEGIN: - se = scope_ends[rec - data->records]; + se = scope_ends[rec - thread->records]; if (se != NULL) { snprintf(buf, sizeof(buf), RECORD_FMT" %s%s%s %s%.3f%s {", RECORD_ARG(rec->timestamp - prev_time), - NAME_COLOR, data->locations[se->location].name, CLEAR_COLOR, - TS_COLOR, time_to_usec(data, opts, se->timestamp - rec->timestamp), + NAME_COLOR, data->locations[se->location].name, + CLEAR_COLOR, TS_COLOR, + time_to_units(data, opts, se->timestamp - rec->timestamp), CLEAR_COLOR); } else { snprintf(buf, sizeof(buf), ""); @@ -313,12 +557,13 @@ static void show_profile_data_log(profile_data_t *data, options_t *opts) reqid = reqid_ctr++; kh_value(&reqids, hash_it) = reqid; } - action = "NEW "; + action = 
"NEW"; } else { hash_it = kh_get(request_ids, &reqids, rec->param64); if (hash_it == kh_end(&reqids)) { reqid = 0; /* could not find request */ } else { + assert(reqid_ctr > 1); reqid = kh_value(&reqids, hash_it); if (loc->type == UCS_PROFILE_TYPE_REQUEST_FREE) { kh_del(request_ids, &reqids, hash_it); @@ -330,7 +575,7 @@ static void show_profile_data_log(profile_data_t *data, options_t *opts) action = ""; } } - snprintf(buf, sizeof(buf), RECORD_FMT" %s%s%s%s %s{%d}%s", + snprintf(buf, sizeof(buf), RECORD_FMT" %s%s%s%s %s{%zu}%s", RECORD_ARG(rec->timestamp - prev_time), REQ_COLOR, action, loc->name, CLEAR_COLOR, REQ_COLOR, reqid, CLEAR_COLOR); @@ -352,42 +597,51 @@ static void close_pipes() close(output_pipefds[1]); } -static int redirect_output(const ucs_profile_header_t *hdr) +static int redirect_output(const profile_data_t *data, options_t *opts) { - char *less_argv[] = {LESS_COMMAND, - "-R" /* show colors */, - NULL};; + const char *shell_cmd = "sh"; struct winsize wsz; uint64_t num_lines; + const char *pager_cmd; pid_t pid; int ret; + int *t; ret = ioctl(STDOUT_FILENO, TIOCGWINSZ, &wsz); if (ret < 0) { - fprintf(stderr, "ioctl(TIOCGWINSZ) failed: %m\n"); + print_error("ioctl(TIOCGWINSZ) failed: %m"); return ret; } num_lines = 6 + /* header */ - ((hdr->mode & UCS_BIT(UCS_PROFILE_MODE_ACCUM)) ? - (hdr->num_locations + 2) : 0) + - ((hdr->mode & UCS_BIT(UCS_PROFILE_MODE_LOG)) ? 
- (hdr->num_records + 1) : 0) + 1; /* footer */ + if (data->header->mode & UCS_BIT(UCS_PROFILE_MODE_ACCUM)) { + num_lines += 1 + /* locations title */ + data->header->num_locations + /* locations data */ + 1; /* locations footer */ + } + + if (data->header->mode & UCS_BIT(UCS_PROFILE_MODE_LOG)) { + for (t = opts->thread_list; *t != -1; ++t) { + num_lines += 3 + /* thread header */ + data->threads[*t - 1].header->num_records; /* thread records */ + } + } + if (num_lines <= wsz.ws_row) { return 0; /* no need to use 'less' */ } ret = pipe(output_pipefds); if (ret < 0) { - fprintf(stderr, "pipe() failed: %m\n"); + print_error("pipe() failed: %m"); return ret; } pid = fork(); if (pid < 0) { - fprintf(stderr, "fork() failed: %m\n"); + print_error("fork() failed: %m"); close_pipes(); return pid; } @@ -399,7 +653,7 @@ static int redirect_output(const ucs_profile_header_t *hdr) /* redirect output to pipe */ ret = dup2(output_pipefds[1], fileno(stdout)); if (ret < 0) { - fprintf(stderr, "Failed to redirect stdout: %m\n"); + print_error("failed to redirect stdout: %m"); close_pipes(); return ret; } @@ -410,31 +664,96 @@ static int redirect_output(const ucs_profile_header_t *hdr) /* redirect input from pipe */ ret = dup2(output_pipefds[0], fileno(stdin)); if (ret < 0) { - fprintf(stderr, "Failed to redirect stdin: %m\n"); + print_error("failed to redirect stdin: %m"); exit(ret); } close_pipes(); - return execvp(LESS_COMMAND, less_argv); + + /* If PAGER environment variable is set, use it. If it's not set, or it + * is equal to "less", use "less -R" to show colors. 
+ */ + pager_cmd = getenv("PAGER"); + if ((pager_cmd == NULL) || !strcmp(pager_cmd, PAGER_LESS)) { + pager_cmd = PAGER_LESS_CMD; + } + + /* coverity[tainted_string] */ + ret = execlp(shell_cmd, shell_cmd, "-c", pager_cmd, NULL); + if (ret) { + print_error("failed to execute shell '%s': %m", shell_cmd); + } + return ret; } } static void show_header(profile_data_t *data, options_t *opts) { + char buf[80]; + printf("\n"); - printf(" command : %s\n", data->header->cmdline); + printf(" ucs lib : %s\n", data->header->ucs_path); printf(" host : %s\n", data->header->hostname); + printf(" command : %s\n", data->header->cmdline); printf(" pid : %d\n", data->header->pid); - printf(" units : %s\n", time_units_str[opts->time_units]); - printf("\n"); + printf(" threads : %-3d", data->header->num_threads); + if (opts->thread_list[0] != -1) { + printf("(showing %s", + (opts->thread_list[1] == -1) ? "thread" : "threads"); + printf(" %s)", thread_list_str(opts->thread_list, buf, sizeof(buf))); + } + printf("\n\n"); +} + +static int compare_int(const void *a, const void *b) +{ + return *(const int*)a - *(const int*)b; } static int show_profile_data(profile_data_t *data, options_t *opts) { + unsigned i, thread_list_len; int ret; + int *t; + + if (data->header->num_threads > MAX_THREADS) { + print_error("the profile contains %u threads, but only up to %d are " + "supported", data->header->num_threads, MAX_THREADS); + return -EINVAL; + } + + /* validate and count thread numbers */ + if (opts->thread_list[0] == -1) { + for (i = 0; i < data->header->num_threads; ++i) { + opts->thread_list[i] = i + 1; + } + opts->thread_list[i] = -1; + } else { + thread_list_len = 0; + for (t = opts->thread_list; *t != -1; ++t) { + if (*t > data->header->num_threads) { + print_error("thread number %d is out of range (1..%u)", + *t, data->header->num_threads); + return -EINVAL; + } + + ++thread_list_len; + } + assert(thread_list_len > 0); + + /* sort thread numbers and check for duplicates */ + 
qsort(opts->thread_list, thread_list_len, sizeof(int), compare_int); + for (t = opts->thread_list; *t != -1; ++t) { + if (t[0] == t[1]) { + print_error("duplicate thread number %d", t[0]); + return -EINVAL; + } + } + } + /* redirect output if needed */ if (!opts->raw) { - ret = redirect_output(data->header); + ret = redirect_output(data, opts); if (ret < 0) { return ret; } @@ -448,7 +767,9 @@ static int show_profile_data(profile_data_t *data, options_t *opts) } if (data->header->mode & UCS_BIT(UCS_PROFILE_MODE_LOG)) { - show_profile_data_log(data, opts); + for (t = opts->thread_list; *t != -1; ++t) { + show_profile_data_log(data, opts, *t - 1); + } printf("\n"); } @@ -460,6 +781,8 @@ static void usage() printf("Usage: ucx_read_profile [options] [profile-file]\n"); printf("Options are:\n"); printf(" -r Show raw output\n"); + printf(" -T Comma-separated list of threads to show, " + "e.g. \"1,2,3\", or \"all\" to show all threads\n"); printf(" -t Select time units to use:\n"); printf(" sec - seconds\n"); printf(" msec - milliseconds\n"); @@ -468,18 +791,28 @@ static void usage() printf(" -h Show this help message\n"); } -int parse_args(int argc, char **argv, options_t *opts) +static int parse_args(int argc, char **argv, options_t *opts) { - int c; + int ret, c; opts->raw = !isatty(fileno(stdout)); opts->time_units = TIME_UNITS_USEC; + ret = parse_thread_list(opts->thread_list, "all"); + if (ret < 0) { + return ret; + } - while ( (c = getopt(argc, argv, "hrt:")) != -1 ) { + while ( (c = getopt(argc, argv, "rT:t:h")) != -1 ) { switch (c) { case 'r': opts->raw = 1; break; + case 'T': + ret = parse_thread_list(opts->thread_list, optarg); + if (ret < 0) { + return ret; + } + break; case 't': if (!strcasecmp(optarg, "sec")) { opts->time_units = TIME_UNITS_SEC; @@ -490,6 +823,8 @@ int parse_args(int argc, char **argv, options_t *opts) } else if (!strcasecmp(optarg, "nsec")) { opts->time_units = TIME_UNITS_NSEC; } else { + print_error("invalid time units '%s'\n", optarg); + 
usage(); return -1; } break; @@ -503,7 +838,7 @@ int parse_args(int argc, char **argv, options_t *opts) } if (optind >= argc) { - printf("Error: missing profile file argument\n"); + print_error("missing profile file argument\n"); usage(); return -1; } @@ -523,8 +858,9 @@ int main(int argc, char **argv) return (ret == -127) ? 0 : ret; } - if (read_profile_data(opts.filename, &data) < 0) { - return -1; + ret = read_profile_data(opts.filename, &data); + if (ret < 0) { + return ret; } ret = show_profile_data(&data, &opts); diff --git a/src/ucm/api/ucm.h b/src/ucm/api/ucm.h index 5a669b6caa1..c887fe626f8 100644 --- a/src/ucm/api/ucm.h +++ b/src/ucm/api/ucm.h @@ -14,6 +14,7 @@ BEGIN_C_DECLS #include +#include #include #include @@ -26,6 +27,8 @@ BEGIN_C_DECLS * @brief Memory event types */ typedef enum ucm_event_type { + /* Default initialization value */ + UCM_EVENT_NONE = 0, /* Native events */ UCM_EVENT_MMAP = UCS_BIT(0), UCM_EVENT_MUNMAP = UCS_BIT(1), @@ -43,23 +46,16 @@ typedef enum ucm_event_type { UCM_EVENT_MEM_TYPE_ALLOC = UCS_BIT(20), UCM_EVENT_MEM_TYPE_FREE = UCS_BIT(21), - /* Auxiliary flags */ - UCM_EVENT_FLAG_NO_INSTALL = UCS_BIT(24) - -} ucm_event_type_t; + /* Add event handler, but don't install new hooks */ + UCM_EVENT_FLAG_NO_INSTALL = UCS_BIT(24), + /* When the event handler is added, generate approximated events for + * existing memory allocations. + * Currently implemented only for @ref UCM_EVENT_MEM_TYPE_ALLOC. 
+ */ + UCM_EVENT_FLAG_EXISTING_ALLOC = UCS_BIT(25) -/** - * @brief Memory types for alloc and free events - */ -typedef enum ucm_mem_type { - /*cuda memory */ - UCM_MEM_TYPE_CUDA = UCS_BIT(0), - UCM_MEM_TYPE_CUDA_MANAGED = UCS_BIT(1), - /* rocm memory */ - UCM_MEM_TYPE_ROCM = UCS_BIT(2), - UCM_MEM_TYPE_ROCM_MANAGED = UCS_BIT(3), -} ucm_mem_type_t; +} ucm_event_type_t; /** @@ -82,13 +78,13 @@ typedef union ucm_event { * callbacks: pre, post */ struct { - void *result; - void *address; - size_t size; - int prot; - int flags; - int fd; - off_t offset; + void *result; + void *address; + size_t size; + int prot; + int flags; + int fd; + off_t offset; } mmap; /* @@ -96,9 +92,9 @@ typedef union ucm_event { * munmap() is called. */ struct { - int result; - void *address; - size_t size; + int result; + void *address; + size_t size; } munmap; /* @@ -106,11 +102,11 @@ typedef union ucm_event { * mremap() is called. */ struct { - void *result; - void *address; - size_t old_size; - size_t new_size; - int flags; + void *result; + void *address; + size_t old_size; + size_t new_size; + int flags; } mremap; /* @@ -118,10 +114,10 @@ typedef union ucm_event { * shmat() is called. */ struct { - void *result; - int shmid; - const void *shmaddr; - int shmflg; + void *result; + int shmid; + const void *shmaddr; + int shmflg; } shmat; /* @@ -129,8 +125,8 @@ typedef union ucm_event { * shmdt() is called. */ struct { - int result; - const void *shmaddr; + int result; + const void *shmaddr; } shmdt; /* @@ -138,8 +134,8 @@ typedef union ucm_event { * sbrk() is called. */ struct { - void *result; - intptr_t increment; + void *result; + intptr_t increment; } sbrk; /* @@ -147,10 +143,10 @@ typedef union ucm_event { * madvise() is called. 
*/ struct { - int result; - void *addr; - size_t length; - int advice; + int result; + void *addr; + size_t length; + int advice; } madvise; /* @@ -164,17 +160,21 @@ typedef union ucm_event { * For UCM_EVENT_VM_UNMAPPED, callbacks are pre */ struct { - void *address; - size_t size; + void *address; + size_t size; } vm_mapped, vm_unmapped; /* - * memory type allocation and deallocation event + * UCM_EVENT_MEM_TYPE_ALLOC, UCM_EVENT_MEM_TYPE_FREE + * + * Memory type allocation and deallocation event. + * If mem_type is @ref UCS_MEMORY_TYPE_LAST, the memory type is unknown, and + * further memory type detection is required. */ struct { - void *address; - size_t size; - ucm_mem_type_t mem_type; + void *address; + size_t size; + ucs_memory_type_t mem_type; } mem_type; } ucm_event_t; @@ -194,6 +194,7 @@ typedef struct ucm_global_config { int enable_cuda_reloc; /* Enable installing CUDA relocations */ int enable_dynamic_mmap_thresh; /* Enable adaptive mmap threshold */ size_t alloc_alignment; /* Alignment for memory allocations */ + int dlopen_process_rpath; /* Process RPATH section in dlopen hook */ } ucm_global_config_t; @@ -322,6 +323,20 @@ void ucm_unset_external_event(int events); ucs_status_t ucm_test_events(int events); +/** + * @brief Test event external handlers + * + * This routine checks if external events, as set by @ref ucm_set_external_event, + * are actually being reported (by calling APIs such as @ref ucm_vm_munmap). + * + * @param [in] events Bit-mask of events which are supposed to be handled + * externally. + * + * @return Status code. + */ +ucs_status_t ucm_test_external_events(int events); + + /** * @brief Call the original implementation of @ref mmap without triggering events. */ @@ -435,12 +450,19 @@ int ucm_brk(void *addr); /** - * @brief Call the original implementation of @ref ucm_madvise and all handlers + * @brief Call the original implementation of @ref madvise and all handlers * associated with it. 
*/ int ucm_madvise(void *addr, size_t length, int advice); +/** + * @brief Call the original implementation of @ref dlopen and all handlers + * associated with it. + */ +void *ucm_dlopen(const char *filename, int flag); + + END_C_DECLS #endif diff --git a/src/ucm/bistro/bistro.c b/src/ucm/bistro/bistro.c index 79bdbecb7eb..1e53cb28db8 100644 --- a/src/ucm/bistro/bistro.c +++ b/src/ucm/bistro/bistro.c @@ -29,7 +29,7 @@ static void *ucm_bistro_page_align_ptr(void *ptr) static ucs_status_t ucm_bistro_protect(void *addr, size_t len, int prot) { void *aligned = ucm_bistro_page_align_ptr(addr); - size_t size = addr - aligned + len; + size_t size = UCS_PTR_BYTE_DIFF(aligned, addr) + len; int res; res = mprotect(aligned, size, prot) ? UCS_ERR_INVALID_PARAM : UCS_OK; @@ -54,7 +54,7 @@ ucs_status_t ucm_bistro_apply_patch(void *dst, void *patch, size_t len) status = ucm_bistro_protect(dst, len, UCM_PROT_READ_EXEC); if (!UCS_STATUS_IS_ERR(status)) { - ucs_clear_cache(dst, dst + len); + ucs_clear_cache(dst, UCS_PTR_BYTE_OFFSET(dst, len)); } return status; } diff --git a/src/ucm/configure.m4 b/src/ucm/configure.m4 index fc8be3623c3..2a752b680f1 100644 --- a/src/ucm/configure.m4 +++ b/src/ucm/configure.m4 @@ -5,7 +5,7 @@ # AC_SUBST([UCM_MODULE_LDFLAGS], - ["-Xlinker -z -Xlinker interpose"]) + ["-Xlinker -z -Xlinker interpose -Xlinker --no-as-needed"]) ucm_modules="" m4_include([src/ucm/cuda/configure.m4]) diff --git a/src/ucm/cuda/Makefile.am b/src/ucm/cuda/Makefile.am index 9fccfc3e743..438960e2c28 100644 --- a/src/ucm/cuda/Makefile.am +++ b/src/ucm/cuda/Makefile.am @@ -10,7 +10,8 @@ module_LTLIBRARIES = libucm_cuda.la libucm_cuda_la_CPPFLAGS = $(BASE_CPPFLAGS) $(CUDA_CPPFLAGS) libucm_cuda_la_CFLAGS = $(BASE_CFLAGS) $(CUDA_CFLAGS) libucm_cuda_la_LIBADD = ../libucm.la -libucm_cuda_la_LDFLAGS = $(UCM_MODULE_LDFLAGS) $(CUDA_LDFLAGS) \ +libucm_cuda_la_LDFLAGS = $(UCM_MODULE_LDFLAGS) \ + $(patsubst %, -Xlinker %, $(CUDA_LDFLAGS)) \ -version-info $(SOVERSION) noinst_HEADERS = \ diff 
--git a/src/ucm/cuda/configure.m4 b/src/ucm/cuda/configure.m4 index 08b46048442..626ea546130 100644 --- a/src/ucm/cuda/configure.m4 +++ b/src/ucm/cuda/configure.m4 @@ -5,5 +5,5 @@ # UCX_CHECK_CUDA -AS_IF([test "x$cuda_happy" = "xyes"], [ucm_modules+=":cuda"]) +AS_IF([test "x$cuda_happy" = "xyes"], [ucm_modules="${ucm_modules}:cuda"]) AC_CONFIG_FILES([src/ucm/cuda/Makefile]) diff --git a/src/ucm/cuda/cudamem.c b/src/ucm/cuda/cudamem.c index 03b5ccf8305..758fd2b5c98 100644 --- a/src/ucm/cuda/cudamem.c +++ b/src/ucm/cuda/cudamem.c @@ -1,5 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * See file LICENSE for terms. */ @@ -14,15 +15,20 @@ #include #include #include +#include #include #include #include +#include +#include -#include +#include #include +#include +#include -UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemFree, CUresult,-1, CUdeviceptr) +UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemFree, CUresult, -1, CUdeviceptr) UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemFreeHost, CUresult, -1, void *) UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemAlloc, CUresult, -1, CUdeviceptr *, size_t) UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemAllocManaged, CUresult, -1, CUdeviceptr *, @@ -60,8 +66,26 @@ UCM_OVERRIDE_FUNC(cudaHostUnregister, cudaError_t) #endif +static void ucm_cuda_set_ptr_attr(CUdeviceptr dptr) +{ + if ((void*)dptr == NULL) { + ucm_trace("skipping cuPointerSetAttribute for null pointer"); + return; + } + + unsigned int value = 1; + CUresult ret; + const char *cu_err_str; + + ret = cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dptr); + if (ret != CUDA_SUCCESS) { + cuGetErrorString(ret, &cu_err_str); + ucm_warn("cuPointerSetAttribute(%p) failed: %s", (void *) dptr, cu_err_str); + } +} + static UCS_F_ALWAYS_INLINE void -ucm_dispatch_mem_type_alloc(void *addr, size_t length, ucm_mem_type_t mem_type) +ucm_dispatch_mem_type_alloc(void *addr, size_t length, ucs_memory_type_t mem_type) { ucm_event_t 
event; @@ -72,7 +96,7 @@ ucm_dispatch_mem_type_alloc(void *addr, size_t length, ucm_mem_type_t mem_type) } static UCS_F_ALWAYS_INLINE void -ucm_dispatch_mem_type_free(void *addr, size_t length, ucm_mem_type_t mem_type) +ucm_dispatch_mem_type_free(void *addr, size_t length, ucs_memory_type_t mem_type) { ucm_event_t event; @@ -82,24 +106,28 @@ ucm_dispatch_mem_type_free(void *addr, size_t length, ucm_mem_type_t mem_type) ucm_event_dispatch(UCM_EVENT_MEM_TYPE_FREE, &event); } -static void ucm_cudafree_dispatch_events(void *dptr) +static void ucm_cudafree_dispatch_events(CUdeviceptr dptr, const char *func_name) { CUresult ret; CUdeviceptr pbase; size_t psize; - if (dptr == NULL) { + if (dptr == 0) { return; } - ret = cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr) dptr); - if (ret != CUDA_SUCCESS) { - ucm_warn("cuMemGetAddressRange(devPtr=%p) failed", (void *)dptr); + ret = cuMemGetAddressRange(&pbase, &psize, dptr); + if (ret == CUDA_SUCCESS) { + if (dptr != pbase) { + ucm_warn("%s(%p) called with unexpected pointer (expected: %p)", + func_name, (void*)dptr, (void*)pbase); + } + } else { + ucm_debug("cuMemGetAddressRange(devPtr=%p) failed", (void*)dptr); psize = 1; /* set minimum length */ } - ucs_assert(dptr == (void *)pbase); - ucm_dispatch_mem_type_free((void *)dptr, psize, UCM_MEM_TYPE_CUDA); + ucm_dispatch_mem_type_free((void *)dptr, psize, UCS_MEMORY_TYPE_CUDA); } CUresult ucm_cuMemFree(CUdeviceptr dptr) @@ -108,9 +136,9 @@ CUresult ucm_cuMemFree(CUdeviceptr dptr) ucm_event_enter(); - ucm_trace("ucm_cuMemFree(dptr=%p)",(void *)dptr); + ucm_trace("ucm_cuMemFree(dptr=%p)",(void*)dptr); - ucm_cudafree_dispatch_events((void *)dptr); + ucm_cudafree_dispatch_events(dptr, "cuMemFree"); ret = ucm_orig_cuMemFree(dptr); @@ -143,7 +171,8 @@ CUresult ucm_cuMemAlloc(CUdeviceptr *dptr, size_t size) ret = ucm_orig_cuMemAlloc(dptr, size); if (ret == CUDA_SUCCESS) { ucm_trace("ucm_cuMemAlloc(dptr=%p size:%lu)",(void *)*dptr, size); - ucm_dispatch_mem_type_alloc((void *)*dptr, 
size, UCM_MEM_TYPE_CUDA); + ucm_dispatch_mem_type_alloc((void *)*dptr, size, UCS_MEMORY_TYPE_CUDA); + ucm_cuda_set_ptr_attr(*dptr); } ucm_event_leave(); @@ -160,7 +189,8 @@ CUresult ucm_cuMemAllocManaged(CUdeviceptr *dptr, size_t size, unsigned int flag if (ret == CUDA_SUCCESS) { ucm_trace("ucm_cuMemAllocManaged(dptr=%p size:%lu, flags:%d)", (void *)*dptr, size, flags); - ucm_dispatch_mem_type_alloc((void *)*dptr, size, UCM_MEM_TYPE_CUDA_MANAGED); + ucm_dispatch_mem_type_alloc((void *)*dptr, size, + UCS_MEMORY_TYPE_CUDA_MANAGED); } ucm_event_leave(); @@ -180,7 +210,8 @@ CUresult ucm_cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, ucm_trace("ucm_cuMemAllocPitch(dptr=%p size:%lu)",(void *)*dptr, (WidthInBytes * Height)); ucm_dispatch_mem_type_alloc((void *)*dptr, WidthInBytes * Height, - UCM_MEM_TYPE_CUDA); + UCS_MEMORY_TYPE_CUDA); + ucm_cuda_set_ptr_attr(*dptr); } ucm_event_leave(); @@ -224,7 +255,7 @@ cudaError_t ucm_cudaFree(void *devPtr) ucm_trace("ucm_cudaFree(devPtr=%p)", devPtr); - ucm_cudafree_dispatch_events((void *)devPtr); + ucm_cudafree_dispatch_events((CUdeviceptr)devPtr, "cudaFree"); ret = ucm_orig_cudaFree(devPtr); @@ -258,7 +289,8 @@ cudaError_t ucm_cudaMalloc(void **devPtr, size_t size) ret = ucm_orig_cudaMalloc(devPtr, size); if (ret == cudaSuccess) { ucm_trace("ucm_cudaMalloc(devPtr=%p size:%lu)", *devPtr, size); - ucm_dispatch_mem_type_alloc(*devPtr, size, UCM_MEM_TYPE_CUDA); + ucm_dispatch_mem_type_alloc(*devPtr, size, UCS_MEMORY_TYPE_CUDA); + ucm_cuda_set_ptr_attr((CUdeviceptr) *devPtr); } ucm_event_leave(); @@ -276,7 +308,7 @@ cudaError_t ucm_cudaMallocManaged(void **devPtr, size_t size, unsigned int flags if (ret == cudaSuccess) { ucm_trace("ucm_cudaMallocManaged(devPtr=%p size:%lu flags:%d)", *devPtr, size, flags); - ucm_dispatch_mem_type_alloc(*devPtr, size, UCM_MEM_TYPE_CUDA_MANAGED); + ucm_dispatch_mem_type_alloc(*devPtr, size, UCS_MEMORY_TYPE_CUDA_MANAGED); } ucm_event_leave(); @@ -294,7 +326,8 @@ cudaError_t ucm_cudaMallocPitch(void 
**devPtr, size_t *pitch, ret = ucm_orig_cudaMallocPitch(devPtr, pitch, width, height); if (ret == cudaSuccess) { ucm_trace("ucm_cudaMallocPitch(devPtr=%p size:%lu)",*devPtr, (width * height)); - ucm_dispatch_mem_type_alloc(*devPtr, (width * height), UCM_MEM_TYPE_CUDA); + ucm_dispatch_mem_type_alloc(*devPtr, (width * height), UCS_MEMORY_TYPE_CUDA); + ucm_cuda_set_ptr_attr((CUdeviceptr) *devPtr); } ucm_event_leave(); @@ -388,9 +421,109 @@ static ucs_status_t ucm_cudamem_install(int events) return status; } +static int ucm_cudamem_scan_regions_cb(void *arg, void *addr, size_t length, + int prot, const char *path) +{ + static const char *cuda_path_pattern = "/dev/nvidia"; + ucm_event_handler_t *handler = arg; + ucm_event_t event; + + /* we are interested in blocks which don't have any access permissions, or + * mapped to nvidia device. + */ + if ((prot & (PROT_READ|PROT_WRITE|PROT_EXEC)) && + strncmp(path, cuda_path_pattern, strlen(cuda_path_pattern))) { + return 0; + } + + ucm_debug("dispatching initial memtype allocation for %p..%p %s", + addr, UCS_PTR_BYTE_OFFSET(addr, length), path); + + event.mem_type.address = addr; + event.mem_type.size = length; + event.mem_type.mem_type = UCS_MEMORY_TYPE_LAST; /* unknown memory type */ + + ucm_event_enter(); + handler->cb(UCM_EVENT_MEM_TYPE_ALLOC, &event, handler->arg); + ucm_event_leave(); + + return 0; +} + +static void ucm_cudamem_get_existing_alloc(ucm_event_handler_t *handler) +{ + if (handler->events & UCM_EVENT_MEM_TYPE_ALLOC) { + ucm_parse_proc_self_maps(ucm_cudamem_scan_regions_cb, handler); + } +} + +ucs_status_t ucm_cuda_get_current_device_info(ucs_sys_bus_id_t *bus_id, + ucs_memory_type_t mem_type) +{ + static ucs_sys_bus_id_t cached_bus_id = {0xffff, 0xff, 0xff, 0xff}; + CUresult cu_err; + CUdevice cuda_device; + CUdevice_attribute attribute; + int attr_result; + + ucm_trace("ucm_cuda_get_current_device_info"); + + if (mem_type != UCS_MEMORY_TYPE_CUDA) { + return UCS_ERR_UNSUPPORTED; + } + + if (cached_bus_id.slot 
!= 0xff) { + memcpy(bus_id, &cached_bus_id, sizeof(cached_bus_id)); + return UCS_OK; + } + + /* Find cuda dev that the current ctx is using and find it's path*/ + cu_err = cuCtxGetDevice(&cuda_device); + if (CUDA_SUCCESS != cu_err) { + ucm_debug("no cuda device context found"); + return UCS_ERR_NO_RESOURCE; + } + + attribute = CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID; + cu_err = cuDeviceGetAttribute(&attr_result, attribute, cuda_device); + if (CUDA_SUCCESS != cu_err) { + ucm_error("unable to get cuda device domain"); + return UCS_ERR_IO_ERROR; + } + + bus_id->domain = (uint16_t)attr_result; + + attribute = CU_DEVICE_ATTRIBUTE_PCI_BUS_ID; + cu_err = cuDeviceGetAttribute(&attr_result, attribute, cuda_device); + if (CUDA_SUCCESS != cu_err) { + ucm_error("unable to get cuda device bus id"); + return UCS_ERR_IO_ERROR; + } + + bus_id->bus = (uint8_t)attr_result; + bus_id->slot = 0; + bus_id->function = 0; + cached_bus_id = *bus_id; + + ucm_trace("found bus_id %x:%x:%x:%x for device %d", bus_id->domain, + bus_id->bus, + bus_id->slot, + bus_id->function, + cuda_device); + + return UCS_OK; +} + +static ucm_event_installer_t ucm_cuda_initializer = { + .install = ucm_cudamem_install, + .get_existing_alloc = ucm_cudamem_get_existing_alloc, + .get_mem_type_current_device_info = ucm_cuda_get_current_device_info +}; + UCS_STATIC_INIT { - static ucm_event_installer_t cuda_initializer = { - .func = ucm_cudamem_install - }; - ucs_list_add_tail(&ucm_event_installer_list, &cuda_initializer.list); + ucs_list_add_tail(&ucm_event_installer_list, &ucm_cuda_initializer.list); +} + +UCS_STATIC_CLEANUP { + ucs_list_del(&ucm_cuda_initializer.list); } diff --git a/src/ucm/event/event.c b/src/ucm/event/event.c index 40d5b6c268f..23a63bacdff 100644 --- a/src/ucm/event/event.c +++ b/src/ucm/event/event.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -30,7 +31,7 @@ UCS_LIST_HEAD(ucm_event_installer_list); -static pthread_spinlock_t ucm_kh_lock; +static 
ucs_recursive_spinlock_t ucm_kh_lock; #define ucm_ptr_hash(_ptr) kh_int64_hash_func((uintptr_t)(_ptr)) KHASH_INIT(ucm_ptr_size, const void*, size_t, 1, ucm_ptr_hash, kh_int64_hash_equal) @@ -127,7 +128,7 @@ static ucs_list_link_t ucm_event_handlers = &ucm_event_orig_handler.list); - void ucm_event_dispatch(ucm_event_type_t event_type, ucm_event_t *event) +void ucm_event_dispatch(ucm_event_type_t event_type, ucm_event_t *event) { ucm_event_handler_t *handler; @@ -264,6 +265,25 @@ void *ucm_mremap(void *old_address, size_t old_size, size_t new_size, int flags) return event.mremap.result; } +static int ucm_shm_del_entry_from_khash(const void *addr, size_t *size) +{ /* must be called in locked ucm_kh_lock */ + khiter_t iter; + + ucs_recursive_spin_lock(&ucm_kh_lock); + iter = kh_get(ucm_ptr_size, &ucm_shmat_ptrs, addr); + if (iter != kh_end(&ucm_shmat_ptrs)) { + if (size != NULL) { + *size = kh_value(&ucm_shmat_ptrs, iter); + } + kh_del(ucm_ptr_size, &ucm_shmat_ptrs, iter); + ucs_recursive_spin_unlock(&ucm_kh_lock); + return 1; + } + + ucs_recursive_spin_unlock(&ucm_kh_lock); + return 0; +} + void *ucm_shmat(int shmid, const void *shmaddr, int shmflg) { uintptr_t attach_addr; @@ -285,6 +305,7 @@ void *ucm_shmat(int shmid, const void *shmaddr, int shmflg) attach_addr -= attach_addr % SHMLBA; } ucm_dispatch_vm_munmap((void*)attach_addr, size); + ucm_shm_del_entry_from_khash((void*)attach_addr, NULL); } event.shmat.result = MAP_FAILED; @@ -293,16 +314,14 @@ void *ucm_shmat(int shmid, const void *shmaddr, int shmflg) event.shmat.shmflg = shmflg; ucm_event_dispatch(UCM_EVENT_SHMAT, &event); - pthread_spin_lock(&ucm_kh_lock); if (event.shmat.result != MAP_FAILED) { + ucs_recursive_spin_lock(&ucm_kh_lock); iter = kh_put(ucm_ptr_size, &ucm_shmat_ptrs, event.mmap.result, &result); if (result != -1) { kh_value(&ucm_shmat_ptrs, iter) = size; } - pthread_spin_unlock(&ucm_kh_lock); + ucs_recursive_spin_unlock(&ucm_kh_lock); ucm_dispatch_vm_mmap(event.shmat.result, size); - } else { 
- pthread_spin_unlock(&ucm_kh_lock); } ucm_event_leave(); @@ -313,22 +332,15 @@ void *ucm_shmat(int shmid, const void *shmaddr, int shmflg) int ucm_shmdt(const void *shmaddr) { ucm_event_t event; - khiter_t iter; size_t size; ucm_event_enter(); ucm_debug("ucm_shmdt(shmaddr=%p)", shmaddr); - pthread_spin_lock(&ucm_kh_lock); - iter = kh_get(ucm_ptr_size, &ucm_shmat_ptrs, shmaddr); - if (iter != kh_end(&ucm_shmat_ptrs)) { - size = kh_value(&ucm_shmat_ptrs, iter); - kh_del(ucm_ptr_size, &ucm_shmat_ptrs, iter); - } else { + if (!ucm_shm_del_entry_from_khash(shmaddr, &size)) { size = ucm_get_shm_seg_size(shmaddr); } - pthread_spin_unlock(&ucm_kh_lock); ucm_dispatch_vm_munmap((void*)shmaddr, size); @@ -350,7 +362,8 @@ void *ucm_sbrk(intptr_t increment) ucm_trace("ucm_sbrk(increment=%+ld)", increment); if (increment < 0) { - ucm_dispatch_vm_munmap(ucm_orig_sbrk(0) + increment, -increment); + ucm_dispatch_vm_munmap(UCS_PTR_BYTE_OFFSET(ucm_orig_sbrk(0), increment), + -increment); } event.sbrk.result = MAP_FAILED; @@ -358,7 +371,8 @@ void *ucm_sbrk(intptr_t increment) ucm_event_dispatch(UCM_EVENT_SBRK, &event); if ((increment > 0) && (event.sbrk.result != MAP_FAILED)) { - ucm_dispatch_vm_mmap(ucm_orig_sbrk(0) - increment, increment); + ucm_dispatch_vm_mmap(UCS_PTR_BYTE_OFFSET(ucm_orig_sbrk(0), -increment), + increment); } ucm_event_leave(); @@ -382,7 +396,8 @@ int ucm_brk(void *addr) ucm_trace("ucm_brk(addr=%p)", addr); if (increment < 0) { - ucm_dispatch_vm_munmap(old_addr + increment, -increment); + ucm_dispatch_vm_munmap(UCS_PTR_BYTE_OFFSET(old_addr, increment), + -increment); } event.sbrk.result = (void*)-1; @@ -459,39 +474,20 @@ void ucm_event_handler_remove(ucm_event_handler_t *handler) ucm_event_leave(); } -static int ucm_events_to_native_events(int events) -{ - int native_events; - - native_events = events & ~(UCM_EVENT_VM_MAPPED | UCM_EVENT_VM_UNMAPPED | - UCM_EVENT_MEM_TYPE_ALLOC | UCM_EVENT_MEM_TYPE_FREE); - if (events & UCM_EVENT_VM_MAPPED) { - native_events |= 
UCM_NATIVE_EVENT_VM_MAPPED; - } - if (events & UCM_EVENT_VM_UNMAPPED) { - native_events |= UCM_NATIVE_EVENT_VM_UNMAPPED; - } - - return native_events; -} - static ucs_status_t ucm_event_install(int events) { - static ucs_init_once_t init_once = UCS_INIT_ONCE_INIITIALIZER; + static ucs_init_once_t init_once = UCS_INIT_ONCE_INITIALIZER; UCS_MODULE_FRAMEWORK_DECLARE(ucm); ucm_event_installer_t *event_installer; - int native_events, malloc_events; + int malloc_events; ucs_status_t status; UCS_INIT_ONCE(&init_once) { ucm_prevent_dl_unload(); } - /* Replace aggregate events with the native events which make them */ - native_events = ucm_events_to_native_events(events); - /* TODO lock */ - status = ucm_mmap_install(native_events); + status = ucm_mmap_install(events); if (status != UCS_OK) { ucm_debug("failed to install mmap events"); goto out_unlock; @@ -512,7 +508,7 @@ static ucs_status_t ucm_event_install(int events) /* Call extra event installers */ UCS_MODULE_FRAMEWORK_LOAD(ucm, UCS_MODULE_LOAD_FLAG_NODELETE); ucs_list_for_each(event_installer, &ucm_event_installer_list, list) { - status = event_installer->func(events); + status = event_installer->install(events); if (status != UCS_OK) { goto out_unlock; } @@ -528,14 +524,32 @@ static ucs_status_t ucm_event_install(int events) ucs_status_t ucm_set_event_handler(int events, int priority, ucm_event_callback_t cb, void *arg) { + ucm_event_installer_t *event_installer; ucm_event_handler_t *handler; ucs_status_t status; + int flags; + + if (events & ~(UCM_EVENT_MMAP|UCM_EVENT_MUNMAP|UCM_EVENT_MREMAP| + UCM_EVENT_SHMAT|UCM_EVENT_SHMDT| + UCM_EVENT_SBRK| + UCM_EVENT_MADVISE| + UCM_EVENT_VM_MAPPED|UCM_EVENT_VM_UNMAPPED| + UCM_EVENT_MEM_TYPE_ALLOC|UCM_EVENT_MEM_TYPE_FREE| + UCM_EVENT_FLAG_NO_INSTALL| + UCM_EVENT_FLAG_EXISTING_ALLOC)) { + return UCS_ERR_INVALID_PARAM; + } - if (!ucm_global_opts.enable_events) { + if (events && !ucm_global_opts.enable_events) { return UCS_ERR_UNSUPPORTED; } - if (!(events & 
UCM_EVENT_FLAG_NO_INSTALL) && (events & ~ucm_external_events)) { + /* separate event flags from real events */ + flags = events & (UCM_EVENT_FLAG_NO_INSTALL | + UCM_EVENT_FLAG_EXISTING_ALLOC); + events &= ~flags; + + if (!(flags & UCM_EVENT_FLAG_NO_INSTALL) && (events & ~ucm_external_events)) { status = ucm_event_install(events & ~ucm_external_events); if (status != UCS_OK) { return status; @@ -554,6 +568,12 @@ ucs_status_t ucm_set_event_handler(int events, int priority, ucm_event_handler_add(handler); + if (flags & UCM_EVENT_FLAG_EXISTING_ALLOC) { + ucs_list_for_each(event_installer, &ucm_event_installer_list, list) { + event_installer->get_existing_alloc(handler); + } + } + ucm_debug("added user handler (func=%p arg=%p) for events=0x%x prio=%d", cb, arg, events, priority); return UCS_OK; @@ -591,23 +611,33 @@ void ucm_unset_event_handler(int events, ucm_event_callback_t cb, void *arg) ucm_event_leave(); /* Do not release memory while we hold event lock - may deadlock */ - while (!ucs_list_is_empty(&gc_list)) { - elem = ucs_list_extract_head(&gc_list, ucm_event_handler_t, list); + ucs_list_for_each_safe(elem, tmp, &gc_list, list) { free(elem); } } ucs_status_t ucm_test_events(int events) { - return ucm_mmap_test_installed_events(ucm_events_to_native_events(events)); + return ucm_mmap_test_installed_events(events); +} + +ucs_status_t ucm_test_external_events(int events) +{ + return ucm_mmap_test_events(events & ucm_external_events, "external"); } UCS_STATIC_INIT { - pthread_spin_init(&ucm_kh_lock, PTHREAD_PROCESS_PRIVATE); + ucs_recursive_spinlock_init(&ucm_kh_lock, 0); kh_init_inplace(ucm_ptr_size, &ucm_shmat_ptrs); } UCS_STATIC_CLEANUP { + ucs_status_t status; + kh_destroy_inplace(ucm_ptr_size, &ucm_shmat_ptrs); - pthread_spin_destroy(&ucm_kh_lock); + + status = ucs_recursive_spinlock_destroy(&ucm_kh_lock); + if (status != UCS_OK) { + ucm_warn("ucs_recursive_spinlock_destroy() failed (%d)", status); + } } diff --git a/src/ucm/event/event.h b/src/ucm/event/event.h 
index 174b035e0c0..9dd5f522b59 100644 --- a/src/ucm/event/event.h +++ b/src/ucm/event/event.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include #define UCM_NATIVE_EVENT_VM_MAPPED (UCM_EVENT_MMAP | UCM_EVENT_MREMAP | \ UCM_EVENT_SHMAT | UCM_EVENT_SBRK) @@ -31,7 +33,9 @@ typedef struct ucm_event_handler { typedef struct ucm_event_installer { - ucs_status_t (*func)(int events); + ucs_status_t (*install)(int events); + void (*get_existing_alloc)(ucm_event_handler_t *handler); + ucs_status_t (*get_mem_type_current_device_info)(ucs_sys_bus_id_t *bus_id, ucs_memory_type_t mem_type); ucs_list_link_t list; } ucm_event_installer_t; diff --git a/src/ucm/malloc/malloc_hook.c b/src/ucm/malloc/malloc_hook.c index 56742779499..f073398e93c 100644 --- a/src/ucm/malloc/malloc_hook.c +++ b/src/ucm/malloc/malloc_hook.c @@ -10,7 +10,12 @@ #include "malloc_hook.h" +#ifdef HAVE_MALLOC_H #include +#endif +#ifdef HAVE_MALLOC_NP_H +#include +#endif #undef M_TRIM_THRESHOLD #undef M_MMAP_THRESHOLD #include "allocator.h" /* have to be included after malloc.h */ @@ -26,6 +31,7 @@ #include #include #include +#include /* make khash allocate memory directly from operating system */ @@ -68,44 +74,48 @@ KHASH_INIT(mmap_ptrs, void*, char, 0, ucm_mmap_ptr_hash, ucm_mmap_ptr_equal) /* Pointer to memory release function */ typedef void (*ucm_release_func_t)(void *ptr); +/* Pointer to get usable size function */ +typedef size_t (*ucm_usable_size_func_t)(void *ptr); + typedef struct ucm_malloc_hook_state { /* * State of hook installment */ - pthread_mutex_t install_mutex; /* Protect hooks installation */ - int install_state; /* State of hook installation */ - int installed_events; /* Which events are working */ - int mmap_thresh_set; /* mmap threshold set by user */ - int trim_thresh_set; /* trim threshold set by user */ - int hook_called; /* Our malloc hook was called */ - size_t max_freed_size; /* Maximal size released so far */ - size_t (*usable_size)(void*); /* function pointer to get 
usable size */ + pthread_mutex_t install_mutex; /* Protect hooks installation */ + int install_state; /* State of hook installation */ + int installed_events; /* Which events are working */ + int mmap_thresh_set; /* mmap threshold set by user */ + int trim_thresh_set; /* trim threshold set by user */ + int hook_called; /* Our malloc hook was called */ + size_t max_freed_size; /* Maximal size released so far */ + + ucm_usable_size_func_t usable_size; /* function pointer to get usable size */ - ucm_release_func_t free; /* function pointer to release memory */ + ucm_release_func_t free; /* function pointer to release memory */ /* * Track record of which pointers are ours */ - pthread_spinlock_t lock; /* Protect heap counters. + ucs_recursive_spinlock_t lock; /* Protect heap counters. Note: Cannot modify events when this lock is held - may deadlock */ /* Our heap address range. Used to identify whether a released pointer is ours, * or was allocated by the previous heap manager. */ - void *heap_start; - void *heap_end; + void *heap_start; + void *heap_end; /* Save the pointers that we have allocated with mmap, so when they are * released we would know they are ours, despite the fact they are not in the * heap address range. 
*/ - khash_t(mmap_ptrs) ptrs; + khash_t(mmap_ptrs) ptrs; /** * Save the environment strings we've allocated */ - pthread_mutex_t env_lock; - char **env_strs; - unsigned num_env_strs; + pthread_mutex_t env_lock; + char **env_strs; + unsigned num_env_strs; } ucm_malloc_hook_state_t; @@ -117,8 +127,8 @@ static ucm_malloc_hook_state_t ucm_malloc_hook_state = { .trim_thresh_set = 0, .hook_called = 0, .max_freed_size = 0, - .usable_size = malloc_usable_size, - .free = free, + .usable_size = NULL, + .free = NULL, .heap_start = (void*)-1, .heap_end = (void*)-1, .ptrs = {0}, @@ -134,14 +144,14 @@ static void ucm_malloc_mmaped_ptr_add(void *ptr) int hash_extra_status; khiter_t hash_it; - pthread_spin_lock(&ucm_malloc_hook_state.lock); + ucs_recursive_spin_lock(&ucm_malloc_hook_state.lock); hash_it = kh_put(mmap_ptrs, &ucm_malloc_hook_state.ptrs, ptr, &hash_extra_status); ucs_assert_always(hash_extra_status >= 0); ucs_assert_always(hash_it != kh_end(&ucm_malloc_hook_state.ptrs)); - pthread_spin_unlock(&ucm_malloc_hook_state.lock); + ucs_recursive_spin_unlock(&ucm_malloc_hook_state.lock); } static int ucm_malloc_mmaped_ptr_remove_if_exists(void *ptr) @@ -149,7 +159,7 @@ static int ucm_malloc_mmaped_ptr_remove_if_exists(void *ptr) khiter_t hash_it; int found; - pthread_spin_lock(&ucm_malloc_hook_state.lock); + ucs_recursive_spin_lock(&ucm_malloc_hook_state.lock); hash_it = kh_get(mmap_ptrs, &ucm_malloc_hook_state.ptrs, ptr); if (hash_it == kh_end(&ucm_malloc_hook_state.ptrs)) { @@ -159,7 +169,7 @@ static int ucm_malloc_mmaped_ptr_remove_if_exists(void *ptr) kh_del(mmap_ptrs, &ucm_malloc_hook_state.ptrs, hash_it); } - pthread_spin_unlock(&ucm_malloc_hook_state.lock); + ucs_recursive_spin_unlock(&ucm_malloc_hook_state.lock); return found; } @@ -167,10 +177,10 @@ static int ucm_malloc_is_address_in_heap(void *ptr) { int in_heap; - pthread_spin_lock(&ucm_malloc_hook_state.lock); + ucs_recursive_spin_lock(&ucm_malloc_hook_state.lock); in_heap = (ptr >= 
ucm_malloc_hook_state.heap_start) && (ptr < ucm_malloc_hook_state.heap_end); - pthread_spin_unlock(&ucm_malloc_hook_state.lock); + ucs_recursive_spin_unlock(&ucm_malloc_hook_state.lock); return in_heap; } @@ -298,6 +308,12 @@ static void *ucm_malloc(size_t size, const void *caller) return ucm_malloc_impl(size, "malloc"); } +static size_t ucm_malloc_usable_size_common(void *mem, int foreign) +{ + return foreign ? ucm_malloc_hook_state.usable_size(mem) : + dlmalloc_usable_size(mem); +} + static void *ucm_realloc(void *oldptr, size_t size, const void *caller) { void *newptr; @@ -316,7 +332,7 @@ static void *ucm_realloc(void *oldptr, size_t size, const void *caller) newptr = ucm_dlmalloc(size); ucm_malloc_allocated(newptr, size, "realloc"); - oldsz = ucm_malloc_hook_state.usable_size(oldptr); + oldsz = ucm_malloc_usable_size_common(oldptr, foreign); memcpy(newptr, oldptr, ucs_min(size, oldsz)); if (foreign) { @@ -335,7 +351,7 @@ static void *ucm_realloc(void *oldptr, size_t size, const void *caller) static void ucm_free(void *ptr, const void *caller) { - return ucm_free_impl(ptr, ucm_malloc_hook_state.free, "free"); + ucm_free_impl(ptr, ucm_malloc_hook_state.free, "free"); } static void *ucm_memalign(size_t alignment, size_t size, const void *caller) @@ -383,8 +399,9 @@ static void ucm_operator_delete(void* ptr) { static ucm_release_func_t orig_delete = NULL; if (orig_delete == NULL) { - orig_delete = ucm_reloc_get_orig(UCM_OPERATOR_DELETE_SYMBOL, - ucm_operator_delete); + orig_delete = + (ucm_release_func_t)ucm_reloc_get_orig(UCM_OPERATOR_DELETE_SYMBOL, + ucm_operator_delete); } ucm_free_impl(ptr, orig_delete, "operator delete"); } @@ -398,8 +415,9 @@ static void ucm_operator_vec_delete(void* ptr) { static ucm_release_func_t orig_vec_delete = NULL; if (orig_vec_delete == NULL) { - orig_vec_delete = ucm_reloc_get_orig(UCM_OPERATOR_VEC_DELETE_SYMBOL, - ucm_operator_vec_delete); + orig_vec_delete = + (ucm_release_func_t)ucm_reloc_get_orig(UCM_OPERATOR_VEC_DELETE_SYMBOL, 
+ ucm_operator_vec_delete); } ucm_free_impl(ptr, orig_vec_delete, "operator delete[]"); } @@ -451,7 +469,7 @@ static int ucm_asprintf(char **strp, const char *fmt, ...) static int ucm_add_to_environ(char *env_str) { char *saved_env_str; - unsigned index; + unsigned idx; size_t len; char *p; @@ -464,19 +482,19 @@ static int ucm_add_to_environ(char *env_str) } /* Check if we already have variable with same name */ - index = 0; - while (index < ucm_malloc_hook_state.num_env_strs) { - saved_env_str = ucm_malloc_hook_state.env_strs[index]; + idx = 0; + while (idx < ucm_malloc_hook_state.num_env_strs) { + saved_env_str = ucm_malloc_hook_state.env_strs[idx]; if ((strlen(saved_env_str) >= len) && !strncmp(env_str, saved_env_str, len)) { ucm_trace("replace `%s' with `%s'", saved_env_str, env_str); ucm_free(saved_env_str, NULL); goto out_insert; } - ++index; + ++idx; } /* Not found - enlarge array by one */ - index = ucm_malloc_hook_state.num_env_strs; + idx = ucm_malloc_hook_state.num_env_strs; ++ucm_malloc_hook_state.num_env_strs; ucm_malloc_hook_state.env_strs = ucm_realloc(ucm_malloc_hook_state.env_strs, @@ -484,7 +502,7 @@ static int ucm_add_to_environ(char *env_str) NULL); out_insert: - ucm_malloc_hook_state.env_strs[index] = env_str; + ucm_malloc_hook_state.env_strs[idx] = env_str; return 0; } @@ -533,7 +551,7 @@ static int ucm_setenv(const char *name, const char *value, int overwrite) static void ucm_malloc_sbrk(ucm_event_type_t event_type, ucm_event_t *event, void *arg) { - pthread_spin_lock(&ucm_malloc_hook_state.lock); + ucs_recursive_spin_lock(&ucm_malloc_hook_state.lock); /* Copy return value from call. We assume the event handler uses a lock. 
*/ if (ucm_malloc_hook_state.heap_start == (void*)-1) { @@ -545,19 +563,25 @@ static void ucm_malloc_sbrk(ucm_event_type_t event_type, event->sbrk.increment, event->sbrk.result, ucm_malloc_hook_state.heap_start, ucm_malloc_hook_state.heap_end); - pthread_spin_unlock(&ucm_malloc_hook_state.lock); + ucs_recursive_spin_unlock(&ucm_malloc_hook_state.lock); } -static int ucs_malloc_is_ready(int events) +static int ucs_malloc_is_ready(int events, const char *title) { /* - * If malloc hooks are installed - we're good here. + * In RELOC mode, if malloc hooks are installed - we're good here. * Otherwise, we have to make sure all events are indeed working - because * we can't be sure what the existing implementation is doing. * The implication of this is that in some cases (e.g infinite mmap threshold) - * we will install out memory hooks, even though it may not be required. + * we will install our memory hooks, even though it may not be required. */ - return ucm_malloc_hook_state.hook_called || + ucm_debug("ucs_malloc_is_ready(%s): have 0x%x/0x%x events;" + " mmap_mode=%d hook_called=%d", + title, ucm_malloc_hook_state.installed_events, events, + ucm_mmap_hook_mode(), ucm_malloc_hook_state.hook_called); + + return ((ucm_mmap_hook_mode() == UCM_MMAP_HOOK_RELOC) && + ucm_malloc_hook_state.hook_called) || ucs_test_all_flags(ucm_malloc_hook_state.installed_events, events); } @@ -627,7 +651,7 @@ static void ucm_malloc_test(int events) static void ucm_malloc_populate_glibc_cache() { - char hostname[NAME_MAX]; + char hostname[HOST_NAME_MAX]; /* Trigger NSS initialization before we install malloc hooks. 
* This is needed because NSS could allocate strings with our malloc(), but @@ -641,11 +665,33 @@ static void ucm_malloc_populate_glibc_cache() static void ucm_malloc_install_symbols(ucm_reloc_patch_t *patches) { ucm_reloc_patch_t *patch; + for (patch = patches; patch->symbol != NULL; ++patch) { ucm_reloc_modify(patch); } } +static void* ucm_malloc_patchlist_prev_value(const ucm_reloc_patch_t *patches, + const char *symbol) +{ + const ucm_reloc_patch_t *patch; + for (patch = patches; patch->symbol != NULL; ++patch) { + if (!strcmp(patch->symbol, symbol)) { + ucm_debug("previous function pointer for '%s' is %p", symbol, + patch->prev_value); + if (patch->prev_value == NULL) { + goto not_found; + } + + return patch->prev_value; + } + } + +not_found: + ucm_fatal("could not find the previous value of '%s'", symbol); + return NULL; +} + static int ucm_malloc_mallopt(int param_number, int value) { int success; @@ -664,6 +710,12 @@ static int ucm_malloc_mallopt(int param_number, int value) return success; } +static size_t ucm_malloc_usable_size(void *mem) +{ + return ucm_malloc_usable_size_common(mem, + !ucm_malloc_is_address_in_heap(mem)); +} + static char *ucm_malloc_blacklist[] = { "libnvidia-fatbinaryloader.so", NULL @@ -693,7 +745,7 @@ static ucm_reloc_patch_t ucm_malloc_optional_symbol_patches[] = { { "mallinfo", ucm_dlmallinfo }, { "malloc_stats", ucm_dlmalloc_stats }, { "malloc_trim", ucm_dlmalloc_trim }, - { "malloc_usable_size", ucm_dlmalloc_usable_size }, + { "malloc_usable_size", ucm_malloc_usable_size }, { NULL, NULL } }; @@ -701,6 +753,10 @@ static void ucm_malloc_install_optional_symbols() { if (!(ucm_malloc_hook_state.install_state & UCM_MALLOC_INSTALLED_OPT_SYMS)) { ucm_malloc_install_symbols(ucm_malloc_optional_symbol_patches); + ucm_malloc_hook_state.usable_size = + (ucm_usable_size_func_t)ucm_malloc_patchlist_prev_value( + ucm_malloc_optional_symbol_patches, + "malloc_usable_size"); ucm_malloc_hook_state.install_state |= UCM_MALLOC_INSTALLED_OPT_SYMS; } 
} @@ -725,6 +781,23 @@ static void ucm_malloc_set_env_mallopt() } } +static void ucm_malloc_init_orig_funcs() +{ + /* We cannot use global initializer for these variables; if we do it, + * GCC makes them part of .got, and patching .got actually changes the + * values of these global variables. As a workaround, we initialize + * them here. + * NOTE This also makes sure that libucm.so has a reference to these symbols, + * so patching the relocation tables would find their previous value by libucm + */ + if (ucm_malloc_hook_state.usable_size == NULL) { + ucm_malloc_hook_state.usable_size = (size_t (*)(void *))malloc_usable_size; + } + if ( ucm_malloc_hook_state.free == NULL) { + ucm_malloc_hook_state.free = free; + } +} + ucs_status_t ucm_malloc_install(int events) { static ucm_event_handler_t sbrk_handler = { @@ -736,18 +809,22 @@ ucs_status_t ucm_malloc_install(int events) pthread_mutex_lock(&ucm_malloc_hook_state.install_mutex); - if (ucs_malloc_is_ready(events)) { + ucm_malloc_init_orig_funcs(); + + if (ucs_malloc_is_ready(events, "before test")) { goto out_succ; } ucm_malloc_test(events); - if (ucs_malloc_is_ready(events)) { + if (ucs_malloc_is_ready(events, "after test")) { goto out_succ; } if (!ucm_malloc_hook_state.hook_called) { +#ifdef HAVE_MALLOC_TRIM /* Try to leak less memory from original malloc */ malloc_trim(0); +#endif } if (!(ucm_malloc_hook_state.install_state & UCM_MALLOC_INSTALLED_SBRK_EVH)) { @@ -792,8 +869,9 @@ ucs_status_t ucm_malloc_install(int events) ucm_debug("installing malloc relocations"); ucm_malloc_populate_glibc_cache(); ucm_malloc_install_symbols(ucm_malloc_symbol_patches); - ucs_assert(ucm_malloc_symbol_patches[0].value == ucm_free); - ucm_malloc_hook_state.free = ucm_malloc_symbol_patches[0].prev_value; + ucm_malloc_hook_state.free = + (ucm_release_func_t)ucm_malloc_patchlist_prev_value( + ucm_malloc_symbol_patches, "free"); ucm_malloc_hook_state.install_state |= UCM_MALLOC_INSTALLED_MALL_SYMS; } } else { @@ -802,7 +880,7 @@ 
ucs_status_t ucm_malloc_install(int events) /* Just installed the symbols, test again */ ucm_malloc_test(events); - if (ucm_malloc_hook_state.hook_called) { + if (ucs_malloc_is_ready(events, "after install")) { goto out_install_opt_syms; } @@ -828,6 +906,6 @@ void ucm_malloc_state_reset(int default_mmap_thresh, int default_trim_thresh) } UCS_STATIC_INIT { - pthread_spin_init(&ucm_malloc_hook_state.lock, 0); + ucs_recursive_spinlock_init(&ucm_malloc_hook_state.lock, 0); kh_init_inplace(mmap_ptrs, &ucm_malloc_hook_state.ptrs); } diff --git a/src/ucm/mmap/install.c b/src/ucm/mmap/install.c index c58afb37e02..6b46baaeecf 100644 --- a/src/ucm/mmap/install.c +++ b/src/ucm/mmap/install.c @@ -17,10 +17,11 @@ #include #include #include -#include #include +#include #include #include +#include #include #include @@ -42,9 +43,22 @@ _call; \ ucm_trace("after %s: got 0x%x/0x%x", UCS_PP_MAKE_STRING(_call), \ (_data)->fired_events, exp_events); \ + /* in case if any event is missed - set correcponding bit to 0 */ \ + /* same as equation: */ \ + /* (_data)->out_events &= ~(exp_events ^ */ \ + /* ((_data)->fired_events & exp_events)); */ \ (_data)->out_events &= ~exp_events | (_data)->fired_events; \ } while(0) +#define UCM_MMAP_EVENT_NAME_ENTRY(_event) \ + [ucs_ilog2(UCM_EVENT_##_event)] = #_event + +#define UCM_MMAP_MAX_EVENT_NAME_LEN sizeof("VM_UNMAPPED") + +#define UCM_MMAP_REPORT_BUF_LEN \ + ((UCM_MMAP_MAX_EVENT_NAME_LEN + 2) * \ + ucs_array_size(ucm_mmap_event_name)) + extern const char *ucm_mmap_hook_modes[]; typedef enum ucm_mmap_hook_type { @@ -63,25 +77,43 @@ typedef struct ucm_mmap_func { typedef struct ucm_mmap_test_events_data { uint32_t fired_events; int out_events; + pid_t tid; } ucm_mmap_test_events_data_t; static ucm_mmap_func_t ucm_mmap_funcs[] = { - { {"mmap", ucm_override_mmap}, UCM_EVENT_MMAP, 0, UCM_HOOK_BOTH}, - { {"munmap", ucm_override_munmap}, UCM_EVENT_MUNMAP, 0, UCM_HOOK_BOTH}, - { {"mremap", ucm_override_mremap}, UCM_EVENT_MREMAP, 0, UCM_HOOK_BOTH}, - { 
{"shmat", ucm_override_shmat}, UCM_EVENT_SHMAT, 0, UCM_HOOK_BOTH}, + { {"mmap", ucm_override_mmap}, UCM_EVENT_MMAP, UCM_EVENT_NONE, UCM_HOOK_BOTH}, + { {"munmap", ucm_override_munmap}, UCM_EVENT_MUNMAP, UCM_EVENT_NONE, UCM_HOOK_BOTH}, +#if HAVE_MREMAP + { {"mremap", ucm_override_mremap}, UCM_EVENT_MREMAP, UCM_EVENT_NONE, UCM_HOOK_BOTH}, +#endif + { {"shmat", ucm_override_shmat}, UCM_EVENT_SHMAT, UCM_EVENT_NONE, UCM_HOOK_BOTH}, { {"shmdt", ucm_override_shmdt}, UCM_EVENT_SHMDT, UCM_EVENT_SHMAT, UCM_HOOK_BOTH}, - { {"sbrk", ucm_override_sbrk}, UCM_EVENT_SBRK, 0, UCM_HOOK_RELOC}, + { {"sbrk", ucm_override_sbrk}, UCM_EVENT_SBRK, UCM_EVENT_NONE, UCM_HOOK_RELOC}, #if UCM_BISTRO_HOOKS - { {"brk", ucm_override_brk}, UCM_EVENT_SBRK, 0, UCM_HOOK_BISTRO}, + { {"brk", ucm_override_brk}, UCM_EVENT_SBRK, UCM_EVENT_NONE, UCM_HOOK_BISTRO}, #endif - { {"madvise", ucm_override_madvise}, UCM_EVENT_MADVISE, 0, UCM_HOOK_BOTH}, - { {NULL, NULL}, 0} + { {"madvise", ucm_override_madvise}, UCM_EVENT_MADVISE, UCM_EVENT_NONE, UCM_HOOK_BOTH}, + { {NULL, NULL}, UCM_EVENT_NONE} }; static pthread_mutex_t ucm_mmap_install_mutex = PTHREAD_MUTEX_INITIALIZER; static int ucm_mmap_installed_events = 0; /* events that were reported as installed */ +static const char *ucm_mmap_event_name[] = { + /* Native events */ + UCM_MMAP_EVENT_NAME_ENTRY(MMAP), + UCM_MMAP_EVENT_NAME_ENTRY(MUNMAP), + UCM_MMAP_EVENT_NAME_ENTRY(MREMAP), + UCM_MMAP_EVENT_NAME_ENTRY(SHMAT), + UCM_MMAP_EVENT_NAME_ENTRY(SHMDT), + UCM_MMAP_EVENT_NAME_ENTRY(SBRK), + UCM_MMAP_EVENT_NAME_ENTRY(MADVISE), + + /* Aggregate events */ + UCM_MMAP_EVENT_NAME_ENTRY(VM_MAPPED), + UCM_MMAP_EVENT_NAME_ENTRY(VM_UNMAPPED), +}; + static void ucm_mmap_event_test_callback(ucm_event_type_t event_type, ucm_event_t *event, void *arg) { @@ -89,10 +121,12 @@ static void ucm_mmap_event_test_callback(ucm_event_type_t event_type, /* This callback may be called from multiple threads, which are just calling * memory allocations/release, and not testing mmap hooks at 
the moment. - * So in order to ensure the thread which tests events sees all fired - * events, use atomic OR operation. + * So ignore calls from other threads to ensure the only requested events + * are proceeded. */ - ucs_atomic_or32(&data->fired_events, event_type); + if (data->tid == ucs_get_tid()) { + data->fired_events |= event_type; + } } /* Fire events with pre/post action. The problem is in call sequence: we @@ -114,6 +148,7 @@ ucm_fire_mmap_events_internal(int events, ucm_mmap_test_events_data_t *data) UCM_FIRE_EVENT(events, UCM_EVENT_MMAP|UCM_EVENT_VM_MAPPED, data, p = mmap(NULL, ucm_get_page_size(), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); +#ifdef HAVE_MREMAP /* generate MAP event */ UCM_FIRE_EVENT(events, UCM_EVENT_MREMAP|UCM_EVENT_VM_MAPPED|UCM_EVENT_VM_UNMAPPED, data, p = mremap(p, ucm_get_page_size(), @@ -121,6 +156,7 @@ ucm_fire_mmap_events_internal(int events, ucm_mmap_test_events_data_t *data) /* generate UNMAP event */ UCM_FIRE_EVENT(events, UCM_EVENT_MREMAP|UCM_EVENT_VM_MAPPED|UCM_EVENT_VM_UNMAPPED, data, p = mremap(p, ucm_get_page_size() * 2, ucm_get_page_size(), 0)); +#endif /* generate UNMAP event */ UCM_FIRE_EVENT(events, UCM_EVENT_MMAP|UCM_EVENT_VM_MAPPED, data, p = mmap(p, ucm_get_page_size(), PROT_READ | PROT_WRITE, @@ -165,7 +201,7 @@ ucm_fire_mmap_events_internal(int events, ucm_mmap_test_events_data_t *data) p = mmap(NULL, ucm_get_page_size(), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0)); if (p != MAP_FAILED) { - UCM_FIRE_EVENT(events, UCM_EVENT_MADVISE, data, + UCM_FIRE_EVENT(events, UCM_EVENT_MADVISE|UCM_EVENT_VM_UNMAPPED, data, madvise(p, ucm_get_page_size(), MADV_DONTNEED)); UCM_FIRE_EVENT(events, UCM_EVENT_MUNMAP|UCM_EVENT_VM_UNMAPPED, data, munmap(p, ucm_get_page_size())); @@ -182,8 +218,39 @@ void ucm_fire_mmap_events(int events) ucm_fire_mmap_events_internal(events, &data); } +static void ucm_mmap_event_report_missing(int expected, int actual, + const char *event_type) +{ + int events_count = 0; + 
int missing_events; + int idx; + char *buf; + char *buf_p; + char *end_p; + + UCS_STATIC_ASSERT(UCM_MMAP_REPORT_BUF_LEN <= UCS_ALLOCA_MAX_SIZE) + + buf = buf_p = ucs_alloca(UCM_MMAP_REPORT_BUF_LEN); + end_p = buf_p + UCM_MMAP_REPORT_BUF_LEN; + missing_events = expected & ~actual & + UCS_MASK(ucs_array_size(ucm_mmap_event_name)); + + ucs_for_each_bit(idx, missing_events) { + /* coverity[overrun-local] */ + snprintf(buf_p, end_p - buf_p, "%s%s", ((events_count > 0) ? ", " : ""), + ucm_mmap_event_name[idx]); + events_count++; + buf_p += strlen(buf_p); + } + + if (events_count) { + ucm_diag("missing %s memory events: %s", event_type, buf); + } +} + /* Called with lock held */ -static ucs_status_t ucm_mmap_test_events(int events) +static ucs_status_t +ucm_mmap_test_events_nolock(int events, const char *event_type) { ucm_event_handler_t handler; ucm_mmap_test_events_data_t data; @@ -193,6 +260,7 @@ static ucs_status_t ucm_mmap_test_events(int events) handler.cb = ucm_mmap_event_test_callback; handler.arg = &data; data.out_events = events; + data.tid = ucs_get_tid(); ucm_event_handler_add(&handler); ucm_fire_mmap_events_internal(events, &data); @@ -202,27 +270,36 @@ static ucs_status_t ucm_mmap_test_events(int events) /* Return success if we caught all wanted events */ if (!ucs_test_all_flags(data.out_events, events)) { + ucm_mmap_event_report_missing(events, data.out_events, event_type); return UCS_ERR_UNSUPPORTED; } return UCS_OK; } -ucs_status_t ucm_mmap_test_installed_events(int events) +ucs_status_t ucm_mmap_test_events(int events, const char *event_type) { ucs_status_t status; /* - * return UCS_OK iff all installed events are actually working - * we don't check the status of events which were not successfully installed + * return UCS_OK iff all events are actually working */ pthread_mutex_lock(&ucm_mmap_install_mutex); - status = ucm_mmap_test_events(events & ucm_mmap_installed_events); + status = ucm_mmap_test_events_nolock(events, event_type); 
pthread_mutex_unlock(&ucm_mmap_install_mutex); return status; } +ucs_status_t ucm_mmap_test_installed_events(int events) +{ + /* + * return UCS_OK iff all installed events are actually working + * we don't check the status of events which were not successfully installed + */ + return ucm_mmap_test_events(events & ucm_mmap_installed_events, "internal"); +} + /* Called with lock held */ static ucs_status_t ucs_mmap_install_reloc(int events) { @@ -269,36 +346,56 @@ static ucs_status_t ucs_mmap_install_reloc(int events) return UCS_OK; } +static int ucm_mmap_events_to_native_events(int events) +{ + int native_events; + + native_events = events & ~(UCM_EVENT_MEM_TYPE_ALLOC | + UCM_EVENT_MEM_TYPE_FREE); + + if (events & UCM_EVENT_VM_MAPPED) { + native_events |= UCM_NATIVE_EVENT_VM_MAPPED; + } + if (events & UCM_EVENT_VM_UNMAPPED) { + native_events |= UCM_NATIVE_EVENT_VM_UNMAPPED; + } + + return native_events; +} + ucs_status_t ucm_mmap_install(int events) { ucs_status_t status; + int native_events; pthread_mutex_lock(&ucm_mmap_install_mutex); - if (ucs_test_all_flags(ucm_mmap_installed_events, events)) { + /* Replace aggregate events with the native events which make them */ + native_events = ucm_mmap_events_to_native_events(events); + if (ucs_test_all_flags(ucm_mmap_installed_events, native_events)) { /* if we already installed these events, check that they are still * working, and if not - reinstall them. 
*/ - status = ucm_mmap_test_events(events); + status = ucm_mmap_test_events_nolock(native_events, 0); if (status == UCS_OK) { goto out_unlock; } } - status = ucs_mmap_install_reloc(events); + status = ucs_mmap_install_reloc(native_events); if (status != UCS_OK) { ucm_debug("failed to install relocations for mmap"); goto out_unlock; } - status = ucm_mmap_test_events(events); + status = ucm_mmap_test_events_nolock(native_events, 0); if (status != UCS_OK) { ucm_debug("failed to install mmap events"); goto out_unlock; } /* status == UCS_OK */ - ucm_mmap_installed_events |= events; + ucm_mmap_installed_events |= native_events; ucm_debug("mmap installed events = 0x%x", ucm_mmap_installed_events); out_unlock: diff --git a/src/ucm/mmap/mmap.h b/src/ucm/mmap/mmap.h index 58252de7dfc..ed90a801238 100644 --- a/src/ucm/mmap/mmap.h +++ b/src/ucm/mmap/mmap.h @@ -35,14 +35,19 @@ void *ucm_brk_syscall(void *addr); int ucm_override_madvise(void *addr, size_t length, int advice); void ucm_fire_mmap_events(int events); ucs_status_t ucm_mmap_test_installed_events(int events); +ucs_status_t ucm_mmap_test_events(int events, const char *event_type); static UCS_F_ALWAYS_INLINE ucm_mmap_hook_mode_t ucm_mmap_hook_mode(void) { +#ifdef __SANITIZE_ADDRESS__ + return UCM_MMAP_HOOK_NONE; +#else if (RUNNING_ON_VALGRIND && (ucm_global_opts.mmap_hook_mode == UCM_MMAP_HOOK_BISTRO)) { return UCM_MMAP_HOOK_RELOC; } return ucm_global_opts.mmap_hook_mode; +#endif } #endif diff --git a/src/ucm/ptmalloc286/malloc.c b/src/ucm/ptmalloc286/malloc.c index 17f4f39f74f..e1779967885 100644 --- a/src/ucm/ptmalloc286/malloc.c +++ b/src/ucm/ptmalloc286/malloc.c @@ -2222,7 +2222,7 @@ typedef unsigned int flag_t; /* The type of various bit flag sets */ ((MCHUNK_SIZE + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK) /* conversion from malloc headers to user pointers, and back */ -#define chunk2mem(p) ((void*)((char*)(p) + TWO_SIZE_T_SIZES)) +#define chunk2mem(p) ((void*)((long)(p) + TWO_SIZE_T_SIZES)) #define mem2chunk(mem) 
((mchunkptr)((char*)(mem) - TWO_SIZE_T_SIZES)) /* chunk associated with aligned address A */ #define align_as_chunk(A) (mchunkptr)((A) + align_offset(chunk2mem(A))) @@ -5364,7 +5364,7 @@ size_t dlmalloc_set_footprint_limit(size_t bytes) { size_t result; /* invert sense of 0 */ if (bytes == 0) result = granularity_align(1); /* Use minimal size */ - if (bytes == MAX_SIZE_T) + else if (bytes == MAX_SIZE_T) result = 0; /* disable */ else result = granularity_align(bytes); diff --git a/src/ucm/rocm/Makefile.am b/src/ucm/rocm/Makefile.am index 0bf779bc970..47b9cc3e576 100644 --- a/src/ucm/rocm/Makefile.am +++ b/src/ucm/rocm/Makefile.am @@ -11,7 +11,11 @@ module_LTLIBRARIES = libucm_rocm.la libucm_rocm_la_CPPFLAGS = $(BASE_CPPFLAGS) $(ROCM_CPPFLAGS) libucm_rocm_la_CFLAGS = $(BASE_CFLAGS) $(ROCM_CFLAGS) libucm_rocm_la_LIBADD = ../libucm.la -libucm_rocm_la_LDFLAGS = $(UCM_MODULE_LDFLAGS) $(ROCM_LDFLAGS) \ +libucm_rocm_la_LDFLAGS = $(UCM_MODULE_LDFLAGS) \ + $(patsubst %, -Xlinker %, $(ROCM_LDFLAGS)) \ + $(patsubst %, -Xlinker %, -L$(ROCM_ROOT)/lib -rpath $(ROCM_ROOT)/hip/lib -rpath $(ROCM_ROOT)/lib) \ + $(patsubst %, -Xlinker %, --enable-new-dtags) \ + $(patsubst %, -Xlinker %, -rpath $(ROCM_ROOT)/lib64) \ -version-info $(SOVERSION) noinst_HEADERS = \ diff --git a/src/ucm/rocm/configure.m4 b/src/ucm/rocm/configure.m4 index 47b6d8b7501..02c520bc57c 100644 --- a/src/ucm/rocm/configure.m4 +++ b/src/ucm/rocm/configure.m4 @@ -6,5 +6,5 @@ # UCX_CHECK_ROCM -AS_IF([test "x$rocm_happy" = "xyes"], [ucm_modules+=":rocm"]) +AS_IF([test "x$rocm_happy" = "xyes"], [ucm_modules="${ucm_modules}:rocm"]) AC_CONFIG_FILES([src/ucm/rocm/Makefile]) diff --git a/src/ucm/rocm/rocmmem.c b/src/ucm/rocm/rocmmem.c index 5fca1dd52a9..f19b32e286b 100644 --- a/src/ucm/rocm/rocmmem.c +++ b/src/ucm/rocm/rocmmem.c @@ -34,7 +34,7 @@ UCM_OVERRIDE_FUNC(hsa_amd_memory_pool_free, hsa_status_t) #endif static UCS_F_ALWAYS_INLINE void -ucm_dispatch_mem_type_alloc(void *addr, size_t length, ucm_mem_type_t mem_type) 
+ucm_dispatch_mem_type_alloc(void *addr, size_t length, ucs_memory_type_t mem_type) { ucm_event_t event; @@ -45,7 +45,7 @@ ucm_dispatch_mem_type_alloc(void *addr, size_t length, ucm_mem_type_t mem_type) } static UCS_F_ALWAYS_INLINE void -ucm_dispatch_mem_type_free(void *addr, size_t length, ucm_mem_type_t mem_type) +ucm_dispatch_mem_type_free(void *addr, size_t length, ucs_memory_type_t mem_type) { ucm_event_t event; @@ -60,7 +60,7 @@ static void ucm_hsa_amd_memory_pool_free_dispatch_events(void *ptr) size_t size; hsa_status_t status; hsa_device_type_t dev_type; - int mem_type = UCM_MEM_TYPE_ROCM; + ucs_memory_type_t mem_type = UCS_MEMORY_TYPE_ROCM; hsa_amd_pointer_info_t info = { .size = sizeof(hsa_amd_pointer_info_t), }; @@ -86,7 +86,7 @@ static void ucm_hsa_amd_memory_pool_free_dispatch_events(void *ptr) } if (dev_type != HSA_DEVICE_TYPE_GPU) { - mem_type= UCM_MEM_TYPE_ROCM_MANAGED; + mem_type = UCS_MEMORY_TYPE_ROCM_MANAGED; } } @@ -113,16 +113,16 @@ hsa_status_t ucm_hsa_amd_memory_pool_allocate( hsa_amd_memory_pool_t memory_pool, size_t size, uint32_t flags, void** ptr) { + ucs_memory_type_t type = UCS_MEMORY_TYPE_ROCM; + uint32_t pool_flags = 0; hsa_status_t status; - uint32_t pool_flags = 0; - int type = UCM_MEM_TYPE_ROCM; status = hsa_amd_memory_pool_get_info(memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &pool_flags); if (status == HSA_STATUS_SUCCESS && !(pool_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) { - type = UCM_MEM_TYPE_ROCM_MANAGED; + type = UCS_MEMORY_TYPE_ROCM_MANAGED; } ucm_event_enter(); @@ -181,9 +181,20 @@ static ucs_status_t ucm_rocmmem_install(int events) return status; } +static void ucm_rocmmem_get_existing_alloc(ucm_event_handler_t *handler) +{ +} + +static ucm_event_installer_t ucm_rocm_initializer = { + .install = ucm_rocmmem_install, + .get_existing_alloc = ucm_rocmmem_get_existing_alloc, + .get_mem_type_current_device_info = NULL +}; + UCS_STATIC_INIT { - static ucm_event_installer_t rocm_initializer = { - .func = 
ucm_rocmmem_install - }; - ucs_list_add_tail(&ucm_event_installer_list, &rocm_initializer.list); + ucs_list_add_tail(&ucm_event_installer_list, &ucm_rocm_initializer.list); +} + +UCS_STATIC_CLEANUP { + ucs_list_del(&ucm_rocm_initializer.list); } diff --git a/src/ucm/util/log.c b/src/ucm/util/log.c index 464db9af280..ec41746a477 100644 --- a/src/ucm/util/log.c +++ b/src/ucm/util/log.c @@ -4,10 +4,15 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "log.h" #include "sys.h" #include +#include #include #include #include @@ -17,17 +22,18 @@ #include #include #include +#include +#define UCM_LOG_BUG_SIZE 512 -#define UCM_LOG_BUG_SIZE 256 - -static int ucm_log_fileno = 1; /* stdout */ -static char ucm_log_hostname[40] = {0}; +static int ucm_log_fileno = 1; /* stdout */ +static char ucm_log_hostname[HOST_NAME_MAX] = {0}; const char *ucm_log_level_names[] = { [UCS_LOG_LEVEL_FATAL] = "FATAL", [UCS_LOG_LEVEL_ERROR] = "ERROR", [UCS_LOG_LEVEL_WARN] = "WARN", + [UCS_LOG_LEVEL_DIAG] = "DIAG", [UCS_LOG_LEVEL_INFO] = "INFO", [UCS_LOG_LEVEL_DEBUG] = "DEBUG", [UCS_LOG_LEVEL_TRACE] = "TRACE", @@ -59,7 +65,7 @@ static char *ucm_log_ltoa(char *p, char *end, long n, int base, int flags, int pad) { static const char digits[] = "0123456789abcdef"; - long div; + long divider; if (((n < 0) || (flags & UCM_LOG_LTOA_FLAG_SIGN)) && (p < end)) { *(p++) = (n < 0 ) ? '-' : '+'; @@ -74,9 +80,9 @@ static char *ucm_log_ltoa(char *p, char *end, long n, int base, int flags, n = labs(n); - div = 1; - while ((n / div) != 0) { - div *= base; + divider = 1; + while ((n / divider) != 0) { + divider *= base; --pad; } @@ -85,10 +91,10 @@ static char *ucm_log_ltoa(char *p, char *end, long n, int base, int flags, (flags & UCM_LOG_LTOA_FLAG_PAD0) ? 
'0' : ' '); } - div /= base; - while ((p < end) && (div > 0)) { - *(p++) = digits[(n / div + base) % base]; - div /= base; + divider /= base; + while ((p < end) && (divider > 0)) { + *(p++) = digits[(n / divider + base) % base]; + divider /= base; } if (flags & UCM_LOG_LTOA_PAD_LEFT) { @@ -258,7 +264,7 @@ void __ucm_log(const char *file, unsigned line, const char *function, gettimeofday(&tv, NULL); ucm_log_snprintf(buf, UCM_LOG_BUG_SIZE - 1, "[%lu.%06lu] [%s:%d] %18s:%-4d UCX %s ", tv.tv_sec, tv.tv_usec, ucm_log_hostname, getpid(), - basename(file), line, ucm_log_level_names[level]); + ucs_basename(file), line, ucm_log_level_names[level]); buf[UCM_LOG_BUG_SIZE - 1] = '\0'; length = strlen(buf); diff --git a/src/ucm/util/log.h b/src/ucm/util/log.h index c69782341cc..9dcfd317406 100644 --- a/src/ucm/util/log.h +++ b/src/ucm/util/log.h @@ -16,7 +16,8 @@ #define ucm_log(_level, _message, ...) \ - if (((_level) <= UCS_MAX_LOG_LEVEL) && ((_level) <= ucm_global_opts.log_level)) { \ + if (((_level) <= UCS_MAX_LOG_LEVEL) && \ + ((_level) <= (int)ucm_global_opts.log_level)) { \ __ucm_log(__FILE__, __LINE__, __FUNCTION__, (_level), _message, \ ## __VA_ARGS__); \ } @@ -24,8 +25,9 @@ #define ucm_fatal(_message, ...) ucm_log(UCS_LOG_LEVEL_FATAL, _message, ## __VA_ARGS__) #define ucm_error(_message, ...) ucm_log(UCS_LOG_LEVEL_ERROR, _message, ## __VA_ARGS__) #define ucm_warn(_message, ...) ucm_log(UCS_LOG_LEVEL_WARN, _message, ## __VA_ARGS__) +#define ucm_diag(_message, ...) ucm_log(UCS_LOG_LEVEL_DIAG, _message, ## __VA_ARGS__) #define ucm_info(_message, ...) ucm_log(UCS_LOG_LEVEL_INFO, _message, ## __VA_ARGS__) -#define ucm_debug(_message, ...) ucm_log(UCS_LOG_LEVEL_DEBUG, _message, ## __VA_ARGS__) +#define ucm_debug(_message, ...) ucm_log(UCS_LOG_LEVEL_DEBUG, _message, ## __VA_ARGS__) #define ucm_trace(_message, ...) 
ucm_log(UCS_LOG_LEVEL_TRACE, _message, ## __VA_ARGS__) extern const char *ucm_log_level_names[]; diff --git a/src/ucm/util/reloc.c b/src/ucm/util/reloc.c index 921bdca5f45..1349497e48d 100644 --- a/src/ucm/util/reloc.c +++ b/src/ucm/util/reloc.c @@ -16,18 +16,26 @@ #include "reloc.h" +#include #include +#include +#include #include #include #include +#include +#include #include #include #include #include #include #include +#include +typedef void * (*ucm_reloc_dlopen_func_t)(const char *, int); +typedef int (*ucm_reloc_dlclose_func_t)(void *); typedef struct ucm_auxv { long type; @@ -36,17 +44,32 @@ typedef struct ucm_auxv { typedef struct ucm_reloc_dl_iter_context { - Dl_info def_dlinfo; ucm_reloc_patch_t *patch; ucs_status_t status; + ElfW(Addr) libucm_base_addr; /* Base address to store previous value */ } ucm_reloc_dl_iter_context_t; +/* Hash of symbols in a dynamic object */ +KHASH_MAP_INIT_STR(ucm_dl_symbol_hash, void*); + +/* Hash of loaded dynamic objects */ +typedef struct { + khash_t(ucm_dl_symbol_hash) symbols; +} ucm_dl_info_t; + +KHASH_MAP_INIT_INT64(ucm_dl_info_hash, ucm_dl_info_t) + /* List of patches to be applied to additional libraries */ static UCS_LIST_HEAD(ucm_reloc_patch_list); -static void * (*ucm_reloc_orig_dlopen)(const char *, int) = NULL; static pthread_mutex_t ucm_reloc_patch_list_lock = PTHREAD_MUTEX_INITIALIZER; +static khash_t(ucm_dl_info_hash) ucm_dl_info_hash; +static ucm_reloc_dlopen_func_t ucm_reloc_orig_dlopen = NULL; +static ucm_reloc_dlclose_func_t ucm_reloc_orig_dlclose = NULL; + +/* forward declaration */ +static void ucm_reloc_get_orig_dl_funcs(); static uintptr_t ucm_reloc_get_entry(ElfW(Addr) base, const ElfW(Phdr) *dphdr, ElfW(Sxword) tag) @@ -71,7 +94,7 @@ static void ucm_reloc_file_lock(int fd, int l_type) } } -static int ucm_reloc_get_aux_phsize() +static ucs_status_t ucm_reloc_get_aux_phsize(int *phsize_p) { #define UCM_RELOC_AUXV_BUF_LEN 16 static const char *proc_auxv_filename = "/proc/self/auxv"; @@ -84,251 +107,578 
@@ static int ucm_reloc_get_aux_phsize() int fd; /* Can avoid lock here - worst case we'll read the file more than once */ - if (phsize == 0) { - fd = open(proc_auxv_filename, O_RDONLY); - if (fd < 0) { - ucm_error("failed to open '%s' for reading: %m", proc_auxv_filename); - return fd; - } + if (phsize != 0) { + *phsize_p = phsize; + return UCS_OK; + } - if (RUNNING_ON_VALGRIND) { - /* Work around a bug caused by valgrind's fake /proc/self/auxv - - * every time this file is opened when running with valgrind, a - * a duplicate of the same fd is returned, so all share the same - * file offset. - */ - ucm_reloc_file_lock(fd, F_WRLCK); - lseek(fd, 0, SEEK_SET); + fd = open(proc_auxv_filename, O_RDONLY); + if (fd < 0) { + ucm_error("failed to open '%s' for reading: %m", proc_auxv_filename); + return UCS_ERR_IO_ERROR; + } + + if (RUNNING_ON_VALGRIND) { + /* Work around a bug caused by valgrind's fake /proc/self/auxv - + * every time this file is opened when running with valgrind, a + * a duplicate of the same fd is returned, so all share the same + * file offset. 
+ */ + ucm_reloc_file_lock(fd, F_WRLCK); + lseek(fd, 0, SEEK_SET); + } + + /* Use small buffer on the stack, avoid using malloc() */ + found = 0; + do { + nread = read(fd, buffer, sizeof(buffer)); + if (nread < 0) { + ucm_error("failed to read %lu bytes from %s (ret=%ld): %m", + sizeof(buffer), proc_auxv_filename, nread); + break; } - /* Use small buffer on the stack, avoid using malloc() */ - found = 0; - do { - nread = read(fd, buffer, sizeof(buffer)); - if (nread < 0) { - ucm_error("failed to read %lu bytes from %s (ret=%ld): %m", - sizeof(buffer), proc_auxv_filename, nread); + count = nread / sizeof(buffer[0]); + for (auxv = buffer; (auxv < (buffer + count)) && (auxv->type != AT_NULL); + ++auxv) + { + if ((auxv->type == AT_PHENT) && (auxv->value > 0)) { + found = 1; + phsize = auxv->value; + ucm_debug("read phent from %s: %d", proc_auxv_filename, phsize); + if (phsize == 0) { + ucm_error("phsize is 0"); + } break; } + } + } while ((count > 0) && !found); - count = nread / sizeof(buffer[0]); - for (auxv = buffer; (auxv < buffer + count) && (auxv->type != AT_NULL); - ++auxv) - { - if (auxv->type == AT_PHENT) { - found = 1; - phsize = auxv->value; - ucm_debug("read phent from %s: %d", proc_auxv_filename, phsize); - if (phsize == 0) { - ucm_error("phsize is 0"); - } - break; - } - } - } while ((count > 0) && (phsize == 0)); + if (RUNNING_ON_VALGRIND) { + ucm_reloc_file_lock(fd, F_UNLCK); + } + close(fd); - if (!found) { - ucm_error("AT_PHENT entry not found in %s", proc_auxv_filename); - } + if (!found) { + ucm_error("AT_PHENT entry not found in %s", proc_auxv_filename); + return UCS_ERR_NO_ELEM; + } + + *phsize_p = phsize; + return UCS_OK; +} - if (RUNNING_ON_VALGRIND) { - ucm_reloc_file_lock(fd, F_UNLCK); +ElfW(Rela) *ucm_reloc_find_sym(void *table, size_t table_size, const char *symbol, + void *strtab, ElfW(Sym) *symtab) +{ + ElfW(Rela) *reloc; + char *elf_sym; + + for (reloc = table; + (void*)reloc < UCS_PTR_BYTE_OFFSET(table, table_size); + ++reloc) { + 
elf_sym = (char*)strtab + symtab[ELF64_R_SYM(reloc->r_info)].st_name; + if (!strcmp(symbol, elf_sym)) { + return reloc; } - close(fd); } - return phsize; + return NULL; } static ucs_status_t -ucm_reloc_modify_got(ElfW(Addr) base, const ElfW(Phdr) *phdr, const char *phname, - int phnum, int phsize, - const ucm_reloc_dl_iter_context_t *ctx) +ucm_reloc_dl_apply_patch(const ucm_dl_info_t *dl_info, const char *dl_basename, + int store_prev, ucm_reloc_patch_t *patch) { - ElfW(Phdr) *dphdr; - ElfW(Rela) *reloc; - ElfW(Sym) *symtab; - void *jmprel, *strtab; - size_t pltrelsz; + void *prev_value; + khiter_t khiter; long page_size; - char *elf_sym; void **entry; void *page; int ret; - int i; - Dl_info entry_dlinfo; - int success; + /* find symbol in our hash table */ + khiter = kh_get(ucm_dl_symbol_hash, &dl_info->symbols, patch->symbol); + if (khiter == kh_end(&dl_info->symbols)) { + ucm_trace("symbol '%s' not found in %s", patch->symbol, dl_basename); + return UCS_OK; + } + + /* get entry address from hash table */ + entry = kh_val(&dl_info->symbols, khiter); + prev_value = *entry; + + if (prev_value == patch->value) { + ucm_trace("symbol '%s' in %s at [%p] up-to-date", patch->symbol, + dl_basename, entry); + return UCS_OK; + } + + /* enable writing to the page */ page_size = ucm_get_page_size(); + page = ucs_align_down_pow2_ptr(entry, page_size); + ret = mprotect(page, page_size, PROT_READ|PROT_WRITE); + if (ret < 0) { + ucm_error("failed to modify %s page %p to rw: %m", dl_basename, page); + return UCS_ERR_UNSUPPORTED; + } + + /* modify the relocation to the new value */ + *entry = patch->value; + ucm_debug("symbol '%s' in %s at [%p] modified from %p to %p", + patch->symbol, dl_basename, entry, prev_value, patch->value); + + /* store default entry to prev_value to guarantee valid pointers + * throughout life time of the process */ + if (store_prev) { + patch->prev_value = prev_value; + ucm_debug("'%s' prev_value is %p", patch->symbol, prev_value); + } + + return UCS_OK; 
+} + +static unsigned +ucm_dl_populate_symbols(ucm_dl_info_t *dl_info, uintptr_t dlpi_addr, void *table, + size_t table_size, void *strtab, ElfW(Sym) *symtab, + const char *dl_name) +{ + ElfW(Rela) *reloc; + khiter_t khiter; + unsigned count; + char *elf_sym; + int ret; + + count = 0; + for (reloc = table; (void*)reloc < UCS_PTR_BYTE_OFFSET(table, table_size); + ++reloc) { + elf_sym = (char*)strtab + symtab[ELF64_R_SYM(reloc->r_info)].st_name; + if (*elf_sym == '\0') { + /* skip empty symbols */ + continue; + } + + khiter = kh_put(ucm_dl_symbol_hash, &dl_info->symbols, elf_sym, &ret); + if ((ret == UCS_KH_PUT_BUCKET_EMPTY) || + (ret == UCS_KH_PUT_BUCKET_CLEAR)) { + /* do not override previous values */ + kh_val(&dl_info->symbols, khiter) = (void*)(dlpi_addr + + reloc->r_offset); + ++count; + } else if (ret == UCS_KH_PUT_KEY_PRESENT) { + ucm_trace("ignoring duplicate symbol '%s' in %s", elf_sym, dl_name); + } else { + ucm_debug("failed to add symbol '%s' in %s", elf_sym, dl_name); + } + } + + return count; +} + +static ucs_status_t ucm_reloc_dl_info_get(const struct dl_phdr_info *phdr_info, + const char *dl_name, + const ucm_dl_info_t **dl_info_p) +{ + uintptr_t dlpi_addr = phdr_info->dlpi_addr; + unsigned UCS_V_UNUSED num_symbols; + void *jmprel, *rela, *strtab; + size_t pltrelsz, relasz; + ucm_dl_info_t *dl_info; + ucs_status_t status; + ElfW(Phdr) *dphdr; + ElfW(Sym) *symtab; + khiter_t khiter; + int i, ret; + int phsize; + + status = ucm_reloc_get_aux_phsize(&phsize); + if (status != UCS_OK) { + return status; + } + + khiter = kh_put(ucm_dl_info_hash, &ucm_dl_info_hash, dlpi_addr, &ret); + if (ret == UCS_KH_PUT_FAILED) { + ucm_error("failed to add dl info hash entry"); + return UCS_ERR_NO_MEMORY; + } + + dl_info = &kh_val(&ucm_dl_info_hash, khiter); + if (ret == UCS_KH_PUT_KEY_PRESENT) { + /* exists */ + goto out; + } + + kh_init_inplace(ucm_dl_symbol_hash, &dl_info->symbols); /* find PT_DYNAMIC */ dphdr = NULL; - for (i = 0; i < phnum; ++i) { - dphdr = 
(void*)phdr + phsize * i; + for (i = 0; i < phdr_info->dlpi_phnum; ++i) { + dphdr = UCS_PTR_BYTE_OFFSET(phdr_info->dlpi_phdr, phsize * i); if (dphdr->p_type == PT_DYNAMIC) { break; } } if (dphdr == NULL) { - return UCS_ERR_NO_ELEM; + /* No dynamic section */ + ucm_debug("%s has no dynamic section - skipping", dl_name); + goto out; } /* Get ELF tables pointers */ - jmprel = (void*)ucm_reloc_get_entry(base, dphdr, DT_JMPREL); - symtab = (void*)ucm_reloc_get_entry(base, dphdr, DT_SYMTAB); - strtab = (void*)ucm_reloc_get_entry(base, dphdr, DT_STRTAB); - pltrelsz = ucm_reloc_get_entry(base, dphdr, DT_PLTRELSZ); + symtab = (void*)ucm_reloc_get_entry(dlpi_addr, dphdr, DT_SYMTAB); + strtab = (void*)ucm_reloc_get_entry(dlpi_addr, dphdr, DT_STRTAB); + if ((symtab == NULL) || (strtab == NULL)) { + /* no DT_SYMTAB or DT_STRTAB sections are defined */ + ucm_debug("%s has no dynamic symbols - skipping", dl_name); + goto out; + } - /* Find matching symbol and replace it */ - for (reloc = jmprel; (void*)reloc < jmprel + pltrelsz; ++reloc) { - elf_sym = (char*)strtab + symtab[ELF64_R_SYM(reloc->r_info)].st_name; - if (!strcmp(ctx->patch->symbol, elf_sym)) { - entry = (void *)(base + reloc->r_offset); + num_symbols = 0; - ucm_trace("'%s' entry in '%s' is at %p", ctx->patch->symbol, - basename(phname), entry); + /* populate .got.plt */ + jmprel = (void*)ucm_reloc_get_entry(dlpi_addr, dphdr, DT_JMPREL); + if (jmprel != NULL) { + pltrelsz = ucm_reloc_get_entry(dlpi_addr, dphdr, DT_PLTRELSZ); + num_symbols += ucm_dl_populate_symbols(dl_info, dlpi_addr, jmprel, + pltrelsz, strtab, symtab, dl_name); + } - page = (void *)((intptr_t)entry & ~(page_size - 1)); - ret = mprotect(page, page_size, PROT_READ|PROT_WRITE); - if (ret < 0) { - ucm_error("failed to modify GOT page %p to rw: %m", page); - return UCS_ERR_UNSUPPORTED; - } + /* populate .got */ + rela = (void*)ucm_reloc_get_entry(dlpi_addr, dphdr, DT_RELA); + if (rela != NULL) { + relasz = ucm_reloc_get_entry(dlpi_addr, dphdr, DT_RELASZ); + 
num_symbols += ucm_dl_populate_symbols(dl_info, dlpi_addr, rela, relasz, + strtab, symtab, dl_name); + } - success = dladdr(*entry, &entry_dlinfo); - ucs_assertv_always(success, "can't find shared object with entry %p", - *entry); + ucm_debug("added dl_info %p for %s with %u symbols", dl_info, + ucs_basename(dl_name), num_symbols); - /* store default entry to prev_value to guarantee valid pointers - * throughout life time of the process */ - if (ctx->def_dlinfo.dli_fbase == entry_dlinfo.dli_fbase) { - ctx->patch->prev_value = *entry; - ucm_trace("'%s' prev_value is %p from '%s'", ctx->patch->symbol, - *entry, basename(entry_dlinfo.dli_fname)); - } +out: + *dl_info_p = dl_info; + return UCS_OK; +} - *entry = ctx->patch->value; - break; - } +static void ucm_reloc_dl_info_cleanup(ElfW(Addr) dlpi_addr, const char *dl_name) +{ + ucm_dl_info_t *dl_info; + khiter_t khiter; + + khiter = kh_get(ucm_dl_info_hash, &ucm_dl_info_hash, dlpi_addr); + if (khiter == kh_end(&ucm_dl_info_hash)) { + ucm_debug("no dl_info entry for address 0x%lx", dlpi_addr); + return; } - return UCS_OK; + /* destroy symbols hash table */ + dl_info = &kh_val(&ucm_dl_info_hash, khiter); + kh_destroy_inplace(ucm_dl_symbol_hash, &dl_info->symbols); + + /* delete entry in dl_info hash */ + kh_del(ucm_dl_info_hash, &ucm_dl_info_hash, khiter); + + ucm_debug("removed dl_info %p for %s", dl_info, ucs_basename(dl_name)); } -static int ucm_reloc_phdr_iterator(struct dl_phdr_info *info, size_t size, void *data) +static int +ucm_reloc_patch_is_dl_blacklisted(const char *dlpi_name, + const ucm_reloc_patch_t *patch) { - ucm_reloc_dl_iter_context_t *ctx = data; - int phsize; - int i; + unsigned i; - /* check if module is black-listed for this patch */ - if (ctx->patch->blacklist) { - for (i = 0; ctx->patch->blacklist[i]; i++) { - if (strstr(info->dlpi_name, ctx->patch->blacklist[i])) { - /* module is black-listed */ - ctx->status = UCS_OK; - return 0; - } - } + if (patch->blacklist == NULL) { + return 0; } - phsize = 
ucm_reloc_get_aux_phsize(); - if (phsize <= 0) { - ucm_error("failed to read phent size"); - ctx->status = UCS_ERR_UNSUPPORTED; - return -1; + for (i = 0; patch->blacklist[i] != NULL; i++) { + if (strstr(dlpi_name, patch->blacklist[i])) { + return 1; + } } - ctx->status = ucm_reloc_modify_got(info->dlpi_addr, info->dlpi_phdr, - info->dlpi_name, info->dlpi_phnum, - phsize, ctx); - if (ctx->status == UCS_OK) { - return 0; /* continue iteration and patch all objects */ + return 0; +} + +static const char* +ucm_reloc_get_dl_name(const char *dlpi_name, ElfW(Addr) dlpi_addr, char *buf, + size_t max) +{ + if (strcmp(dlpi_name, "")) { + return dlpi_name; } else { + snprintf(buf, max, "(anonymous dl @ 0x%lx)", dlpi_addr); + return buf; + } +} + +static int ucm_reloc_phdr_iterator(struct dl_phdr_info *phdr_info, size_t size, + void *data) +{ + ucm_reloc_dl_iter_context_t *ctx = data; + const ucm_dl_info_t *dl_info; + char dl_name_buffer[256]; + const char *dl_name; + int store_prev; + + /* check if shared object is black-listed for this patch */ + if (ucm_reloc_patch_is_dl_blacklisted(phdr_info->dlpi_name, ctx->patch)) { + return 0; + } + + dl_name = ucm_reloc_get_dl_name(phdr_info->dlpi_name, phdr_info->dlpi_addr, + dl_name_buffer, sizeof(dl_name_buffer)); + + ctx->status = ucm_reloc_dl_info_get(phdr_info, dl_name, &dl_info); + if (ctx->status != UCS_OK) { return -1; /* stop iteration if got a real error */ } + + store_prev = phdr_info->dlpi_addr == ctx->libucm_base_addr; + ctx->status = ucm_reloc_dl_apply_patch(dl_info, ucs_basename(dl_name), + store_prev, ctx->patch); + if (ctx->status != UCS_OK) { + return -1; /* stop iteration if got a real error */ + } + + /* Continue iteration and patch all remaining objects. 
*/ + return 0; } /* called with lock held */ -static ucs_status_t ucm_reloc_apply_patch(ucm_reloc_patch_t *patch) +static ucs_status_t ucm_reloc_apply_patch(ucm_reloc_patch_t *patch, + ElfW(Addr) libucm_base_addr) { ucm_reloc_dl_iter_context_t ctx; - int success; - - /* Find default shared object, usually libc */ - success = dladdr(getpid, &ctx.def_dlinfo); - if (!success) { - return UCS_ERR_UNSUPPORTED; - } - ctx.patch = patch; - ctx.status = UCS_OK; + ctx.patch = patch; + ctx.status = UCS_OK; + ctx.libucm_base_addr = libucm_base_addr; /* Avoid locks here because we don't modify ELF data structures. * Worst case the same symbol will be written more than once. */ + ucm_trace("patch symbol '%s'", patch->symbol); (void)dl_iterate_phdr(ucm_reloc_phdr_iterator, &ctx); - if (ctx.status == UCS_OK) { - ucm_debug("modified '%s' from %p to %p", patch->symbol, - patch->prev_value, patch->value); - } return ctx.status; } -static void *ucm_dlopen(const char *filename, int flag) +/* read serinfo from 'module_path', result buffer must be destroyed + * by free() call */ +static Dl_serinfo *ucm_dlopen_load_serinfo(const char *module_path) +{ + Dl_serinfo *serinfo = NULL; + Dl_serinfo serinfo_size; + void *module; + int res; + + module = ucm_reloc_orig_dlopen(module_path, RTLD_LAZY); + if (module == NULL) { /* requested module can't be loaded */ + ucm_debug("failed to open %s: %s", module_path, dlerror()); + return NULL; + } + + /* try to get search info from requested module */ + res = dlinfo(module, RTLD_DI_SERINFOSIZE, &serinfo_size); + if (res) { + ucm_debug("dlinfo(RTLD_DI_SERINFOSIZE) failed"); + goto close_module; + } + + serinfo = malloc(serinfo_size.dls_size); + if (serinfo == NULL) { + ucm_error("failed to allocate %zu bytes for Dl_serinfo", + serinfo_size.dls_size); + goto close_module; + } + + *serinfo = serinfo_size; + res = dlinfo(module, RTLD_DI_SERINFO, serinfo); + if (res) { + ucm_debug("dlinfo(RTLD_DI_SERINFO) failed"); + free(serinfo); + serinfo = NULL; + } + 
+close_module: + ucm_reloc_orig_dlclose(module); + return serinfo; +} + +void *ucm_dlopen(const char *filename, int flag) { - ucm_reloc_patch_t *patch; void *handle; + ucm_reloc_patch_t *patch; + Dl_serinfo *serinfo; + Dl_info dl_info; + int res; + int i; + char file_path[PATH_MAX]; + struct stat file_stat; - if (ucm_reloc_orig_dlopen == NULL) { - ucm_fatal("ucm_reloc_orig_dlopen is NULL"); - return NULL; + ucm_debug("open module: %s, flag: %x", filename, flag); + + ucm_reloc_get_orig_dl_funcs(); + + if (!ucm_global_opts.dlopen_process_rpath) { + goto fallback_load_lib; + } + + if (filename == NULL) { + /* return handle to main program */ + goto fallback_load_lib; + } + + /* failed to open module directly, try to use RPATH from from caller + * to locate requested module */ + if (filename[0] == '/') { /* absolute path - fallback to legacy mode */ + goto fallback_load_lib; + } + + /* try to get module info */ + res = dladdr(__builtin_return_address(0), &dl_info); + if (!res) { + ucm_debug("dladdr failed"); + goto fallback_load_lib; + } + + serinfo = ucm_dlopen_load_serinfo(dl_info.dli_fname); + if (serinfo == NULL) { + /* failed to load serinfo, try just dlopen */ + goto fallback_load_lib; } + for (i = 0; i < serinfo->dls_cnt; i++) { + ucm_concat_path(file_path, sizeof(file_path), + serinfo->dls_serpath[i].dls_name, filename); + ucm_debug("check for %s", file_path); + + res = stat(file_path, &file_stat); + if (res) { + continue; + } + + free(serinfo); + /* ok, file exists, let's try to load it */ + handle = ucm_reloc_orig_dlopen(file_path, flag); + if (handle == NULL) { + return NULL; + } + + goto out_apply_patches; + } + + free(serinfo); + /* ok, we can't lookup module in dirs listed in caller module, + * let's fallback to legacy mode */ +fallback_load_lib: handle = ucm_reloc_orig_dlopen(filename, flag); - if (handle != NULL) { + if (handle == NULL) { + return NULL; + } + +out_apply_patches: + /* + * Every time a new shared object is loaded, we must update its 
relocations + * with our list of patches (including dlopen itself). We have to go over + * the entire list of shared objects, since there more objects could be + * loaded due to dependencies. + */ + + ucm_trace("dlopen(%s) = %p", filename, handle); + + pthread_mutex_lock(&ucm_reloc_patch_list_lock); + ucs_list_for_each(patch, &ucm_reloc_patch_list, list) { + ucm_debug("in dlopen(%s), re-applying '%s' to %p", filename, + patch->symbol, patch->value); + ucm_reloc_apply_patch(patch, 0); + } + pthread_mutex_unlock(&ucm_reloc_patch_list_lock); + + return handle; +} + +static int ucm_dlclose(void *handle) +{ + struct link_map *lm_entry; + char dl_name_buffer[256]; + const char *dl_name; + int ret; + + ucm_trace("dlclose(%p)", handle); + + ret = dlinfo(handle, RTLD_DI_LINKMAP, &lm_entry); + if (ret != 0) { + ucm_warn("dlinfo(handle=%p) failed during dlclose() hook, symbol" + "table may become unreliable", handle); + } else { /* - * Every time a new object is loaded, we must update its relocations - * with our list of patches (including dlopen itself). This code is less - * efficient and will modify all existing objects every time, but good - * enough. + * Cleanup the cached information about the library. + * NOTE: The library may not actually be unloaded (if its reference + * count is > 1). Since we have no safe way to know it, we remove the + * cached information anyway, and it may be re-added on the next call to + * ucm_reloc_apply_patch(). 
*/ + dl_name = ucm_reloc_get_dl_name(lm_entry->l_name, lm_entry->l_addr, + dl_name_buffer, sizeof(dl_name_buffer)); pthread_mutex_lock(&ucm_reloc_patch_list_lock); - ucs_list_for_each(patch, &ucm_reloc_patch_list, list) { - ucm_debug("in dlopen(%s), re-applying '%s' to %p", filename, - patch->symbol, patch->value); - ucm_reloc_apply_patch(patch); - } + ucm_reloc_dl_info_cleanup(lm_entry->l_addr, dl_name); pthread_mutex_unlock(&ucm_reloc_patch_list_lock); } - return handle; + + ucm_reloc_get_orig_dl_funcs(); + + return ucm_reloc_orig_dlclose(handle); } -static ucm_reloc_patch_t ucm_reloc_dlopen_patch = { - .symbol = "dlopen", - .value = ucm_dlopen +static ucm_reloc_patch_t ucm_dlopen_reloc_patches[] = { + { .symbol = "dlopen", .value = ucm_dlopen }, + { .symbol = "dlclose", .value = ucm_dlclose } }; +static void ucm_reloc_get_orig_dl_funcs() +{ + ucm_reloc_patch_t *patch; + + /* pointer to original dlopen() */ + if (ucm_reloc_orig_dlopen == NULL) { + patch = &ucm_dlopen_reloc_patches[0]; + ucm_reloc_orig_dlopen = (ucm_reloc_dlopen_func_t) + ucm_reloc_get_orig(patch->symbol, patch->value); + if (ucm_reloc_orig_dlopen == NULL) { + ucm_fatal("ucm_reloc_orig_dlopen is NULL"); + } + } + + /* pointer to original dlclose() */ + if (ucm_reloc_orig_dlclose == NULL) { + patch = &ucm_dlopen_reloc_patches[1]; + ucm_reloc_orig_dlclose = (ucm_reloc_dlclose_func_t) + ucm_reloc_get_orig(patch->symbol, patch->value); + if (ucm_reloc_orig_dlclose == NULL) { + ucm_fatal("ucm_reloc_orig_dlclose is NULL"); + } + } +} /* called with lock held */ -static ucs_status_t ucm_reloc_install_dlopen() +static ucs_status_t ucm_reloc_install_dl_hooks() { static int installed = 0; ucs_status_t status; + size_t i; if (installed) { return UCS_OK; } - ucm_reloc_orig_dlopen = ucm_reloc_get_orig(ucm_reloc_dlopen_patch.symbol, - ucm_reloc_dlopen_patch.value); + for (i = 0; i < ucs_array_size(ucm_dlopen_reloc_patches); ++i) { + status = ucm_reloc_apply_patch(&ucm_dlopen_reloc_patches[i], 0); + if (status 
!= UCS_OK) { + return status; + } - status = ucm_reloc_apply_patch(&ucm_reloc_dlopen_patch); - if (status != UCS_OK) { - return status; + ucs_list_add_tail(&ucm_reloc_patch_list, &ucm_dlopen_reloc_patches[i].list); } - ucs_list_add_tail(&ucm_reloc_patch_list, &ucm_reloc_dlopen_patch.list); - installed = 1; return UCS_OK; } @@ -336,18 +686,29 @@ static ucs_status_t ucm_reloc_install_dlopen() ucs_status_t ucm_reloc_modify(ucm_reloc_patch_t *patch) { ucs_status_t status; + Dl_info dl_info; + int ret; + + ucm_reloc_get_orig_dl_funcs(); + + /* Take default symbol value from the current library */ + ret = dladdr(ucm_reloc_modify, &dl_info); + if (!ret) { + ucm_error("dladdr() failed to query current library"); + return UCS_ERR_UNSUPPORTED; + } /* Take lock first to handle a possible race where dlopen() is called * from another thread and we may end up not patching it. */ pthread_mutex_lock(&ucm_reloc_patch_list_lock); - status = ucm_reloc_install_dlopen(); + status = ucm_reloc_install_dl_hooks(); if (status != UCS_OK) { goto out_unlock; } - status = ucm_reloc_apply_patch(patch); + status = ucm_reloc_apply_patch(patch, (uintptr_t)dl_info.dli_fbase); if (status != UCS_OK) { goto out_unlock; } @@ -359,3 +720,6 @@ ucs_status_t ucm_reloc_modify(ucm_reloc_patch_t *patch) return status; } +UCS_STATIC_INIT { + kh_init_inplace(ucm_dl_info_hash, &ucm_dl_info_hash); +} diff --git a/src/ucm/util/reloc.h b/src/ucm/util/reloc.h index 36acdbc22e5..285cfaac2a8 100644 --- a/src/ucm/util/reloc.h +++ b/src/ucm/util/reloc.h @@ -46,7 +46,7 @@ ucs_status_t ucm_reloc_modify(ucm_reloc_patch_t* patch); * If the replacement function is defined in a loadbale module, the symbols it * imports from other libraries may not be available in global scope. * - * @param [in] symbol Symbol name, + * @param [in] symbol Symbol name. * @param [in] replacement Symbol replacement, which should be ignored. * * @return Original function pointer for 'symbol'. 
diff --git a/src/ucm/util/replace.c b/src/ucm/util/replace.c index 325ebb75c71..6d8abae9405 100644 --- a/src/ucm/util/replace.c +++ b/src/ucm/util/replace.c @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -20,15 +21,31 @@ #include #include - +#ifndef MAP_FAILED #define MAP_FAILED ((void*)-1) +#endif +#ifdef PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP pthread_mutex_t ucm_reloc_get_orig_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; -pthread_t volatile ucm_reloc_get_orig_thread = -1; +#else +pthread_mutex_t ucm_reloc_get_orig_lock; +static void ucm_reloc_get_orig_lock_init(void) __attribute__((constructor(101))); +static void ucm_reloc_get_orig_lock_init(void) +{ + pthread_mutexattr_t attr; + + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + pthread_mutex_init(&ucm_reloc_get_orig_lock, &attr); +} +#endif +pthread_t volatile ucm_reloc_get_orig_thread = (pthread_t)-1; UCM_DEFINE_REPLACE_FUNC(mmap, void*, MAP_FAILED, void*, size_t, int, int, int, off_t) UCM_DEFINE_REPLACE_FUNC(munmap, int, -1, void*, size_t) +#if HAVE_MREMAP UCM_DEFINE_REPLACE_FUNC(mremap, void*, MAP_FAILED, void*, size_t, size_t, int) +#endif UCM_DEFINE_REPLACE_FUNC(shmat, void*, MAP_FAILED, int, const void*, int) UCM_DEFINE_REPLACE_FUNC(shmdt, int, -1, const void*) UCM_DEFINE_REPLACE_FUNC(sbrk, void*, MAP_FAILED, intptr_t) @@ -37,7 +54,9 @@ UCM_DEFINE_REPLACE_FUNC(madvise, int, -1, void*, size_t, int) UCM_DEFINE_SELECT_FUNC(mmap, void*, MAP_FAILED, SYS_mmap, void*, size_t, int, int, int, off_t) UCM_DEFINE_SELECT_FUNC(munmap, int, -1, SYS_munmap, void*, size_t) +#if HAVE_MREMAP UCM_DEFINE_SELECT_FUNC(mremap, void*, MAP_FAILED, SYS_mremap, void*, size_t, size_t, int) +#endif UCM_DEFINE_SELECT_FUNC(madvise, int, -1, SYS_madvise, void*, size_t, int) #if UCM_BISTRO_HOOKS @@ -133,7 +152,7 @@ void *ucm_orig_sbrk(intptr_t increment) return ucm_orig_dlsym_sbrk(increment); } else { prev = ucm_brk_syscall(0); - return ucm_orig_brk(prev + increment) ? 
(void*)-1 : prev; + return ucm_orig_brk(UCS_PTR_BYTE_OFFSET(prev, increment)) ? (void*)-1 : prev; } } diff --git a/src/ucm/util/replace.h b/src/ucm/util/replace.h index 72b5be68f29..4b91b037d99 100644 --- a/src/ucm/util/replace.h +++ b/src/ucm/util/replace.h @@ -37,7 +37,7 @@ extern pthread_t volatile ucm_reloc_get_orig_thread; ucm_trace("%s()", __FUNCTION__); \ \ if (ucs_unlikely(ucm_reloc_get_orig_thread == pthread_self())) { \ - return _fail_val; \ + return (_rettype)_fail_val; \ } \ res = _ucm_name(UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \ UCM_BISTRO_EPILOGUE; \ @@ -65,9 +65,9 @@ extern pthread_t volatile ucm_reloc_get_orig_thread; if (ucs_unlikely(orig_func_ptr == NULL)) { \ pthread_mutex_lock(&ucm_reloc_get_orig_lock); \ ucm_reloc_get_orig_thread = pthread_self(); \ - orig_func_ptr = ucm_reloc_get_orig(UCS_PP_QUOTE(_name), \ - _over_name); \ - ucm_reloc_get_orig_thread = -1; \ + orig_func_ptr = (func_ptr_t)ucm_reloc_get_orig(UCS_PP_QUOTE(_name), \ + _over_name); \ + ucm_reloc_get_orig_thread = (pthread_t)-1; \ pthread_mutex_unlock(&ucm_reloc_get_orig_lock); \ } \ return orig_func_ptr(UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \ diff --git a/src/ucm/util/sys.c b/src/ucm/util/sys.c index dd553e6e02c..a9b040b7fd3 100644 --- a/src/ucm/util/sys.c +++ b/src/ucm/util/sys.c @@ -4,7 +4,9 @@ * See file LICENSE for terms. 
*/ -#define _GNU_SOURCE /* for dladdr */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE /* for dladdr */ +#endif #include "sys.h" @@ -14,8 +16,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -36,7 +40,8 @@ ucm_global_config_t ucm_global_opts = { .enable_malloc_reloc = 0, .enable_cuda_reloc = 1, .enable_dynamic_mmap_thresh = 1, - .alloc_alignment = 16 + .alloc_alignment = 16, + .dlopen_process_rpath = 1 }; size_t ucm_get_page_size() @@ -58,7 +63,7 @@ size_t ucm_get_page_size() static void *ucm_sys_complete_alloc(void *ptr, size_t size) { *(size_t*)ptr = size; - return ptr + sizeof(size_t); + return UCS_PTR_BYTE_OFFSET(ptr, sizeof(size_t)); } void *ucm_sys_malloc(size_t size) @@ -70,6 +75,7 @@ void *ucm_sys_malloc(size_t size) ptr = ucm_orig_mmap(NULL, sys_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (ptr == MAP_FAILED) { + ucm_error("mmap(size=%zu) failed: %m", sys_size); return NULL; } @@ -98,7 +104,10 @@ void ucm_sys_free(void *ptr) return; } - ptr -= sizeof(size_t); + /* Do not use UCS_PTR_BYTE_OFFSET macro here due to coverity + * false positive. + * TODO: check for false positive on newer coverity. 
*/ + ptr = (char*)ptr - sizeof(size_t); size = *(size_t*)ptr; munmap(ptr, size); } @@ -112,7 +121,7 @@ void *ucm_sys_realloc(void *ptr, size_t size) return ucm_sys_malloc(size); } - oldptr = ptr - sizeof(size_t); + oldptr = UCS_PTR_BYTE_OFFSET(ptr, -sizeof(size_t)); oldsize = *(size_t*)oldptr; sys_size = ucs_align_up_pow2(size + sizeof(size_t), ucm_get_page_size()); @@ -122,6 +131,8 @@ void *ucm_sys_realloc(void *ptr, size_t size) newptr = ucm_orig_mremap(oldptr, oldsize, sys_size, MREMAP_MAYMOVE); if (newptr == MAP_FAILED) { + ucm_error("mremap(oldptr=%p oldsize=%zu, newsize=%zu) failed: %m", + oldptr, oldsize, sys_size); return NULL; } @@ -136,10 +147,12 @@ void ucm_parse_proc_self_maps(ucm_proc_maps_cb_t cb, void *arg) ssize_t read_size, offset; unsigned long start, end; char prot_c[4]; + int line_num; int prot; char *ptr, *newline; int maps_fd; int ret; + int n; maps_fd = open(UCM_PROC_SELF_MAPS, O_RDONLY); if (maps_fd < 0) { @@ -195,31 +208,39 @@ void ucm_parse_proc_self_maps(ucm_proc_maps_cb_t cb, void *arg) pthread_rwlock_rdlock(&lock); - ptr = buffer; + ptr = buffer; + line_num = 1; while ( (newline = strchr(ptr, '\n')) != NULL ) { - /* 00400000-0040b000 r-xp ... 
\n */ - ret = sscanf(ptr, "%lx-%lx %4c", &start, &end, prot_c); - if (ret != 3) { - ucm_fatal("failed to parse %s error at offset %zd", - UCM_PROC_SELF_MAPS, ptr - buffer); - } - - prot = 0; - if (prot_c[0] == 'r') { - prot |= PROT_READ; - } - if (prot_c[1] == 'w') { - prot |= PROT_WRITE; - } - if (prot_c[2] == 'x') { - prot |= PROT_EXEC; - } + /* address perms offset dev inode pathname + * 00400000-0040b000 r-xp 00001a00 0a:0b 12345 /dev/mydev + */ + *newline = '\0'; + ret = sscanf(ptr, "%lx-%lx %4c %*x %*x:%*x %*d %n", + &start, &end, prot_c, + /* ignore offset, dev, inode */ + &n /* save number of chars before path begins */); + if (ret < 3) { + ucm_warn("failed to parse %s line %d: '%s'", + UCM_PROC_SELF_MAPS, line_num, ptr); + } else { + prot = 0; + if (prot_c[0] == 'r') { + prot |= PROT_READ; + } + if (prot_c[1] == 'w') { + prot |= PROT_WRITE; + } + if (prot_c[2] == 'x') { + prot |= PROT_EXEC; + } - if (cb(arg, (void*)start, end - start, prot)) { - goto out; + if (cb(arg, (void*)start, end - start, prot, ptr + n)) { + goto out; + } } ptr = newline + 1; + ++line_num; } out: @@ -231,7 +252,8 @@ typedef struct { size_t seg_size; } ucm_get_shm_seg_size_ctx_t; -static int ucm_get_shm_seg_size_cb(void *arg, void *addr, size_t length, int prot) +static int ucm_get_shm_seg_size_cb(void *arg, void *addr, size_t length, + int prot, const char *path) { ucm_get_shm_seg_size_ctx_t *ctx = arg; if (addr == ctx->shmaddr) { @@ -290,3 +312,52 @@ void ucm_prevent_dl_unload() /* Now we drop our reference to the lib, and it won't be unloaded anymore */ dlclose(dl); } + +char *ucm_concat_path(char *buffer, size_t max, const char *dir, const char *file) +{ + size_t len; + + len = strlen(dir); + while (len && (dir[len - 1] == '/')) { + len--; /* trim closing '/' */ + } + + len = ucs_min(len, max); + memcpy(buffer, dir, len); + max -= len; + if (max < 2) { /* buffer is shorter than dir - copy dir only */ + buffer[len - 1] = '\0'; + return buffer; + } + + buffer[len] = '/'; + max--; + 
+ while (file[0] == '/') { + file++; /* trim beginning '/' */ + } + + strncpy(buffer + len + 1, file, max); + buffer[max + len] = '\0'; /* force close string */ + + return buffer; +} + +ucs_status_t ucm_get_mem_type_current_device_info(ucs_memory_type_t memtype, ucs_sys_bus_id_t *bus_id) +{ + ucs_status_t status = UCS_ERR_UNSUPPORTED; + ucm_event_installer_t *event_installer; + + ucs_list_for_each(event_installer, &ucm_event_installer_list, list) { + if (NULL == event_installer->get_mem_type_current_device_info) { + continue; + } + + status = event_installer->get_mem_type_current_device_info(bus_id, memtype); + if (UCS_OK == status) { + break; + } + } + + return status; +} diff --git a/src/ucm/util/sys.h b/src/ucm/util/sys.h index fbce9bd45f7..6627fe0ac78 100644 --- a/src/ucm/util/sys.h +++ b/src/ucm/util/sys.h @@ -9,6 +9,8 @@ #define UCM_UTIL_SYS_H_ #include +#include +#include /* @@ -29,10 +31,12 @@ void *ucm_sys_realloc(void *oldptr, size_t newsize); * @param [in] addr Mapping start address. * @param [in] length Mapping length. * @param [in] prot Mapping memory protection flags (PROT_xx). + * @param [in] path Backing file path, or NULL for anonymous mapping. * * @return 0 to continue iteration, nonzero - stop iteration. */ -typedef int (*ucm_proc_maps_cb_t)(void *arg, void *addr, size_t length, int prot); +typedef int (*ucm_proc_maps_cb_t)(void *arg, void *addr, size_t length, + int prot, const char *path); /** @@ -73,4 +77,27 @@ void ucm_strerror(int eno, char *buf, size_t max); void ucm_prevent_dl_unload(); +/* + * Concatenate directory and file names into full path. + * + * @param buffer Filled with the result path. + * @param max Maximal buffer size. + * @param dir Directory name. + * @param file File name. + * + * @return Result buffer. + */ +char *ucm_concat_path(char *buffer, size_t max, const char *dir, const char *file); + + +/* + * Get device information associated with memory type + * + * @param [in] memtype Memory type. 
+ * @param [out] bus_id Bus ID. + * + * @return Status code + */ +ucs_status_t ucm_get_mem_type_current_device_info(ucs_memory_type_t memtype, ucs_sys_bus_id_t *bus_id); + #endif diff --git a/src/ucp/Makefile.am b/src/ucp/Makefile.am index 30ef6b86aba..a17f694978d 100644 --- a/src/ucp/Makefile.am +++ b/src/ucp/Makefile.am @@ -1,6 +1,7 @@ # # Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. # Copyright (c) UT-Battelle, LLC. 2017. ALL RIGHTS RESERVED. +# Copyright (C) Los Alamos National Security, LLC. 2019. ALL RIGHTS RESERVED. # See file LICENSE for terms. # @@ -20,6 +21,8 @@ nobase_dist_libucp_la_HEADERS = \ api/ucp.h noinst_HEADERS = \ + core/ucp_am.h \ + core/ucp_am.inl \ core/ucp_context.h \ core/ucp_ep.h \ core/ucp_ep.inl \ @@ -36,8 +39,11 @@ noinst_HEADERS = \ dt/dt_contig.h \ dt/dt_iov.h \ dt/dt_generic.h \ - proto/proto.h \ + proto/lane_type.h \ + proto/proto_am.h \ proto/proto_am.inl \ + proto/proto_select.h \ + proto/proto.h \ rma/rma.h \ rma/rma.inl \ tag/eager.h \ @@ -49,6 +55,7 @@ noinst_HEADERS = \ wireup/ep_match.h \ wireup/wireup_ep.h \ wireup/wireup.h \ + wireup/wireup_cm.h \ stream/stream.h devel_headers = \ @@ -68,6 +75,7 @@ endif libucp_la_SOURCES = \ core/ucp_context.c \ + core/ucp_am.c \ core/ucp_ep.c \ core/ucp_listener.c \ core/ucp_mm.c \ @@ -80,7 +88,9 @@ libucp_la_SOURCES = \ dt/dt_iov.c \ dt/dt_generic.c \ dt/dt.c \ + proto/lane_type.c \ proto/proto_am.c \ + proto/proto.c \ rma/amo_basic.c \ rma/amo_send.c \ rma/amo_sw.c \ @@ -102,6 +112,7 @@ libucp_la_SOURCES = \ wireup/signaling_ep.c \ wireup/wireup_ep.c \ wireup/wireup.c \ + wireup/wireup_cm.c \ stream/stream_send.c \ stream/stream_recv.c diff --git a/src/ucp/api/ucp.h b/src/ucp/api/ucp.h index 7da81b38812..90b3df7af62 100644 --- a/src/ucp/api/ucp.h +++ b/src/ucp/api/ucp.h @@ -3,7 +3,7 @@ * Copyright (C) UT-Battelle, LLC. 2014-2017. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016-2017. ALL RIGHTS RESERVED. * Copyright (C) Los Alamos National Security, LLC. 
2018 ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -114,7 +114,7 @@ BEGIN_C_DECLS * @brief UCP context parameters field mask. * * The enumeration allows specifying which fields in @ref ucp_params_t are - * present. It is used for the enablement of backward compatibility support. + * present. It is used to enable backward compatibility support. */ enum ucp_params_field { UCP_PARAM_FIELD_FEATURES = UCS_BIT(0), /**< features */ @@ -123,7 +123,8 @@ enum ucp_params_field { UCP_PARAM_FIELD_REQUEST_CLEANUP = UCS_BIT(3), /**< request_cleanup */ UCP_PARAM_FIELD_TAG_SENDER_MASK = UCS_BIT(4), /**< tag_sender_mask */ UCP_PARAM_FIELD_MT_WORKERS_SHARED = UCS_BIT(5), /**< mt_workers_shared */ - UCP_PARAM_FIELD_ESTIMATED_NUM_EPS = UCS_BIT(6) /**< estimated_num_eps */ + UCP_PARAM_FIELD_ESTIMATED_NUM_EPS = UCS_BIT(6), /**< estimated_num_eps */ + UCP_PARAM_FIELD_ESTIMATED_NUM_PPN = UCS_BIT(7) /**< estimated_num_ppn */ }; @@ -147,9 +148,8 @@ enum ucp_feature { UCP_FEATURE_WAKEUP = UCS_BIT(4), /**< Request interrupt notification support */ UCP_FEATURE_STREAM = UCS_BIT(5), /**< Request stream support */ - UCP_FEATURE_EXPERIMENTAL = UCS_BIT(6), /**< Request all - experimental - features support */ + UCP_FEATURE_AM = UCS_BIT(6), /**< Request Active Message + support */ UCP_FEATURE_GROUPS = UCS_BIT(7) /**< Request Collective operations support */ }; @@ -160,7 +160,7 @@ enum ucp_feature { * @brief UCP worker parameters field mask. * * The enumeration allows specifying which fields in @ref ucp_worker_params_t are - * present. It is used for the enablement of backward compatibility support. + * present. It is used to enable backward compatibility support. 
*/ enum ucp_worker_params_field { UCP_WORKER_PARAM_FIELD_THREAD_MODE = UCS_BIT(0), /**< UCP thread mode */ @@ -177,7 +177,7 @@ enum ucp_worker_params_field { * @brief UCP listener parameters field mask. * * The enumeration allows specifying which fields in @ref ucp_listener_params_t - * are present. It is used for the enablement of backward compatibility support. + * are present. It is used to enable backward compatibility support. */ enum ucp_listener_params_field { /** @@ -218,7 +218,7 @@ typedef enum { * @brief UCP endpoint parameters field mask. * * The enumeration allows specifying which fields in @ref ucp_ep_params_t are - * present. It is used for the enablement of backward compatibility support. + * present. It is used to enable backward compatibility support. */ enum ucp_ep_params_field { UCP_EP_PARAM_FIELD_REMOTE_ADDRESS = UCS_BIT(0), /**< Address of remote @@ -262,6 +262,33 @@ enum ucp_ep_params_flags_field { }; +/** + * @ingroup UCP_ENDPOINT + * @brief Close UCP endpoint modes. + * + * The enumeration is used to specify the behavior of @ref ucp_ep_close_nbx. + */ +typedef enum { + UCP_EP_CLOSE_FLAG_FORCE = UCS_BIT(0) /**< @ref ucp_ep_close_nbx releases + the endpoint without any + confirmation from the peer. All + outstanding requests will be + completed with + @ref UCS_ERR_CANCELED error. + @note This mode may cause + transport level errors on remote + side, so it requires set + @ref UCP_ERR_HANDLING_MODE_PEER + for all endpoints created on + both (local and remote) sides to + avoid undefined behavior. If this + flag is not set then + @ref ucp_ep_close_nbx schedules + flushes on all outstanding + operations. */ +} ucp_ep_close_flags_t; + + /** * @ingroup UCP_ENDPOINT * @brief Close UCP endpoint modes. @@ -293,7 +320,7 @@ enum ucp_ep_close_mode { * @brief UCP memory mapping parameters field mask. * * The enumeration allows specifying which fields in @ref ucp_mem_map_params_t are - * present. 
It is used for the enablement of backward compatibility support. + * present. It is used to enable backward compatibility support. */ enum ucp_mem_map_params_field { UCP_MEM_MAP_PARAM_FIELD_ADDRESS = UCS_BIT(0), /**< Address of the memory that @@ -311,7 +338,7 @@ enum ucp_mem_map_params_field { * @brief UCP memory advice parameters field mask. * * The enumeration allows specifying which fields in @ref ucp_mem_advise_params_t are - * present. It is used for the enablement of backward compatibility support. + * present. It is used to enable backward compatibility support. */ enum ucp_mem_advise_params_field { UCP_MEM_ADVISE_PARAM_FIELD_ADDRESS = UCS_BIT(0), /**< Address of the memory */ @@ -325,19 +352,20 @@ enum ucp_mem_advise_params_field { * @brief UCP context attributes field mask. * * The enumeration allows specifying which fields in @ref ucp_context_attr_t are - * present. It is used for the enablement of backward compatibility support. + * present. It is used to enable backward compatibility support. */ enum ucp_context_attr_field { UCP_ATTR_FIELD_REQUEST_SIZE = UCS_BIT(0), /**< UCP request size */ UCP_ATTR_FIELD_THREAD_MODE = UCS_BIT(1) /**< UCP context thread flag */ }; + /** * @ingroup UCP_WORKER * @brief UCP worker attributes field mask. * * The enumeration allows specifying which fields in @ref ucp_worker_attr_t are - * present. It is used for the enablement of backward compatibility support. + * present. It is used to enable backward compatibility support. */ enum ucp_worker_attr_field { UCP_WORKER_ATTR_FIELD_THREAD_MODE = UCS_BIT(0), /**< UCP thread mode */ @@ -345,6 +373,31 @@ enum ucp_worker_attr_field { UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS = UCS_BIT(2) /**< UCP address flags */ }; + +/** + * @ingroup UCP_WORKER + * @brief UCP listener attributes field mask. + * + * The enumeration allows specifying which fields in @ref ucp_listener_attr_t are + * present. It is used to enable backward compatibility support. 
+ */ +enum ucp_listener_attr_field { + UCP_LISTENER_ATTR_FIELD_SOCKADDR = UCS_BIT(0) /**< Sockaddr used for listening */ +}; + + +/** + * @ingroup UCP_WORKER + * @brief UCP listener's connection request attributes field mask. + * + * The enumeration allows specifying which fields in @ref ucp_conn_request_attr_t + * are present. It is used to enable backward compatibility support. + */ +enum ucp_conn_request_attr_field { + UCP_CONN_REQUEST_ATTR_FIELD_CLIENT_ADDR = UCS_BIT(0) /**< Client's address */ +}; + + /** * @ingroup UCP_DATATYPE * @brief UCP data type classification @@ -388,6 +441,49 @@ enum { }; +/** + * @ingroup UCP_WORKER + * @brief Flags for a UCP Active Message callback. + * + * Flags that indicate how to handle UCP Active Messages + * Currently only UCP_AM_FLAG_WHOLE_MSG is supported, + * which indicates the entire message is handled in one + * callback. + */ +enum ucp_am_cb_flags { + UCP_AM_FLAG_WHOLE_MSG = UCS_BIT(0) +}; + + +/** + * @ingroup UCP_WORKER + * @brief Flags for sending a UCP Active Message. + * + * Flags dictate the behavior of ucp_am_send_nb + * currently the only flag tells UCP to pass in + * the sending endpoint to the call + * back so a reply can be defined. + */ +enum ucp_send_am_flags { + UCP_AM_SEND_REPLY = UCS_BIT(0) +}; + + +/** + * @ingroup UCP_ENDPOINT + * @brief Descriptor flags for Active Message callback. + * + * In a callback, if flags is set to UCP_CB_PARAM_FLAG_DATA in + * a callback then data was allocated, so if UCS_INPROGRESS is + * returned from the callback, the data parameter will persist + * and the user has to call @ref ucp_am_data_release when data is + * no longer needed. 
+ */ +enum ucp_cb_param_flags { + UCP_CB_PARAM_FLAG_DATA = UCS_BIT(0) +}; + + /** * @ingroup UCP_COMM * @brief Atomic operation requested for ucp_atomic_post @@ -424,6 +520,24 @@ typedef enum { } ucp_atomic_fetch_op_t; +/** + * @ingroup UCP_COMM + * @brief Atomic operation requested for ucp_atomic_op_nbx + * + * This enumeration defines which atomic memory operation should be + * performed by the @ref ucp_atomic_op_nbx routine. + */ +typedef enum { + UCP_ATOMIC_OP_ADD, /**< Atomic add */ + UCP_ATOMIC_OP_SWAP, /**< Atomic swap */ + UCP_ATOMIC_OP_CSWAP, /**< Atomic conditional swap */ + UCP_ATOMIC_OP_AND, /**< Atomic and */ + UCP_ATOMIC_OP_OR, /**< Atomic or */ + UCP_ATOMIC_OP_XOR, /**< Atomic xor */ + UCP_ATOMIC_OP_LAST +} ucp_atomic_op_t; + + /** * @ingroup UCP_COMM * @brief Flags to define behavior of @ref ucp_stream_recv_nb function @@ -432,14 +546,48 @@ typedef enum { */ typedef enum { UCP_STREAM_RECV_FLAG_WAITALL = UCS_BIT(0) /**< This flag requests that - operation will not be - completed untill all amout - of requested data is - received and placed in the - user buffer. */ + the operation will not be + completed until all + requested data is received + and placed in the user + buffer. */ } ucp_stream_recv_flags_t; +/** + * @ingroup UCP_COMM + * @brief UCP operation fields and flags + * + * The enumeration allows specifying which fields in @ref ucp_request_param_t are + * present and operation flags are used. It is used to enable backward + * compatibility support. 
+ */ +typedef enum { + UCP_OP_ATTR_FIELD_REQUEST = UCS_BIT(0), /**< request field */ + UCP_OP_ATTR_FIELD_CALLBACK = UCS_BIT(1), /**< cb field */ + UCP_OP_ATTR_FIELD_USER_DATA = UCS_BIT(2), /**< user_data field */ + UCP_OP_ATTR_FIELD_DATATYPE = UCS_BIT(3), /**< datatype field */ + UCP_OP_ATTR_FIELD_FLAGS = UCS_BIT(4), /**< operation-specific flags */ + UCP_OP_ATTR_FIELD_REPLY_BUFFER = UCS_BIT(5), /**< reply_buffer field */ + + UCP_OP_ATTR_FLAG_NO_IMM_CMPL = UCS_BIT(16), /**< deny immediate completion */ + UCP_OP_ATTR_FLAG_FAST_CMPL = UCS_BIT(17), /**< expedite local completion, + even if it delays remote + data delivery. Note for + implementer: this option + can disable zero copy + and/or rendezvous protocols + which require + synchronization with the + remote peer before releasing + the local send buffer */ + UCP_OP_ATTR_FLAG_FORCE_IMM_CMPL = UCS_BIT(18) /**< force immediate complete + operation, fail if the + operation cannot be + completed immediately */ +} ucp_op_attr_t; + + /** * @ingroup UCP_DATATYPE * @brief Generate an identifier for contiguous data type. @@ -516,7 +664,7 @@ typedef struct ucp_generic_dt_ops { * @param [in] count Number of elements to pack into the buffer. * * @return A custom state that is passed to the following - * @ref ucp_generic_dt_ops::unpack "pack()" routine. + * @ref ucp_generic_dt_ops::pack "pack()" routine. */ void* (*start_pack)(void *context, const void *buffer, size_t count); @@ -691,7 +839,7 @@ typedef struct ucp_params { /** * An optimization hint of how many endpoints will be created on this context. - * For example, when used from MPI or SHMEM libraries, this number would specify + * For example, when used from MPI or SHMEM libraries, this number will specify * the number of ranks (or processing elements) in the job. * Does not affect semantics, but only transport selection criteria and the * resulting performance. 
@@ -700,6 +848,15 @@ typedef struct ucp_params { */ size_t estimated_num_eps; + /** + * An optimization hint for a single node. For example, when used from MPI or + * OpenSHMEM libraries, this number will specify the number of Processes Per + * Node (PPN) in the job. Does not affect semantics, only transport selection + * criteria and the resulting performance. + * The value can be also set by the UCX_NUM_PPN environment variable, which + * will override the number of endpoints set by @e estimated_num_ppn + */ + size_t estimated_num_ppn; } ucp_params_t; @@ -733,6 +890,7 @@ typedef struct ucp_context_attr { ucs_thread_mode_t thread_mode; } ucp_context_attr_t; + /** * @ingroup UCP_WORKER * @brief UCP worker attributes. @@ -853,6 +1011,54 @@ typedef struct ucp_worker_params { } ucp_worker_params_t; +/** + * @ingroup UCP_WORKER + * @brief UCP listener attributes. + * + * The structure defines the attributes which characterize + * the particular listener. + */ +typedef struct ucp_listener_attr { + /** + * Mask of valid fields in this structure, using bits from + * @ref ucp_listener_attr_field. + * Fields not specified in this mask will be ignored. + * Provides ABI compatibility with respect to adding new fields. + */ + uint64_t field_mask; + + /** + * Sockaddr on which this listener is listening for incoming connection + * requests. + */ + struct sockaddr_storage sockaddr; +} ucp_listener_attr_t; + + +/** + * @ingroup UCP_WORKER + * @brief UCP listener's connection request attributes. + * + * The structure defines the attributes that characterize + * the particular connection request received on the server side. + */ +typedef struct ucp_conn_request_attr { + /** + * Mask of valid fields in this structure, using bits from + * @ref ucp_conn_request_attr_field. + * Fields not specified in this mask will be ignored. + * Provides ABI compatibility with respect to adding new fields. 
+ */ + uint64_t field_mask; + + /** + * The address of the remote client that sent the connection request to the + * server. + */ + struct sockaddr_storage client_address; +} ucp_conn_request_attr_t; + + /** * @ingroup UCP_WORKER * @brief Parameters for a UCP listener object. @@ -902,7 +1108,7 @@ typedef struct ucp_listener_params { * * The structure defines the endpoint and its user data. */ -typedef struct { +typedef struct ucp_stream_poll_ep { /** * Endpoint handle. */ @@ -925,6 +1131,7 @@ typedef struct { uint8_t reserved[16]; } ucp_stream_poll_ep_t; + /** * @ingroup UCP_MEM * @brief Tuning parameters for the UCP memory mapping. @@ -990,6 +1197,95 @@ struct ucp_tag_recv_info { }; +/** + * @ingroup UCP_CONTEXT + * @brief Operation parameters passed to @ref ucp_tag_send_nbx, + * @ref ucp_tag_send_sync_nbx, @ref ucp_tag_recv_nbx, @ref ucp_put_nbx, + * @ref ucp_get_nbx + * + * The structure @ref ucp_request_param_t is used to specify datatype of + * operation, provide user request in case the external request is used, + * set completion callback and custom user data passed to this callback. + * + * Example: implementation of function to send contiguous buffer to ep and + * invoke callback function at operation completion. If the + * operation completed immediately (status == UCS_OK) then + * callback is not called. 
+ * + * @code{.c} + * ucs_status_ptr_t send_data(ucp_ep_h ep, void *buffer, size_t length, + * ucp_tag_t tag, void *request) + * { + * ucp_request_param_t param = { + * .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + * UCP_OP_ATTR_FIELD_REQUEST, + * .request = request, + * .cb.ucp_send_nbx_callback_t = custom_send_callback_f, + * .user_data = pointer_to_user_context_passed_to_cb + * }; + * + * ucs_status_ptr_t status; + * + * status = ucp_tag_send_nbx(ep, buffer, length, tag, ¶m); + * if (UCS_PTR_IS_ERR(status)) { + * handle_error(status); + * } else if (status == UCS_OK) { + * // operation is completed + * } + * + * return status; + * } + * @endcode + */ +typedef struct { + /** + * Mask of valid fields in this structure and operation flags, using + * bits from @ref ucp_op_attr_t. Fields not specified in this mask will be + * ignored. Provides ABI compatibility with respect to adding new fields. + */ + uint32_t op_attr_mask; + + /* Operation specific flags. */ + uint32_t flags; + + /** + * Request handle allocated by the user. There should + * be at least UCP request size bytes of available + * space before the @a request. The size of the UCP request + * can be obtained by @ref ucp_context_query function. + */ + void *request; + + /** + * Callback function that is invoked whenever the + * send or receive operation is completed. + */ + union { + ucp_send_nbx_callback_t send; + ucp_tag_recv_nbx_callback_t recv; + ucp_stream_recv_nbx_callback_t recv_stream; + } cb; + + /** + * Datatype descriptor for the elements in the buffer. In case the + * op_attr_mask & UCP_OP_ATTR_FIELD_DATATYPE bit is not set, then use + * default datatype ucp_dt_make_contig(1) + */ + ucp_datatype_t datatype; + + /** + * Pointer to user data passed to callback function. + */ + void *user_data; + + /** + * Reply buffer. Can be used for storing operation result, for example by + * @ref ucp_atomic_op_nbx. 
+ */ + void *reply_buffer; +} ucp_request_param_t; + + /** * @ingroup UCP_CONFIG * @brief Read UCP configuration descriptor @@ -1003,7 +1299,7 @@ struct ucp_tag_recv_info { * * @param [in] env_prefix If non-NULL, the routine searches for the * environment variables that start with - * @e UCX__ prefix. + * @e \_UCX_ prefix. * Otherwise, the routine searches for the * environment variables that start with * @e UCX_ prefix. @@ -1150,12 +1446,6 @@ static inline ucs_status_t ucp_init(const ucp_params_t *params, context_p); } -typedef ucs_status_t (*ucp_extension_init_f) (void *ctx); -typedef void (*ucp_extension_cleanup_f)(void *ctx); -ucs_status_t ucp_extend(ucp_context_h context, size_t extension_ctx_length, - ucp_extension_init_f init, ucp_extension_cleanup_f cleanup, - size_t *extension_ctx_offset_in_worker, unsigned *am_id); - /** * @ingroup UCP_CONTEXT @@ -1177,6 +1467,13 @@ ucs_status_t ucp_extend(ucp_context_h context, size_t extension_ctx_length, void ucp_cleanup(ucp_context_h context_p); +typedef ucs_status_t (*ucp_extension_init_f) (void *ctx); +typedef void (*ucp_extension_cleanup_f)(void *ctx); +ucs_status_t ucp_extend(ucp_context_h context, size_t extension_ctx_length, + ucp_extension_init_f init, ucp_extension_cleanup_f cleanup, + size_t *extension_ctx_offset_in_worker, unsigned *am_id); + + /** * @ingroup UCP_CONTEXT * @brief Get attributes specific to a particular context. @@ -1253,6 +1550,7 @@ ucs_status_t ucp_worker_create(ucp_context_h context, */ void ucp_worker_destroy(ucp_worker_h worker); + /** * @ingroup UCP_WORKER * @brief Get attributes specific to a particular worker. @@ -1267,6 +1565,7 @@ void ucp_worker_destroy(ucp_worker_h worker); ucs_status_t ucp_worker_query(ucp_worker_h worker, ucp_worker_attr_t *attr); + /** * @ingroup UCP_WORKER * @brief Print information about the worker. 
@@ -1541,7 +1840,7 @@ ucs_status_t ucp_worker_arm(ucp_worker_h worker); * waiting on a file descriptor from @ref ucp_worker_get_efd to return, even * if no event from the underlying interfaces has taken place. * - * @note It’s safe to use this routine from any thread, even if UCX is compiled + * @note It's safe to use this routine from any thread, even if UCX is compiled * without multi-threading support and/or initialized with any value of * @ref ucp_params_t::mt_workers_shared and * @ref ucp_worker_params_t::thread_mode parameters @@ -1589,6 +1888,36 @@ ucs_status_t ucp_listener_create(ucp_worker_h worker, void ucp_listener_destroy(ucp_listener_h listener); +/** + * @ingroup UCP_WORKER + * @brief Get attributes specific to a particular listener. + * + * This routine fetches information about the listener. + * + * @param [in] listener listener object to query. + * @param [out] attr Filled with attributes of the listener. + * + * @return Error code as defined by @ref ucs_status_t + */ +ucs_status_t ucp_listener_query(ucp_listener_h listener, ucp_listener_attr_t *attr); + + +/** + * @ingroup UCP_WORKER + * @brief Get attributes specific to a particular connection request received + * on the server side. + * + * This routine fetches information about the connection request. + * + * @param [in] conn_request connection request object to query. + * @param [out] attr Filled with attributes of the connection request. + * + * @return Error code as defined by @ref ucs_status_t + */ +ucs_status_t ucp_conn_request_query(ucp_conn_request_h conn_request, + ucp_conn_request_attr_t *attr); + + /** * @ingroup UCP_ENDPOINT * @brief Create and connect an endpoint. @@ -1654,6 +1983,32 @@ ucs_status_t ucp_ep_create(ucp_worker_h worker, const ucp_ep_params_t *params, ucs_status_ptr_t ucp_ep_close_nb(ucp_ep_h ep, unsigned mode); +/** + * @ingroup UCP_ENDPOINT + * + * @brief Non-blocking @ref ucp_ep_h "endpoint" closure. + * + * @param [in] ep Handle to the endpoint to close. 
+ * @param [in] param Operation parameters, see @ref ucp_request_param_t. + * This operation supports specific flags, which can be + * passed in @a param by @ref ucp_request_param_t.flags. + * The exact set of flags is defined + * by @ref ucp_ep_close_flags_t. + * + * @return NULL - The endpoint is closed successfully. + * @return UCS_PTR_IS_ERR(_ptr) - The closure failed and an error code indicates + * the transport level status. However, resources + * are released and the @a endpoint can no longer + * be used. + * @return otherwise - The closure process is started, and can be + * completed at any point in time. A request + * handle is returned to the application in order + * to track progress of the endpoint closure. + */ +ucs_status_ptr_t ucp_ep_close_nbx(ucp_ep_h ep, + const ucp_request_param_t *param); + + /** * @ingroup UCP_WORKER * @@ -1703,7 +2058,7 @@ void ucp_ep_print_info(ucp_ep_h ep, FILE *stream); * @param [in] cb Callback which will be called when the flush operation * completes. * - * @return UCS_OK - The flush operation was completed immediately. + * @return NULL - The flush operation was completed immediately. * @return UCS_PTR_IS_ERR(_ptr) - The flush operation failed. * @return otherwise - Flush operation was scheduled and can be completed * in any point in time. The request handle is returned @@ -1744,6 +2099,30 @@ ucs_status_ptr_t ucp_ep_flush_nb(ucp_ep_h ep, unsigned flags, ucp_send_callback_t cb); +/** + * @ingroup UCP_ENDPOINT + * + * @brief Non-blocking flush of outstanding AMO and RMA operations on the + * @ref ucp_ep_h "endpoint". + * + * This routine flushes all outstanding AMO and RMA communications on the + * @ref ucp_ep_h "endpoint". All the AMO and RMA operations issued on the + * @a ep prior to this call are completed both at the origin and at the target + * @ref ucp_ep_h "endpoint" when this call returns. + * + * @param [in] ep UCP endpoint. + * @param [in] param Operation parameters, see @ref ucp_request_param_t. 
+ * + * @return NULL - The flush operation was completed immediately. + * @return UCS_PTR_IS_ERR(_ptr) - The flush operation failed. + * @return otherwise - Flush operation was scheduled and can be + * completed in any point in time. The request + * handle is returned to the application in + * order to track progress. + */ +ucs_status_ptr_t ucp_ep_flush_nbx(ucp_ep_h ep, const ucp_request_param_t *param); + + /** * @ingroup UCP_MEM * @brief Map or allocate memory for zero-copy operations. @@ -1868,6 +2247,21 @@ ucs_status_t ucp_mem_unmap(ucp_context_h context, ucp_mem_h memh); ucs_status_t ucp_mem_query(const ucp_mem_h memh, ucp_mem_attr_t *attr); +/** + * @ingroup UCP_MEM + * @brief Print memory mapping information. + * + * This routine maps memory and prints information about the created memory handle: + * including the mapped memory length, the allocation method, and other useful + * information associated with the memory handle. + * + * @param [in] mem_size Size of the memory to map. + * @param [in] context The context on which the memory is mapped. + * @param [in] stream Output stream on which to print the information. + */ +void ucp_mem_print_info(const char *mem_size, ucp_context_h context, FILE *stream); + + /** * @ingroup UCP_MEM * @brief list of UCP memory use advice. @@ -2004,8 +2398,10 @@ void ucp_rkey_buffer_release(void *rkey_buffer); * buffer. * * @note The application is responsible for releasing the RKEY object when - * it is no longer needed by calling the @ref ucp_rkey_destroy + * it is no longer needed, by calling the @ref ucp_rkey_destroy * "ucp_rkey_destroy()" routine. + * @note The remote key object can be used for communications only on the + * endpoint on which it was unpacked. * * @param [in] ep Endpoint to access using the remote key. * @param [in] rkey_buffer Packed rkey. 
@@ -2052,14 +2448,91 @@ ucs_status_t ucp_rkey_ptr(ucp_rkey_h rkey, uint64_t raddr, void **addr_p); * @li Once the RKEY object is released an access to the memory will cause an * undefined failure. * @li If the RKEY object was not created using - * @ref ucp_ep_rkey_unpack "ucp_ep_rkey_unpack()" routine the behaviour of this + * @ref ucp_ep_rkey_unpack "ucp_ep_rkey_unpack()" routine the behavior of this * routine is undefined. + * @li The RKEY object must be destroyed after all outstanding operations which + * are using it are flushed, and before the endpoint on which it was unpacked + * is destroyed. * * @param [in] rkey Remote key to destroy. */ void ucp_rkey_destroy(ucp_rkey_h rkey); +/** + * @ingroup UCP_WORKER + * @brief Add user defined callback for Active Message. + * + * This routine installs a user defined callback to handle incoming Active + * Messages with a specific id. This callback is called whenever an Active + * Message that was sent from the remote peer by @ref ucp_am_send_nb is + * received on this worker. + * + * @param [in] worker UCP worker on which to set the Active Message + * handler. + * @param [in] id Active Message id. + * @param [in] cb Active Message callback. NULL to clear. + * @param [in] arg Active Message argument, which will be passed + * in to every invocation of the callback as the + * arg argument. + * @param [in] flags Dictates how an Active Message is handled on the + * remote endpoint. Currently only + * UCP_AM_FLAG_WHOLE_MSG is supported, which + * indicates the callback will not be invoked + * until all data has arrived. + * + * @return error code if the worker does not support Active Messages or + * requested callback flags. + */ +ucs_status_t ucp_worker_set_am_handler(ucp_worker_h worker, uint16_t id, + ucp_am_callback_t cb, void *arg, + uint32_t flags); + + +/** + * @ingroup UCP_COMM + * @brief Send Active Message. + * + * This routine sends an Active Message to an ep. It does not support + * CUDA memory. 
+ * + * @param [in] ep UCP endpoint where the Active Message will be run. + * @param [in] id Active Message id. Specifies which registered + * callback to run. + * @param [in] buffer Pointer to the data to be sent to the target node + * of the Active Message. + * @param [in] count Number of elements to send. + * @param [in] datatype Datatype descriptor for the elements in the buffer. + * @param [in] cb Callback that is invoked upon completion of the + * data transfer if it is not completed immediately. + * @param [in] flags For Future use. + * + * @return NULL Active Message was sent immediately. + * @return UCS_PTR_IS_ERR(_ptr) Error sending Active Message. + * @return otherwise Pointer to request, and Active Message is known + * to be completed after cb is run. + */ +ucs_status_ptr_t ucp_am_send_nb(ucp_ep_h ep, uint16_t id, + const void *buffer, size_t count, + ucp_datatype_t datatype, + ucp_send_callback_t cb, unsigned flags); + + +/** + * @ingroup UCP_COMM + * @brief Releases Active Message data. + * + * This routine releases data that persisted through an Active Message + * callback because that callback returned UCS_INPROGRESS. + * + * @param [in] worker Worker which received the Active Message. + * @param [in] data Pointer to data that was passed into + * the Active Message callback as the data + * parameter. + */ +void ucp_am_data_release(ucp_worker_h worker, void *data); + + /** * @ingroup UCP_COMM * @brief Non-blocking stream send operation. @@ -2089,7 +2562,7 @@ void ucp_rkey_destroy(ucp_rkey_h rkey); * the operation cannot be completed in place. * @param [in] flags Reserved for future use. * - * @return UCS_OK - The send operation was completed immediately. + * @return NULL - The send operation was completed immediately. * @return UCS_PTR_IS_ERR(_ptr) - The send operation failed. * @return otherwise - Operation was scheduled for send and can be * completed in any point in time. 
The request handle @@ -2103,6 +2576,36 @@ ucs_status_ptr_t ucp_stream_send_nb(ucp_ep_h ep, const void *buffer, size_t coun unsigned flags); +/** + * @ingroup UCP_COMM + * @brief Non-blocking stream send operation. + * + * This routine sends data that is described by the local address @a buffer, + * size @a count object to the destination endpoint @a ep. The routine is + * non-blocking and therefore returns immediately, however the actual send + * operation may be delayed. The send operation is considered completed when + * it is safe to reuse the source @e buffer. If the send operation is + * completed immediately the routine returns UCS_OK. + * + * @note The user should not modify any part of the @a buffer after this + * operation is called, until the operation completes. + * + * @param [in] ep Destination endpoint handle. + * @param [in] buffer Pointer to the message buffer (payload). + * @param [in] count Number of elements to send. + * @param [in] param Operation parameters, see @ref ucp_request_param_t. + * + * @return NULL - The send operation was completed immediately. + * @return UCS_PTR_IS_ERR(_ptr) - The send operation failed. + * @return otherwise - Operation was scheduled for send and can be + * completed at any point in time. The request + * handle is returned to the application in + * order to track progress of the message. + */ +ucs_status_ptr_t ucp_stream_send_nbx(ucp_ep_h ep, const void *buffer, size_t count, + const ucp_request_param_t *param); + + /** * @ingroup UCP_COMM * @brief Non-blocking tagged-send operations @@ -2110,10 +2613,10 @@ ucs_status_ptr_t ucp_stream_send_nb(ucp_ep_h ep, const void *buffer, size_t coun * This routine sends a messages that is described by the local address @a * buffer, size @a count, and @a datatype object to the destination endpoint * @a ep. Each message is associated with a @a tag value that is used for - * message matching on the @ref ucp_tag_recv_nb "receiver". 
The routine is + * message matching on the @ref ucp_tag_recv_nb "receiver". The routine is * non-blocking and therefore returns immediately, however the actual send - * operation may be delayed. The send operation is considered completed when - * it is safe to reuse the source @e buffer. If the send operation is + * operation may be delayed. The send operation is considered completed when + * it is safe to reuse the source @e buffer. If the send operation is * completed immediately the routine return UCS_OK and the call-back function * @a cb is @b not invoked. If the operation is @b not completed immediately * and no error reported then the UCP library will schedule to invoke the @@ -2134,7 +2637,7 @@ ucs_status_ptr_t ucp_stream_send_nb(ucp_ep_h ep, const void *buffer, size_t coun * that the call-back is only invoked in a case when * the operation cannot be completed in place. * - * @return UCS_OK - The send operation was completed immediately. + * @return NULL - The send operation was completed immediately. * @return UCS_PTR_IS_ERR(_ptr) - The send operation failed. * @return otherwise - Operation was scheduled for send and can be * completed in any point in time. The request handle @@ -2264,6 +2767,81 @@ ucs_status_ptr_t ucp_tag_send_sync_nb(ucp_ep_h ep, const void *buffer, size_t co ucp_send_callback_t cb); +/** + * @ingroup UCP_COMM + * @brief Non-blocking tagged-send operation + * + * This routine sends a messages that is described by the local address @a + * buffer, size @a count object to the destination endpoint @a ep. Each + * message is associated with a @a tag value that is used for message + * matching on the @ref ucp_tag_recv_nb or @ref ucp_tag_recv_nbx "receiver". + * The routine is non-blocking and therefore returns immediately, however the + * actual send operation may be delayed. The send operation is considered + * completed when it is safe to reuse the source @e buffer. 
If the send + * operation is completed immediately the routine returns UCS_OK and the + * call-back function is @b not invoked. If the operation is @b not completed + * immediately and no error reported then the UCP library will schedule to + * invoke the call-back whenever the send operation is completed. In other + * words, the completion of a message can be signaled by the return code or + * the call-back. + * Immediate completion signals can be fine-tuned via the + * @ref ucp_request_param_t.op_attr_mask field in the + * @ref ucp_request_param_t structure. The values of this field + * are a bit-wise OR of the @ref ucp_op_attr_t enumeration. + * + * @note The user should not modify any part of the @a buffer after this + * operation is called, until the operation completes. + * + * @param [in] ep Destination endpoint handle. + * @param [in] buffer Pointer to the message buffer (payload). + * @param [in] count Number of elements to send + * @param [in] tag Message tag. + * @param [in] param Operation parameters, see @ref ucp_request_param_t + * + * @return UCS_OK - The send operation was completed immediately. + * @return UCS_PTR_IS_ERR(_ptr) - The send operation failed. + * @return otherwise - Operation was scheduled for send and can be + * completed in any point in time. The request handle + * is returned to the application in order to track + * progress of the message. + */ +ucs_status_ptr_t ucp_tag_send_nbx(ucp_ep_h ep, const void *buffer, size_t count, + ucp_tag_t tag, const ucp_request_param_t *param); + + +/** + * @ingroup UCP_COMM + * @brief Non-blocking synchronous tagged-send operation. + * + * Same as @ref ucp_tag_send_nbx, except the request completes only after there + * is a remote tag match on the message (which does not always mean the remote + * receive has been completed). This function never completes "in-place", and + * always returns a request handle. 
+ * + * @note The user should not modify any part of the @a buffer after this + * operation is called, until the operation completes. + * @note Returns @ref UCS_ERR_UNSUPPORTED if @ref UCP_ERR_HANDLING_MODE_PEER is + * enabled. This is a temporary implementation-related constraint that + * will be addressed in future releases. + * + * @param [in] ep Destination endpoint handle. + * @param [in] buffer Pointer to the message buffer (payload). + * @param [in] count Number of elements to send + * @param [in] tag Message tag. + * @param [in] param Operation parameters, see @ref ucp_request_param_t + * + * @return UCS_OK - The send operation was completed immediately. + * @return UCS_PTR_IS_ERR(_ptr) - The send operation failed. + * @return otherwise - Operation was scheduled for send and can be + * completed in any point in time. The request handle + * is returned to the application in order to track + * progress of the message. + */ +ucs_status_ptr_t ucp_tag_send_sync_nbx(ucp_ep_h ep, const void *buffer, + size_t count, ucp_tag_t tag, + const ucp_request_param_t *param); + + /** * @ingroup UCP_COMM * @brief Non-blocking stream receive operation of structured data into a @@ -2295,7 +2873,7 @@ ucs_status_ptr_t ucp_tag_send_sync_nb(ucp_ep_h ep, const void *buffer, size_t co * integral multiple of the @a datatype size. * @param [in] flags Flags defined in @ref ucp_stream_recv_flags_t. * - * @return UCS_OK - The receive operation was completed + * @return NULL - The receive operation was completed * immediately. * @return UCS_PTR_IS_ERR(_ptr) - The receive operation failed. * @return otherwise - Operation was scheduled for receive. A request @@ -2311,6 +2889,45 @@ ucs_status_ptr_t ucp_stream_recv_nb(ucp_ep_h ep, void *buffer, size_t count, size_t *length, unsigned flags); +/** + * @ingroup UCP_COMM + * @brief Non-blocking stream receive operation of structured data into a + * user-supplied buffer. 
+ * + * This routine receives data that is described by the local address @a buffer, + * size @a count object on the endpoint @a ep. The routine is non-blocking + * and therefore returns immediately. The receive operation is considered + * complete when the message is delivered to the buffer. If the receive + * operation cannot be started, the routine returns an error. + * + * @param [in] ep UCP endpoint that is used for the receive operation. + * @param [in] buffer Pointer to the buffer that will receive the data. + * @param [in] count Number of elements to receive into @a buffer. + * @param [out] length Size of the received data in bytes. The value is + * valid only if return code is NULL. + * @param [in] param Operation parameters, see @ref ucp_request_param_t. + * This operation supports specific flags, which can be + * passed in @a param by @ref ucp_request_param_t.flags. + * The exact set of flags is defined by + * @ref ucp_stream_recv_flags_t. + * + * @return NULL - The receive operation was completed + * immediately. In this case the value pointed by + * @a length is updated by the size of received + * data. + * @return UCS_PTR_IS_ERR(_ptr) - The receive operation failed. + * @return otherwise - Operation was scheduled for receive. A request + * handle is returned to the application in order + * to track progress of the operation. + * + * @note The amount of data received, in bytes, is always an integral multiple + * of the @a datatype size. + */ +ucs_status_ptr_t ucp_stream_recv_nbx(ucp_ep_h ep, void *buffer, size_t count, + size_t *length, + const ucp_request_param_t *param); + + /** * @ingroup UCP_COMM * @brief Non-blocking stream receive operation of unstructured data into @@ -2327,7 +2944,7 @@ ucs_status_ptr_t ucp_stream_recv_nb(ucp_ep_h ep, void *buffer, size_t count, * operation. * @param [out] length Length of received data. * - * @return UCS_OK - No received data available on the @a ep. + * @return NULL - No received data available on the @a ep. 
* @return UCS_PTR_IS_ERR(_ptr) - the receive operation failed and * UCS_PTR_STATUS(_ptr) indicates an error. * @return otherwise - The pointer to the data UCS_STATUS_PTR(_ptr) @@ -2350,11 +2967,11 @@ ucs_status_ptr_t ucp_stream_recv_data_nb(ucp_ep_h ep, size_t *length); * @ingroup UCP_COMM * @brief Non-blocking tagged-receive operation. * - * This routine receives a messages that is described by the local address @a - * buffer, size @a count, and @a datatype object on the @a worker. The tag + * This routine receives a message that is described by the local address @a + * buffer, size @a count, and @a datatype object on the @a worker. The tag * value of the receive message has to match the @a tag and @a tag_mask values, - * where the @a tag_mask indicates what bits of the tag have to be matched. The - * routine is a non-blocking and therefore returns immediately. The receive + * where the @a tag_mask indicates which bits of the tag have to be matched. The + * routine is non-blocking and therefore returns immediately. The receive * operation is considered completed when the message is delivered to the @a * buffer. In order to notify the application about completion of the receive * operation the UCP library will invoke the call-back @a cb when the received @@ -2394,10 +3011,10 @@ ucs_status_ptr_t ucp_tag_recv_nb(ucp_worker_h worker, void *buffer, size_t count * @brief Non-blocking tagged-receive operation. * * This routine receives a message that is described by the local address @a - * buffer, size @a count, and @a datatype object on the @a worker. The tag + * buffer, size @a count, and @a datatype object on the @a worker. The tag * value of the receive message has to match the @a tag and @a tag_mask values, - * where the @a tag_mask indicates what bits of the tag have to be matched. The - * routine is a non-blocking and therefore returns immediately. The receive + * where the @a tag_mask indicates which bits of the tag have to be matched. 
The + * routine is non-blocking and therefore returns immediately. The receive * operation is considered completed when the message is delivered to the @a * buffer. In order to monitor completion of the operation * @ref ucp_request_check_status or @ref ucp_tag_recv_request_test should be @@ -2423,6 +3040,46 @@ ucs_status_t ucp_tag_recv_nbr(ucp_worker_h worker, void *buffer, size_t count, ucp_tag_t tag_mask, void *req); +/** + * @ingroup UCP_COMM + * @brief Non-blocking tagged-receive operation. + * + * This routine receives a message that is described by the local address @a + * buffer, size @a count, and @a info object on the @a worker. The tag + * value of the receive message has to match the @a tag and @a tag_mask values, + * where the @a tag_mask indicates what bits of the tag have to be matched. The + * routine is a non-blocking and therefore returns immediately. The receive + * operation is considered completed when the message is delivered to the @a + * buffer. In order to notify the application about completion of the receive + * operation the UCP library will invoke the call-back @a cb when the received + * message is in the receive buffer and ready for application access. If the + * receive operation cannot be stated the routine returns an error. + * + * @note This routine cannot return UCS_OK. It always returns a request + * handle or an error. + * + * @param [in] worker UCP worker that is used for the receive operation. + * @param [in] buffer Pointer to the buffer to receive the data to. + * @param [in] count Number of elements to receive + * @param [in] tag Message tag to expect. + * @param [in] tag_mask Bit mask that indicates the bits that are used for + * the matching of the incoming tag + * against the expected tag. + * @param [in] param Operation parameters, see @ref ucp_request_param_t + * + * @return UCS_PTR_IS_ERR(_ptr) - The receive operation failed. + * @return otherwise - Operation was scheduled for receive. 
The request + * handle is returned to the application in order + * to track progress of the operation. The + * application is responsible for releasing the + * handle using @ref ucp_request_free + * "ucp_request_free()" routine. + */ +ucs_status_ptr_t ucp_tag_recv_nbx(ucp_worker_h worker, void *buffer, size_t count, + ucp_tag_t tag, ucp_tag_t tag_mask, + const ucp_request_param_t *param); + + /** * @ingroup UCP_COMM * @brief Non-blocking probe and return a message. @@ -2472,16 +3129,16 @@ ucp_tag_message_h ucp_tag_probe_nb(ucp_worker_h worker, ucp_tag_t tag, * @ingroup UCP_COMM * @brief Non-blocking receive operation for a probed message. * - * This routine receives a messages that is described by the local address @a + * This routine receives a message that is described by the local address @a * buffer, size @a count, @a message handle, and @a datatype object on the @a - * worker. The @a message handle can be obtain by calling the @ref + * worker. The @a message handle can be obtained by calling the @ref * ucp_tag_probe_nb "ucp_tag_probe_nb()" routine. @ref ucp_tag_msg_recv_nb - * "ucp_tag_msg_recv_nb()" routine is a non-blocking and therefore returns + * "ucp_tag_msg_recv_nb()" routine is non-blocking and therefore returns * immediately. The receive operation is considered completed when the message - * is delivered to the @a buffer. In order to notify the application about + * is delivered to the @a buffer. In order to notify the application about * completion of the receive operation the UCP library will invoke the * call-back @a cb when the received message is in the receive buffer and ready - * for application access. If the receive operation cannot be stated the + * for application access. If the receive operation cannot be started the * routine returns an error. * * @param [in] worker UCP worker that is used for the receive operation. 
@@ -2514,7 +3171,7 @@ ucs_status_ptr_t ucp_tag_msg_recv_nb(ucp_worker_h worker, void *buffer, * This routine initiates a storage of contiguous block of data that is * described by the local address @a buffer in the remote contiguous memory * region described by @a remote_addr address and the @ref ucp_rkey_h "memory - * handle" @a rkey. The routine returns immediately and @b does @b not + * handle" @a rkey. The routine returns immediately and @b does @b not * guarantee re-usability of the source address @e buffer. If the operation is * completed immediately the routine return UCS_OK, otherwise UCS_INPROGRESS * or an error is returned to user. @@ -2570,7 +3227,7 @@ ucs_status_t ucp_put_nbi(ucp_ep_h ep, const void *buffer, size_t length, * can be modified. Does not guarantee remote * completion. * - * @return UCS_OK - The operation was completed immediately. + * @return NULL - The operation was completed immediately. * @return UCS_PTR_IS_ERR(_ptr) - The operation failed. * @return otherwise - Operation was scheduled and can be * completed at any point in time. The request handle @@ -2584,6 +3241,61 @@ ucs_status_ptr_t ucp_put_nb(ucp_ep_h ep, const void *buffer, size_t length, ucp_send_callback_t cb); +/** + * @ingroup UCP_COMM + * @brief Non-blocking remote memory put operation. + * + * This routine initiates a storage of contiguous block of data that is + * described by the local address @a buffer in the remote contiguous memory + * region described by @a remote_addr address and the @ref ucp_rkey_h "memory + * handle" @a rkey. The routine returns immediately and @b does @b not + * guarantee re-usability of the source address @e buffer. If the operation is + * completed immediately the routine return UCS_OK, otherwise UCS_INPROGRESS + * or an error is returned to user. If the put operation completes immediately, + * the routine returns UCS_OK and the call-back routine @a param.cb.send is + * @b not invoked. 
If the operation is @b not completed immediately and no + * error is reported, then the UCP library will schedule invocation of the + * call-back routine @a param.cb.send upon completion of the put operation. + * In other words, the completion of a put operation can be signaled by the + * return code or execution of the call-back. + * Immediate completion signals can be fine-tuned via the + * @ref ucp_request_param_t.op_attr_mask field in the + * @ref ucp_request_param_t structure. The values of this field + * are a bit-wise OR of the @ref ucp_op_attr_t enumeration. + * + * @note A user can use @ref ucp_worker_flush_nb "ucp_worker_flush_nb()" + * in order to guarantee re-usability of the source address @e buffer. + * + * @param [in] ep Remote endpoint handle. + * @param [in] buffer Pointer to the local source address. + * @param [in] count Number of elements of type + * @ref ucp_request_param_t.datatype to put. If + * @ref ucp_request_param_t.datatype is not specified, + * the type defaults to ucp_dt_make_contig(1), which + * corresponds to byte elements. + * @param [in] remote_addr Pointer to the destination remote memory address + * to write to. + * @param [in] rkey Remote memory key associated with the + * remote memory address. + * @param [in] param Operation parameters, see @ref ucp_request_param_t + * + * @return UCS_OK - The operation was completed immediately. + * @return UCS_PTR_IS_ERR(_ptr) - The operation failed. + * @return otherwise - Operation was scheduled and can be + * completed at any point in time. The request handle + * is returned to the application in order to track + * progress of the operation. The application is + * responsible for releasing the handle using + * @ref ucp_request_free "ucp_request_free()" routine. + * + * @note Only the datatype ucp_dt_make_contig(1) is supported + * for @a param->datatype, see @ref ucp_dt_make_contig. 
+ */ +ucs_status_ptr_t ucp_put_nbx(ucp_ep_h ep, const void *buffer, size_t count, + uint64_t remote_addr, ucp_rkey_h rkey, + const ucp_request_param_t *param); + + /** * @ingroup UCP_COMM * @brief Non-blocking implicit remote memory get operation. @@ -2591,7 +3303,7 @@ ucs_status_ptr_t ucp_put_nb(ucp_ep_h ep, const void *buffer, size_t length, * This routine initiate a load of contiguous block of data that is described * by the remote memory address @a remote_addr and the @ref ucp_rkey_h "memory handle" * @a rkey in the local contiguous memory region described by @a buffer - * address. The routine returns immediately and @b does @b not guarantee that + * address. The routine returns immediately and @b does @b not guarantee that * remote data is loaded and stored under the local address @e buffer. * * @note A user can use @ref ucp_worker_flush_nb "ucp_worker_flush_nb()" in order @@ -2599,11 +3311,11 @@ ucs_status_ptr_t ucp_put_nb(ucp_ep_h ep, const void *buffer, size_t length, * @e buffer. * * @param [in] ep Remote endpoint handle. - * @param [in] buffer Pointer to the local source address. + * @param [in] buffer Pointer to the local destination address. * @param [in] length Length of the data (in bytes) stored under the - * source address. - * @param [in] remote_addr Pointer to the destination remote memory address - * to write to. + * destination address. + * @param [in] remote_addr Pointer to the source remote memory address + * to read from. * @param [in] rkey Remote memory key associated with the * remote memory address. * @@ -2634,18 +3346,18 @@ ucs_status_t ucp_get_nbi(ucp_ep_h ep, void *buffer, size_t length, * in order to guarantee re-usability of the source address @e buffer. * * @param [in] ep Remote endpoint handle. - * @param [in] buffer Pointer to the local source address. + * @param [in] buffer Pointer to the local destination address. * @param [in] length Length of the data (in bytes) stored under the - * source address. 
- * @param [in] remote_addr Pointer to the destination remote memory address - * to write to. + * destination address. + * @param [in] remote_addr Pointer to the source remote memory address + * to read from. * @param [in] rkey Remote memory key associated with the * remote memory address. * @param [in] cb Call-back function that is invoked whenever the * get operation is completed and the data is * visible to the local process. * - * @return UCS_OK - The operation was completed immediately. + * @return NULL - The operation was completed immediately. * @return UCS_PTR_IS_ERR(_ptr) - The operation failed. * @return otherwise - Operation was scheduled and can be * completed at any point in time. The request handle @@ -2658,6 +3370,58 @@ ucs_status_ptr_t ucp_get_nb(ucp_ep_h ep, void *buffer, size_t length, uint64_t remote_addr, ucp_rkey_h rkey, ucp_send_callback_t cb); + +/** + * @ingroup UCP_COMM + * @brief Non-blocking remote memory get operation. + * + * This routine initiates a load of a contiguous block of data that is + * described by the remote memory address @a remote_addr and the @ref ucp_rkey_h + * "memory handle" @a rkey in the local contiguous memory region described + * by @a buffer address. The routine returns immediately and @b does @b not + * guarantee that remote data is loaded and stored under the local address @e + * buffer. If the operation is completed immediately the routine return UCS_OK, + * otherwise UCS_INPROGRESS or an error is returned to user. If the get + * operation completes immediately, the routine returns UCS_OK and the + * call-back routine @a param.cb.send is @b not invoked. If the operation is + * @b not completed immediately and no error is reported, then the UCP library + * will schedule invocation of the call-back routine @a param.cb.send upon + * completion of the get operation. In other words, the completion of a get + * operation can be signaled by the return code or execution of the call-back. 
+ * + * @note A user can use @ref ucp_worker_flush_nb "ucp_worker_flush_nb()" + * in order to guarantee re-usability of the source address @e buffer. + * + * @param [in] ep Remote endpoint handle. + * @param [in] buffer Pointer to the local destination address. + * @param [in] count Number of elements of type + * @ref ucp_request_param_t.datatype to put. If + * @ref ucp_request_param_t.datatype is not specified, + * the type defaults to ucp_dt_make_contig(1), which + * corresponds to byte elements. + * @param [in] remote_addr Pointer to the source remote memory address + * to read from. + * @param [in] rkey Remote memory key associated with the + * remote memory address. + * @param [in] param Operation parameters, see @ref ucp_request_param_t. + * + * @return UCS_OK - The operation was completed immediately. + * @return UCS_PTR_IS_ERR(_ptr) - The operation failed. + * @return otherwise - Operation was scheduled and can be + * completed at any point in time. The request handle + * is returned to the application in order to track + * progress of the operation. The application is + * responsible for releasing the handle using + * @ref ucp_request_free "ucp_request_free()" routine. + * + * @note Only the datatype ucp_dt_make_contig(1) is supported + * for @a param->datatype, see @ref ucp_dt_make_contig. + */ +ucs_status_ptr_t ucp_get_nbx(ucp_ep_h ep, void *buffer, size_t count, + uint64_t remote_addr, ucp_rkey_h rkey, + const ucp_request_param_t *param); + + /** * @ingroup UCP_COMM * @brief Post an atomic memory operation. @@ -2720,7 +3484,7 @@ ucs_status_t ucp_atomic_post(ucp_ep_h ep, ucp_atomic_post_op_t opcode, uint64_t * that the call-back function is only invoked in a case when * the operation cannot be completed in place. * - * @return UCS_OK - The operation was completed immediately. + * @return NULL - The operation was completed immediately. * @return UCS_PTR_IS_ERR(_ptr) - The operation failed. 
* @return otherwise - Operation was scheduled and can be * completed at any point in time. The request handle @@ -2736,6 +3500,76 @@ ucp_atomic_fetch_nb(ucp_ep_h ep, ucp_atomic_fetch_op_t opcode, ucp_send_callback_t cb); +/** + * @ingroup UCP_COMM + * @brief Post an atomic memory operation. + * + * This routine will post an atomic operation to remote memory. + * The remote value is described by the combination of the remote + * memory address @a remote_addr and the @ref ucp_rkey_h "remote memory handle" + * @a rkey. The routine is non-blocking and therefore returns immediately. + * However, the actual atomic operation may be delayed. In order to enable + * fetching semantics for atomic operations user has to specify + * @a param.reply_buffer. Please see @ref atomic_ops "table" below for more + * details. + * + * @note The user should not modify any part of the @a buffer (or also + * @a param->reply_buffer for fetch operations), until the operation + * completes. + * @note Only ucp_dt_make_config(4) and ucp_dt_make_contig(8) are supported + * in @a param->datatype, see @ref ucp_dt_make_contig. + * + * + * + *
Atomic Operations Semantic
Atomic Operation Pseudo code + * X Y Z + * Result + *
@ref UCP_ATOMIC_OP_ADD Result=Y; Y+=X + * bufferremote_addr- + * param.reply_buffer(optional) + *
@ref UCP_ATOMIC_OP_SWAP Result=Y; Y=X + * bufferremote_addr - + * param.reply_buffer + *
@ref UCP_ATOMIC_OP_CSWAP + * Result=Y; if (X==Y) then Y=Zbuffer + * remote_addr param.reply_buffer + * param.reply_buffer + *
@ref UCP_ATOMIC_OP_AND Result=Y; Y&=X + * bufferremote_addr - + * param.reply_buffer(optional) + *
@ref UCP_ATOMIC_OP_OR Result=Y; Y|=X + * bufferremote_addr - + * param.reply_buffer(optional) + *
@ref UCP_ATOMIC_OP_XOR Result=Y; Y^=X + * bufferremote_addr - + * param.reply_buffer(optional) + *
+ * + * @param [in] ep UCP endpoint. + * @param [in] opcode One of @ref ucp_atomic_op_t. + * @param [in] buffer Address of operand for the atomic operation. See + * @ref atomic_ops "Atomic Operations Semantic table" + * for exact usage by different atomic operations. + * @param [in] count Number of elements in @a buffer and @a result. The + * size of each element is specified by + * @ref ucp_request_param_t.datatype + * @param [in] remote_addr Remote address to operate on. + * @param [in] rkey Remote key handle for the remote memory address. + * @param [in] param Operation parameters, see @ref ucp_request_param_t. + * + * @return NULL - The operation completed immediately. + * @return UCS_PTR_IS_ERR(_ptr) - The operation failed. + * @return otherwise - Operation was scheduled and can be + * completed at some time in the future. The + * request handle is returned to the application + * in order to track progress of the operation. + */ +ucs_status_ptr_t +ucp_atomic_op_nbx(ucp_ep_h ep, ucp_atomic_op_t opcode, const void *buffer, + size_t count, uint64_t remote_addr, ucp_rkey_h rkey, + const ucp_request_param_t *param); + + /** * @ingroup UCP_COMM * @brief Check the status of non-blocking request. @@ -2840,6 +3674,22 @@ void ucp_stream_data_release(ucp_ep_h ep, void *data); void ucp_request_free(void *request); +/** + * @ingroup UCP_COMM + * @brief Create an empty communications request. + * + * @param [in] worker UCP worker. + * + * @return Error code as defined by @ref ucs_status_t + * + * This routine creates request which may be used in functions + * @ref ucp_tag_send_nbx, @ref ucp_tag_recv_nbx, etc. The application + * is responsible for releasing the handle using the @ref ucp_request_free + * routine + */ +void *ucp_request_alloc(ucp_worker_h worker); + + /** * @ingroup UCP_DATATYPE * @brief Create a generic datatype. 
@@ -2929,7 +3779,7 @@ ucs_status_t ucp_worker_fence(ucp_worker_h worker); * @param [in] cb Callback which will be called when the flush operation * completes. * - * @return UCS_OK - The flush operation was completed immediately. + * @return NULL - The flush operation was completed immediately. * @return UCS_PTR_IS_ERR(_ptr) - The flush operation failed. * @return otherwise - Flush operation was scheduled and can be completed * in any point in time. The request handle is returned @@ -2942,6 +3792,35 @@ ucs_status_ptr_t ucp_worker_flush_nb(ucp_worker_h worker, unsigned flags, ucp_send_callback_t cb); +/** + * @ingroup UCP_WORKER + * + * @brief Flush outstanding AMO and RMA operations on the @ref ucp_worker_h + * "worker" + * + * This routine flushes all outstanding AMO and RMA communications on the + * @ref ucp_worker_h "worker". All the AMO and RMA operations issued on the + * @a worker prior to this call are completed both at the origin and at the + * target when this call returns. + * + * @note For description of the differences between @ref ucp_worker_flush_nb + * "flush" and @ref ucp_worker_fence "fence" operations please see + * @ref ucp_worker_fence "ucp_worker_fence()" + * + * @param [in] worker UCP worker. + * @param [in] param Operation parameters, see @ref ucp_request_param_t + * + * @return NULL - The flush operation was completed immediately. + * @return UCS_PTR_IS_ERR(_ptr) - The flush operation failed. + * @return otherwise - Flush operation was scheduled and can be + * completed in any point in time. The request + * handle is returned to the application in order + * to track progress. + */ +ucs_status_ptr_t ucp_worker_flush_nbx(ucp_worker_h worker, + const ucp_request_param_t *param); + + /** * @example ucp_hello_world.c * UCP hello world client / server example utility. 
diff --git a/src/ucp/api/ucp_def.h b/src/ucp/api/ucp_def.h index 5ebd444875a..c24a8e58c04 100644 --- a/src/ucp/api/ucp_def.h +++ b/src/ucp/api/ucp_def.h @@ -198,7 +198,7 @@ typedef struct ucp_mem_attr { * @brief UCP Memory handle attributes field mask. * * The enumeration allows specifying which fields in @ref ucp_mem_attr_t are - * present. It is used for the enablement of backward compatibility support. + * present. It is used to enable backward compatibility support. */ enum ucp_mem_attr_field { UCP_MEM_ATTR_FIELD_ADDRESS = UCS_BIT(0), /**< Virtual address */ @@ -211,11 +211,11 @@ enum ucp_mem_attr_field { * @brief UCP Worker * * UCP worker is an opaque object representing the communication context. The - * worker represents an instance of a local communication resource and progress - * engine associated with it. Progress engine is a construct that is - * responsible for asynchronous and independent progress of communication - * directives. The progress engine could be implement in hardware or software. - * The worker object abstract an instance of network resources such as a host + * worker represents an instance of a local communication resource and the + * progress engine associated with it. The progress engine is a construct that + * is responsible for asynchronous and independent progress of communication + * directives. The progress engine could be implemented in hardware or software. + * The worker object abstracts an instance of network resources such as a host * channel adapter port, network interface, or multiple resources such as * multiple network interfaces or communication ports. It could also represent * virtual communication resources that are defined across multiple devices. @@ -296,7 +296,7 @@ typedef void (*ucp_request_cleanup_callback_t)(void *request); * * @param [in] request The completed send request. * @param [in] status Completion status. If the send operation was completed - * successfully UCX_OK is returned. 
If send operation was + * successfully UCS_OK is returned. If send operation was * canceled UCS_ERR_CANCELED is returned. * Otherwise, an @ref ucs_status_t "error status" is * returned. @@ -305,6 +305,27 @@ typedef void (*ucp_send_callback_t)(void *request, ucs_status_t status); /** + * @ingroup UCP_COMM + * @brief Completion callback for non-blocking sends ucp_tag_send_nbx call. + * + * This callback routine is invoked whenever the @ref ucp_tag_send_nbx + * "send operation" is completed. It is important to note that the call-back is + * only invoked in a case when the operation cannot be completed in place. + * + * @param [in] request The completed send request. + * @param [in] status Completion status. If the send operation was completed + * successfully UCS_OK is returned. If send operation was + * canceled UCS_ERR_CANCELED is returned. + * Otherwise, an @ref ucs_status_t "error status" is + * returned. + * @param [in] user_data User data passed to "user_data" value, + * see @ref ucp_request_param_t + */ +typedef void (*ucp_send_nbx_callback_t)(void *request, ucs_status_t status, + void *user_data); + + +/** * @ingroup UCP_COMM * @brief Callback to process peer failure. * @@ -402,7 +423,7 @@ typedef struct ucp_listener_conn_handler { * * @param [in] request The completed receive request. * @param [in] status Completion status. If the send operation was completed - * successfully UCX_OK is returned. Otherwise, + * successfully UCS_OK is returned. Otherwise, * an @ref ucs_status_t "error status" is returned. * @param [in] length The size of the received data in bytes, always * boundary of base datatype size. The value is valid @@ -412,6 +433,28 @@ typedef void (*ucp_stream_recv_callback_t)(void *request, ucs_status_t status, size_t length); +/** + * @ingroup UCP_COMM + * @brief Completion callback for non-blocking stream receives + * ucp_stream_recv_nbx call. 
+ * + * This callback routine is invoked whenever the @ref ucp_stream_recv_nbx + * "receive operation" is completed and the data is ready in the receive buffer. + * + * @param [in] request The completed receive request. + * @param [in] status Completion status. If the send operation was completed + * successfully UCS_OK is returned. Otherwise, + * an @ref ucs_status_t "error status" is returned. + * @param [in] length The size of the received data in bytes, always on the + * boundary of base datatype size. The value is valid + * only if the status is UCS_OK. + * @param [in] user_data User data passed to "user_data" value, + * see @ref ucp_request_param_t. + */ +typedef void (*ucp_stream_recv_nbx_callback_t)(void *request, ucs_status_t status, + size_t length, void *user_data); + + /** * @ingroup UCP_COMM * @brief Completion callback for non-blocking tag receives. @@ -421,7 +464,7 @@ typedef void (*ucp_stream_recv_callback_t)(void *request, ucs_status_t status, * * @param [in] request The completed receive request. * @param [in] status Completion status. If the send operation was completed - * successfully UCX_OK is returned. If send operation was + * successfully UCS_OK is returned. If send operation was * canceled UCS_ERR_CANCELED is returned. If the data can * not fit into the receive buffer the * @ref UCS_ERR_MESSAGE_TRUNCATED error code is returned. @@ -434,6 +477,33 @@ typedef void (*ucp_stream_recv_callback_t)(void *request, ucs_status_t status, typedef void (*ucp_tag_recv_callback_t)(void *request, ucs_status_t status, ucp_tag_recv_info_t *info); + +/** + * @ingroup UCP_COMM + * @brief Completion callback for non-blocking tag receives ucp_tag_recv_nbx call. + * + * This callback routine is invoked whenever the @ref ucp_tag_recv_nbx + * "receive operation" is completed and the data is ready in the receive buffer. + * + * @param [in] request The completed receive request. + * @param [in] status Completion status. 
If the send operation was completed + * successfully UCS_OK is returned. If send operation was + * canceled UCS_ERR_CANCELED is returned. If the data can + * not fit into the receive buffer the + * @ref UCS_ERR_MESSAGE_TRUNCATED error code is returned. + * Otherwise, an @ref ucs_status_t "error status" is + * returned. + * @param [in] info @ref ucp_tag_recv_info_t "Completion information" + * The @a info descriptor is Valid only if the status is + * UCS_OK. + * @param [in] user_data User data passed to "user_data" value, + * see @ref ucp_request_param_t + */ +typedef void (*ucp_tag_recv_nbx_callback_t)(void *request, ucs_status_t status, + const ucp_tag_recv_info_t *tag_info, + void *user_data); + + /** * @ingroup UCP_WORKER * @brief UCP worker wakeup events mask. @@ -471,6 +541,41 @@ typedef enum ucp_wakeup_event_types { } ucp_wakeup_event_t; +/** + * @ingroup UCP_ENDPOINT + * @brief Callback to process incoming Active Message. + * + * When the callback is called, @a flags indicates how @a data should be handled. + * + * @param [in] arg User-defined argument. + * @param [in] data Points to the received data. This data may + * persist after the callback returns and needs + * to be freed with @ref ucp_am_data_release. + * @param [in] length Length of data. + * @param [in] reply_ep If the Active Message is sent with the + * UCP_AM_SEND_REPLY flag, the sending ep + * will be passed in. If not, NULL will be passed. + * @param [in] flags If this flag is set to UCP_CB_PARAM_FLAG_DATA, + * the callback can return UCS_INPROGRESS and + * data will persist after the callback returns. + * + * @return UCS_OK @a data will not persist after the callback returns. + * + * @return UCS_INPROGRESS Can only be returned if flags is set to + * UCP_CB_PARAM_FLAG_DATA. If UCP_INPROGRESS + * is returned, data will persist after the + * callback has returned. To free the memory, + * a pointer to the data must be passed into + * @ref ucp_am_data_release. 
+ * + * @note This callback should be set and released + * by @ref ucp_worker_set_am_handler function. + * + */ +typedef ucs_status_t (*ucp_am_callback_t)(void *arg, void *data, size_t length, + ucp_ep_h reply_ep, unsigned flags); + + /** * @ingroup UCP_ENDPOINT * @brief Tuning parameters for the UCP endpoint. diff --git a/src/ucp/api/ucpx.h b/src/ucp/api/ucpx.h index 14828c1425c..c6cc8eddb39 100644 --- a/src/ucp/api/ucpx.h +++ b/src/ucp/api/ucpx.h @@ -19,152 +19,6 @@ BEGIN_C_DECLS -/** - * @ingroup UCP_ENDPOINT - * @brief Callback to process incoming active message - * - * When the callback is called, @a flags indicates how @a data should be handled. - * - * @param [in] arg User-defined argument. - * @param [in] data Points to the received data. This data may - * persist after the callback returns and need - * to be freed with @ref ucp_am_data_release - * @param [in] length Length of data. - * @param [in] reply_ep If the active message is sent with the - * UCP_AM_SEND_REPLY flag, the sending ep - * will be passed in. If not, NULL will be passed - * @param [in] flags If this flag is set to UCP_CB_PARAM_FLAG_DATA, - * the callback can return UCS_INPROGRESS and - * data will persist after the callback returns - * - * @return UCS_OK @a data will not persist after the callback returns - * - * @return UCS_INPROGRESS Can only be returned if flags is set to - * UCP_CB_PARAM_FLAG_DATA. If UCP_INPROGRESS - * is returned, data will persist after the - * callback has returned. To free the memory, - * a pointer to the data must be passed into - * @ref ucp_am_data_release - * - * @note This callback could be set and released - * by @ref ucp_worker_set_am_handler function. 
- * - */ -typedef ucs_status_t (*ucp_am_callback_t)(void *arg, void *data, size_t length, - ucp_ep_h reply_ep, unsigned flags); - - -/** - * @ingroup UCP_WORKER - * @brief Flags for a UCP AM callback - * - * Flags that indicate how to handle UCP Active Messages - * Currently only UCP_AM_FLAG_WHOLE_MSG is supported, - * which indicates the entire message is handled in one - * callback - */ -enum ucp_am_cb_flags { - UCP_AM_FLAG_WHOLE_MSG = UCS_BIT(0) -}; - - -/** - * @ingroup UCP_WORKER - * @brief Flags for sending a UCP AM - * - * Flags dictate the behavior of ucp_am_send_nb - * currently the only flag tells ucp to pass in - * the sending endpoint to the call - * back so a reply can be defined - */ -enum ucp_send_am_flags { - UCP_AM_SEND_REPLY = UCS_BIT(0) -}; - - -/** - * @ingroup UCP_ENDPOINT - * @brief Descriptor flags for Active Message Callback - * - * In a callback, if flags is set to UCP_CB_PARAM_FLAG_DATA, data - * was allocated, so if UCS_INPROGRESS is returned from the - * callback, the data parameter will persist and the user has to call - * @ref ucp_am_data_release - */ -enum ucp_cb_param_flags { - UCP_CB_PARAM_FLAG_DATA = UCS_BIT(0) -}; - - -/** - * @ingroup UCP_WORKER - * @brief Add user defined callback for active message. - * - * This routine installs a user defined callback to handle incoming active - * messages with a specific id. This callback is called whenever an active message, - * which was sent from the remote peer by @ref for ucp_am_send_nb, is received on - * this worker. - * - * @param [in] worker UCP worker on which to set the am handler - * @param [in] id Active message id. - * @param [in] cb Active message callback. NULL to clear. - * @param [in] arg Active message argument, which will be passed in to - * every invocation of the callback as the arg argument. - * @param [in] flags Dictates how an Active Message is handled on the remote endpoint. 
- * Currently only UCP_AM_FLAG_WHOLE_MSG is supported, which indicates - * the callback will not be invoked until all data has arrived. - * - * @return error code if the worker does not support active messages or - * requested callback flags - */ -ucs_status_t ucp_worker_set_am_handler(ucp_worker_h worker, uint16_t id, - ucp_am_callback_t cb, void *arg, - uint32_t flags); - - -/** - * @ingroup UCP_COMM - * @brief Send Active Message - * - * This routine sends an Active Message to an ep. It does not support - * CUDA memory. - * - * @param [in] ep UCP endpoint where the active message will be run - * @param [in] id Active Message id. Specifies which registered - * callback to run. - * @param [in] buffer Pointer to the data to be sent to the target node - * for the AM. - * @param [in] count Number of elements to send. - * @param [in] datatype Datatype descriptor for the elements in the buffer. - * @param [in] cb Callback that is invoked upon completion of the data - * transfer if it is not completed immediately - * @param [in] flags For Future use - * - * @return UCS_OK Active message was sent immediately - * @return UCS_PTR_IS_ERR(_ptr) Error sending Active Message - * @return otherwise Pointer to request, and Active Message is known - * to be completed after cb is run - */ -ucs_status_ptr_t ucp_am_send_nb(ucp_ep_h ep, uint16_t id, - const void *buffer, size_t count, - ucp_datatype_t datatype, - ucp_send_callback_t cb, unsigned flags); - - -/** - * @ingroup UCP_COMM - * @brief Releases am data - * - * This routine releases back data that persisted through an AM - * callback because that callback returned UCS_INPROGRESS - * - * @param [in] worker Worker which received the active message - * @param [in] data Pointer to data that was passed into - * the Active Message callback as the data - * parameter and the callback flags were set to - * UCP_CB_PARAM_FLAG_DATA - */ -void ucp_am_data_release(ucp_worker_h worker, void *data); END_C_DECLS diff --git 
a/src/ucp/core/ucp_am.c b/src/ucp/core/ucp_am.c new file mode 100644 index 00000000000..c2a245e2e26 --- /dev/null +++ b/src/ucp/core/ucp_am.c @@ -0,0 +1,775 @@ +/** +* Copyright (C) Los Alamos National Security, LLC. 2019 ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "ucp_am.h" +#include "ucp_am.inl" + +#include +#include +#include +#include +#include +#include +#include + + +ucs_status_t ucp_am_init(ucp_worker_h worker) +{ + if (!(worker->context->config.features & UCP_FEATURE_AM)) { + return UCS_OK; + } + + worker->am.cbs_array_len = 0ul; + worker->am.cbs = NULL; + + return UCS_OK; +} + +void ucp_am_cleanup(ucp_worker_h worker) +{ + if (!(worker->context->config.features & UCP_FEATURE_AM)) { + return; + } + + ucs_free(worker->am.cbs); + worker->am.cbs_array_len = 0; +} + +void ucp_am_ep_init(ucp_ep_h ep) +{ + ucp_ep_ext_proto_t *ep_ext = ucp_ep_ext_proto(ep); + + if (ep->worker->context->config.features & UCP_FEATURE_AM) { + ucs_list_head_init(&ep_ext->am.started_ams); + ucs_queue_head_init(&ep_ext->am.mid_rdesc_q); + } +} + +void ucp_am_ep_cleanup(ucp_ep_h ep) +{ + ucp_ep_ext_proto_t *ep_ext = ucp_ep_ext_proto(ep); + + if (ep->worker->context->config.features & UCP_FEATURE_AM) { + if (ucs_unlikely(!ucs_list_is_empty(&ep_ext->am.started_ams))) { + ucs_warn("worker %p: not all UCP active messages have been" + " run to completion on ep %p", ep->worker, ep); + } + + if (ucs_unlikely(!ucs_queue_is_empty(&ep_ext->am.mid_rdesc_q))) { + ucs_warn("worker %p: unhandled middle fragments left on ep %p", + ep->worker, ep); + } + } +} + +UCS_PROFILE_FUNC_VOID(ucp_am_data_release, (worker, data), + ucp_worker_h worker, void *data) +{ + ucp_recv_desc_t *rdesc = (ucp_recv_desc_t *)data - 1; + + if (ucs_unlikely(rdesc->flags & UCP_RECV_DESC_FLAG_MALLOC)) { + /* Don't use UCS_PTR_BYTE_OFFSET here due to coverity false + * positive 
report. Need to step back by first_header size, where + * originally allocated pointer resides. */ + ucs_free((char*)rdesc - sizeof(ucp_am_first_hdr_t)); + return; + } + + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); + ucp_recv_desc_release(rdesc); + UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); +} + +UCS_PROFILE_FUNC(ucs_status_t, ucp_worker_set_am_handler, + (worker, id, cb, arg, flags), + ucp_worker_h worker, uint16_t id, + ucp_am_callback_t cb, void *arg, + uint32_t flags) +{ + size_t num_entries; + ucp_am_entry_t *am_cbs; + int i; + + UCP_CONTEXT_CHECK_FEATURE_FLAGS(worker->context, UCP_FEATURE_AM, + return UCS_ERR_INVALID_PARAM); + + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); + + if (id >= worker->am.cbs_array_len) { + num_entries = ucs_align_up_pow2(id + 1, UCP_AM_CB_BLOCK_SIZE); + am_cbs = ucs_realloc(worker->am.cbs, num_entries * + sizeof(ucp_am_entry_t), + "UCP AM callback array"); + if (ucs_unlikely(am_cbs == NULL)) { + ucs_error("failed to grow UCP am cbs array to %zu", num_entries); + return UCS_ERR_NO_MEMORY; + } + + for (i = worker->am.cbs_array_len; i < num_entries; ++i) { + am_cbs[i].cb = NULL; + am_cbs[i].context = NULL; + am_cbs[i].flags = 0; + } + + worker->am.cbs = am_cbs; + worker->am.cbs_array_len = num_entries; + } + + worker->am.cbs[id].cb = cb; + worker->am.cbs[id].context = arg; + worker->am.cbs[id].flags = flags; + + UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); + + return UCS_OK; +} + +static UCS_F_ALWAYS_INLINE int ucp_am_recv_check_id(ucp_worker_h worker, + uint16_t am_id) +{ + if (ucs_unlikely((am_id >= worker->am.cbs_array_len) || + (worker->am.cbs[am_id].cb == NULL))) { + ucs_warn("UCP Active Message was received with id : %u, but there" + " is no registered callback for that id", am_id); + return 0; + } + + return 1; +} + +static UCS_F_ALWAYS_INLINE void +ucp_am_fill_header(ucp_am_hdr_t *hdr, ucp_request_t *req) +{ + hdr->am_id = req->send.msg_proto.am.am_id; + hdr->flags = req->send.msg_proto.am.flags; + hdr->padding = 
0; +} + +static UCS_F_ALWAYS_INLINE void +ucp_am_fill_middle_header(ucp_am_mid_hdr_t *hdr, ucp_request_t *req) +{ + hdr->msg_id = req->send.msg_proto.message_id; + hdr->offset = req->send.state.dt.offset; + hdr->ep_ptr = ucp_request_get_dest_ep_ptr(req); +} + +static UCS_F_ALWAYS_INLINE void +ucp_am_fill_first_header(ucp_am_first_hdr_t *hdr, ucp_request_t *req) +{ + ucp_am_fill_header(&hdr->super.super, req); + hdr->super.ep_ptr = ucp_request_get_dest_ep_ptr(req); + hdr->msg_id = req->send.msg_proto.message_id; + hdr->total_size = req->send.length; +} + +static size_t +ucp_am_bcopy_pack_args_single(void *dest, void *arg) +{ + ucp_am_hdr_t *hdr = dest; + ucp_request_t *req = arg; + size_t length; + + ucs_assert(req->send.state.dt.offset == 0); + + ucp_am_fill_header(hdr, req); + + length = ucp_dt_pack(req->send.ep->worker, req->send.datatype, + UCS_MEMORY_TYPE_HOST, hdr + 1, req->send.buffer, + &req->send.state.dt, req->send.length); + ucs_assert(length == req->send.length); + + return sizeof(*hdr) + length; +} + +static size_t +ucp_am_bcopy_pack_args_single_reply(void *dest, void *arg) +{ + ucp_am_reply_hdr_t *reply_hdr = dest; + ucp_request_t *req = arg; + size_t length; + + ucs_assert(req->send.state.dt.offset == 0); + + ucp_am_fill_header(&reply_hdr->super, req); + reply_hdr->ep_ptr = ucp_request_get_dest_ep_ptr(req); + + length = ucp_dt_pack(req->send.ep->worker, req->send.datatype, + UCS_MEMORY_TYPE_HOST, reply_hdr + 1, + req->send.buffer, + &req->send.state.dt, req->send.length); + ucs_assert(length == req->send.length); + + return sizeof(*reply_hdr) + length; +} + +static size_t +ucp_am_bcopy_pack_args_first(void *dest, void *arg) +{ + ucp_am_first_hdr_t *hdr = dest; + ucp_request_t *req = arg; + size_t length; + + length = ucs_min(req->send.length, + ucp_ep_get_max_bcopy(req->send.ep, req->send.lane) - + sizeof(*hdr)); + + ucp_am_fill_first_header(hdr, req); + + ucs_assert(req->send.state.dt.offset == 0); + + return sizeof(*hdr) + 
ucp_dt_pack(req->send.ep->worker, + req->send.datatype, + UCS_MEMORY_TYPE_HOST, + hdr + 1, req->send.buffer, + &req->send.state.dt, length); +} + +static size_t +ucp_am_bcopy_pack_args_mid(void *dest, void *arg) +{ + ucp_am_mid_hdr_t *hdr = dest; + ucp_request_t *req = arg; + size_t max_bcopy = ucp_ep_get_max_bcopy(req->send.ep, req->send.lane); + size_t length = ucs_min(max_bcopy - sizeof(*hdr), + req->send.length - req->send.state.dt.offset); + + ucp_am_fill_middle_header(hdr, req); + + return sizeof(*hdr) + ucp_dt_pack(req->send.ep->worker, + req->send.datatype, + UCS_MEMORY_TYPE_HOST, + hdr + 1, req->send.buffer, + &req->send.state.dt, length); +} + +static ucs_status_t ucp_am_send_short(ucp_ep_h ep, uint16_t id, + const void *payload, size_t length) +{ + uct_ep_h am_ep = ucp_ep_get_am_uct_ep(ep); + ucp_am_hdr_t hdr; + + UCS_STATIC_ASSERT(sizeof(ucp_am_hdr_t) == sizeof(uint64_t)); + hdr.am_id = id; + hdr.flags = 0; + hdr.padding = 0; + + return uct_ep_am_short(am_ep, UCP_AM_ID_SINGLE, hdr.u64, + (void *)payload, length); +} + +static ucs_status_t ucp_am_contig_short(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucp_ep_t *ep = req->send.ep; + ucs_status_t status; + + req->send.lane = ucp_ep_get_am_lane(ep); + status = ucp_am_send_short(ep, req->send.msg_proto.am.am_id, + req->send.buffer, req->send.length); + if (ucs_likely(status == UCS_OK)) { + ucp_request_complete_send(req, UCS_OK); + } + + return status; +} + +static ucs_status_t ucp_am_bcopy_single(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucs_status_t status; + + status = ucp_do_am_bcopy_single(self, UCP_AM_ID_SINGLE, + ucp_am_bcopy_pack_args_single); + if (status == UCS_OK) { + ucp_request_send_generic_dt_finish(req); + ucp_request_complete_send(req, UCS_OK); + } + + return status; +} + +static ucs_status_t ucp_am_bcopy_single_reply(uct_pending_req_t *self) +{ + ucp_request_t *req = 
ucs_container_of(self, ucp_request_t, send.uct); + ucs_status_t status; + + status = ucp_do_am_bcopy_single(self, UCP_AM_ID_SINGLE_REPLY, + ucp_am_bcopy_pack_args_single_reply); + if (status == UCS_OK) { + ucp_request_send_generic_dt_finish(req); + ucp_request_complete_send(req, UCS_OK); + } + + return status; +} + +static ucs_status_t ucp_am_bcopy_multi(uct_pending_req_t *self) +{ + ucs_status_t status = ucp_do_am_bcopy_multi(self, UCP_AM_ID_FIRST, + UCP_AM_ID_MIDDLE, + ucp_am_bcopy_pack_args_first, + ucp_am_bcopy_pack_args_mid, 0); + ucp_request_t *req; + + if (status == UCS_OK) { + req = ucs_container_of(self, ucp_request_t, send.uct); + ucp_request_send_generic_dt_finish(req); + ucp_request_complete_send(req, UCS_OK); + } else if (status == UCP_STATUS_PENDING_SWITCH) { + status = UCS_OK; + } + + return status; +} + +static ucs_status_t ucp_am_zcopy_single(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucp_am_hdr_t hdr; + + ucp_am_fill_header(&hdr, req); + + return ucp_do_am_zcopy_single(self, UCP_AM_ID_SINGLE, &hdr, + sizeof(hdr), ucp_proto_am_zcopy_req_complete); +} + +static ucs_status_t ucp_am_zcopy_single_reply(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucp_am_reply_hdr_t reply_hdr; + + ucp_am_fill_header(&reply_hdr.super, req); + reply_hdr.ep_ptr = ucp_request_get_dest_ep_ptr(req); + + return ucp_do_am_zcopy_single(self, UCP_AM_ID_SINGLE_REPLY, + &reply_hdr, sizeof(reply_hdr), + ucp_proto_am_zcopy_req_complete); +} + +static ucs_status_t ucp_am_zcopy_multi(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucp_am_first_hdr_t first_hdr; + ucp_am_mid_hdr_t mid_hdr; + + ucp_am_fill_first_header(&first_hdr, req); + ucp_am_fill_middle_header(&mid_hdr, req); + + return ucp_do_am_zcopy_multi(self, UCP_AM_ID_FIRST, UCP_AM_ID_MIDDLE, + &first_hdr, sizeof(first_hdr), &mid_hdr, + sizeof(mid_hdr), 
ucp_proto_am_zcopy_req_complete, + 1); +} + +static void ucp_am_send_req_init(ucp_request_t *req, ucp_ep_h ep, + const void *buffer, uintptr_t datatype, + size_t count, uint16_t flags, + uint16_t am_id) +{ + req->flags = UCP_REQUEST_FLAG_SEND_AM; + req->send.ep = ep; + req->send.msg_proto.am.am_id = am_id; + req->send.msg_proto.am.flags = flags; + req->send.buffer = (void *)buffer; + req->send.datatype = datatype; + req->send.mem_type = UCS_MEMORY_TYPE_HOST; + req->send.lane = ep->am_lane; + + ucp_request_send_state_init(req, datatype, count); + req->send.length = ucp_dt_length(req->send.datatype, count, + req->send.buffer, + &req->send.state.dt); +} + +static UCS_F_ALWAYS_INLINE ucs_status_ptr_t +ucp_am_send_req(ucp_request_t *req, size_t count, + const ucp_ep_msg_config_t *msg_config, + ucp_send_callback_t cb, const ucp_request_send_proto_t *proto) +{ + + size_t zcopy_thresh = ucp_proto_get_zcopy_threshold(req, msg_config, + count, SIZE_MAX); + ssize_t max_short = ucp_am_get_short_max(req, msg_config); + ucs_status_t status; + + status = ucp_request_send_start(req, max_short, + zcopy_thresh, SIZE_MAX, + count, msg_config, + proto); + if (status != UCS_OK) { + return UCS_STATUS_PTR(status); + } + + /* Start the request. + * If it is completed immediately, release the request and return the status. + * Otherwise, return the request. 
+ */ + status = ucp_request_send(req, 0); + if (req->flags & UCP_REQUEST_FLAG_COMPLETED) { + ucs_trace_req("releasing send request %p, returning status %s", req, + ucs_status_string(status)); + ucp_request_put(req); + return UCS_STATUS_PTR(status); + } + + ucp_request_set_callback(req, send.cb, (ucp_send_nbx_callback_t)cb, NULL); + + return req + 1; +} + +UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_am_send_nb, + (ep, id, payload, count, datatype, cb, flags), + ucp_ep_h ep, uint16_t id, const void *payload, + size_t count, uintptr_t datatype, + ucp_send_callback_t cb, unsigned flags) +{ + ucs_status_t status; + ucs_status_ptr_t ret; + ucp_request_t *req; + size_t length; + + UCP_CONTEXT_CHECK_FEATURE_FLAGS(ep->worker->context, UCP_FEATURE_AM, + return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); + + if (ucs_unlikely((flags != 0) && !(flags & UCP_AM_SEND_REPLY))) { + return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); + } + + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); + + if (ucs_likely(!(flags & UCP_AM_SEND_REPLY)) && + (ucs_likely(UCP_DT_IS_CONTIG(datatype)))) { + length = ucp_contig_dt_length(datatype, count); + + if (ucs_likely((ssize_t)length <= ucp_ep_config(ep)->am.max_short)) { + status = ucp_am_send_short(ep, id, payload, length); + if (ucs_likely(status != UCS_ERR_NO_RESOURCE)) { + UCP_EP_STAT_TAG_OP(ep, EAGER); + ret = UCS_STATUS_PTR(status); + goto out; + } + } + } + + req = ucp_request_get(ep->worker); + if (ucs_unlikely(req == NULL)) { + ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + goto out; + } + + ucp_am_send_req_init(req, ep, payload, datatype, count, flags, id); + status = ucp_ep_resolve_dest_ep_ptr(ep, ep->am_lane); + if (ucs_unlikely(status != UCS_OK)) { + ret = UCS_STATUS_PTR(status); + goto out; + } + + if (flags & UCP_AM_SEND_REPLY) { + ret = ucp_am_send_req(req, count, &ucp_ep_config(ep)->am, cb, + ucp_ep_config(ep)->am_u.reply_proto); + } else { + ret = ucp_am_send_req(req, count, &ucp_ep_config(ep)->am, cb, + ucp_ep_config(ep)->am_u.proto); + } + 
+out: + UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); + return ret; +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_am_handler_common(ucp_worker_h worker, void *hdr_end, size_t hdr_size, + size_t total_length, ucp_ep_h reply_ep, uint16_t am_id, + unsigned am_flags) +{ + ucp_recv_desc_t *desc = NULL; + ucs_status_t status; + unsigned flags; + + if (ucs_unlikely(!ucp_am_recv_check_id(worker, am_id))) { + return UCS_OK; + } + + if (ucs_unlikely(am_flags & UCT_CB_PARAM_FLAG_DESC)) { + flags = UCP_CB_PARAM_FLAG_DATA; + } else { + flags = 0; + } + + status = worker->am.cbs[am_id].cb(worker->am.cbs[am_id].context, + hdr_end, total_length - hdr_size, + reply_ep, flags); + if (status != UCS_INPROGRESS) { + return UCS_OK; /* we do not need UCT desc, just return UCS_OK */ + } + + if (ucs_unlikely(!(flags & UCP_CB_PARAM_FLAG_DATA))) { + ucs_error("can't hold data, UCP_CB_PARAM_FLAG_DATA flag is not set"); + return UCS_OK; + } + + ucs_assert(am_flags & UCT_CB_PARAM_FLAG_DESC); + status = ucp_recv_desc_init(worker, hdr_end, total_length, 0, + UCT_CB_PARAM_FLAG_DESC, /* pass as a const */ + 0, 0, -hdr_size, &desc); + if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) { + ucs_error("worker %p could not allocate descriptor for active" + " message on callback : %u", worker, am_id); + return UCS_OK; + } + ucs_assert(desc != NULL); + + return UCS_INPROGRESS; +} + +static ucs_status_t +ucp_am_handler_reply(void *am_arg, void *am_data, size_t am_length, + unsigned am_flags) +{ + ucp_am_reply_hdr_t *hdr = (ucp_am_reply_hdr_t *)am_data; + ucp_worker_h worker = (ucp_worker_h)am_arg; + uint16_t am_id = hdr->super.am_id; + ucp_ep_h reply_ep; + + reply_ep = ucp_worker_get_ep_by_ptr(worker, hdr->ep_ptr); + + return ucp_am_handler_common(worker, hdr + 1, sizeof(*hdr), am_length, + reply_ep, am_id, am_flags); +} + +static ucs_status_t +ucp_am_handler(void *am_arg, void *am_data, size_t am_length, + unsigned am_flags) +{ + ucp_worker_h worker = (ucp_worker_h)am_arg; + ucp_am_hdr_t *hdr = 
(ucp_am_hdr_t *)am_data; + uint16_t am_id = hdr->am_id; + + return ucp_am_handler_common(worker, hdr + 1, sizeof(*hdr), am_length, + NULL, am_id, am_flags); +} + +static UCS_F_ALWAYS_INLINE ucp_recv_desc_t* +ucp_am_find_first_rdesc(ucp_worker_h worker, ucp_ep_ext_proto_t *ep_ext, + uint64_t msg_id) +{ + ucp_recv_desc_t *rdesc; + ucp_am_first_hdr_t *first_hdr; + + ucs_list_for_each(rdesc, &ep_ext->am.started_ams, am_first.list) { + first_hdr = (ucp_am_first_hdr_t*)(rdesc + 1); + if (first_hdr->msg_id == msg_id) { + return rdesc; + } + } + + return NULL; +} + +static UCS_F_ALWAYS_INLINE void +ucp_am_copy_data_fragment(ucp_recv_desc_t *first_rdesc, void *data, + size_t length, size_t offset) +{ + memcpy(UCS_PTR_BYTE_OFFSET(first_rdesc + 1, offset), data, length); + first_rdesc->am_first.remaining -= length; +} + +static UCS_F_ALWAYS_INLINE void +ucp_am_handle_unfinished(ucp_worker_h worker, ucp_recv_desc_t *first_rdesc, + void *data, size_t length, size_t offset) +{ + uint16_t am_id; + ucs_status_t status; + ucp_am_first_hdr_t *first_hdr; + void *msg; + ucp_ep_h reply_ep; + + ucp_am_copy_data_fragment(first_rdesc, data, length, offset); + + if (first_rdesc->am_first.remaining > 0) { + /* not all fragments arrived yet */ + return; + } + + first_hdr = (ucp_am_first_hdr_t*)(first_rdesc + 1); + am_id = first_hdr->super.super.am_id; + msg = first_hdr + 1; + + /* message assembled, remove first fragment descriptor from the list in + * ep AM extension */ + ucs_list_del(&first_rdesc->am_first.list); + + if (ucs_unlikely(!ucp_am_recv_check_id(worker, am_id))) { + goto out_free_data; + } + + reply_ep = (first_hdr->super.super.flags & UCP_AM_SEND_REPLY) ? 
+ ucp_worker_get_ep_by_ptr(worker, first_hdr->super.ep_ptr) : NULL; + + status = worker->am.cbs[am_id].cb(worker->am.cbs[am_id].context, msg, + first_hdr->total_size, reply_ep, + UCP_CB_PARAM_FLAG_DATA); + if (status != UCS_INPROGRESS) { + goto out_free_data; + } + + /* Need to reinit descriptor, because we passed data shifted by + * ucp_am_first_hdr_t size to the cb. In ucp_am_data_release function, + * we calculate desc as "data_pointer - sizeof(desc)", which would not point + * to the beginning of the original desc. + * original desc layout: |desc|first_hdr|data| + * new desc layout: |desc|data| (first header is not needed + * anymore, can overwrite) + */ + first_rdesc = (ucp_recv_desc_t*)msg - 1; + first_rdesc->flags = UCP_RECV_DESC_FLAG_MALLOC; + + return; + +out_free_data: + /* user does not need to hold this data */ + ucs_free(first_rdesc); + return; +} + +static ucs_status_t ucp_am_long_first_handler(void *am_arg, void *am_data, + size_t am_length, unsigned am_flags) +{ + ucp_worker_h worker = am_arg; + ucp_am_first_hdr_t *first_hdr = am_data; + ucp_ep_h ep = ucp_worker_get_ep_by_ptr(worker, + first_hdr->super.ep_ptr); + ucp_ep_ext_proto_t *ep_ext = ucp_ep_ext_proto(ep); + uint16_t am_id = first_hdr->super.super.am_id; + ucp_recv_desc_t *mid_rdesc, *first_rdesc; + ucp_ep_h reply_ep; + ucp_am_mid_hdr_t *mid_hdr; + ucs_queue_iter_t iter; + size_t remaining; + + remaining = first_hdr->total_size - (am_length - sizeof(*first_hdr)); + + if (ucs_unlikely(remaining == 0)) { + /* Can be a single fragment if send was issued on stub ep */ + reply_ep = (first_hdr->super.super.flags & UCP_AM_SEND_REPLY) ? 
ep : NULL; + return ucp_am_handler_common(worker, first_hdr + 1, sizeof(*first_hdr), + am_length, reply_ep, am_id, am_flags); + } + + /* This is the first fragment, other fragments (if arrived) should be on + * ep_ext->am.mid_rdesc_q queue */ + ucs_assert(NULL == ucp_am_find_first_rdesc(worker, ep_ext, + first_hdr->msg_id)); + + /* Alloc buffer for the data and its desc, as we know total_size. + * Need to allocate a separate rdesc which would be in one contigious chunk + * with data buffer. */ + first_rdesc = ucs_malloc(first_hdr->total_size + sizeof(ucp_recv_desc_t) + + sizeof(*first_hdr), + "ucp recv desc for long AM"); + if (ucs_unlikely(first_rdesc == NULL)) { + ucs_error("failed to allocate buffer for assembling UCP AM (id %u)", + am_id); + return UCS_OK; /* release UCT desc */ + } + + first_rdesc->am_first.remaining = first_hdr->total_size + sizeof(*first_hdr); + + /* Copy all already arrived middle fragments to the data buffer */ + ucs_queue_for_each_safe(mid_rdesc, iter, &ep_ext->am.mid_rdesc_q, + am_mid_queue) { + mid_hdr = (ucp_am_mid_hdr_t*)(mid_rdesc + 1); + if (mid_hdr->msg_id != first_hdr->msg_id) { + continue; + } + ucs_queue_del_iter(&ep_ext->am.mid_rdesc_q, iter); + ucp_am_copy_data_fragment(first_rdesc, mid_hdr + 1, + mid_rdesc->length - sizeof(*mid_hdr), + mid_hdr->offset + sizeof(*first_hdr)); + ucp_recv_desc_release(mid_rdesc); + } + + ucs_list_add_tail(&ep_ext->am.started_ams, &first_rdesc->am_first.list); + + /* Note: copy first chunk of data together with header, which contains + * data needed to process other fragments. 
*/ + ucp_am_handle_unfinished(worker, first_rdesc, first_hdr, am_length, 0); + + return UCS_OK; /* release UCT desc */ +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_am_long_middle_handler(void *am_arg, void *am_data, size_t am_length, + unsigned am_flags) +{ + ucp_worker_h worker = am_arg; + ucp_am_mid_hdr_t *mid_hdr = am_data; + ucp_ep_h ep = ucp_worker_get_ep_by_ptr(worker, + mid_hdr->ep_ptr); + ucp_ep_ext_proto_t *ep_ext = ucp_ep_ext_proto(ep); + uint64_t msg_id = mid_hdr->msg_id; + ucp_recv_desc_t *mid_rdesc = NULL, *first_rdesc = NULL; + ucs_status_t status; + + first_rdesc = ucp_am_find_first_rdesc(worker, ep_ext, msg_id); + if (first_rdesc != NULL) { + /* First fragment already arrived, just copy the data */ + ucp_am_handle_unfinished(worker, first_rdesc, mid_hdr + 1, + am_length - sizeof(*mid_hdr), + mid_hdr->offset + sizeof(ucp_am_first_hdr_t)); + return UCS_OK; /* data is copied, release UCT desc */ + } + + /* Init desc and put it on the queue in ep AM extension, because data + * buffer is not allocated yet. When first fragment arrives (carrying total + * data size), all middle fragments will be copied to the data buffer. 
*/ + status = ucp_recv_desc_init(worker, am_data, am_length, 0, am_flags, + sizeof(*mid_hdr), 0, 0, &mid_rdesc); + if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) { + ucs_error("worker %p could not allocate desc for assembling AM", + worker); + return UCS_OK; /* release UCT desc */ + } + + ucs_assert(mid_rdesc != NULL); + ucs_queue_push(&ep_ext->am.mid_rdesc_q, &mid_rdesc->am_mid_queue); + + return status; +} + +UCP_DEFINE_AM(UCP_FEATURE_AM, UCP_AM_ID_SINGLE, + ucp_am_handler, NULL, 0); +UCP_DEFINE_AM(UCP_FEATURE_AM, UCP_AM_ID_FIRST, + ucp_am_long_first_handler, NULL, 0); +UCP_DEFINE_AM(UCP_FEATURE_AM, UCP_AM_ID_MIDDLE, + ucp_am_long_middle_handler, NULL, 0); +UCP_DEFINE_AM(UCP_FEATURE_AM, UCP_AM_ID_SINGLE_REPLY, + ucp_am_handler_reply, NULL, 0); + +const ucp_request_send_proto_t ucp_am_proto = { + .contig_short = ucp_am_contig_short, + .bcopy_single = ucp_am_bcopy_single, + .bcopy_multi = ucp_am_bcopy_multi, + .zcopy_single = ucp_am_zcopy_single, + .zcopy_multi = ucp_am_zcopy_multi, + .zcopy_completion = ucp_proto_am_zcopy_completion, + .only_hdr_size = sizeof(ucp_am_hdr_t) +}; + +const ucp_request_send_proto_t ucp_am_reply_proto = { + .contig_short = NULL, + .bcopy_single = ucp_am_bcopy_single_reply, + .bcopy_multi = ucp_am_bcopy_multi, + .zcopy_single = ucp_am_zcopy_single_reply, + .zcopy_multi = ucp_am_zcopy_multi, + .zcopy_completion = ucp_proto_am_zcopy_completion, + .only_hdr_size = sizeof(ucp_am_reply_hdr_t) +}; diff --git a/src/ucp/core/ucp_am.h b/src/ucp/core/ucp_am.h new file mode 100644 index 00000000000..6a7549700dc --- /dev/null +++ b/src/ucp/core/ucp_am.h @@ -0,0 +1,79 @@ +/** + * Copyright (C) Los Alamos National Security, LLC. 2019 ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifndef UCP_AM_H_ +#define UCP_AM_H_ + +#include "ucp_ep.h" + + +#define UCP_AM_CB_BLOCK_SIZE 16 + + +/** + * Data that is stored about each callback registered with a worker + */ +typedef struct ucp_am_entry { + ucp_am_callback_t cb; /* user defined callback*/ + void *context; /* user defined callback argument */ + unsigned flags; /* flags affecting callback behavior */ +} ucp_am_entry_t; + + +typedef struct ucp_am_context { + ucp_am_entry_t *cbs; /* array of callbacks and their data */ + size_t cbs_array_len; /* len of callbacks array */ +} ucp_am_context_t; + + +typedef union { + struct { + uint16_t am_id; /* index into callback array */ + uint16_t flags; /* operation flags */ + uint32_t padding; + }; + + uint64_t u64; /* this is used to ensure the size of + the header is 64 bytes and aligned */ +} UCS_S_PACKED ucp_am_hdr_t; + + +typedef struct { + ucp_am_hdr_t super; + uintptr_t ep_ptr; /* ep which can be used for reply */ +} UCS_S_PACKED ucp_am_reply_hdr_t; + + +typedef struct { + ucp_am_reply_hdr_t super; + uint64_t msg_id; /* method to match parts of the same AM */ + size_t total_size; /* length of buffer needed for all data */ +} UCS_S_PACKED ucp_am_first_hdr_t; + + +typedef struct { + uint64_t msg_id; /* method to match parts of the same AM */ + size_t offset; /* offset in the entire AM buffer */ + uintptr_t ep_ptr; /* ep which can be used for reply */ +} UCS_S_PACKED ucp_am_mid_hdr_t; + + +typedef struct { + ucs_list_link_t list; /* entry into list of unfinished AM's */ + size_t remaining; /* how many bytes left to receive */ +} ucp_am_first_desc_t; + + +ucs_status_t ucp_am_init(ucp_worker_h worker); + +void ucp_am_cleanup(ucp_worker_h worker); + +void ucp_am_ep_init(ucp_ep_h ep); + +void ucp_am_ep_cleanup(ucp_ep_h ep); + +#endif diff --git a/src/ucp/core/ucp_am.inl b/src/ucp/core/ucp_am.inl new file mode 100644 index 00000000000..d9604d6a5dd --- /dev/null +++ b/src/ucp/core/ucp_am.inl @@ -0,0 +1,19 @@ +/** +* Copyright (C) Los Alamos National 
Security, LLC. 2019 ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#include + +static UCS_F_ALWAYS_INLINE ssize_t +ucp_am_get_short_max(const ucp_request_t *req, + const ucp_ep_msg_config_t *msg_config) +{ + return (!UCP_DT_IS_CONTIG(req->send.datatype) || + (req->flags & UCP_REQUEST_FLAG_SYNC) || + (!UCP_MEM_IS_ACCESSIBLE_FROM_CPU(req->send.mem_type))) || + ((req->flags & UCP_REQUEST_FLAG_SEND_AM) && + (req->send.msg_proto.am.flags & UCP_AM_SEND_REPLY)) ? + -1 : msg_config->max_short; +} diff --git a/src/ucp/core/ucp_context.c b/src/ucp/core/ucp_context.c index 930eca5b374..5f64f2ed73e 100644 --- a/src/ucp/core/ucp_context.c +++ b/src/ucp/core/ucp_context.c @@ -1,30 +1,34 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016. ALL RIGHTS RESERVED. - * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. + * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ucp_context.h" #include "ucp_request.h" -#include #include #include #include #include +#include #include #include #include #include -#include #include #define UCP_RSC_CONFIG_ALL "all" -ucp_am_handler_t ucp_am_handlers[UCP_AM_ID_MAX] = {{0}}; +ucp_am_handler_t ucp_am_handlers[UCP_AM_ID_MAX] = {{0, NULL, NULL}}; static const char *ucp_atomic_modes[] = { [UCP_ATOMIC_MODE_CPU] = "cpu", @@ -47,13 +51,6 @@ static const char * ucp_rndv_modes[] = { [UCP_RNDV_MODE_LAST] = NULL, }; -uct_memory_type_t ucm_to_uct_mem_type_map[] = { - [UCM_MEM_TYPE_CUDA] = UCT_MD_MEM_TYPE_CUDA, - [UCM_MEM_TYPE_CUDA_MANAGED] = UCT_MD_MEM_TYPE_CUDA_MANAGED, - [UCM_MEM_TYPE_ROCM] = UCT_MD_MEM_TYPE_ROCM, - [UCM_MEM_TYPE_ROCM_MANAGED] = UCT_MD_MEM_TYPE_ROCM_MANAGED, -}; - static ucs_config_field_t ucp_config_table[] = { {"NET_DEVICES", UCP_RSC_CONFIG_ALL, "Specifies which network device(s) to use. The order is not meaningful.\n" @@ -77,16 +74,21 @@ static ucs_config_field_t ucp_config_table[] = { {"TLS", UCP_RSC_CONFIG_ALL, "Comma-separated list of transports to use. 
The order is not meaningful.\n" - " - all : use all the available transports.\n" - " - sm/shm : all shared memory transports.\n" - " - mm : shared memory transports - only memory mappers.\n" - " - ugni : ugni_rdma and ugni_udt.\n" - " - ib : all infiniband transports.\n" - " - rc : rc verbs (uses ud for bootstrap).\n" - " - rc_x : rc with accelerated verbs (uses ud_x for bootstrap).\n" - " - ud : ud verbs.\n" - " - ud_x : ud with accelerated verbs.\n" - " - dc_x : dc with accelerated verbs.\n" + " - all : use all the available transports.\n" + " - sm/shm : all shared memory transports (mm, cma, knem).\n" + " - mm : shared memory transports - only memory mappers.\n" + " - ugni : ugni_smsg and ugni_rdma (uses ugni_udt for bootstrap).\n" + " - ib : all infiniband transports (rc/rc_mlx5, ud/ud_mlx5, dc_mlx5).\n" + " - rc_v : rc verbs (uses ud for bootstrap).\n" + " - rc_x : rc with accelerated verbs (uses ud_mlx5 for bootstrap).\n" + " - rc : rc_v and rc_x (preferably if available).\n" + " - ud_v : ud verbs.\n" + " - ud_x : ud with accelerated verbs.\n" + " - ud : ud_v and ud_x (preferably if available).\n" + " - dc/dc_x : dc with accelerated verbs.\n" + " - tcp : sockets over TCP/IP.\n" + " - cuda : CUDA (NVIDIA GPU) memory support.\n" + " - rocm : ROCm (AMD GPU) memory support.\n" " Using a \\ prefix before a transport name treats it as an explicit transport name\n" " and disables aliasing.\n", ucs_offsetof(ucp_config_t, tls), UCS_CONFIG_TYPE_STRING_ARRAY}, @@ -94,11 +96,16 @@ static ucs_config_field_t ucp_config_table[] = { {"ALLOC_PRIO", "md:sysv,md:posix,huge,thp,md:*,mmap,heap", "Priority of memory allocation methods. Each item in the list can be either\n" "an allocation method (huge, thp, mmap, libc) or md: which means to use the\n" - "specified memory domain for allocation. NAME can be either a MD component\n" - "name, or a wildcard - '*' - which expands to all MD components.", + "specified memory domain for allocation. 
NAME can be either a UCT component\n" + "name, or a wildcard - '*' - which is equivalent to all UCT components.", ucs_offsetof(ucp_config_t, alloc_prio), UCS_CONFIG_TYPE_STRING_ARRAY}, - {"SOCKADDR_AUX_TLS", "ud,ud_x", + {"SOCKADDR_TLS_PRIORITY", "rdmacm,sockcm", + "Priority of sockaddr transports for client/server connection establishment.\n" + "The '*' wildcard expands to all the available sockaddr transports.", + ucs_offsetof(ucp_config_t, sockaddr_cm_tls), UCS_CONFIG_TYPE_STRING_ARRAY}, + + {"SOCKADDR_AUX_TLS", "ud", "Transports to use for exchanging additional address information while\n" "establishing client/server connection. ", ucs_offsetof(ucp_config_t, sockaddr_aux_tls), UCS_CONFIG_TYPE_STRING_ARRAY}, @@ -130,6 +137,11 @@ static ucs_config_field_t ucp_config_table[] = { "the eager_zcopy protocol", ucs_offsetof(ucp_config_t, ctx.rndv_perf_diff), UCS_CONFIG_TYPE_DOUBLE}, + {"MULTI_LANE_MAX_RATIO", "10", + "Maximal allowed ratio between slowest and fastest lane in a multi-lane " + "protocol. 
Lanes slower than the specified ratio will not be used.", + ucs_offsetof(ucp_config_t, ctx.multi_lane_max_ratio), UCS_CONFIG_TYPE_DOUBLE}, + {"MAX_EAGER_LANES", NULL, "", ucs_offsetof(ucp_config_t, ctx.max_eager_lanes), UCS_CONFIG_TYPE_UINT}, @@ -151,13 +163,17 @@ static ucs_config_field_t ucp_config_table[] = { " auto - runtime automatically chooses optimal scheme to use.\n", ucs_offsetof(ucp_config_t, ctx.rndv_mode), UCS_CONFIG_TYPE_ENUM(ucp_rndv_modes)}, + {"RKEY_PTR_SEG_SIZE", "512k", + "Segment size that is used to perform data transfer when doing RKEY PTR progress", + ucs_offsetof(ucp_config_t, ctx.rkey_ptr_seg_size), UCS_CONFIG_TYPE_MEMUNITS}, + {"ZCOPY_THRESH", "auto", "Threshold for switching from buffer copy to zero copy protocol", ucs_offsetof(ucp_config_t, ctx.zcopy_thresh), UCS_CONFIG_TYPE_MEMUNITS}, - {"BCOPY_BW", "5800mb", + {"BCOPY_BW", "auto", "Estimation of buffer copy bandwidth", - ucs_offsetof(ucp_config_t, ctx.bcopy_bw), UCS_CONFIG_TYPE_MEMUNITS}, + ucs_offsetof(ucp_config_t, ctx.bcopy_bw), UCS_CONFIG_TYPE_BW}, {"ATOMIC_MODE", "guess", "Atomic operations synchronization mode.\n" @@ -170,14 +186,18 @@ static ucs_config_field_t ucp_config_table[] = { " Otherwise the CPU mode is selected.", ucs_offsetof(ucp_config_t, ctx.atomic_mode), UCS_CONFIG_TYPE_ENUM(ucp_atomic_modes)}, - {"MAX_WORKER_NAME", UCS_PP_MAKE_STRING(UCP_WORKER_NAME_MAX), - "Maximal length of worker name. " + {"ADDRESS_DEBUG_INFO", #if ENABLE_DEBUG_DATA - "Sent to remote peer as part of worker address." + "y", #else - "Not sent to remote peer per build configuration." + "n", #endif - , + "Add debugging information to worker address.", + ucs_offsetof(ucp_config_t, ctx.address_debug_info), UCS_CONFIG_TYPE_BOOL}, + + {"MAX_WORKER_NAME", UCS_PP_MAKE_STRING(UCP_WORKER_NAME_MAX), + "Maximal length of worker name. 
Sent to remote peer as part of worker address\n" + "if UCX_ADDRESS_DEBUG_INFO is set to 'yes'", ucs_offsetof(ucp_config_t, ctx.max_worker_name), UCS_CONFIG_TYPE_UINT}, {"USE_MT_MUTEX", "n", "Use mutex for multithreading support in UCP.\n" @@ -214,6 +234,11 @@ static ucs_config_field_t ucp_config_table[] = { "cases (non-contig buffer, or sender wildcard).", ucs_offsetof(ucp_config_t, ctx.tm_force_thresh), UCS_CONFIG_TYPE_MEMUNITS}, + {"TM_SW_RNDV", "n", + "Use software rendezvous protocol with tag offload. If enabled, tag offload\n" + "mode will be used for messages sent with eager protocol only.", + ucs_offsetof(ucp_config_t, ctx.tm_sw_rndv), UCS_CONFIG_TYPE_BOOL}, + {"NUM_EPS", "auto", "An optimization hint of how many endpoints would be created on this context.\n" "Does not affect semantics, but only transport selection criteria and the\n" @@ -222,12 +247,22 @@ static ucs_config_field_t ucp_config_table[] = { "to ucp_init()", ucs_offsetof(ucp_config_t, ctx.estimated_num_eps), UCS_CONFIG_TYPE_ULUNITS}, - {"RNDV_FRAG_SIZE", "256k", + {"NUM_PPN", "auto", + "An optimization hint for the number of processes expected to be launched\n" + "on a single node. 
Does not affect semantics, only transport selection criteria\n" + "and the resulting performance.\n", + ucs_offsetof(ucp_config_t, ctx.estimated_num_ppn), UCS_CONFIG_TYPE_ULUNITS}, + + {"RNDV_FRAG_SIZE", "512k", "RNDV fragment size \n", ucs_offsetof(ucp_config_t, ctx.rndv_frag_size), UCS_CONFIG_TYPE_MEMUNITS}, + {"RNDV_PIPELINE_SEND_THRESH", "inf", + "RNDV size threshold to enable sender side pipeline for mem type\n", + ucs_offsetof(ucp_config_t, ctx.rndv_pipeline_send_thresh), UCS_CONFIG_TYPE_MEMUNITS}, + {"MEMTYPE_CACHE", "y", - "Enable memory type(cuda) cache \n", + "Enable memory type (cuda/rocm) cache \n", ucs_offsetof(ucp_config_t, ctx.enable_memtype_cache), UCS_CONFIG_TYPE_BOOL}, {"FLUSH_WORKER_EPS", "y", @@ -242,29 +277,60 @@ static ucs_config_field_t ucp_config_table[] = { "of all entities which connect to each other are the same.", ucs_offsetof(ucp_config_t, ctx.unified_mode), UCS_CONFIG_TYPE_BOOL}, + {"SOCKADDR_CM_ENABLE", "n" /* TODO: set try by default */, + "Enable alternative wireup protocol for sockaddr connected endpoints.\n" + "Enabling this mode changes underlying UCT mechanism for connection\n" + "establishment and enables synchronized close protocol which does not\n" + "require out of band synchronization before destroying UCP resources.", + ucs_offsetof(ucp_config_t, ctx.sockaddr_cm_enable), UCS_CONFIG_TYPE_TERNARY}, + + {"PROTO_ENABLE", "n", + "Experimental: enable new protocol selection logic", + ucs_offsetof(ucp_config_t, ctx.proto_enable), UCS_CONFIG_TYPE_BOOL}, + {NULL} }; UCS_CONFIG_REGISTER_TABLE(ucp_config_table, "UCP context", NULL, ucp_config_t) static ucp_tl_alias_t ucp_tl_aliases[] = { - { "sm", { "mm", "knem", "cma", "rdmacm", NULL } }, - { "shm", { "mm", "knem", "cma", "rdmacm", NULL } }, - { "ib", { "rc", "ud", "rc_mlx5", "ud_mlx5", "dc_mlx5", "rdmacm", NULL } }, - { "ud", { "ud", "rdmacm", NULL } }, + { "mm", { "posix", "sysv", "xpmem" } }, /* for backward compatibility */ + { "sm", { "posix", "sysv", "xpmem", "knem", "cma", 
"rdmacm", "sockcm", NULL } }, + { "shm", { "posix", "sysv", "xpmem", "knem", "cma", "rdmacm", "sockcm", NULL } }, + { "ib", { "rc_verbs", "ud_verbs", "rc_mlx5", "ud_mlx5", "dc_mlx5", "rdmacm", NULL } }, + { "ud_v", { "ud_verbs", "rdmacm", NULL } }, { "ud_x", { "ud_mlx5", "rdmacm", NULL } }, - { "rc", { "rc", "ud:aux", "rdmacm", NULL } }, + { "ud", { "ud_mlx5", "ud_verbs", "rdmacm", NULL } }, + { "rc_v", { "rc_verbs", "ud_verbs:aux", "rdmacm", NULL } }, { "rc_x", { "rc_mlx5", "ud_mlx5:aux", "rdmacm", NULL } }, + { "rc", { "rc_mlx5", "ud_mlx5:aux", "rc_verbs", "ud_verbs:aux", "rdmacm", NULL } }, { "dc", { "dc_mlx5", "rdmacm", NULL } }, { "dc_x", { "dc_mlx5", "rdmacm", NULL } }, { "ugni", { "ugni_smsg", "ugni_udt:aux", "ugni_rdma", NULL } }, + { "cuda", { "cuda_copy", "cuda_ipc", "gdr_copy", NULL } }, + { "rocm", { "rocm_copy", "rocm_ipc", "rocm_gdr", NULL } }, { NULL } }; +const char *ucp_feature_str[] = { + [ucs_ilog2(UCP_FEATURE_TAG)] = "UCP_FEATURE_TAG", + [ucs_ilog2(UCP_FEATURE_RMA)] = "UCP_FEATURE_RMA", + [ucs_ilog2(UCP_FEATURE_AMO32)] = "UCP_FEATURE_AMO32", + [ucs_ilog2(UCP_FEATURE_AMO64)] = "UCP_FEATURE_AMO64", + [ucs_ilog2(UCP_FEATURE_WAKEUP)] = "UCP_FEATURE_WAKEUP", + [ucs_ilog2(UCP_FEATURE_STREAM)] = "UCP_FEATURE_STREAM", + [ucs_ilog2(UCP_FEATURE_AM)] = "UCP_FEATURE_AM", + [ucs_ilog2(UCP_FEATURE_GROUPS)] = "UCP_FEATURE_GROUPS", + NULL +}; + + ucs_status_t ucp_config_read(const char *env_prefix, const char *filename, ucp_config_t **config_p) { + unsigned full_prefix_len = sizeof(UCS_DEFAULT_ENV_PREFIX) + 1; + unsigned env_prefix_len = 0; ucp_config_t *config; ucs_status_t status; @@ -274,16 +340,37 @@ ucs_status_t ucp_config_read(const char *env_prefix, const char *filename, goto err; } - status = ucs_config_parser_fill_opts(config, ucp_config_table, env_prefix, - NULL, 0); + if (env_prefix != NULL) { + env_prefix_len = strlen(env_prefix); + full_prefix_len += env_prefix_len; + } + + config->env_prefix = ucs_malloc(full_prefix_len, "ucp config"); + if 
(config->env_prefix == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err_free_config; + } + + if (env_prefix_len != 0) { + ucs_snprintf_zero(config->env_prefix, full_prefix_len, "%s_%s", + env_prefix, UCS_DEFAULT_ENV_PREFIX); + } else { + ucs_snprintf_zero(config->env_prefix, full_prefix_len, "%s", + UCS_DEFAULT_ENV_PREFIX); + } + + status = ucs_config_parser_fill_opts(config, ucp_config_table, + config->env_prefix, NULL, 0); if (status != UCS_OK) { - goto err_free; + goto err_free_prefix; } *config_p = config; return UCS_OK; -err_free: +err_free_prefix: + ucs_free(config->env_prefix); +err_free_config: ucs_free(config); err: return status; @@ -292,6 +379,7 @@ ucs_status_t ucp_config_read(const char *env_prefix, const char *filename, void ucp_config_release(ucp_config_t *config) { ucs_config_parser_release_opts(config, ucp_config_table); + ucs_free(config->env_prefix); ucs_free(config); } @@ -304,8 +392,8 @@ ucs_status_t ucp_config_modify(ucp_config_t *config, const char *name, void ucp_config_print(const ucp_config_t *config, FILE *stream, const char *title, ucs_config_print_flags_t print_flags) { - ucs_config_parser_print_opts(stream, title, config, ucp_config_table, NULL, - print_flags); + ucs_config_parser_print_opts(stream, title, config, ucp_config_table, + NULL, UCS_DEFAULT_ENV_PREFIX, print_flags); } /* Search str in the array. 
If str_suffix is specified, search for @@ -440,7 +528,8 @@ static int ucp_is_resource_in_transports_list(const char *tl_name, */ alias_arr_count = ucp_tl_alias_count(alias); snprintf(info, sizeof(info), "for alias '%s'", alias->alias); - tmp_rsc_flags = 0; + dummy_mask = 0; + tmp_rsc_flags = 0; tmp_tl_cfg_mask = 0; if (ucp_config_is_tl_enabled(names, count, alias->alias, 1, &tmp_rsc_flags, &tmp_tl_cfg_mask) && @@ -483,7 +572,7 @@ static int ucp_is_resource_enabled(const uct_tl_resource_desc_t *resource, } static void ucp_add_tl_resource_if_enabled(ucp_context_h context, ucp_tl_md_t *md, - ucp_rsc_index_t md_index, + ucp_md_index_t md_index, const ucp_config_t *config, const uct_tl_resource_desc_t *resource, uint8_t rsc_flags, unsigned *num_resources_p, @@ -516,13 +605,16 @@ static void ucp_add_tl_resource_if_enabled(ucp_context_h context, ucp_tl_md_t *m } } -static ucs_status_t ucp_add_tl_resources(ucp_context_h context, ucp_tl_md_t *md, - ucp_rsc_index_t md_index, +static ucs_status_t ucp_add_tl_resources(ucp_context_h context, + ucp_md_index_t md_index, const ucp_config_t *config, unsigned *num_resources_p, + ucs_string_set_t avail_devices[], + ucs_string_set_t *avail_tls, uint64_t dev_cfg_masks[], uint64_t *tl_cfg_mask) { + ucp_tl_md_t *md = &context->tl_mds[md_index]; uct_tl_resource_desc_t *tl_resources; uct_tl_resource_desc_t sa_rsc; ucp_tl_resource_desc_t *tmp; @@ -567,6 +659,12 @@ static ucs_status_t ucp_add_tl_resources(ucp_context_h context, ucp_tl_md_t *md, /* copy only the resources enabled by user configuration */ context->tl_rscs = tmp; for (i = 0; i < num_tl_resources; ++i) { + if (!(md->attr.cap.flags & UCT_MD_FLAG_SOCKADDR)) { + ucs_string_set_addf(&avail_devices[tl_resources[i].dev_type], + "'%s'(%s)", tl_resources[i].dev_name, + context->tl_cmpts[md->cmpt_index].attr.name); + ucs_string_set_add(avail_tls, tl_resources[i].tl_name); + } ucp_add_tl_resource_if_enabled(context, md, md_index, config, &tl_resources[i], 0, num_resources_p, 
dev_cfg_masks, tl_cfg_mask); @@ -592,16 +690,55 @@ static ucs_status_t ucp_add_tl_resources(ucp_context_h context, ucp_tl_md_t *md, return status; } +static void ucp_get_aliases_set(ucs_string_set_t *avail_tls) +{ + ucp_tl_alias_t *alias; + const char **tl_name; + + for (alias = ucp_tl_aliases; alias->alias != NULL; ++alias) { + for (tl_name = alias->tls; *tl_name != NULL; ++tl_name) { + if (ucs_string_set_contains(avail_tls, *tl_name)) { + ucs_string_set_add(avail_tls, alias->alias); + break; + } + } + } +} + static void ucp_report_unavailable(const ucs_config_names_array_t* cfg, - uint64_t mask, const char *title) + uint64_t mask, const char *title1, + const char *title2, + const ucs_string_set_t *avail_names) { - int i; + ucs_string_buffer_t avail_strb, unavail_strb; + unsigned i; + int found; + ucs_string_buffer_init(&unavail_strb); + + found = 0; for (i = 0; i < cfg->count; i++) { - if (!(mask & UCS_BIT(i)) && strcmp(cfg->names[i], UCP_RSC_CONFIG_ALL)) { - ucs_warn("%s '%s' is not available", title, cfg->names[i]); + if (!(mask & UCS_BIT(i)) && strcmp(cfg->names[i], UCP_RSC_CONFIG_ALL) && + !ucs_string_set_contains(avail_names, cfg->names[i])) { + ucs_string_buffer_appendf(&unavail_strb, "%s'%s'", + found++ ? "," : "", + cfg->names[i]); } } + + if (found) { + ucs_string_buffer_init(&avail_strb); + ucs_string_set_print_sorted(avail_names, &avail_strb, ", "); + ucs_warn("%s%s%s %s %s not available, please use one or more of: %s", + title1, title2, + (found > 1) ? "s" : "", + ucs_string_buffer_cstr(&unavail_strb), + (found > 1) ? 
"are" : "is", + ucs_string_buffer_cstr(&avail_strb)); + ucs_string_buffer_cleanup(&avail_strb); + } + + ucs_string_buffer_cleanup(&unavail_strb); } const char * ucp_find_tl_name_by_csum(ucp_context_t *context, uint16_t tl_name_csum) @@ -652,52 +789,6 @@ const char* ucp_tl_bitmap_str(ucp_context_h context, uint64_t tl_bitmap, return str; } -static const char* ucp_feature_flag_str(unsigned feature_flag) -{ - switch (feature_flag) { - case UCP_FEATURE_TAG: - return "UCP_FEATURE_TAG"; - case UCP_FEATURE_RMA: - return "UCP_FEATURE_RMA"; - case UCP_FEATURE_AMO32: - return "UCP_FEATURE_AMO32"; - case UCP_FEATURE_AMO64: - return "UCP_FEATURE_AMO64"; - case UCP_FEATURE_WAKEUP: - return "UCP_FEATURE_WAKEUP"; - case UCP_FEATURE_STREAM: - return "UCP_FEATURE_STREAM"; - case UCP_FEATURE_GROUPS: - return "UCP_FEATURE_GROUPS"; - default: - ucs_fatal("Unknown feature flag value %u", feature_flag); - } -} - -const char* ucp_feature_flags_str(unsigned feature_flags, char *str, - size_t max_str_len) -{ - unsigned i, count; - char *p, *endp; - - p = str; - endp = str + max_str_len; - count = 0; - - ucs_for_each_bit(i, feature_flags) { - ucs_snprintf_zero(p, endp - p, "%s%s", (count == 0) ? 
"" : "|", - ucp_feature_flag_str(UCS_BIT(i))); - count++; - p += strlen(p); - } - - if (count == 0) { - ucs_assert(max_str_len > 0); - str[0] = '\0'; /* empty string */ - } - - return str; -} static void ucp_free_resources(ucp_context_t *context) { @@ -712,6 +803,7 @@ static void ucp_free_resources(ucp_context_t *context) uct_md_close(context->tl_mds[i].md); } ucs_free(context->tl_mds); + ucs_free(context->tl_cmpts); } static ucs_status_t ucp_check_resource_config(const ucp_config_t *config) @@ -738,23 +830,27 @@ static ucs_status_t ucp_check_resource_config(const ucp_config_t *config) return UCS_OK; } -static ucs_status_t ucp_fill_tl_md(const uct_md_resource_desc_t *md_rsc, +static ucs_status_t ucp_fill_tl_md(ucp_context_h context, + ucp_rsc_index_t cmpt_index, + const uct_md_resource_desc_t *md_rsc, ucp_tl_md_t *tl_md) { uct_md_config_t *md_config; ucs_status_t status; - /* Save MD resource */ - tl_md->rsc = *md_rsc; + /* Initialize tl_md structure */ + tl_md->cmpt_index = cmpt_index; + tl_md->rsc = *md_rsc; /* Read MD configuration */ - status = uct_md_config_read(md_rsc->md_name, NULL, NULL, &md_config); + status = uct_md_config_read(context->tl_cmpts[cmpt_index].cmpt, NULL, NULL, + &md_config); if (status != UCS_OK) { return status; } - /* Open MD */ - status = uct_md_open(md_rsc->md_name, md_config, &tl_md->md); + status = uct_md_open(context->tl_cmpts[cmpt_index].cmpt, md_rsc->md_name, + md_config, &tl_md->md); uct_config_release(md_config); if (status != UCS_OK) { return status; @@ -832,16 +928,16 @@ static void ucp_fill_sockaddr_aux_tls_config(ucp_context_h context, const ucp_config_t *config) { const char **tl_names = (const char**)config->sockaddr_aux_tls.aux_tls; - unsigned count = config->sockaddr_aux_tls.count; + unsigned count = config->sockaddr_aux_tls.count; + uint8_t dummy_flags = 0; + uint64_t dummy_mask = 0; ucp_rsc_index_t tl_id; - uint8_t dummy_flags; - uint64_t dummy_mask; context->config.sockaddr_aux_rscs_bitmap = 0; /* Check if any of the 
context's resources are present in the sockaddr * auxiliary transports for the client-server flow */ - for (tl_id = 0; tl_id < context->num_tls; ++tl_id) { + ucs_for_each_bit(tl_id, context->tl_bitmap) { if (ucp_is_resource_in_transports_list(context->tl_rscs[tl_id].tl_rsc.tl_name, tl_names, count, &dummy_flags, &dummy_mask)) { @@ -850,6 +946,109 @@ static void ucp_fill_sockaddr_aux_tls_config(ucp_context_h context, } } +static void ucp_fill_sockaddr_tls_prio_list(ucp_context_h context, + const char **sockaddr_tl_names, + ucp_rsc_index_t num_sockaddr_tls) +{ + uint64_t sa_tls_bitmap = 0; + ucp_rsc_index_t idx = 0; + ucp_tl_resource_desc_t *resource; + ucp_rsc_index_t tl_id; + ucp_tl_md_t *tl_md; + ucp_rsc_index_t j; + + /* Set a bitmap of sockaddr transports */ + for (j = 0; j < context->num_tls; ++j) { + resource = &context->tl_rscs[j]; + tl_md = &context->tl_mds[resource->md_index]; + if (tl_md->attr.cap.flags & UCT_MD_FLAG_SOCKADDR) { + sa_tls_bitmap |= UCS_BIT(j); + } + } + + /* Parse the sockaddr transports priority list */ + for (j = 0; j < num_sockaddr_tls; j++) { + /* go over the priority list and find the transport's tl_id in the + * sockaddr tls bitmap. 
save the tl_id's for the client/server usage + * later */ + ucs_for_each_bit(tl_id, sa_tls_bitmap) { + resource = &context->tl_rscs[tl_id]; + + if (!strcmp(sockaddr_tl_names[j], "*") || + !strncmp(sockaddr_tl_names[j], resource->tl_rsc.tl_name, + UCT_TL_NAME_MAX)) { + context->config.sockaddr_tl_ids[idx] = tl_id; + idx++; + sa_tls_bitmap &= ~UCS_BIT(tl_id); + } + } + } + + context->config.num_sockaddr_tls = idx; +} + +static void ucp_fill_sockaddr_cms_prio_list(ucp_context_h context, + const char **sockaddr_cm_names, + ucp_rsc_index_t num_sockaddr_cms, + int sockaddr_cm_enable) +{ + uint64_t cm_cmpts_bitmap = context->config.cm_cmpts_bitmap; + uint64_t cm_cmpts_bitmap_safe; + ucp_rsc_index_t cmpt_idx, cm_idx; + + memset(&context->config.cm_cmpt_idxs, UCP_NULL_RESOURCE, UCP_MAX_RESOURCES); + context->config.num_cm_cmpts = 0; + + if (!sockaddr_cm_enable) { + return; + } + + /* Parse the sockaddr CMs priority list */ + for (cm_idx = 0; cm_idx < num_sockaddr_cms; ++cm_idx) { + /* go over the priority list and find the CM's cm_idx in the + * sockaddr CMs bitmap. 
Save the cmpt_idx for the client/server usage + * later */ + cm_cmpts_bitmap_safe = cm_cmpts_bitmap; + ucs_for_each_bit(cmpt_idx, cm_cmpts_bitmap_safe) { + if (!strcmp(sockaddr_cm_names[cm_idx], "*") || + !strncmp(sockaddr_cm_names[cm_idx], + context->tl_cmpts[cmpt_idx].attr.name, + UCT_COMPONENT_NAME_MAX)) { + context->config.cm_cmpt_idxs[context->config.num_cm_cmpts++] = cmpt_idx; + cm_cmpts_bitmap &= ~UCS_BIT(cmpt_idx); + } + } + } +} + +static ucs_status_t ucp_fill_sockaddr_prio_list(ucp_context_h context, + const ucp_config_t *config) +{ + const char **sockaddr_tl_names = (const char**)config->sockaddr_cm_tls.cm_tls; + unsigned num_sockaddr_tls = config->sockaddr_cm_tls.count; + int sockaddr_cm_enable = context->config.ext.sockaddr_cm_enable != + UCS_NO; + + /* Check if a list of sockaddr transports/CMs has valid length */ + if (num_sockaddr_tls > UCP_MAX_RESOURCES) { + ucs_warn("sockaddr transports or connection managers list is too long, " + "only first %d entries will be used", UCP_MAX_RESOURCES); + num_sockaddr_tls = UCP_MAX_RESOURCES; + } + + ucp_fill_sockaddr_tls_prio_list(context, sockaddr_tl_names, + num_sockaddr_tls); + ucp_fill_sockaddr_cms_prio_list(context, sockaddr_tl_names, + num_sockaddr_tls, sockaddr_cm_enable); + if ((context->config.ext.sockaddr_cm_enable == UCS_YES) && + (context->config.num_cm_cmpts == 0)) { + ucs_error("UCX_SOCKADDR_CM_ENABLE is set to yes but none of the available components supports SOCKADDR_CM"); + return UCS_ERR_UNSUPPORTED; + } + + return UCS_OK; +} + static ucs_status_t ucp_check_resources(ucp_context_h context, const ucp_config_t *config) { @@ -871,13 +1070,14 @@ static ucs_status_t ucp_check_resources(ucp_context_h context, if (num_usable_tls == 0) { ucp_resource_config_str(config, info_str, sizeof(info_str)); - ucs_error("No usable transports/devices, asked %s", info_str); + ucs_error("no usable transports/devices (asked %s)", info_str); return UCS_ERR_NO_DEVICE; } /* Error check: Make sure there are not too many 
transports */ if (context->num_tls >= UCP_MAX_RESOURCES) { - ucs_error("Exceeded transports/devices limit (%u requested, up to %d are supported)", + ucs_error("exceeded transports/devices limit " + "(%u requested, up to %d are supported)", context->num_tls, UCP_MAX_RESOURCES); return UCS_ERR_EXCEEDS_LIMIT; } @@ -885,95 +1085,190 @@ static ucs_status_t ucp_check_resources(ucp_context_h context, return ucp_check_tl_names(context); } -static ucs_status_t ucp_fill_resources(ucp_context_h context, - const ucp_config_t *config) +static ucs_status_t ucp_add_component_resources(ucp_context_h context, + ucp_rsc_index_t cmpt_index, + ucs_string_set_t avail_devices[], + ucs_string_set_t *avail_tls, + uint64_t dev_cfg_masks[], + uint64_t *tl_cfg_mask, + const ucp_config_t *config) { - uint64_t dev_cfg_masks[UCT_DEVICE_TYPE_LAST] = {0}; - uint64_t tl_cfg_mask = 0; + const ucp_tl_cmpt_t *tl_cmpt = &context->tl_cmpts[cmpt_index]; + uct_component_attr_t uct_component_attr; unsigned num_tl_resources; - unsigned num_md_resources; - uct_md_resource_desc_t *md_rscs; ucs_status_t status; ucp_rsc_index_t i; unsigned md_index; uint64_t mem_type_mask; - uct_memory_type_t mem_type; + uint64_t mem_type_bitmap; - context->tl_mds = NULL; - context->num_mds = 0; - context->tl_rscs = NULL; - context->num_tls = 0; - context->num_mem_type_mds = 0; - context->memtype_cache = NULL; - - status = ucp_check_resource_config(config); - if (status != UCS_OK) { - goto err; - } /* List memory domain resources */ - status = uct_query_md_resources(&md_rscs, &num_md_resources); + uct_component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES; + uct_component_attr.md_resources = + ucs_alloca(tl_cmpt->attr.md_resource_count * + sizeof(*uct_component_attr.md_resources)); + status = uct_component_query(tl_cmpt->cmpt, &uct_component_attr); if (status != UCS_OK) { - goto err; - } - - /* Error check: Make sure there is at least one MD */ - if (num_md_resources == 0) { - ucs_error("No memory domain resources 
found"); - status = UCS_ERR_NO_DEVICE; - goto err_release_md_resources; - } - - /* Allocate actual array of MDs */ - context->tl_mds = ucs_malloc(num_md_resources * sizeof(*context->tl_mds), - "ucp_tl_mds"); - if (context->tl_mds == NULL) { - status = UCS_ERR_NO_MEMORY; - goto err_release_md_resources; + goto out; } /* Open all memory domains */ - md_index = 0; - mem_type_mask = UCS_BIT(UCT_MD_MEM_TYPE_HOST); - for (i = 0; i < num_md_resources; ++i) { - status = ucp_fill_tl_md(&md_rscs[i], &context->tl_mds[md_index]); + mem_type_mask = UCS_BIT(UCS_MEMORY_TYPE_HOST); + for (i = 0; i < tl_cmpt->attr.md_resource_count; ++i) { + md_index = context->num_mds; + status = ucp_fill_tl_md(context, cmpt_index, + &uct_component_attr.md_resources[i], + &context->tl_mds[md_index]); if (status != UCS_OK) { continue; } /* Add communication resources of each MD */ - status = ucp_add_tl_resources(context, &context->tl_mds[md_index], - md_index, config, &num_tl_resources, - dev_cfg_masks, &tl_cfg_mask); + status = ucp_add_tl_resources(context, md_index, config, + &num_tl_resources, avail_devices, + avail_tls, dev_cfg_masks, tl_cfg_mask); if (status != UCS_OK) { uct_md_close(context->tl_mds[md_index].md); - goto err_free_context_resources; + goto out; } /* If the MD does not have transport resources (device or sockaddr), * don't use it */ if (num_tl_resources > 0) { /* List of memory type MDs */ - mem_type = context->tl_mds[md_index].attr.cap.mem_type; - if (!(mem_type_mask & UCS_BIT(mem_type))) { - context->mem_type_tl_mds[context->num_mem_type_mds] = md_index; - ++context->num_mem_type_mds; - mem_type_mask |= UCS_BIT(mem_type); + mem_type_bitmap = context->tl_mds[md_index].attr.cap.detect_mem_types; + if (~mem_type_mask & mem_type_bitmap) { + context->mem_type_detect_mds[context->num_mem_type_detect_mds] = md_index; + ++context->num_mem_type_detect_mds; + mem_type_mask |= mem_type_bitmap; } - ++md_index; ++context->num_mds; } else { ucs_debug("closing md %s because it has no 
selected transport resources", - md_rscs[i].md_name); + context->tl_mds[md_index].rsc.md_name); uct_md_close(context->tl_mds[md_index].md); } } - if (context->num_mem_type_mds && context->config.ext.enable_memtype_cache) { + status = UCS_OK; +out: + return status; +} + +static ucs_status_t ucp_fill_resources(ucp_context_h context, + const ucp_config_t *config) +{ + uint64_t dev_cfg_masks[UCT_DEVICE_TYPE_LAST] = {}; + uint64_t tl_cfg_mask = 0; + ucs_string_set_t avail_devices[UCT_DEVICE_TYPE_LAST]; + ucs_string_set_t avail_tls; + uct_component_h *uct_components; + unsigned i, num_uct_components; + uct_device_type_t dev_type; + ucs_status_t status; + unsigned max_mds; + + context->tl_cmpts = NULL; + context->num_cmpts = 0; + context->tl_mds = NULL; + context->num_mds = 0; + context->tl_rscs = NULL; + context->num_tls = 0; + context->memtype_cache = NULL; + context->num_mem_type_detect_mds = 0; + + for (i = 0; i < UCS_MEMORY_TYPE_LAST; ++i) { + context->mem_type_access_tls[i] = 0; + } + + ucs_string_set_init(&avail_tls); + UCS_STATIC_ASSERT(UCT_DEVICE_TYPE_NET == 0); + for (dev_type = UCT_DEVICE_TYPE_NET; dev_type < UCT_DEVICE_TYPE_LAST; ++dev_type) { + ucs_string_set_init(&avail_devices[dev_type]); + } + + status = ucp_check_resource_config(config); + if (status != UCS_OK) { + goto out_cleanup_avail_devices; + } + + status = uct_query_components(&uct_components, &num_uct_components); + if (status != UCS_OK) { + goto out_cleanup_avail_devices; + } + + if (num_uct_components > UCP_MAX_RESOURCES) { + ucs_error("too many components: %u, max: %u", num_uct_components, + UCP_MAX_RESOURCES); + status = UCS_ERR_EXCEEDS_LIMIT; + goto out_release_components; + } + + context->num_cmpts = num_uct_components; + context->tl_cmpts = ucs_calloc(context->num_cmpts, + sizeof(*context->tl_cmpts), "ucp_tl_cmpts"); + if (context->tl_cmpts == NULL) { + status = UCS_ERR_NO_MEMORY; + goto out_release_components; + } + + context->config.cm_cmpts_bitmap = 0; + + max_mds = 0; + for (i = 0; i < 
context->num_cmpts; ++i) { + memset(&context->tl_cmpts[i].attr, 0, sizeof(context->tl_cmpts[i].attr)); + context->tl_cmpts[i].cmpt = uct_components[i]; + context->tl_cmpts[i].attr.field_mask = + UCT_COMPONENT_ATTR_FIELD_NAME | + UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT | + UCT_COMPONENT_ATTR_FIELD_FLAGS; + status = uct_component_query(context->tl_cmpts[i].cmpt, + &context->tl_cmpts[i].attr); + if (status != UCS_OK) { + goto err_free_resources; + } + + if (context->tl_cmpts[i].attr.flags & UCT_COMPONENT_FLAG_CM) { + context->config.cm_cmpts_bitmap |= UCS_BIT(i); + } + + max_mds += context->tl_cmpts[i].attr.md_resource_count; + } + + if ((context->config.ext.sockaddr_cm_enable == UCS_YES) && + (context->config.cm_cmpts_bitmap == 0)) { + ucs_error("there are no UCT components with CM capability"); + status = UCS_ERR_UNSUPPORTED; + goto err_free_resources; + } + + /* Allocate actual array of MDs */ + context->tl_mds = ucs_malloc(max_mds * sizeof(*context->tl_mds), + "ucp_tl_mds"); + if (context->tl_mds == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err_free_resources; + } + + /* Collect resources of each component */ + for (i = 0; i < context->num_cmpts; ++i) { + status = ucp_add_component_resources(context, i, avail_devices, + &avail_tls, dev_cfg_masks, + &tl_cfg_mask, config); + if (status != UCS_OK) { + goto err_free_resources; + } + } + + /* Create memtype cache if we have memory type MDs, and it's enabled by + * configuration + */ + if (context->num_mem_type_detect_mds && context->config.ext.enable_memtype_cache) { status = ucs_memtype_cache_create(&context->memtype_cache); if (status != UCS_OK) { ucs_debug("could not create memtype cache for mem_type allocations"); - goto err_free_context_resources; + goto err_free_resources; } } @@ -983,33 +1278,47 @@ static ucs_status_t ucp_fill_resources(ucp_context_h context, */ context->tl_bitmap = config->ctx.unified_mode ? 
0 : UCS_MASK(context->num_tls); + /* Warn about devices and transports which were specified explicitly in the + * configuration, but are not available + */ + if (config->warn_invalid_config) { + UCS_STATIC_ASSERT(UCT_DEVICE_TYPE_NET == 0); + for (dev_type = UCT_DEVICE_TYPE_NET; dev_type < UCT_DEVICE_TYPE_LAST; ++dev_type) { + ucp_report_unavailable(&config->devices[dev_type], + dev_cfg_masks[dev_type], + ucp_device_type_names[dev_type], " device", + &avail_devices[dev_type]); + } + + ucp_get_aliases_set(&avail_tls); + ucp_report_unavailable(&config->tls, tl_cfg_mask, "", "transport", + &avail_tls); + } + /* Validate context resources */ status = ucp_check_resources(context, config); if (status != UCS_OK) { - goto err_free_context_resources; + goto err_free_resources; } - uct_release_md_resource_list(md_rscs); - - if (config->warn_invalid_config) { - /* Notify the user if there are devices or transports from the command line - * that are not available - */ - for (i = 0; i < UCT_DEVICE_TYPE_LAST; ++i) { - ucp_report_unavailable(&config->devices[i], dev_cfg_masks[i], "device"); - } - ucp_report_unavailable(&config->tls, tl_cfg_mask, "transport"); + ucp_fill_sockaddr_aux_tls_config(context, config); + status = ucp_fill_sockaddr_prio_list(context, config); + if (status != UCS_OK) { + goto err_free_resources; } - ucp_fill_sockaddr_aux_tls_config(context, config); + goto out_release_components; - return UCS_OK; - -err_free_context_resources: +err_free_resources: ucp_free_resources(context); -err_release_md_resources: - uct_release_md_resource_list(md_rscs); -err: +out_release_components: + uct_release_component_list(uct_components); +out_cleanup_avail_devices: + UCS_STATIC_ASSERT(UCT_DEVICE_TYPE_NET == 0); + for (dev_type = UCT_DEVICE_TYPE_NET; dev_type < UCT_DEVICE_TYPE_LAST; ++dev_type) { + ucs_string_set_cleanup(&avail_devices[dev_type]); + } + ucs_string_set_cleanup(&avail_tls); return status; } @@ -1055,6 +1364,12 @@ static void ucp_apply_params(ucp_context_h 
context, const ucp_params_t *params, context->config.est_num_eps = 1; } + if (params->field_mask & UCP_PARAM_FIELD_ESTIMATED_NUM_PPN) { + context->config.est_num_ppn = params->estimated_num_ppn; + } else { + context->config.est_num_ppn = 1; + } + if ((params->field_mask & UCP_PARAM_FIELD_MT_WORKERS_SHARED) && params->mt_workers_shared) { context->mt_lock.mt_type = mt_type; @@ -1074,25 +1389,47 @@ static ucs_status_t ucp_fill_config(ucp_context_h context, ucp_apply_params(context, params, config->ctx.use_mt_mutex ? UCP_MT_TYPE_MUTEX : UCP_MT_TYPE_SPINLOCK); + context->config.ext = config->ctx; - if (context->config.ext.estimated_num_eps != UCS_CONFIG_ULUNITS_AUTO) { - /* num_eps were set via the env variable. Override current value */ + if (context->config.ext.estimated_num_eps != UCS_ULUNITS_AUTO) { + /* num_eps was set via the env variable. Override current value */ context->config.est_num_eps = context->config.ext.estimated_num_eps; } - ucs_debug("Estimated number of endpoints is %d", + ucs_debug("estimated number of endpoints is %d", context->config.est_num_eps); + if (context->config.ext.estimated_num_ppn != UCS_ULUNITS_AUTO) { + /* num_ppn was set via the env variable. Override current value */ + context->config.est_num_ppn = context->config.ext.estimated_num_ppn; + } + ucs_debug("estimated number of endpoints per node is %d", + context->config.est_num_ppn); + + if (UCS_CONFIG_BW_IS_AUTO(context->config.ext.bcopy_bw)) { + /* bcopy_bw wasn't set via the env variable. 
Calculate the value */ + context->config.ext.bcopy_bw = ucs_cpu_get_memcpy_bw(); + } + ucs_debug("estimated bcopy bandwidth is %f", + context->config.ext.bcopy_bw); + /* always init MT lock in context even though it is disabled by user, * because we need to use context lock to protect ucp_mm_ and ucp_rkey_ * routines */ UCP_THREAD_LOCK_INIT(&context->mt_lock); + /* save environment prefix to later notify user for unused variables */ + context->config.env_prefix = ucs_strdup(config->env_prefix, "ucp config"); + if (context->config.env_prefix == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err; + } + /* Get allocation alignment from configuration, make sure it's valid */ if (config->alloc_prio.count == 0) { ucs_error("No allocation methods specified - aborting"); status = UCS_ERR_INVALID_PARAM; - goto err; + goto err_free_env_prefix; } num_alloc_methods = config->alloc_prio.count; @@ -1104,7 +1441,7 @@ static ucs_status_t ucp_fill_config(ucp_context_h context, "ucp_alloc_methods"); if (context->config.alloc_methods == NULL) { status = UCS_ERR_NO_MEMORY; - goto err; + goto err_free_env_prefix; } /* Parse the allocation methods specified in the configuration */ @@ -1115,8 +1452,8 @@ static ucs_status_t ucp_fill_config(ucp_context_h context, * component name. */ context->config.alloc_methods[i].method = UCT_ALLOC_METHOD_MD; - ucs_strncpy_zero(context->config.alloc_methods[i].mdc_name, - method_name + 3, UCT_MD_COMPONENT_NAME_MAX); + ucs_strncpy_zero(context->config.alloc_methods[i].cmpt_name, + method_name + 3, UCT_COMPONENT_NAME_MAX); ucs_debug("allocation method[%d] is md '%s'", i, method_name + 3); } else { /* Otherwise, this is specific allocation method name. 
@@ -1127,8 +1464,8 @@ static ucs_status_t ucp_fill_config(ucp_context_h context, !strcmp(method_name, uct_alloc_method_names[method])) { /* Found the allocation method in the internal name list */ - context->config.alloc_methods[i].method = method; - strcpy(context->config.alloc_methods[i].mdc_name, ""); + context->config.alloc_methods[i].method = (uct_alloc_method_t)method; + strcpy(context->config.alloc_methods[i].cmpt_name, ""); ucs_debug("allocation method[%d] is '%s'", i, method_name); break; } @@ -1136,12 +1473,12 @@ static ucs_status_t ucp_fill_config(ucp_context_h context, if (context->config.alloc_methods[i].method == UCT_ALLOC_METHOD_LAST) { ucs_error("Invalid allocation method: %s", method_name); status = UCS_ERR_INVALID_PARAM; - goto err_free; + goto err_free_alloc_methods; } } } - /* Need to check MAX_BCOPY value if it is enabled only */ + /* Need to check TM_SEG_SIZE value if it is enabled only */ if (context->config.ext.tm_max_bb_size > context->config.ext.tm_thresh) { if (context->config.ext.tm_max_bb_size < sizeof(ucp_request_hdr_t)) { /* In case of expected SW RNDV message, the header (ucp_request_hdr_t) is @@ -1162,8 +1499,10 @@ static ucs_status_t ucp_fill_config(ucp_context_h context, return UCS_OK; -err_free: +err_free_alloc_methods: ucs_free(context->config.alloc_methods); +err_free_env_prefix: + ucs_free(context->config.env_prefix); err: UCP_THREAD_LOCK_FINALIZE(&context->mt_lock); return status; @@ -1172,15 +1511,9 @@ static ucs_status_t ucp_fill_config(ucp_context_h context, static void ucp_free_config(ucp_context_h context) { ucs_free(context->config.alloc_methods); + ucs_free(context->config.env_prefix); } -static ucs_mpool_ops_t ucp_rkey_mpool_ops = { - .chunk_alloc = ucs_mpool_chunk_malloc, - .chunk_release = ucs_mpool_chunk_free, - .obj_init = NULL, - .obj_cleanup = NULL -}; - ucs_status_t ucp_init_version(unsigned api_major_version, unsigned api_minor_version, const ucp_params_t *params, const ucp_config_t *config, ucp_context_h 
*context_p) @@ -1228,15 +1561,6 @@ ucs_status_t ucp_init_version(unsigned api_major_version, unsigned api_minor_ver goto err_free_config; } - /* create memory pool for small rkeys */ - status = ucs_mpool_init(&context->rkey_mp, 0, - sizeof(ucp_rkey_t) + sizeof(uct_rkey_bundle_t) * UCP_RKEY_MPOOL_MAX_MD, - 0, UCS_SYS_CACHE_LINE_SIZE, 128, -1, - &ucp_rkey_mpool_ops, "ucp_rkeys"); - if (status != UCS_OK) { - goto err_free_resources; - } - if (dfl_config != NULL) { ucp_config_release(dfl_config); } @@ -1250,8 +1574,6 @@ ucs_status_t ucp_init_version(unsigned api_major_version, unsigned api_minor_ver *context_p = context; return UCS_OK; -err_free_resources: - ucp_free_resources(context); err_free_config: ucp_free_config(context); err_free_ctx: @@ -1264,6 +1586,17 @@ ucs_status_t ucp_init_version(unsigned api_major_version, unsigned api_minor_ver return status; } +void ucp_cleanup(ucp_context_h context) +{ + while (!ucs_list_is_empty(&context->extensions)) { + ucs_free(ucs_list_extract_head(&context->extensions, ucp_context_extension_t, list)); + } + ucp_free_resources(context); + ucp_free_config(context); + UCP_THREAD_LOCK_FINALIZE(&context->mt_lock); + ucs_free(context); +} + ucs_status_t ucp_extend(ucp_context_h context, size_t extension_ctx_length, ucp_extension_init_f init, ucp_extension_cleanup_f cleanup, size_t *extension_ctx_offset_in_worker, unsigned *am_id) @@ -1286,18 +1619,6 @@ ucs_status_t ucp_extend(ucp_context_h context, size_t extension_ctx_length, return UCS_OK; } -void ucp_cleanup(ucp_context_h context) -{ - while (!ucs_list_is_empty(&context->extensions)) { - ucs_free(ucs_list_extract_head(&context->extensions, ucp_context_extension_t, list)); - } - ucs_mpool_cleanup(&context->rkey_mp, 1); - ucp_free_resources(context); - ucp_free_config(context); - UCP_THREAD_LOCK_FINALIZE(&context->mt_lock); - ucs_free(context); -} - void ucp_dump_payload(ucp_context_h context, char *buffer, size_t max, const void *data, size_t length) { @@ -1361,15 +1682,22 @@ 
ucs_status_t ucp_context_query(ucp_context_h context, ucp_context_attr_t *attr) void ucp_context_print_info(ucp_context_h context, FILE *stream) { - ucp_rsc_index_t md_index, rsc_index; + ucp_rsc_index_t cmpt_index, md_index, rsc_index; fprintf(stream, "#\n"); fprintf(stream, "# UCP context\n"); fprintf(stream, "#\n"); + for (cmpt_index = 0; cmpt_index < context->num_cmpts; ++cmpt_index) { + fprintf(stream, "# component %-2d : %s\n", + cmpt_index, context->tl_cmpts[cmpt_index].attr.name); + } + fprintf(stream, "#\n"); + for (md_index = 0; md_index < context->num_mds; ++md_index) { - fprintf(stream, "# md %-2d : %s\n", - md_index, context->tl_mds[md_index].rsc.md_name); + fprintf(stream, "# md %-2d : component %-2d %s \n", + md_index, context->tl_mds[md_index].cmpt_index, + context->tl_mds[md_index].rsc.md_name); } fprintf(stream, "#\n"); @@ -1399,3 +1727,62 @@ uct_md_h ucp_context_find_tl_md(ucp_context_h context, const char *md_name) return NULL; } + +ucs_memory_type_t +ucp_memory_type_detect_mds(ucp_context_h context, const void *address, size_t size) +{ + ucs_memory_type_t mem_type; + unsigned i, md_index; + ucs_status_t status; + + for (i = 0; i < context->num_mem_type_detect_mds; ++i) { + md_index = context->mem_type_detect_mds[i]; + status = uct_md_detect_memory_type(context->tl_mds[md_index].md, + address, size, &mem_type); + if (status == UCS_OK) { + if (context->memtype_cache != NULL) { + ucs_memtype_cache_update(context->memtype_cache, address, size, + mem_type); + } + return mem_type; + } + } + + /* Memory type not detected by any memtype MD - assume it is host memory */ + return UCS_MEMORY_TYPE_HOST; +} + +uint64_t ucp_context_dev_tl_bitmap(ucp_context_h context, const char *dev_name) +{ + uint64_t tl_bitmap; + ucp_rsc_index_t tl_idx; + + tl_bitmap = 0; + + ucs_for_each_bit(tl_idx, context->tl_bitmap) { + if (strcmp(context->tl_rscs[tl_idx].tl_rsc.dev_name, dev_name)) { + continue; + } + + tl_bitmap |= UCS_BIT(tl_idx); + } + + return tl_bitmap; +} + 
+uint64_t ucp_context_dev_idx_tl_bitmap(ucp_context_h context, + ucp_rsc_index_t dev_idx) +{ + uint64_t tl_bitmap; + ucp_rsc_index_t tl_idx; + + tl_bitmap = 0; + + ucs_for_each_bit(tl_idx, context->tl_bitmap) { + if (context->tl_rscs[tl_idx].dev_index == dev_idx) { + tl_bitmap |= UCS_BIT(tl_idx); + } + } + + return tl_bitmap; +} diff --git a/src/ucp/core/ucp_context.h b/src/ucp/core/ucp_context.h index dd5b3bb0650..ef6e2d84ec6 100644 --- a/src/ucp/core/ucp_context.h +++ b/src/ucp/core/ucp_context.h @@ -2,7 +2,7 @@ * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016. ALL RIGHTS RESERVED. * Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. - * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -20,6 +20,7 @@ #include #include #include +#include enum { @@ -46,16 +47,23 @@ typedef struct ucp_context_config { /** The percentage allowed for performance difference between rendezvous * and the eager_zcopy protocol */ double rndv_perf_diff; + /** Maximal allowed ratio between slowest and fastest lane in a multi-lane + * protocol. Lanes slower than the specified ratio will not be used */ + double multi_lane_max_ratio; /** Threshold for switching UCP to zero copy protocol */ size_t zcopy_thresh; /** Communication scheme in RNDV protocol */ ucp_rndv_mode_t rndv_mode; + /** RKEY PTR segment size */ + size_t rkey_ptr_seg_size; /** Estimation of bcopy bandwidth */ - size_t bcopy_bw; + double bcopy_bw; /** Segment size in the worker pre-registered memory pool */ size_t seg_size; /** RNDV pipeline fragment size */ size_t rndv_frag_size; + /** RNDV pipeline send threshold */ + size_t rndv_pipeline_send_thresh; /** Threshold for using tag matching offload capabilities. Smaller buffers * will not be posted to the transport. 
*/ size_t tm_thresh; @@ -64,6 +72,10 @@ typedef struct ucp_context_config { /** Upper bound for posting tm offload receives with internal UCP * preregistered bounce buffers. */ size_t tm_max_bb_size; + /** Enabling SW rndv protocol with tag offload mode */ + int tm_sw_rndv; + /** Pack debug information in worker address */ + int address_debug_info; /** Maximal size of worker name for debugging */ unsigned max_worker_name; /** Atomic mode */ @@ -78,12 +90,18 @@ typedef struct ucp_context_config { unsigned max_rndv_lanes; /** Estimated number of endpoints */ size_t estimated_num_eps; + /** Estimated number of processes per node */ + size_t estimated_num_ppn; /** Memtype cache */ int enable_memtype_cache; /** Enable flushing endpoints while flushing a worker */ int flush_worker_eps; /** Enable optimizations suitable for homogeneous systems */ int unified_mode; + /** Enable cm wireup-and-close protocol for client-server connections */ + ucs_ternary_value_t sockaddr_cm_enable; + /** Enable new protocol selection logic */ + int proto_enable; } ucp_context_config_t; @@ -98,8 +116,12 @@ struct ucp_config { UCS_CONFIG_STRING_ARRAY_FIELD(methods) alloc_prio; /** Array of transports for partial worker address to pack */ UCS_CONFIG_STRING_ARRAY_FIELD(aux_tls) sockaddr_aux_tls; + /** Array of transports for client-server transports and port selection */ + UCS_CONFIG_STRING_ARRAY_FIELD(cm_tls) sockaddr_cm_tls; /** Warn on invalid configuration */ int warn_invalid_config; + /** This config environment prefix */ + char *env_prefix; /** Configuration saved directly in the context */ ucp_context_config_t ctx; }; @@ -109,12 +131,12 @@ struct ucp_config { * UCP communication resource descriptor */ typedef struct ucp_tl_resource_desc { - uct_tl_resource_desc_t tl_rsc; /* UCT resource descriptor */ + uct_tl_resource_desc_t tl_rsc; /* UCT resource descriptor */ uint16_t tl_name_csum; /* Checksum of transport name */ - ucp_rsc_index_t md_index; /* Memory domain index (within the context) 
*/ - ucp_rsc_index_t dev_index; /* Arbitrary device index. Resources - with same index have same device name. */ - uint8_t flags; /* Flags that describe resource specifics */ + ucp_md_index_t md_index; /* Memory domain index (within the context) */ + ucp_rsc_index_t dev_index; /* Arbitrary device index. Resources + with same index have same device name. */ + uint8_t flags; /* Flags that describe resource specifics */ } ucp_tl_resource_desc_t; @@ -127,13 +149,23 @@ typedef struct ucp_tl_alias { } ucp_tl_alias_t; +/** + * UCT component + */ +typedef struct ucp_tl_cmpt { + uct_component_h cmpt; /* UCT component handle */ + uct_component_attr_t attr; /* UCT component attributes */ +} ucp_tl_cmpt_t; + + /** * Memory domain. */ typedef struct ucp_tl_md { - uct_md_h md; /* Memory domain handle */ - uct_md_resource_desc_t rsc; /* Memory domain resource */ - uct_md_attr_t attr; /* Memory domain attributes */ + uct_md_h md; /* Memory domain handle */ + ucp_rsc_index_t cmpt_index; /* Index of owning component */ + uct_md_resource_desc_t rsc; /* Memory domain resource */ + uct_md_attr_t attr; /* Memory domain attributes */ } ucp_tl_md_t; @@ -149,24 +181,26 @@ typedef struct ucp_context_extenstion { * UCP context */ typedef struct ucp_context { + + ucp_tl_cmpt_t *tl_cmpts; /* UCT components */ + ucp_rsc_index_t num_cmpts; /* Number of UCT components */ + ucp_tl_md_t *tl_mds; /* Memory domain resources */ - ucp_rsc_index_t num_mds; /* Number of memory domains */ + ucp_md_index_t num_mds; /* Number of memory domains */ /* List of MDs which detect non host memory type */ - ucp_rsc_index_t mem_type_tl_mds[UCT_MD_MEM_TYPE_LAST]; - ucp_rsc_index_t num_mem_type_mds; /* Number of mem type MDs */ - ucs_memtype_cache_t *memtype_cache; /* mem type allocation cache*/ + ucp_md_index_t mem_type_detect_mds[UCS_MEMORY_TYPE_LAST]; + ucp_md_index_t num_mem_type_detect_mds; /* Number of mem type MDs */ + ucs_memtype_cache_t *memtype_cache; /* mem type allocation cache */ ucp_tl_resource_desc_t 
*tl_rscs; /* Array of communication resources */ uint64_t tl_bitmap; /* Cached map of tl resources used by workers. - Not all resources may be used if unified - mode is enabled. */ - ucp_rsc_index_t num_tls; /* Number of resources in the array*/ + * Not all resources may be used if unified + * mode is enabled. */ + ucp_rsc_index_t num_tls; /* Number of resources in the array */ /* Mask of memory type communication resources */ - uint64_t mem_type_tls[UCT_MD_MEM_TYPE_LAST]; - - ucs_mpool_t rkey_mp; /* Pool for memory keys */ + uint64_t mem_type_access_tls[UCS_MEMORY_TYPE_LAST]; ucs_list_link_t extensions; /* List of registered extensions */ size_t extension_size; /* Total size of worker extension */ @@ -181,6 +215,9 @@ typedef struct ucp_context { /* How many endpoints are expected to be created */ int est_num_eps; + /* How many endpoints are expected to be created on single node */ + int est_num_ppn; + struct { size_t size; /* Request size for user */ ucp_request_init_callback_t init; /* Initialization user callback */ @@ -192,16 +229,31 @@ typedef struct ucp_context { /* Allocation method */ uct_alloc_method_t method; - /* MD name to use, if method is MD */ - char mdc_name[UCT_MD_COMPONENT_NAME_MAX]; + /* Component name to use, if method is MD */ + char cmpt_name[UCT_COMPONENT_NAME_MAX]; } *alloc_methods; unsigned num_alloc_methods; + /* Cached map of components which support CM capability */ + uint64_t cm_cmpts_bitmap; + /* Bitmap of sockaddr auxiliary transports to pack for client/server flow */ uint64_t sockaddr_aux_rscs_bitmap; + /* Array of sockaddr transports indexes. + * The indexes appear in the configured priority order */ + ucp_rsc_index_t sockaddr_tl_ids[UCP_MAX_RESOURCES]; + ucp_rsc_index_t num_sockaddr_tls; + /* Array of CMs indexes. The indexes appear in the configured priority + * order. 
*/ + ucp_rsc_index_t cm_cmpt_idxs[UCP_MAX_RESOURCES]; + ucp_rsc_index_t num_cm_cmpts; + /* Configuration supplied by the user */ ucp_context_config_t ext; + + /* Config environment prefix used to create the context */ + char *env_prefix; } config; @@ -226,7 +278,6 @@ typedef struct ucp_tl_iface_atomic_flags { } atomic32, atomic64; } ucp_tl_iface_atomic_flags_t; - #define UCP_ATOMIC_OP_MASK (UCS_BIT(UCT_ATOMIC_OP_ADD) | \ UCS_BIT(UCT_ATOMIC_OP_AND) | \ UCS_BIT(UCT_ATOMIC_OP_OR) | \ @@ -286,13 +337,13 @@ typedef struct ucp_tl_iface_atomic_flags { #define UCP_CONTEXT_CHECK_FEATURE_FLAGS(_context, _flags, _action) \ do { \ if (ENABLE_PARAMS_CHECK && \ - ucs_unlikely(!((_context)->config.features & (_flags)))) { \ + ucs_unlikely(!((_context)->config.features & (_flags)))) { \ size_t feature_list_str_max = 512; \ - char *feature_list_str = ucs_alloca(feature_list_str_max); \ + char *feature_list_str = ucs_alloca(feature_list_str_max); \ ucs_error("feature flags %s were not set for ucp_init()", \ - ucp_feature_flags_str((_flags) & \ - ~(_context)->config.features, \ - feature_list_str, feature_list_str_max)); \ + ucs_flags_str(feature_list_str, feature_list_str_max, \ + (_flags) & ~(_context)->config.features, \ + ucp_feature_str)); \ _action; \ } \ } while (0) @@ -303,9 +354,13 @@ typedef struct ucp_tl_iface_atomic_flags { (_params)->_name : (_default)) -extern ucp_am_handler_t ucp_am_handlers[]; +#define ucp_assert_memtype(_context, _buffer, _length, _mem_type) \ + ucs_assert(ucp_memory_type_detect(_context, _buffer, _length) == (_mem_type)) +extern ucp_am_handler_t ucp_am_handlers[]; +extern const char *ucp_feature_str[]; + void ucp_dump_payload(ucp_context_h context, char *buffer, size_t max, const void *data, size_t length); @@ -322,52 +377,107 @@ const char* ucp_tl_bitmap_str(ucp_context_h context, uint64_t tl_bitmap, const char* ucp_feature_flags_str(unsigned feature_flags, char *str, size_t max_str_len); +ucs_memory_type_t +ucp_memory_type_detect_mds(ucp_context_h 
context, const void *address, size_t length); + +/** + * Calculate a small value to overcome float imprecision + * between two float values + */ +static UCS_F_ALWAYS_INLINE +double ucp_calc_epsilon(double val1, double val2) +{ + return (val1 + val2) * (1e-6); +} + +/** + * Compare two scores and return: + * - `-1` if score1 < score2 + * - `0` if score1 == score2 + * - `1` if score1 > score2 + */ +static UCS_F_ALWAYS_INLINE +int ucp_score_cmp(double score1, double score2) +{ + double diff = score1 - score2; + return ((fabs(diff) < ucp_calc_epsilon(score1, score2)) ? + 0 : ucs_signum(diff)); +} + +/** + * Compare two scores taking into account priorities if scores are equal + */ +static UCS_F_ALWAYS_INLINE +int ucp_score_prio_cmp(double score1, int prio1, double score2, int prio2) +{ + int score_res = ucp_score_cmp(score1, score2); + + return score_res ? score_res : ucs_signum(prio1 - prio2); +} + +static UCS_F_ALWAYS_INLINE +int ucp_is_scalable_transport(ucp_context_h context, size_t max_num_eps) +{ + return (max_num_eps >= (size_t)context->config.est_num_eps); +} + size_t ucp_worker_base_size(ucp_context_h context, unsigned *config_max); static UCS_F_ALWAYS_INLINE double -ucp_tl_iface_latency(ucp_context_h context, const uct_iface_attr_t *iface_attr) +ucp_tl_iface_latency(ucp_context_h context, const ucs_linear_func_t *latency) { - return iface_attr->latency.overhead + - (iface_attr->latency.growth * context->config.est_num_eps); + return ucs_linear_func_apply(*latency, context->config.est_num_eps); } -extern uct_memory_type_t ucm_to_uct_mem_type_map[]; -static UCS_F_ALWAYS_INLINE int ucp_memory_type_cache_is_empty(ucp_context_h context) +static UCS_F_ALWAYS_INLINE double +ucp_tl_iface_bandwidth(ucp_context_h context, const uct_ppn_bandwidth_t *bandwidth) { - return !(context->memtype_cache && - context->memtype_cache->pgtable.num_regions); + return bandwidth->dedicated + + (bandwidth->shared / context->config.est_num_ppn); } -static UCS_F_ALWAYS_INLINE 
ucs_status_t -ucp_memory_type_detect_mds(ucp_context_h context, void *addr, size_t length, - uct_memory_type_t *mem_type_p) +static UCS_F_ALWAYS_INLINE int ucp_memory_type_cache_is_empty(ucp_context_h context) { - unsigned i, md_index; - ucm_mem_type_t ucm_mem_type; + return (context->memtype_cache && + !context->memtype_cache->pgtable.num_regions); +} - *mem_type_p = UCT_MD_MEM_TYPE_HOST; +static UCS_F_ALWAYS_INLINE ucs_memory_type_t +ucp_memory_type_detect(ucp_context_h context, const void *address, size_t length) +{ + ucs_memory_type_t mem_type; + ucs_status_t status; - if (ucs_likely(!context->num_mem_type_mds)) { - return UCS_OK; + if (ucs_likely(context->num_mem_type_detect_mds == 0)) { + return UCS_MEMORY_TYPE_HOST; } - if (context->memtype_cache != NULL) { - if (!ucp_memory_type_cache_is_empty(context) && - ucs_memtype_cache_lookup(context->memtype_cache, addr, - length, &ucm_mem_type) == UCS_OK) { - *mem_type_p = ucm_to_uct_mem_type_map[ucm_mem_type]; + if (ucs_likely(context->memtype_cache != NULL)) { + if (!context->memtype_cache->pgtable.num_regions) { + return UCS_MEMORY_TYPE_HOST; } - return UCS_OK; - } - for (i = 0; i < context->num_mem_type_mds; ++i) { - md_index = context->mem_type_tl_mds[i]; - if (uct_md_is_mem_type_owned(context->tl_mds[md_index].md, addr, length)) { - *mem_type_p = context->tl_mds[md_index].attr.cap.mem_type; - return UCS_OK; + status = ucs_memtype_cache_lookup(context->memtype_cache, address, + length, &mem_type); + if (status != UCS_OK) { + ucs_assert(status == UCS_ERR_NO_ELEM); + return UCS_MEMORY_TYPE_HOST; } + + if (mem_type != UCS_MEMORY_TYPE_LAST) { + return mem_type; + } + + /* mem_type is UCS_MEMORY_TYPE_LAST: fall thru to memory detection by + * UCT memory domains */ } - return UCS_OK; + return ucp_memory_type_detect_mds(context, address, length); } + +uint64_t ucp_context_dev_tl_bitmap(ucp_context_h context, const char *dev_name); + +uint64_t ucp_context_dev_idx_tl_bitmap(ucp_context_h context, + ucp_rsc_index_t 
dev_idx); + #endif diff --git a/src/ucp/core/ucp_ep.c b/src/ucp/core/ucp_ep.c index bc90c841f08..84631046865 100644 --- a/src/ucp/core/ucp_ep.c +++ b/src/ucp/core/ucp_ep.c @@ -1,18 +1,26 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. +* Copyright (C) Los Alamos National Security, LLC. 2019 ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ucp_ep.h" #include "ucp_worker.h" +#include "ucp_am.h" #include "ucp_ep.inl" #include "ucp_request.inl" #include #include +#include #include #include +#include #include #include #include @@ -31,9 +39,11 @@ typedef struct { size_t bw; } ucp_ep_thresh_params_t; -extern const ucp_proto_t ucp_stream_am_proto; +extern const ucp_request_send_proto_t ucp_stream_am_proto; +extern const ucp_request_send_proto_t ucp_am_proto; +extern const ucp_request_send_proto_t ucp_am_reply_proto; -#if ENABLE_STATS +#ifdef ENABLE_STATS static ucs_stats_class_t ucp_ep_stats_class = { .name = "ucp_ep", .num_counters = UCP_EP_STAT_LAST, @@ -48,13 +58,24 @@ static ucs_stats_class_t ucp_ep_stats_class = { void ucp_ep_config_key_reset(ucp_ep_config_key_t *key) { + ucp_lane_index_t i; + memset(key, 0, sizeof(*key)); key->num_lanes = 0; + for (i = 0; i < UCP_MAX_LANES; ++i) { + key->lanes[i].rsc_index = UCP_NULL_RESOURCE; + key->lanes[i].proxy_lane = UCP_NULL_LANE; + key->lanes[i].lane_types = 0; + key->lanes[i].dst_md_index = UCP_MAX_MDS; + } key->am_lane = UCP_NULL_LANE; key->wireup_lane = UCP_NULL_LANE; + key->cm_lane = UCP_NULL_LANE; + key->rkey_ptr_lane = UCP_NULL_LANE; key->tag_lane = UCP_NULL_LANE; key->rma_bw_md_map = 0; key->reachable_md_map = 0; + key->dst_md_cmpts = NULL; key->err_mode = UCP_ERR_HANDLING_MODE_NONE; key->status = UCS_OK; memset(key->am_bw_lanes, UCP_NULL_LANE, sizeof(key->am_bw_lanes)); @@ -63,12 +84,11 @@ void ucp_ep_config_key_reset(ucp_ep_config_key_t *key) memset(key->amo_lanes, UCP_NULL_LANE, sizeof(key->amo_lanes)); } 
-ucs_status_t ucp_ep_new(ucp_worker_h worker, const char *peer_name, - const char *message, ucp_ep_h *ep_p) +ucs_status_t ucp_ep_create_base(ucp_worker_h worker, const char *peer_name, + const char *message, ucp_ep_h *ep_p) { - ucs_status_t status; - ucp_ep_config_key_t key; ucp_lane_index_t lane; + ucs_status_t status; ucp_ep_h ep; ep = ucs_strided_alloc_get(&worker->ep_alloc, "ucp_ep"); @@ -78,12 +98,11 @@ ucs_status_t ucp_ep_new(ucp_worker_h worker, const char *peer_name, goto err; } - ucp_ep_config_key_reset(&key); + ep->cfg_index = UCP_NULL_CFG_INDEX; ep->worker = worker; - ep->cfg_index = ucp_worker_get_ep_config(worker, &key); ep->am_lane = UCP_NULL_LANE; ep->flags = 0; - ep->conn_sn = -1; + ep->conn_sn = (ucp_ep_conn_sn_t)-1; ucp_ep_ext_gen(ep)->user_data = NULL; ucp_ep_ext_gen(ep)->dest_ep_ptr = 0; ucp_ep_ext_gen(ep)->err_cb = NULL; @@ -95,6 +114,7 @@ ucs_status_t ucp_ep_new(ucp_worker_h worker, const char *peer_name, sizeof(ucp_ep_ext_gen(ep)->ep_match)); ucp_stream_ep_init(ep); + ucp_am_ep_init(ep); for (lane = 0; lane < UCP_MAX_LANES; ++lane) { ep->uct_eps[lane] = NULL; @@ -111,17 +131,36 @@ ucs_status_t ucp_ep_new(ucp_worker_h worker, const char *peer_name, goto err_free_ep; } - ucs_list_add_tail(&worker->all_eps, &ucp_ep_ext_gen(ep)->ep_list); + ucs_list_head_init(&ucp_ep_ext_gen(ep)->ep_list); + *ep_p = ep; ucs_debug("created ep %p to %s %s", ep, ucp_ep_peer_name(ep), message); return UCS_OK; err_free_ep: - ucs_free(ep); + ucs_strided_alloc_put(&worker->ep_alloc, ep); err: return status; } +ucs_status_t ucp_worker_create_ep(ucp_worker_h worker, const char *peer_name, + const char *message, ucp_ep_h *ep_p) +{ + ucs_status_t status; + ucp_ep_h ep; + + status = ucp_ep_create_base(worker, peer_name, message, &ep); + if (status != UCS_OK) { + return status; + } + + ucs_list_add_tail(&worker->all_eps, &ucp_ep_ext_gen(ep)->ep_list); + + *ep_p = ep; + + return UCS_OK; +} + void ucp_ep_delete(ucp_ep_h ep) { 
ucs_callbackq_remove_if(&ep->worker->uct->progress_q, @@ -131,29 +170,27 @@ void ucp_ep_delete(ucp_ep_h ep) ucs_strided_alloc_put(&ep->worker->ep_alloc, ep); } -ucs_status_t ucp_ep_create_sockaddr_aux(ucp_worker_h worker, - const ucp_ep_params_t *params, - const ucp_unpacked_address_t *remote_address, - ucp_ep_h *ep_p) +ucs_status_t +ucp_ep_create_sockaddr_aux(ucp_worker_h worker, unsigned ep_init_flags, + const ucp_unpacked_address_t *remote_address, + ucp_ep_h *ep_p) { ucp_wireup_ep_t *wireup_ep; ucs_status_t status; ucp_ep_h ep; /* allocate endpoint */ - status = ucp_ep_new(worker, remote_address->name, "listener", &ep); + status = ucp_worker_create_ep(worker, remote_address->name, "listener", &ep); if (status != UCS_OK) { goto err; } - status = ucp_ep_init_create_wireup(ep, params, &wireup_ep); + status = ucp_ep_init_create_wireup(ep, ep_init_flags, &wireup_ep); if (status != UCS_OK) { goto err_delete; } - status = ucp_wireup_ep_connect_aux(wireup_ep, params, - remote_address->address_count, - remote_address->address_list); + status = ucp_wireup_ep_connect_aux(wireup_ep, ep_init_flags, remote_address); if (status != UCS_OK) { goto err_destroy_wireup_ep; } @@ -169,11 +206,11 @@ ucs_status_t ucp_ep_create_sockaddr_aux(ucp_worker_h worker, return status; } -void ucp_ep_config_key_set_params(ucp_ep_config_key_t *key, - const ucp_ep_params_t *params) +void ucp_ep_config_key_set_err_mode(ucp_ep_config_key_t *key, + unsigned ep_init_flags) { - key->err_mode = UCP_PARAM_VALUE(EP, params, err_mode, ERR_HANDLING_MODE, - UCP_ERR_HANDLING_MODE_NONE); + key->err_mode = (ep_init_flags & UCP_EP_INIT_ERR_MODE_PEER_FAILURE) ? 
+ UCP_ERR_HANDLING_MODE_PEER : UCP_ERR_HANDLING_MODE_NONE; } int ucp_ep_is_sockaddr_stub(ucp_ep_h ep) @@ -212,39 +249,35 @@ ucs_status_t ucp_worker_create_mem_type_endpoints(ucp_worker_h worker) { ucp_context_h context = worker->context; ucp_unpacked_address_t local_address; - unsigned i, mem_type, md_index; + unsigned i, mem_type; ucs_status_t status; void *address_buffer; size_t address_length; - ucp_ep_params_t params; - - for (i = 0; i < UCT_MD_MEM_TYPE_LAST; i++) { - worker->mem_type_ep[i] = NULL; - } - - if (!context->num_mem_type_mds) { - return UCS_OK; - } - - params.field_mask = 0; - for (i = 0; i < context->num_mem_type_mds; ++i) { - md_index = context->mem_type_tl_mds[i]; - mem_type = context->tl_mds[md_index].attr.cap.mem_type; + for (mem_type = 0; mem_type < UCS_MEMORY_TYPE_LAST; mem_type++) { + if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(mem_type) || + !context->mem_type_access_tls[mem_type]) { + continue; + } - status = ucp_address_pack(worker, NULL, context->mem_type_tls[mem_type], NULL, + status = ucp_address_pack(worker, NULL, + context->mem_type_access_tls[mem_type], + UCP_ADDRESS_PACK_FLAGS_ALL, NULL, &address_length, &address_buffer); if (status != UCS_OK) { goto err_cleanup_eps; } - status = ucp_address_unpack(worker, address_buffer, &local_address); + status = ucp_address_unpack(worker, address_buffer, + UCP_ADDRESS_PACK_FLAGS_ALL, &local_address); if (status != UCS_OK) { goto err_free_address_buffer; } - status = ucp_ep_create_to_worker_addr(worker, ¶ms, &local_address, - UCP_EP_INIT_FLAG_MEM_TYPE, "mem type", + status = ucp_ep_create_to_worker_addr(worker, UINT64_MAX, + &local_address, + UCP_EP_INIT_FLAG_MEM_TYPE, + "mem type", &worker->mem_type_ep[mem_type]); if (status != UCS_OK) { goto err_free_address_list; @@ -261,7 +294,7 @@ ucs_status_t ucp_worker_create_mem_type_endpoints(ucp_worker_h worker) err_free_address_buffer: ucs_free(address_buffer); err_cleanup_eps: - for (i = 0; i < UCT_MD_MEM_TYPE_LAST; i++) { + for (i = 0; i < 
UCS_MEMORY_TYPE_LAST; i++) { if (worker->mem_type_ep[i]) { ucp_ep_destroy_internal(worker->mem_type_ep[i]); } @@ -269,31 +302,34 @@ ucs_status_t ucp_worker_create_mem_type_endpoints(ucp_worker_h worker) return status; } -ucs_status_t ucp_ep_init_create_wireup(ucp_ep_h ep, - const ucp_ep_params_t *params, +ucs_status_t ucp_ep_init_create_wireup(ucp_ep_h ep, unsigned ep_init_flags, ucp_wireup_ep_t **wireup_ep) { ucp_ep_config_key_t key; ucs_status_t status; ucp_ep_config_key_reset(&key); - ucp_ep_config_key_set_params(&key, params); + ucp_ep_config_key_set_err_mode(&key, ep_init_flags); - /* all operations will use the first lane, which is a stub endpoint */ key.num_lanes = 1; - key.lanes[0].rsc_index = UCP_NULL_RESOURCE; - key.lanes[0].dst_md_index = UCP_NULL_RESOURCE; - key.am_lane = 0; - key.wireup_lane = 0; - key.tag_lane = 0; - key.am_bw_lanes[0] = 0; - key.rma_lanes[0] = 0; - key.rma_bw_lanes[0] = 0; - key.amo_lanes[0] = 0; - - ep->cfg_index = ucp_worker_get_ep_config(ep->worker, &key); - ep->am_lane = 0; - ep->flags |= UCP_EP_FLAG_CONNECT_REQ_QUEUED; + /* all operations will use the first lane, which is a stub endpoint before + * reconfiguration */ + key.am_lane = 0; + if (ucp_worker_sockaddr_is_cm_proto(ep->worker)) { + key.cm_lane = 0; + } else { + key.wireup_lane = 0; + } + + status = ucp_worker_get_ep_config(ep->worker, &key, 0, &ep->cfg_index); + if (status != UCS_OK) { + return status; + } + + ep->am_lane = key.am_lane; + if (!ucp_ep_has_cm_lane(ep)) { + ep->flags |= UCP_EP_FLAG_CONNECT_REQ_QUEUED; + } status = ucp_wireup_ep_create(ep, &ep->uct_eps[0]); if (status != UCS_OK) { @@ -305,39 +341,33 @@ ucs_status_t ucp_ep_init_create_wireup(ucp_ep_h ep, } ucs_status_t ucp_ep_create_to_worker_addr(ucp_worker_h worker, - const ucp_ep_params_t *params, + uint64_t local_tl_bitmap, const ucp_unpacked_address_t *remote_address, unsigned ep_init_flags, const char *message, ucp_ep_h *ep_p) { - uint8_t addr_indices[UCP_MAX_LANES]; + unsigned 
addr_indices[UCP_MAX_LANES]; ucs_status_t status; ucp_ep_h ep; /* allocate endpoint */ - status = ucp_ep_new(worker, remote_address->name, message, &ep); + status = ucp_worker_create_ep(worker, remote_address->name, message, &ep); if (status != UCS_OK) { goto err; } /* initialize transport endpoints */ - status = ucp_wireup_init_lanes(ep, params, ep_init_flags, - remote_address->address_count, - remote_address->address_list, addr_indices); + status = ucp_wireup_init_lanes(ep, ep_init_flags, local_tl_bitmap, + remote_address, addr_indices); if (status != UCS_OK) { goto err_delete; } - status = ucp_ep_adjust_params(ep, params); - if (status != UCS_OK) { - goto err_cleanup_lanes; - } + ucs_assert(!(ucp_ep_get_tl_bitmap(ep) & ~local_tl_bitmap)); *ep_p = ep; return UCS_OK; -err_cleanup_lanes: - ucp_ep_cleanup_lanes(ep); err_delete: ucp_ep_delete(ep); err: @@ -364,12 +394,13 @@ static ucs_status_t ucp_ep_create_to_sock_addr(ucp_worker_h worker, /* allocate endpoint */ ucs_sockaddr_str(params->sockaddr.addr, peer_name, sizeof(peer_name)); - status = ucp_ep_new(worker, peer_name, "from api call", &ep); + status = ucp_worker_create_ep(worker, peer_name, "from api call", &ep); if (status != UCS_OK) { goto err; } - status = ucp_ep_init_create_wireup(ep, params, &wireup_ep); + status = ucp_ep_init_create_wireup(ep, ucp_ep_init_flags(worker, params), + &wireup_ep); if (status != UCS_OK) { goto err_delete; } @@ -379,7 +410,9 @@ static ucs_status_t ucp_ep_create_to_sock_addr(ucp_worker_h worker, goto err_cleanup_lanes; } - status = ucp_wireup_ep_connect_to_sockaddr(ep->uct_eps[0], params); + status = ucp_worker_sockaddr_is_cm_proto(ep->worker) ? + ucp_ep_client_cm_connect_start(ep, params) : + ucp_wireup_ep_connect_to_sockaddr(ep->uct_eps[0], params); if (status != UCS_OK) { goto err_cleanup_lanes; } @@ -398,51 +431,108 @@ static ucs_status_t ucp_ep_create_to_sock_addr(ucp_worker_h worker, /** * Create an endpoint on the server side connected to the client endpoint. 
*/ -ucs_status_t ucp_ep_create_accept(ucp_worker_h worker, - const ucp_wireup_client_data_t *client_data, - ucp_ep_h *ep_p) +ucs_status_t ucp_ep_create_server_accept(ucp_worker_h worker, + const ucp_conn_request_h conn_request, + ucp_ep_h *ep_p) { - ucp_ep_params_t params; - ucp_unpacked_address_t remote_address; - ucs_status_t status; + const ucp_wireup_sockaddr_data_t *sa_data = &conn_request->sa_data; + unsigned ep_init_flags = 0; + ucp_unpacked_address_t remote_addr; + uint64_t addr_flags; + unsigned i; + ucs_status_t status; - params.field_mask = UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; - params.err_mode = client_data->err_mode; + if (sa_data->err_mode == UCP_ERR_HANDLING_MODE_PEER) { + ep_init_flags |= UCP_EP_INIT_ERR_MODE_PEER_FAILURE; + } + + if (sa_data->addr_mode == UCP_WIREUP_SA_DATA_CM_ADDR) { + addr_flags = UCP_ADDRESS_PACK_FLAG_IFACE_ADDR | + UCP_ADDRESS_PACK_FLAG_EP_ADDR; + } else { + addr_flags = UCP_ADDRESS_PACK_FLAGS_ALL; + } - status = ucp_address_unpack(worker, client_data + 1, &remote_address); + /* coverity[overrun-local] */ + status = ucp_address_unpack(worker, sa_data + 1, addr_flags, &remote_addr); if (status != UCS_OK) { - goto out; + ucp_listener_reject(conn_request->listener, conn_request); + return status; } - if (client_data->is_full_addr) { + switch (sa_data->addr_mode) { + case UCP_WIREUP_SA_DATA_FULL_ADDR: /* create endpoint to the worker address we got in the private data */ - status = ucp_ep_create_to_worker_addr(worker, ¶ms, &remote_address, - UCP_EP_CREATE_AM_LANE, "listener", - ep_p); - if (status == UCS_OK) { - ucp_ep_flush_state_reset(*ep_p); - } else { - goto out_free_address; - } - } else { - status = ucp_ep_create_sockaddr_aux(worker, ¶ms, &remote_address, - ep_p); - if (status == UCS_OK) { - /* the server's ep should be aware of the sent address from the client */ - (*ep_p)->flags |= UCP_EP_FLAG_LISTENER; - /* NOTE: protect union */ - ucs_assert(!((*ep_p)->flags & (UCP_EP_FLAG_ON_MATCH_CTX | - 
UCP_EP_FLAG_FLUSH_STATE_VALID))); - } else { - goto out_free_address; + status = ucp_ep_create_to_worker_addr(worker, UINT64_MAX, &remote_addr, + ep_init_flags | + UCP_EP_INIT_CREATE_AM_LANE, + "listener", ep_p); + if (status != UCS_OK) { + goto non_cm_err_reject; } - } - ucp_ep_update_dest_ep_ptr(*ep_p, client_data->ep_ptr); + ucs_assert(ucp_ep_config(*ep_p)->key.err_mode == sa_data->err_mode); + ucp_ep_flush_state_reset(*ep_p); + ucp_ep_update_dest_ep_ptr(*ep_p, sa_data->ep_ptr); + /* send wireup request message, to connect the client to the server's + new endpoint */ + ucs_assert(!((*ep_p)->flags & UCP_EP_FLAG_CONNECT_REQ_QUEUED)); + status = ucp_wireup_send_request(*ep_p); + if (status != UCS_OK) { + goto non_cm_err_destroy_ep; + } + break; + case UCP_WIREUP_SA_DATA_PARTIAL_ADDR: + status = ucp_ep_create_sockaddr_aux(worker, ep_init_flags, + &remote_addr, ep_p); + if (status != UCS_OK) { + goto non_cm_err_reject; + } -out_free_address: - ucs_free(remote_address.address_list); -out: + ucp_ep_update_dest_ep_ptr(*ep_p, sa_data->ep_ptr); + /* the server's ep should be aware of the sent address from the client */ + (*ep_p)->flags |= UCP_EP_FLAG_LISTENER; + /* NOTE: protect union */ + ucs_assert(!((*ep_p)->flags & (UCP_EP_FLAG_ON_MATCH_CTX | + UCP_EP_FLAG_FLUSH_STATE_VALID))); + status = ucp_wireup_send_pre_request(*ep_p); + if (status != UCS_OK) { + goto non_cm_err_destroy_ep; + } + break; + case UCP_WIREUP_SA_DATA_CM_ADDR: + ucs_assert(ucp_worker_sockaddr_is_cm_proto(worker)); + for (i = 0; i < remote_addr.address_count; ++i) { + remote_addr.address_list[i].dev_addr = conn_request->remote_dev_addr; + remote_addr.address_list[i].dev_index = conn_request->sa_data.dev_index; + } + status = ucp_ep_cm_server_create_connected(worker, + ep_init_flags | + UCP_EP_INIT_CM_WIREUP_SERVER, + &remote_addr, conn_request, + ep_p); + ucs_free(remote_addr.address_list); + return status; + default: + ucs_fatal("client sockaddr data contains invalid address mode %d", + 
sa_data->addr_mode); + } + + /* common non-CM flow */ + status = uct_iface_accept(conn_request->uct.iface, + conn_request->uct_req); + goto non_cm_out; + +non_cm_err_destroy_ep: + ucp_ep_destroy_internal(*ep_p); +non_cm_err_reject: + ucs_error("connection request failed on listener %p with status %s", + conn_request->listener, ucs_status_string(status)); + uct_iface_reject(conn_request->uct.iface, conn_request->uct_req); +non_cm_out: + ucs_free(conn_request); + ucs_free(remote_addr.address_list); + ucs_assert(!ucp_worker_sockaddr_is_cm_proto(worker)); return status; } @@ -454,42 +544,17 @@ ucp_ep_create_api_conn_request(ucp_worker_h worker, ucp_ep_h ep; ucs_status_t status; - /* coverity[overrun-buffer-val] */ - status = ucp_ep_create_accept(worker, &conn_request->client_data, &ep); + status = ucp_ep_create_server_accept(worker, conn_request, &ep); if (status != UCS_OK) { - goto out; + return status; } status = ucp_ep_adjust_params(ep, params); - if (status != UCS_OK) { - goto out_ep_destroy; - } - - if (ep->flags & UCP_EP_FLAG_LISTENER) { - status = ucp_wireup_send_pre_request(ep); - } else { - /* send wireup request message, to connect the client to the server's - new endpoint */ - ucs_assert(!(ep->flags & UCP_EP_FLAG_CONNECT_REQ_QUEUED)); - status = ucp_wireup_send_request(ep); - } - if (status == UCS_OK) { *ep_p = ep; - goto out; - } - -out_ep_destroy: - ucp_ep_destroy_internal(ep); -out: - if (status == UCS_OK) { - status = uct_iface_accept(conn_request->listener->wiface.iface, - conn_request->uct_req); } else { - uct_iface_reject(conn_request->listener->wiface.iface, - conn_request->uct_req); + ucp_ep_destroy_internal(ep); } - ucs_free(params->conn_request); return status; } @@ -512,7 +577,8 @@ ucp_ep_create_api_to_worker_addr(ucp_worker_h worker, UCP_CHECK_PARAM_NON_NULL(params->address, status, goto out); - status = ucp_address_unpack(worker, params->address, &remote_address); + status = ucp_address_unpack(worker, params->address, + 
UCP_ADDRESS_PACK_FLAGS_ALL, &remote_address); if (status != UCS_OK) { goto out; } @@ -542,12 +608,19 @@ ucp_ep_create_api_to_worker_addr(ucp_worker_h worker, goto out_free_address; } - status = ucp_ep_create_to_worker_addr(worker, params, &remote_address, 0, + status = ucp_ep_create_to_worker_addr(worker, UINT64_MAX, &remote_address, + ucp_ep_init_flags(worker, params), "from api call", &ep); if (status != UCS_OK) { goto out_free_address; } + status = ucp_ep_adjust_params(ep, params); + if (status != UCS_OK) { + ucp_ep_destroy_internal(ep); + goto out_free_address; + } + ep->conn_sn = conn_sn; /* @@ -570,7 +643,6 @@ ucp_ep_create_api_to_worker_addr(ucp_worker_h worker, ucs_assert(!(ep->flags & UCP_EP_FLAG_CONNECT_REQ_QUEUED)); status = ucp_wireup_send_request(ep); if (status != UCS_OK) { - ucp_ep_destroy_internal(ep); goto out_free_address; } } @@ -693,6 +765,7 @@ void ucp_ep_cleanup_lanes(ucp_ep_h ep) } } +/* Must be called with async lock held */ void ucp_ep_disconnected(ucp_ep_h ep, int force) { /* remove pending slow-path progress in case it wasn't removed yet */ @@ -703,13 +776,18 @@ void ucp_ep_disconnected(ucp_ep_h ep, int force) ucs_callbackq_remove_if(&ep->worker->uct->progress_q, ucp_listener_accept_cb_remove_filter, ep); + ucp_ep_cm_slow_cbq_cleanup(ep); + ucp_stream_ep_cleanup(ep); + ucp_am_ep_cleanup(ep); ep->flags &= ~UCP_EP_FLAG_USED; - ep->flags |= UCP_EP_FLAG_CLOSED; - if ((ep->flags & (UCP_EP_FLAG_CONNECT_REQ_QUEUED|UCP_EP_FLAG_REMOTE_CONNECTED)) - && !force) { + if ((ep->flags & (UCP_EP_FLAG_CONNECT_REQ_QUEUED | + UCP_EP_FLAG_REMOTE_CONNECTED)) && !force) { + /* in case of CM connection ep has to be disconnected */ + ucs_assert(!ucp_ep_has_cm_lane(ep)); + /* Endpoints which have remote connection are destroyed only when the * worker is destroyed, to enable remote endpoints keep sending * TODO negotiate disconnect. 
@@ -722,14 +800,19 @@ void ucp_ep_disconnected(ucp_ep_h ep, int force) ucp_ep_destroy_internal(ep); } -static unsigned ucp_ep_do_disconnect(void *arg) +unsigned ucp_ep_local_disconnect_progress(void *arg) { - ucp_request_t *req = arg; + ucp_request_t *req = arg; + ucp_ep_h ep = req->send.ep; + ucs_async_context_t *async = &ep->worker->async; /* ep becomes invalid */ ucs_assert(!(req->flags & UCP_REQUEST_FLAG_COMPLETED)); - ucp_ep_disconnected(req->send.ep, req->send.flush.uct_flags & - UCT_FLUSH_FLAG_CANCEL); + UCS_ASYNC_BLOCK(async); + ucs_debug("ep %p: disconnected with request %p, %s", ep, req, + ucs_status_string(req->status)); + ucp_ep_disconnected(ep, req->send.flush.uct_flags & UCT_FLUSH_FLAG_CANCEL); + UCS_ASYNC_UNBLOCK(async); /* Complete send request from here, to avoid releasing the request while * slow-path element is still pending */ @@ -738,40 +821,106 @@ static unsigned ucp_ep_do_disconnect(void *arg) return 0; } +static void ucp_ep_set_close_request(ucp_ep_h ep, ucp_request_t *request, + const char *debug_msg) +{ + ucs_trace("ep %p: setting close request %p, %s", ep, request, debug_msg); + + ucp_ep_flush_state_invalidate(ep); + ucp_ep_ext_gen(ep)->close_req.req = request; + ep->flags |= UCP_EP_FLAG_CLOSE_REQ_VALID; +} + static void ucp_ep_close_flushed_callback(ucp_request_t *req) { - ucp_ep_h ep = req->send.ep; + ucp_ep_h ep = req->send.ep; + ucs_async_context_t *async = &ep->worker->async; + + /* in case of force close, schedule ucp_ep_local_disconnect_progress to + * destroy the ep and all its lanes */ + if (req->send.flush.uct_flags & UCT_FLUSH_FLAG_CANCEL) { + goto out; + } + + UCS_ASYNC_BLOCK(async); + + ucs_debug("ep %p: flags 0x%x close flushed callback for request %p", ep, + ep->flags, req); + if (ucp_ep_is_cm_local_connected(ep)) { + /* Now, when close flush is completed and we are still locally connected, + * we have to notify remote side */ + ucp_ep_cm_disconnect_cm_lane(ep); + if (ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED) { + /* Wait 
disconnect notification from remote side to complete this + * request */ + ucp_ep_set_close_request(ep, req, "close flushed callback"); + UCS_ASYNC_UNBLOCK(async); + return; + } + } + UCS_ASYNC_UNBLOCK(async); + +out: /* If a flush is completed from a pending/completion callback, we need to * schedule slow-path callback to release the endpoint later, since a UCT * endpoint cannot be released from pending/completion callback context. */ ucs_trace("adding slow-path callback to destroy ep %p", ep); req->send.disconnect.prog_id = UCS_CALLBACKQ_ID_NULL; - uct_worker_progress_register_safe(ep->worker->uct, ucp_ep_do_disconnect, + uct_worker_progress_register_safe(ep->worker->uct, + ucp_ep_local_disconnect_progress, req, UCS_CALLBACKQ_FLAG_ONESHOT, &req->send.disconnect.prog_id); } ucs_status_ptr_t ucp_ep_close_nb(ucp_ep_h ep, unsigned mode) { - ucp_worker_h worker = ep->worker; - void *request; + const ucp_request_param_t param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_FLAGS, + .flags = (mode == UCP_EP_CLOSE_MODE_FORCE) ? + UCP_EP_CLOSE_FLAG_FORCE : 0 + }; - if ((mode == UCP_EP_CLOSE_MODE_FORCE) && + return ucp_ep_close_nbx(ep, ¶m); +} + +ucs_status_ptr_t ucp_ep_close_nbx(ucp_ep_h ep, const ucp_request_param_t *param) +{ + ucp_worker_h worker = ep->worker; + int force; + void *request; + ucp_request_t *close_req; + unsigned uct_flags; + + force = ucp_request_param_flags(param) & UCP_EP_CLOSE_FLAG_FORCE; + if (force && !ucp_ep_has_cm_lane(ep) && (ucp_ep_config(ep)->key.err_mode != UCP_ERR_HANDLING_MODE_PEER)) { return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); } UCS_ASYNC_BLOCK(&worker->async); - request = ucp_ep_flush_internal(ep, - (mode == UCP_EP_CLOSE_MODE_FLUSH) ? - UCT_FLUSH_FLAG_LOCAL : UCT_FLUSH_FLAG_CANCEL, - NULL, 0, NULL, - ucp_ep_close_flushed_callback, "close"); + ep->flags |= UCP_EP_FLAG_CLOSED; + uct_flags = force ? 
UCT_FLUSH_FLAG_CANCEL : UCT_FLUSH_FLAG_LOCAL; + request = ucp_ep_flush_internal(ep, uct_flags, 0, + &ucp_request_null_param, NULL, + ucp_ep_close_flushed_callback, + "close"); if (!UCS_PTR_IS_PTR(request)) { - ucp_ep_disconnected(ep, mode == UCP_EP_CLOSE_MODE_FORCE); + if (ucp_ep_is_cm_local_connected(ep) && !force) { + /* lanes already flushed, start disconnect on CM lane */ + ucp_ep_cm_disconnect_cm_lane(ep); + close_req = ucp_ep_cm_close_request_get(ep); + if (close_req != NULL) { + request = close_req + 1; + ucp_ep_set_close_request(ep, close_req, "close"); + } else { + request = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + } + } else { + ucp_ep_disconnected(ep, force); + } } UCS_ASYNC_UNBLOCK(&worker->async); @@ -815,7 +964,7 @@ int ucp_ep_config_is_equal(const ucp_ep_config_key_t *key1, const ucp_ep_config_key_t *key2) { ucp_lane_index_t lane; - + int i; if ((key1->num_lanes != key2->num_lanes) || memcmp(key1->rma_lanes, key2->rma_lanes, sizeof(key1->rma_lanes)) || @@ -827,6 +976,8 @@ int ucp_ep_config_is_equal(const ucp_ep_config_key_t *key1, (key1->am_lane != key2->am_lane) || (key1->tag_lane != key2->tag_lane) || (key1->wireup_lane != key2->wireup_lane) || + (key1->cm_lane != key2->cm_lane) || + (key1->rkey_ptr_lane != key2->rkey_ptr_lane) || (key1->err_mode != key2->err_mode) || (key1->status != key2->status)) { @@ -836,12 +987,20 @@ int ucp_ep_config_is_equal(const ucp_ep_config_key_t *key1, for (lane = 0; lane < key1->num_lanes; ++lane) { if ((key1->lanes[lane].rsc_index != key2->lanes[lane].rsc_index) || (key1->lanes[lane].proxy_lane != key2->lanes[lane].proxy_lane) || - (key1->lanes[lane].dst_md_index != key2->lanes[lane].dst_md_index)) + (key1->lanes[lane].dst_md_index != key2->lanes[lane].dst_md_index) || + (key1->lanes[lane].path_index != key2->lanes[lane].path_index) || + (key1->lanes[lane].lane_types != key2->lanes[lane].lane_types)) { return 0; } } + for (i = 0; i < ucs_popcount(key1->reachable_md_map); ++i) { + if (key1->dst_md_cmpts[i] != 
key2->dst_md_cmpts[i]) { + return 0; + } + } + return 1; } @@ -862,8 +1021,12 @@ static void ucp_ep_config_calc_params(ucp_worker_h worker, memset(params, 0, sizeof(*params)); for (i = 0; (i < UCP_MAX_LANES) && (lanes[i] != UCP_NULL_LANE); i++) { - lane = lanes[i]; - rsc_index = config->key.lanes[lane].rsc_index; + lane = lanes[i]; + rsc_index = config->key.lanes[lane].rsc_index; + if (rsc_index == UCP_NULL_RESOURCE) { + continue; + } + md_index = config->md_index[lane]; iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); @@ -871,13 +1034,15 @@ static void ucp_ep_config_calc_params(ucp_worker_h worker, md_map |= UCS_BIT(md_index); md_attr = &context->tl_mds[md_index].attr; if (md_attr->cap.flags & UCT_MD_FLAG_REG) { - params->reg_growth += md_attr->reg_cost.growth; - params->reg_overhead += md_attr->reg_cost.overhead; + params->reg_growth += md_attr->reg_cost.m; + params->reg_overhead += md_attr->reg_cost.c; params->overhead += iface_attr->overhead; - params->latency += ucp_tl_iface_latency(context, iface_attr); + params->latency += ucp_tl_iface_latency(context, + &iface_attr->latency); } } - params->bw += worker->ifaces[rsc_index].attr.bandwidth; + + params->bw += ucp_tl_iface_bandwidth(context, &iface_attr->bandwidth); } } @@ -902,7 +1067,7 @@ static size_t ucp_ep_config_calc_rndv_thresh(ucp_worker_t *worker, ucp_ep_config_calc_params(worker, config, eager_lanes, &eager_zcopy); ucp_ep_config_calc_params(worker, config, rndv_lanes, &rndv); - if (!eager_zcopy.bw || !rndv.bw) { + if ((eager_zcopy.bw == 0) || (rndv.bw == 0)) { goto fallback; } @@ -910,7 +1075,7 @@ static size_t ucp_ep_config_calc_rndv_thresh(ucp_worker_t *worker, eager_iface_attr = ucp_worker_iface_get_attr(worker, eager_rsc_index); /* RTS/RTR latency is used from lanes[0] */ - rts_latency = ucp_tl_iface_latency(context, eager_iface_attr); + rts_latency = ucp_tl_iface_latency(context, &eager_iface_attr->latency); numerator = diff_percent * (rndv.reg_overhead * (1 + recv_reg_cost) + (2 * 
rts_latency) + (2 * rndv.latency) + @@ -943,20 +1108,76 @@ static size_t ucp_ep_thresh(size_t thresh_value, size_t min_value, return thresh; } -static void ucp_ep_config_set_am_rndv_thresh(ucp_worker_h worker, uct_iface_attr_t *iface_attr, - uct_md_attr_t *md_attr, ucp_ep_config_t *config, +static size_t ucp_ep_config_calc_rma_zcopy_thresh(ucp_worker_t *worker, + const ucp_ep_config_t *config, + const ucp_lane_index_t *rma_lanes) +{ + ucp_context_h context = worker->context; + double bcopy_bw = context->config.ext.bcopy_bw; + ucp_ep_thresh_params_t rma; + uct_md_attr_t *md_attr; + double numerator, denumerator; + double reg_overhead, reg_growth; + + ucp_ep_config_calc_params(worker, config, rma_lanes, &rma); + + if (rma.bw == 0) { + goto fallback; + } + + md_attr = &context->tl_mds[config->md_index[rma_lanes[0]]].attr; + if (md_attr->cap.flags & UCT_MD_FLAG_NEED_MEMH) { + reg_overhead = rma.reg_overhead; + reg_growth = rma.reg_growth; + } else { + reg_overhead = 0; + reg_growth = 0; + } + + numerator = reg_overhead; + denumerator = (1 / bcopy_bw) - reg_growth; + + if (denumerator > 0) { + return numerator / denumerator; + } + +fallback: + return SIZE_MAX; +} + +static void ucp_ep_config_adjust_max_short(ssize_t *max_short, + size_t thresh) +{ + *max_short = ucs_min((size_t)(*max_short + 1), thresh) - 1; + ucs_assert(*max_short >= -1); +} + +/* With tag offload, SW RNDV requests are temporarily stored in the receiver + * user buffer when matched. Thus, minimum message size allowed to be sent with + * RNDV protocol should be bigger than maximal possible SW RNDV request + * (i.e. header plus packed keys size). 
*/ +size_t ucp_ep_tag_offload_min_rndv_thresh(ucp_ep_config_t *config) +{ + return sizeof(ucp_rndv_rts_hdr_t) + config->tag.rndv.rkey_size; +} + +static void ucp_ep_config_set_am_rndv_thresh(ucp_worker_h worker, + uct_iface_attr_t *iface_attr, + uct_md_attr_t *md_attr, + ucp_ep_config_t *config, + size_t min_rndv_thresh, size_t max_rndv_thresh) { ucp_context_h context = worker->context; - size_t rndv_thresh, rndv_nbr_thresh; + size_t rndv_thresh, rndv_nbr_thresh, min_thresh; ucs_assert(config->key.am_lane != UCP_NULL_LANE); ucs_assert(config->key.lanes[config->key.am_lane].rsc_index != UCP_NULL_RESOURCE); - if (config->key.err_mode == UCP_ERR_HANDLING_MODE_PEER) { + if (!ucp_ep_config_test_rndv_support(config)) { /* Disable RNDV */ rndv_thresh = rndv_nbr_thresh = SIZE_MAX; - } else if (context->config.ext.rndv_thresh == UCS_CONFIG_MEMUNITS_AUTO) { + } else if (context->config.ext.rndv_thresh == UCS_MEMUNITS_AUTO) { /* auto - Make UCX calculate the AM rndv threshold on its own.*/ rndv_thresh = ucp_ep_config_calc_rndv_thresh(worker, config, config->key.am_bw_lanes, @@ -967,14 +1188,20 @@ static void ucp_ep_config_set_am_rndv_thresh(ucp_worker_h worker, uct_iface_attr } else { rndv_thresh = context->config.ext.rndv_thresh; rndv_nbr_thresh = context->config.ext.rndv_thresh; + + /* adjust max_short if rndv_thresh is set externally */ + ucp_ep_config_adjust_max_short(&config->tag.eager.max_short, + rndv_thresh); } + min_thresh = ucs_max(iface_attr->cap.am.min_zcopy, min_rndv_thresh); + config->tag.rndv.am_thresh = ucp_ep_thresh(rndv_thresh, - iface_attr->cap.am.min_zcopy, + min_thresh, max_rndv_thresh); config->tag.rndv_send_nbr.am_thresh = ucp_ep_thresh(rndv_nbr_thresh, - iface_attr->cap.am.min_zcopy, + min_thresh, max_rndv_thresh); ucs_trace("Active Message rndv threshold is %zu (send_nbr: %zu)", @@ -984,13 +1211,13 @@ static void ucp_ep_config_set_am_rndv_thresh(ucp_worker_h worker, uct_iface_attr static void ucp_ep_config_set_rndv_thresh(ucp_worker_t *worker, 
ucp_ep_config_t *config, ucp_lane_index_t *lanes, - uint64_t rndv_cap_flag, + size_t min_rndv_thresh, size_t max_rndv_thresh) { ucp_context_t *context = worker->context; ucp_lane_index_t lane = lanes[0]; ucp_rsc_index_t rsc_index; - size_t rndv_thresh, rndv_nbr_thresh; + size_t rndv_thresh, rndv_nbr_thresh, min_thresh; uct_iface_attr_t *iface_attr; if (lane == UCP_NULL_LANE) { @@ -1004,9 +1231,11 @@ static void ucp_ep_config_set_rndv_thresh(ucp_worker_t *worker, } iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); - ucs_assert_always(iface_attr->cap.flags & rndv_cap_flag); - if (context->config.ext.rndv_thresh == UCS_CONFIG_MEMUNITS_AUTO) { + if (!ucp_ep_config_test_rndv_support(config)) { + /* Disable RNDV */ + rndv_thresh = rndv_nbr_thresh = SIZE_MAX; + } else if (context->config.ext.rndv_thresh == UCS_MEMUNITS_AUTO) { /* auto - Make UCX calculate the RMA (get_zcopy) rndv threshold on its own.*/ rndv_thresh = ucp_ep_config_calc_rndv_thresh(worker, config, config->key.am_bw_lanes, @@ -1015,21 +1244,37 @@ static void ucp_ep_config_set_rndv_thresh(ucp_worker_t *worker, } else { rndv_thresh = context->config.ext.rndv_thresh; rndv_nbr_thresh = context->config.ext.rndv_thresh; + + /* adjust max_short if rndv_thresh is set externally */ + ucp_ep_config_adjust_max_short(&config->tag.eager.max_short, + rndv_thresh); } - config->tag.rndv.max_put_zcopy = iface_attr->cap.put.max_zcopy; - config->tag.rndv.rma_thresh = ucp_ep_thresh(rndv_thresh, - iface_attr->cap.get.min_zcopy, - max_rndv_thresh); + min_thresh = ucs_max(iface_attr->cap.get.min_zcopy, min_rndv_thresh); + + /* TODO: need to check minimal PUT Zcopy */ + config->tag.rndv.rma_thresh = ucp_ep_thresh(rndv_thresh, + min_thresh, + max_rndv_thresh); config->tag.rndv_send_nbr.rma_thresh = ucp_ep_thresh(rndv_nbr_thresh, - iface_attr->cap.get.min_zcopy, + min_thresh, max_rndv_thresh); ucs_trace("rndv threshold is %zu (send_nbr: %zu)", config->tag.rndv.rma_thresh, config->tag.rndv_send_nbr.rma_thresh); } +static 
void ucp_ep_config_set_memtype_thresh(ucp_memtype_thresh_t *max_eager_short, + ssize_t max_short, int num_mem_type_mds) +{ + if (!num_mem_type_mds) { + max_eager_short->memtype_off = max_short; + } + + max_eager_short->memtype_on = max_short; +} + static void ucp_ep_config_init_attrs(ucp_worker_t *worker, ucp_rsc_index_t rsc_index, ucp_ep_msg_config_t *config, size_t max_short, size_t max_bcopy, size_t max_zcopy, @@ -1046,8 +1291,7 @@ static void ucp_ep_config_init_attrs(ucp_worker_t *worker, ucp_rsc_index_t rsc_i iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); - if ((iface_attr->cap.flags & short_flag) && - (context->config.ext.enable_memtype_cache)) { + if ((iface_attr->cap.flags & short_flag)) { config->max_short = max_short - hdr_len; } else { config->max_short = -1; @@ -1055,24 +1299,27 @@ static void ucp_ep_config_init_attrs(ucp_worker_t *worker, ucp_rsc_index_t rsc_i if (iface_attr->cap.flags & bcopy_flag) { config->max_bcopy = max_bcopy; + } else { + config->max_bcopy = SIZE_MAX; } md_attr = &context->tl_mds[context->tl_rscs[rsc_index].md_index].attr; - if (!((iface_attr->cap.flags & zcopy_flag) && - (md_attr->cap.flags & UCT_MD_FLAG_REG))) { + if (!(iface_attr->cap.flags & zcopy_flag) || + ((md_attr->cap.flags & UCT_MD_FLAG_NEED_MEMH) && + !(md_attr->cap.flags & UCT_MD_FLAG_REG))) { return; } config->max_zcopy = max_zcopy; config->max_iov = ucs_min(UCP_MAX_IOV, max_iov); - if (context->config.ext.zcopy_thresh == UCS_CONFIG_MEMUNITS_AUTO) { + if (context->config.ext.zcopy_thresh == UCS_MEMUNITS_AUTO) { config->zcopy_auto_thresh = 1; for (it = 0; it < UCP_MAX_IOV; ++it) { - zcopy_thresh = ucp_ep_config_get_zcopy_auto_thresh(it + 1, - &md_attr->reg_cost, - context, - iface_attr->bandwidth); + zcopy_thresh = ucp_ep_config_get_zcopy_auto_thresh( + it + 1, &md_attr->reg_cost, context, + ucp_tl_iface_bandwidth(context, + &iface_attr->bandwidth)); zcopy_thresh = ucs_min(zcopy_thresh, adjust_min_val); config->sync_zcopy_thresh[it] = zcopy_thresh; 
config->zcopy_thresh[it] = zcopy_thresh; @@ -1081,13 +1328,14 @@ static void ucp_ep_config_init_attrs(ucp_worker_t *worker, ucp_rsc_index_t rsc_i config->zcopy_auto_thresh = 0; config->sync_zcopy_thresh[0] = config->zcopy_thresh[0] = ucs_min(context->config.ext.zcopy_thresh, adjust_min_val); + /* adjust max_short if zcopy_thresh is set externally */ - config->max_short = ucs_min(config->max_short, - (ssize_t)config->zcopy_thresh[0]); + ucp_ep_config_adjust_max_short(&config->max_short, + config->zcopy_thresh[0]); } - for (mem_type = 0; mem_type < UCT_MD_MEM_TYPE_LAST; mem_type++) { - if (UCP_MEM_IS_HOST(mem_type)) { + for (mem_type = 0; mem_type < UCS_MEMORY_TYPE_LAST; mem_type++) { + if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(mem_type)) { config->mem_type_zcopy_thresh[mem_type] = config->zcopy_thresh[0]; } else if (md_attr->cap.reg_mem_types & UCS_BIT(mem_type)) { config->mem_type_zcopy_thresh[mem_type] = 1; @@ -1095,21 +1343,51 @@ static void ucp_ep_config_init_attrs(ucp_worker_t *worker, ucp_rsc_index_t rsc_i } } -void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config) +static ucs_status_t ucp_ep_config_key_copy(ucp_ep_config_key_t *dst, + const ucp_ep_config_key_t *src) { - ucp_context_h context = worker->context; - ucp_lane_index_t tag_lanes[2] = {UCP_NULL_LANE, UCP_NULL_LANE}; + *dst = *src; + dst->dst_md_cmpts = ucs_calloc(ucs_popcount(src->reachable_md_map), + sizeof(*dst->dst_md_cmpts), + "ucp_dst_md_cmpts"); + if (dst->dst_md_cmpts == NULL) { + ucs_error("failed to allocate ucp_ep dest component list"); + return UCS_ERR_NO_MEMORY; + } + + memcpy(dst->dst_md_cmpts, src->dst_md_cmpts, + ucs_popcount(src->reachable_md_map) * sizeof(*dst->dst_md_cmpts)); + return UCS_OK; +} + +ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, + const ucp_ep_config_key_t *key) +{ + ucp_context_h context = worker->context; + ucp_lane_index_t tag_lanes[2] = {UCP_NULL_LANE, UCP_NULL_LANE}; + ucp_lane_index_t rkey_ptr_lanes[2] = {UCP_NULL_LANE, 
UCP_NULL_LANE}; + ucp_lane_index_t get_zcopy_lane_count; + ucp_lane_index_t put_zcopy_lane_count; ucp_ep_rma_config_t *rma_config; uct_iface_attr_t *iface_attr; uct_md_attr_t *md_attr; - uct_memory_type_t mem_type; + ucs_memory_type_t mem_type; ucp_rsc_index_t rsc_index; - ucp_lane_index_t lane; + ucp_lane_index_t lane, i; + size_t max_rndv_thresh, max_am_rndv_thresh; + size_t min_rndv_thresh, min_am_rndv_thresh; + size_t rma_zcopy_thresh; + double rndv_max_bw[UCS_MEMORY_TYPE_LAST], scale, bw; + ucs_status_t status; size_t it; - size_t max_rndv_thresh; - size_t max_am_rndv_thresh; - double rndv_max_bw; - int i; + uint8_t mem_type_index; + + memset(config, 0, sizeof(*config)); + + status = ucp_ep_config_key_copy(&config->key, key); + if (status != UCS_OK) { + goto err; + } /* Default settings */ for (it = 0; it < UCP_MAX_IOV; ++it) { @@ -1119,7 +1397,8 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config) config->tag.eager.sync_zcopy_thresh[it] = SIZE_MAX; } - for (mem_type = 0; mem_type < UCT_MD_MEM_TYPE_LAST; mem_type++) { + UCS_STATIC_ASSERT(UCS_MEMORY_TYPE_HOST == 0); + for (mem_type = UCS_MEMORY_TYPE_HOST; mem_type < UCS_MEMORY_TYPE_LAST; mem_type++) { config->am.mem_type_zcopy_thresh[mem_type] = SIZE_MAX; config->tag.eager.mem_type_zcopy_thresh[mem_type] = SIZE_MAX; } @@ -1132,24 +1411,38 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config) config->tag.proto = &ucp_tag_eager_proto; config->tag.sync_proto = &ucp_tag_eager_sync_proto; config->tag.rndv.rma_thresh = SIZE_MAX; + config->tag.rndv.min_get_zcopy = 0; config->tag.rndv.max_get_zcopy = SIZE_MAX; + config->tag.rndv.min_put_zcopy = 0; config->tag.rndv.max_put_zcopy = SIZE_MAX; config->tag.rndv.am_thresh = SIZE_MAX; config->tag.rndv_send_nbr.am_thresh = SIZE_MAX; config->tag.rndv_send_nbr.rma_thresh = SIZE_MAX; config->tag.rndv.rkey_size = ucp_rkey_packed_size(context, config->key.rma_bw_md_map); + for (lane = 0; lane < UCP_MAX_LANES; ++lane) { + 
config->tag.rndv.get_zcopy_lanes[lane] = UCP_NULL_LANE; + config->tag.rndv.put_zcopy_lanes[lane] = UCP_NULL_LANE; + } + + config->tag.rndv.rkey_ptr_dst_mds = 0; config->stream.proto = &ucp_stream_am_proto; - config->tag.offload.max_eager_short = -1; - config->tag.max_eager_short = -1; + config->am_u.proto = &ucp_am_proto; + config->am_u.reply_proto = &ucp_am_reply_proto; max_rndv_thresh = SIZE_MAX; max_am_rndv_thresh = SIZE_MAX; + min_am_rndv_thresh = 0; + + config->tag.offload.max_eager_short.memtype_on = -1; + config->tag.offload.max_eager_short.memtype_off = -1; + config->tag.max_eager_short.memtype_on = -1; + config->tag.max_eager_short.memtype_off = -1; for (lane = 0; lane < config->key.num_lanes; ++lane) { rsc_index = config->key.lanes[lane].rsc_index; if (rsc_index != UCP_NULL_RESOURCE) { config->md_index[lane] = context->tl_rscs[rsc_index].md_index; - if (ucp_worker_is_tl_p2p(worker, rsc_index)) { + if (ucp_ep_config_connect_p2p(worker, &config->key, rsc_index)) { config->p2p_lanes |= UCS_BIT(lane); } } else { @@ -1158,8 +1451,33 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config) } /* configuration for rndv */ - config->tag.rndv.min_get_zcopy = 0; - rndv_max_bw = 0; + get_zcopy_lane_count = 0; + put_zcopy_lane_count = 0; + + for (i = 0; i < UCS_MEMORY_TYPE_LAST; i++) { + rndv_max_bw[i] = 0; + } + + for (i = 0; (i < config->key.num_lanes) && + (config->key.rma_bw_lanes[i] != UCP_NULL_LANE); ++i) { + lane = config->key.rma_bw_lanes[i]; + rsc_index = config->key.lanes[lane].rsc_index; + if (rsc_index == UCP_NULL_RESOURCE) { + continue; + } + + md_attr = &context->tl_mds[context->tl_rscs[rsc_index].md_index].attr; + iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); + if (iface_attr->cap.flags & UCT_IFACE_FLAG_GET_ZCOPY) { + /* only GET Zcopy RNDV scheme supports multi-rail */ + bw = ucp_tl_iface_bandwidth(context, &iface_attr->bandwidth); + ucs_for_each_bit(mem_type_index, md_attr->cap.reg_mem_types) { + ucs_assert(mem_type_index 
< UCS_MEMORY_TYPE_LAST); + rndv_max_bw[mem_type_index] = ucs_max(rndv_max_bw[mem_type_index], bw); + } + } + } + for (i = 0; (i < config->key.num_lanes) && (config->key.rma_bw_lanes[i] != UCP_NULL_LANE); ++i) { lane = config->key.rma_bw_lanes[i]; @@ -1167,27 +1485,74 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config) if (rsc_index != UCP_NULL_RESOURCE) { iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); - config->tag.rndv.min_get_zcopy = ucs_max(config->tag.rndv.min_get_zcopy, - iface_attr->cap.get.min_zcopy); + md_attr = &context->tl_mds[context->tl_rscs[rsc_index].md_index].attr; + + /* GET Zcopy */ + if (iface_attr->cap.flags & UCT_IFACE_FLAG_GET_ZCOPY) { + ucs_for_each_bit(mem_type_index, md_attr->cap.reg_mem_types) { + ucs_assert(mem_type_index < UCS_MEMORY_TYPE_LAST); + scale = ucp_tl_iface_bandwidth(context, &iface_attr->bandwidth) / + rndv_max_bw[mem_type_index]; + if (scale < (1. / context->config.ext.multi_lane_max_ratio)) { + continue; + } + + config->tag.rndv.min_get_zcopy = ucs_max(config->tag.rndv.min_get_zcopy, + iface_attr->cap.get.min_zcopy); + + config->tag.rndv.max_get_zcopy = ucs_min(config->tag.rndv.max_get_zcopy, + iface_attr->cap.get.max_zcopy); + ucs_assert(get_zcopy_lane_count < UCP_MAX_LANES); + config->tag.rndv.get_zcopy_lanes[get_zcopy_lane_count++] = lane; + config->tag.rndv.scale[lane] = scale; + break; + } + } - config->tag.rndv.max_get_zcopy = ucs_min(config->tag.rndv.max_get_zcopy, - iface_attr->cap.get.max_zcopy); + /* PUT Zcopy */ + if (iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_ZCOPY) { + config->tag.rndv.min_put_zcopy = ucs_max(config->tag.rndv.min_put_zcopy, + iface_attr->cap.put.min_zcopy); - rndv_max_bw = ucs_max(rndv_max_bw, iface_attr->bandwidth); + config->tag.rndv.max_put_zcopy = ucs_min(config->tag.rndv.max_put_zcopy, + iface_attr->cap.put.max_zcopy); + ucs_assert(put_zcopy_lane_count < UCP_MAX_LANES); + config->tag.rndv.put_zcopy_lanes[put_zcopy_lane_count++] = lane; + } } } - if 
(rndv_max_bw > 0) { - for (i = 0; (i < config->key.num_lanes) && - (config->key.rma_bw_lanes[i] != UCP_NULL_LANE); ++i) { - lane = config->key.rma_bw_lanes[i]; - rsc_index = config->key.lanes[lane].rsc_index; + if (get_zcopy_lane_count == 0) { + /* if there are no RNDV RMA BW lanes that support GET Zcopy, reset + * min/max values to show that the scheme is unsupported */ + config->tag.rndv.min_get_zcopy = SIZE_MAX; + config->tag.rndv.max_get_zcopy = 0; + config->tag.rndv.get_zcopy_split = 0; + } else { + config->tag.rndv.get_zcopy_split = config->tag.rndv.min_get_zcopy <= + (config->tag.rndv.max_get_zcopy / 2); + } - if (rsc_index != UCP_NULL_RESOURCE) { - iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); - config->tag.rndv.scale[lane] = iface_attr->bandwidth / rndv_max_bw; - } - } + if (put_zcopy_lane_count == 0) { + /* if there are no RNDV RMA BW lanes that support PUT Zcopy, reset + * min/max values to show that the scheme is unsupported */ + config->tag.rndv.min_put_zcopy = SIZE_MAX; + config->tag.rndv.max_put_zcopy = 0; + config->tag.rndv.put_zcopy_split = 0; + } else { + config->tag.rndv.put_zcopy_split = config->tag.rndv.min_put_zcopy <= + (config->tag.rndv.max_put_zcopy / 2); + } + + /* Rkey ptr */ + if (key->rkey_ptr_lane != UCP_NULL_LANE) { + lane = key->rkey_ptr_lane; + rsc_index = config->key.lanes[lane].rsc_index; + md_attr = &context->tl_mds[context->tl_rscs[rsc_index].md_index].attr; + ucs_assert_always(md_attr->cap.flags & UCT_MD_FLAG_RKEY_PTR); + + config->tag.rndv.rkey_ptr_dst_mds = + UCS_BIT(config->key.lanes[lane].dst_md_index); } /* Configuration for tag offload */ @@ -1206,29 +1571,34 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config) UCT_IFACE_FLAG_TAG_EAGER_ZCOPY, 0, iface_attr->cap.tag.eager.max_bcopy); - config->tag.offload.max_rndv_iov = iface_attr->cap.tag.rndv.max_iov; - config->tag.offload.max_rndv_zcopy = iface_attr->cap.tag.rndv.max_zcopy; - config->tag.offload.max_eager_short = 
config->tag.eager.max_short; - config->tag.sync_proto = &ucp_tag_offload_sync_proto; - config->tag.proto = &ucp_tag_offload_proto; - config->tag.lane = lane; - max_rndv_thresh = iface_attr->cap.tag.eager.max_zcopy; - max_am_rndv_thresh = iface_attr->cap.tag.eager.max_bcopy; + config->tag.offload.max_rndv_iov = iface_attr->cap.tag.rndv.max_iov; + config->tag.offload.max_rndv_zcopy = iface_attr->cap.tag.rndv.max_zcopy; + config->tag.sync_proto = &ucp_tag_offload_sync_proto; + config->tag.proto = &ucp_tag_offload_proto; + config->tag.lane = lane; + max_rndv_thresh = iface_attr->cap.tag.eager.max_zcopy; + max_am_rndv_thresh = iface_attr->cap.tag.eager.max_bcopy; + min_rndv_thresh = ucp_ep_tag_offload_min_rndv_thresh(config); + min_am_rndv_thresh = min_rndv_thresh; ucs_assert_always(iface_attr->cap.tag.rndv.max_hdr >= sizeof(ucp_tag_offload_unexp_rndv_hdr_t)); - ucs_assert_always(config->tag.offload.max_eager_short >= 0); if (config->key.am_lane != UCP_NULL_LANE) { /* Must have active messages for using rendezvous */ tag_lanes[0] = lane; ucp_ep_config_set_rndv_thresh(worker, config, tag_lanes, - UCT_IFACE_FLAG_TAG_RNDV_ZCOPY, - max_rndv_thresh); + min_rndv_thresh, max_rndv_thresh); } + + /* Max Eager short has to be set after Zcopy and RNDV thresholds */ + ucp_ep_config_set_memtype_thresh(&config->tag.offload.max_eager_short, + config->tag.eager.max_short, + context->num_mem_type_detect_mds); } } + /* Configuration for active messages */ if (config->key.am_lane != UCP_NULL_LANE) { lane = config->key.am_lane; rsc_index = config->key.lanes[lane].rsc_index; @@ -1245,11 +1615,6 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config) UCT_IFACE_FLAG_AM_ZCOPY, sizeof(ucp_eager_hdr_t), SIZE_MAX); - /* Calculate rndv threshold for AM Rendezvous, which may be used by - * any tag-matching protocol (AM and offload). */ - ucp_ep_config_set_am_rndv_thresh(worker, iface_attr, md_attr, config, - max_am_rndv_thresh); - /* All keys must fit in RNDV packet. 
* TODO remove some MDs if they don't */ @@ -1259,29 +1624,56 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config) /* Tag offload is disabled, AM will be used for all * tag-matching protocols */ /* TODO: set threshold level based on all available lanes */ - ucp_ep_config_set_rndv_thresh(worker, config, - config->key.rma_bw_lanes, - UCT_IFACE_FLAG_GET_ZCOPY, - max_rndv_thresh); - config->tag.eager = config->am; - config->tag.lane = lane; - config->tag.max_eager_short = config->tag.eager.max_short; + + config->tag.eager = config->am; + config->tag.lane = lane; + min_rndv_thresh = iface_attr->cap.get.min_zcopy; + min_am_rndv_thresh = iface_attr->cap.am.min_zcopy; + + if (config->key.rkey_ptr_lane != UCP_NULL_LANE) { + rkey_ptr_lanes[0] = config->key.rkey_ptr_lane; + ucp_ep_config_set_rndv_thresh(worker, config, + rkey_ptr_lanes, + min_rndv_thresh, + max_rndv_thresh); + } else { + ucp_ep_config_set_rndv_thresh(worker, config, + config->key.rma_bw_lanes, + min_rndv_thresh, + max_rndv_thresh); + } + + /* Max Eager short has to be set after Zcopy and RNDV thresholds */ + ucp_ep_config_set_memtype_thresh(&config->tag.max_eager_short, + config->tag.eager.max_short, + context->num_mem_type_detect_mds); } + + /* Calculate rndv threshold for AM Rendezvous, which may be used by + * any tag-matching protocol (AM and offload). 
*/ + ucp_ep_config_set_am_rndv_thresh(worker, iface_attr, md_attr, config, + min_am_rndv_thresh, + max_am_rndv_thresh); } else { /* Stub endpoint */ - config->am.max_bcopy = UCP_MIN_BCOPY; - } + config->am.max_bcopy = UCP_MIN_BCOPY; + config->tag.eager.max_bcopy = UCP_MIN_BCOPY; + config->tag.lane = lane; + } } memset(&config->rma, 0, sizeof(config->rma)); + rma_zcopy_thresh = ucp_ep_config_calc_rma_zcopy_thresh(worker, config, + config->key.rma_lanes); + /* Configuration for remote memory access */ for (lane = 0; lane < config->key.num_lanes; ++lane) { rma_config = &config->rma[lane]; rma_config->put_zcopy_thresh = SIZE_MAX; rma_config->get_zcopy_thresh = SIZE_MAX; - rma_config->max_put_short = SIZE_MAX; - rma_config->max_get_short = SIZE_MAX; + rma_config->max_put_short = -1; + rma_config->max_get_short = -1; rma_config->max_put_bcopy = SIZE_MAX; rma_config->max_get_bcopy = SIZE_MAX; @@ -1294,13 +1686,19 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config) if (rsc_index != UCP_NULL_RESOURCE) { iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); /* PUT */ + if (iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_SHORT) { + rma_config->max_put_short = iface_attr->cap.put.max_short; + } if (iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_ZCOPY) { - rma_config->max_put_zcopy = iface_attr->cap.put.max_zcopy; - /* TODO: formula */ - if (context->config.ext.zcopy_thresh == UCS_CONFIG_MEMUNITS_AUTO) { - rma_config->put_zcopy_thresh = 16384; + rma_config->max_put_zcopy = iface_attr->cap.put.max_zcopy; + if (context->config.ext.zcopy_thresh == UCS_MEMUNITS_AUTO) { + /* TODO: Use calculated value for PUT Zcopy threshold */ + rma_config->put_zcopy_thresh = 16384; } else { - rma_config->put_zcopy_thresh = context->config.ext.zcopy_thresh; + rma_config->put_zcopy_thresh = context->config.ext.zcopy_thresh; + + ucp_ep_config_adjust_max_short(&rma_config->max_put_short, + rma_config->put_zcopy_thresh); } rma_config->put_zcopy_thresh = 
ucs_max(rma_config->put_zcopy_thresh, iface_attr->cap.put.min_zcopy); @@ -1309,19 +1707,20 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config) rma_config->max_put_bcopy = ucs_min(iface_attr->cap.put.max_bcopy, rma_config->put_zcopy_thresh); } - if (iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_SHORT) { - rma_config->max_put_short = ucs_min(iface_attr->cap.put.max_short, - rma_config->max_put_bcopy); - } /* GET */ + if (iface_attr->cap.flags & UCT_IFACE_FLAG_GET_SHORT) { + rma_config->max_get_short = iface_attr->cap.get.max_short; + } if (iface_attr->cap.flags & UCT_IFACE_FLAG_GET_ZCOPY) { - /* TODO: formula */ rma_config->max_get_zcopy = iface_attr->cap.get.max_zcopy; - if (context->config.ext.zcopy_thresh == UCS_CONFIG_MEMUNITS_AUTO) { - rma_config->get_zcopy_thresh = 16384; + if (context->config.ext.zcopy_thresh == UCS_MEMUNITS_AUTO) { + rma_config->get_zcopy_thresh = rma_zcopy_thresh; } else { - rma_config->get_zcopy_thresh = context->config.ext.zcopy_thresh; + rma_config->get_zcopy_thresh = context->config.ext.zcopy_thresh; + + ucp_ep_config_adjust_max_short(&rma_config->max_get_short, + rma_config->get_zcopy_thresh); } rma_config->get_zcopy_thresh = ucs_max(rma_config->get_zcopy_thresh, iface_attr->cap.get.min_zcopy); @@ -1330,14 +1729,25 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config) rma_config->max_get_bcopy = ucs_min(iface_attr->cap.get.max_bcopy, rma_config->get_zcopy_thresh); } - if (iface_attr->cap.flags & UCT_IFACE_FLAG_GET_SHORT) { - rma_config->max_get_short = ucs_min(iface_attr->cap.get.max_short, - rma_config->max_get_bcopy); - } - } else { - rma_config->max_put_bcopy = UCP_MIN_BCOPY; /* Stub endpoint */ } } + + return UCS_OK; + +err: + return status; +} + +void ucp_ep_config_cleanup(ucp_worker_h worker, ucp_ep_config_t *config) +{ + ucs_free(config->key.dst_md_cmpts); +} + +static int ucp_ep_is_short_lower_thresh(ssize_t max_short, + size_t thresh) +{ + return ((max_short < 0) || + (((size_t)max_short + 
1) < thresh)); } static void ucp_ep_config_print_tag_proto(FILE *stream, const char *name, @@ -1346,34 +1756,44 @@ static void ucp_ep_config_print_tag_proto(FILE *stream, const char *name, size_t rndv_rma_thresh, size_t rndv_am_thresh) { - size_t max_bcopy, min_rndv; + size_t max_bcopy, min_rndv, max_short; + + min_rndv = ucs_min(rndv_rma_thresh, rndv_am_thresh); + max_bcopy = ucs_min(zcopy_thresh, min_rndv); fprintf(stream, "# %23s: 0", name); + + /* print eager short */ if (max_eager_short > 0) { - fprintf(stream, "....%zd" , max_eager_short + 1); + max_short = max_eager_short; + ucs_assert(max_short <= SSIZE_MAX); + fprintf(stream, "....%zu" , max_short + 1); + } else if (!max_eager_short) { + fprintf(stream, "....%zu" , max_eager_short); } - min_rndv = ucs_min(rndv_rma_thresh, rndv_am_thresh); - max_bcopy = ucs_min(zcopy_thresh, min_rndv); - - /* Check whether maximum Eager short attribute is negative or not - * before comparing it with maximum Bcopy attribute (unsigned) */ - if ((max_eager_short < 0) || ((size_t)max_eager_short < max_bcopy)) { + /* print eager bcopy */ + if (ucp_ep_is_short_lower_thresh(max_eager_short, max_bcopy) && max_bcopy) { fprintf(stream, "...."); if (max_bcopy < SIZE_MAX) { fprintf(stream, "%zu", max_bcopy); } } - if (zcopy_thresh < min_rndv) { + + /* print eager zcopy */ + if (ucp_ep_is_short_lower_thresh(max_eager_short, min_rndv) && + (zcopy_thresh < min_rndv)) { fprintf(stream, "...."); if (min_rndv < SIZE_MAX) { fprintf(stream, "%zu", min_rndv); } } + /* print rendezvous */ if (min_rndv < SIZE_MAX) { fprintf(stream, "...."); } + fprintf(stream, "(inf)\n"); } @@ -1381,7 +1801,6 @@ static void ucp_ep_config_print_rma_proto(FILE *stream, const char *name, ucp_lane_index_t lane, size_t bcopy_thresh, size_t zcopy_thresh) { - fprintf(stream, "# %20s[%d]: 0", name, lane); if (bcopy_thresh > 0) { fprintf(stream, ".."); @@ -1393,7 +1812,10 @@ static void ucp_ep_config_print_rma_proto(FILE *stream, const char *name, fprintf(stream, ".."); } 
if (zcopy_thresh < SIZE_MAX) { - fprintf(stream, "..%zu..", zcopy_thresh); + if (zcopy_thresh > 0) { + fprintf(stream, "..%zu", zcopy_thresh); + } + fprintf(stream, ".."); } fprintf(stream, "..(inf)\n"); } @@ -1412,7 +1834,7 @@ int ucp_ep_config_get_multi_lane_prio(const ucp_lane_index_t *lanes, void ucp_ep_config_lane_info_str(ucp_context_h context, const ucp_ep_config_key_t *key, - const uint8_t *addr_indices, + const unsigned *addr_indices, ucp_lane_index_t lane, ucp_rsc_index_t aux_rsc_index, char *buf, size_t max) @@ -1420,6 +1842,9 @@ void ucp_ep_config_lane_info_str(ucp_context_h context, uct_tl_resource_desc_t *rsc; ucp_rsc_index_t rsc_index; ucp_lane_index_t proxy_lane; + ucp_md_index_t dst_md_index; + ucp_rsc_index_t cmpt_index; + unsigned path_index; char *p, *endp; char *desc_str; int prio; @@ -1436,8 +1861,9 @@ void ucp_ep_config_lane_info_str(ucp_context_h context, } else { desc_str = ""; } - snprintf(p, endp - p, "lane[%d]: %2d:" UCT_TL_RESOURCE_DESC_FMT " md[%d]%s %-*c-> ", - lane, rsc_index, UCT_TL_RESOURCE_DESC_ARG(rsc), + path_index = key->lanes[lane].path_index; + snprintf(p, endp - p, "lane[%d]: %2d:" UCT_TL_RESOURCE_DESC_FMT ".%u md[%d]%s %-*c-> ", + lane, rsc_index, UCT_TL_RESOURCE_DESC_ARG(rsc), path_index, context->tl_rscs[rsc_index].md_index, desc_str, 20 - (int)(strlen(rsc->dev_name) + strlen(rsc->tl_name) + strlen(desc_str)), ' '); @@ -1454,7 +1880,10 @@ void ucp_ep_config_lane_info_str(ucp_context_h context, p += strlen(p); } - snprintf(p, endp - p, "md[%d]", key->lanes[lane].dst_md_index); + dst_md_index = key->lanes[lane].dst_md_index; + cmpt_index = ucp_ep_config_get_dst_md_cmpt(key, dst_md_index); + snprintf(p, endp - p, "md[%d]/%-8s", dst_md_index, + context->tl_cmpts[cmpt_index].attr.name); p += strlen(p); prio = ucp_ep_config_get_multi_lane_prio(key->rma_lanes, lane); @@ -1480,6 +1909,11 @@ void ucp_ep_config_lane_info_str(ucp_context_h context, p += strlen(p); } + if (key->rkey_ptr_lane == lane) { + snprintf(p, endp - p, " 
rkey_ptr"); + p += strlen(p); + } + prio = ucp_ep_config_get_multi_lane_prio(key->am_bw_lanes, lane); if (prio != -1) { snprintf(p, endp - p, " am_bw#%d", prio); @@ -1497,19 +1931,17 @@ void ucp_ep_config_lane_info_str(ucp_context_h context, if (aux_rsc_index != UCP_NULL_RESOURCE) { snprintf(p, endp - p, "{" UCT_TL_RESOURCE_DESC_FMT "}", UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[aux_rsc_index].tl_rsc)); - p += strlen(p); } } } static void ucp_ep_config_print(FILE *stream, ucp_worker_h worker, const ucp_ep_config_t *config, - const uint8_t *addr_indices, + const unsigned *addr_indices, ucp_rsc_index_t aux_rsc_index) { ucp_context_h context = worker->context; char lane_info[128] = {0}; - const ucp_ep_msg_config_t *tag_config; ucp_md_index_t md_index; ucp_lane_index_t lane; @@ -1521,25 +1953,24 @@ static void ucp_ep_config_print(FILE *stream, ucp_worker_h worker, fprintf(stream, "#\n"); if (context->config.features & UCP_FEATURE_TAG) { - tag_config = (ucp_ep_is_tag_offload_enabled((ucp_ep_config_t *)config)) ? 
- &config->tag.eager : &config->am; - ucp_ep_config_print_tag_proto(stream, "tag_send", - tag_config->max_short, - tag_config->zcopy_thresh[0], - config->tag.rndv.rma_thresh, - config->tag.rndv.am_thresh); - ucp_ep_config_print_tag_proto(stream, "tag_send_nbr", - tag_config->max_short, - /* disable zcopy */ - config->tag.rndv_send_nbr.rma_thresh, - config->tag.rndv_send_nbr.rma_thresh, - config->tag.rndv_send_nbr.am_thresh); - ucp_ep_config_print_tag_proto(stream, "tag_send_sync", - tag_config->max_short, - tag_config->sync_zcopy_thresh[0], - config->tag.rndv.rma_thresh, - config->tag.rndv.am_thresh); - } + ucp_ep_config_print_tag_proto(stream, "tag_send", + config->tag.eager.max_short, + config->tag.eager.zcopy_thresh[0], + config->tag.rndv.rma_thresh, + config->tag.rndv.am_thresh); + ucp_ep_config_print_tag_proto(stream, "tag_send_nbr", + config->tag.eager.max_short, + /* disable zcopy */ + ucs_min(config->tag.rndv_send_nbr.rma_thresh, + config->tag.rndv_send_nbr.am_thresh), + config->tag.rndv_send_nbr.rma_thresh, + config->tag.rndv_send_nbr.am_thresh); + ucp_ep_config_print_tag_proto(stream, "tag_send_sync", + config->tag.eager.max_short, + config->tag.eager.sync_zcopy_thresh[0], + config->tag.rndv.rma_thresh, + config->tag.rndv.am_thresh); + } if (context->config.features & UCP_FEATURE_RMA) { for (lane = 0; lane < config->key.num_lanes; ++lane) { @@ -1547,8 +1978,7 @@ static void ucp_ep_config_print(FILE *stream, ucp_worker_h worker, continue; } ucp_ep_config_print_rma_proto(stream, "put", lane, - ucs_max(config->rma[lane].max_put_short + 1, - config->bcopy_thresh), + config->rma[lane].max_put_short + 1, config->rma[lane].put_zcopy_thresh); ucp_ep_config_print_rma_proto(stream, "get", lane, 0, config->rma[lane].get_zcopy_thresh); @@ -1600,19 +2030,102 @@ void ucp_ep_print_info(ucp_ep_h ep, FILE *stream) } size_t ucp_ep_config_get_zcopy_auto_thresh(size_t iovcnt, - const uct_linear_growth_t *reg_cost, + const ucs_linear_func_t *reg_cost, const ucp_context_h 
context, double bandwidth) { double zcopy_thresh; double bcopy_bw = context->config.ext.bcopy_bw; - zcopy_thresh = (iovcnt * reg_cost->overhead) / - ((1.0 / bcopy_bw) - (1.0 / bandwidth) - (iovcnt * reg_cost->growth)); + zcopy_thresh = (iovcnt * reg_cost->c) / + ((1.0 / bcopy_bw) - (1.0 / bandwidth) - (iovcnt * reg_cost->m)); - if ((zcopy_thresh < 0.0) || (zcopy_thresh > SIZE_MAX)) { + if (zcopy_thresh < 0.0) { return SIZE_MAX; } return zcopy_thresh; } + +ucp_wireup_ep_t * ucp_ep_get_cm_wireup_ep(ucp_ep_h ep) +{ + ucp_lane_index_t lane; + + if (ep->cfg_index == UCP_NULL_CFG_INDEX) { + return NULL; + } + + lane = ucp_ep_get_cm_lane(ep); + if (lane == UCP_NULL_LANE) { + return NULL; + } + + return ucp_wireup_ep_test(ep->uct_eps[lane]) ? + ucs_derived_of(ep->uct_eps[lane], ucp_wireup_ep_t) : NULL; +} + +uct_ep_h ucp_ep_get_cm_uct_ep(ucp_ep_h ep) +{ + ucp_lane_index_t lane; + ucp_wireup_ep_t *wireup_ep; + + lane = ucp_ep_get_cm_lane(ep); + if (lane == UCP_NULL_LANE) { + return NULL; + } + + wireup_ep = ucp_ep_get_cm_wireup_ep(ep); + return (wireup_ep == NULL) ? 
ep->uct_eps[lane] : wireup_ep->super.uct_ep; +} + +int ucp_ep_is_cm_local_connected(ucp_ep_h ep) +{ + return ucp_ep_has_cm_lane(ep) && (ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED); +} + +uint64_t ucp_ep_get_tl_bitmap(ucp_ep_h ep) +{ + uint64_t tl_bitmap = 0; + ucp_lane_index_t lane; + ucp_rsc_index_t rsc_idx; + + for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) { + if (lane == ucp_ep_get_cm_lane(ep)) { + continue; + } + + rsc_idx = ucp_ep_get_rsc_index(ep, lane); + if (rsc_idx == UCP_NULL_RESOURCE) { + continue; + } + + tl_bitmap |= UCS_BIT(rsc_idx); + } + + return tl_bitmap; +} + +void ucp_ep_invoke_err_cb(ucp_ep_h ep, ucs_status_t status) +{ + /* Do not invoke error handler if it's not enabled */ + if ((ucp_ep_config(ep)->key.err_mode == UCP_ERR_HANDLING_MODE_NONE) || + /* error callback is not set */ + (ucp_ep_ext_gen(ep)->err_cb == NULL) || + /* the EP has been closed by user, or error callback already called */ + (ep->flags & (UCP_EP_FLAG_CLOSED | UCP_EP_FLAG_ERR_HANDLER_INVOKED))) { + return; + } + + ucs_assert(ep->flags & UCP_EP_FLAG_USED); + ucs_debug("ep %p: calling user error callback %p with arg %p and status %s", + ep, ucp_ep_ext_gen(ep)->err_cb, ucp_ep_ext_gen(ep)->user_data, + ucs_status_string(status)); + ep->flags |= UCP_EP_FLAG_ERR_HANDLER_INVOKED; + ucp_ep_ext_gen(ep)->err_cb(ucp_ep_ext_gen(ep)->user_data, ep, status); +} + +int ucp_ep_config_test_rndv_support(const ucp_ep_config_t *config) +{ + return (config->key.err_mode == UCP_ERR_HANDLING_MODE_NONE) || + (config->key.cm_lane != UCP_NULL_LANE); +} diff --git a/src/ucp/core/ucp_ep.h b/src/ucp/core/ucp_ep.h index 1246d2a6564..ac7aac1a9d0 100644 --- a/src/ucp/core/ucp_ep.h +++ b/src/ucp/core/ucp_ep.h @@ -1,5 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. + * Copyright (C) Los Alamos National Security, LLC. 2019 ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -9,21 +10,25 @@ #include "ucp_types.h" +#include #include #include #include #include #include +#include + #define UCP_MAX_IOV 16UL /* Configuration */ typedef uint16_t ucp_ep_cfg_index_t; +#define UCP_NULL_CFG_INDEX UINT16_MAX /* Endpoint flags type */ -#if ENABLE_DEBUG_DATA || ENABLE_ASSERT +#if ENABLE_DEBUG_DATA || UCS_ENABLE_ASSERT typedef uint32_t ucp_ep_flags_t; #else typedef uint16_t ucp_ep_flags_t; @@ -34,7 +39,11 @@ typedef uint16_t ucp_ep_flags_t; * Endpoint flags */ enum { - UCP_EP_FLAG_LOCAL_CONNECTED = UCS_BIT(0), /* All local endpoints are connected */ + UCP_EP_FLAG_LOCAL_CONNECTED = UCS_BIT(0), /* All local endpoints are connected, + for CM case - local address was packed, + UCT did not report errors during + connection establishment protocol + and disconnect not called yet */ UCP_EP_FLAG_REMOTE_CONNECTED = UCS_BIT(1), /* All remote endpoints are connected */ UCP_EP_FLAG_CONNECT_REQ_QUEUED = UCS_BIT(2), /* Connection request was queued */ UCP_EP_FLAG_FAILED = UCS_BIT(3), /* EP is in failed state */ @@ -47,6 +56,9 @@ enum { worker address from the client) */ UCP_EP_FLAG_CONNECT_PRE_REQ_QUEUED = UCS_BIT(9), /* Pre-Connection request was queued */ UCP_EP_FLAG_CLOSED = UCS_BIT(10),/* EP was closed */ + UCP_EP_FLAG_CLOSE_REQ_VALID = UCS_BIT(11),/* close protocol is started and + close_req is valid */ + UCP_EP_FLAG_ERR_HANDLER_INVOKED = UCS_BIT(12),/* error handler was called */ /* DEBUG bits */ UCP_EP_FLAG_CONNECT_REQ_SENT = UCS_BIT(16),/* DEBUG: Connection request was sent */ @@ -58,7 +70,6 @@ enum { to the remote peer when starting connection establishment on this EP */ UCP_EP_FLAG_FLUSH_STATE_VALID = UCS_BIT(22) /* DEBUG: flush_state is valid */ - }; @@ -78,7 +89,13 @@ enum { */ enum { UCP_EP_INIT_FLAG_MEM_TYPE = UCS_BIT(0), /**< Endpoint for local mem type transfers */ - UCP_EP_CREATE_AM_LANE = UCS_BIT(1) /**< Endpoint requires an AM lane */ + UCP_EP_INIT_CREATE_AM_LANE = UCS_BIT(1), /**< Endpoint requires an AM lane */ + 
UCP_EP_INIT_CM_WIREUP_CLIENT = UCS_BIT(2), /**< Endpoint wireup protocol is based on CM, + client side */ + UCP_EP_INIT_CM_WIREUP_SERVER = UCS_BIT(3), /**< Endpoint wireup protocol is based on CM, + server side */ + UCP_EP_INIT_ERR_MODE_PEER_FAILURE = UCS_BIT(4) /**< Endpoint requires an + @ref UCP_ERR_HANDLING_MODE_PEER */ }; @@ -91,58 +108,70 @@ enum { * This is filled by to the transport selection logic, according to the local * resources and set of remote addresses. */ -typedef struct ucp_ep_config_key { +struct ucp_ep_config_key { - ucp_lane_index_t num_lanes; /* Number of active lanes */ + ucp_lane_index_t num_lanes; /* Number of active lanes */ struct { - ucp_rsc_index_t rsc_index; /* Resource index */ - ucp_lane_index_t proxy_lane; /* UCP_NULL_LANE - no proxy + ucp_rsc_index_t rsc_index; /* Resource index */ + ucp_lane_index_t proxy_lane; /* UCP_NULL_LANE - no proxy otherwise - in which lane the real transport endpoint is stored */ - ucp_md_index_t dst_md_index; /* Destination memory domain index */ + ucp_md_index_t dst_md_index; /* Destination memory domain index */ + uint8_t path_index; /* Device path index */ + ucp_lane_type_mask_t lane_types; /* Which types of operations this lane + was selected for */ } lanes[UCP_MAX_LANES]; - ucp_lane_index_t am_lane; /* Lane for AM (can be NULL) */ - ucp_lane_index_t tag_lane; /* Lane for tag matching offload (can be NULL) */ - ucp_lane_index_t wireup_lane; /* Lane for wireup messages (can be NULL) */ + ucp_lane_index_t am_lane; /* Lane for AM (can be NULL) */ + ucp_lane_index_t tag_lane; /* Lane for tag matching offload (can be NULL) */ + ucp_lane_index_t wireup_lane; /* Lane for wireup messages (can be NULL) */ + ucp_lane_index_t cm_lane; /* Lane for holding a CM connection */ /* Lanes for remote memory access, sorted by priority, highest first */ - ucp_lane_index_t rma_lanes[UCP_MAX_LANES]; + ucp_lane_index_t rma_lanes[UCP_MAX_LANES]; /* Lanes for high-bw memory access, sorted by priority, highest first */ - 
ucp_lane_index_t rma_bw_lanes[UCP_MAX_LANES]; + ucp_lane_index_t rma_bw_lanes[UCP_MAX_LANES]; + + /* Lane for obtaining remote memory pointer */ + ucp_lane_index_t rkey_ptr_lane; /* Lanes for atomic operations, sorted by priority, highest first */ - ucp_lane_index_t amo_lanes[UCP_MAX_LANES]; + ucp_lane_index_t amo_lanes[UCP_MAX_LANES]; /* Lanes for high-bw active messages, sorted by priority, highest first */ - ucp_lane_index_t am_bw_lanes[UCP_MAX_LANES]; + ucp_lane_index_t am_bw_lanes[UCP_MAX_LANES]; /* Local memory domains to send remote keys for in high-bw rma protocols * NOTE: potentially it can be different than what is imposed by rma_bw_lanes, * since these are the MDs used by remote side for accessing our memory. */ - ucp_md_map_t rma_bw_md_map; + ucp_md_map_t rma_bw_md_map; /* Bitmap of remote mds which are reachable from this endpoint (with any set * of transports which could be selected in the future). */ - ucp_md_map_t reachable_md_map; + ucp_md_map_t reachable_md_map; + + /* Array with popcount(reachable_md_map) elements, each entry holds the local + * component index to be used for unpacking remote key from each set bit in + * reachable_md_map */ + ucp_rsc_index_t *dst_md_cmpts; /* Error handling mode */ - ucp_err_handling_mode_t err_mode; - ucs_status_t status; -} ucp_ep_config_key_t; + ucp_err_handling_mode_t err_mode; + ucs_status_t status; +}; /* * Configuration for RMA protocols */ typedef struct ucp_ep_rma_config { - size_t max_put_short; /* Maximal payload of put short */ + ssize_t max_put_short; /* Maximal payload of put short */ size_t max_put_bcopy; /* Maximal total size of put_bcopy */ size_t max_put_zcopy; - size_t max_get_short; /* Maximal payload of get short */ + ssize_t max_get_short; /* Maximal payload of get short */ size_t max_get_bcopy; /* Maximal total size of get_bcopy */ size_t max_get_zcopy; size_t put_zcopy_thresh; @@ -163,7 +192,7 @@ typedef struct ucp_ep_msg_config { size_t zcopy_thresh[UCP_MAX_IOV]; /* zero-copy threshold 
for mem type buffers */ - size_t mem_type_zcopy_thresh[UCT_MD_MEM_TYPE_LAST]; + size_t mem_type_zcopy_thresh[UCS_MEMORY_TYPE_LAST]; /* zero-copy threshold for operations which anyways have to wait for remote side */ size_t sync_zcopy_thresh[UCP_MAX_IOV]; @@ -171,7 +200,16 @@ typedef struct ucp_ep_msg_config { } ucp_ep_msg_config_t; -typedef struct ucp_ep_config { +/* + * Thresholds with and without non-host memory + */ +typedef struct ucp_memtype_thresh { + ssize_t memtype_on; + ssize_t memtype_off; +} ucp_memtype_thresh_t; + + +struct ucp_ep_config { /* A key which uniquely defines the configuration, and all other fields of * configuration (in the current worker) and defined only by it. @@ -198,13 +236,14 @@ typedef struct ucp_ep_config { struct { /* Protocols used for tag matching operations * (can be AM based or tag offload). */ - const ucp_proto_t *proto; - const ucp_proto_t *sync_proto; + const ucp_request_send_proto_t *proto; + const ucp_request_send_proto_t *sync_proto; /* Lane used for tag matching operations. */ ucp_lane_index_t lane; - ssize_t max_eager_short; + /* Maximal size for eager short. */ + ucp_memtype_thresh_t max_eager_short; /* Configuration of the lane used for eager protocols * (can be AM or tag offload). 
*/ @@ -215,16 +254,30 @@ typedef struct ucp_ep_config { size_t max_get_zcopy; /* Minimal size of rndv_get_zcopy */ size_t min_get_zcopy; + /* Can the message > `max_get_zcopy` be split to + * the segments that are >= `min_get_zcopy` */ + int get_zcopy_split; /* Maximal total size of rndv_put_zcopy */ size_t max_put_zcopy; + /* Minimal size of rndv_put_zcopy */ + size_t min_put_zcopy; + /* Can the message > `max_put_zcopy` be split to + * the segments that are >= `min_put_zcopy` */ + int put_zcopy_split; /* Threshold for switching from eager to RMA based rendezvous */ size_t rma_thresh; /* Threshold for switching from eager to AM based rendezvous */ size_t am_thresh; /* Total size of packed rkey, according to high-bw md_map */ size_t rkey_size; + /* remote memory domains which support rkey_ptr */ + ucp_md_map_t rkey_ptr_dst_mds; + /* Lanes for GET zcopy */ + ucp_lane_index_t get_zcopy_lanes[UCP_MAX_LANES]; + /* Lanes for PUT zcopy */ + ucp_lane_index_t put_zcopy_lanes[UCP_MAX_LANES]; /* BW based scale factor */ - double scale[UCP_MAX_LANES]; + double scale[UCP_MAX_LANES]; } rndv; /* special thresholds for the ucp_tag_send_nbr() */ @@ -236,8 +289,9 @@ typedef struct ucp_ep_config { } rndv_send_nbr; struct { - /* Maximal size for eager short */ - ssize_t max_eager_short; + /* Maximal size for eager short. */ + ucp_memtype_thresh_t max_eager_short; + /* Maximal iov count for RNDV offload */ size_t max_rndv_iov; /* Maximal total size for RNDV offload */ @@ -248,9 +302,16 @@ typedef struct ucp_ep_config { struct { /* Protocols used for stream operations * (currently it's only AM based). 
*/ - const ucp_proto_t *proto; + const ucp_request_send_proto_t *proto; } stream; -} ucp_ep_config_t; + + struct { + /* Protocols used for am operations */ + const ucp_request_send_proto_t *proto; + const ucp_request_send_proto_t *reply_proto; + } am_u; + +}; /** @@ -271,7 +332,7 @@ typedef struct ucp_ep { char peer_name[UCP_WORKER_NAME_MAX]; #endif - UCS_STATS_NODE_DECLARE(stats); + UCS_STATS_NODE_DECLARE(stats) } ucp_ep_t; @@ -287,6 +348,14 @@ typedef struct { } ucp_ep_flush_state_t; +/** + * Status of protocol-level remote completions + */ +typedef struct { + ucp_request_t *req; /* Flush request which is + used in close protocol */ +} ucp_ep_close_proto_req_t; + /* * Endpoint extension for generic non fast-path data */ @@ -304,6 +373,7 @@ typedef struct { ucp_ep_match_t ep_match; /* Matching with remote endpoints */ ucp_ep_flush_state_t flush_state; /* Remove completion status */ ucp_listener_h listener; /* Listener that may be associated with ep */ + ucp_ep_close_proto_req_t close_req; /* Close protocol request */ }; } ucp_ep_ext_gen_t; @@ -317,22 +387,55 @@ typedef struct { ucs_queue_head_t match_q; /* Queue of receive data or requests, depends on UCP_EP_FLAG_STREAM_HAS_DATA */ } stream; + + struct { + ucs_list_link_t started_ams; + ucs_queue_head_t mid_rdesc_q; /* queue of middle fragments, which + arrived before the first one */ + } am; } ucp_ep_ext_proto_t; -typedef struct ucp_wireup_client_data { - uintptr_t ep_ptr; /**< Client-side endpoint pointer */ - ucp_err_handling_mode_t err_mode; /**< Error handling mode */ - uint8_t is_full_addr; /**< Whether the attached address is - full or partial */ +enum { + UCP_WIREUP_SA_DATA_FULL_ADDR = 0, /* Sockaddr client data contains full + address. */ + UCP_WIREUP_SA_DATA_PARTIAL_ADDR, /* Sockaddr client data contains partial + address, wireup protocol requires + extra MSGs. 
*/ + UCP_WIREUP_SA_DATA_CM_ADDR /* Sockaddr client data contains address + for CM based wireup: there is only + iface and ep address of transport + lanes, remote device address is + provided by CM and has to be added to + unpacked UCP address locally. */ +}; + + +struct ucp_wireup_sockaddr_data { + uintptr_t ep_ptr; /**< Endpoint pointer */ + uint8_t err_mode; /**< Error handling mode */ + uint8_t addr_mode; /**< The attached address format + defined by + UCP_WIREUP_SA_DATA_xx */ + uint8_t dev_index; /**< Device address index used to + build remote address in + UCP_WIREUP_SA_DATA_CM_ADDR + mode */ /* packed worker address follows */ -} UCS_S_PACKED ucp_wireup_client_data_t; +} UCS_S_PACKED; typedef struct ucp_conn_request { ucp_listener_h listener; + union { + uct_listener_h listener; + uct_iface_h iface; + } uct; uct_conn_request_h uct_req; - ucp_wireup_client_data_t client_data; + char dev_name[UCT_DEVICE_NAME_MAX]; + uct_device_addr_t *remote_dev_addr; + struct sockaddr_storage client_address; + ucp_wireup_sockaddr_data_t sa_data; /* packed worker address follows */ } ucp_conn_request_t; @@ -341,44 +444,46 @@ void ucp_ep_config_key_reset(ucp_ep_config_key_t *key); void ucp_ep_config_lane_info_str(ucp_context_h context, const ucp_ep_config_key_t *key, - const uint8_t *addr_indices, + const unsigned *addr_indices, ucp_lane_index_t lane, ucp_rsc_index_t aux_rsc_index, char *buf, size_t max); -ucs_status_t ucp_ep_new(ucp_worker_h worker, const char *peer_name, - const char *message, ucp_ep_h *ep_p); +ucs_status_t ucp_ep_create_base(ucp_worker_h worker, const char *peer_name, + const char *message, ucp_ep_h *ep_p); + +ucs_status_t ucp_worker_create_ep(ucp_worker_h worker, const char *peer_name, + const char *message, ucp_ep_h *ep_p); void ucp_ep_delete(ucp_ep_h ep); -ucs_status_t ucp_ep_init_create_wireup(ucp_ep_h ep, - const ucp_ep_params_t *params, +ucs_status_t ucp_ep_init_create_wireup(ucp_ep_h ep, unsigned ep_init_flags, ucp_wireup_ep_t **wireup_ep); 
ucs_status_t ucp_ep_create_to_worker_addr(ucp_worker_h worker, - const ucp_ep_params_t *params, + uint64_t local_tl_bitmap, const ucp_unpacked_address_t *remote_address, unsigned ep_init_flags, const char *message, ucp_ep_h *ep_p); -ucs_status_t ucp_ep_create_accept(ucp_worker_h worker, - const ucp_wireup_client_data_t *client_data, - ucp_ep_h *ep_p); +ucs_status_t ucp_ep_create_server_accept(ucp_worker_h worker, + const ucp_conn_request_h conn_request, + ucp_ep_h *ep_p); ucs_status_ptr_t ucp_ep_flush_internal(ucp_ep_h ep, unsigned uct_flags, - ucp_send_callback_t req_cb, unsigned req_flags, + const ucp_request_param_t *param, ucp_request_t *worker_req, ucp_request_callback_t flushed_cb, const char *debug_name); -ucs_status_t ucp_ep_create_sockaddr_aux(ucp_worker_h worker, - const ucp_ep_params_t *params, - const ucp_unpacked_address_t *remote_address, - ucp_ep_h *ep_p); +ucs_status_t +ucp_ep_create_sockaddr_aux(ucp_worker_h worker, unsigned ep_init_flags, + const ucp_unpacked_address_t *remote_address, + ucp_ep_h *ep_p); -void ucp_ep_config_key_set_params(ucp_ep_config_key_t *key, - const ucp_ep_params_t *params); +void ucp_ep_config_key_set_err_mode(ucp_ep_config_key_t *key, + unsigned ep_init_flags); void ucp_ep_err_pending_purge(uct_pending_req_t *self, void *arg); @@ -390,7 +495,10 @@ void ucp_ep_cleanup_lanes(ucp_ep_h ep); int ucp_ep_is_sockaddr_stub(ucp_ep_h ep); -void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config); +ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, + const ucp_ep_config_key_t *key); + +void ucp_ep_config_cleanup(ucp_worker_h worker, ucp_ep_config_t *config); int ucp_ep_config_is_equal(const ucp_ep_config_key_t *key1, const ucp_ep_config_key_t *key2); @@ -399,10 +507,30 @@ int ucp_ep_config_get_multi_lane_prio(const ucp_lane_index_t *lanes, ucp_lane_index_t lane); size_t ucp_ep_config_get_zcopy_auto_thresh(size_t iovcnt, - const uct_linear_growth_t *reg_cost, + const ucs_linear_func_t *reg_cost, 
const ucp_context_h context, double bandwidth); ucs_status_t ucp_worker_create_mem_type_endpoints(ucp_worker_h worker); +ucp_wireup_ep_t * ucp_ep_get_cm_wireup_ep(ucp_ep_h ep); + +uint64_t ucp_ep_get_tl_bitmap(ucp_ep_h ep); + +uct_ep_h ucp_ep_get_cm_uct_ep(ucp_ep_h ep); + +int ucp_ep_is_cm_local_connected(ucp_ep_h ep); + +unsigned ucp_ep_local_disconnect_progress(void *arg); + +size_t ucp_ep_tag_offload_min_rndv_thresh(ucp_ep_config_t *config); + +void ucp_ep_invoke_err_cb(ucp_ep_h ep, ucs_status_t status); + +int ucp_ep_config_test_rndv_support(const ucp_ep_config_t *config); + +void ucp_ep_flush_completion(uct_completion_t *self, ucs_status_t status); + +void ucp_ep_flush_request_ff(ucp_request_t *req, ucs_status_t status); + #endif diff --git a/src/ucp/core/ucp_ep.inl b/src/ucp/core/ucp_ep.inl index 797d7e5e3cf..35b931fb1ab 100644 --- a/src/ucp/core/ucp_ep.inl +++ b/src/ucp/core/ucp_ep.inl @@ -1,6 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2016. ALL RIGHTS RESERVED. - * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -19,6 +19,7 @@ static inline ucp_ep_config_t *ucp_ep_config(ucp_ep_h ep) { + ucs_assert(ep->cfg_index != UCP_NULL_CFG_INDEX); return &ep->worker->ep_config[ep->cfg_index]; } @@ -63,9 +64,16 @@ static inline uct_ep_h ucp_ep_get_tag_uct_ep(ucp_ep_h ep) static inline ucp_rsc_index_t ucp_ep_get_rsc_index(ucp_ep_h ep, ucp_lane_index_t lane) { + ucs_assert(lane < UCP_MAX_LANES); /* to suppress coverity */ return ucp_ep_config(ep)->key.lanes[lane].rsc_index; } +static inline ucp_rsc_index_t ucp_ep_get_path_index(ucp_ep_h ep, + ucp_lane_index_t lane) +{ + return ucp_ep_config(ep)->key.lanes[lane].path_index; +} + static inline uct_iface_attr_t *ucp_ep_get_iface_attr(ucp_ep_h ep, ucp_lane_index_t lane) { return ucp_worker_iface_get_attr(ep->worker, ucp_ep_get_rsc_index(ep, lane)); @@ -112,16 +120,16 @@ static inline ucp_md_index_t ucp_ep_md_index(ucp_ep_h ep, ucp_lane_index_t lane) return ucp_ep_config(ep)->md_index[lane]; } -static inline const uct_md_attr_t* ucp_ep_md_attr(ucp_ep_h ep, ucp_lane_index_t lane) +static inline uct_md_h ucp_ep_md(ucp_ep_h ep, ucp_lane_index_t lane) { ucp_context_h context = ep->worker->context; - return &context->tl_mds[ucp_ep_md_index(ep, lane)].attr; + return context->tl_mds[ucp_ep_md_index(ep, lane)].md; } -static inline uct_md_h ucp_ep_md(ucp_ep_h ep, ucp_lane_index_t lane) +static inline const uct_md_attr_t* ucp_ep_md_attr(ucp_ep_h ep, ucp_lane_index_t lane) { ucp_context_h context = ep->worker->context; - return context->tl_mds[ucp_ep_md_index(ep, lane)].md; + return &context->tl_mds[ucp_ep_md_index(ep, lane)].attr; } static inline uct_md_h ucp_ep_get_am_uct_md(ucp_ep_h ep) @@ -159,12 +167,13 @@ static UCS_F_ALWAYS_INLINE ucp_ep_flush_state_t* ucp_ep_flush_state(ucp_ep_h ep) ucs_assert(ep->flags & UCP_EP_FLAG_FLUSH_STATE_VALID); ucs_assert(!(ep->flags & UCP_EP_FLAG_ON_MATCH_CTX)); ucs_assert(!(ep->flags & UCP_EP_FLAG_LISTENER)); + ucs_assert(!(ep->flags & UCP_EP_FLAG_CLOSE_REQ_VALID)); return &ucp_ep_ext_gen(ep)->flush_state; } 
static UCS_F_ALWAYS_INLINE uintptr_t ucp_ep_dest_ep_ptr(ucp_ep_h ep) { -#if ENABLE_ASSERT +#if UCS_ENABLE_ASSERT if (!(ep->flags & UCP_EP_FLAG_DEST_EP)) { return 0; /* Let remote side assert if it gets NULL pointer */ } @@ -205,7 +214,7 @@ static inline const char* ucp_ep_peer_name(ucp_ep_h ep) #if ENABLE_DEBUG_DATA return ep->peer_name; #else - return ""; + return UCP_WIREUP_EMPTY_PEER_NAME; #endif } @@ -215,16 +224,60 @@ static inline void ucp_ep_flush_state_reset(ucp_ep_h ep) ucs_assert(!(ep->flags & (UCP_EP_FLAG_ON_MATCH_CTX | UCP_EP_FLAG_LISTENER))); - if (!(ep->flags & UCP_EP_FLAG_FLUSH_STATE_VALID)) { - flush_state->send_sn = 0; - flush_state->cmpl_sn = 0; - ucs_queue_head_init(&flush_state->reqs); - ep->flags |= UCP_EP_FLAG_FLUSH_STATE_VALID; - } else { - ucs_assert(flush_state->send_sn == 0); - ucs_assert(flush_state->cmpl_sn == 0); - ucs_assert(ucs_queue_is_empty(&flush_state->reqs)); - } + ucs_assert(!(ep->flags & UCP_EP_FLAG_FLUSH_STATE_VALID) || + ((flush_state->send_sn == 0) && + (flush_state->cmpl_sn == 0) && + ucs_queue_is_empty(&flush_state->reqs))); + + flush_state->send_sn = 0; + flush_state->cmpl_sn = 0; + ucs_queue_head_init(&flush_state->reqs); + ep->flags |= UCP_EP_FLAG_FLUSH_STATE_VALID; +} + +static inline void ucp_ep_flush_state_invalidate(ucp_ep_h ep) +{ + ucs_assert(ucs_queue_is_empty(&ucp_ep_flush_state(ep)->reqs)); + ep->flags &= ~UCP_EP_FLAG_FLUSH_STATE_VALID; +} + +/* get index of the local component which can reach a remote memory domain */ +static inline ucp_rsc_index_t +ucp_ep_config_get_dst_md_cmpt(const ucp_ep_config_key_t *key, + ucp_md_index_t dst_md_index) +{ + unsigned idx = ucs_popcount(key->reachable_md_map & UCS_MASK(dst_md_index)); + + return key->dst_md_cmpts[idx]; +} + +static inline int +ucp_ep_config_key_has_cm_lane(const ucp_ep_config_key_t *config_key) +{ + return config_key->cm_lane != UCP_NULL_LANE; +} + +static inline int ucp_ep_has_cm_lane(ucp_ep_h ep) +{ + return (ep->cfg_index != UCP_NULL_CFG_INDEX) && + 
ucp_ep_config_key_has_cm_lane(&ucp_ep_config(ep)->key); +} + +static UCS_F_ALWAYS_INLINE ucp_lane_index_t ucp_ep_get_cm_lane(ucp_ep_h ep) +{ + return ucp_ep_config(ep)->key.cm_lane; +} + +static inline int +ucp_ep_config_connect_p2p(ucp_worker_h worker, + const ucp_ep_config_key_t *ep_config_key, + ucp_rsc_index_t rsc_index) +{ + /* The EP with CM lane has to be connected to remote EP, so prefer native + * UCT p2p capability. */ + return ucp_ep_config_key_has_cm_lane(ep_config_key) ? + ucp_worker_is_tl_p2p(worker, rsc_index) : + !ucp_worker_is_tl_2iface(worker, rsc_index); } #endif diff --git a/src/ucp/core/ucp_listener.c b/src/ucp/core/ucp_listener.c index 27e95d17f08..ef53f227690 100644 --- a/src/ucp/core/ucp_listener.c +++ b/src/ucp/core/ucp_listener.c @@ -4,10 +4,16 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ucp_listener.h" +#include "uct/base/uct_cm.h" #include #include +#include #include #include #include @@ -60,12 +66,11 @@ void ucp_listener_schedule_accept_cb(ucp_ep_h ep) static unsigned ucp_listener_conn_request_progress(void *arg) { - ucp_conn_request_h conn_request = arg; - ucp_listener_h listener = conn_request->listener; - const ucp_wireup_client_data_t *client_data = &conn_request->client_data; - ucp_worker_h worker; - ucp_ep_h ep; - ucs_status_t status; + ucp_conn_request_h conn_request = arg; + ucp_listener_h listener = conn_request->listener; + ucp_worker_h worker = listener->worker; + ucp_ep_h ep; + ucs_status_t status; ucs_trace_func("listener=%p", listener); @@ -74,31 +79,9 @@ static unsigned ucp_listener_conn_request_progress(void *arg) return 1; } - worker = listener->wiface.worker; UCS_ASYNC_BLOCK(&worker->async); - /* coverity[overrun-buffer-val] */ - status = ucp_ep_create_accept(worker, client_data, &ep); - - if (status != UCS_OK) { - goto out; - } - - if (ep->flags & UCP_EP_FLAG_LISTENER) { - status = ucp_wireup_send_pre_request(ep); - } else { - /* send wireup request message, to 
connect the client to the server's - new endpoint */ - ucs_assert(!(ep->flags & UCP_EP_FLAG_CONNECT_REQ_QUEUED)); - status = ucp_wireup_send_request(ep); - } - - if (status != UCS_OK) { - goto out; - } - - status = uct_iface_accept(listener->wiface.iface, conn_request->uct_req); + status = ucp_ep_create_server_accept(worker, conn_request, &ep); if (status != UCS_OK) { - ucp_ep_destroy_internal(ep); goto out; } @@ -113,14 +96,7 @@ static unsigned ucp_listener_conn_request_progress(void *arg) } out: - if (status != UCS_OK) { - ucs_error("connection request failed on listener %p with status %s", - listener, ucs_status_string(status)); - uct_iface_reject(listener->wiface.iface, conn_request->uct_req); - } - UCS_ASYNC_UNBLOCK(&worker->async); - ucs_free(conn_request); return 1; } @@ -144,89 +120,253 @@ static void ucp_listener_conn_request_callback(uct_iface_h tl_iface, void *arg, ucs_trace("listener %p: got connection request", listener); /* Defer wireup init and user's callback to be invoked from the main thread */ - conn_request = ucs_malloc(ucs_offsetof(ucp_conn_request_t, client_data) + + conn_request = ucs_malloc(ucs_offsetof(ucp_conn_request_t, sa_data) + length, "accept connection request"); if (conn_request == NULL) { - ucs_error("failed to allocate connect request, rejecting connection request %p on TL iface %p, reason %s", + ucs_error("failed to allocate connect request, " + "rejecting connection request %p on TL iface %p, reason %s", uct_req, tl_iface, ucs_status_string(UCS_ERR_NO_MEMORY)); uct_iface_reject(tl_iface, uct_req); return; } - conn_request->listener = listener; - conn_request->uct_req = uct_req; - memcpy(&conn_request->client_data, conn_priv_data, length); + conn_request->listener = listener; + conn_request->uct_req = uct_req; + conn_request->uct.iface = tl_iface; + memset(&conn_request->client_address, 0, sizeof(struct sockaddr_storage)); + memcpy(&conn_request->sa_data, conn_priv_data, length); - 
uct_worker_progress_register_safe(listener->wiface.worker->uct, + uct_worker_progress_register_safe(listener->worker->uct, ucp_listener_conn_request_progress, conn_request, UCS_CALLBACKQ_FLAG_ONESHOT, &prog_id); /* If the worker supports the UCP_FEATURE_WAKEUP feature, signal the user so * that he can wake-up on this event */ - ucp_worker_signal_internal(listener->wiface.worker); + ucp_worker_signal_internal(listener->worker); } -ucs_status_t ucp_listener_create(ucp_worker_h worker, - const ucp_listener_params_t *params, - ucp_listener_h *listener_p) +ucs_status_t ucp_conn_request_query(ucp_conn_request_h conn_request, + ucp_conn_request_attr_t *attr) { - ucp_context_h context = worker->context; - ucp_tl_resource_desc_t *resource; - uct_iface_params_t iface_params; - ucp_listener_h listener = NULL; - ucp_rsc_index_t tl_id; ucs_status_t status; - ucp_tl_md_t *tl_md; - char saddr_str[UCS_SOCKADDR_STRING_LEN]; - if (!(params->field_mask & UCP_LISTENER_PARAM_FIELD_SOCK_ADDR)) { - ucs_error("Missing sockaddr for listener"); - return UCS_ERR_INVALID_PARAM; + if (attr->field_mask & UCP_CONN_REQUEST_ATTR_FIELD_CLIENT_ADDR) { + if (conn_request->client_address.ss_family == 0) { + return UCS_ERR_UNSUPPORTED; + } + + status = ucs_sockaddr_copy((struct sockaddr *)&attr->client_address, + (struct sockaddr *)&conn_request->client_address); + if (status != UCS_OK) { + return status; + } } - UCP_CHECK_PARAM_NON_NULL(params->sockaddr.addr, status, return status); + return UCS_OK; +} - if (ucs_test_all_flags(params->field_mask, - UCP_LISTENER_PARAM_FIELD_ACCEPT_HANDLER | - UCP_LISTENER_PARAM_FIELD_CONN_HANDLER)) { - ucs_error("Only one accept handler should be provided"); - return UCS_ERR_INVALID_PARAM; +ucs_status_t ucp_listener_query(ucp_listener_h listener, + ucp_listener_attr_t *attr) +{ + ucs_status_t status; + + if (attr->field_mask & UCP_LISTENER_ATTR_FIELD_SOCKADDR) { + status = ucs_sockaddr_copy((struct sockaddr *)&attr->sockaddr, + (struct sockaddr *)&listener->sockaddr); 
+ if (status != UCS_OK) { + return status; + } } - UCS_ASYNC_BLOCK(&worker->async); + return UCS_OK; +} + +static void ucp_listener_close_uct_listeners(ucp_listener_h listener) +{ + ucp_rsc_index_t i; + + ucs_assert_always(ucp_worker_sockaddr_is_cm_proto(listener->worker)); + + for (i = 0; i < listener->num_rscs; ++i) { + uct_listener_destroy(listener->listeners[i]); + } + + ucs_free(listener->listeners); + + listener->listeners = NULL; + listener->num_rscs = 0; +} + +static void ucp_listener_close_ifaces(ucp_listener_h listener) +{ + ucp_worker_h worker; + int i; + + ucs_assert_always(!ucp_worker_sockaddr_is_cm_proto(listener->worker)); + + for (i = 0; i < listener->num_rscs; i++) { + worker = listener->wifaces[i]->worker; + ucs_assert_always(worker == listener->worker); + /* remove pending slow-path progress in case it wasn't removed yet */ + ucs_callbackq_remove_if(&worker->uct->progress_q, + ucp_listener_remove_filter, listener); + ucp_worker_iface_cleanup(listener->wifaces[i]); + } + + ucs_free(listener->wifaces); +} + +static ucs_status_t +ucp_listen_on_cm(ucp_listener_h listener, const ucp_listener_params_t *params) +{ + ucp_worker_h worker = listener->worker; + const ucp_rsc_index_t num_cms = ucp_worker_num_cm_cmpts(worker); + struct sockaddr_storage addr_storage; + struct sockaddr *addr; + uct_listener_h *uct_listeners; + uct_listener_params_t uct_params; + uct_listener_attr_t uct_attr; + uint16_t port, uct_listen_port; + ucp_rsc_index_t i; + char addr_str[UCS_SOCKADDR_STRING_LEN]; + ucp_worker_cm_t *ucp_cm; + ucs_status_t status; + + addr = (struct sockaddr *)&addr_storage; + status = ucs_sockaddr_copy(addr, params->sockaddr.addr); + if (status != UCS_OK) { + return status; + } + + ucs_assert_always(num_cms > 0); + + uct_params.field_mask = UCT_LISTENER_PARAM_FIELD_CONN_REQUEST_CB | + UCT_LISTENER_PARAM_FIELD_USER_DATA; + uct_params.conn_request_cb = ucp_cm_server_conn_request_cb; + uct_params.user_data = listener; + + listener->num_rscs = 0; + 
uct_listeners = ucs_calloc(num_cms, sizeof(*uct_listeners), + "uct_listeners_arr"); + if (uct_listeners == NULL) { + ucs_error("Can't allocate memory for UCT listeners array"); + return UCS_ERR_NO_MEMORY; + } + + listener->listeners = uct_listeners; + + for (i = 0; i < num_cms; ++i) { + ucp_cm = &worker->cms[i]; + status = uct_listener_create(ucp_cm->cm, addr, + params->sockaddr.addrlen, &uct_params, + &uct_listeners[listener->num_rscs]); + if (status != UCS_OK) { + ucs_debug("failed to create UCT listener on CM %p (component %s) " + "with address %s status %s", ucp_cm->cm, + worker->context->tl_cmpts[ucp_cm->cmpt_idx].attr.name, + ucs_sockaddr_str(params->sockaddr.addr, addr_str, + UCS_SOCKADDR_STRING_LEN), + ucs_status_string(status)); + continue; + } + + ++listener->num_rscs; + + status = ucs_sockaddr_get_port(addr, &port); + if (status != UCS_OK) { + goto err_destroy_listeners; + } + + uct_attr.field_mask = UCT_LISTENER_ATTR_FIELD_SOCKADDR; + status = uct_listener_query(uct_listeners[listener->num_rscs - 1], + &uct_attr); + if (status != UCS_OK) { + goto err_destroy_listeners; + } + + status = ucs_sockaddr_get_port((struct sockaddr *)&uct_attr.sockaddr, + &uct_listen_port); + if (status != UCS_OK) { + goto err_destroy_listeners; + } + + if (port != uct_listen_port) { + ucs_assert(port == 0); + status = ucs_sockaddr_set_port(addr, uct_listen_port); + if (status != UCS_OK) { + goto err_destroy_listeners; + } + } + } + + if (listener->num_rscs > 0) { + status = ucs_sockaddr_copy((struct sockaddr *)&listener->sockaddr, + addr); + if (status != UCS_OK) { + goto err_destroy_listeners; + } + } + + /* return the status of the last call of uct_listener_create if no listener + was created */ + return (listener->num_rscs > 0) ? 
UCS_OK : status; + +err_destroy_listeners: + ucp_listener_close_uct_listeners(listener); + return status; +} + +static ucs_status_t +ucp_listen_on_iface(ucp_listener_h listener, + const ucp_listener_params_t *params) +{ + ucp_worker_h worker = listener->worker; + ucp_context_h context = listener->worker->context; + int sockaddr_tls = 0; + char saddr_str[UCS_SOCKADDR_STRING_LEN]; + ucp_tl_resource_desc_t *resource; + uct_iface_params_t iface_params; + struct sockaddr_storage *listen_sock; + ucp_worker_iface_t **tmp; + ucp_rsc_index_t tl_id; + ucs_status_t status; + ucp_tl_md_t *tl_md; + uint16_t port; + int i; + + status = ucs_sockaddr_get_port(params->sockaddr.addr, &port); + if (status != UCS_OK) { + return status; + } /* Go through all the available resources and for each one, check if the given - * sockaddr is accessible from its md. Start listening on the first md that - * satisfies this. + * sockaddr is accessible from its md. Start listening on all the mds that + * satisfy this. + * If the given port is set to 0, i.e. use a random port, the first transport + * in the sockaddr priority list from the environment configuration will + * dictate the port to listen on for the other sockaddr transports in the list. 
* */ - ucs_for_each_bit(tl_id, context->tl_bitmap) { + for (i = 0; i < context->config.num_sockaddr_tls; i++) { + tl_id = context->config.sockaddr_tl_ids[i]; resource = &context->tl_rscs[tl_id]; tl_md = &context->tl_mds[resource->md_index]; - if (!(tl_md->attr.cap.flags & UCT_MD_FLAG_SOCKADDR) || - !uct_md_is_sockaddr_accessible(tl_md->md, ¶ms->sockaddr, + if (!uct_md_is_sockaddr_accessible(tl_md->md, ¶ms->sockaddr, UCT_SOCKADDR_ACC_LOCAL)) { continue; } - listener = ucs_calloc(1, sizeof(*listener), "ucp_listener"); - if (listener == NULL) { + tmp = ucs_realloc(listener->wifaces, + sizeof(*tmp) * (sockaddr_tls + 1), + "listener wifaces"); + if (tmp == NULL) { + ucs_error("failed to allocate listener wifaces"); status = UCS_ERR_NO_MEMORY; - goto out; + goto err_close_listener_wifaces; } - if (params->field_mask & UCP_LISTENER_PARAM_FIELD_ACCEPT_HANDLER) { - UCP_CHECK_PARAM_NON_NULL(params->accept_handler.cb, status, - goto err_free); - listener->accept_cb = params->accept_handler.cb; - listener->arg = params->accept_handler.arg; - } else if (params->field_mask & UCP_LISTENER_PARAM_FIELD_CONN_HANDLER) { - UCP_CHECK_PARAM_NON_NULL(params->conn_handler.cb, status, - goto err_free); - listener->conn_cb = params->conn_handler.cb; - listener->arg = params->conn_handler.arg; - } + listener->wifaces = tmp; iface_params.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE | UCT_IFACE_PARAM_FIELD_SOCKADDR; @@ -236,35 +376,133 @@ ucs_status_t ucp_listener_create(ucp_worker_h worker, iface_params.mode.sockaddr.listen_sockaddr = params->sockaddr; iface_params.mode.sockaddr.cb_flags = UCT_CB_FLAG_ASYNC; + if (port) { + /* Set the port for the next sockaddr iface. 
This port was either + * obtained from the user or generated by the first created sockaddr + * iface if the port from the user was equal to zero */ + status = ucs_sockaddr_set_port( + (struct sockaddr *) + iface_params.mode.sockaddr.listen_sockaddr.addr, port); + if (status != UCS_OK) { + ucs_error("failed to set port parameter (%d) for creating %s iface", + port, resource->tl_rsc.tl_name); + goto err_close_listener_wifaces; + } + } + status = ucp_worker_iface_open(worker, tl_id, &iface_params, - &listener->wiface); + &listener->wifaces[sockaddr_tls]); if (status != UCS_OK) { - goto err_free; + ucs_error("failed to open listener on %s on md %s", + ucs_sockaddr_str( + iface_params.mode.sockaddr.listen_sockaddr.addr, + saddr_str, sizeof(saddr_str)), + tl_md->rsc.md_name); + goto err_close_listener_wifaces; } - status = ucp_worker_iface_init(worker, tl_id, &listener->wiface); + status = ucp_worker_iface_init(worker, tl_id, + listener->wifaces[sockaddr_tls]); if ((status != UCS_OK) || ((context->config.features & UCP_FEATURE_WAKEUP) && - !(listener->wiface.attr.cap.flags & UCT_IFACE_FLAG_CB_ASYNC))) { - ucp_worker_iface_cleanup(&listener->wiface); - ucs_free(listener); - continue; + !(listener->wifaces[sockaddr_tls]->attr.cap.flags & + UCT_IFACE_FLAG_CB_ASYNC))) { + ucp_worker_iface_cleanup(listener->wifaces[sockaddr_tls]); + goto err_close_listener_wifaces; } - ucs_trace("listener %p: accepting connections on %s", listener, - tl_md->rsc.md_name); + listen_sock = &listener->wifaces[sockaddr_tls]->attr.listen_sockaddr; + status = ucs_sockaddr_get_port((struct sockaddr *)listen_sock, &port); + if (status != UCS_OK) { + goto err_close_listener_wifaces; + } + + sockaddr_tls++; + listener->num_rscs = sockaddr_tls; + ucs_trace("listener %p: accepting connections on %s on %s", + listener, tl_md->rsc.md_name, + ucs_sockaddr_str(iface_params.mode.sockaddr.listen_sockaddr.addr, + saddr_str, sizeof(saddr_str))); + } + if (!sockaddr_tls) { + ucs_error("none of the available 
transports can listen for connections on %s", + ucs_sockaddr_str(params->sockaddr.addr, saddr_str, + sizeof(saddr_str))); + listener->num_rscs = 0; + status = UCS_ERR_UNREACHABLE; + goto err_close_listener_wifaces; + } + + listen_sock = &listener->wifaces[sockaddr_tls - 1]->attr.listen_sockaddr; + status = ucs_sockaddr_copy((struct sockaddr *)&listener->sockaddr, + (struct sockaddr *)listen_sock); + if (status != UCS_OK) { + goto err_close_listener_wifaces; + } + + return UCS_OK; + +err_close_listener_wifaces: + ucp_listener_close_ifaces(listener); + return status; +} + +ucs_status_t ucp_listener_create(ucp_worker_h worker, + const ucp_listener_params_t *params, + ucp_listener_h *listener_p) +{ + ucp_listener_h listener; + ucs_status_t status; + + if (!(params->field_mask & UCP_LISTENER_PARAM_FIELD_SOCK_ADDR)) { + ucs_error("missing sockaddr for listener"); + return UCS_ERR_INVALID_PARAM; + } + + UCP_CHECK_PARAM_NON_NULL(params->sockaddr.addr, status, return status); + + if (ucs_test_all_flags(params->field_mask, + UCP_LISTENER_PARAM_FIELD_ACCEPT_HANDLER | + UCP_LISTENER_PARAM_FIELD_CONN_HANDLER)) { + ucs_error("only one accept handler should be provided"); + return UCS_ERR_INVALID_PARAM; + } + + listener = ucs_calloc(1, sizeof(*listener), "ucp_listener"); + if (listener == NULL) { + ucs_error("cannot allocate memory for UCP listener"); + return UCS_ERR_NO_MEMORY; + } + + UCS_ASYNC_BLOCK(&worker->async); + + listener->worker = worker; + + if (params->field_mask & UCP_LISTENER_PARAM_FIELD_ACCEPT_HANDLER) { + UCP_CHECK_PARAM_NON_NULL(params->accept_handler.cb, status, + goto err_free_listener); + listener->accept_cb = params->accept_handler.cb; + listener->arg = params->accept_handler.arg; + } else if (params->field_mask & UCP_LISTENER_PARAM_FIELD_CONN_HANDLER) { + UCP_CHECK_PARAM_NON_NULL(params->conn_handler.cb, status, + goto err_free_listener); + listener->conn_cb = params->conn_handler.cb; + listener->arg = params->conn_handler.arg; + } + + if 
(ucp_worker_sockaddr_is_cm_proto(worker)) { + status = ucp_listen_on_cm(listener, params); + } else { + status = ucp_listen_on_iface(listener, params); + } + + if (status == UCS_OK) { *listener_p = listener; - status = UCS_OK; goto out; } - ucs_error("none of the available transports can listen for connections on %s", - ucs_sockaddr_str(params->sockaddr.addr, saddr_str, sizeof(saddr_str))); - status = UCS_ERR_UNREACHABLE; - goto out; - -err_free: +err_free_listener: ucs_free(listener); out: UCS_ASYNC_UNBLOCK(&worker->async); @@ -275,21 +513,28 @@ void ucp_listener_destroy(ucp_listener_h listener) { ucs_trace("listener %p: destroying", listener); - /* remove pending slow-path progress in case it wasn't removed yet */ - ucs_callbackq_remove_if(&listener->wiface.worker->uct->progress_q, - ucp_listener_remove_filter, listener); - ucp_worker_iface_cleanup(&listener->wiface); + if (ucp_worker_sockaddr_is_cm_proto(listener->worker)) { + ucp_listener_close_uct_listeners(listener); + } else { + ucp_listener_close_ifaces(listener); + } + ucs_free(listener); } ucs_status_t ucp_listener_reject(ucp_listener_h listener, ucp_conn_request_h conn_request) { - ucp_worker_h worker = listener->wiface.worker; + ucp_worker_h worker = listener->worker; UCS_ASYNC_BLOCK(&worker->async); - uct_iface_reject(listener->wiface.iface, conn_request->uct_req); + if (ucp_worker_sockaddr_is_cm_proto(worker)) { + uct_listener_reject(conn_request->uct.listener, conn_request->uct_req); + ucs_free(conn_request->remote_dev_addr); + } else { + uct_iface_reject(conn_request->uct.iface, conn_request->uct_req); + } UCS_ASYNC_UNBLOCK(&worker->async); diff --git a/src/ucp/core/ucp_listener.h b/src/ucp/core/ucp_listener.h index 8b8f33f3860..5385a2e93dc 100644 --- a/src/ucp/core/ucp_listener.h +++ b/src/ucp/core/ucp_listener.h @@ -9,24 +9,34 @@ #define UCP_LISTENER_H_ #include "ucp_worker.h" -#include "wireup/wireup_ep.h" - /** * UCP listener */ typedef struct ucp_listener { - ucp_worker_iface_t wiface; /* UCT 
iface to listen on */ - ucp_listener_accept_callback_t accept_cb; /* Listen accept callback - which creates an endpoint - */ - ucp_listener_conn_callback_t conn_cb; /* Listen callback which - creates a handle to - connection request to the - remote endpoint */ - void *arg; /* User's arg for the accept - callback */ - uct_worker_cb_id_t prog_id; /* Slow-path callback */ + ucp_worker_h worker; + + union { + ucp_worker_iface_t **wifaces; /* Array of UCT interface + pointers to listen on */ + uct_listener_h *listeners;/* Array of UCT listeners to + listen on */ + }; + + struct sockaddr_storage sockaddr; /* Listening sockaddr */ + ucp_rsc_index_t num_rscs; /* Number of UCT listening + resources (wifaces or + listeners) */ + ucp_listener_accept_callback_t accept_cb; /* Listen accept callback + which creates an endpoint + */ + ucp_listener_conn_callback_t conn_cb; /* Listen callback which + creates a handle to + connection request to the + remote endpoint */ + void *arg; /* User's arg for the accept + callback */ + uct_worker_cb_id_t prog_id; /* Slow-path callback */ } ucp_listener_t; diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c index 2884714ded9..de012b2192b 100644 --- a/src/ucp/core/ucp_mm.c +++ b/src/ucp/core/ucp_mm.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ucp_mm.h" #include "ucp_context.h" #include "ucp_worker.h" @@ -11,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +32,7 @@ static ucp_mem_t ucp_mem_dummy_handle = { ucs_status_t ucp_mem_rereg_mds(ucp_context_h context, ucp_md_map_t reg_md_map, void *address, size_t length, unsigned uct_flags, - uct_md_h alloc_md, uct_memory_type_t mem_type, + uct_md_h alloc_md, ucs_memory_type_t mem_type, uct_mem_h *alloc_md_memh_p, uct_mem_h *uct_memh, ucp_md_map_t *md_map_p) { @@ -38,13 +43,13 @@ ucs_status_t ucp_mem_rereg_mds(ucp_context_h context, ucp_md_map_t reg_md_map, unsigned prev_num_memh; unsigned md_index; ucs_status_t status; - int level; + ucs_log_level_t level; if (reg_md_map == *md_map_p) { return UCS_OK; /* shortcut - no changes required */ } - prev_num_memh = ucs_popcount(*md_map_p); + prev_num_memh = ucs_popcount(*md_map_p & reg_md_map); prev_uct_memh = ucs_alloca(prev_num_memh * sizeof(*prev_uct_memh)); /* Go over previous handles, save only the ones we will need */ @@ -79,7 +84,7 @@ ucs_status_t ucp_mem_rereg_mds(ucp_context_h context, ucp_md_map_t reg_md_map, } /* prev_uct_memh should contain the handles which should be reused */ - ucs_assert(prev_memh_index == ucs_popcount(*md_map_p & reg_md_map)); + ucs_assert(prev_memh_index == prev_num_memh); /* Go over requested MD map, and use / register new handles */ new_md_map = 0; @@ -89,6 +94,7 @@ ucs_status_t ucp_mem_rereg_mds(ucp_context_h context, ucp_md_map_t reg_md_map, md_attr = &context->tl_mds[md_index].attr; if (*md_map_p & UCS_BIT(md_index)) { /* already registered, use previous memh */ + ucs_assert(prev_memh_index < prev_num_memh); uct_memh[memh_index++] = prev_uct_memh[prev_memh_index++]; new_md_map |= UCS_BIT(md_index); } else if (context->tl_mds[md_index].md == alloc_md) { @@ -96,28 +102,43 @@ ucs_status_t ucp_mem_rereg_mds(ucp_context_h context, ucp_md_map_t reg_md_map, ucs_assert(alloc_md_memh_p != 
NULL); uct_memh[memh_index++] = *alloc_md_memh_p; new_md_map |= UCS_BIT(md_index); - } else if ((md_attr->cap.flags & UCT_MD_FLAG_REG) && - (md_attr->cap.reg_mem_types & UCS_BIT(mem_type))) { - /* MD supports registration, register new memh on it */ - status = uct_md_mem_reg(context->tl_mds[md_index].md, address, - length, uct_flags, &uct_memh[memh_index]); - if (status != UCS_OK) { - level = (uct_flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ? - UCS_LOG_LEVEL_DEBUG : UCS_LOG_LEVEL_ERROR; - ucs_log(level, - "failed to register address %p length %zu on md[%d]=%s: %s", - address, length, md_index, context->tl_mds[md_index].rsc.md_name, - ucs_status_string(status)); - ucp_mem_rereg_mds(context, 0, NULL, 0, 0, alloc_md, mem_type, - alloc_md_memh_p, uct_memh, md_map_p); - return status; + } else if (!length) { + /* don't register zero-length regions */ + continue; + } else if (md_attr->cap.flags & UCT_MD_FLAG_REG) { + if (!(md_attr->cap.reg_mem_types & UCS_BIT(mem_type))) { + status = UCS_ERR_UNSUPPORTED; + } else { + ucs_assert(address && length); + + /* MD supports registration, register new memh on it */ + status = uct_md_mem_reg(context->tl_mds[md_index].md, address, + length, uct_flags, &uct_memh[memh_index]); } - ucs_trace("registered address %p length %zu on md[%d] memh[%d]=%p", - address, length, md_index, memh_index, - uct_memh[memh_index]); - new_md_map |= UCS_BIT(md_index); - ++memh_index; + if (status == UCS_OK) { + ucs_trace("registered address %p length %zu on md[%d] memh[%d]=%p", + address, length, md_index, memh_index, + uct_memh[memh_index]); + new_md_map |= UCS_BIT(md_index); + ++memh_index; + continue; + } + + level = (uct_flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ? 
+ UCS_LOG_LEVEL_DEBUG : UCS_LOG_LEVEL_ERROR; + + ucs_log(level, + "failed to register address %p mem_type bit 0x%lx length %zu on " + "md[%d]=%s: %s (md reg_mem_types 0x%lx)", + address, UCS_BIT(mem_type), length, md_index, + context->tl_mds[md_index].rsc.md_name, + ucs_status_string(status), + md_attr->cap.reg_mem_types); + + if (!(uct_flags & UCT_MD_MEM_FLAG_HIDE_ERRORS)) { + goto err_dereg; + } } } @@ -125,6 +146,12 @@ ucs_status_t ucp_mem_rereg_mds(ucp_context_h context, ucp_md_map_t reg_md_map, * missing from the map.*/ *md_map_p = new_md_map; return UCS_OK; + +err_dereg: + ucp_mem_rereg_mds(context, 0, NULL, 0, 0, alloc_md, mem_type, + alloc_md_memh_p, uct_memh, md_map_p); + return status; + } /** @@ -135,14 +162,14 @@ static int ucp_is_md_selected_by_config(ucp_context_h context, unsigned config_method_index, unsigned md_index) { - const char *cfg_mdc_name; - const char *mdc_name; + const char *cfg_cmpt_name; + const char *cmpt_name; - cfg_mdc_name = context->config.alloc_methods[config_method_index].mdc_name; - mdc_name = context->tl_mds[md_index].attr.component_name; + cfg_cmpt_name = context->config.alloc_methods[config_method_index].cmpt_name; + cmpt_name = context->tl_mds[md_index].attr.component_name; - return !strncmp(cfg_mdc_name, "*", UCT_MD_COMPONENT_NAME_MAX) || - !strncmp(cfg_mdc_name, mdc_name, UCT_MD_COMPONENT_NAME_MAX); + return !strncmp(cfg_cmpt_name, "*", UCT_COMPONENT_NAME_MAX) || + !strncmp(cfg_cmpt_name, cmpt_name, UCT_COMPONENT_NAME_MAX); } static ucs_status_t ucp_mem_alloc(ucp_context_h context, size_t length, @@ -196,8 +223,9 @@ static ucs_status_t ucp_mem_alloc(ucp_context_h context, size_t length, memh->alloc_md = mem.md; memh->md_map = 0; status = ucp_mem_rereg_mds(context, UCS_MASK(context->num_mds), memh->address, - memh->length, uct_flags, memh->alloc_md, memh->mem_type, - &mem.memh, memh->uct, &memh->md_map); + memh->length, uct_flags | UCT_MD_MEM_FLAG_HIDE_ERRORS, + memh->alloc_md, memh->mem_type, &mem.memh, + memh->uct, 
&memh->md_map); if (status != UCS_OK) { uct_mem_free(&mem); } @@ -290,7 +318,8 @@ static inline int ucp_mem_map_is_allocate(ucp_mem_map_params_t *params) static ucs_status_t ucp_mem_map_common(ucp_context_h context, void *address, size_t length, unsigned uct_flags, - int is_allocate, ucp_mem_h *memh_p) + int is_allocate, const char *alloc_name, + ucp_mem_h *memh_p) { ucs_status_t status; ucp_mem_h memh; @@ -308,21 +337,24 @@ static ucs_status_t ucp_mem_map_common(ucp_context_h context, void *address, memh->length = length; if (is_allocate) { - ucs_debug("allocation user memory at %p length %zu", address, length); - status = ucp_mem_alloc(context, length, uct_flags, - "user allocation", memh); + ucs_debug("allocating %s at %p length %zu", alloc_name, address, length); + status = ucp_mem_alloc(context, length, uct_flags, alloc_name, memh); if (status != UCS_OK) { goto err_free_memh; } } else { - ucs_debug("registering user memory at %p length %zu", address, length); + memh->mem_type = ucp_memory_type_detect(context, address, length); memh->alloc_method = UCT_ALLOC_METHOD_LAST; - memh->mem_type = UCT_MD_MEM_TYPE_HOST; memh->alloc_md = NULL; memh->md_map = 0; + + ucs_debug("registering %s %p length %zu mem_type %s", alloc_name, + address, length, ucs_memory_type_names[memh->mem_type]); status = ucp_mem_rereg_mds(context, UCS_MASK(context->num_mds), - memh->address, memh->length, uct_flags, NULL, - memh->mem_type, NULL, memh->uct, &memh->md_map); + memh->address, memh->length, + uct_flags | UCT_MD_MEM_FLAG_HIDE_ERRORS, + NULL, memh->mem_type, NULL, memh->uct, + &memh->md_map); if (status != UCS_OK) { goto err_free_memh; } @@ -401,7 +433,8 @@ ucs_status_t ucp_mem_map(ucp_context_h context, const ucp_mem_map_params_t *para status = ucp_mem_map_common(context, mem_params.address, mem_params.length, ucp_mem_map_params2uct_flags(&mem_params), - ucp_mem_map_is_allocate(&mem_params), memh_p); + ucp_mem_map_is_allocate(&mem_params), + "user memory", memh_p); out: 
UCP_THREAD_CS_EXIT(&context->mt_lock); return status; @@ -427,66 +460,72 @@ ucs_status_t ucp_mem_unmap(ucp_context_h context, ucp_mem_h memh) } ucs_status_t ucp_mem_type_reg_buffers(ucp_worker_h worker, void *remote_addr, - size_t length, uct_memory_type_t mem_type, - unsigned md_index, uct_mem_h *memh, + size_t length, ucs_memory_type_t mem_type, + ucp_md_index_t md_index, uct_mem_h *memh, ucp_md_map_t *md_map, uct_rkey_bundle_t *rkey_bundle) { - ucp_context_h context = worker->context; - uct_md_h md; - const uct_md_attr_t *md_attr; + ucp_context_h context = worker->context; + const uct_md_attr_t *md_attr = &context->tl_mds[md_index].attr; + uct_component_h cmpt; + ucp_tl_md_t *tl_md; ucs_status_t status; char *rkey_buffer; + if (!(md_attr->cap.flags & UCT_MD_FLAG_NEED_RKEY)) { + rkey_bundle->handle = NULL; + rkey_bundle->rkey = UCT_INVALID_RKEY; + status = UCS_OK; + goto out; + } + + tl_md = &context->tl_mds[md_index]; + cmpt = context->tl_cmpts[tl_md->cmpt_index].cmpt; - *memh = UCT_MEM_HANDLE_NULL; status = ucp_mem_rereg_mds(context, UCS_BIT(md_index), remote_addr, length, - UCT_MD_MEM_ACCESS_ALL, NULL, mem_type, - NULL, memh, md_map); + UCT_MD_MEM_ACCESS_ALL | + UCT_MD_MEM_FLAG_HIDE_ERRORS, + NULL, mem_type, NULL, memh, md_map); if (status != UCS_OK) { - goto err; + goto out; } - md_attr = &context->tl_mds[md_index].attr; - if (md_attr->cap.flags & UCT_MD_FLAG_NEED_RKEY) { - rkey_buffer = ucs_alloca(md_attr->rkey_packed_size); - md = context->tl_mds[md_index].md; - status = uct_md_mkey_pack(md, memh[0], rkey_buffer); - if (status != UCS_OK) { - ucs_error("failed to pack key from md[%d]: %s", - md_index, ucs_status_string(status)); - goto err_dreg_mem; - } + rkey_buffer = ucs_alloca(md_attr->rkey_packed_size); + status = uct_md_mkey_pack(tl_md->md, memh[0], rkey_buffer); + if (status != UCS_OK) { + ucs_error("failed to pack key from md[%d]: %s", + md_index, ucs_status_string(status)); + goto out_dereg_mem; + } - status = uct_rkey_unpack(rkey_buffer, 
rkey_bundle); - if (status != UCS_OK) { - ucs_error("failed to unpack key from md[%d]: %s", - md_index, ucs_status_string(status)); - goto err_dreg_mem; - } - } else { - rkey_bundle->handle = NULL; - rkey_bundle->rkey = UCT_INVALID_RKEY; - rkey_bundle->type = NULL; + status = uct_rkey_unpack(cmpt, rkey_buffer, rkey_bundle); + if (status != UCS_OK) { + ucs_error("failed to unpack key from md[%d]: %s", + md_index, ucs_status_string(status)); + goto out_dereg_mem; } return UCS_OK; -err_dreg_mem: +out_dereg_mem: ucp_mem_rereg_mds(context, 0, NULL, 0, 0, NULL, mem_type, NULL, memh, md_map); -err: +out: + *memh = UCT_MEM_HANDLE_NULL; return status; } -void ucp_mem_type_unreg_buffers(ucp_worker_h worker, uct_memory_type_t mem_type, - uct_mem_h *memh, ucp_md_map_t *md_map, +void ucp_mem_type_unreg_buffers(ucp_worker_h worker, ucs_memory_type_t mem_type, + ucp_md_index_t md_index, uct_mem_h *memh, + ucp_md_map_t *md_map, uct_rkey_bundle_t *rkey_bundle) { ucp_context_h context = worker->context; + ucp_rsc_index_t cmpt_index; if (rkey_bundle->rkey != UCT_INVALID_RKEY) { - uct_rkey_release(rkey_bundle); + cmpt_index = context->tl_mds[md_index].cmpt_index; + uct_rkey_release(context->tl_cmpts[cmpt_index].cmpt, rkey_bundle); } ucp_mem_rereg_mds(context, 0, NULL, 0, 0, NULL, mem_type, NULL, @@ -506,7 +545,7 @@ ucs_status_t ucp_mem_query(const ucp_mem_h memh, ucp_mem_attr_t *attr) return UCS_OK; } -static ucs_status_t ucp_advice2uct(unsigned ucp_advice, unsigned *uct_advice) +static ucs_status_t ucp_advice2uct(unsigned ucp_advice, uct_mem_advice_t *uct_advice) { switch(ucp_advice) { case UCP_MADV_NORMAL: @@ -519,13 +558,13 @@ static ucs_status_t ucp_advice2uct(unsigned ucp_advice, unsigned *uct_advice) return UCS_ERR_INVALID_PARAM; } -ucs_status_t -ucp_mem_advise(ucp_context_h context, ucp_mem_h memh, +ucs_status_t +ucp_mem_advise(ucp_context_h context, ucp_mem_h memh, ucp_mem_advise_params_t *params) { ucs_status_t status, tmp_status; int md_index; - unsigned uct_advice; + 
uct_mem_advice_t uct_advice; uct_mem_h uct_memh; if (!ucs_test_all_flags(params->field_mask, @@ -536,7 +575,8 @@ ucp_mem_advise(ucp_context_h context, ucp_mem_h memh, } if ((params->address < memh->address) || - (params->address + params->length > memh->address + memh->length)) { + (UCS_PTR_BYTE_OFFSET(params->address, params->length) > + UCS_PTR_BYTE_OFFSET(memh->address, memh->length))) { return UCS_ERR_INVALID_PARAM; } @@ -586,7 +626,7 @@ ucp_mpool_malloc(ucp_worker_h worker, ucs_mpool_t *mp, size_t *size_p, void **ch status = ucp_mem_map_common(worker->context, NULL, *size_p + sizeof(*chunk_hdr), ucp_mem_map_params2uct_flags(&mem_params), - 1, &memh); + 1, ucs_mpool_name(mp), &memh); if (status != UCS_OK) { goto out; } @@ -642,3 +682,74 @@ void ucp_frag_mpool_free(ucs_mpool_t *mp, void *chunk) ucp_mpool_free(worker, mp, chunk); } + +void ucp_mem_print_info(const char *mem_size, ucp_context_h context, FILE *stream) +{ + size_t min_page_size, max_page_size; + ucp_mem_map_params_t mem_params; + size_t mem_size_value; + char memunits_str[32]; + ucs_status_t status; + unsigned md_index; + ucp_mem_h memh; + + status = ucs_str_to_memunits(mem_size, &mem_size_value); + if (status != UCS_OK) { + printf("\n"); + return; + } + + mem_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | + UCP_MEM_MAP_PARAM_FIELD_LENGTH | + UCP_MEM_MAP_PARAM_FIELD_FLAGS; + mem_params.address = NULL; + mem_params.length = mem_size_value; + mem_params.flags = UCP_MEM_MAP_ALLOCATE; + + status = ucp_mem_map(context, &mem_params, &memh); + if (status != UCS_OK) { + printf("\n", mem_size); + return; + } + + fprintf(stream, "#\n"); + fprintf(stream, "# UCP memory allocation\n"); + fprintf(stream, "#\n"); + + ucs_memunits_to_str(memh->length, memunits_str, sizeof(memunits_str)); + fprintf(stream, "# allocated %s at address %p with ", memunits_str, + memh->address); + + if (memh->alloc_md == NULL) { + fprintf(stream, "%s", uct_alloc_method_names[memh->alloc_method]); + } else { + for (md_index = 0; 
md_index < context->num_mds; ++md_index) { + if (memh->alloc_md == context->tl_mds[md_index].md) { + fprintf(stream, "%s", context->tl_mds[md_index].rsc.md_name); + break; + } + } + } + + ucs_get_mem_page_size(memh->address, memh->length, &min_page_size, + &max_page_size); + ucs_memunits_to_str(min_page_size, memunits_str, sizeof(memunits_str)); + fprintf(stream, ", pagesize: %s", memunits_str); + if (min_page_size != max_page_size) { + ucs_memunits_to_str(max_page_size, memunits_str, sizeof(memunits_str)); + fprintf(stream, "-%s", memunits_str); + } + + fprintf(stream, "\n"); + fprintf(stream, "# registered on: "); + ucs_for_each_bit(md_index, memh->md_map) { + fprintf(stream, "%s ", context->tl_mds[md_index].rsc.md_name); + } + fprintf(stream, "\n"); + fprintf(stream, "#\n"); + + status = ucp_mem_unmap(context, memh); + if (status != UCS_OK) { + printf("\n", mem_size); + } +} diff --git a/src/ucp/core/ucp_mm.h b/src/ucp/core/ucp_mm.h index 039abc43826..4c298bcd02d 100644 --- a/src/ucp/core/ucp_mm.h +++ b/src/ucp/core/ucp_mm.h @@ -23,6 +23,24 @@ */ #define UCP_RKEY_MPOOL_MAX_MD 3 + +/** + * UCT remote key along with component handle which should be used to release it. + * + */ +typedef struct ucp_tl_rkey { + uct_rkey_bundle_t rkey; + uct_component_h cmpt; +} ucp_tl_rkey_t; + +/** + * Rkey flags + */ +enum { + UCP_RKEY_DESC_FLAG_POOL = UCS_BIT(0) /* Descriptor was allocated from pool + and must be retuned to pool, not free */ +}; + /** * Remote memory key structure. * Contains remote keys for UCT MDs. 
@@ -35,18 +53,19 @@ typedef struct ucp_rkey { ucp_ep_cfg_index_t ep_cfg_index; /* EP configuration relevant for the cache */ ucp_lane_index_t rma_lane; /* Lane to use for RMAs */ ucp_lane_index_t amo_lane; /* Lane to use for AMOs */ - unsigned max_put_short;/* Cached value of max_put_short */ + ssize_t max_put_short;/* Cached value of max_put_short */ uct_rkey_t rma_rkey; /* Key to use for RMAs */ uct_rkey_t amo_rkey; /* Key to use for AMOs */ ucp_amo_proto_t *amo_proto; /* Protocol for AMOs */ ucp_rma_proto_t *rma_proto; /* Protocol for RMAs */ } cache; - ucp_md_map_t md_map; /* Which *remote* MDs have valid memory handles */ - uct_memory_type_t mem_type;/* Memory type of remote key memory */ + ucp_md_map_t md_map; /* Which *remote* MDs have valid memory handles */ + ucs_memory_type_t mem_type; /* Memory type of remote key memory */ + uint8_t flags; /* Rkey flags */ #if ENABLE_PARAMS_CHECK ucp_ep_h ep; #endif - uct_rkey_bundle_t uct[0]; /* Remote key for every MD */ + ucp_tl_rkey_t tl_rkey[0]; /* UCT rkey for every remote MD */ } ucp_rkey_t; @@ -60,7 +79,7 @@ typedef struct ucp_mem { void *address; /* Region start address */ size_t length; /* Region length */ uct_alloc_method_t alloc_method; /* Method used to allocate the memory */ - uct_memory_type_t mem_type; /**< type of allocated memory */ + ucs_memory_type_t mem_type; /**< type of allocated memory */ uct_md_h alloc_md; /* MD used to allocated the memory */ ucp_md_map_t md_map; /* Which MDs have valid memory handles */ uct_mem_h uct[0]; /* Valid memory handles, as popcount(md_map) */ @@ -78,10 +97,13 @@ typedef struct ucp_mem_desc { void ucp_rkey_resolve_inner(ucp_rkey_h rkey, ucp_ep_h ep); -ucp_lane_index_t ucp_rkey_get_rma_bw_lane(ucp_rkey_h rkey, ucp_ep_h ep, - uct_memory_type_t mem_type, - uct_rkey_t *uct_rkey_p, - ucp_lane_map_t ignore); +ucp_lane_index_t ucp_rkey_find_rma_lane(ucp_context_h context, + const ucp_ep_config_t *config, + ucs_memory_type_t mem_type, + const ucp_lane_index_t *lanes, + 
ucp_rkey_h rkey, + ucp_lane_map_t ignore, + uct_rkey_t *uct_rkey_p); ucs_status_t ucp_reg_mpool_malloc(ucs_mpool_t *mp, size_t *size_p, void **chunk_p); @@ -119,32 +141,45 @@ void ucp_frag_mpool_free(ucs_mpool_t *mp, void *chunk); */ ucs_status_t ucp_mem_rereg_mds(ucp_context_h context, ucp_md_map_t reg_md_map, void *address, size_t length, unsigned uct_flags, - uct_md_h alloc_md, uct_memory_type_t mem_type, + uct_md_h alloc_md, ucs_memory_type_t mem_type, uct_mem_h *alloc_md_memh_p, uct_mem_h *uct_memh, ucp_md_map_t *md_map_p); size_t ucp_rkey_packed_size(ucp_context_h context, ucp_md_map_t md_map); void ucp_rkey_packed_copy(ucp_context_h context, ucp_md_map_t md_map, - uct_memory_type_t mem_type, void *rkey_buffer, + ucs_memory_type_t mem_type, void *rkey_buffer, const void* uct_rkeys[]); ssize_t ucp_rkey_pack_uct(ucp_context_h context, ucp_md_map_t md_map, - const uct_mem_h *memh, uct_memory_type_t mem_type, + const uct_mem_h *memh, ucs_memory_type_t mem_type, void *rkey_buffer); void ucp_rkey_dump_packed(const void *rkey_buffer, char *buffer, size_t max); ucs_status_t ucp_mem_type_reg_buffers(ucp_worker_h worker, void *remote_addr, - size_t length, uct_memory_type_t mem_type, - unsigned md_index, uct_mem_h *memh, + size_t length, ucs_memory_type_t mem_type, + ucp_md_index_t md_index, uct_mem_h *memh, ucp_md_map_t *md_map, uct_rkey_bundle_t *rkey_bundle); -void ucp_mem_type_unreg_buffers(ucp_worker_h worker, uct_memory_type_t mem_type, - uct_mem_h *memh, ucp_md_map_t *md_map, +void ucp_mem_type_unreg_buffers(ucp_worker_h worker, ucs_memory_type_t mem_type, + ucp_md_index_t md_index, uct_mem_h *memh, + ucp_md_map_t *md_map, uct_rkey_bundle_t *rkey_bundle); +static UCS_F_ALWAYS_INLINE ucp_md_map_t +ucp_rkey_packed_md_map(const void *rkey_buffer) +{ + return *(const ucp_md_map_t*)rkey_buffer; +} + +static UCS_F_ALWAYS_INLINE ucs_memory_type_t +ucp_rkey_packed_mem_type(const void *rkey_buffer) +{ + return (ucs_memory_type_t)(*(uint8_t *)((const 
ucp_md_map_t*)rkey_buffer + 1)); +} + static UCS_F_ALWAYS_INLINE uct_mem_h ucp_memh_map2uct(const uct_mem_h *uct, ucp_md_map_t md_map, ucp_md_index_t md_idx) { @@ -164,38 +199,41 @@ ucp_memh2uct(ucp_mem_h memh, ucp_md_index_t md_idx) #define UCP_RKEY_RESOLVE_NOCHECK(_rkey, _ep, _op_type) \ ({ \ - ucs_status_t status = UCS_OK; \ + ucs_status_t _status_nc = UCS_OK; \ if (ucs_unlikely((_ep)->cfg_index != (_rkey)->cache.ep_cfg_index)) { \ ucp_rkey_resolve_inner(_rkey, _ep); \ } \ if (ucs_unlikely((_rkey)->cache._op_type##_lane == UCP_NULL_LANE)) { \ ucs_error("remote memory is unreachable (remote md_map 0x%lx)", \ (_rkey)->md_map); \ - status = UCS_ERR_UNREACHABLE; \ + _status_nc = UCS_ERR_UNREACHABLE; \ } \ - status; \ + _status_nc; \ }) #if ENABLE_PARAMS_CHECK #define UCP_RKEY_RESOLVE(_rkey, _ep, _op_type) \ ({ \ - ucs_status_t status; \ + ucs_status_t _status; \ if ((_rkey)->ep != (_ep)) { \ ucs_error("cannot use a remote key on a different endpoint than it was unpacked on"); \ - status = UCS_ERR_INVALID_PARAM; \ + _status = UCS_ERR_INVALID_PARAM; \ } else { \ - status = UCP_RKEY_RESOLVE_NOCHECK(_rkey, _ep, _op_type); \ + _status = UCP_RKEY_RESOLVE_NOCHECK(_rkey, _ep, _op_type); \ } \ - status; \ + _status; \ }) #else #define UCP_RKEY_RESOLVE UCP_RKEY_RESOLVE_NOCHECK #endif -#define UCP_MEM_IS_HOST(_mem_type) ((_mem_type) == UCT_MD_MEM_TYPE_HOST) -#define UCP_MEM_IS_ROCM(_mem_type) ((_mem_type) == UCT_MD_MEM_TYPE_ROCM) -#define UCP_MEM_IS_CUDA_MANAGED(_mem_type) ((_mem_type) == UCT_MD_MEM_TYPE_CUDA_MANAGED) -#define UCP_MEM_IS_ROCM_MANAGED(_mem_type) ((_mem_type) == UCT_MD_MEM_TYPE_ROCM_MANAGED) +#define UCP_MEM_IS_HOST(_mem_type) ((_mem_type) == UCS_MEMORY_TYPE_HOST) +#define UCP_MEM_IS_ROCM(_mem_type) ((_mem_type) == UCS_MEMORY_TYPE_ROCM) +#define UCP_MEM_IS_CUDA(_mem_type) ((_mem_type) == UCS_MEMORY_TYPE_CUDA) +#define UCP_MEM_IS_CUDA_MANAGED(_mem_type) ((_mem_type) == UCS_MEMORY_TYPE_CUDA_MANAGED) +#define UCP_MEM_IS_ROCM_MANAGED(_mem_type) ((_mem_type) == 
UCS_MEMORY_TYPE_ROCM_MANAGED) +#define UCP_MEM_IS_ACCESSIBLE_FROM_CPU(_mem_type) \ + (UCS_BIT(_mem_type) & UCS_MEMORY_TYPES_CPU_ACCESSIBLE) #endif diff --git a/src/ucp/core/ucp_proxy_ep.c b/src/ucp/core/ucp_proxy_ep.c index 59745b86613..8e384205f37 100644 --- a/src/ucp/core/ucp_proxy_ep.c +++ b/src/ucp/core/ucp_proxy_ep.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ucp_proxy_ep.h" #include "ucp_ep.inl" @@ -150,21 +154,21 @@ UCS_CLASS_INIT_FUNC(ucp_proxy_ep_t, const uct_iface_ops_t *ops, ucp_ep_h ucp_ep, UCP_PROXY_EP_SET_OP(ep_get_address); UCP_PROXY_EP_SET_OP(ep_connect_to_ep); - self->iface.ops.iface_tag_recv_zcopy = (void*)ucp_proxy_ep_fatal; - self->iface.ops.iface_tag_recv_cancel = (void*)ucp_proxy_ep_fatal; - self->iface.ops.ep_create = (void*)ucp_proxy_ep_fatal; - self->iface.ops.iface_flush = (void*)ucp_proxy_ep_fatal; - self->iface.ops.iface_fence = (void*)ucp_proxy_ep_fatal; - self->iface.ops.iface_progress_enable = (void*)ucp_proxy_ep_fatal; - self->iface.ops.iface_progress_disable = (void*)ucp_proxy_ep_fatal; - self->iface.ops.iface_progress = (void*)ucp_proxy_ep_fatal; - self->iface.ops.iface_event_fd_get = (void*)ucp_proxy_ep_fatal; - self->iface.ops.iface_event_arm = (void*)ucp_proxy_ep_fatal; - self->iface.ops.iface_close = (void*)ucp_proxy_ep_fatal; - self->iface.ops.iface_query = (void*)ucp_proxy_ep_fatal; - self->iface.ops.iface_get_device_address = (void*)ucp_proxy_ep_fatal; - self->iface.ops.iface_get_address = (void*)ucp_proxy_ep_fatal; - self->iface.ops.iface_is_reachable = (void*)ucp_proxy_ep_fatal; + self->iface.ops.iface_tag_recv_zcopy = (uct_iface_tag_recv_zcopy_func_t)ucp_proxy_ep_fatal; + self->iface.ops.iface_tag_recv_cancel = (uct_iface_tag_recv_cancel_func_t)ucp_proxy_ep_fatal; + self->iface.ops.ep_create = (uct_ep_create_func_t)ucp_proxy_ep_fatal; + self->iface.ops.iface_flush = (uct_iface_flush_func_t)ucp_proxy_ep_fatal; + self->iface.ops.iface_fence = 
(uct_iface_fence_func_t)ucp_proxy_ep_fatal; + self->iface.ops.iface_progress_enable = (uct_iface_progress_enable_func_t)ucp_proxy_ep_fatal; + self->iface.ops.iface_progress_disable = (uct_iface_progress_disable_func_t)ucp_proxy_ep_fatal; + self->iface.ops.iface_progress = (uct_iface_progress_func_t)ucp_proxy_ep_fatal; + self->iface.ops.iface_event_fd_get = (uct_iface_event_fd_get_func_t)ucp_proxy_ep_fatal; + self->iface.ops.iface_event_arm = (uct_iface_event_arm_func_t)ucp_proxy_ep_fatal; + self->iface.ops.iface_close = (uct_iface_close_func_t)ucp_proxy_ep_fatal; + self->iface.ops.iface_query = (uct_iface_query_func_t)ucp_proxy_ep_fatal; + self->iface.ops.iface_get_device_address = (uct_iface_get_device_address_func_t)ucp_proxy_ep_fatal; + self->iface.ops.iface_get_address = (uct_iface_get_address_func_t)ucp_proxy_ep_fatal; + self->iface.ops.iface_is_reachable = (uct_iface_is_reachable_func_t)ucp_proxy_ep_fatal; return UCS_OK; } diff --git a/src/ucp/core/ucp_request.c b/src/ucp/core/ucp_request.c index 56fcb52215d..974685198cd 100644 --- a/src/ucp/core/ucp_request.c +++ b/src/ucp/core/ucp_request.c @@ -1,20 +1,27 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ucp_context.h" #include "ucp_worker.h" #include "ucp_request.inl" -#include +#include #include #include #include +const ucp_request_param_t ucp_request_null_param = { .op_attr_mask = 0 }; + + int ucp_request_is_completed(void *request) { ucp_request_t *req = (ucp_request_t*)request - 1; @@ -64,7 +71,7 @@ ucp_request_release_common(void *request, uint8_t cb_flag, const char *debug_nam ucp_request_t *req = (ucp_request_t*)request - 1; ucp_worker_h UCS_V_UNUSED worker = ucs_container_of(ucs_mpool_obj_owner(req), ucp_worker_t, req_mp); - uint16_t flags; + uint32_t flags; UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); @@ -96,10 +103,18 @@ UCS_PROFILE_FUNC_VOID(ucp_request_free, (request), void *request) ucp_request_release_common(request, UCP_REQUEST_FLAG_CALLBACK, "free"); } +UCS_PROFILE_FUNC(void*, ucp_request_alloc, + (worker), + ucp_worker_h worker) +{ + return NULL; +} + UCS_PROFILE_FUNC_VOID(ucp_request_cancel, (worker, request), ucp_worker_h worker, void *request) { ucp_request_t *req = (ucp_request_t*)request - 1; + int removed; if (req->flags & UCP_REQUEST_FLAG_COMPLETED) { return; @@ -108,9 +123,9 @@ UCS_PROFILE_FUNC_VOID(ucp_request_cancel, (worker, request), if (req->flags & UCP_REQUEST_FLAG_EXPECTED) { UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); - ucp_tag_exp_remove(&worker->tm, req); + removed = ucp_tag_exp_remove(&worker->tm, req); /* If tag posted to the transport need to wait its completion */ - if (!(req->flags & UCP_REQUEST_FLAG_OFFLOADED)) { + if (removed && !(req->flags & UCP_REQUEST_FLAG_OFFLOADED)) { ucp_request_complete_tag_recv(req, UCS_ERR_CANCELED); } @@ -175,10 +190,10 @@ int ucp_request_pending_add(ucp_request_t *req, ucs_status_t *req_status, /* Could not add, try to send again */ return 0; } + /* Unexpected error while adding to pending */ - ucs_assert(status != UCS_INPROGRESS); - *req_status = status; - return 1; + ucs_fatal("invalid return status from 
uct_ep_pending_add(): %s", + ucs_status_string(status)); } static void ucp_request_dt_dereg(ucp_context_t *context, ucp_dt_reg_t *dt_reg, @@ -189,7 +204,7 @@ static void ucp_request_dt_dereg(ucp_context_t *context, ucp_dt_reg_t *dt_reg, for (i = 0; i < count; ++i) { ucp_trace_req(req_dbg, "mem dereg buffer %ld/%ld md_map 0x%"PRIx64, i, count, dt_reg[i].md_map); - ucp_mem_rereg_mds(context, 0, NULL, 0, 0, NULL, UCT_MD_MEM_TYPE_HOST, NULL, + ucp_mem_rereg_mds(context, 0, NULL, 0, 0, NULL, UCS_MEMORY_TYPE_HOST, NULL, dt_reg[i].memh, &dt_reg[i].md_map); ucs_assert(dt_reg[i].md_map == 0); } @@ -199,7 +214,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_request_memory_reg, (context, md_map, buffer, length, datatype, state, mem_type, req_dbg, uct_flags), ucp_context_t *context, ucp_md_map_t md_map, void *buffer, size_t length, ucp_datatype_t datatype, ucp_dt_state_t *state, - uct_memory_type_t mem_type, ucp_request_t *req_dbg, unsigned uct_flags) + ucs_memory_type_t mem_type, ucp_request_t *req_dbg, unsigned uct_flags) { size_t iov_it, iovcnt; const ucp_dt_iov_t *iov; @@ -225,13 +240,14 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_request_memory_reg, case UCP_DATATYPE_IOV: iovcnt = state->dt.iov.iovcnt; iov = buffer; - dt_reg = ucs_malloc(sizeof(*dt_reg) * iovcnt, "iov_dt_reg"); + dt_reg = ((state->dt.iov.dt_reg == NULL) ? 
+ ucs_calloc(iovcnt, sizeof(*dt_reg), "iov_dt_reg") : + state->dt.iov.dt_reg); if (NULL == dt_reg) { status = UCS_ERR_NO_MEMORY; goto err; } for (iov_it = 0; iov_it < iovcnt; ++iov_it) { - dt_reg[iov_it].md_map = 0; if (iov[iov_it].length) { status = ucp_mem_rereg_mds(context, md_map, iov[iov_it].buffer, iov[iov_it].length, flags, NULL, @@ -305,11 +321,28 @@ ucs_status_t ucp_request_test(void *request, ucp_tag_recv_info_t *info) return UCS_INPROGRESS; } +static UCS_F_ALWAYS_INLINE +void ucp_request_init_multi_proto(ucp_request_t *req, + uct_pending_callback_t multi_func, + const char *multi_func_str) +{ + req->send.uct.func = multi_func; + + if (req->flags & (UCP_REQUEST_FLAG_SEND_TAG | + UCP_REQUEST_FLAG_SEND_AM)) { + req->send.msg_proto.message_id = req->send.ep->worker->am_message_id++; + req->send.msg_proto.am_bw_index = 0; + } + + req->send.pending_lane = UCP_NULL_LANE; + UCS_PROFILE_REQUEST_EVENT(req, multi_func_str, req->send.length); +} + ucs_status_t ucp_request_send_start(ucp_request_t *req, ssize_t max_short, size_t zcopy_thresh, size_t zcopy_max, size_t dt_count, const ucp_ep_msg_config_t* msg_config, - const ucp_proto_t *proto) + const ucp_request_send_proto_t *proto) { size_t length = req->send.length; ucs_status_t status; @@ -323,22 +356,20 @@ ucp_request_send_start(ucp_request_t *req, ssize_t max_short, } else if (length < zcopy_thresh) { /* bcopy */ ucp_request_send_state_reset(req, NULL, UCP_REQUEST_SEND_PROTO_BCOPY_AM); - if (length <= msg_config->max_bcopy - proto->only_hdr_size) { - req->send.uct.func = proto->bcopy_single; + ucs_assert(msg_config->max_bcopy >= proto->only_hdr_size); + if (length <= (msg_config->max_bcopy - proto->only_hdr_size)) { + req->send.uct.func = proto->bcopy_single; UCS_PROFILE_REQUEST_EVENT(req, "start_bcopy_single", req->send.length); } else { - req->send.uct.func = proto->bcopy_multi; - req->send.tag.message_id = req->send.ep->worker->tm.am.message_id++; - req->send.tag.am_bw_index = 1; - req->send.pending_lane = 
UCP_NULL_LANE; - UCS_PROFILE_REQUEST_EVENT(req, "start_bcopy_multi", req->send.length); + ucp_request_init_multi_proto(req, proto->bcopy_multi, + "start_bcopy_multi"); } return UCS_OK; } else if (length < zcopy_max) { /* zcopy */ ucp_request_send_state_reset(req, proto->zcopy_completion, UCP_REQUEST_SEND_PROTO_ZCOPY_AM); - status = ucp_request_send_buffer_reg_lane(req, req->send.lane); + status = ucp_request_send_buffer_reg_lane(req, req->send.lane, 0); if (status != UCS_OK) { return status; } @@ -357,13 +388,10 @@ ucp_request_send_start(ucp_request_t *req, ssize_t max_short, } if (multi) { - req->send.uct.func = proto->zcopy_multi; - req->send.tag.message_id = req->send.ep->worker->tm.am.message_id++; - req->send.tag.am_bw_index = 1; - req->send.pending_lane = UCP_NULL_LANE; - UCS_PROFILE_REQUEST_EVENT(req, "start_zcopy_multi", req->send.length); + ucp_request_init_multi_proto(req, proto->zcopy_multi, + "start_zcopy_multi"); } else { - req->send.uct.func = proto->zcopy_single; + req->send.uct.func = proto->zcopy_single; UCS_PROFILE_REQUEST_EVENT(req, "start_zcopy_single", req->send.length); } return UCS_OK; @@ -374,7 +402,14 @@ ucp_request_send_start(ucp_request_t *req, ssize_t max_short, void ucp_request_send_state_ff(ucp_request_t *req, ucs_status_t status) { - if (req->send.state.uct_comp.func) { + /* + * FIXME should not fast-forward requests owned by UCT + */ + ucp_trace_req(req, "fast-forward with status %s", ucs_status_string(status)); + + if (req->send.state.uct_comp.func == ucp_ep_flush_completion) { + ucp_ep_flush_request_ff(req, status); + } else if (req->send.state.uct_comp.func) { req->send.state.dt.offset = req->send.length; req->send.state.uct_comp.count = 0; req->send.state.uct_comp.func(&req->send.state.uct_comp, status); @@ -382,3 +417,22 @@ void ucp_request_send_state_ff(ucp_request_t *req, ucs_status_t status) ucp_request_complete_send(req, status); } } + +ucs_status_t ucp_request_recv_msg_truncated(ucp_request_t *req, size_t length, + size_t 
offset) +{ + ucp_dt_generic_t *dt_gen; + + ucs_debug("message truncated: recv_length %zu offset %zu buffer_size %zu", + length, offset, req->recv.length); + + if (UCP_DT_IS_GENERIC(req->recv.datatype)) { + dt_gen = ucp_dt_generic(req->recv.datatype); + UCS_PROFILE_NAMED_CALL_VOID("dt_finish", dt_gen->ops.finish, + req->recv.state.dt.generic.state); + } + + return UCS_ERR_MESSAGE_TRUNCATED; +} + + diff --git a/src/ucp/core/ucp_request.h b/src/ucp/core/ucp_request.h index 1f895c647ae..9e815b2885c 100644 --- a/src/ucp/core/ucp_request.h +++ b/src/ucp/core/ucp_request.h @@ -1,7 +1,8 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (c) UT-Battelle, LLC. 2015-2017. ALL RIGHTS RESERVED. - * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. + * Copyright (C) Los Alamos National Security, LLC. 2019 ALL RIGHTS RESERVED. + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -17,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -42,11 +44,14 @@ enum { UCP_REQUEST_FLAG_OFFLOADED = UCS_BIT(10), UCP_REQUEST_FLAG_BLOCK_OFFLOAD = UCS_BIT(11), UCP_REQUEST_FLAG_STREAM_RECV_WAITALL = UCS_BIT(12), - -#if ENABLE_ASSERT - UCP_REQUEST_FLAG_STREAM_RECV = UCS_BIT(14), - UCP_REQUEST_DEBUG_FLAG_EXTERNAL = UCS_BIT(15) + UCP_REQUEST_FLAG_SEND_AM = UCS_BIT(13), + UCP_REQUEST_FLAG_SEND_TAG = UCS_BIT(14), + UCP_REQUEST_FLAG_RNDV_FRAG = UCS_BIT(15), +#if UCS_ENABLE_ASSERT + UCP_REQUEST_FLAG_STREAM_RECV = UCS_BIT(16), + UCP_REQUEST_DEBUG_FLAG_EXTERNAL = UCS_BIT(17) #else + UCP_REQUEST_FLAG_STREAM_RECV = 0, UCP_REQUEST_DEBUG_FLAG_EXTERNAL = 0 #endif }; @@ -73,7 +78,12 @@ enum { UCP_RECV_DESC_FLAG_EAGER_ONLY = UCS_BIT(2), /* Eager tag message with single fragment */ UCP_RECV_DESC_FLAG_EAGER_SYNC = UCS_BIT(3), /* Eager tag message which requires reply */ UCP_RECV_DESC_FLAG_EAGER_OFFLOAD = UCS_BIT(4), /* Eager tag from offload */ - UCP_RECV_DESC_FLAG_RNDV = UCS_BIT(5) /* Rendezvous request */ + UCP_RECV_DESC_FLAG_EAGER_LAST = UCS_BIT(5), /* Last fragment of eager tag message. + Used by tag offload protocol. */ + UCP_RECV_DESC_FLAG_RNDV = UCS_BIT(6), /* Rendezvous request */ + UCP_RECV_DESC_FLAG_MALLOC = UCS_BIT(7) /* Descriptor was allocated with malloc + and must be freed, not returned to the + memory pool or UCT */ }; @@ -90,33 +100,41 @@ enum { * Request in progress. 
*/ struct ucp_request { - ucs_status_t status; /* Operation status */ - uint16_t flags; /* Request flags */ + ucs_status_t status; /* Operation status */ + uint32_t flags; /* Request flags */ + void *user_data; /* Completion user data */ union { - /* "send" part - used for tag_send, stream_send, put, get, and atomic + /* "send" part - used for tag_send, am_send, stream_send, put, get, and atomic * operations */ struct { - ucp_ep_h ep; - void *buffer; /* Send buffer */ - ucp_datatype_t datatype; /* Send type */ - size_t length; /* Total length, in bytes */ - uct_memory_type_t mem_type; /* Memory type */ - ucp_send_callback_t cb; /* Completion callback */ + ucp_ep_h ep; + void *buffer; /* Send buffer */ + ucp_datatype_t datatype; /* Send type */ + size_t length; /* Total length, in bytes */ + ucs_memory_type_t mem_type; /* Memory type */ + ucp_send_nbx_callback_t cb; /* Completion callback */ union { - ucp_wireup_msg_t wireup; - /* Tagged send */ struct { - ucp_tag_t tag; - uint64_t message_id; /* message ID used in AM */ - ucp_lane_index_t am_bw_index; /* AM BW lane index */ - uintptr_t rreq_ptr; /* receive request ptr on the - recv side (used in AM rndv) */ - } tag; + ucp_lane_index_t am_bw_index; /* AM BW lane index */ + uint64_t message_id; /* used to identify matching parts + of a large message */ + + struct { + ucp_tag_t tag; + uintptr_t rreq_ptr; /* receive request ptr on the + recv side (used in AM rndv) */ + } tag; + + struct { + uint16_t am_id; + unsigned flags; + } am; + } msg_proto; struct { uint64_t remote_addr; /* Remote address */ @@ -125,6 +143,7 @@ struct ucp_request { struct { uintptr_t remote_request; /* pointer to the send request on receiver side */ + ucp_request_t *sreq; /* original send request of frag put */ uint8_t am_id; ucs_status_t status; ucp_tag_t sender_tag; /* Sender tag, which is sent back in sync ack */ @@ -137,25 +156,38 @@ struct ucp_request { } proxy; struct { - uint64_t remote_address; /* address of the sender's data buffer */ - 
uintptr_t remote_request; /* pointer to the sender's send request */ - ucp_request_t *rreq; /* receive request on the recv side */ - ucp_rkey_h rkey; /* key for remote send buffer */ - ucp_lane_map_t lanes_map; /* used lanes map */ - ucp_lane_index_t lane_count; /* number of lanes used in transaction */ + uint64_t remote_address; /* address of the sender's data buffer */ + uintptr_t remote_request; /* pointer to the sender's request */ + ucp_request_t *rreq; /* receive request on the recv side */ + ucp_rkey_h rkey; /* key for remote send buffer */ + ucp_lane_map_t lanes_map_avail; /* used lanes map */ + ucp_lane_map_t lanes_map_all; /* actual lanes map */ + uint8_t lanes_count; /* actual lanes count */ + uint8_t rkey_index[UCP_MAX_LANES]; + } rndv_get; struct { - uint64_t remote_address; /* address of the receiver's data buffer */ - uintptr_t remote_request; /* pointer to the receiver's receive request */ - ucp_request_t *sreq; /* send request on the send side */ - ucp_rkey_h rkey; /* key for remote receive buffer */ - uct_rkey_t uct_rkey; /* UCT remote key */ + uint64_t remote_address; /* address of the receiver's data buffer */ + uintptr_t remote_request; /* pointer to the receiver's receive request */ + ucp_request_t *sreq; /* send request on the send side */ + ucp_rkey_h rkey; /* key for remote receive buffer */ + uct_rkey_t uct_rkey; /* UCT remote key */ } rndv_put; + struct { + ucs_queue_elem_t queue_elem; + uintptr_t remote_request; /* pointer to the sender's request */ + ucp_request_t *rreq; /* receive request on the recv side */ + ucp_rkey_h rkey; /* key for remote send buffer */ + } rkey_ptr; + struct { uintptr_t remote_request; /* pointer to the send request on receiver side */ - ucp_request_t *rreq; + ucp_request_t *rreq; /* pointer to the receive request */ + size_t length; /* the length of the data that should be fetched + * from sender side */ + size_t offset; /* offset in recv buffer */ } rndv_rtr; struct { @@ -168,7 +200,8 @@ struct ucp_request { 
this request is waiting for */ uint8_t sw_started; uint8_t sw_done; - ucp_lane_map_t lanes; /* Which lanes need to be flushed */ + uint8_t num_lanes; /* How many lanes are being flushed */ + ucp_lane_map_t started_lanes;/* Which lanes need were flushed */ } flush; struct { @@ -221,39 +254,54 @@ struct ucp_request { void *buffer; /* Buffer to receive data to */ ucp_datatype_t datatype; /* Receive type */ size_t length; /* Total length, in bytes */ - uct_memory_type_t mem_type; /* Memory type */ + ucs_memory_type_t mem_type; /* Memory type */ ucp_dt_state_t state; ucp_worker_t *worker; uct_tag_context_t uct_ctx; /* Transport offload context */ union { struct { - ucp_tag_t tag; /* Expected tag */ - ucp_tag_t tag_mask; /* Expected tag mask */ - uint64_t sn; /* Tag match sequence */ - ucp_tag_recv_callback_t cb; /* Completion callback */ - ucp_tag_recv_info_t info; /* Completion info to fill */ - ucp_mem_desc_t *rdesc; /* Offload bounce buffer */ - ssize_t remaining; /* How much more data to be received */ - ucp_worker_iface_t *wiface; /* Cached iface this request - is received on. Used in - tag offload expected callbacks*/ + ucp_tag_t tag; /* Expected tag */ + ucp_tag_t tag_mask; /* Expected tag mask */ + uint64_t sn; /* Tag match sequence */ + ucp_tag_recv_nbx_callback_t cb; /* Completion callback */ + ucp_tag_recv_info_t info; /* Completion info to fill */ + ssize_t remaining; /* How much more data + * to be received */ + + /* Can use union, because rdesc is used in expected flow, + * while non_contig_buf is used in unexpected flow only. */ + union { + ucp_mem_desc_t *rdesc; /* Offload bounce buffer */ + void *non_contig_buf; /* Used for assembling + multi-fragment + non-contig unexpected + message in tag offload flow. */ + }; + ucp_worker_iface_t *wiface; /* Cached iface this request + is received on. 
Used in + tag offload expected callbacks*/ } tag; struct { - ucp_stream_recv_callback_t cb; /* Completion callback */ - size_t offset; /* Receive data offset */ - size_t length; /* Completion info to fill */ + ucp_request_t *rreq; /* recv request on recv side */ + size_t offset; /* offset in recv buffer */ + } frag; + + struct { + ucp_stream_recv_nbx_callback_t cb; /* Completion callback */ + size_t offset; /* Receive data offset */ + size_t length; /* Completion info to fill */ } stream; }; } recv; struct { - ucp_worker_h worker; /* Worker to flush */ - ucp_send_callback_t cb; /* Completion callback */ - uct_worker_cb_id_t prog_id; /* Progress callback ID */ - int comp_count; /* Countdown to request completion */ - ucp_ep_ext_gen_t *next_ep; /* Next endpoint to flush */ + ucp_worker_h worker; /* Worker to flush */ + ucp_send_nbx_callback_t cb; /* Completion callback */ + uct_worker_cb_id_t prog_id; /* Progress callback ID */ + int comp_count; /* Countdown to request completion */ + ucp_ep_ext_gen_t *next_ep; /* Next endpoint to flush */ } flush_worker; struct { @@ -282,24 +330,41 @@ struct ucp_request { */ struct ucp_recv_desc { union { - ucs_list_link_t tag_list[2]; /* Hash list TAG-element */ - ucs_queue_elem_t stream_queue; /* Queue STREAM-element */ - ucs_queue_elem_t tag_frag_queue; /* Tag fragments queue */ + ucs_list_link_t tag_list[2]; /* Hash list TAG-element */ + ucs_queue_elem_t stream_queue; /* Queue STREAM-element */ + ucs_queue_elem_t tag_frag_queue; /* Tag fragments queue */ + ucp_am_first_desc_t am_first; /* AM first fragment data needed + for assembling the message */ + ucs_queue_elem_t am_mid_queue; /* AM middle fragments queue */ }; - uint32_t length; /* Received length */ - uint32_t payload_offset; /* Offset from end of the descriptor - * to AM data */ - uint16_t flags; /* Flags */ - int16_t priv_length; /* Number of bytes consumed from - headroom private space, except the - space needed for ucp_recv_desc itself. 
- It is used for releasing descriptor - back to UCT only */ + uint32_t length; /* Received length */ + uint32_t payload_offset; /* Offset from end of the descriptor + * to AM data */ + uint16_t flags; /* Flags */ + int16_t uct_desc_offset; /* Offset which needs to be + substructed from rdesc when + releasing it back to UCT */ +}; + + +/** + * Defines protocol functions for ucp_request_send_start() function. + * TODO will be removed when switching to new protocols implementation. + */ +struct ucp_request_send_proto { + uct_pending_callback_t contig_short; /**< Progress short data */ + uct_pending_callback_t bcopy_single; /**< Progress bcopy single fragment */ + uct_pending_callback_t bcopy_multi; /**< Progress bcopy multi-fragment */ + uct_pending_callback_t zcopy_single; /**< Progress zcopy single fragment */ + uct_pending_callback_t zcopy_multi; /**< Progress zcopy multi-fragment */ + uct_completion_callback_t zcopy_completion; /**< Callback for UCT zcopy completion */ + size_t only_hdr_size; /**< Header size for single / short */ }; extern ucs_mpool_ops_t ucp_request_mpool_ops; extern ucs_mpool_ops_t ucp_rndv_get_mpool_ops; +extern const ucp_request_param_t ucp_request_null_param; int ucp_request_pending_add(ucp_request_t *req, ucs_status_t *req_status, @@ -307,7 +372,7 @@ int ucp_request_pending_add(ucp_request_t *req, ucs_status_t *req_status, ucs_status_t ucp_request_memory_reg(ucp_context_t *context, ucp_md_map_t md_map, void *buffer, size_t length, ucp_datatype_t datatype, - ucp_dt_state_t *state, uct_memory_type_t mem_type, + ucp_dt_state_t *state, ucs_memory_type_t mem_type, ucp_request_t *req_dbg, unsigned uct_flags); void ucp_request_memory_dereg(ucp_context_t *context, ucp_datatype_t datatype, @@ -317,9 +382,12 @@ ucs_status_t ucp_request_send_start(ucp_request_t *req, ssize_t max_short, size_t zcopy_thresh, size_t zcopy_max, size_t dt_count, const ucp_ep_msg_config_t* msg_config, - const ucp_proto_t *proto); + const ucp_request_send_proto_t *proto); /* 
Fast-forward to data end */ void ucp_request_send_state_ff(ucp_request_t *req, ucs_status_t status); +ucs_status_t ucp_request_recv_msg_truncated(ucp_request_t *req, size_t length, + size_t offset); + #endif diff --git a/src/ucp/core/ucp_request.inl b/src/ucp/core/ucp_request.inl index 674ab220c62..e978f6f1931 100644 --- a/src/ucp/core/ucp_request.inl +++ b/src/ucp/core/ucp_request.inl @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -71,11 +71,61 @@ } \ } -#define ucp_request_set_callback(_req, _cb, _value) \ +#define ucp_request_set_callback(_req, _cb, _cb_value, _user_data) \ { \ - (_req)->_cb = _value; \ - (_req)->flags |= UCP_REQUEST_FLAG_CALLBACK; \ - ucs_trace_data("request %p %s set to %p", _req, #_cb, _value); \ + (_req)->_cb = _cb_value; \ + (_req)->user_data = _user_data; \ + (_req)->flags |= UCP_REQUEST_FLAG_CALLBACK; \ + ucs_trace_data("request %p %s set to %p, user data: %p", \ + _req, #_cb, _cb_value, _user_data); \ + } + + +#define ucp_request_get_param(_worker, _param, _failed) \ + ({ \ + ucp_request_t *__req; \ + if (!((_param)->op_attr_mask & UCP_OP_ATTR_FIELD_REQUEST)) { \ + __req = ucp_request_get(_worker); \ + if (ucs_unlikely((__req) == NULL)) { \ + _failed; \ + } \ + } else { \ + __req = ((ucp_request_t*)(_param)->request) - 1; \ + } \ + __req; \ + }) + + +#define ucp_request_put_param(_param, _req) \ + if (!((_param)->op_attr_mask & UCP_OP_ATTR_FIELD_REQUEST)) { \ + ucp_request_put(_req); \ + } + + +#define ucp_request_cb_param(_param, _req, _cb, ...) \ + if ((_param)->op_attr_mask & UCP_OP_ATTR_FIELD_CALLBACK) { \ + param->cb._cb(req + 1, status, ##__VA_ARGS__, param->user_data); \ + } + + +#define ucp_request_imm_cmpl_param(_param, _req, _status, _cb, ...) 
\ + if ((_param)->op_attr_mask & UCP_OP_ATTR_FLAG_NO_IMM_CMPL) { \ + ucp_request_cb_param(_param, _req, _cb, ##__VA_ARGS__); \ + ucs_trace_req("request %p completed, but immediate completion is " \ + "prohibited, status %s", _req, \ + ucs_status_string(_status)); \ + return (_req) + 1; \ + } \ + ucp_request_put_param(_param, _req); \ + return UCS_STATUS_PTR(_status); + + +#define ucp_request_set_send_callback_param(_param, _req, _cb) \ + if ((_param)->op_attr_mask & UCP_OP_ATTR_FIELD_CALLBACK) { \ + ucp_request_set_callback(_req, _cb.cb, (_param)->cb.send, \ + ((_param)->op_attr_mask & \ + UCP_OP_ATTR_FIELD_USER_DATA) ? \ + (_param)->user_data : NULL); \ } @@ -94,7 +144,7 @@ ucp_request_complete_send(ucp_request_t *req, ucs_status_t status) req, req + 1, UCP_REQUEST_FLAGS_ARG(req->flags), ucs_status_string(status)); UCS_PROFILE_REQUEST_EVENT(req, "complete_send", status); - ucp_request_complete(req, send.cb, status); + ucp_request_complete(req, send.cb, status, req->user_data); } static UCS_F_ALWAYS_INLINE void @@ -106,7 +156,8 @@ ucp_request_complete_tag_recv(ucp_request_t *req, ucs_status_t status) req->recv.tag.info.sender_tag, req->recv.tag.info.length, ucs_status_string(status)); UCS_PROFILE_REQUEST_EVENT(req, "complete_recv", status); - ucp_request_complete(req, recv.tag.cb, status, &req->recv.tag.info); + ucp_request_complete(req, recv.tag.cb, status, &req->recv.tag.info, + req->user_data); } static UCS_F_ALWAYS_INLINE void @@ -118,7 +169,7 @@ ucp_request_complete_stream_recv(ucp_request_t *req, ucp_ep_ext_proto_t* ep_ext, ucs_queue_pull_elem_non_empty(&ep_ext->stream.match_q, ucp_request_t, recv.queue); ucs_assert(check_req == req); - ucs_assert(req->recv.stream.offset > 0); + ucs_assert((req->recv.stream.offset > 0) || UCS_STATUS_IS_ERR(status)); req->recv.stream.length = req->recv.stream.offset; ucs_trace_req("completing stream receive request %p (%p) " @@ -126,7 +177,8 @@ ucp_request_complete_stream_recv(ucp_request_t *req, ucp_ep_ext_proto_t* ep_ext, 
req, req + 1, UCP_REQUEST_FLAGS_ARG(req->flags), req->recv.stream.length, ucs_status_string(status)); UCS_PROFILE_REQUEST_EVENT(req, "complete_recv", status); - ucp_request_complete(req, recv.stream.cb, status, req->recv.stream.length); + ucp_request_complete(req, recv.stream.cb, status, req->recv.stream.length, + req->user_data); } static UCS_F_ALWAYS_INLINE int @@ -166,6 +218,8 @@ ucp_request_try_send(ucp_request_t *req, ucs_status_t *req_status, { ucs_status_t status; + /* coverity wrongly resolves (*req).send.uct.func to test_uct_pending::pending_send_op_ok */ + /* coverity[address_free] */ status = req->send.uct.func(&req->send.uct); if (status == UCS_OK) { /* Completed the operation */ @@ -326,8 +380,10 @@ ucp_request_send_state_advance(ucp_request_t *req, case UCP_REQUEST_SEND_PROTO_BCOPY_AM: ucs_assert(new_dt_state != NULL); if (UCP_DT_IS_CONTIG(req->send.datatype)) { + /* cppcheck-suppress nullPointer */ req->send.state.dt.offset = new_dt_state->offset; } else { + /* cppcheck-suppress nullPointer */ req->send.state.dt = *new_dt_state; } break; @@ -341,29 +397,61 @@ ucp_request_send_state_advance(ucp_request_t *req, } static UCS_F_ALWAYS_INLINE ucs_status_t -ucp_request_send_buffer_reg(ucp_request_t *req, ucp_md_map_t md_map) +ucp_request_send_buffer_reg(ucp_request_t *req, ucp_md_map_t md_map, + unsigned uct_flags) { return ucp_request_memory_reg(req->send.ep->worker->context, md_map, (void*)req->send.buffer, req->send.length, req->send.datatype, &req->send.state.dt, - req->send.mem_type, req, 0); + req->send.mem_type, req, uct_flags); +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_request_send_buffer_reg_lane_check(ucp_request_t *req, ucp_lane_index_t lane, + ucp_md_map_t prev_md_map, unsigned uct_flags) +{ + ucp_md_map_t md_map; + + if (!(ucp_ep_md_attr(req->send.ep, + lane)->cap.flags & UCT_MD_FLAG_NEED_MEMH)) { + return UCS_OK; + } + + ucs_assert(ucp_ep_md_attr(req->send.ep, + lane)->cap.flags & UCT_MD_FLAG_REG); + md_map = 
UCS_BIT(ucp_ep_md_index(req->send.ep, lane)) | prev_md_map; + return ucp_request_send_buffer_reg(req, md_map, uct_flags); } static UCS_F_ALWAYS_INLINE ucs_status_t -ucp_request_send_buffer_reg_lane(ucp_request_t *req, ucp_lane_index_t lane) +ucp_request_send_buffer_reg_lane(ucp_request_t *req, ucp_lane_index_t lane, + unsigned uct_flags) { - ucp_md_map_t md_map = UCS_BIT(ucp_ep_md_index(req->send.ep, lane)); - return ucp_request_send_buffer_reg(req, md_map); + return ucp_request_send_buffer_reg_lane_check(req, lane, 0, uct_flags); } static UCS_F_ALWAYS_INLINE ucs_status_t ucp_send_request_add_reg_lane(ucp_request_t *req, ucp_lane_index_t lane) { - /* add new lane to registration map */ - ucp_md_map_t md_map = UCS_BIT(ucp_ep_md_index(req->send.ep, lane)) | - req->send.state.dt.dt.contig.md_map; + /* Add new lane to registration map */ + ucp_md_map_t md_map; + + if (ucs_likely(UCP_DT_IS_CONTIG(req->send.datatype))) { + md_map = req->send.state.dt.dt.contig.md_map; + } else if (UCP_DT_IS_IOV(req->send.datatype) && + (req->send.state.dt.dt.iov.dt_reg != NULL)) { + /* dt_reg can be NULL if underlying UCT TL doesn't require + * memory handle for for local AM/GET/PUT operations + * (i.e. 
UCT_MD_FLAG_NEED_MEMH is not set) */ + /* Can use the first DT registration element, since + * they have the same MD maps */ + md_map = req->send.state.dt.dt.iov.dt_reg[0].md_map; + } else { + md_map = 0; + } + ucs_assert(ucs_popcount(md_map) <= UCP_MAX_OP_MDS); - return ucp_request_send_buffer_reg(req, md_map); + return ucp_request_send_buffer_reg_lane_check(req, lane, md_map, 0); } static UCS_F_ALWAYS_INLINE ucs_status_t @@ -373,7 +461,8 @@ ucp_request_recv_buffer_reg(ucp_request_t *req, ucp_md_map_t md_map, return ucp_request_memory_reg(req->recv.worker->context, md_map, req->recv.buffer, length, req->recv.datatype, &req->recv.state, - req->recv.mem_type, req, 0); + req->recv.mem_type, req, + UCT_MD_MEM_FLAG_HIDE_ERRORS); } static UCS_F_ALWAYS_INLINE void ucp_request_send_buffer_dereg(ucp_request_t *req) @@ -396,6 +485,19 @@ ucp_request_wait_uct_comp(ucp_request_t *req) } } +static UCS_F_ALWAYS_INLINE void +ucp_request_unpack_contig(ucp_request_t *req, void *buf, const void *data, + size_t length) +{ + if (ucs_likely(UCP_MEM_IS_ACCESSIBLE_FROM_CPU(req->recv.mem_type))) { + UCS_PROFILE_NAMED_CALL("memcpy_recv", ucs_memcpy_relaxed, buf, + data, length); + } else { + ucp_mem_type_unpack(req->recv.worker, buf, data, length, + req->recv.mem_type); + } +} + /** * Unpack receive data to a request * @@ -419,28 +521,15 @@ ucp_request_recv_data_unpack(ucp_request_t *req, const void *data, req->recv.length, length, offset, last ? 
"yes" : "no"); if (ucs_unlikely((length + offset) > req->recv.length)) { - ucs_debug("message truncated: recv_length %zu offset %zu buffer_size %zu", - length, offset, req->recv.length); - if (UCP_DT_IS_GENERIC(req->recv.datatype)) { - dt_gen = ucp_dt_generic(req->recv.datatype); - UCS_PROFILE_NAMED_CALL_VOID("dt_finish", dt_gen->ops.finish, - req->recv.state.dt.generic.state); - } - return UCS_ERR_MESSAGE_TRUNCATED; + return ucp_request_recv_msg_truncated(req, length, offset); } switch (req->recv.datatype & UCP_DATATYPE_CLASS_MASK) { case UCP_DATATYPE_CONTIG: - if ((ucs_likely(UCP_MEM_IS_HOST(req->recv.mem_type))) || - (ucs_likely(UCP_MEM_IS_CUDA_MANAGED(req->recv.mem_type))) || - (ucs_likely(UCP_MEM_IS_ROCM_MANAGED(req->recv.mem_type)))) { - UCS_PROFILE_NAMED_CALL("memcpy_recv", memcpy, req->recv.buffer + offset, - data, length); - } else { - ucp_mem_type_unpack(req->recv.worker, req->recv.buffer + offset, - data, length, req->recv.mem_type); - } - return UCS_OK;; + ucp_request_unpack_contig(req, + UCS_PTR_BYTE_OFFSET(req->recv.buffer, offset), + data, length); + return UCS_OK; case UCP_DATATYPE_IOV: if (offset != req->recv.state.offset) { @@ -476,7 +565,7 @@ ucp_request_recv_data_unpack(ucp_request_t *req, const void *data, static UCS_F_ALWAYS_INLINE ucs_status_t ucp_recv_desc_init(ucp_worker_h worker, void *data, size_t length, int data_offset, unsigned am_flags, uint16_t hdr_len, - uint16_t rdesc_flags, uint16_t priv_length, + uint16_t rdesc_flags, int priv_length, ucp_recv_desc_t **rdesc_p) { ucp_recv_desc_t *rdesc; @@ -486,11 +575,11 @@ ucp_recv_desc_init(ucp_worker_h worker, void *data, size_t length, if (ucs_unlikely(am_flags & UCT_CB_PARAM_FLAG_DESC)) { /* slowpath */ ucs_assert(priv_length <= UCP_WORKER_HEADROOM_PRIV_SIZE); - data_hdr = UCS_PTR_BYTE_OFFSET(data, -data_offset); - rdesc = (ucp_recv_desc_t *)data_hdr - 1; - rdesc->flags = rdesc_flags | UCP_RECV_DESC_FLAG_UCT_DESC; - rdesc->priv_length = priv_length; - status = UCS_INPROGRESS; + data_hdr = 
UCS_PTR_BYTE_OFFSET(data, -data_offset); + rdesc = (ucp_recv_desc_t *)data_hdr - 1; + rdesc->flags = rdesc_flags | UCP_RECV_DESC_FLAG_UCT_DESC; + rdesc->uct_desc_offset = UCP_WORKER_HEADROOM_PRIV_SIZE - priv_length; + status = UCS_INPROGRESS; } else { rdesc = (ucp_recv_desc_t*)ucs_mpool_get_inline(&worker->am_mp); if (rdesc == NULL) { @@ -515,34 +604,39 @@ ucp_recv_desc_init(ucp_worker_h worker, void *data, size_t length, static UCS_F_ALWAYS_INLINE void ucp_recv_desc_release(ucp_recv_desc_t *rdesc) { + void *uct_desc; + ucs_trace_req("release receive descriptor %p", rdesc); if (ucs_unlikely(rdesc->flags & UCP_RECV_DESC_FLAG_UCT_DESC)) { /* uct desc is slowpath */ - uct_iface_release_desc(UCS_PTR_BYTE_OFFSET(rdesc, - -(UCP_WORKER_HEADROOM_PRIV_SIZE - - rdesc->priv_length))); + uct_desc = UCS_PTR_BYTE_OFFSET(rdesc, -rdesc->uct_desc_offset); + uct_iface_release_desc(uct_desc); } else { ucs_mpool_put_inline(rdesc); } } static UCS_F_ALWAYS_INLINE ucp_lane_index_t -ucp_send_request_get_next_am_bw_lane(ucp_request_t *req) +ucp_send_request_get_am_bw_lane(ucp_request_t *req) { ucp_lane_index_t lane; - /* at least one lane must be initialized */ - ucs_assert(ucp_ep_config(req->send.ep)->key.am_bw_lanes[0] != UCP_NULL_LANE); + lane = ucp_ep_config(req->send.ep)-> + key.am_bw_lanes[req->send.msg_proto.am_bw_index]; + ucs_assertv(lane != UCP_NULL_LANE, "req->send.msg_proto.am_bw_index=%d", + req->send.msg_proto.am_bw_index); + return lane; +} - lane = (req->send.tag.am_bw_index >= UCP_MAX_LANES) ? 
- UCP_NULL_LANE : - ucp_ep_config(req->send.ep)->key.am_bw_lanes[req->send.tag.am_bw_index]; - if (lane != UCP_NULL_LANE) { - req->send.tag.am_bw_index++; - return lane; - } else { - req->send.tag.am_bw_index = 1; - return ucp_ep_config(req->send.ep)->key.am_bw_lanes[0]; +static UCS_F_ALWAYS_INLINE void +ucp_send_request_next_am_bw_lane(ucp_request_t *req) +{ + ucp_lane_index_t am_bw_index = ++req->send.msg_proto.am_bw_index; + ucp_ep_config_t *config = ucp_ep_config(req->send.ep); + + if ((am_bw_index >= UCP_MAX_LANES) || + (config->key.am_bw_lanes[am_bw_index] == UCP_NULL_LANE)) { + req->send.msg_proto.am_bw_index = 0; } } @@ -555,4 +649,18 @@ static UCS_F_ALWAYS_INLINE uintptr_t ucp_request_get_dest_ep_ptr(ucp_request_t * return ucp_ep_dest_ep_ptr(req->send.ep); } +static UCS_F_ALWAYS_INLINE uint32_t +ucp_request_param_flags(const ucp_request_param_t *param) +{ + return (param->op_attr_mask & UCP_OP_ATTR_FIELD_FLAGS) ? + param->flags : 0; +} + +static UCS_F_ALWAYS_INLINE ucp_datatype_t +ucp_request_param_datatype(const ucp_request_param_t *param) +{ + return (param->op_attr_mask & UCP_OP_ATTR_FIELD_DATATYPE) ? + param->datatype : ucp_dt_make_contig(1); +} + #endif diff --git a/src/ucp/core/ucp_rkey.c b/src/ucp/core/ucp_rkey.c index c375b1b9977..419432cd879 100644 --- a/src/ucp/core/ucp_rkey.c +++ b/src/ucp/core/ucp_rkey.c @@ -4,19 +4,25 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ucp_mm.h" #include "ucp_request.h" #include "ucp_ep.inl" #include #include +#include +#include #include static struct { ucp_md_map_t md_map; uint8_t mem_type; -} UCS_S_PACKED ucp_mem_dummy_buffer = {0, UCT_MD_MEM_TYPE_HOST}; +} UCS_S_PACKED ucp_mem_dummy_buffer = {0, UCS_MEMORY_TYPE_HOST}; size_t ucp_rkey_packed_size(ucp_context_h context, ucp_md_map_t md_map) @@ -35,22 +41,22 @@ size_t ucp_rkey_packed_size(ucp_context_h context, ucp_md_map_t md_map) } void ucp_rkey_packed_copy(ucp_context_h context, ucp_md_map_t md_map, - uct_memory_type_t mem_type, void *rkey_buffer, + ucs_memory_type_t mem_type, void *rkey_buffer, const void* uct_rkeys[]) { - void *p = rkey_buffer; + uint8_t *p = rkey_buffer; unsigned md_index; size_t md_size; *(ucp_md_map_t*)p = md_map; p += sizeof(ucp_md_map_t); - *((uint8_t *)p++) = mem_type; + *(p++) = mem_type; ucs_for_each_bit(md_index, md_map) { md_size = context->tl_mds[md_index].attr.rkey_packed_size; ucs_assert_always(md_size <= UINT8_MAX); - *((uint8_t*)p++) = md_size; + *(p++) = md_size; memcpy(p, *uct_rkeys, md_size); p += md_size; ++uct_rkeys; @@ -58,10 +64,10 @@ void ucp_rkey_packed_copy(ucp_context_h context, ucp_md_map_t md_map, } ssize_t ucp_rkey_pack_uct(ucp_context_h context, ucp_md_map_t md_map, - const uct_mem_h *memh, uct_memory_type_t mem_type, + const uct_mem_h *memh, ucs_memory_type_t mem_type, void *rkey_buffer) { - void *p = rkey_buffer; + uint8_t *p = rkey_buffer; ucs_status_t status = UCS_OK; unsigned md_index, uct_memh_index; size_t md_size; @@ -75,14 +81,14 @@ ssize_t ucp_rkey_pack_uct(ucp_context_h context, ucp_md_map_t md_map, p += sizeof(ucp_md_map_t); /* Write memory type */ - UCS_STATIC_ASSERT(UCT_MD_MEM_TYPE_LAST <= 255); - *((uint8_t*)p++) = mem_type; + UCS_STATIC_ASSERT(UCS_MEMORY_TYPE_LAST <= 255); + *(p++) = mem_type; /* Write both size and rkey_buffer for each UCT rkey */ uct_memh_index = 0; ucs_for_each_bit (md_index, md_map) { md_size 
= context->tl_mds[md_index].attr.rkey_packed_size; - *((uint8_t*)p++) = md_size; + *(p++) = md_size; status = uct_md_mkey_pack(context->tl_mds[md_index].md, memh[uct_memh_index], p); if (status != UCS_OK) { @@ -90,14 +96,14 @@ ssize_t ucp_rkey_pack_uct(ucp_context_h context, ucp_md_map_t md_map, } ucs_trace("rkey[%d]=%s for md[%d]=%s", uct_memh_index, - ucs_log_dump_hex(p, md_size, buf, sizeof(buf)), md_index, - context->tl_mds[md_index].rsc.md_name); + ucs_str_dump_hex(p, md_size, buf, sizeof(buf), SIZE_MAX), + md_index, context->tl_mds[md_index].rsc.md_name); ++uct_memh_index; p += md_size; } - return p - rkey_buffer; + return UCS_PTR_BYTE_DIFF(rkey_buffer, p); } ucs_status_t ucp_rkey_pack(ucp_context_h context, ucp_mem_h memh, @@ -164,19 +170,28 @@ void ucp_rkey_buffer_release(void *rkey_buffer) ucs_free(rkey_buffer); } -ucs_status_t ucp_ep_rkey_unpack(ucp_ep_h ep, const void *rkey_buffer, - ucp_rkey_h *rkey_p) +UCS_PROFILE_FUNC(ucs_status_t, ucp_ep_rkey_unpack, (ep, rkey_buffer, rkey_p), + ucp_ep_h ep, const void *rkey_buffer, + ucp_rkey_h *rkey_p) { - ucp_context_t *context = ep->worker->context; + ucp_worker_h worker = ep->worker; + const ucp_ep_config_t *ep_config; unsigned remote_md_index; ucp_md_map_t md_map, remote_md_map; + ucp_rsc_index_t cmpt_index; + ucp_tl_rkey_t *tl_rkey; unsigned rkey_index; unsigned md_count; ucs_status_t status; ucp_rkey_h rkey; - uct_memory_type_t mem_type; + ucs_memory_type_t mem_type; uint8_t md_size; - const void *p; + const uint8_t *p; + uint8_t flags; + + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); + + ep_config = ucp_ep_config(ep); /* Count the number of remote MDs in the rkey buffer */ p = rkey_buffer; @@ -194,33 +209,34 @@ ucs_status_t ucp_ep_rkey_unpack(ucp_ep_h ep, const void *rkey_buffer, * allocations are done from a memory pool. * We keep all of them to handle a future transport switch. 
*/ + flags = 0; if (md_count <= UCP_RKEY_MPOOL_MAX_MD) { - UCP_THREAD_CS_ENTER_CONDITIONAL(&context->mt_lock); - rkey = ucs_mpool_get_inline(&context->rkey_mp); - UCP_THREAD_CS_EXIT_CONDITIONAL(&context->mt_lock); + rkey = ucs_mpool_get_inline(&worker->rkey_mp); + flags = UCP_RKEY_DESC_FLAG_POOL; } else { - rkey = ucs_malloc(sizeof(*rkey) + (sizeof(rkey->uct[0]) * md_count), + rkey = ucs_malloc(sizeof(*rkey) + (sizeof(rkey->tl_rkey[0]) * md_count), "ucp_rkey"); } if (rkey == NULL) { status = UCS_ERR_NO_MEMORY; - goto err; + goto out_unlock; } /* Read memory type */ - mem_type = *((uint8_t*)p++); + mem_type = (ucs_memory_type_t)*(p++); rkey->md_map = md_map; rkey->mem_type = mem_type; + rkey->flags = flags; #if ENABLE_PARAMS_CHECK rkey->ep = ep; #endif /* Unpack rkey of each UCT MD */ - remote_md_index = 0; /* Index of remote MD */ - rkey_index = 0; /* Index of the rkey in the array */ + rkey_index = 0; /* Index of the rkey in the array */ + /* Go over remote MD indices */ ucs_for_each_bit (remote_md_index, remote_md_map) { - md_size = *((uint8_t*)p++); + md_size = *(p++); /* Use bit operations to iterate through the indices of the remote MDs * as provided in the md_map. 
md_map always holds a bitmap of MD indices @@ -234,19 +250,23 @@ ucs_status_t ucp_ep_rkey_unpack(ucp_ep_h ep, const void *rkey_buffer, if (UCS_BIT(remote_md_index) & rkey->md_map) { ucs_assert(rkey_index < md_count); - status = uct_rkey_unpack(p, &rkey->uct[rkey_index]); + tl_rkey = &rkey->tl_rkey[rkey_index]; + cmpt_index = ucp_ep_config_get_dst_md_cmpt(&ep_config->key, + remote_md_index); + tl_rkey->cmpt = worker->context->tl_cmpts[cmpt_index].cmpt; + status = uct_rkey_unpack(tl_rkey->cmpt, p, &tl_rkey->rkey); if (status == UCS_OK) { ucs_trace("rkey[%d] for remote md %d is 0x%lx", rkey_index, - remote_md_index, rkey->uct[rkey_index].rkey); - rkey->md_map |= UCS_BIT(remote_md_index); + remote_md_index, tl_rkey->rkey.rkey); ++rkey_index; } else if (status == UCS_ERR_UNREACHABLE) { rkey->md_map &= ~UCS_BIT(remote_md_index); - ucs_trace("rkey[%d] for remote md %d is 0x%lx not reachable", rkey_index, - remote_md_index, rkey->uct[rkey_index].rkey); + ucs_trace("rkey[%d] for remote md %d is 0x%lx not reachable", + rkey_index, remote_md_index, tl_rkey->rkey.rkey); + /* FIXME this can make malloc allocated key be released to mpool */ } else { - ucs_error("Failed to unpack remote key from remote md[%d]: %s", + ucs_error("failed to unpack remote key from remote md[%d]: %s", remote_md_index, ucs_status_string(status)); goto err_destroy; } @@ -255,20 +275,29 @@ ucs_status_t ucp_ep_rkey_unpack(ucp_ep_h ep, const void *rkey_buffer, p += md_size; } + /* Silence clang checker - assert that if some rkeys are unpacked, then + * rkey->md_map is nozero. 
+ */ + ucs_assert((rkey_index > 0) || (rkey->md_map == 0)); + ucp_rkey_resolve_inner(rkey, ep); *rkey_p = rkey; - return UCS_OK; + status = UCS_OK; + +out_unlock: + UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); + return status; err_destroy: ucp_rkey_destroy(rkey); -err: - return status; + goto out_unlock; } void ucp_rkey_dump_packed(const void *rkey_buffer, char *buffer, size_t max) { - char *p = buffer; - char *endp = buffer + max; + char *p = buffer; + char *endp = buffer + max; + const uint8_t *rkey_buf = rkey_buffer; ucp_md_map_t md_map; unsigned md_index; uint8_t md_size; @@ -277,15 +306,13 @@ void ucp_rkey_dump_packed(const void *rkey_buffer, char *buffer, size_t max) snprintf(p, endp - p, "{"); p += strlen(p); - md_map = *(ucp_md_map_t*)(rkey_buffer); - rkey_buffer += sizeof(ucp_md_map_t); - - rkey_buffer += sizeof(uint8_t); + md_map = *(ucp_md_map_t*)(rkey_buf); + rkey_buf += sizeof(ucp_md_map_t) + sizeof(uint8_t); first = 1; ucs_for_each_bit(md_index, md_map) { - md_size = *((uint8_t*)rkey_buffer); - rkey_buffer += sizeof(uint8_t); + md_size = *rkey_buf; + rkey_buf += sizeof(uint8_t); if (!first) { snprintf(p, endp - p, ","); @@ -296,30 +323,30 @@ void ucp_rkey_dump_packed(const void *rkey_buffer, char *buffer, size_t max) snprintf(p, endp - p, "%d:", md_index); p += strlen(p); - ucs_log_dump_hex(rkey_buffer, md_size, p, endp - p); + ucs_str_dump_hex(rkey_buf, md_size, p, endp - p, SIZE_MAX); p += strlen(p); - rkey_buffer += md_size; + rkey_buf += md_size; } snprintf(p, endp - p, "}"); - p += strlen(p); } ucs_status_t ucp_rkey_ptr(ucp_rkey_h rkey, uint64_t raddr, void **addr_p) { - unsigned num_rkeys; - unsigned i; + unsigned remote_md_index, rkey_index; ucs_status_t status; - num_rkeys = ucs_popcount(rkey->md_map); - - for (i = 0; i < num_rkeys; ++i) { - status = uct_rkey_ptr(&rkey->uct[i], raddr, addr_p); + rkey_index = 0; + ucs_for_each_bit(remote_md_index, rkey->md_map) { + status = uct_rkey_ptr(rkey->tl_rkey[rkey_index].cmpt, + 
&rkey->tl_rkey[rkey_index].rkey, raddr, addr_p); if ((status == UCS_OK) || (status == UCS_ERR_INVALID_ADDR)) { return status; } + + ++rkey_index; } return UCS_ERR_UNREACHABLE; @@ -327,34 +354,34 @@ ucs_status_t ucp_rkey_ptr(ucp_rkey_h rkey, uint64_t raddr, void **addr_p) void ucp_rkey_destroy(ucp_rkey_h rkey) { - ucp_context_h UCS_V_UNUSED context; - unsigned num_rkeys; - unsigned i; - - num_rkeys = ucs_popcount(rkey->md_map); - - for (i = 0; i < num_rkeys; ++i) { - uct_rkey_release(&rkey->uct[i]); + unsigned remote_md_index, rkey_index; + ucp_worker_h UCS_V_UNUSED worker; + + rkey_index = 0; + ucs_for_each_bit(remote_md_index, rkey->md_map) { + uct_rkey_release(rkey->tl_rkey[rkey_index].cmpt, + &rkey->tl_rkey[rkey_index].rkey); + ++rkey_index; } - if (ucs_popcount(rkey->md_map) <= UCP_RKEY_MPOOL_MAX_MD) { - context = ucs_container_of(ucs_mpool_obj_owner(rkey), ucp_context_t, - rkey_mp); - UCP_THREAD_CS_ENTER_CONDITIONAL(&context->mt_lock); + if (rkey->flags & UCP_RKEY_DESC_FLAG_POOL) { + worker = ucs_container_of(ucs_mpool_obj_owner(rkey), ucp_worker_t, + rkey_mp); + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); ucs_mpool_put_inline(rkey); - UCP_THREAD_CS_EXIT_CONDITIONAL(&context->mt_lock); + UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); } else { ucs_free(rkey); } } -static ucp_lane_index_t ucp_config_find_rma_lane(ucp_context_h context, - const ucp_ep_config_t *config, - uct_memory_type_t mem_type, - const ucp_lane_index_t *lanes, - ucp_rkey_h rkey, - ucp_lane_map_t ignore, - uct_rkey_t *uct_rkey_p) +ucp_lane_index_t ucp_rkey_find_rma_lane(ucp_context_h context, + const ucp_ep_config_t *config, + ucs_memory_type_t mem_type, + const ucp_lane_index_t *lanes, + ucp_rkey_h rkey, + ucp_lane_map_t ignore, + uct_rkey_t *uct_rkey_p) { ucp_md_index_t dst_md_index; ucp_lane_index_t lane; @@ -378,7 +405,7 @@ static ucp_lane_index_t ucp_config_find_rma_lane(ucp_context_h context, (!(md_attr->cap.flags & UCT_MD_FLAG_NEED_RKEY))) { /* Lane does not need rkey, can use the 
lane with invalid rkey */ - if (!rkey || ((mem_type == md_attr->cap.mem_type) && + if (!rkey || ((mem_type == md_attr->cap.access_mem_type) && (mem_type == rkey->mem_type))) { *uct_rkey_p = UCT_INVALID_RKEY; return lane; @@ -394,7 +421,7 @@ static ucp_lane_index_t ucp_config_find_rma_lane(ucp_context_h context, if (rkey->md_map & UCS_BIT(dst_md_index)) { /* Return first matching lane */ rkey_index = ucs_bitmap2idx(rkey->md_map, dst_md_index); - *uct_rkey_p = rkey->uct[rkey_index].rkey; + *uct_rkey_p = rkey->tl_rkey[rkey_index].rkey.rkey; return lane; } } @@ -410,10 +437,10 @@ void ucp_rkey_resolve_inner(ucp_rkey_h rkey, ucp_ep_h ep) uct_rkey_t uct_rkey; int rma_sw, amo_sw; - rkey->cache.rma_lane = ucp_config_find_rma_lane(context, config, - UCT_MD_MEM_TYPE_HOST, - config->key.rma_lanes, rkey, - 0, &uct_rkey); + rkey->cache.rma_lane = ucp_rkey_find_rma_lane(context, config, + UCS_MEMORY_TYPE_HOST, + config->key.rma_lanes, rkey, + 0, &uct_rkey); rma_sw = (rkey->cache.rma_lane == UCP_NULL_LANE); if (rma_sw) { rkey->cache.rma_proto = &ucp_rma_sw_proto; @@ -426,10 +453,10 @@ void ucp_rkey_resolve_inner(ucp_rkey_h rkey, ucp_ep_h ep) rkey->cache.max_put_short = config->rma[rkey->cache.rma_lane].max_put_short; } - rkey->cache.amo_lane = ucp_config_find_rma_lane(context, config, - UCT_MD_MEM_TYPE_HOST, - config->key.amo_lanes, rkey, - 0, &uct_rkey); + rkey->cache.amo_lane = ucp_rkey_find_rma_lane(context, config, + UCS_MEMORY_TYPE_HOST, + config->key.amo_lanes, rkey, + 0, &uct_rkey); amo_sw = (rkey->cache.amo_lane == UCP_NULL_LANE); if (amo_sw) { rkey->cache.amo_proto = &ucp_amo_sw_proto; @@ -468,14 +495,3 @@ void ucp_rkey_resolve_inner(ucp_rkey_h rkey, ucp_ep_h ep) rkey->cache.rma_proto->name, rkey->cache.rma_lane, rkey->cache.rma_rkey, rkey->cache.amo_proto->name, rkey->cache.amo_lane, rkey->cache.amo_rkey); } - -ucp_lane_index_t ucp_rkey_get_rma_bw_lane(ucp_rkey_h rkey, ucp_ep_h ep, - uct_memory_type_t mem_type, - uct_rkey_t *uct_rkey_p, - ucp_lane_map_t ignore) -{ - 
ucp_ep_config_t *config = ucp_ep_config(ep); - return ucp_config_find_rma_lane(ep->worker->context, config, mem_type, - config->key.rma_bw_lanes, rkey, - ignore, uct_rkey_p); -} diff --git a/src/ucp/core/ucp_thread.h b/src/ucp/core/ucp_thread.h index ada9dcce3ba..bd66918b312 100644 --- a/src/ucp/core/ucp_thread.h +++ b/src/ucp/core/ucp_thread.h @@ -35,37 +35,40 @@ typedef struct ucp_mt_lock { /* Lock for multithreading support. Either spinlock or mutex is used at at one time. Spinlock is the default option. */ pthread_mutex_t mt_mutex; - ucs_spinlock_t mt_spinlock; + ucs_recursive_spinlock_t mt_spinlock; } lock; } ucp_mt_lock_t; -#if ENABLE_MT - #define UCP_THREAD_IS_REQUIRED(_lock_ptr) \ ((_lock_ptr)->mt_type) -#define UCP_THREAD_LOCK_INIT(_lock_ptr) \ - { \ - if ((_lock_ptr)->mt_type == UCP_MT_TYPE_MUTEX) { \ - pthread_mutex_init(&((_lock_ptr)->lock.mt_mutex), NULL); \ - } else { \ - ucs_spinlock_init(&((_lock_ptr)->lock.mt_spinlock)); \ - } \ - } -#define UCP_THREAD_LOCK_FINALIZE(_lock_ptr) \ - { \ - if ((_lock_ptr)->mt_type == UCP_MT_TYPE_MUTEX) { \ - pthread_mutex_destroy(&((_lock_ptr)->lock.mt_mutex)); \ - } else { \ - ucs_spinlock_destroy(&((_lock_ptr)->lock.mt_spinlock)); \ - } \ - } +#define UCP_THREAD_LOCK_INIT(_lock_ptr) \ + do { \ + if ((_lock_ptr)->mt_type == UCP_MT_TYPE_MUTEX) { \ + pthread_mutex_init(&((_lock_ptr)->lock.mt_mutex), NULL); \ + } else { \ + ucs_recursive_spinlock_init(&((_lock_ptr)->lock.mt_spinlock), 0); \ + } \ + } while (0) +#define UCP_THREAD_LOCK_FINALIZE(_lock_ptr) \ + do { \ + ucs_status_t _status; \ + \ + if ((_lock_ptr)->mt_type == UCP_MT_TYPE_MUTEX) { \ + pthread_mutex_destroy(&((_lock_ptr)->lock.mt_mutex)); \ + } else { \ + _status = ucs_recursive_spinlock_destroy(&((_lock_ptr)->lock.mt_spinlock)); \ + if (_status != UCS_OK) { \ + ucs_warn("ucs_recursive_spinlock_destroy() failed (%d)", _status); \ + } \ + } \ + } while (0) #define UCP_THREAD_CS_ENTER(_lock_ptr) \ { \ if ((_lock_ptr)->mt_type == UCP_MT_TYPE_MUTEX) { \ 
pthread_mutex_lock(&((_lock_ptr)->lock.mt_mutex)); \ } else { \ - ucs_spin_lock(&((_lock_ptr)->lock.mt_spinlock)); \ + ucs_recursive_spin_lock(&((_lock_ptr)->lock.mt_spinlock)); \ } \ } #define UCP_THREAD_CS_EXIT(_lock_ptr) \ @@ -73,32 +76,8 @@ typedef struct ucp_mt_lock { if ((_lock_ptr)->mt_type == UCP_MT_TYPE_MUTEX) { \ pthread_mutex_unlock(&((_lock_ptr)->lock.mt_mutex)); \ } else { \ - ucs_spin_unlock(&((_lock_ptr)->lock.mt_spinlock)); \ + ucs_recursive_spin_unlock(&((_lock_ptr)->lock.mt_spinlock));\ } \ } -#else - -#define UCP_THREAD_IS_REQUIRED(_lock_ptr) 0 -#define UCP_THREAD_LOCK_INIT(_lock_ptr) {} -#define UCP_THREAD_LOCK_FINALIZE(_lock_ptr) {} -#define UCP_THREAD_CS_ENTER(_lock_ptr) {} -#define UCP_THREAD_CS_EXIT(_lock_ptr) {} - -#endif - -#define UCP_THREAD_CS_ENTER_CONDITIONAL(_lock_ptr) \ - { \ - if (UCP_THREAD_IS_REQUIRED(_lock_ptr)) { \ - UCP_THREAD_CS_ENTER(_lock_ptr); \ - } \ - } -#define UCP_THREAD_CS_EXIT_CONDITIONAL(_lock_ptr) \ - { \ - if (UCP_THREAD_IS_REQUIRED(_lock_ptr)) { \ - UCP_THREAD_CS_EXIT(_lock_ptr); \ - } \ - } - - #endif diff --git a/src/ucp/core/ucp_types.h b/src/ucp/core/ucp_types.h index 9a424f53c49..c3920c33182 100644 --- a/src/ucp/core/ucp_types.h +++ b/src/ucp/core/ucp_types.h @@ -1,6 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. - * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -47,10 +47,15 @@ typedef struct ucp_address_iface_attr ucp_address_iface_attr_t; typedef struct ucp_address_entry ucp_address_entry_t; typedef struct ucp_unpacked_address ucp_unpacked_address_t; typedef struct ucp_wireup_ep ucp_wireup_ep_t; -typedef struct ucp_proto ucp_proto_t; +typedef struct ucp_request_send_proto ucp_request_send_proto_t; typedef struct ucp_worker_iface ucp_worker_iface_t; +typedef struct ucp_worker_cm ucp_worker_cm_t; typedef struct ucp_rma_proto ucp_rma_proto_t; typedef struct ucp_amo_proto ucp_amo_proto_t; +typedef struct ucp_wireup_sockaddr_data ucp_wireup_sockaddr_data_t; +typedef struct ucp_ep_config ucp_ep_config_t; +typedef struct ucp_ep_config_key ucp_ep_config_key_t; +typedef struct ucp_proto ucp_proto_t; /** @@ -85,7 +90,12 @@ enum { UCP_AM_ID_ATOMIC_REQ = 20, /* Remote memory atomic request */ UCP_AM_ID_ATOMIC_REP = 21, /* Remote memory atomic reply */ UCP_AM_ID_CMPL = 22, /* Remote memory operation completion */ - + UCP_AM_ID_SINGLE = 23, /* Single fragment user defined AM */ + UCP_AM_ID_FIRST = 24, /* First fragment of user defined AM */ + UCP_AM_ID_MIDDLE = 25, /* Middle or last fragment of user + defined AM */ + UCP_AM_ID_SINGLE_REPLY = 26, /* Single fragment user defined AM + carrying remote ep for reply */ UCP_AM_ID_LAST, UCP_AM_ID_MAX = 32 /* Total IDs available for pre-registration */ }; diff --git a/src/ucp/core/ucp_worker.c b/src/ucp/core/ucp_worker.c index 4ac42893b00..46599237930 100644 --- a/src/ucp/core/ucp_worker.c +++ b/src/ucp/core/ucp_worker.c @@ -1,16 +1,22 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016-2017. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "ucp_am.h" #include "ucp_worker.h" #include "ucp_mm.h" #include "ucp_request.inl" #include +#include #include #include #include @@ -23,26 +29,18 @@ #include #include #include +#include #define UCP_WORKER_HEADROOM_SIZE \ (sizeof(ucp_recv_desc_t) + UCP_WORKER_HEADROOM_PRIV_SIZE) +typedef enum ucp_worker_event_fd_op { + UCP_WORKER_EPFD_OP_ADD, + UCP_WORKER_EPFD_OP_DEL +} ucp_worker_event_fd_op_t; -#if ENABLE_STATS -static ucs_stats_class_t ucp_worker_stats_class = { - .name = "ucp_worker", - .num_counters = UCP_WORKER_STAT_LAST, - .counter_names = { - [UCP_WORKER_STAT_TAG_RX_EAGER_MSG] = "rx_eager_msg", - [UCP_WORKER_STAT_TAG_RX_EAGER_SYNC_MSG] = "rx_sync_msg", - [UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_EXP] = "rx_eager_chunk_exp", - [UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_UNEXP] = "rx_eager_chunk_unexp", - [UCP_WORKER_STAT_TAG_RX_RNDV_EXP] = "rx_rndv_rts_exp", - [UCP_WORKER_STAT_TAG_RX_RNDV_UNEXP] = "rx_rndv_rts_unexp" - } -}; - +#ifdef ENABLE_STATS static ucs_stats_class_t ucp_worker_tm_offload_stats_class = { .name = "tag_offload", .num_counters = UCP_WORKER_STAT_TAG_OFFLOAD_LAST, @@ -56,9 +54,26 @@ static ucs_stats_class_t ucp_worker_tm_offload_stats_class = { [UCP_WORKER_STAT_TAG_OFFLOAD_BLOCK_WILDCARD] = "block_wildcard", [UCP_WORKER_STAT_TAG_OFFLOAD_BLOCK_SW_PEND] = "block_sw_pend", [UCP_WORKER_STAT_TAG_OFFLOAD_BLOCK_NO_IFACE] = "block_no_iface", + [UCP_WORKER_STAT_TAG_OFFLOAD_BLOCK_MEM_REG] = "block_mem_reg", [UCP_WORKER_STAT_TAG_OFFLOAD_RX_UNEXP_EGR] = "rx_unexp_egr", [UCP_WORKER_STAT_TAG_OFFLOAD_RX_UNEXP_RNDV] = "rx_unexp_rndv", - [UCP_WORKER_STAT_TAG_OFFLOAD_RX_UNEXP_SW_RNDV] = "rx_unexp_sw_rndv", + [UCP_WORKER_STAT_TAG_OFFLOAD_RX_UNEXP_SW_RNDV] = "rx_unexp_sw_rndv" + } +}; + +static ucs_stats_class_t ucp_worker_stats_class = { + .name = "ucp_worker", + .num_counters = UCP_WORKER_STAT_LAST, + .counter_names = { + [UCP_WORKER_STAT_TAG_RX_EAGER_MSG] = "rx_eager_msg", + 
[UCP_WORKER_STAT_TAG_RX_EAGER_SYNC_MSG] = "rx_sync_msg", + [UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_EXP] = "rx_eager_chunk_exp", + [UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_UNEXP] = "rx_eager_chunk_unexp", + [UCP_WORKER_STAT_TAG_RX_RNDV_EXP] = "rx_rndv_rts_exp", + [UCP_WORKER_STAT_TAG_RX_RNDV_UNEXP] = "rx_rndv_rts_unexp", + [UCP_WORKER_STAT_TAG_RX_RNDV_GET_ZCOPY] = "rx_rndv_get_zcopy", + [UCP_WORKER_STAT_TAG_RX_RNDV_SEND_RTR] = "rx_rndv_send_rtr", + [UCP_WORKER_STAT_TAG_RX_RNDV_RKEY_PTR] = "rx_rndv_rkey_ptr" } }; #endif @@ -86,40 +101,37 @@ ucs_mpool_ops_t ucp_frag_mpool_ops = { .obj_cleanup = ucs_empty_function }; -void ucp_worker_iface_check_events(ucp_worker_iface_t *wiface, int force); - -static UCS_F_ALWAYS_INLINE double -ucp_worker_iface_latency(ucp_worker_h worker, ucp_worker_iface_t *wiface) -{ - return wiface->attr.latency.overhead + - wiface->attr.latency.growth * worker->context->config.est_num_eps; -} -static ucs_status_t ucp_worker_wakeup_ctl_fd(ucp_worker_h worker, int op, +static ucs_status_t ucp_worker_wakeup_ctl_fd(ucp_worker_h worker, + ucp_worker_event_fd_op_t op, int event_fd) { - struct epoll_event event = {0}; - int ret; + ucs_event_set_type_t events = UCS_EVENT_SET_EVREAD; + ucs_status_t status; if (!(worker->context->config.features & UCP_FEATURE_WAKEUP)) { return UCS_OK; } - memset(&event.data, 0, sizeof(event.data)); - event.data.ptr = worker->user_data; - event.events = EPOLLIN; if (worker->flags & UCP_WORKER_FLAG_EDGE_TRIGGERED) { - event.events |= EPOLLET; + events |= UCS_EVENT_SET_EDGE_TRIGGERED; } - ret = epoll_ctl(worker->epfd, op, event_fd, &event); - if (ret == -1) { - ucs_error("epoll_ctl(epfd=%d, op=%d, fd=%d) failed: %m", worker->epfd, - op, event_fd); - return UCS_ERR_IO_ERROR; + switch (op) { + case UCP_WORKER_EPFD_OP_ADD: + status = ucs_event_set_add(worker->event_set, event_fd, + events, worker->user_data); + break; + case UCP_WORKER_EPFD_OP_DEL: + status = ucs_event_set_del(worker->event_set, event_fd); + break; + default: + 
ucs_bug("Unknown operation (%d) was passed", op); + status = UCS_ERR_INVALID_PARAM; + break; } - return UCS_OK; + return status; } static void ucp_worker_set_am_handlers(ucp_worker_iface_t *wiface, int is_proxy) @@ -196,7 +208,7 @@ static void ucp_worker_remove_am_handlers(ucp_worker_h worker) ucs_debug("worker %p: remove active message handlers", worker); for (iface_id = 0; iface_id < worker->num_ifaces; ++iface_id) { - wiface = &worker->ifaces[iface_id]; + wiface = worker->ifaces[iface_id]; if (!(wiface->attr.cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_AM_ZCOPY))) { @@ -235,11 +247,12 @@ static ucs_status_t ucp_worker_wakeup_init(ucp_worker_h worker, const ucp_worker_params_t *params) { ucp_context_h context = worker->context; - ucp_wakeup_event_t events; + unsigned events; ucs_status_t status; if (!(context->config.features & UCP_FEATURE_WAKEUP)) { - worker->epfd = -1; + worker->event_fd = -1; + worker->event_set = NULL; worker->eventfd = -1; worker->uct_events = 0; status = UCS_OK; @@ -254,15 +267,19 @@ static ucs_status_t ucp_worker_wakeup_init(ucp_worker_h worker, } if (params->field_mask & UCP_WORKER_PARAM_FIELD_EVENT_FD) { - worker->epfd = params->event_fd; - worker->flags |= UCP_WORKER_FLAG_EXTERNAL_EVENT_FD; + worker->flags |= UCP_WORKER_FLAG_EXTERNAL_EVENT_FD; + status = ucs_event_set_create_from_fd(&worker->event_set, + params->event_fd); } else { - worker->epfd = epoll_create(context->num_tls); - if (worker->epfd == -1) { - ucs_error("Failed to create epoll file descriptor: %m"); - status = UCS_ERR_IO_ERROR; - goto out; - } + status = ucs_event_set_create(&worker->event_set); + } + if (status != UCS_OK) { + goto out; + } + + status = ucs_event_set_fd_get(worker->event_set, &worker->event_fd); + if (status != UCS_OK) { + goto err_cleanup_event_set; } if (events & UCP_WAKEUP_EDGE) { @@ -273,10 +290,10 @@ static ucs_status_t ucp_worker_wakeup_init(ucp_worker_h worker, if (worker->eventfd == -1) { ucs_error("Failed to 
create event fd: %m"); status = UCS_ERR_IO_ERROR; - goto err_close_epfd; + goto err_cleanup_event_set; } - ucp_worker_wakeup_ctl_fd(worker, EPOLL_CTL_ADD, worker->eventfd); + ucp_worker_wakeup_ctl_fd(worker, UCP_WORKER_EPFD_OP_ADD, worker->eventfd); worker->uct_events = 0; @@ -287,7 +304,7 @@ static ucs_status_t ucp_worker_wakeup_init(ucp_worker_h worker, */ if ((events & UCP_WAKEUP_TAG_SEND) || ((events & UCP_WAKEUP_TAG_RECV) && - (context->config.ext.rndv_thresh != UCS_CONFIG_MEMUNITS_INF))) + (context->config.ext.rndv_thresh != UCS_MEMUNITS_INF))) { worker->uct_events |= UCT_EVENT_SEND_COMP; } @@ -302,31 +319,68 @@ static ucs_status_t ucp_worker_wakeup_init(ucp_worker_h worker, return UCS_OK; -err_close_epfd: - close(worker->epfd); +err_cleanup_event_set: + ucs_event_set_cleanup(worker->event_set); + worker->event_set = NULL; + worker->event_fd = -1; out: return status; } static void ucp_worker_wakeup_cleanup(ucp_worker_h worker) { - if ((worker->epfd != -1) && - !(worker->flags & UCP_WORKER_FLAG_EXTERNAL_EVENT_FD)) { - close(worker->epfd); + if (worker->event_set != NULL) { + ucs_assert(worker->event_fd != -1); + ucs_event_set_cleanup(worker->event_set); + worker->event_set = NULL; + worker->event_fd = -1; } if (worker->eventfd != -1) { close(worker->eventfd); } } -static void ucp_worker_iface_disarm(ucp_worker_iface_t *wiface) +static UCS_F_ALWAYS_INLINE +int ucp_worker_iface_has_event_notify(const ucp_worker_iface_t *wiface) +{ + return (wiface->attr.cap.event_flags & (UCT_IFACE_FLAG_EVENT_FD | + UCT_IFACE_FLAG_EVENT_ASYNC_CB)); +} + +static UCS_F_ALWAYS_INLINE +int ucp_worker_iface_use_event_fd(const ucp_worker_iface_t *wiface) +{ + /* use iface's fd if it is supported by UCT iface and asynchronous + * callback mechanism isn't supported (this is preferred mechanism, + * since it will be called anyway) */ + return (wiface->attr.cap.event_flags & UCT_IFACE_FLAG_EVENT_FD) && + !(wiface->attr.cap.event_flags & UCT_IFACE_FLAG_EVENT_ASYNC_CB); +} + +static 
UCS_F_ALWAYS_INLINE +int ucp_worker_iface_get_event_fd(const ucp_worker_iface_t *wiface) +{ + ucs_assert(ucp_worker_iface_use_event_fd(wiface)); + return wiface->event_fd; +} + +static UCS_F_ALWAYS_INLINE +void ucp_worker_iface_event_fd_ctl(ucp_worker_iface_t *wiface, + ucp_worker_event_fd_op_t op) { ucs_status_t status; + status = ucp_worker_wakeup_ctl_fd(wiface->worker, op, + ucp_worker_iface_get_event_fd(wiface)); + ucs_assert_always(status == UCS_OK); +} + +static void ucp_worker_iface_disarm(ucp_worker_iface_t *wiface) +{ if (wiface->flags & UCP_WORKER_IFACE_FLAG_ON_ARM_LIST) { - status = ucp_worker_wakeup_ctl_fd(wiface->worker, EPOLL_CTL_DEL, - wiface->event_fd); - ucs_assert_always(status == UCS_OK); + if (ucp_worker_iface_use_event_fd(wiface)) { + ucp_worker_iface_event_fd_ctl(wiface, UCP_WORKER_EPFD_OP_DEL); + } ucs_list_del(&wiface->arm_list); wiface->flags &= ~UCP_WORKER_IFACE_FLAG_ON_ARM_LIST; } @@ -375,6 +429,7 @@ static unsigned ucp_worker_iface_err_handle_progress(void *arg) ucp_lane_index_t failed_lane = err_handle_arg->failed_lane; ucp_lane_index_t lane; ucp_ep_config_key_t key; + ucp_request_t *close_req; UCS_ASYNC_BLOCK(&worker->async); @@ -403,6 +458,8 @@ static unsigned ucp_worker_iface_err_handle_progress(void *arg) } } + ucp_stream_ep_cleanup(ucp_ep); + /* Move failed lane to index 0 */ if ((failed_lane != 0) && (failed_lane != UCP_NULL_LANE)) { ucp_ep->uct_eps[0] = ucp_ep->uct_eps[failed_lane]; @@ -426,22 +483,33 @@ static unsigned ucp_worker_iface_err_handle_progress(void *arg) key.wireup_lane = 0; key.tag_lane = 0; key.rma_lanes[0] = 0; + key.rkey_ptr_lane = UCP_NULL_LANE; key.rma_bw_lanes[0] = 0; key.amo_lanes[0] = 0; key.lanes[0].rsc_index = UCP_NULL_RESOURCE; key.num_lanes = 1; key.status = status; - ucp_ep->cfg_index = ucp_worker_get_ep_config(worker, &key); - ucp_ep->am_lane = 0; + status = ucp_worker_get_ep_config(worker, &key, 0, &ucp_ep->cfg_index); + if (status != UCS_OK) { + ucs_fatal("ep %p: could not change configuration to 
error state: %s", + ucp_ep, ucs_status_string(status)); + } + + ucp_ep->am_lane = 0; - if (ucp_ep_ext_gen(ucp_ep)->err_cb != NULL) { - ucs_assert(ucp_ep->flags & UCP_EP_FLAG_USED); - ucs_debug("ep %p: calling user error callback %p with arg %p", ucp_ep, - ucp_ep_ext_gen(ucp_ep)->err_cb, ucp_ep_ext_gen(ucp_ep)->user_data); - ucp_ep_ext_gen(ucp_ep)->err_cb(ucp_ep_ext_gen(ucp_ep)->user_data, ucp_ep, - status); - } else if (!(ucp_ep->flags & UCP_EP_FLAG_USED)) { + if (ucp_ep->flags & UCP_EP_FLAG_USED) { + if (ucp_ep->flags & UCP_EP_FLAG_CLOSE_REQ_VALID) { + ucs_assert(ucp_ep->flags & UCP_EP_FLAG_CLOSED); + /* Promote close operation to CANCEL in case of transport error, + * since the disconnect event may never arrive. */ + close_req = ucp_ep_ext_gen(ucp_ep)->close_req.req; + close_req->send.flush.uct_flags |= UCT_FLUSH_FLAG_CANCEL; + ucp_ep_local_disconnect_progress(close_req); + } else { + ucp_ep_invoke_err_cb(ucp_ep, key.status); + } + } else { ucs_debug("ep %p: destroy internal endpoint due to peer failure", ucp_ep); ucp_ep_disconnected(ucp_ep, 1); } @@ -456,10 +524,19 @@ int ucp_worker_err_handle_remove_filter(const ucs_callbackq_elem_t *elem, { ucp_worker_err_handle_arg_t *err_handle_arg = elem->arg; - return (elem->cb == ucp_worker_iface_err_handle_progress) && - (err_handle_arg->ucp_ep == arg); + if ((elem->cb == ucp_worker_iface_err_handle_progress) && + (err_handle_arg->ucp_ep == arg)) { + /* release err handling argument to avoid memory leak */ + ucs_free(err_handle_arg); + return 1; + } + + return 0; } +/* + * Caller must acquire lock + */ ucs_status_t ucp_worker_set_ep_failed(ucp_worker_h worker, ucp_ep_h ucp_ep, uct_ep_h uct_ep, ucp_lane_index_t lane, ucs_status_t status) @@ -469,12 +546,18 @@ ucs_status_t ucp_worker_set_ep_failed(ucp_worker_h worker, ucp_ep_h ucp_ep, ucp_rsc_index_t rsc_index; uct_tl_resource_desc_t *tl_rsc; ucp_worker_err_handle_arg_t *err_handle_arg; + ucs_log_level_t log_level; + + /* In case if this is a local failure we need to 
notify remote side */ + if (ucp_ep_is_cm_local_connected(ucp_ep)) { + ucp_ep_cm_disconnect_cm_lane(ucp_ep); + } + /* set endpoint to failed to prevent wireup_ep switch */ if (ucp_ep->flags & UCP_EP_FLAG_FAILED) { goto out_ok; } - /* set endpoint to failed to prevent wireup_ep switch */ ucp_ep->flags |= UCP_EP_FLAG_FAILED; if (ucp_ep_config(ucp_ep)->key.err_mode == UCP_ERR_HANDLING_MODE_NONE) { @@ -505,16 +588,23 @@ ucs_status_t ucp_worker_set_ep_failed(ucp_worker_h worker, ucp_ep_h ucp_ep, if ((ucp_ep_ext_gen(ucp_ep)->err_cb == NULL) && (ucp_ep->flags & UCP_EP_FLAG_USED)) { + /* do not print error if connection reset by remote peer since it can + * be part of user level close protocol */ + log_level = (status == UCS_ERR_CONNECTION_RESET) ? UCS_LOG_LEVEL_DIAG : + UCS_LOG_LEVEL_ERROR; + if (lane != UCP_NULL_LANE) { rsc_index = ucp_ep_get_rsc_index(ucp_ep, lane); tl_rsc = &worker->context->tl_rscs[rsc_index].tl_rsc; - ucs_error("error '%s' will not be handled for ep %p - " - UCT_TL_RESOURCE_DESC_FMT, ucs_status_string(status), ucp_ep, - UCT_TL_RESOURCE_DESC_ARG(tl_rsc)); + ucs_log(log_level, "error '%s' will not be handled for ep %p - " + UCT_TL_RESOURCE_DESC_FMT " since no error callback is installed", + ucs_status_string(status), ucp_ep, + UCT_TL_RESOURCE_DESC_ARG(tl_rsc)); } else { ucs_assert(uct_ep == NULL); - ucs_error("error '%s' occurred on wireup will not be handled for ep %p", - ucs_status_string(status), ucp_ep); + ucs_log(log_level, "error '%s' occurred on wireup will not be " + "handled for ep %p since no error callback is installed", + ucs_status_string(status), ucp_ep); } ret_status = status; goto out; @@ -569,7 +659,6 @@ ucp_worker_iface_error_handler(void *arg, uct_ep_h uct_ep, ucs_status_t status) void ucp_worker_iface_activate(ucp_worker_iface_t *wiface, unsigned uct_flags) { ucp_worker_h worker = wiface->worker; - ucs_status_t status; ucs_trace("activate iface %p acount=%u aifaces=%u", wiface->iface, wiface->activate_count, 
worker->num_active_ifaces); @@ -584,10 +673,13 @@ void ucp_worker_iface_activate(ucp_worker_iface_t *wiface, unsigned uct_flags) /* Set default active message handlers */ ucp_worker_set_am_handlers(wiface, 0); - /* Add to user wakeup */ - if (wiface->attr.cap.flags & UCP_WORKER_UCT_ALL_EVENT_CAP_FLAGS) { - status = ucp_worker_wakeup_ctl_fd(worker, EPOLL_CTL_ADD, wiface->event_fd); - ucs_assert_always(status == UCS_OK); + if (ucp_worker_iface_has_event_notify(wiface)) { + if (ucp_worker_iface_use_event_fd(wiface)) { + /* Add to user wakeup */ + ucp_worker_iface_event_fd_ctl(wiface, UCP_WORKER_EPFD_OP_ADD); + } + + /* Add to the list of UCT ifaces that should be armed */ wiface->flags |= UCP_WORKER_IFACE_FLAG_ON_ARM_LIST; ucs_list_add_tail(&worker->arm_ifaces, &wiface->arm_list); } @@ -598,57 +690,6 @@ void ucp_worker_iface_activate(ucp_worker_iface_t *wiface, unsigned uct_flags) UCT_PROGRESS_SEND | UCT_PROGRESS_RECV | uct_flags); } -static void ucp_worker_iface_deactivate(ucp_worker_iface_t *wiface, int force) -{ - ucs_trace("deactivate iface %p force=%d acount=%u aifaces=%u", - wiface->iface, force, wiface->activate_count, - wiface->worker->num_active_ifaces); - - if (!force) { - ucs_assert(wiface->activate_count > 0); - if (--wiface->activate_count > 0) { - return; /* not completely deactivated yet */ - } - --wiface->worker->num_active_ifaces; - } - - /* Avoid progress on the interface to reduce overhead */ - uct_iface_progress_disable(wiface->iface, - UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); - - /* Remove from user wakeup */ - ucp_worker_iface_disarm(wiface); - - /* Set proxy active message handlers to count receives */ - ucp_worker_set_am_handlers(wiface, 1); - - /* Prepare for next receive event */ - ucp_worker_iface_check_events(wiface, force); -} - -void ucp_worker_iface_progress_ep(ucp_worker_iface_t *wiface) -{ - ucs_trace_func("iface=%p", wiface->iface); - - UCS_ASYNC_BLOCK(&wiface->worker->async); - - /* This function may be called from progress thread 
(such as when processing - * wireup messages), so ask UCT to be thread-safe. - */ - ucp_worker_iface_activate(wiface, UCT_PROGRESS_THREAD_SAFE); - - UCS_ASYNC_UNBLOCK(&wiface->worker->async); -} - -void ucp_worker_iface_unprogress_ep(ucp_worker_iface_t *wiface) -{ - ucs_trace_func("iface=%p", wiface->iface); - - UCS_ASYNC_BLOCK(&wiface->worker->async); - ucp_worker_iface_deactivate(wiface, 0); - UCS_ASYNC_UNBLOCK(&wiface->worker->async); -} - /* * If active messages were received by am proxy handler, activate the interface. * Otherwise, arm the interface event and make sure that when an active message @@ -660,7 +701,7 @@ static ucs_status_t ucp_worker_iface_check_events_do(ucp_worker_iface_t *wiface, unsigned prev_recv_count; ucs_status_t status; - ucs_trace_func("iface=%p", wiface->iface); + ucs_trace_func("wiface=%p iface=%p", wiface, wiface->iface); if (wiface->activate_count > 0) { ucs_trace("iface %p already activated", wiface->iface); @@ -677,17 +718,21 @@ static ucs_status_t ucp_worker_iface_check_events_do(ucp_worker_iface_t *wiface, return UCS_OK; } else if (*progress_count == 0) { /* Arm the interface to wait for next event */ - ucs_assert(wiface->attr.cap.flags & UCP_WORKER_UCT_RECV_EVENT_CAP_FLAGS); + ucs_assert(wiface->attr.cap.event_flags & UCP_WORKER_UCT_RECV_EVENT_CAP_FLAGS); status = uct_iface_event_arm(wiface->iface, UCP_WORKER_UCT_RECV_EVENT_ARM_FLAGS); if (status == UCS_OK) { ucs_trace("armed iface %p", wiface->iface); - /* re-enable events, which were disabled by ucp_suspended_iface_event() */ - status = ucs_async_modify_handler(wiface->event_fd, POLLIN); - if (status != UCS_OK) { - ucs_fatal("failed to modify %d event handler to POLLIN: %s", - wiface->event_fd, ucs_status_string(status)); + if (ucp_worker_iface_use_event_fd(wiface)) { + /* re-enable events, which were disabled by + * ucp_worker_iface_async_fd_event() */ + status = ucs_async_modify_handler(wiface->event_fd, + UCS_EVENT_SET_EVREAD); + if (status != UCS_OK) { + ucs_fatal("failed 
to modify %d event handler to UCS_EVENT_SET_EVREAD: %s", + wiface->event_fd, ucs_status_string(status)); + } } return UCS_OK; @@ -699,8 +744,8 @@ static ucs_status_t ucp_worker_iface_check_events_do(ucp_worker_iface_t *wiface, return UCS_ERR_BUSY; } } else { - ucs_trace("iface %p progress returned %u, but no active messages were received", - wiface->iface, *progress_count); + ucs_trace("wiface %p progress returned %u, but no active messages were received", + wiface, *progress_count); return UCS_ERR_BUSY; } } @@ -727,7 +772,7 @@ static unsigned ucp_worker_iface_check_events_progress(void *arg) return progress_count; } -void ucp_worker_iface_check_events(ucp_worker_iface_t *wiface, int force) +static void ucp_worker_iface_check_events(ucp_worker_iface_t *wiface, int force) { unsigned progress_count; ucs_status_t status; @@ -736,6 +781,11 @@ void ucp_worker_iface_check_events(ucp_worker_iface_t *wiface, int force) if (force) { do { + /* coverity wrongly resolves rc's progress to ucp_listener_conn_request_progress + * which in turn releases wiface->iface. 
this leads coverity to assume + * that ucp_worker_iface_check_events_do() dereferences a freed pointer + * in the subsequent call in the following loop */ + /* coverity[freed_arg] */ status = ucp_worker_iface_check_events_do(wiface, &progress_count); ucs_assert(progress_count == 0); } while (status == UCS_ERR_BUSY); @@ -750,25 +800,103 @@ void ucp_worker_iface_check_events(ucp_worker_iface_t *wiface, int force) } } -void ucp_worker_iface_event(int fd, void *arg) +static void ucp_worker_iface_deactivate(ucp_worker_iface_t *wiface, int force) +{ + ucs_trace("deactivate iface %p force=%d acount=%u aifaces=%u", + wiface->iface, force, wiface->activate_count, + wiface->worker->num_active_ifaces); + + if (!force) { + ucs_assert(wiface->activate_count > 0); + if (--wiface->activate_count > 0) { + return; /* not completely deactivated yet */ + } + --wiface->worker->num_active_ifaces; + } + + /* Avoid progress on the interface to reduce overhead */ + uct_iface_progress_disable(wiface->iface, + UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); + + /* Remove from user wakeup */ + ucp_worker_iface_disarm(wiface); + + /* Set proxy active message handlers to count receives */ + ucp_worker_set_am_handlers(wiface, 1); + + /* Prepare for next receive event */ + ucp_worker_iface_check_events(wiface, force); +} + +void ucp_worker_iface_progress_ep(ucp_worker_iface_t *wiface) +{ + ucs_trace_func("iface=%p", wiface->iface); + + UCS_ASYNC_BLOCK(&wiface->worker->async); + + /* This function may be called from progress thread (such as when processing + * wireup messages), so ask UCT to be thread-safe. 
+ */ + ucp_worker_iface_activate(wiface, UCT_PROGRESS_THREAD_SAFE); + + UCS_ASYNC_UNBLOCK(&wiface->worker->async); +} + +void ucp_worker_iface_unprogress_ep(ucp_worker_iface_t *wiface) +{ + ucs_trace_func("iface=%p", wiface->iface); + + UCS_ASYNC_BLOCK(&wiface->worker->async); + ucp_worker_iface_deactivate(wiface, 0); + UCS_ASYNC_UNBLOCK(&wiface->worker->async); +} + +static UCS_F_ALWAYS_INLINE void +ucp_worker_iface_event_common(ucp_worker_iface_t *wiface) { - ucp_worker_iface_t *wiface = arg; ucp_worker_h worker = wiface->worker; + + /* Do more work on the main thread */ + ucp_worker_iface_check_events(wiface, 0); + + /* Signal user wakeup to report the first message on the interface */ + ucp_worker_signal_internal(worker); +} + +static void ucp_worker_iface_async_cb_event(void *arg, unsigned flags) +{ + ucp_worker_iface_t *wiface = arg; + + ucs_assert(wiface->attr.cap.event_flags & UCT_IFACE_FLAG_EVENT_ASYNC_CB); + ucs_trace_func("async_cb for iface=%p", wiface->iface); + + ucp_worker_iface_event_common(wiface); +} + +static void ucp_worker_iface_async_fd_event(int fd, int events, void *arg) +{ + ucp_worker_iface_t *wiface = arg; + int event_fd = ucp_worker_iface_get_event_fd(wiface);; ucs_status_t status; - ucs_trace_func("fd=%d iface=%p", fd, wiface->iface); + ucs_assertv(fd == event_fd, "fd=%d vs wiface::event_fd=%d", fd, event_fd); + ucs_trace_func("fd=%d iface=%p", event_fd, wiface->iface); - status = ucs_async_modify_handler(wiface->event_fd, 0); + status = ucs_async_modify_handler(event_fd, 0); if (status != UCS_OK) { ucs_fatal("failed to modify %d event handler to : %s", - wiface->event_fd, ucs_status_string(status)); + event_fd, ucs_status_string(status)); } - /* Do more work on the main thread */ - ucp_worker_iface_check_events(wiface, 0); + ucp_worker_iface_event_common(wiface); +} - /* Signal user wakeup, to report the first message on the interface */ - ucp_worker_signal_internal(worker); +static void ucp_worker_uct_iface_close(ucp_worker_iface_t 
*wiface) +{ + if (wiface->iface != NULL) { + uct_iface_close(wiface->iface); + wiface->iface = NULL; + } } static int ucp_worker_iface_find_better(ucp_worker_h worker, @@ -779,14 +907,18 @@ static int ucp_worker_iface_find_better(ucp_worker_h worker, ucp_rsc_index_t rsc_index; ucp_worker_iface_t *if_iter; uint64_t test_flags; - double latency_iter, latency_cur; - float epsilon; + double latency_iter, latency_cur, bw_cur; + + ucs_assert(wiface != NULL); + + latency_cur = ucp_tl_iface_latency(ctx, &wiface->attr.latency); + bw_cur = ucp_tl_iface_bandwidth(ctx, &wiface->attr.bandwidth); test_flags = wiface->attr.cap.flags & ~(UCT_IFACE_FLAG_CONNECT_TO_IFACE | UCT_IFACE_FLAG_CONNECT_TO_EP); for (rsc_index = 0; rsc_index < ctx->num_tls; ++rsc_index) { - if_iter = &worker->ifaces[rsc_index]; + if_iter = worker->ifaces[rsc_index]; /* Need to check resources which belong to the same device only */ if ((ctx->tl_rscs[rsc_index].dev_index != ctx->tl_rscs[wiface->rsc_index].dev_index) || @@ -795,30 +927,34 @@ static int ucp_worker_iface_find_better(ucp_worker_h worker, continue; } - /* Check that another iface: - * 1. Supports all capabilities of the target iface (at least), except - * ...CONNECT_TO... caps. - * 2. Has the same or better performance charasteristics */ - if (ucs_test_all_flags(if_iter->attr.cap.flags, test_flags) && - (if_iter->attr.overhead <= wiface->attr.overhead) && - (if_iter->attr.bandwidth >= wiface->attr.bandwidth) && - (if_iter->attr.priority >= wiface->attr.priority)) { - - latency_iter = ucp_worker_iface_latency(worker, if_iter); - latency_cur = ucp_worker_iface_latency(worker, wiface); - epsilon = (latency_iter + latency_cur) * 1e-6; - if (latency_iter < latency_cur + epsilon) { - /* Do not check this iface anymore, because better one exists. - * It helps to avoid the case when two interfaces with the same caps - * and performance exclude each other. 
*/ - wiface->flags |= UCP_WORKER_IFACE_FLAG_UNUSED; - *better_index = rsc_index; - return 1; - } + latency_iter = ucp_tl_iface_latency(ctx, &if_iter->attr.latency); + + /* Check that another iface: */ + if (/* 1. Supports all capabilities of the target iface (at least), + * except ...CONNECT_TO... caps. */ + ucs_test_all_flags(if_iter->attr.cap.flags, test_flags) && + /* 2. Has the same or better performance characteristics */ + (if_iter->attr.overhead <= wiface->attr.overhead) && + (ucp_tl_iface_bandwidth(ctx, &if_iter->attr.bandwidth) >= bw_cur) && + /* swap latencies in args list since less is better */ + (ucp_score_prio_cmp(latency_cur, if_iter->attr.priority, + latency_iter, wiface->attr.priority) >= 0) && + /* 3. The found transport is scalable enough or both + * transport are unscalable */ + (ucp_is_scalable_transport(ctx, if_iter->attr.max_num_eps) || + !ucp_is_scalable_transport(ctx, wiface->attr.max_num_eps))) + { + *better_index = rsc_index; + /* Do not check this iface anymore, because better one exists. + * It helps to avoid the case when two interfaces with the same + * caps and performance exclude each other. */ + wiface->flags |= UCP_WORKER_IFACE_FLAG_UNUSED; + return 1; } } - *better_index = 0; + /* Better resource wasn't found */ + *better_index = 0; return 0; } @@ -830,11 +966,11 @@ static int ucp_worker_iface_find_better(ucp_worker_h worker, * * @return Error code as defined by @ref ucs_status_t */ -static ucs_status_t ucp_worker_select_best_ifaces(ucp_worker_h worker, - uint64_t *tl_bitmap_p) +static void ucp_worker_select_best_ifaces(ucp_worker_h worker, + uint64_t *tl_bitmap_p) { - ucp_context_h context = worker->context; - uint64_t tl_bitmap = 0; + ucp_context_h context = worker->context; + uint64_t tl_bitmap = 0; ucp_rsc_index_t repl_ifaces[UCP_MAX_RESOURCES]; ucp_worker_iface_t *wiface; ucp_rsc_index_t tl_id, iface_id; @@ -844,7 +980,7 @@ static ucs_status_t ucp_worker_select_best_ifaces(ucp_worker_h worker, * 2. 
Provides equivalent or better performance */ for (tl_id = 0; tl_id < context->num_tls; ++tl_id) { - wiface = &worker->ifaces[tl_id]; + wiface = worker->ifaces[tl_id]; if (!ucp_worker_iface_find_better(worker, wiface, &repl_ifaces[tl_id])) { tl_bitmap |= UCS_BIT(tl_id); } @@ -854,31 +990,34 @@ static ucs_status_t ucp_worker_select_best_ifaces(ucp_worker_h worker, worker->num_ifaces = ucs_popcount(tl_bitmap); ucs_assert(worker->num_ifaces <= context->num_tls); - if (worker->num_ifaces < context->num_tls) { - /* Some ifaces need to be closed */ - for (tl_id = 0, iface_id = 0; tl_id < context->num_tls; ++tl_id) { - wiface = &worker->ifaces[tl_id]; - if (tl_bitmap & UCS_BIT(tl_id)) { - if (iface_id != tl_id) { - memcpy(worker->ifaces + iface_id, wiface, sizeof(*wiface)); - } - ++iface_id; - } else { - ucs_debug("closing resource[%d] "UCT_TL_RESOURCE_DESC_FMT - ", since resource[%d] "UCT_TL_RESOURCE_DESC_FMT - " is better, worker %p", - tl_id, UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[tl_id].tl_rsc), - repl_ifaces[tl_id], - UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[repl_ifaces[tl_id]].tl_rsc), - worker); - /* Ifaces should not be initialized yet, just close it - * (no need for cleanup) */ - uct_iface_close(wiface->iface); + if (worker->num_ifaces == context->num_tls) { + return; + } + + ucs_assert(worker->num_ifaces < context->num_tls); + + /* Some ifaces need to be closed */ + for (tl_id = 0, iface_id = 0; tl_id < context->num_tls; ++tl_id) { + wiface = worker->ifaces[tl_id]; + if (tl_bitmap & UCS_BIT(tl_id)) { + if (iface_id != tl_id) { + worker->ifaces[iface_id] = wiface; } + ++iface_id; + } else { + ucs_debug("closing resource[%d] "UCT_TL_RESOURCE_DESC_FMT + ", since resource[%d] "UCT_TL_RESOURCE_DESC_FMT + " is better, worker %p", + tl_id, UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[tl_id].tl_rsc), + repl_ifaces[tl_id], + UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[repl_ifaces[tl_id]].tl_rsc), + worker); + /* Ifaces should not be initialized yet, just close it + * 
(no need for cleanup) */ + ucp_worker_uct_iface_close(wiface); + ucs_free(wiface); } } - - return UCS_OK; } /** @@ -900,27 +1039,32 @@ static ucs_status_t ucp_worker_add_resource_ifaces(ucp_worker_h worker) ucp_tl_resource_desc_t *resource; uct_iface_params_t iface_params; ucp_rsc_index_t tl_id, iface_id; + ucp_worker_iface_t *wiface; uint64_t ctx_tl_bitmap, tl_bitmap; + unsigned num_ifaces; ucs_status_t status; /* If tl_bitmap is already set, just use it. Otherwise open ifaces on all * available resources and then select the best ones. */ - ctx_tl_bitmap = context->tl_bitmap; + ctx_tl_bitmap = context->tl_bitmap; if (ctx_tl_bitmap) { - worker->num_ifaces = ucs_popcount(ctx_tl_bitmap); - tl_bitmap = ctx_tl_bitmap; + num_ifaces = ucs_popcount(ctx_tl_bitmap); + tl_bitmap = ctx_tl_bitmap; } else { - worker->num_ifaces = context->num_tls; - tl_bitmap = UCS_MASK(context->num_tls); + num_ifaces = context->num_tls; + tl_bitmap = UCS_MASK(context->num_tls); } - worker->ifaces = ucs_calloc(worker->num_ifaces, sizeof(ucp_worker_iface_t), - "ucp iface"); + worker->ifaces = ucs_calloc(num_ifaces, sizeof(*worker->ifaces), + "ucp ifaces array"); if (worker->ifaces == NULL) { + ucs_error("failed to allocate worker ifaces"); return UCS_ERR_NO_MEMORY; } - iface_id = 0; + worker->num_ifaces = num_ifaces; + iface_id = 0; + ucs_for_each_bit(tl_id, tl_bitmap) { iface_params.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE; resource = &context->tl_rscs[tl_id]; @@ -944,23 +1088,35 @@ static ucs_status_t ucp_worker_add_resource_ifaces(ucp_worker_h worker) if (!ctx_tl_bitmap) { /* Context bitmap is not set, need to select the best tl resources */ tl_bitmap = 0; - status = ucp_worker_select_best_ifaces(worker, &tl_bitmap); - if (status != UCS_OK) { - return status; - } + ucp_worker_select_best_ifaces(worker, &tl_bitmap); ucs_assert(tl_bitmap); /* Cache tl_bitmap on the context, so the next workers would not need * to select best ifaces. 
*/ context->tl_bitmap = tl_bitmap; - ucs_debug("Selected tl bitmap: 0x%lx (%d tls)", + ucs_debug("selected tl bitmap: 0x%lx (%d tls)", tl_bitmap, ucs_popcount(tl_bitmap)); } + worker->scalable_tl_bitmap = 0; + ucs_for_each_bit(tl_id, context->tl_bitmap) { + ucs_assert(ucp_worker_is_tl_p2p(worker, tl_id) || + ucp_worker_is_tl_2iface(worker, tl_id) || + ucp_worker_is_tl_2sockaddr(worker, tl_id)); + wiface = ucp_worker_iface(worker, tl_id); + if (ucp_is_scalable_transport(context, wiface->attr.max_num_eps)) { + worker->scalable_tl_bitmap |= UCS_BIT(tl_id); + } + } + + ucs_debug("selected scalable tl bitmap: 0x%lx (%d tls)", + worker->scalable_tl_bitmap, + ucs_popcount(worker->scalable_tl_bitmap)); + iface_id = 0; ucs_for_each_bit(tl_id, tl_bitmap) { status = ucp_worker_iface_init(worker, tl_id, - &worker->ifaces[iface_id++]); + worker->ifaces[iface_id++]); if (status != UCS_OK) { return status; } @@ -976,8 +1132,8 @@ static void ucp_worker_close_ifaces(ucp_worker_h worker) UCS_ASYNC_BLOCK(&worker->async); for (iface_id = 0; iface_id < worker->num_ifaces; ++iface_id) { - wiface = &worker->ifaces[iface_id]; - if (wiface->iface != NULL) { + wiface = worker->ifaces[iface_id]; + if (wiface != NULL) { ucp_worker_iface_cleanup(wiface); } } @@ -987,15 +1143,21 @@ static void ucp_worker_close_ifaces(ucp_worker_h worker) ucs_status_t ucp_worker_iface_open(ucp_worker_h worker, ucp_rsc_index_t tl_id, uct_iface_params_t *iface_params, - ucp_worker_iface_t *wiface) + ucp_worker_iface_t **wiface_p) { ucp_context_h context = worker->context; ucp_tl_resource_desc_t *resource = &context->tl_rscs[tl_id]; uct_md_h md = context->tl_mds[resource->md_index].md; uct_iface_config_t *iface_config; const char *cfg_tl_name; + ucp_worker_iface_t *wiface; ucs_status_t status; + wiface = ucs_calloc(1, sizeof(*wiface), "ucp_iface"); + if (wiface == NULL) { + return UCS_ERR_NO_MEMORY; + } + wiface->rsc_index = tl_id; wiface->worker = worker; wiface->event_fd = -1; @@ -1013,7 +1175,7 @@ ucs_status_t 
ucp_worker_iface_open(ucp_worker_h worker, ucp_rsc_index_t tl_id, } status = uct_md_iface_config_read(md, cfg_tl_name, NULL, NULL, &iface_config); if (status != UCS_OK) { - return status; + goto err_free_iface; } UCS_STATIC_ASSERT(UCP_WORKER_HEADROOM_PRIV_SIZE >= sizeof(ucp_eager_sync_hdr_t)); @@ -1024,37 +1186,58 @@ ucs_status_t ucp_worker_iface_open(ucp_worker_h worker, ucp_rsc_index_t tl_id, UCT_IFACE_PARAM_FIELD_ERR_HANDLER_ARG | UCT_IFACE_PARAM_FIELD_ERR_HANDLER | UCT_IFACE_PARAM_FIELD_ERR_HANDLER_FLAGS | - UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_ARG | - UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_ARG | - UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_CB | - UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_CB | UCT_IFACE_PARAM_FIELD_CPU_MASK; iface_params->stats_root = UCS_STATS_RVAL(worker->stats); iface_params->rx_headroom = UCP_WORKER_HEADROOM_SIZE; iface_params->err_handler_arg = worker; iface_params->err_handler = ucp_worker_iface_error_handler; iface_params->err_handler_flags = UCT_CB_FLAG_ASYNC; - iface_params->eager_arg = iface_params->rndv_arg = wiface; - iface_params->eager_cb = ucp_tag_offload_unexp_eager; - iface_params->rndv_cb = ucp_tag_offload_unexp_rndv; iface_params->cpu_mask = worker->cpu_mask; + if (context->config.features & UCP_FEATURE_TAG) { + iface_params->eager_arg = iface_params->rndv_arg = wiface; + iface_params->eager_cb = ucp_tag_offload_unexp_eager; + iface_params->rndv_cb = ucp_tag_offload_unexp_rndv; + iface_params->field_mask |= UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_ARG | + UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_ARG | + UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_CB | + UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_CB; + } + + iface_params->async_event_arg = wiface; + iface_params->async_event_cb = ucp_worker_iface_async_cb_event; + iface_params->field_mask |= UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_ARG | + UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_CB; + /* Open UCT interface */ status = uct_iface_open(md, worker->uct, iface_params, iface_config, &wiface->iface); uct_config_release(iface_config); if (status != UCS_OK) { 
- return status; + goto err_free_iface; + } + + VALGRIND_MAKE_MEM_UNDEFINED(&wiface->attr, sizeof(wiface->attr)); + + status = uct_iface_query(wiface->iface, &wiface->attr); + if (status != UCS_OK) { + goto err_close_iface; } ucs_debug("created interface[%d]=%p using "UCT_TL_RESOURCE_DESC_FMT" on worker %p", tl_id, wiface->iface, UCT_TL_RESOURCE_DESC_ARG(&resource->tl_rsc), worker); - VALGRIND_MAKE_MEM_UNDEFINED(&wiface->attr, sizeof(wiface->attr)); + *wiface_p = wiface; - return uct_iface_query(wiface->iface, &wiface->attr); + return UCS_OK; + +err_close_iface: + uct_iface_close(wiface->iface); +err_free_iface: + ucs_free(wiface); + return status; } ucs_status_t ucp_worker_iface_init(ucp_worker_h worker, ucp_rsc_index_t tl_id, @@ -1064,8 +1247,10 @@ ucs_status_t ucp_worker_iface_init(ucp_worker_h worker, ucp_rsc_index_t tl_id, ucp_tl_resource_desc_t *resource = &context->tl_rscs[tl_id]; ucs_status_t status; + ucs_assert(wiface != NULL); + /* Set wake-up handlers */ - if (wiface->attr.cap.flags & UCP_WORKER_UCT_ALL_EVENT_CAP_FLAGS) { + if (ucp_worker_iface_use_event_fd(wiface)) { status = uct_iface_event_fd_get(wiface->iface, &wiface->event_fd); if (status != UCS_OK) { goto out_close_iface; @@ -1073,7 +1258,7 @@ ucs_status_t ucp_worker_iface_init(ucp_worker_h worker, ucp_rsc_index_t tl_id, /* Register event handler without actual events so we could modify it later. 
*/ status = ucs_async_set_event_handler(worker->async.mode, wiface->event_fd, - 0, ucp_worker_iface_event, wiface, + 0, ucp_worker_iface_async_fd_event, wiface, &worker->async); if (status != UCS_OK) { ucs_fatal("failed to register event handler: %s", @@ -1093,7 +1278,7 @@ ucs_status_t ucp_worker_iface_init(ucp_worker_h worker, ucp_rsc_index_t tl_id, } if (context->config.ext.adaptive_progress && - (wiface->attr.cap.flags & UCP_WORKER_UCT_RECV_EVENT_CAP_FLAGS)) + (wiface->attr.cap.event_flags & UCP_WORKER_UCT_RECV_EVENT_CAP_FLAGS)) { ucp_worker_iface_deactivate(wiface, 1); } else { @@ -1101,13 +1286,13 @@ ucs_status_t ucp_worker_iface_init(ucp_worker_h worker, ucp_rsc_index_t tl_id, } } - context->mem_type_tls[context->tl_mds[resource->md_index]. - attr.cap.mem_type] |= UCS_BIT(tl_id); + context->mem_type_access_tls[context->tl_mds[resource->md_index]. + attr.cap.access_mem_type] |= UCS_BIT(tl_id); return UCS_OK; out_close_iface: - uct_iface_close(wiface->iface); + ucp_worker_uct_iface_close(wiface); return status; } @@ -1120,7 +1305,10 @@ void ucp_worker_iface_cleanup(ucp_worker_iface_t *wiface) ucp_worker_iface_disarm(wiface); - if (wiface->attr.cap.flags & UCP_WORKER_UCT_ALL_EVENT_CAP_FLAGS) { + if (wiface->event_fd != -1) { + ucs_assertv(ucp_worker_iface_use_event_fd(wiface), + "%p: has event fd %d, but it has to not use this mechanism", + wiface, wiface->event_fd); status = ucs_async_remove_handler(wiface->event_fd, 1); if (status != UCS_OK) { ucs_warn("failed to remove event handler for fd %d: %s", @@ -1128,7 +1316,78 @@ void ucp_worker_iface_cleanup(ucp_worker_iface_t *wiface) } } - uct_iface_close(wiface->iface); + ucp_worker_uct_iface_close(wiface); + ucs_free(wiface); +} + +static void ucp_worker_close_cms(ucp_worker_h worker) +{ + const ucp_rsc_index_t num_cms = ucp_worker_num_cm_cmpts(worker); + ucp_rsc_index_t i; + + for (i = 0; (i < num_cms) && (worker->cms[i].cm != NULL); ++i) { + uct_cm_close(worker->cms[i].cm); + } + + ucs_free(worker->cms); + 
worker->cms = NULL; +} + +static ucs_status_t ucp_worker_add_resource_cms(ucp_worker_h worker) +{ + ucp_context_h context = worker->context; + uct_cm_config_t *cm_config; + uct_component_h cmpt; + ucp_rsc_index_t cmpt_index, cm_cmpt_index, i; + ucs_status_t status; + + if (!ucp_worker_sockaddr_is_cm_proto(worker)) { + worker->cms = NULL; + return UCS_OK; + } + + UCS_ASYNC_BLOCK(&worker->async); + + worker->cms = ucs_calloc(ucp_worker_num_cm_cmpts(worker), + sizeof(*worker->cms), "ucp cms"); + if (worker->cms == NULL) { + ucs_error("can't allocate CMs array"); + status = UCS_ERR_NO_MEMORY; + goto out; + } + + for (i = 0, cm_cmpt_index = 0; cm_cmpt_index < context->config.num_cm_cmpts; + ++cm_cmpt_index) { + cmpt_index = context->config.cm_cmpt_idxs[cm_cmpt_index]; + cmpt = context->tl_cmpts[cmpt_index].cmpt; + + status = uct_cm_config_read(cmpt, NULL, NULL, &cm_config); + if (status != UCS_OK) { + ucs_error("failed to read cm configuration on component %s", + context->tl_cmpts[cmpt_index].attr.name); + goto err_free_cms; + } + + status = uct_cm_open(cmpt, worker->uct, cm_config, &worker->cms[i].cm); + if (status != UCS_OK) { + ucs_error("failed to open CM on component %s with status %s", + context->tl_cmpts[cmpt_index].attr.name, + ucs_status_string(status)); + goto err_free_cms; + } + + uct_config_release(cm_config); + worker->cms[i++].cmpt_idx = cmpt_index; + } + + status = UCS_OK; + goto out; + +err_free_cms: + ucp_worker_close_cms(worker); +out: + UCS_ASYNC_UNBLOCK(&worker->async); + return status; } static void ucp_worker_enable_atomic_tl(ucp_worker_h worker, const char *mode, @@ -1150,7 +1409,7 @@ static void ucp_worker_init_cpu_atomics(ucp_worker_h worker) /* Enable all interfaces which have host-based atomics */ for (iface_id = 0; iface_id < worker->num_ifaces; ++iface_id) { - wiface = &worker->ifaces[iface_id]; + wiface = worker->ifaces[iface_id]; if (wiface->attr.cap.flags & UCT_IFACE_FLAG_ATOMIC_CPU) { ucp_worker_enable_atomic_tl(worker, "cpu", 
wiface->rsc_index); } @@ -1167,7 +1426,7 @@ static void ucp_worker_init_device_atomics(ucp_worker_h worker) ucp_rsc_index_t iface_id; uint64_t iface_cap_flags; double score, best_score; - ucp_rsc_index_t md_index; + ucp_md_index_t md_index; ucp_worker_iface_t *wiface; uct_md_attr_t *md_attr; uint64_t supp_tls; @@ -1176,22 +1435,23 @@ static void ucp_worker_init_device_atomics(ucp_worker_h worker) ucp_context_uct_atomic_iface_flags(context, &atomic); - iface_cap_flags = UCT_IFACE_FLAG_ATOMIC_DEVICE; + iface_cap_flags = UCT_IFACE_FLAG_ATOMIC_DEVICE; - dummy_iface_attr.bandwidth = 1e12; - dummy_iface_attr.cap_flags = -1; - dummy_iface_attr.overhead = 0; - dummy_iface_attr.priority = 0; - dummy_iface_attr.lat_ovh = 0; + dummy_iface_attr.bandwidth.dedicated = 1e12; + dummy_iface_attr.bandwidth.shared = 0; + dummy_iface_attr.cap_flags = UINT64_MAX; + dummy_iface_attr.overhead = 0; + dummy_iface_attr.priority = 0; + dummy_iface_attr.lat_ovh = 0; - supp_tls = 0; - best_score = -1; - best_rsc = NULL; - best_priority = 0; + supp_tls = 0; + best_score = -1; + best_rsc = NULL; + best_priority = 0; /* Select best interface for atomics device */ for (iface_id = 0; iface_id < worker->num_ifaces; ++iface_id) { - wiface = &worker->ifaces[iface_id]; + wiface = worker->ifaces[iface_id]; rsc_index = wiface->rsc_index; rsc = &context->tl_rscs[rsc_index]; md_index = rsc->md_index; @@ -1213,8 +1473,10 @@ static void ucp_worker_init_device_atomics(ucp_worker_h worker) score = ucp_wireup_amo_score_func(context, md_attr, iface_attr, &dummy_iface_attr); - if ((score > best_score) || - ((score == best_score) && (priority > best_priority))) + if (ucp_is_scalable_transport(worker->context, + iface_attr->max_num_eps) && + ((score > best_score) || + ((score == best_score) && (priority > best_priority)))) { best_rsc = rsc; best_score = score; @@ -1248,7 +1510,10 @@ static void ucp_worker_init_guess_atomics(ucp_worker_h worker) ucp_rsc_index_t iface_id; for (iface_id = 0; iface_id < 
worker->num_ifaces; ++iface_id) { - accumulated_flags |= worker->ifaces[iface_id].attr.cap.flags; + if (ucp_is_scalable_transport(worker->context, + worker->ifaces[iface_id]->attr.max_num_eps)) { + accumulated_flags |= worker->ifaces[iface_id]->attr.cap.flags; + } } if (accumulated_flags & UCT_IFACE_FLAG_ATOMIC_DEVICE) { @@ -1282,6 +1547,95 @@ static void ucp_worker_init_atomic_tls(ucp_worker_h worker) } } +static char* ucp_worker_add_feature_rsc(ucp_context_h context, + const ucp_ep_config_key_t *key, + ucp_lane_map_t lanes_bitmap, + const char *feature_str, + char *buf, size_t max) +{ + char *p = buf; + char *endp = buf + max; + int sep = 0; + ucp_rsc_index_t rsc_idx; + ucp_lane_index_t lane; + + if (!lanes_bitmap) { + return p; + } + + snprintf(p, endp - p, "%s(", feature_str); + p += strlen(p); + + ucs_for_each_bit(lane, lanes_bitmap) { + ucs_assert(lane < UCP_MAX_LANES); /* make coverity happy */ + rsc_idx = key->lanes[lane].rsc_index; + snprintf(p, endp - p, "%*s"UCT_TL_RESOURCE_DESC_FMT, sep, "", + UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[rsc_idx].tl_rsc)); + p += strlen(p); + sep = 1; /* add space between tl names */ + } + + snprintf(p, endp - p, "); "); + p += strlen(p); + + return p; +} + +static void ucp_worker_print_used_tls(const ucp_ep_config_key_t *key, + ucp_context_h context, + ucp_ep_cfg_index_t config_idx) +{ + char info[256] = {0}; + ucp_lane_map_t tag_lanes_map = 0; + ucp_lane_map_t rma_lanes_map = 0; + ucp_lane_map_t amo_lanes_map = 0; + ucp_lane_map_t stream_lanes_map = 0; + ucp_lane_index_t lane; + char *p, *endp; + + if (!ucs_log_is_enabled(UCS_LOG_LEVEL_INFO)) { + return; + } + + p = info; + endp = p + sizeof(info); + + snprintf(p, endp - p, "ep_cfg[%d]: ", config_idx); + p += strlen(p); + + for (lane = 0; lane < key->num_lanes; ++lane) { + if (((key->am_lane == lane) || (lane == key->tag_lane) || + (ucp_ep_config_get_multi_lane_prio(key->am_bw_lanes, lane) >= 0) || + (ucp_ep_config_get_multi_lane_prio(key->rma_bw_lanes, lane) >= 0)) && 
+ (context->config.features & UCP_FEATURE_TAG)) { + tag_lanes_map |= UCS_BIT(lane); + } + + if ((key->am_lane == lane) && + (context->config.features & UCP_FEATURE_STREAM)) { + stream_lanes_map |= UCS_BIT(lane); + } + + if ((ucp_ep_config_get_multi_lane_prio(key->rma_lanes, lane) >= 0)) { + rma_lanes_map |= UCS_BIT(lane); + } + + if ((ucp_ep_config_get_multi_lane_prio(key->amo_lanes, lane) >= 0)) { + amo_lanes_map |= UCS_BIT(lane); + } + } + + p = ucp_worker_add_feature_rsc(context, key, tag_lanes_map, "tag", + p, endp - p); + p = ucp_worker_add_feature_rsc(context, key, rma_lanes_map, "rma", + p, endp - p); + p = ucp_worker_add_feature_rsc(context, key, amo_lanes_map, "amo", + p, endp - p); + ucp_worker_add_feature_rsc(context, key, stream_lanes_map, "stream", + p, endp - p); + ucs_info("%s", info); +} + static ucs_status_t ucp_worker_init_mpools(ucp_worker_h worker) { size_t max_mp_entry_size = 0; @@ -1291,7 +1645,7 @@ static ucs_status_t ucp_worker_init_mpools(ucp_worker_h worker) ucs_status_t status; for (iface_id = 0; iface_id < worker->num_ifaces; ++iface_id) { - if_attr = &worker->ifaces[iface_id].attr; + if_attr = &worker->ifaces[iface_id]->attr; max_mp_entry_size = ucs_max(max_mp_entry_size, if_attr->cap.am.max_short); max_mp_entry_size = ucs_max(max_mp_entry_size, @@ -1318,7 +1672,7 @@ static ucs_status_t ucp_worker_init_mpools(ucp_worker_h worker) status = ucs_mpool_init(&worker->rndv_frag_mp, 0, context->config.ext.rndv_frag_size + sizeof(ucp_mem_desc_t), - sizeof(ucp_mem_desc_t), UCS_SYS_CACHE_LINE_SIZE, 128, + sizeof(ucp_mem_desc_t), UCS_SYS_PCI_MAX_PAYLOAD, 128, UINT_MAX, &ucp_frag_mpool_ops, "ucp_rndv_frags"); if (status != UCS_OK) { goto err_release_reg_mpool; @@ -1371,11 +1725,13 @@ static void ucp_worker_clean_extensions(ucp_worker_h worker) * A 'key' identifies an entry in the ep_config array. An entry holds the key and * additional configuration parameters and thresholds. 
*/ -unsigned ucp_worker_get_ep_config(ucp_worker_h worker, - const ucp_ep_config_key_t *key) +ucs_status_t ucp_worker_get_ep_config(ucp_worker_h worker, + const ucp_ep_config_key_t *key, + int print_cfg, + ucp_ep_cfg_index_t *config_idx_p) { - ucp_ep_config_t *config; - unsigned config_idx; + ucp_ep_cfg_index_t config_idx; + ucs_status_t status; /* Search for the given key in the ep_config array */ for (config_idx = 0; config_idx < worker->ep_config_count; ++config_idx) { @@ -1391,16 +1747,27 @@ unsigned ucp_worker_get_ep_config(ucp_worker_h worker, /* Create new configuration */ config_idx = worker->ep_config_count++; - config = &worker->ep_config[config_idx]; + status = ucp_ep_config_init(worker, &worker->ep_config[config_idx], key); + if (status != UCS_OK) { + return status; + } - memset(config, 0, sizeof(*config)); - config->key = *key; - ucp_ep_config_init(worker, config); + if (print_cfg) { + ucp_worker_print_used_tls(key, worker->context, config_idx); + } out: - return config_idx; + *config_idx_p = config_idx; + return UCS_OK; } +static ucs_mpool_ops_t ucp_rkey_mpool_ops = { + .chunk_alloc = ucs_mpool_chunk_malloc, + .chunk_release = ucs_mpool_chunk_free, + .obj_init = NULL, + .obj_cleanup = NULL +}; + size_t ucp_worker_base_size(ucp_context_h context, unsigned *config_max) { *config_max = ucs_min(UINT8_MAX, (context->num_tls + 1) * (context->num_tls + 1) * context->num_tls); @@ -1412,7 +1779,6 @@ ucs_status_t ucp_worker_create(ucp_context_h context, ucp_worker_h *worker_p) { ucs_thread_mode_t uct_thread_mode; - ucs_thread_mode_t thread_mode; unsigned config_count; unsigned name_length; ucp_worker_h worker; @@ -1425,30 +1791,24 @@ ucs_status_t ucp_worker_create(ucp_context_h context, return UCS_ERR_NO_MEMORY; } + uct_thread_mode = UCS_THREAD_MODE_SINGLE; + worker->flags = 0; + if (params->field_mask & UCP_WORKER_PARAM_FIELD_THREAD_MODE) { -#if !ENABLE_MT - thread_mode = UCS_THREAD_MODE_SINGLE; +#if ENABLE_MT if (params->thread_mode != UCS_THREAD_MODE_SINGLE) 
{ - ucs_debug("forced single thread mode on worker create"); + /* UCT is serialized by UCP lock or by UCP user */ + uct_thread_mode = UCS_THREAD_MODE_SERIALIZED; + } + + if (params->thread_mode == UCS_THREAD_MODE_MULTI) { + worker->flags |= UCP_WORKER_FLAG_MT; } #else - thread_mode = params->thread_mode; + if (params->thread_mode != UCS_THREAD_MODE_SINGLE) { + ucs_debug("forced single thread mode on worker create"); + } #endif - } else { - thread_mode = UCS_THREAD_MODE_SINGLE; - } - - if (thread_mode == UCS_THREAD_MODE_MULTI) { - worker->flags = UCP_WORKER_FLAG_MT; - } else { - worker->flags = 0; - } - - if (thread_mode == UCS_THREAD_MODE_SINGLE) { - uct_thread_mode = UCS_THREAD_MODE_SINGLE; - } else { - /* UCT is serialized by UCP lock or by UCP user */ - uct_thread_mode = UCS_THREAD_MODE_SERIALIZED; } worker->context = context; @@ -1458,13 +1818,17 @@ ucs_status_t ucp_worker_create(ucp_context_h context, worker->ep_config_max = config_count; worker->ep_config_count = 0; worker->num_active_ifaces = 0; + worker->num_ifaces = 0; + worker->am_message_id = ucs_generate_uuid(0); + worker->rkey_ptr_cb_id = UCS_CALLBACKQ_ID_NULL; + ucs_queue_head_init(&worker->rkey_ptr_reqs); ucs_list_head_init(&worker->arm_ifaces); ucs_list_head_init(&worker->stream_ready_eps); ucs_list_head_init(&worker->all_eps); ucp_ep_match_init(&worker->ep_match_ctx); UCS_STATIC_ASSERT(sizeof(ucp_ep_ext_gen_t) <= sizeof(ucp_ep_t)); - if (context->config.features & UCP_FEATURE_STREAM) { + if (context->config.features & (UCP_FEATURE_STREAM | UCP_FEATURE_AM)) { UCS_STATIC_ASSERT(sizeof(ucp_ep_ext_proto_t) <= sizeof(ucp_ep_t)); ucs_strided_alloc_init(&worker->ep_alloc, sizeof(ucp_ep_t), 3); } else { @@ -1519,12 +1883,22 @@ ucs_status_t ucp_worker_create(ucp_context_h context, goto err_destroy_uct_worker; } - /* Create epoll set which combines events from all transports */ - status = ucp_worker_wakeup_init(worker, params); + /* create memory pool for small rkeys */ + status = 
ucs_mpool_init(&worker->rkey_mp, 0, + sizeof(ucp_rkey_t) + + sizeof(ucp_tl_rkey_t) * UCP_RKEY_MPOOL_MAX_MD, + 0, UCS_SYS_CACHE_LINE_SIZE, 128, UINT_MAX, + &ucp_rkey_mpool_ops, "ucp_rkeys"); if (status != UCS_OK) { goto err_req_mp_cleanup; } + /* Create UCS event set which combines events from all transports */ + status = ucp_worker_wakeup_init(worker, params); + if (status != UCS_OK) { + goto err_rkey_mp_cleanup; + } + if (params->field_mask & UCP_WORKER_PARAM_FIELD_CPU_MASK) { worker->cpu_mask = params->cpu_mask; } else { @@ -1537,22 +1911,34 @@ ucs_status_t ucp_worker_create(ucp_context_h context, goto err_wakeup_cleanup; } + /* Initialize UCP AMs */ + status = ucp_am_init(worker); + if (status != UCS_OK) { + goto err_tag_match_cleanup; + } + /* Open all resources as interfaces on this worker */ status = ucp_worker_add_resource_ifaces(worker); if (status != UCS_OK) { goto err_close_ifaces; } + /* Open all resources as connection managers on this worker */ + status = ucp_worker_add_resource_cms(worker); + if (status != UCS_OK) { + goto err_close_cms; + } + /* create mem type endponts */ - status = ucp_worker_create_mem_type_endpoints(worker);; + status = ucp_worker_create_mem_type_endpoints(worker); if (status != UCS_OK) { - goto err_close_ifaces; + goto err_close_cms; } /* Init AM and registered memory pools */ status = ucp_worker_init_mpools(worker); if (status != UCS_OK) { - goto err_close_ifaces; + goto err_close_cms; } /* Select atomic resources */ @@ -1567,7 +1953,7 @@ ucs_status_t ucp_worker_create(ucp_context_h context, /* At this point all UCT memory domains and interfaces are already created * so warn about unused environment variables. 
*/ - ucs_config_parser_warn_unused_env_vars(); + ucs_config_parser_warn_unused_env_vars_once(context->config.env_prefix); *worker_p = worker; return UCS_OK; @@ -1576,11 +1962,16 @@ ucs_status_t ucp_worker_create(ucp_context_h context, ucs_mpool_cleanup(&worker->am_mp, 1); ucs_mpool_cleanup(&worker->reg_mp, 1); ucs_mpool_cleanup(&worker->rndv_frag_mp, 1); +err_close_cms: + ucp_worker_close_cms(worker); err_close_ifaces: ucp_worker_close_ifaces(worker); +err_tag_match_cleanup: ucp_tag_match_cleanup(&worker->tm); err_wakeup_cleanup: ucp_worker_wakeup_cleanup(worker); +err_rkey_mp_cleanup: + ucs_mpool_cleanup(&worker->rkey_mp, 1); err_req_mp_cleanup: ucs_mpool_cleanup(&worker->req_mp, 1); err_destroy_uct_worker: @@ -1607,6 +1998,17 @@ static void ucp_worker_destroy_eps(ucp_worker_h worker) } } +static void ucp_worker_destroy_ep_configs(ucp_worker_h worker) +{ + unsigned i; + + for (i = 0; i < worker->ep_config_count; ++i) { + ucp_ep_config_cleanup(worker, &worker->ep_config[i]); + } + + worker->ep_config_count = 0; +} + void ucp_worker_destroy(ucp_worker_h worker) { ucs_trace_func("worker=%p", worker); @@ -1614,15 +2016,19 @@ void ucp_worker_destroy(ucp_worker_h worker) UCS_ASYNC_BLOCK(&worker->async); ucp_worker_destroy_eps(worker); ucp_worker_remove_am_handlers(worker); + ucp_am_cleanup(worker); + ucp_worker_close_cms(worker); UCS_ASYNC_UNBLOCK(&worker->async); ucp_worker_clean_extensions(worker); + ucp_worker_destroy_ep_configs(worker); + ucp_tag_match_cleanup(&worker->tm); ucs_mpool_cleanup(&worker->am_mp, 1); ucs_mpool_cleanup(&worker->reg_mp, 1); ucs_mpool_cleanup(&worker->rndv_frag_mp, 1); ucp_worker_close_ifaces(worker); - ucp_tag_match_cleanup(&worker->tm); ucp_worker_wakeup_cleanup(worker); + ucs_mpool_cleanup(&worker->rkey_mp, 1); ucs_mpool_cleanup(&worker->req_mp, 1); uct_worker_destroy(worker->uct); ucs_async_context_cleanup(&worker->async); @@ -1652,7 +2058,7 @@ ucs_status_t ucp_worker_query(ucp_worker_h worker, if (attr->field_mask & 
UCP_WORKER_ATTR_FIELD_ADDRESS) { /* If UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS is not set, * pack all tl adresses */ - tl_bitmap = -1; + tl_bitmap = UINT64_MAX; if (attr->field_mask & UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS) { if (attr->address_flags & UCP_WORKER_ADDRESS_FLAG_NET_ONLY) { @@ -1665,7 +2071,9 @@ ucs_status_t ucp_worker_query(ucp_worker_h worker, } } - status = ucp_address_pack(worker, NULL, tl_bitmap, NULL, &attr->address_length, + status = ucp_address_pack(worker, NULL, tl_bitmap, + UCP_ADDRESS_PACK_FLAGS_ALL, NULL, + &attr->address_length, (void**)&attr->address); } @@ -1731,7 +2139,7 @@ ucs_status_t ucp_worker_get_efd(ucp_worker_h worker, int *fd) if (worker->flags & UCP_WORKER_FLAG_EXTERNAL_EVENT_FD) { status = UCS_ERR_UNSUPPORTED; } else { - *fd = worker->epfd; + *fd = worker->event_fd; status = UCS_OK; } UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); @@ -1816,26 +2224,38 @@ ucs_status_t ucp_worker_wait(ucp_worker_h worker) status = ucp_worker_arm(worker); if (status == UCS_ERR_BUSY) { /* if UCS_ERR_BUSY returned - no poll() must called */ status = UCS_OK; - goto out; + goto out_unlock; } else if (status != UCS_OK) { - goto out; + goto out_unlock; } if (worker->flags & UCP_WORKER_FLAG_EXTERNAL_EVENT_FD) { pfd = ucs_alloca(sizeof(*pfd) * worker->context->num_tls); nfds = 0; ucs_list_for_each(wiface, &worker->arm_ifaces, arm_list) { - pfd[nfds].fd = wiface->event_fd; + if (!ucp_worker_iface_use_event_fd(wiface)) { + /* if UCT iface supports asynchronous event callback, we + * prefer this method, since it will be called anyway. So, + * no need to get event fd. 
*/ + continue; + } + + pfd[nfds].fd = ucp_worker_iface_get_event_fd(wiface); pfd[nfds].events = POLLIN; ++nfds; } } else { pfd = ucs_alloca(sizeof(*pfd)); - pfd->fd = worker->epfd; - pfd->events = POLLIN; - nfds = 1; + pfd->fd = worker->event_fd; + pfd->events = POLLIN; + nfds = 1; } + UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); + + /* poll is thread safe system call, though can have unpredictable results + * because of using the same descriptor in multiple threads. + */ for (;;) { ret = poll(pfd, nfds, -1); if (ret >= 0) { @@ -1851,8 +2271,9 @@ ucs_status_t ucp_worker_wait(ucp_worker_h worker) } } +out_unlock: + UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); out: - UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); return status; } @@ -1871,8 +2292,9 @@ ucs_status_t ucp_worker_get_address(ucp_worker_h worker, ucp_address_t **address UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); - status = ucp_address_pack(worker, NULL, -1, NULL, address_length_p, - (void**)address_p); + status = ucp_address_pack(worker, NULL, UINT64_MAX, + UCP_ADDRESS_PACK_FLAGS_ALL, NULL, + address_length_p, (void**)address_p); UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); diff --git a/src/ucp/core/ucp_worker.h b/src/ucp/core/ucp_worker.h index 9c33b10bdd8..b7c96d77ec4 100644 --- a/src/ucp/core/ucp_worker.h +++ b/src/ucp/core/ucp_worker.h @@ -1,6 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. - * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -13,7 +12,7 @@ #include "ucp_context.h" #include "ucp_thread.h" -#include +#include #include #include #include @@ -25,7 +24,7 @@ /* The size of the private buffer in UCT descriptor headroom, which UCP may * use for its own needs. This size does not include ucp_recv_desc_t length, * because it is common for all cases and protocols (TAG, STREAM). 
*/ -#define UCP_WORKER_HEADROOM_PRIV_SIZE 24 +#define UCP_WORKER_HEADROOM_PRIV_SIZE 32 #if ENABLE_MT @@ -97,6 +96,11 @@ enum { UCP_WORKER_STAT_TAG_RX_RNDV_EXP, UCP_WORKER_STAT_TAG_RX_RNDV_UNEXP, + + UCP_WORKER_STAT_TAG_RX_RNDV_GET_ZCOPY, + UCP_WORKER_STAT_TAG_RX_RNDV_SEND_RTR, + UCP_WORKER_STAT_TAG_RX_RNDV_RKEY_PTR, + UCP_WORKER_STAT_LAST }; @@ -114,6 +118,7 @@ enum { UCP_WORKER_STAT_TAG_OFFLOAD_BLOCK_WILDCARD, UCP_WORKER_STAT_TAG_OFFLOAD_BLOCK_SW_PEND, UCP_WORKER_STAT_TAG_OFFLOAD_BLOCK_NO_IFACE, + UCP_WORKER_STAT_TAG_OFFLOAD_BLOCK_MEM_REG, UCP_WORKER_STAT_TAG_OFFLOAD_RX_UNEXP_EGR, UCP_WORKER_STAT_TAG_OFFLOAD_RX_UNEXP_RNDV, UCP_WORKER_STAT_TAG_OFFLOAD_RX_UNEXP_SW_RNDV, @@ -134,7 +139,7 @@ enum { #define UCP_WORKER_STAT_EAGER_MSG(_worker, _flags) \ UCS_STATS_UPDATE_COUNTER((_worker)->stats, \ - (_flags & UCP_RECV_DESC_FLAG_EAGER_SYNC) ? \ + ((_flags) & UCP_RECV_DESC_FLAG_EAGER_SYNC) ? \ UCP_WORKER_STAT_TAG_RX_EAGER_SYNC_MSG : \ UCP_WORKER_STAT_TAG_RX_EAGER_MSG, 1); @@ -142,21 +147,21 @@ enum { UCS_STATS_UPDATE_COUNTER((_worker)->stats, \ UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_##_is_exp, 1); -#define UCP_WORKER_STAT_RNDV(_worker, _is_exp) \ +#define UCP_WORKER_STAT_RNDV(_worker, _is_exp, _value) \ UCS_STATS_UPDATE_COUNTER((_worker)->stats, \ - UCP_WORKER_STAT_TAG_RX_RNDV_##_is_exp, 1); + UCP_WORKER_STAT_TAG_RX_RNDV_##_is_exp, _value); #define UCP_WORKER_STAT_TAG_OFFLOAD(_worker, _name) \ UCS_STATS_UPDATE_COUNTER((_worker)->tm_offload_stats, \ UCP_WORKER_STAT_TAG_OFFLOAD_##_name, 1); -#define ucp_worker_mpool_get(_worker) \ +#define ucp_worker_mpool_get(_mp) \ ({ \ - ucp_mem_desc_t *rdesc = ucs_mpool_get_inline(&(_worker)->reg_mp); \ - if (rdesc != NULL) { \ - VALGRIND_MAKE_MEM_DEFINED(rdesc, sizeof(*rdesc)); \ + ucp_mem_desc_t *_rdesc = ucs_mpool_get_inline(_mp); \ + if (_rdesc != NULL) { \ + VALGRIND_MAKE_MEM_DEFINED(_rdesc, sizeof(*_rdesc)); \ } \ - rdesc; \ + _rdesc; \ }) @@ -181,6 +186,16 @@ struct ucp_worker_iface { }; +/** + * UCP worker CM, which encapsulates UCT 
CM and its auxiliary info. + */ +struct ucp_worker_cm { + uct_cm_h cm; /* UCT CM handle */ + ucp_rsc_index_t cmpt_idx; /* Index of corresponding + component */ +}; + + /** * UCP worker (thread context). */ @@ -191,6 +206,7 @@ typedef struct ucp_worker { uint64_t uuid; /* Unique ID for wireup */ uct_worker_h uct; /* UCT worker handle */ ucs_mpool_t req_mp; /* Memory pool for requests */ + ucs_mpool_t rkey_mp; /* Pool for small memory keys */ uint64_t atomic_tls; /* Which resources can be used for atomics */ int inprogress; @@ -198,7 +214,8 @@ typedef struct ucp_worker { unsigned flush_ops_count;/* Number of pending operations */ - int epfd; /* Allocated (on-demand) epoll fd for wakeup */ + int event_fd; /* Allocated (on-demand) event fd for wakeup */ + ucs_sys_event_set_t *event_set; /* Allocated UCS event set for wakeup */ int eventfd; /* Event fd to support signal() calls */ unsigned uct_events; /* UCT arm events */ ucs_list_link_t arm_ifaces; /* List of interfaces to arm */ @@ -208,18 +225,25 @@ typedef struct ucp_worker { ucs_list_link_t stream_ready_eps; /* List of EPs with received stream data */ ucs_list_link_t all_eps; /* List of all endpoints */ ucp_ep_match_ctx_t ep_match_ctx; /* Endpoint-to-endpoint matching context */ - ucp_worker_iface_t *ifaces; /* Array of interfaces, one for each resource */ + ucp_worker_iface_t **ifaces; /* Array of pointers to interfaces, + one for each resource */ unsigned num_ifaces; /* Number of elements in ifaces array */ unsigned num_active_ifaces; /* Number of activated ifaces */ + uint64_t scalable_tl_bitmap; /* Map of scalable tl resources */ + ucp_worker_cm_t *cms; /* Array of CMs, one for each component */ ucs_mpool_t am_mp; /* Memory pool for AM receives */ ucs_mpool_t reg_mp; /* Registered memory pool */ ucs_mpool_t rndv_frag_mp; /* Memory pool for RNDV fragments */ + ucs_queue_head_t rkey_ptr_reqs; /* Queue of submitted RKEY PTR requests that + * are in-progress */ + uct_worker_cb_id_t rkey_ptr_cb_id;/* RKEY PTR worker 
callback queue ID */ ucp_tag_match_t tm; /* Tag-matching queues and offload info */ - void *groups; /* Groups and collectives context */ - ucp_ep_h mem_type_ep[UCT_MD_MEM_TYPE_LAST];/* memory type eps */ + ucp_am_context_t am; /* Array of AM callbacks and their data */ + uint64_t am_message_id; /* For matching long am's */ + ucp_ep_h mem_type_ep[UCS_MEMORY_TYPE_LAST];/* memory type eps */ - UCS_STATS_NODE_DECLARE(stats); - UCS_STATS_NODE_DECLARE(tm_offload_stats); + UCS_STATS_NODE_DECLARE(stats) + UCS_STATS_NODE_DECLARE(tm_offload_stats) ucs_cpu_set_t cpu_mask; /* Save CPU mask for subsequent calls to ucp_worker_listen */ unsigned ep_config_max; /* Maximal number of configurations */ @@ -240,12 +264,14 @@ typedef struct ucp_worker_err_handle_arg { } ucp_worker_err_handle_arg_t; -unsigned ucp_worker_get_ep_config(ucp_worker_h worker, - const ucp_ep_config_key_t *key); +ucs_status_t ucp_worker_get_ep_config(ucp_worker_h worker, + const ucp_ep_config_key_t *key, + int print_cfg, + ucp_ep_cfg_index_t *config_idx_p); ucs_status_t ucp_worker_iface_open(ucp_worker_h worker, ucp_rsc_index_t tl_id, uct_iface_params_t *iface_params, - ucp_worker_iface_t *wiface); + ucp_worker_iface_t **wiface); ucs_status_t ucp_worker_iface_init(ucp_worker_h worker, ucp_rsc_index_t tl_id, ucp_worker_iface_t *wiface); @@ -286,7 +312,15 @@ static inline ucp_ep_h ucp_worker_get_ep_by_ptr(ucp_worker_h worker, static UCS_F_ALWAYS_INLINE ucp_worker_iface_t* ucp_worker_iface(ucp_worker_h worker, ucp_rsc_index_t rsc_index) { - return &worker->ifaces[ucs_bitmap2idx(worker->context->tl_bitmap, rsc_index)]; + uint64_t tl_bitmap; + + if (rsc_index == UCP_NULL_RESOURCE) { + return NULL; + } + + tl_bitmap = worker->context->tl_bitmap; + ucs_assert(UCS_BIT(rsc_index) & tl_bitmap); + return worker->ifaces[ucs_bitmap2idx(tl_bitmap, rsc_index)]; } static UCS_F_ALWAYS_INLINE uct_iface_attr_t* @@ -295,10 +329,30 @@ ucp_worker_iface_get_attr(ucp_worker_h worker, ucp_rsc_index_t rsc_index) return 
&ucp_worker_iface(worker, rsc_index)->attr; } +static UCS_F_ALWAYS_INLINE double +ucp_worker_iface_bandwidth(ucp_worker_h worker, ucp_rsc_index_t rsc_index) +{ + uct_iface_attr_t *iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); + + return ucp_tl_iface_bandwidth(worker->context, &iface_attr->bandwidth); +} + static UCS_F_ALWAYS_INLINE int ucp_worker_unified_mode(ucp_worker_h worker) { return worker->context->config.ext.unified_mode; } +static UCS_F_ALWAYS_INLINE ucp_rsc_index_t +ucp_worker_num_cm_cmpts(const ucp_worker_h worker) +{ + return worker->context->config.num_cm_cmpts; +} + +static UCS_F_ALWAYS_INLINE int +ucp_worker_sockaddr_is_cm_proto(const ucp_worker_h worker) +{ + return !!ucp_worker_num_cm_cmpts(worker); +} + #endif diff --git a/src/ucp/dt/dt.c b/src/ucp/dt/dt.c index 34dcf300bf5..b85bcc1e555 100644 --- a/src/ucp/dt/dt.c +++ b/src/ucp/dt/dt.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "dt.h" #include @@ -15,7 +19,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_mem_type_unpack, (worker, buffer, recv_data, recv_length, mem_type), ucp_worker_h worker, void *buffer, const void *recv_data, - size_t recv_length, uct_memory_type_t mem_type) + size_t recv_length, ucs_memory_type_t mem_type) { ucp_ep_h ep = worker->mem_type_ep[mem_type]; ucp_md_map_t md_map = 0; @@ -36,7 +40,8 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_mem_type_unpack, mem_type, md_index, memh, &md_map, &rkey_bundle); if (status != UCS_OK) { - ucs_error("failed to register buffer with mem type domian"); + ucs_error("failed to register buffer with mem type domain %s", + ucs_memory_type_names[mem_type]); return status; } @@ -46,7 +51,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_mem_type_unpack, ucs_error("uct_ep_put_short() failed %s", ucs_status_string(status)); } - ucp_mem_type_unreg_buffers(worker, mem_type, memh, + ucp_mem_type_unreg_buffers(worker, mem_type, md_index, memh, &md_map, &rkey_bundle); return status; } @@ -54,12 
+59,12 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_mem_type_unpack, UCS_PROFILE_FUNC(ucs_status_t, ucp_mem_type_pack, (worker, dest, src, length, mem_type), ucp_worker_h worker, void *dest, const void *src, size_t length, - uct_memory_type_t mem_type) + ucs_memory_type_t mem_type) { ucp_ep_h ep = worker->mem_type_ep[mem_type]; ucp_md_map_t md_map = 0; ucp_lane_index_t lane; - unsigned md_index; + ucp_md_index_t md_index; ucs_status_t status; uct_mem_h memh[1]; uct_rkey_bundle_t rkey_bundle; @@ -71,26 +76,27 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_mem_type_pack, lane = ucp_ep_config(ep)->key.rma_lanes[0]; md_index = ucp_ep_md_index(ep, lane); - status = ucp_mem_type_reg_buffers(worker, (void *)src, length, mem_type, md_index, - memh, &md_map, &rkey_bundle); + status = ucp_mem_type_reg_buffers(worker, (void *)src, length, mem_type, + md_index, memh, &md_map, &rkey_bundle); if (status != UCS_OK) { - ucs_error("failed to register buffer with mem type domian"); + ucs_error("failed to register buffer with mem type domain %s", + ucs_memory_type_names[mem_type]); return status; } status = uct_ep_get_short(ep->uct_eps[lane], dest, length, (uint64_t)src, rkey_bundle.rkey); if (status != UCS_OK) { - ucs_error("uct_ep_put_short() failed %s", ucs_status_string(status)); + ucs_error("uct_ep_get_short() failed %s", ucs_status_string(status)); } - ucp_mem_type_unreg_buffers(worker, mem_type, memh, + ucp_mem_type_unreg_buffers(worker, mem_type, md_index, memh, &md_map, &rkey_bundle); return status; } size_t ucp_dt_pack(ucp_worker_h worker, ucp_datatype_t datatype, - uct_memory_type_t mem_type, void *dest, const void *src, + ucs_memory_type_t mem_type, void *dest, const void *src, ucp_dt_state_t *state, size_t length) { size_t result_len = 0; @@ -102,12 +108,13 @@ size_t ucp_dt_pack(ucp_worker_h worker, ucp_datatype_t datatype, switch (datatype & UCP_DATATYPE_CLASS_MASK) { case UCP_DATATYPE_CONTIG: - if ((ucs_likely(UCP_MEM_IS_HOST(mem_type))) || - 
(ucs_likely(UCP_MEM_IS_CUDA_MANAGED(mem_type))) || - (ucs_likely(UCP_MEM_IS_ROCM_MANAGED(mem_type)))) { - UCS_PROFILE_CALL(memcpy, dest, src + state->offset, length); + if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(mem_type)) { + UCS_PROFILE_CALL(ucs_memcpy_relaxed, dest, + UCS_PTR_BYTE_OFFSET(src, state->offset), length); } else { - ucp_mem_type_pack(worker, dest, src + state->offset, length, mem_type); + ucp_mem_type_pack(worker, dest, + UCS_PTR_BYTE_OFFSET(src, state->offset), + length, mem_type); } result_len = length; break; diff --git a/src/ucp/dt/dt.h b/src/ucp/dt/dt.h index bd034fcfb03..c1a5afc1151 100644 --- a/src/ucp/dt/dt.h +++ b/src/ucp/dt/dt.h @@ -47,12 +47,18 @@ typedef struct ucp_dt_state { size_t ucp_dt_pack(ucp_worker_h worker, ucp_datatype_t datatype, - uct_memory_type_t mem_type, void *dest, const void *src, + ucs_memory_type_t mem_type, void *dest, const void *src, ucp_dt_state_t *state, size_t length); + +ucs_status_t ucp_mem_type_pack(ucp_worker_h worker, void *dest, + const void *src, size_t length, + ucs_memory_type_t mem_type); + + ucs_status_t ucp_mem_type_unpack(ucp_worker_h worker, void *buffer, const void *recv_data, size_t recv_length, - uct_memory_type_t mem_type); + ucs_memory_type_t mem_type); #endif /* UCP_DT_H_ */ diff --git a/src/ucp/dt/dt.inl b/src/ucp/dt/dt.inl index b50b8b8f981..95e462d86c0 100644 --- a/src/ucp/dt/dt.inl +++ b/src/ucp/dt/dt.inl @@ -41,7 +41,7 @@ size_t ucp_dt_length(ucp_datatype_t datatype, size_t count, static UCS_F_ALWAYS_INLINE ucs_status_t ucp_dt_unpack_only(ucp_worker_h worker, void *buffer, size_t count, - ucp_datatype_t datatype, uct_memory_type_t mem_type, + ucp_datatype_t datatype, ucs_memory_type_t mem_type, const void *data, size_t length, int truncation) { size_t iov_offset, iovcnt_offset; @@ -56,10 +56,8 @@ ucp_dt_unpack_only(ucp_worker_h worker, void *buffer, size_t count, ucs_unlikely(length > (buffer_size = ucp_contig_dt_length(datatype, count)))) { goto err_truncated; } - if 
(ucs_likely(UCP_MEM_IS_HOST(mem_type)) || - (ucs_likely(UCP_MEM_IS_CUDA_MANAGED(mem_type))) || - (ucs_likely(UCP_MEM_IS_ROCM_MANAGED(mem_type)))) { - UCS_PROFILE_NAMED_CALL("memcpy_recv", memcpy, buffer, data, length); + if (ucs_likely(UCP_MEM_IS_ACCESSIBLE_FROM_CPU(mem_type))) { + UCS_PROFILE_NAMED_CALL("memcpy_recv", ucs_memcpy_relaxed, buffer, data, length); } else { ucp_mem_type_unpack(worker, buffer, data, length, mem_type); } diff --git a/src/ucp/dt/dt_contig.c b/src/ucp/dt/dt_contig.c index 024de95c3b8..0a646e688ab 100644 --- a/src/ucp/dt/dt_contig.c +++ b/src/ucp/dt/dt_contig.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "dt_contig.h" #include diff --git a/src/ucp/dt/dt_generic.c b/src/ucp/dt/dt_generic.c index d1147fe02cd..cf38142710b 100644 --- a/src/ucp/dt/dt_generic.c +++ b/src/ucp/dt/dt_generic.c @@ -4,8 +4,13 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "dt_generic.h" +#include #include @@ -13,9 +18,12 @@ ucs_status_t ucp_dt_create_generic(const ucp_generic_dt_ops_t *ops, void *contex ucp_datatype_t *datatype_p) { ucp_dt_generic_t *dt; + int ret; - dt = ucs_memalign(UCS_BIT(UCP_DATATYPE_SHIFT), sizeof(*dt), "generic_dt"); - if (dt == NULL) { + ret = ucs_posix_memalign((void **)&dt, + ucs_max(sizeof(void *), UCS_BIT(UCP_DATATYPE_SHIFT)), + sizeof(*dt), "generic_dt"); + if (ret != 0) { return UCS_ERR_NO_MEMORY; } diff --git a/src/ucp/dt/dt_iov.c b/src/ucp/dt/dt_iov.c index 2ae6b604903..1458467c79b 100644 --- a/src/ucp/dt/dt_iov.c +++ b/src/ucp/dt/dt_iov.c @@ -3,6 +3,11 @@ * * See file LICENSE for terms. 
*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "dt_iov.h" #include @@ -25,7 +30,8 @@ void ucp_dt_iov_gather(void *dest, const ucp_dt_iov_t *iov, size_t length, item_len_to_copy = item_reminder - ucs_max((ssize_t)((length_it + item_reminder) - length), 0); - memcpy(dest + length_it, iov[*iovcnt_offset].buffer + *iov_offset, + memcpy(UCS_PTR_BYTE_OFFSET(dest, length_it), + UCS_PTR_BYTE_OFFSET(iov[*iovcnt_offset].buffer, *iov_offset), item_len_to_copy); length_it += item_len_to_copy; @@ -51,7 +57,8 @@ size_t ucp_dt_iov_scatter(ucp_dt_iov_t *iov, size_t iovcnt, const void *src, length - length_it); ucs_assert(*iov_offset <= item_len); - memcpy(iov[*iovcnt_offset].buffer + *iov_offset, src + length_it, + memcpy(UCS_PTR_BYTE_OFFSET(iov[*iovcnt_offset].buffer, *iov_offset), + UCS_PTR_BYTE_OFFSET(src, length_it), item_len_to_copy); length_it += item_len_to_copy; diff --git a/src/ucp/dt/dt_iov.h b/src/ucp/dt/dt_iov.h index c19a7bae714..be8437cb335 100644 --- a/src/ucp/dt/dt_iov.h +++ b/src/ucp/dt/dt_iov.h @@ -16,13 +16,19 @@ /** - * Get the total length of the data contains in IOV buffers + * Get the total length of the data in @a iov buffers + * + * @param [in] iov @ref ucp_dt_iov_t buffer + * @param [in] iovcnt Number of entries in the @a iov buffer + * + * @return Total length of data in the @a iov buffers */ static inline size_t ucp_dt_iov_length(const ucp_dt_iov_t *iov, size_t iovcnt) { size_t iov_it, total_length = 0; for (iov_it = 0; iov_it < iovcnt; ++iov_it) { + /* cppcheck-suppress nullPointer */ total_length += iov[iov_it].length; } @@ -30,8 +36,8 @@ static inline size_t ucp_dt_iov_length(const ucp_dt_iov_t *iov, size_t iovcnt) } /** - * Copy iov data buffers from @a src to contiguous buffer @a dest with - * a iov item data @a iov_offset and iov item @a iovcnt_offset + * Copy iov data buffers from @a iov to contiguous buffer @a dest with + * an iov item data @a iov_offset and iov item @a iovcnt_offset * * @param [in] dest Destination 
contiguous buffer * (no offset applicable) @@ -41,7 +47,7 @@ static inline size_t ucp_dt_iov_length(const ucp_dt_iov_t *iov, size_t iovcnt) * from an @a iov item pointed by * @a iovcnt_offset. The @a iov_offset is not aligned * by @ref ucp_dt_iov_t items length. - * @param [inout] iovcnt_offset Auxiliary offset to select @a iov item which + * @param [inout] iovcnt_offset Auxiliary offset to select @a iov item that * belongs to the @a iov_offset. The point to start * copying from should be selected as * iov[iovcnt_offset].buffer + iov_offset @@ -61,12 +67,12 @@ void ucp_dt_iov_gather(void *dest, const ucp_dt_iov_t *iov, size_t length, * to an @a iov item pointed by @a iovcnt_offset. * The @a iov_offset is not aligned by * @ref ucp_dt_iov_t items length. - * @param [inout] iovcnt_offset Auxiliary offset to select @a iov item which + * @param [inout] iovcnt_offset Auxiliary offset to select @a iov item that * belongs to the @a iov_offset. The point to - * start copying from should be selected as + * start copying to should be selected as * iov[iovcnt_offset].buffer + iov_offset * - * @return Size in bytes that is actually copied from @a src to @a iov. It must + * @return Size in bytes that was actually copied from @a src to @a iov. It must * be less or equal to @a length. 
*/ size_t ucp_dt_iov_scatter(ucp_dt_iov_t *iov, size_t iovcnt, const void *src, @@ -77,9 +83,9 @@ size_t ucp_dt_iov_scatter(ucp_dt_iov_t *iov, size_t iovcnt, const void *src, * Seek to a logical offset in the iov * * @param [in] iov @ref ucp_dt_iov_t buffer to seek in - * @param [in] iovcnt Number of entries the @a iov buffer + * @param [in] iovcnt Number of entries in the @a iov buffer * @param [in] distance Distance to move, relative to the current - * current location + * location * @param [inout] iov_offset The offset in bytes from the beginning of the * current iov entry * @param [inout] iovcnt_offset Current @a iov item index @@ -89,12 +95,12 @@ void ucp_dt_iov_seek(ucp_dt_iov_t *iov, size_t iovcnt, ptrdiff_t distance, /** - * Count non-empty buffers in the iov + * Count non-empty entries in the @a iov array * * @param [in] iov @ref ucp_dt_iov_t buffer to count - * @param [in] iovcnt Number of entries the @a iov buffer + * @param [in] iovcnt Number of entries in the @a iov buffer * - * @return Number of non-empty buffers in the iovec + * @return Number of non-empty entries in the @a iov array */ size_t ucp_dt_iov_count_nonempty(const ucp_dt_iov_t *iov, size_t iovcnt); diff --git a/src/ucp/proto/lane_type.c b/src/ucp/proto/lane_type.c new file mode 100644 index 00000000000..9533c4df1e6 --- /dev/null +++ b/src/ucp/proto/lane_type.c @@ -0,0 +1,45 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "lane_type.h" + +#include + + +const ucp_lane_type_info_t ucp_lane_type_info[] = { + [UCP_LANE_TYPE_AM] = { + .short_name = "am" + }, + [UCP_LANE_TYPE_AM_BW] = { + .short_name = "am_bw" + }, + [UCP_LANE_TYPE_RMA] = { + .short_name = "rma" + }, + [UCP_LANE_TYPE_RMA_BW] = { + .short_name = "rma_bw" + }, + [UCP_LANE_TYPE_RKEY_PTR] = { + .short_name = "rkey_ptr" + }, + [UCP_LANE_TYPE_AMO] = { + .short_name = "amo" + }, + [UCP_LANE_TYPE_TAG] = { + .short_name = "tag" + }, + [UCP_LANE_TYPE_CM] = { + .short_name = "cm" + }, + [UCP_LANE_TYPE_LAST] = { + .short_name = NULL + }, +}; + diff --git a/src/ucp/proto/lane_type.h b/src/ucp/proto/lane_type.h new file mode 100644 index 00000000000..767601bbd75 --- /dev/null +++ b/src/ucp/proto/lane_type.h @@ -0,0 +1,39 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCP_LANE_TYPE_H_ +#define UCP_LANE_TYPE_H_ + +#include + + +/* + * Defines how a lane should be selected and used + */ +typedef enum { + UCP_LANE_TYPE_AM, /* Active messages */ + UCP_LANE_TYPE_AM_BW, /* High-BW active messages */ + UCP_LANE_TYPE_RMA, /* Remote memory access */ + UCP_LANE_TYPE_RMA_BW, /* High-BW remote memory access */ + UCP_LANE_TYPE_RKEY_PTR, /* Obtain remote memory pointer */ + UCP_LANE_TYPE_AMO, /* Atomic memory access */ + UCP_LANE_TYPE_TAG, /* Tag matching offload */ + UCP_LANE_TYPE_CM, /* CM wireup */ + UCP_LANE_TYPE_LAST +} ucp_lane_type_t; + + +typedef struct ucp_lane_type_info { + const char *short_name; +} ucp_lane_type_info_t; + + +typedef uint32_t ucp_lane_type_mask_t; + + +extern const ucp_lane_type_info_t ucp_lane_type_info[]; + +#endif diff --git a/src/ucp/proto/proto.c b/src/ucp/proto/proto.c new file mode 100644 index 00000000000..cf33e75892d --- /dev/null +++ b/src/ucp/proto/proto.c @@ -0,0 +1,16 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. 
+ * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "proto.h" + + +const ucp_proto_t *ucp_protocols[UCP_PROTO_MAX_COUNT] = {}; +unsigned ucp_protocols_count = 0; + diff --git a/src/ucp/proto/proto.h b/src/ucp/proto/proto.h index 290efa73543..344a274dd7a 100644 --- a/src/ucp/proto/proto.h +++ b/src/ucp/proto/proto.h @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2016. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -7,49 +7,172 @@ #ifndef UCP_PROTO_H_ #define UCP_PROTO_H_ -#include -#include -#include +#include "lane_type.h" + +#include +#include +#include + + +/* Maximal number of lanes per protocol */ +#define UCP_PROTO_MAX_LANES UCP_MAX_LANES + + +/* Maximal number of protocol performance ranges */ +#define UCP_PROTO_MAX_PERF_RANGES 32 + + +/* Maximal size of protocol private data */ +#define UCP_PROTO_PRIV_MAX 1024 + + +/* Maximal number of protocols in total */ +#define UCP_PROTO_MAX_COUNT 64 + + +/* Special value for non-existent protocol */ +#define UCP_PROTO_ID_INVALID ((ucp_proto_id_t)-1) + + +/* Protocol identifier */ +typedef unsigned ucp_proto_id_t; + + +/* Bitmap of protocols */ +typedef uint64_t ucp_proto_id_mask_t; /** - * Header segment for a transaction + * Key for looking up protocol configuration by operation parameters + */ +typedef struct { + uint8_t op_id; /* Operation ID */ + uint8_t op_flags; /* Operation flags */ + uint8_t dt_class; /* Datatype */ + uint8_t mem_type; /* Memory type */ + uint8_t sys_dev; /* System device */ + uint8_t sg_count; /* Number of non-contig scatter/gather + entries. If the actual number is larger + than UINT8_MAX, UINT8_MAX is used. 
*/ + uint8_t padding[2]; /* Make structure size be sizeof(uint64_t) */ +} UCS_S_PACKED ucp_proto_select_param_t; + + +/** + * Protocol and its private configuration + */ +typedef struct { + const ucp_proto_t *proto; /* Protocol definition */ + const void *priv; /* Protocol private configuration space */ +} ucp_proto_config_t; + + +/* + * Performance estimation for a range of message sizes */ typedef struct { - uintptr_t ep_ptr; - uintptr_t reqptr; -} UCS_S_PACKED ucp_request_hdr_t; + size_t max_length; /* Maximal message size */ + ucs_linear_func_t perf; /* Estimated time in seconds, as a + function of message size in bytes */ +} ucp_proto_perf_range_t; /** - * Header for transaction acknowledgment + * UCP protocol capabilities (per operation parameters) */ typedef struct { - uint64_t reqptr; - ucs_status_t status; -} UCS_S_PACKED ucp_reply_hdr_t; + size_t cfg_thresh; /* Configured protocol threshold */ + size_t min_length; /* Minimal message size */ + unsigned num_ranges; /* Number of entries in 'ranges' */ + + /* Performance estimation function for different message sizes */ + ucp_proto_perf_range_t ranges[UCP_PROTO_MAX_PERF_RANGES]; + +} ucp_proto_caps_t; /** - * Defines functions for a protocol, on all possible data types. + * Parameters for protocol initialization function + */ +typedef struct { + /* Input parameters */ + ucp_worker_h worker; /* Worker to initialize on */ + const ucp_proto_select_param_t *sel_param; /* Operation parameters */ + const ucp_ep_config_key_t *ep_config_key; /* Endpoint configuration */ + + /* Output parameters */ + void *priv; /* Pointer to priv buffer */ + size_t *priv_size; /* Occupied size in priv buffer */ + ucp_proto_caps_t *caps; /* Protocol capabilities */ +} ucp_proto_init_params_t; + + +/** + * Initialize protocol-specific configuration and estimate protocol performance + * as function of message size. + * + * @param [in] params Protocol initialization parameters. + * + * @return UCS_OK - if successful. 
+ * UCS_ERR_UNSUPPORTED - if the protocol is not supported on the key. + */ +typedef ucs_status_t +(*ucp_proto_init_func_t)(const ucp_proto_init_params_t *params); + + +/** + * Dump protocol-specific configuration. + * + * @param [in] priv Protocol private data, which was previously filled by + * @ref ucp_proto_init_func_t. + * @param [out] strb Filled with a string of protocol configuration text. + * The user is responsible to release the string by + * calling @ref ucs_string_buffer_cleanup. + */ +typedef void +(*ucp_proto_config_str_func_t)(const void *priv, ucs_string_buffer_t *strb); + + +/** + * UCP base protocol definition */ struct ucp_proto { - uct_pending_callback_t contig_short; /**< Progress short data */ - uct_pending_callback_t bcopy_single; /**< Progress bcopy single fragment */ - uct_pending_callback_t bcopy_multi; /**< Progress bcopy multi-fragment */ - uct_pending_callback_t zcopy_single; /**< Progress zcopy single fragment */ - uct_pending_callback_t zcopy_multi; /**< Progress zcopy multi-fragment */ - uct_completion_callback_t zcopy_completion; /**< Callback for UCT zcopy completion */ - size_t only_hdr_size; /**< Header size for single / short */ - size_t first_hdr_size; /**< Header size for first of multi */ - size_t mid_hdr_size; /**< Header size for rest of multi */ + const char *name; /* Protocol name */ + ucp_proto_init_func_t init; /* Initialization function */ + ucp_proto_config_str_func_t config_str; /* Configuration dump function */ + uct_pending_callback_t progress; /* UCT progress function */ }; -ucs_status_t ucp_proto_progress_am_bcopy_single(uct_pending_req_t *self); +/** + * Register a protocol definition. + */ +#define UCP_PROTO_REGISTER(_proto) \ + UCS_STATIC_INIT { \ + ucs_assert_always(ucp_protocols_count < UCP_PROTO_MAX_COUNT); \ + ucp_protocols[ucp_protocols_count++] = (_proto); \ + } + + +/** + * Retrieve a protocol field by protocol id. 
+ */ +#define ucp_proto_id_field(_proto_id, _field) \ + (ucp_protocols[(_proto_id)]->_field) + + +/** + * Call a protocol method by protocol id. + */ +#define ucp_proto_id_call(_proto_id, _func, ...) \ + ucp_proto_id_field(_proto_id, _func)(__VA_ARGS__) + + +/* Global array of all registered protocols */ +extern const ucp_proto_t *ucp_protocols[UCP_PROTO_MAX_COUNT]; -void ucp_proto_am_zcopy_completion(uct_completion_t *self, ucs_status_t status); +/* Number of globally registered protocols */ +extern unsigned ucp_protocols_count; -void ucp_proto_am_zcopy_req_complete(ucp_request_t *req, ucs_status_t status); #endif diff --git a/src/ucp/proto/proto_am.c b/src/ucp/proto/proto_am.c index f5f5de66a3d..01be173d2a4 100644 --- a/src/ucp/proto/proto_am.c +++ b/src/ucp/proto/proto_am.c @@ -4,12 +4,21 @@ * See file LICENSE for terms. */ -#include "proto.h" +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "proto_am.inl" #include +static inline size_t ucp_proto_max_packed_size() +{ + return ucs_max(sizeof(ucp_reply_hdr_t), + sizeof(ucp_offload_ssend_hdr_t)); +} + static size_t ucp_proto_pack(void *dest, void *arg) { ucp_request_t *req = arg; @@ -31,16 +40,44 @@ static size_t ucp_proto_pack(void *dest, void *arg) return sizeof(*off_rep_hdr); } - ucs_bug("unexpected am_id"); + ucs_fatal("unexpected am_id"); return 0; } -ucs_status_t ucp_proto_progress_am_bcopy_single(uct_pending_req_t *self) +ucs_status_t +ucp_do_am_single(uct_pending_req_t *self, uint8_t am_id, + uct_pack_callback_t pack_cb, ssize_t max_packed_size) { ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucp_ep_t *ep = req->send.ep; + ssize_t packed_len; + uint64_t *buffer; + + /* if packed data can fit short active message, use it, because it should + * be faster than bcopy. 
+ */ + if ((max_packed_size <= UCS_ALLOCA_MAX_SIZE) && + (max_packed_size <= ucp_ep_config(ep)->am.max_short)) { + req->send.lane = ucp_ep_get_am_lane(ep); + buffer = ucs_alloca(max_packed_size); + packed_len = pack_cb(buffer, req); + ucs_assertv((packed_len >= 0) && (packed_len <= max_packed_size), + "packed_len=%zd max_packed_size=%zu", packed_len, + max_packed_size); - ucs_status_t status = ucp_do_am_bcopy_single(self, req->send.proto.am_id, - ucp_proto_pack); + return uct_ep_am_short(ep->uct_eps[req->send.lane], am_id, buffer[0], + &buffer[1], packed_len - sizeof(uint64_t)); + } else { + return ucp_do_am_bcopy_single(self, am_id, pack_cb); + } +} + +ucs_status_t ucp_proto_progress_am_single(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucs_status_t status = ucp_do_am_single(self, req->send.proto.am_id, + ucp_proto_pack, + ucp_proto_max_packed_size()); if (status == UCS_OK) { req->send.proto.comp_cb(req); } diff --git a/src/ucp/proto/proto_am.h b/src/ucp/proto/proto_am.h new file mode 100644 index 00000000000..ee990210b6e --- /dev/null +++ b/src/ucp/proto/proto_am.h @@ -0,0 +1,42 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifndef UCP_PROTO_AM_H_ +#define UCP_PROTO_AM_H_ + +#include +#include + + +/** + * Header segment for a transaction + */ +typedef struct { + uintptr_t ep_ptr; + uintptr_t reqptr; +} UCS_S_PACKED ucp_request_hdr_t; + + +/** + * Header for transaction acknowledgment + */ +typedef struct { + uint64_t reqptr; + ucs_status_t status; +} UCS_S_PACKED ucp_reply_hdr_t; + + +ucs_status_t +ucp_do_am_single(uct_pending_req_t *self, uint8_t am_id, + uct_pack_callback_t pack_cb, ssize_t max_packed_size); + +ucs_status_t ucp_proto_progress_am_single(uct_pending_req_t *self); + +void ucp_proto_am_zcopy_completion(uct_completion_t *self, ucs_status_t status); + +void ucp_proto_am_zcopy_req_complete(ucp_request_t *req, ucs_status_t status); + +#endif diff --git a/src/ucp/proto/proto_am.inl b/src/ucp/proto/proto_am.inl index 4ab53b8af09..4907742df18 100644 --- a/src/ucp/proto/proto_am.inl +++ b/src/ucp/proto/proto_am.inl @@ -1,9 +1,14 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2016. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ +#ifndef UCP_PROTO_AM_INL_ +#define UCP_PROTO_AM_INL_ + +#include "proto_am.h" + #include #include #include @@ -11,6 +16,7 @@ #include #include + #define UCP_STATUS_PENDING_SWITCH (UCS_ERR_LAST - 1) typedef void (*ucp_req_complete_func_t)(ucp_request_t *req, ucs_status_t status); @@ -20,88 +26,155 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucp_do_am_bcopy_single(uct_pending_req_t *self, uint8_t am_id, uct_pack_callback_t pack_cb) { - ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); - ucp_ep_t *ep = req->send.ep; + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucp_ep_t *ep = req->send.ep; + ucp_dt_state_t state = req->send.state.dt; ssize_t packed_len; req->send.lane = ucp_ep_get_am_lane(ep); packed_len = uct_ep_am_bcopy(ep->uct_eps[req->send.lane], am_id, pack_cb, req, 0); - if (packed_len < 0) { - return packed_len; + if (ucs_unlikely(packed_len < 0)) { + /* Reset the state to the previous one */ + req->send.state.dt = state; + return (ucs_status_t)packed_len; } + ucs_assertv((size_t)packed_len <= ucp_ep_get_max_bcopy(ep, req->send.lane), + "packed_len=%zd max_bcopy=%zu", + packed_len, ucp_ep_get_max_bcopy(ep, req->send.lane)); + return UCS_OK; } static UCS_F_ALWAYS_INLINE ucs_status_t ucp_do_am_bcopy_multi(uct_pending_req_t *self, uint8_t am_id_first, uint8_t am_id_middle, - size_t hdr_size_middle, uct_pack_callback_t pack_first, uct_pack_callback_t pack_middle, int enable_am_bw) { - ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); - ucp_ep_t *ep = req->send.ep; + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucp_ep_t *ep = req->send.ep; + ucp_dt_state_t state = req->send.state.dt; ucs_status_t status; - size_t UCS_V_UNUSED max_middle; ssize_t packed_len; uct_ep_h uct_ep; - size_t offset; - int pending_adde_res; + int pending_add_res; - offset = req->send.state.dt.offset; - req->send.lane = (!enable_am_bw || !offset) ? 
/* first part of message must be sent */ - ucp_ep_get_am_lane(ep) : /* via AM lane */ - ucp_send_request_get_next_am_bw_lane(req); + req->send.lane = (!enable_am_bw || (state.offset == 0)) ? /* first part of message must be sent */ + ucp_ep_get_am_lane(ep) : /* via AM lane */ + ucp_send_request_get_am_bw_lane(req); uct_ep = ep->uct_eps[req->send.lane]; - max_middle = ucp_ep_get_max_bcopy(ep, req->send.lane) - hdr_size_middle; for (;;) { - if (offset == 0) { + if (state.offset == 0) { /* First */ packed_len = uct_ep_am_bcopy(uct_ep, am_id_first, pack_first, req, 0); - UCS_PROFILE_REQUEST_EVENT_CHECK_STATUS(req, "am_bcopy_first", packed_len, - packed_len); - ucs_assertv(req->send.state.dt.offset < req->send.length, - "offset=%zd", req->send.state.dt.offset); + UCS_PROFILE_REQUEST_EVENT_CHECK_STATUS(req, "am_bcopy_first", + packed_len, packed_len); } else { - ucs_assert(offset < req->send.length); + ucs_assert(state.offset < req->send.length); /* Middle or last */ packed_len = uct_ep_am_bcopy(uct_ep, am_id_middle, pack_middle, req, 0); - ucs_assertv((packed_len < 0) || (packed_len <= max_middle + hdr_size_middle), - "packed_len=%zd max_middle=%zu hdr_size_middle=%zu", - packed_len, max_middle, hdr_size_middle); UCS_PROFILE_REQUEST_EVENT_CHECK_STATUS(req, "am_bcopy_middle", packed_len, packed_len); - ucs_assert((packed_len < 0) || - (offset + packed_len - hdr_size_middle <= req->send.length)); - if ((packed_len > 0) && (offset + packed_len - hdr_size_middle == req->send.length)) { - /* Last */ - return UCS_OK; - } } if (ucs_unlikely(packed_len < 0)) { - if (req->send.lane != req->send.pending_lane) { + /* Reset the state to the previous one */ + req->send.state.dt = state; + + if ((packed_len == UCS_ERR_NO_RESOURCE) && + (req->send.lane != req->send.pending_lane)) { /* switch to new pending lane */ - pending_adde_res = ucp_request_pending_add(req, &status, 0); - if (!pending_adde_res) { + pending_add_res = ucp_request_pending_add(req, &status, 0); + if 
(!pending_add_res) { /* failed to switch req to pending queue, try again */ continue; } ucs_assert(status == UCS_INPROGRESS); - return UCP_STATUS_PENDING_SWITCH; + return (ucs_status_t)UCP_STATUS_PENDING_SWITCH; } else { - return packed_len; + return (ucs_status_t)packed_len; } } else { - return UCS_INPROGRESS; + ucs_assertv(/* The packed length has to be the same as maximum + * AM Bcopy for the first and middle segments */ + ((req->send.state.dt.offset < req->send.length) && + (packed_len == ucp_ep_get_max_bcopy(ep, req->send.lane))) || + /* The packed length has to be the same or less than + * maximum AM Bcopy for the last segment */ + (packed_len <= ucp_ep_get_max_bcopy(ep, req->send.lane)), + "packed_len=%zd max_bcopy=%zu", + packed_len, ucp_ep_get_max_bcopy(ep, req->send.lane)); + ucs_assertv(req->send.state.dt.offset <= req->send.length, + "offset=%zd length=%zu", + req->send.state.dt.offset, req->send.length); + ucs_assert(state.offset < req->send.state.dt.offset); + /* If the last segment was sent, return UCS_OK, + * otherwise - UCS_INPROGRESS */ + if (enable_am_bw) { + ucp_send_request_next_am_bw_lane(req); + } + return ((req->send.state.dt.offset < req->send.length) ? 
+ UCS_INPROGRESS : UCS_OK); } } } +static UCS_F_ALWAYS_INLINE +size_t ucp_dt_iov_copy_iov_uct(uct_iov_t *iov, size_t *iovcnt, + size_t max_dst_iov, ucp_dt_state_t *state, + const ucp_dt_iov_t *src_iov, size_t length_max, + ucp_md_index_t md_index, uint64_t md_flags) +{ + size_t length_it = 0; + size_t iov_offset, max_src_iov, src_it, dst_it; + ucp_md_index_t memh_index; + + iov_offset = state->dt.iov.iov_offset; + max_src_iov = state->dt.iov.iovcnt; + src_it = state->dt.iov.iovcnt_offset; + dst_it = 0; + state->dt.iov.iov_offset = 0; + + while ((dst_it < max_dst_iov) && (src_it < max_src_iov)) { + if (src_iov[src_it].length != 0) { + iov[dst_it].buffer = UCS_PTR_BYTE_OFFSET(src_iov[src_it].buffer, + iov_offset); + iov[dst_it].length = src_iov[src_it].length - iov_offset; + if (md_flags & UCT_MD_FLAG_NEED_MEMH) { + ucs_assert(state->dt.iov.dt_reg != NULL); + memh_index = ucs_bitmap2idx(state->dt.iov.dt_reg[src_it].md_map, + md_index); + iov[dst_it].memh = state->dt.iov.dt_reg[src_it].memh[memh_index]; + } else { + ucs_assert(state->dt.iov.dt_reg == NULL); + iov[dst_it].memh = UCT_MEM_HANDLE_NULL; + } + iov[dst_it].stride = 0; + iov[dst_it].count = 1; + length_it += iov[dst_it].length; + + ++dst_it; + if (length_it >= length_max) { + iov[dst_it - 1].length -= (length_it - length_max); + length_it = length_max; + state->dt.iov.iov_offset = iov_offset + iov[dst_it - 1].length; + break; + } + } + iov_offset = 0; + ++src_it; + } + + state->dt.iov.iovcnt_offset = src_it; + *iovcnt = dst_it; + + return length_it; +} + static UCS_F_ALWAYS_INLINE void ucp_dt_iov_copy_uct(ucp_context_h context, uct_iov_t *iov, size_t *iovcnt, size_t max_dst_iov, ucp_dt_state_t *state, @@ -109,13 +182,16 @@ void ucp_dt_iov_copy_uct(ucp_context_h context, uct_iov_t *iov, size_t *iovcnt, size_t length_max, ucp_md_index_t md_index, ucp_mem_desc_t *mdesc) { - size_t iov_offset, max_src_iov, src_it, dst_it; - size_t length_it = 0; + uint64_t md_flags = context->tl_mds[md_index].attr.cap.flags; + 
size_t length_it = 0; ucp_md_index_t memh_index; + ucs_assert((context->tl_mds[md_index].attr.cap.flags & UCT_MD_FLAG_REG) || + !(md_flags & UCT_MD_FLAG_NEED_MEMH)); + switch (datatype & UCP_DATATYPE_CLASS_MASK) { case UCP_DATATYPE_CONTIG: - if (context->tl_mds[md_index].attr.cap.flags & UCT_MD_FLAG_REG) { + if (md_flags & UCT_MD_FLAG_NEED_MEMH) { if (mdesc) { memh_index = ucs_bitmap2idx(mdesc->memh->md_map, md_index); iov[0].memh = mdesc->memh->uct[memh_index]; @@ -126,7 +202,7 @@ void ucp_dt_iov_copy_uct(ucp_context_h context, uct_iov_t *iov, size_t *iovcnt, } else { iov[0].memh = UCT_MEM_HANDLE_NULL; } - iov[0].buffer = (void *)src_iov + state->offset; + iov[0].buffer = UCS_PTR_BYTE_OFFSET(src_iov, state->offset); iov[0].length = length_max; iov[0].stride = 0; iov[0].count = 1; @@ -135,34 +211,9 @@ void ucp_dt_iov_copy_uct(ucp_context_h context, uct_iov_t *iov, size_t *iovcnt, length_it = iov[0].length; break; case UCP_DATATYPE_IOV: - iov_offset = state->dt.iov.iov_offset; - max_src_iov = state->dt.iov.iovcnt; - src_it = state->dt.iov.iovcnt_offset; - dst_it = 0; - state->dt.iov.iov_offset = 0; - while ((dst_it < max_dst_iov) && (src_it < max_src_iov)) { - if (src_iov[src_it].length) { - iov[dst_it].buffer = src_iov[src_it].buffer + iov_offset; - iov[dst_it].length = src_iov[src_it].length - iov_offset; - iov[dst_it].memh = state->dt.iov.dt_reg[src_it].memh[0]; - iov[dst_it].stride = 0; - iov[dst_it].count = 1; - length_it += iov[dst_it].length; - - ++dst_it; - if (length_it >= length_max) { - iov[dst_it - 1].length -= (length_it - length_max); - length_it = length_max; - state->dt.iov.iov_offset = iov_offset + iov[dst_it - 1].length; - break; - } - } - iov_offset = 0; - ++src_it; - } - - state->dt.iov.iovcnt_offset = src_it; - *iovcnt = dst_it; + length_it = ucp_dt_iov_copy_iov_uct(iov, iovcnt, max_dst_iov, state, + src_iov, length_max, md_index, + md_flags); break; default: ucs_error("Invalid data type"); @@ -176,12 +227,12 @@ ucs_status_t 
ucp_do_am_zcopy_single(uct_pending_req_t *self, uint8_t am_id, const void *hdr, size_t hdr_size, ucp_req_complete_func_t complete) { - ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); - ucp_ep_t *ep = req->send.ep; - size_t max_iov = ucp_ep_config(ep)->am.max_iov; - uct_iov_t *iov = ucs_alloca(max_iov * sizeof(uct_iov_t)); - size_t iovcnt = 0; - ucp_dt_state_t state = req->send.state.dt; + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucp_ep_t *ep = req->send.ep; + size_t max_iov = ucp_ep_config(ep)->am.max_iov; + uct_iov_t *iov = ucs_alloca(max_iov * sizeof(uct_iov_t)); + size_t iovcnt = 0; + ucp_dt_state_t state = req->send.state.dt; ucs_status_t status; req->send.lane = ucp_ep_get_am_lane(ep); @@ -203,6 +254,23 @@ ucs_status_t ucp_do_am_zcopy_single(uct_pending_req_t *self, uint8_t am_id, return UCS_STATUS_IS_ERR(status) ? status : UCS_OK; } +static UCS_F_ALWAYS_INLINE +void ucp_am_zcopy_complete_last_stage(ucp_request_t *req, ucp_dt_state_t *state, + ucp_req_complete_func_t complete) +{ + ucp_request_send_state_advance(req, state, + UCP_REQUEST_SEND_PROTO_ZCOPY_AM, + UCS_OK); + + /* Complete a request on a last stage if all previous AM + * Zcopy operations completed successfully. 
If there are + * operations that are in progress on other lanes, the last + * completed operation will complete the request */ + if (req->send.state.uct_comp.count == 0) { + complete(req, UCS_OK); + } +} + static UCS_F_ALWAYS_INLINE ucs_status_t ucp_do_am_zcopy_multi(uct_pending_req_t *self, uint8_t am_id_first, uint8_t am_id_middle, @@ -210,10 +278,10 @@ ucs_status_t ucp_do_am_zcopy_multi(uct_pending_req_t *self, uint8_t am_id_first, const void *hdr_middle, size_t hdr_size_middle, ucp_req_complete_func_t complete, int enable_am_bw) { - ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); - ucp_ep_t *ep = req->send.ep; - unsigned flag_iov_mid = 0; - size_t iovcnt = 0; + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucp_ep_t *ep = req->send.ep; + unsigned flag_iov_mid = 0; + size_t iovcnt = 0; ucp_dt_state_t state; size_t max_middle; size_t max_iov; @@ -222,19 +290,12 @@ ucs_status_t ucp_do_am_zcopy_multi(uct_pending_req_t *self, uint8_t am_id_first, size_t mid_len; ucs_status_t status; uct_ep_h uct_ep; - int pending_adde_res; + int pending_add_res; - if (UCP_DT_IS_CONTIG(req->send.datatype)) { - if (enable_am_bw && req->send.state.dt.offset) { - req->send.lane = ucp_send_request_get_next_am_bw_lane(req); - ucp_send_request_add_reg_lane(req, req->send.lane); - } else { - req->send.lane = ucp_ep_get_am_lane(ep); - } + if (enable_am_bw && (req->send.state.dt.offset != 0)) { + req->send.lane = ucp_send_request_get_am_bw_lane(req); + ucp_send_request_add_reg_lane(req, req->send.lane); } else { - ucs_assert(UCP_DT_IS_IOV(req->send.datatype)); - /* disable multilane for IOV datatype. 
- * TODO: add IOV processing for multilane */ req->send.lane = ucp_ep_get_am_lane(ep); } @@ -244,8 +305,8 @@ ucs_status_t ucp_do_am_zcopy_multi(uct_pending_req_t *self, uint8_t am_id_first, iov = ucs_alloca(max_iov * sizeof(uct_iov_t)); for (;;) { - state = req->send.state.dt; - offset = state.offset; + state = req->send.state.dt; + offset = state.offset; ucs_assert(max_iov > 0); if (UCP_DT_IS_IOV(req->send.datatype)) { @@ -287,6 +348,7 @@ ucs_status_t ucp_do_am_zcopy_multi(uct_pending_req_t *self, uint8_t am_id_first, &req->send.state.uct_comp); } else if (state.offset == req->send.length) { /* Empty IOVs on last stage */ + ucp_am_zcopy_complete_last_stage(req, &state, complete); return UCS_OK; } else { ucs_assert(offset == state.offset); @@ -303,13 +365,17 @@ ucs_status_t ucp_do_am_zcopy_multi(uct_pending_req_t *self, uint8_t am_id_first, if (!flag_iov_mid && (offset + mid_len == req->send.length)) { /* Last stage */ if (status == UCS_OK) { - complete(req, UCS_OK); + ucp_am_zcopy_complete_last_stage(req, &state, complete); return UCS_OK; } + ucp_request_send_state_advance(req, &state, UCP_REQUEST_SEND_PROTO_ZCOPY_AM, status); if (!UCS_STATUS_IS_ERR(status)) { + if (enable_am_bw) { + ucp_send_request_next_am_bw_lane(req); + } return UCS_OK; } } @@ -318,8 +384,8 @@ ucs_status_t ucp_do_am_zcopy_multi(uct_pending_req_t *self, uint8_t am_id_first, if (status == UCS_ERR_NO_RESOURCE) { if (req->send.lane != req->send.pending_lane) { /* switch to new pending lane */ - pending_adde_res = ucp_request_pending_add(req, &status, 0); - if (!pending_adde_res) { + pending_add_res = ucp_request_pending_add(req, &status, 0); + if (!pending_add_res) { /* failed to switch req to pending queue, try again */ continue; } @@ -327,11 +393,18 @@ ucs_status_t ucp_do_am_zcopy_multi(uct_pending_req_t *self, uint8_t am_id_first, return UCS_OK; } } + ucp_request_send_state_advance(req, &state, UCP_REQUEST_SEND_PROTO_ZCOPY_AM, status); - - return UCS_STATUS_IS_ERR(status) ? 
status : UCS_INPROGRESS; + if (UCS_STATUS_IS_ERR(status)) { + return status; + } else { + if (enable_am_bw) { + ucp_send_request_next_am_bw_lane(req); + } + return UCS_INPROGRESS; + } } } @@ -340,21 +413,17 @@ ucp_proto_get_zcopy_threshold(const ucp_request_t *req, const ucp_ep_msg_config_t *msg_config, size_t count, size_t max_zcopy) { - ucp_worker_h worker; + ucp_worker_h worker; ucp_lane_index_t lane; - ucp_rsc_index_t rsc_index; - size_t zcopy_thresh; + ucp_rsc_index_t rsc_index; + size_t zcopy_thresh; if (ucs_unlikely(msg_config->max_zcopy == 0)) { return max_zcopy; } - if (ucs_unlikely(!UCP_MEM_IS_HOST(req->send.mem_type))) { - return ucs_min(max_zcopy, msg_config->mem_type_zcopy_thresh[req->send.mem_type]); - } - if (ucs_likely(UCP_DT_IS_CONTIG(req->send.datatype))) { - return ucs_min(max_zcopy, msg_config->zcopy_thresh[0]); + return ucs_min(max_zcopy, msg_config->mem_type_zcopy_thresh[req->send.mem_type]); } else if (UCP_DT_IS_IOV(req->send.datatype)) { if (0 == count) { /* disable zcopy */ @@ -373,7 +442,7 @@ ucp_proto_get_zcopy_threshold(const ucp_request_t *req, zcopy_thresh = ucp_ep_config_get_zcopy_auto_thresh(count, &ucp_ep_md_attr(req->send.ep, lane)->reg_cost, worker->context, - ucp_worker_iface_get_attr(worker, rsc_index)->bandwidth); + ucp_worker_iface_bandwidth(worker, rsc_index)); } return ucs_min(max_zcopy, zcopy_thresh); } else if (UCP_DT_IS_GENERIC(req->send.datatype)) { @@ -389,8 +458,29 @@ static UCS_F_ALWAYS_INLINE ssize_t ucp_proto_get_short_max(const ucp_request_t *req, const ucp_ep_msg_config_t *msg_config) { - return (!UCP_DT_IS_CONTIG(req->send.datatype) || + return (!UCP_DT_IS_CONTIG(req->send.datatype) || (req->flags & UCP_REQUEST_FLAG_SYNC) || (!UCP_MEM_IS_HOST(req->send.mem_type))) ? 
-1 : msg_config->max_short; } + +static UCS_F_ALWAYS_INLINE ucp_request_t* +ucp_proto_ssend_ack_request_alloc(ucp_worker_h worker, uintptr_t ep_ptr) +{ + ucp_request_t *req; + + req = ucp_request_get(worker); + if (req == NULL) { + return NULL; + } + + req->flags = 0; + req->send.ep = ucp_worker_get_ep_by_ptr(worker, ep_ptr); + req->send.uct.func = ucp_proto_progress_am_single; + req->send.proto.comp_cb = ucp_request_put; + req->send.proto.status = UCS_OK; + + return req; +} + +#endif diff --git a/src/ucp/proto/proto_select.h b/src/ucp/proto/proto_select.h new file mode 100644 index 00000000000..65ab792248f --- /dev/null +++ b/src/ucp/proto/proto_select.h @@ -0,0 +1,62 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCP_PROTO_SELECT_H_ +#define UCP_PROTO_SELECT_H_ + +#include "proto.h" + +#include + + +/** + * Some flags from ucp_request_param_t.op_attr_mask can affect protocol + * selection decision. + */ +#define UCP_PROTO_SELECT_OP_ATTR_BASE UCP_OP_ATTR_FLAG_NO_IMM_CMPL +#define UCP_PROTO_SELECT_OP_ATTR_MASK UCP_OP_ATTR_FLAG_FAST_CMPL + + +/** + * Entry which defines which protocol should be used for a message size range. 
+ */ +typedef struct { + ucp_proto_config_t proto_config; /* Protocol configuration to use */ + size_t max_msg_length; /* Max message length, inclusive */ +} ucp_proto_threshold_elem_t; + + +/** + * Protocol selection per a particular buffer type and operation + */ +typedef struct { + ucp_proto_threshold_elem_t *thresholds; /* Array of which protocol to use + for different message sizes */ + void *priv_buf; /* Private configuration area for + the selected protocols */ +} ucp_proto_select_elem_t; + + +/* Hash type of mapping a buffer-type (key) to a protocol selection */ +KHASH_TYPE(ucp_proto_select_hash, khint64_t, ucp_proto_select_elem_t) + + +/** + * Top-level data structure to select protocols for various buffer types + */ +typedef struct { + /* Lookup from protocol selection key to thresholds array */ + khash_t(ucp_proto_select_hash) hash; + + /* cache the last used protocol, for fast lookup */ + struct { + uint64_t key; + ucp_proto_select_elem_t *value; + } cache; +} ucp_proto_select_t; + + +#endif diff --git a/src/ucp/rma/amo_basic.c b/src/ucp/rma/amo_basic.c index e48030ca441..339af45a4ca 100644 --- a/src/ucp/rma/amo_basic.c +++ b/src/ucp/rma/amo_basic.c @@ -5,6 +5,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rma.h" #include "rma.inl" @@ -84,7 +88,7 @@ static ucs_status_t ucp_amo_basic_progress_fetch(uct_pending_req_t *self) &req->send.state.uct_comp); } else { status = uct_ep_atomic_cswap32(ep->uct_eps[req->send.lane], - value, *result, remote_addr, + value, *(uint32_t*)result, remote_addr, rkey->cache.amo_rkey, (uint32_t*)result, &req->send.state.uct_comp); } diff --git a/src/ucp/rma/amo_send.c b/src/ucp/rma/amo_send.c index 744b3064518..e2e21d84042 100644 --- a/src/ucp/rma/amo_send.c +++ b/src/ucp/rma/amo_send.c @@ -5,6 +5,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rma.h" #include "rma.inl" @@ -12,6 +16,8 @@ #include #include #include +#include + #include @@ -44,6 +50,21 @@ } +#define UCP_AMO_CHECK_PARAM_NBX(_context, _remote_addr, _size, _count, \ + _opcode, _last_opcode, _action) \ + { \ + if (ENABLE_PARAMS_CHECK) { \ + if ((_count) != 1) { \ + ucs_error("unsupported number of elements: %zu", (_count)); \ + _action; \ + } \ + } \ + \ + UCP_AMO_CHECK_PARAM(_context, _remote_addr, _size, _opcode, \ + _last_opcode, _action); \ + } + + static uct_atomic_op_t ucp_uct_op_table[] = { [UCP_ATOMIC_POST_OP_ADD] = UCT_ATOMIC_OP_ADD, [UCP_ATOMIC_POST_OP_AND] = UCT_ATOMIC_OP_AND, @@ -51,13 +72,13 @@ static uct_atomic_op_t ucp_uct_op_table[] = { [UCP_ATOMIC_POST_OP_XOR] = UCT_ATOMIC_OP_XOR }; -static uct_atomic_op_t ucp_uct_fop_table[] = { - [UCP_ATOMIC_FETCH_OP_FADD] = UCT_ATOMIC_OP_ADD, - [UCP_ATOMIC_FETCH_OP_FAND] = UCT_ATOMIC_OP_AND, - [UCP_ATOMIC_FETCH_OP_FOR] = UCT_ATOMIC_OP_OR, - [UCP_ATOMIC_FETCH_OP_FXOR] = UCT_ATOMIC_OP_XOR, - [UCP_ATOMIC_FETCH_OP_SWAP] = UCT_ATOMIC_OP_SWAP, - [UCP_ATOMIC_FETCH_OP_CSWAP] = UCT_ATOMIC_OP_CSWAP, +static uct_atomic_op_t ucp_uct_atomic_op_table[] = { + [UCP_ATOMIC_OP_ADD] = UCT_ATOMIC_OP_ADD, + [UCP_ATOMIC_OP_AND] = UCT_ATOMIC_OP_AND, + [UCP_ATOMIC_OP_OR] = UCT_ATOMIC_OP_OR, + [UCP_ATOMIC_OP_XOR] = UCT_ATOMIC_OP_XOR, + [UCP_ATOMIC_OP_SWAP] = UCT_ATOMIC_OP_SWAP, + [UCP_ATOMIC_OP_CSWAP] = UCT_ATOMIC_OP_CSWAP }; @@ -66,7 +87,7 @@ static void ucp_amo_completed_single(uct_completion_t *self, { ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.state.uct_comp); - ucs_trace("invoking completion on AMO request %p", req); + ucp_trace_req(req, "invoking completion"); ucp_request_complete_send(req, status); } @@ -82,7 +103,7 @@ ucp_amo_init_common(ucp_request_t *req, ucp_ep_h ep, uct_atomic_op_t op, req->send.amo.remote_addr = remote_addr; req->send.amo.rkey = rkey; req->send.amo.value = value; -#if ENABLE_ASSERT +#if 
UCS_ENABLE_ASSERT req->send.lane = UCP_NULL_LANE; #endif } @@ -112,37 +133,88 @@ ucs_status_ptr_t ucp_atomic_fetch_nb(ucp_ep_h ep, ucp_atomic_fetch_op_t opcode, uint64_t value, void *result, size_t op_size, uint64_t remote_addr, ucp_rkey_h rkey, ucp_send_callback_t cb) +{ + ucp_request_param_t param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_DATATYPE | + UCP_OP_ATTR_FIELD_REPLY_BUFFER, + .datatype = ucp_dt_make_contig(op_size), + .cb.send = (ucp_send_nbx_callback_t)cb, + .reply_buffer = result + }; + + /* Note: opcode transition from ucp_atomic_fetch_op_t to ucp_atomic_op_t */ + return ucp_atomic_op_nbx(ep, (ucp_atomic_op_t)opcode, &value, 1, + remote_addr, rkey, ¶m); +} + +UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_atomic_op_nbx, + (ep, opcode, buffer, count, remote_addr, rkey, param), + ucp_ep_h ep, ucp_atomic_op_t opcode, const void *buffer, + size_t count, uint64_t remote_addr, ucp_rkey_h rkey, + const ucp_request_param_t *param) { ucs_status_ptr_t status_p; ucs_status_t status; ucp_request_t *req; + uint64_t value; + size_t op_size; - UCP_AMO_CHECK_PARAM(ep->worker->context, remote_addr, op_size, opcode, - UCP_ATOMIC_FETCH_OP_LAST, - return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); + if (ucs_unlikely(!(param->op_attr_mask & UCP_OP_ATTR_FIELD_DATATYPE))) { + ucs_error("missing atomic operation datatype"); + return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); + } + + if (param->datatype == ucp_dt_make_contig(8)) { + value = *(uint64_t*)buffer; + op_size = sizeof(uint64_t); + } else if (param->datatype == ucp_dt_make_contig(4)) { + value = *(uint32_t*)buffer; + op_size = sizeof(uint32_t); + } else { + ucs_error("invalid atomic operation datatype: %zu", param->datatype); + return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); + } + + UCP_AMO_CHECK_PARAM_NBX(ep->worker->context, remote_addr, op_size, + count, opcode, UCP_ATOMIC_OP_LAST, + return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - 
ucs_trace_req("atomic_fetch_nb opcode %d value %"PRIu64" buffer %p size %zu" - " remote_addr %"PRIx64" rkey %p to %s cb %p", - opcode, value, result, op_size, remote_addr, rkey, - ucp_ep_peer_name(ep), cb); + ucs_trace_req("atomic_op_nbx opcode %d buffer %p result %p " + "datatype %zu remote_addr %"PRIx64" rkey %p to %s cb %p", + opcode, buffer, + (param->op_attr_mask & UCP_OP_ATTR_FIELD_REPLY_BUFFER) ? + param->reply_buffer : NULL, param->datatype, + remote_addr, rkey, ucp_ep_peer_name(ep), + (param->op_attr_mask & UCP_OP_ATTR_FIELD_CALLBACK) ? + param->cb.send : NULL); status = UCP_RKEY_RESOLVE(rkey, ep, amo); if (status != UCS_OK) { - status_p = UCS_STATUS_PTR(UCS_ERR_UNREACHABLE); - goto out; - } - - req = ucp_request_get(ep->worker); - if (ucs_unlikely(NULL == req)) { - status_p = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + status_p = UCS_STATUS_PTR(status); goto out; } - ucp_amo_init_fetch(req, ep, result, ucp_uct_fop_table[opcode], op_size, - remote_addr, rkey, value, rkey->cache.amo_proto); + req = ucp_request_get_param(ep->worker, param, + {status_p = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + goto out;}); - status_p = ucp_rma_send_request_cb(req, cb); + if (param->op_attr_mask & UCP_OP_ATTR_FIELD_REPLY_BUFFER) { + ucp_amo_init_fetch(req, ep, param->reply_buffer, + ucp_uct_atomic_op_table[opcode], op_size, + remote_addr, rkey, value, rkey->cache.amo_proto); + status_p = ucp_rma_send_request(req, param); + } else { + ucp_amo_init_post(req, ep, ucp_uct_atomic_op_table[opcode], op_size, + remote_addr, rkey, value, rkey->cache.amo_proto); + + status_p = ucp_rma_send_request(req, param); + if (UCS_PTR_IS_PTR(status_p)) { + ucp_request_release(status_p); + } + status_p = UCS_STATUS_PTR(UCS_OK); + } out: UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); @@ -200,7 +272,7 @@ ucp_atomic_fetch_b(ucp_ep_h ep, ucp_atomic_fetch_op_t opcode, uint64_t value, void *request; request = ucp_atomic_fetch_nb(ep, opcode, value, result, size, remote_addr, - rkey, (void*)ucs_empty_function); 
+ rkey, (ucp_send_callback_t)ucs_empty_function); return ucp_rma_wait(ep->worker, request, op_name); } diff --git a/src/ucp/rma/amo_sw.c b/src/ucp/rma/amo_sw.c index 1a482a72832..0d7ce00b6ae 100644 --- a/src/ucp/rma/amo_sw.c +++ b/src/ucp/rma/amo_sw.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rma.h" #include "rma.inl" @@ -30,7 +34,7 @@ static size_t ucp_amo_sw_pack(void *dest, void *arg, uint8_t fetch) if (req->send.amo.uct_op == UCT_ATOMIC_OP_CSWAP) { /* compare-swap has two arguments */ - memcpy((void*)(atomich + 1) + size, req->send.buffer, size); + memcpy(UCS_PTR_BYTE_OFFSET(atomich + 1, size), req->send.buffer, size); length += size; } @@ -197,8 +201,21 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_atomic_req_handler, (arg, data, length, am_fl ucp_worker_h worker = arg; ucp_ep_h ep = ucp_worker_get_ep_by_ptr(worker, atomicreqh->req.ep_ptr); + ucp_rsc_index_t amo_rsc_idx = ucs_ffs64_safe(worker->atomic_tls); ucp_request_t *req; + if (ucs_unlikely((amo_rsc_idx != UCP_MAX_RESOURCES) && + (ucp_worker_iface_get_attr(worker, + amo_rsc_idx)->cap.flags & + UCT_IFACE_FLAG_ATOMIC_DEVICE))) { + ucs_error("Unsupported: got software atomic request while device atomics are selected on worker %p", + worker); + /* TODO: this situation will be possible then CM wireup is implemented + * and CM lane is bound to suboptimal device, then need to execute + * AMO on fastest resource from worker->atomic_tls using loopback + * EP and continue SW AMO protocol */ + } + if (atomicreqh->req.reqptr == 0) { /* atomic operation without result */ switch (atomicreqh->length) { @@ -283,7 +300,8 @@ static void ucp_amo_sw_dump_packet(ucp_worker_h worker, uct_am_trace_type_t type } p = buffer + strlen(buffer); - ucp_dump_payload(worker->context, p, buffer + max - p, data + header_len, + ucp_dump_payload(worker->context, p, buffer + max - p, + UCS_PTR_BYTE_OFFSET(data, header_len), length - header_len); } diff --git 
a/src/ucp/rma/flush.c b/src/ucp/rma/flush.c index b99573f3524..d2da3d5400b 100644 --- a/src/ucp/rma/flush.c +++ b/src/ucp/rma/flush.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include #include @@ -28,22 +32,48 @@ static int ucp_ep_flush_is_completed(ucp_request_t *req) static void ucp_ep_flush_progress(ucp_request_t *req) { - ucp_ep_h ep = req->send.ep; + ucp_ep_h ep = req->send.ep; + unsigned num_lanes = ucp_ep_num_lanes(ep); + ucp_lane_map_t all_lanes = UCS_MASK(num_lanes); ucp_ep_flush_state_t *flush_state; ucp_lane_index_t lane; ucs_status_t status; uct_ep_h uct_ep; + int diff; + + /* If the number of lanes changed since flush operation was submitted, adjust + * the number of expected completions */ + if (ucs_unlikely(req->send.flush.num_lanes != num_lanes)) { + ucp_trace_req(req, "ep %p: number of lanes changed from %d to %d", + ep, req->send.flush.num_lanes, num_lanes); + diff = num_lanes - req->send.flush.num_lanes; + req->send.flush.num_lanes = num_lanes; + if (diff >= 0) { + ucp_trace_req(req, + "ep %p: adjusting expected flush completion count by %d", + ep, diff); + req->send.state.uct_comp.count += diff; + } else { + /* If we have less lanes, it means we are in error flow and + * ucp_worker_set_ep_failed() was completed, so we should have + * completed the flush on all lanes. 
+ */ + ucs_assertv(req->send.state.uct_comp.count == 0, + "uct_comp.count=%d num_lanes=%d", + req->send.state.uct_comp.count, num_lanes); + } + } - ucs_trace("ep %p: progress flush req %p, lanes 0x%x count %d", ep, req, - req->send.flush.lanes, req->send.state.uct_comp.count); + ucs_trace("ep %p: progress flush req %p, started_lanes 0x%x count %d", ep, + req, req->send.flush.started_lanes, req->send.state.uct_comp.count); - while (req->send.flush.lanes) { + while (req->send.flush.started_lanes < all_lanes) { /* Search for next lane to start flush */ - lane = ucs_ffs64(req->send.flush.lanes); + lane = ucs_ffs64(all_lanes & ~req->send.flush.started_lanes); uct_ep = ep->uct_eps[lane]; if (uct_ep == NULL) { - req->send.flush.lanes &= ~UCS_BIT(lane); + req->send.flush.started_lanes |= UCS_BIT(lane); --req->send.state.uct_comp.count; continue; } @@ -55,15 +85,16 @@ static void ucp_ep_flush_progress(ucp_request_t *req) } status = uct_ep_flush(uct_ep, req->send.flush.uct_flags, &req->send.state.uct_comp); - ucs_trace("flushing ep %p lane[%d]: %s", ep, lane, - ucs_status_string(status)); + ucp_trace_req(req, "ep %p flush lane[%d]=%p flags 0x%x: %s", + ep, lane, uct_ep, req->send.flush.uct_flags, + ucs_status_string(status)); if (status == UCS_OK) { - req->send.flush.lanes &= ~UCS_BIT(lane); + req->send.flush.started_lanes |= UCS_BIT(lane); --req->send.state.uct_comp.count; ucs_trace("ep %p: flush comp %p count reduced to %d", ep, &req->send.state.uct_comp, req->send.state.uct_comp.count); } else if (status == UCS_INPROGRESS) { - req->send.flush.lanes &= ~UCS_BIT(lane); + req->send.flush.started_lanes |= UCS_BIT(lane); } else if (status == UCS_ERR_NO_RESOURCE) { if (req->send.lane != UCP_NULL_LANE) { ucs_trace("ep %p: not adding pending flush %p on lane %d, " @@ -76,8 +107,8 @@ static void ucp_ep_flush_progress(ucp_request_t *req) ucs_trace("adding pending flush on ep %p lane[%d]: %s", ep, lane, ucs_status_string(status)); if (status == UCS_OK) { - req->send.lane = lane; - 
req->send.flush.lanes &= ~UCS_BIT(lane); + req->send.lane = lane; + req->send.flush.started_lanes |= UCS_BIT(lane); } else if (status != UCS_ERR_BUSY) { ucp_ep_flush_error(req, status); break; @@ -173,8 +204,10 @@ static ucs_status_t ucp_ep_flush_progress_pending(uct_pending_req_t *self) ucp_ep_flush_progress(req); completed = ucp_flush_check_completion(req); - /* If the operation has not completed, add slow-path progress to resume */ - if (!completed && req->send.flush.lanes) { + /* If the operation has not completed, and not started on all lanes, add + * slow-path progress to resume */ + if (!completed && + (req->send.flush.started_lanes != UCS_MASK(ucp_ep_num_lanes(ep)))) { ucs_trace("ep %p: adding slow-path callback to resume flush", ep); uct_worker_progress_register_safe(ep->worker->uct, ucp_ep_flush_resume_slow_path_callback, @@ -195,7 +228,7 @@ static ucs_status_t ucp_ep_flush_progress_pending(uct_pending_req_t *self) } } -static void ucp_ep_flush_completion(uct_completion_t *self, ucs_status_t status) +void ucp_ep_flush_completion(uct_completion_t *self, ucs_status_t status) { ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.state.uct_comp); @@ -219,6 +252,29 @@ static void ucp_ep_flush_completion(uct_completion_t *self, ucs_status_t status) ucp_flush_check_completion(req); } +void ucp_ep_flush_request_ff(ucp_request_t *req, ucs_status_t status) +{ + /* Calculate how many completions to emulate: 1 for every lane we did not + * start to flush yet, plus one for the lane from which we just removed + * this request from its pending queue + */ + int num_comps = req->send.flush.num_lanes - + ucs_popcount(req->send.flush.started_lanes) + + 1; + + ucp_trace_req(req, "fast-forward flush, comp-=%d num_lanes %d started 0x%x", + num_comps, req->send.flush.num_lanes, + req->send.flush.started_lanes); + + req->send.flush.started_lanes = UCS_MASK(req->send.flush.num_lanes); + + ucs_assert(req->send.state.uct_comp.count >= num_comps); + 
req->send.state.uct_comp.count -= num_comps; + if (req->send.state.uct_comp.count == 0) { + req->send.state.uct_comp.func(&req->send.state.uct_comp, status); + } +} + void ucp_ep_flush_remote_completed(ucp_request_t *req) { ucs_trace_req("flush remote ops completed req=%p", req); @@ -230,8 +286,8 @@ void ucp_ep_flush_remote_completed(ucp_request_t *req) } ucs_status_ptr_t ucp_ep_flush_internal(ucp_ep_h ep, unsigned uct_flags, - ucp_send_callback_t req_cb, unsigned req_flags, + const ucp_request_param_t *param, ucp_request_t *worker_req, ucp_request_callback_t flushed_cb, const char *debug_name) @@ -245,43 +301,42 @@ ucs_status_ptr_t ucp_ep_flush_internal(ucp_ep_h ep, unsigned uct_flags, return NULL; } - req = ucp_request_get(ep->worker); - if (req == NULL) { - return UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); - } + req = ucp_request_get_param(ep->worker, param, + {return UCS_STATUS_PTR(UCS_ERR_NO_MEMORY);}); /* - * Flush operation can be queued on the pending queue of only one of the + * Flush operation can be queued on the pending queue of only one of the * lanes (indicated by req->send.lane) and scheduled for completion on any * number of lanes. req->send.uct_comp.count keeps track of how many lanes * are not flushed yet, and when it reaches zero, it means all lanes are * flushed. req->send.flush.lanes keeps track of which lanes we still have * to start flush on. 
- */ - req->flags = req_flags; - req->status = UCS_OK; - req->send.ep = ep; - req->send.cb = req_cb; - req->send.flush.flushed_cb = flushed_cb; - req->send.flush.lanes = UCS_MASK(ucp_ep_num_lanes(ep)); - req->send.flush.prog_id = UCS_CALLBACKQ_ID_NULL; - req->send.flush.uct_flags = uct_flags; - req->send.flush.worker_req = worker_req; - req->send.flush.sw_started = 0; - req->send.flush.sw_done = 0; - - req->send.lane = UCP_NULL_LANE; - req->send.uct.func = ucp_ep_flush_progress_pending; - req->send.state.uct_comp.func = ucp_ep_flush_completion; - req->send.state.uct_comp.count = ucp_ep_num_lanes(ep); - + */ + req->flags = req_flags; + req->status = UCS_OK; + req->send.ep = ep; + req->send.flush.flushed_cb = flushed_cb; + req->send.flush.prog_id = UCS_CALLBACKQ_ID_NULL; + req->send.flush.uct_flags = uct_flags; + req->send.flush.worker_req = worker_req; + req->send.flush.sw_started = 0; + req->send.flush.sw_done = 0; + req->send.flush.num_lanes = ucp_ep_num_lanes(ep);; + req->send.flush.started_lanes = 0; + + req->send.lane = UCP_NULL_LANE; + req->send.uct.func = ucp_ep_flush_progress_pending; + req->send.state.uct_comp.func = ucp_ep_flush_completion; + req->send.state.uct_comp.count = ucp_ep_num_lanes(ep); + + ucp_request_set_send_callback_param(param, req, send); ucp_ep_flush_progress(req); if (ucp_ep_flush_is_completed(req)) { status = req->status; ucs_trace_req("ep %p: releasing flush request %p, returning status %s", ep, req, ucs_status_string(status)); - ucp_request_put(req); + ucp_request_put_param(param, req) return UCS_STATUS_PTR(status); } @@ -297,14 +352,25 @@ static void ucp_ep_flushed_callback(ucp_request_t *req) UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_ep_flush_nb, (ep, flags, cb), ucp_ep_h ep, unsigned flags, ucp_send_callback_t cb) +{ + ucp_request_param_t param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK, + .cb.send = (ucp_send_nbx_callback_t)cb + }; + + return ucp_ep_flush_nbx(ep, ¶m); +} + +UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_ep_flush_nbx, 
(ep, param), + ucp_ep_h ep, const ucp_request_param_t *param) { void *request; UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - request = ucp_ep_flush_internal(ep, UCT_FLUSH_FLAG_LOCAL, cb, - UCP_REQUEST_FLAG_CALLBACK, NULL, - ucp_ep_flushed_callback, "flush_nb"); + request = ucp_ep_flush_internal(ep, UCT_FLUSH_FLAG_LOCAL, 0, param, + NULL, ucp_ep_flushed_callback, + "flush_nbx"); UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); @@ -322,7 +388,7 @@ static ucs_status_t ucp_worker_flush_check(ucp_worker_h worker) } for (iface_id = 0; iface_id < worker->num_ifaces; ++iface_id) { - wiface = &worker->ifaces[iface_id]; + wiface = worker->ifaces[iface_id]; if (wiface->iface == NULL) { continue; } @@ -358,7 +424,7 @@ static void ucp_worker_flush_complete_one(ucp_request_t *req, ucs_status_t statu if (complete) { ucs_assert(status != UCS_INPROGRESS); - ucp_request_complete(req, flush_worker.cb, status); + ucp_request_complete(req, flush_worker.cb, status, req->user_data); } } @@ -395,8 +461,9 @@ static unsigned ucp_worker_flush_progress(void *arg) req->flush_worker.next_ep = ucs_list_next(&next_ep->ep_list, ucp_ep_ext_gen_t, ep_list); - ep_flush_request = ucp_ep_flush_internal(ep, UCT_FLUSH_FLAG_LOCAL, NULL, - UCP_REQUEST_FLAG_RELEASED, req, + ep_flush_request = ucp_ep_flush_internal(ep, UCT_FLUSH_FLAG_LOCAL, + UCP_REQUEST_FLAG_RELEASED, + &ucp_request_null_param, req, ucp_worker_flush_ep_flushed_cb, "flush_worker"); if (UCS_PTR_IS_ERR(ep_flush_request)) { @@ -412,9 +479,9 @@ static unsigned ucp_worker_flush_progress(void *arg) return 0; } -static ucs_status_ptr_t ucp_worker_flush_nb_internal(ucp_worker_h worker, - ucp_send_callback_t cb, - unsigned req_flags) +static ucs_status_ptr_t +ucp_worker_flush_nbx_internal(ucp_worker_h worker, + const ucp_request_param_t *param) { ucs_status_t status; ucp_request_t *req; @@ -424,21 +491,19 @@ static ucs_status_ptr_t ucp_worker_flush_nb_internal(ucp_worker_h worker, return UCS_STATUS_PTR(status); } - req = 
ucp_request_get(worker); - if (req == NULL) { - return UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); - } + req = ucp_request_get_param(worker, param, + {return UCS_STATUS_PTR(UCS_ERR_NO_MEMORY);}); - req->flags = req_flags; + req->flags = 0; req->status = UCS_OK; req->flush_worker.worker = worker; - req->flush_worker.cb = cb; req->flush_worker.comp_count = 1; /* counting starts from 1, and decremented when finished going over all endpoints */ req->flush_worker.prog_id = UCS_CALLBACKQ_ID_NULL; req->flush_worker.next_ep = ucs_list_head(&worker->all_eps, ucp_ep_ext_gen_t, ep_list); + ucp_request_set_send_callback_param(param, req, flush_worker); uct_worker_progress_register_safe(worker->uct, ucp_worker_flush_progress, req, 0, &req->flush_worker.prog_id); return req + 1; @@ -446,13 +511,23 @@ static ucs_status_ptr_t ucp_worker_flush_nb_internal(ucp_worker_h worker, UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_worker_flush_nb, (worker, flags, cb), ucp_worker_h worker, unsigned flags, ucp_send_callback_t cb) +{ + ucp_request_param_t param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK, + .cb.send = (ucp_send_nbx_callback_t)cb + }; + + return ucp_worker_flush_nbx(worker, ¶m); +} + +UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_worker_flush_nbx, (worker, param), + ucp_worker_h worker, const ucp_request_param_t *param) { void *request; UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); - request = ucp_worker_flush_nb_internal(worker, cb, - UCP_REQUEST_FLAG_CALLBACK); + request = ucp_worker_flush_nbx_internal(worker, param); UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); @@ -471,8 +546,8 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_worker_flush, (worker), ucp_worker_h worker) UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); - request = ucp_worker_flush_nb_internal(worker, NULL, 0); - status = ucp_flush_wait(worker, request); + request = ucp_worker_flush_nbx_internal(worker, &ucp_request_null_param); + status = ucp_flush_wait(worker, request); UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); @@ -486,7 +561,8 
@@ UCS_PROFILE_FUNC(ucs_status_t, ucp_ep_flush, (ep), ucp_ep_h ep) UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - request = ucp_ep_flush_internal(ep, UCT_FLUSH_FLAG_LOCAL, NULL, 0, NULL, + request = ucp_ep_flush_internal(ep, UCT_FLUSH_FLAG_LOCAL, 0, + &ucp_request_null_param, NULL, ucp_ep_flushed_callback, "flush"); status = ucp_flush_wait(ep->worker, request); diff --git a/src/ucp/rma/rma.h b/src/ucp/rma/rma.h index 84b9608b304..073f882957b 100644 --- a/src/ucp/rma/rma.h +++ b/src/ucp/rma/rma.h @@ -7,7 +7,9 @@ #ifndef UCP_RMA_H_ #define UCP_RMA_H_ -#include +#include +#include +#include /** @@ -64,7 +66,7 @@ typedef struct { typedef struct { uint64_t address; - ucp_request_hdr_t req; // NULL if no reply + ucp_request_hdr_t req; /* NULL if no reply */ uint8_t length; uint8_t opcode; } UCS_S_PACKED ucp_atomic_req_hdr_t; diff --git a/src/ucp/rma/rma.inl b/src/ucp/rma/rma.inl index 772cf508358..3f6ac9d72e8 100644 --- a/src/ucp/rma/rma.inl +++ b/src/ucp/rma/rma.inl @@ -14,6 +14,7 @@ #include +/* TODO: remove it after AMO API is implemented via NBX */ static UCS_F_ALWAYS_INLINE ucs_status_ptr_t ucp_rma_send_request_cb(ucp_request_t *req, ucp_send_callback_t cb) { @@ -22,13 +23,30 @@ ucp_rma_send_request_cb(ucp_request_t *req, ucp_send_callback_t cb) if (req->flags & UCP_REQUEST_FLAG_COMPLETED) { ucs_trace_req("releasing send request %p, returning status %s", req, ucs_status_string(status)); - ucs_mpool_put(req); + ucp_request_put(req); return UCS_STATUS_PTR(status); } ucs_trace_req("returning request %p, status %s", req, ucs_status_string(status)); - ucp_request_set_callback(req, send.cb, cb); + ucp_request_set_callback(req, send.cb, (ucp_send_nbx_callback_t)cb, NULL); + return req + 1; +} + +static UCS_F_ALWAYS_INLINE ucs_status_ptr_t +ucp_rma_send_request(ucp_request_t *req, const ucp_request_param_t *param) +{ + ucs_status_t status = ucp_request_send(req, 0); + + if (req->flags & UCP_REQUEST_FLAG_COMPLETED) { + ucp_request_imm_cmpl_param(param, req, status, 
send); + } + + ucs_trace_req("returning request %p, status %s", req, + ucs_status_string(status)); + + ucp_request_set_send_callback_param(param, req, send); + return req + 1; } @@ -71,7 +89,8 @@ static inline void ucp_ep_rma_remote_request_completed(ucp_ep_t *ep) ucs_queue_for_each_extract(req, &flush_state->reqs, send.flush.queue, UCS_CIRCULAR_COMPARE32(req->send.flush.cmpl_sn, - <= ,flush_state->cmpl_sn)) { + <= , + flush_state->cmpl_sn)) { ucp_ep_flush_remote_completed(req); } } diff --git a/src/ucp/rma/rma_basic.c b/src/ucp/rma/rma_basic.c index e101d23229e..cc1ac441d9a 100644 --- a/src/ucp/rma/rma_basic.c +++ b/src/ucp/rma/rma_basic.c @@ -6,6 +6,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rma.h" #include diff --git a/src/ucp/rma/rma_send.c b/src/ucp/rma/rma_send.c index 7c993d8e924..bd46828c89b 100644 --- a/src/ucp/rma/rma_send.c +++ b/src/ucp/rma/rma_send.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rma.h" #include "rma.inl" @@ -11,6 +15,7 @@ #include #include +#include #define UCP_RMA_CHECK_BUFFER(_buffer, _action) \ @@ -48,6 +53,14 @@ } while (0) +#define UCP_RMA_CHECK_CONTIG1(_param) \ + if (ucs_unlikely(ENABLE_PARAMS_CHECK && \ + ((_param)->op_attr_mask & UCP_OP_ATTR_FIELD_DATATYPE) && \ + ((_param)->datatype != ucp_dt_make_contig(1)))) { \ + return UCS_STATUS_PTR(UCS_ERR_UNSUPPORTED); \ + } + + /* request can be released if * - all fragments were sent (length == 0) (bcopy & zcopy mix) * - all zcopy fragments are done (uct_comp.count == 0) @@ -85,7 +98,7 @@ ucs_status_t ucp_rma_request_advance(ucp_request_t *req, ssize_t frag_length, } return UCS_OK; } - req->send.buffer += frag_length; + req->send.buffer = UCS_PTR_BYTE_OFFSET(req->send.buffer, frag_length); req->send.rma.remote_addr += frag_length; return UCS_INPROGRESS; } @@ -122,7 +135,7 @@ ucp_rma_request_init(ucp_request_t *req, ucp_ep_h ep, const void *buffer, 
req->send.ep = ep; req->send.buffer = (void*)buffer; req->send.datatype = ucp_dt_make_contig(1); - req->send.mem_type = UCT_MD_MEM_TYPE_HOST; + req->send.mem_type = UCS_MEMORY_TYPE_HOST; req->send.length = length; req->send.rma.remote_addr = remote_addr; req->send.rma.rkey = rkey; @@ -134,52 +147,27 @@ ucp_rma_request_init(ucp_request_t *req, ucp_ep_h ep, const void *buffer, ucp_rma_request_bcopy_completion : ucp_rma_request_zcopy_completion, UCP_REQUEST_SEND_PROTO_RMA); -#if ENABLE_ASSERT +#if UCS_ENABLE_ASSERT req->send.cb = NULL; #endif if (length < zcopy_thresh) { return UCS_OK; } - return ucp_request_send_buffer_reg_lane(req, req->send.lane); + return ucp_request_send_buffer_reg_lane(req, req->send.lane, 0); } -static UCS_F_ALWAYS_INLINE ucs_status_t +static UCS_F_ALWAYS_INLINE ucs_status_ptr_t ucp_rma_nonblocking(ucp_ep_h ep, const void *buffer, size_t length, uint64_t remote_addr, ucp_rkey_h rkey, - uct_pending_callback_t progress_cb, size_t zcopy_thresh) + uct_pending_callback_t progress_cb, size_t zcopy_thresh, + const ucp_request_param_t *param) { ucs_status_t status; ucp_request_t *req; - req = ucp_request_get(ep->worker); - if (req == NULL) { - return UCS_ERR_NO_MEMORY; - } - - status = ucp_rma_request_init(req, ep, buffer, length, remote_addr, rkey, - progress_cb, zcopy_thresh, - UCP_REQUEST_FLAG_RELEASED); - if (ucs_unlikely(status != UCS_OK)) { - return status; - } - - return ucp_request_send(req, 0); -} - -static UCS_F_ALWAYS_INLINE ucs_status_ptr_t -ucp_rma_nonblocking_cb(ucp_ep_h ep, const void *buffer, size_t length, - uint64_t remote_addr, ucp_rkey_h rkey, - uct_pending_callback_t progress_cb, size_t zcopy_thresh, - ucp_send_callback_t cb) -{ - ucs_status_t status; - ucp_request_t *req; - - req = ucp_request_get(ep->worker); - if (req == NULL) { - return UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); - } + req = ucp_request_get_param(ep->worker, param, + {return UCS_STATUS_PTR(UCS_ERR_NO_MEMORY);}); status = ucp_rma_request_init(req, ep, buffer, length, 
remote_addr, rkey, progress_cb, zcopy_thresh, 0); @@ -187,58 +175,53 @@ ucp_rma_nonblocking_cb(ucp_ep_h ep, const void *buffer, size_t length, return UCS_STATUS_PTR(status); } - return ucp_rma_send_request_cb(req, cb); + return ucp_rma_send_request(req, param); } ucs_status_t ucp_put_nbi(ucp_ep_h ep, const void *buffer, size_t length, uint64_t remote_addr, ucp_rkey_h rkey) { - ucp_ep_rma_config_t *rma_config; - ucs_status_t status; - - UCP_RMA_CHECK(ep->worker->context, buffer, length); - - UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - - ucs_trace_req("put_nbi buffer %p length %zu remote_addr %"PRIx64" rkey %p to %s", - buffer, length, remote_addr, rkey, ucp_ep_peer_name(ep)); - - status = UCP_RKEY_RESOLVE(rkey, ep, rma); - if (status != UCS_OK) { - goto out_unlock; - } + ucs_status_ptr_t status_ptr; - /* Fast path for a single short message */ - if (ucs_likely((ssize_t)length <= (int)rkey->cache.max_put_short)) { - status = UCS_PROFILE_CALL(uct_ep_put_short, ep->uct_eps[rkey->cache.rma_lane], - buffer, length, remote_addr, rkey->cache.rma_rkey); - if (ucs_likely(status != UCS_ERR_NO_RESOURCE)) { - goto out_unlock; - } + status_ptr = ucp_put_nbx(ep, buffer, length, remote_addr, rkey, + &ucp_request_null_param); + if (UCS_PTR_IS_PTR(status_ptr)) { + ucp_request_free(status_ptr); + return UCS_INPROGRESS; } - rma_config = &ucp_ep_config(ep)->rma[rkey->cache.rma_lane]; - status = ucp_rma_nonblocking(ep, buffer, length, remote_addr, rkey, - rkey->cache.rma_proto->progress_put, - rma_config->put_zcopy_thresh); -out_unlock: - UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); - return status; + /* coverity[overflow] */ + return UCS_PTR_STATUS(status_ptr); } ucs_status_ptr_t ucp_put_nb(ucp_ep_h ep, const void *buffer, size_t length, uint64_t remote_addr, ucp_rkey_h rkey, ucp_send_callback_t cb) +{ + ucp_request_param_t param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK, + .cb.send = (ucp_send_nbx_callback_t)cb + }; + + return ucp_put_nbx(ep, buffer, length, 
remote_addr, rkey, ¶m); +} + +ucs_status_ptr_t ucp_put_nbx(ucp_ep_h ep, const void *buffer, size_t count, + uint64_t remote_addr, ucp_rkey_h rkey, + const ucp_request_param_t *param) { ucp_ep_rma_config_t *rma_config; ucs_status_ptr_t ptr_status; ucs_status_t status; - UCP_RMA_CHECK_PTR(ep->worker->context, buffer, length); + UCP_RMA_CHECK_CONTIG1(param); + UCP_RMA_CHECK_PTR(ep->worker->context, buffer, count); UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - ucs_trace_req("put_nb buffer %p length %zu remote_addr %"PRIx64" rkey %p to %s cb %p", - buffer, length, remote_addr, rkey, ucp_ep_peer_name(ep), cb); + ucs_trace_req("put_nbx buffer %p count %zu remote_addr %"PRIx64" rkey %p to %s cb %p", + buffer, count, remote_addr, rkey, ucp_ep_peer_name(ep), + (param->op_attr_mask & UCP_OP_ATTR_FIELD_CALLBACK) ? + param->cb.send : NULL); status = UCP_RKEY_RESOLVE(rkey, ep, rma); if (status != UCS_OK) { @@ -247,19 +230,25 @@ ucs_status_ptr_t ucp_put_nb(ucp_ep_h ep, const void *buffer, size_t length, } /* Fast path for a single short message */ - if (ucs_likely((ssize_t)length <= (int)rkey->cache.max_put_short)) { + if (ucs_likely(!(param->op_attr_mask & UCP_OP_ATTR_FLAG_NO_IMM_CMPL) && + ((ssize_t)count <= rkey->cache.max_put_short))) { status = UCS_PROFILE_CALL(uct_ep_put_short, ep->uct_eps[rkey->cache.rma_lane], - buffer, length, remote_addr, rkey->cache.rma_rkey); + buffer, count, remote_addr, rkey->cache.rma_rkey); if (ucs_likely(status != UCS_ERR_NO_RESOURCE)) { ptr_status = UCS_STATUS_PTR(status); goto out_unlock; } } + if (ucs_unlikely(param->op_attr_mask & UCP_OP_ATTR_FLAG_FORCE_IMM_CMPL)) { + ptr_status = UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE); + goto out_unlock; + } + rma_config = &ucp_ep_config(ep)->rma[rkey->cache.rma_lane]; - ptr_status = ucp_rma_nonblocking_cb(ep, buffer, length, remote_addr, rkey, - rkey->cache.rma_proto->progress_put, - rma_config->put_zcopy_thresh, cb); + ptr_status = ucp_rma_nonblocking(ep, buffer, count, remote_addr, rkey, + 
rkey->cache.rma_proto->progress_put, + rma_config->put_zcopy_thresh, param); out_unlock: UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); return ptr_status; @@ -268,42 +257,52 @@ ucs_status_ptr_t ucp_put_nb(ucp_ep_h ep, const void *buffer, size_t length, ucs_status_t ucp_get_nbi(ucp_ep_h ep, void *buffer, size_t length, uint64_t remote_addr, ucp_rkey_h rkey) { - ucp_ep_rma_config_t *rma_config; - ucs_status_t status; + ucs_status_ptr_t status_ptr; - UCP_RMA_CHECK(ep->worker->context, buffer, length); - UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - - ucs_trace_req("get_nbi buffer %p length %zu remote_addr %"PRIx64" rkey %p from %s", - buffer, length, remote_addr, rkey, ucp_ep_peer_name(ep)); - - status = UCP_RKEY_RESOLVE(rkey, ep, rma); - if (status != UCS_OK) { - goto out_unlock; + status_ptr = ucp_get_nbx(ep, buffer, length, remote_addr, rkey, + &ucp_request_null_param); + if (UCS_PTR_IS_PTR(status_ptr)) { + ucp_request_free(status_ptr); + return UCS_INPROGRESS; } - rma_config = &ucp_ep_config(ep)->rma[rkey->cache.rma_lane]; - status = ucp_rma_nonblocking(ep, buffer, length, remote_addr, rkey, - rkey->cache.rma_proto->progress_get, - rma_config->get_zcopy_thresh); -out_unlock: - UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); - return status; + /* coverity[overflow] */ + return UCS_PTR_STATUS(status_ptr); } ucs_status_ptr_t ucp_get_nb(ucp_ep_h ep, void *buffer, size_t length, uint64_t remote_addr, ucp_rkey_h rkey, ucp_send_callback_t cb) +{ + ucp_request_param_t param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK, + .cb.send = (ucp_send_nbx_callback_t)cb + }; + + return ucp_get_nbx(ep, buffer, length, remote_addr, rkey, ¶m); +} + +ucs_status_ptr_t ucp_get_nbx(ucp_ep_h ep, void *buffer, size_t count, + uint64_t remote_addr, ucp_rkey_h rkey, + const ucp_request_param_t *param) { ucp_ep_rma_config_t *rma_config; ucs_status_ptr_t ptr_status; ucs_status_t status; - UCP_RMA_CHECK_PTR(ep->worker->context, buffer, length); + UCP_RMA_CHECK_CONTIG1(param); + 
+ if (ucs_unlikely(param->op_attr_mask & UCP_OP_ATTR_FLAG_FORCE_IMM_CMPL)) { + return UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE); + } + + UCP_RMA_CHECK_PTR(ep->worker->context, buffer, count); UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - ucs_trace_req("get_nb buffer %p length %zu remote_addr %"PRIx64" rkey %p from %s cb %p", - buffer, length, remote_addr, rkey, ucp_ep_peer_name(ep), cb); + ucs_trace_req("get_nbx buffer %p count %zu remote_addr %"PRIx64" rkey %p from %s cb %p", + buffer, count, remote_addr, rkey, ucp_ep_peer_name(ep), + (param->op_attr_mask & UCP_OP_ATTR_FIELD_CALLBACK) ? + param->cb.send : NULL); status = UCP_RKEY_RESOLVE(rkey, ep, rma); if (status != UCS_OK) { @@ -312,9 +311,9 @@ ucs_status_ptr_t ucp_get_nb(ucp_ep_h ep, void *buffer, size_t length, } rma_config = &ucp_ep_config(ep)->rma[rkey->cache.rma_lane]; - ptr_status = ucp_rma_nonblocking_cb(ep, buffer, length, remote_addr, rkey, - rkey->cache.rma_proto->progress_get, - rma_config->get_zcopy_thresh, cb); + ptr_status = ucp_rma_nonblocking(ep, buffer, count, remote_addr, rkey, + rkey->cache.rma_proto->progress_get, + rma_config->get_zcopy_thresh, param); out_unlock: UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); return ptr_status; @@ -326,7 +325,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_put, (ep, buffer, length, remote_addr, rkey), { return ucp_rma_wait(ep->worker, ucp_put_nb(ep, buffer, length, remote_addr, rkey, - (void*)ucs_empty_function), + (ucp_send_callback_t)ucs_empty_function), "put"); } @@ -336,6 +335,6 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_get, (ep, buffer, length, remote_addr, rkey), { return ucp_rma_wait(ep->worker, ucp_get_nb(ep, buffer, length, remote_addr, rkey, - (void*)ucs_empty_function), + (ucp_send_callback_t)ucs_empty_function), "get"); } diff --git a/src/ucp/rma/rma_sw.c b/src/ucp/rma/rma_sw.c index 3fc7129dc68..451188f3fd6 100644 --- a/src/ucp/rma/rma_sw.c +++ b/src/ucp/rma/rma_sw.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rma.h" #include "rma.inl" @@ -37,10 +41,9 @@ static ucs_status_t ucp_rma_sw_progress_put(uct_pending_req_t *self) ssize_t packed_len; ucs_status_t status; - ucs_assert(req->send.lane == ucp_ep_get_am_lane(ep)); - - packed_len = uct_ep_am_bcopy(ep->uct_eps[req->send.lane], UCP_AM_ID_PUT, - ucp_rma_sw_put_pack_cb, req, 0); + req->send.lane = ucp_ep_get_am_lane(ep); + packed_len = uct_ep_am_bcopy(ep->uct_eps[req->send.lane], UCP_AM_ID_PUT, + ucp_rma_sw_put_pack_cb, req, 0); if (packed_len > 0) { status = UCS_OK; ucp_ep_rma_remote_request_sent(ep); @@ -73,10 +76,10 @@ static ucs_status_t ucp_rma_sw_progress_get(uct_pending_req_t *self) ucs_status_t status; ssize_t packed_len; - ucs_assert(req->send.lane == ucp_ep_get_am_lane(ep)); - - packed_len = uct_ep_am_bcopy(ep->uct_eps[req->send.lane], UCP_AM_ID_GET_REQ, - ucp_rma_sw_get_req_pack_cb, req, 0); + req->send.lane = ucp_ep_get_am_lane(ep); + packed_len = uct_ep_am_bcopy(ep->uct_eps[req->send.lane], + UCP_AM_ID_GET_REQ, + ucp_rma_sw_get_req_pack_cb, req, 0); if (packed_len < 0) { status = (ucs_status_t)packed_len; if (status != UCS_ERR_NO_RESOURCE) { @@ -192,7 +195,7 @@ static ucs_status_t ucp_progress_get_reply(uct_pending_req_t *self) payload_len = packed_len - sizeof(ucp_rma_rep_hdr_t); ucs_assert(payload_len >= 0); - req->send.buffer += payload_len; + req->send.buffer = UCS_PTR_BYTE_OFFSET(req->send.buffer, payload_len); req->send.length -= payload_len; if (req->send.length == 0) { @@ -283,7 +286,8 @@ static void ucp_rma_sw_dump_packet(ucp_worker_h worker, uct_am_trace_type_t type } p = buffer + strlen(buffer); - ucp_dump_payload(worker->context, p, buffer + max - p, data + header_len, + ucp_dump_payload(worker->context, p, buffer + max - p, + UCS_PTR_BYTE_OFFSET(data, header_len), length - header_len); } diff --git a/src/ucp/stream/stream.h b/src/ucp/stream/stream.h index 428e4cba68b..e196792c086 100644 --- a/src/ucp/stream/stream.h +++ 
b/src/ucp/stream/stream.h @@ -61,6 +61,8 @@ ucp_stream_worker_dequeue_ep_head(ucp_worker_h worker) ucp_ep_ext_proto_t *ep_ext = ucs_list_head(&worker->stream_ready_eps, ucp_ep_ext_proto_t, stream.ready_list); + + ucs_assert(ep_ext->stream.ready_list.next != NULL); ucp_stream_ep_dequeue(ep_ext); return ep_ext; } diff --git a/src/ucp/stream/stream_recv.c b/src/ucp/stream/stream_recv.c index e2d0828434e..45c7d65fd64 100644 --- a/src/ucp/stream/stream_recv.c +++ b/src/ucp/stream/stream_recv.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include #include @@ -38,7 +42,7 @@ * rdesc pointer to 'ucp_recv_desc_t *', it's needed to get access to * 'ucp_recv_desc_t *' inside @ref ucp_stream_release_data after * the buffer was returned to user by - * @ref ucp_stream_recv_data_nb as a pointer to 'paylod' + * @ref ucp_stream_recv_data_nb as a pointer to 'payload' */ @@ -181,7 +185,7 @@ ucp_stream_rdesc_advance(ucp_recv_desc_t *rdesc, ssize_t offset, ucs_assert(offset <= rdesc->length); if (ucs_unlikely(offset < 0)) { - return offset; + return (ucs_status_t)offset; } else if (ucs_likely(offset == rdesc->length)) { ucp_stream_rdesc_dequeue_and_release(rdesc, ep_ext); } else { @@ -197,16 +201,13 @@ ucp_stream_process_rdesc_inplace(ucp_recv_desc_t *rdesc, ucp_datatype_t dt, void *buffer, size_t count, size_t length, ucp_ep_ext_proto_t *ep_ext) { + ucp_worker_h worker = ucp_ep_from_ext_proto(ep_ext)->worker; ucs_status_t status; ssize_t unpacked; - uct_memory_type_t mem_type; - + ucs_memory_type_t mem_type; - ucp_memory_type_detect_mds(ucp_ep_from_ext_proto(ep_ext)->worker->context, buffer, - length, &mem_type); - - status = ucp_dt_unpack_only(ucp_ep_from_ext_proto(ep_ext)->worker, buffer, - count, dt, mem_type, + mem_type = ucp_memory_type_detect(worker->context, buffer, length); + status = ucp_dt_unpack_only(worker, buffer, count, dt, mem_type, ucp_stream_rdesc_payload(rdesc), length, 0); unpacked = 
ucs_likely(status == UCS_OK) ? length : status; @@ -231,15 +232,16 @@ static UCS_F_ALWAYS_INLINE void ucp_stream_recv_request_init(ucp_request_t *req, ucp_ep_h ep, void *buffer, size_t count, size_t length, ucp_datatype_t datatype, - ucp_stream_recv_callback_t cb, - uint16_t request_flags) + const ucp_request_param_t *param) { - req->flags = UCP_REQUEST_FLAG_CALLBACK | request_flags; -#if ENABLE_ASSERT - req->flags |= UCP_REQUEST_FLAG_STREAM_RECV; + uint32_t flags = ucp_request_param_flags(param); + + req->flags = UCP_REQUEST_FLAG_STREAM_RECV | + ((flags & UCP_STREAM_RECV_FLAG_WAITALL) ? + UCP_REQUEST_FLAG_STREAM_RECV_WAITALL : 0); +#if UCS_ENABLE_ASSERT req->status = UCS_OK; /* for ucp_request_recv_data_unpack() */ #endif - req->recv.stream.cb = cb; req->recv.stream.length = 0; req->recv.stream.offset = 0; @@ -250,8 +252,15 @@ ucp_stream_recv_request_init(ucp_request_t *req, ucp_ep_h ep, void *buffer, req->recv.datatype = datatype; req->recv.length = ucs_likely(!UCP_DT_IS_GENERIC(datatype)) ? length : ucp_dt_length(datatype, count, NULL, &req->recv.state); - ucp_memory_type_detect_mds(ep->worker->context, (void *)buffer, - req->recv.length, &req->recv.mem_type); + req->recv.mem_type = ucp_memory_type_detect(ep->worker->context, + (void*)buffer, req->recv.length); + + if (param->op_attr_mask & UCP_OP_ATTR_FIELD_CALLBACK) { + req->flags |= UCP_REQUEST_FLAG_CALLBACK; + req->recv.stream.cb = param->cb.recv_stream; + req->user_data = (param->op_attr_mask & UCP_OP_ATTR_FIELD_USER_DATA) ? 
+ param->user_data : NULL; + } } static UCS_F_ALWAYS_INLINE int @@ -267,38 +276,79 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_stream_recv_nb, ucp_datatype_t datatype, ucp_stream_recv_callback_t cb, size_t *length, unsigned flags) { - ucs_status_t status = UCS_OK; - ucp_ep_ext_proto_t *ep_ext = ucp_ep_ext_proto(ep); + ucp_request_param_t param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_DATATYPE | + UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_FLAGS, + .cb.recv_stream = (ucp_stream_recv_nbx_callback_t)cb, + .flags = flags, + .datatype = datatype + }; + + return ucp_stream_recv_nbx(ep, buffer, count, length, ¶m); +} + +UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_stream_recv_nbx, + (ep, buffer, count, length, param), + ucp_ep_h ep, void *buffer, size_t count, size_t *length, + const ucp_request_param_t *param) +{ + ucs_status_t status = UCS_OK; + ucp_ep_ext_proto_t *ep_ext = ucp_ep_ext_proto(ep); + ucp_datatype_t datatype; size_t dt_length; ucp_request_t *req; ucp_recv_desc_t *rdesc; + uint32_t attr_mask; UCP_CONTEXT_CHECK_FEATURE_FLAGS(ep->worker->context, UCP_FEATURE_STREAM, return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - if (ucs_likely(!UCP_DT_IS_GENERIC(datatype))) { - dt_length = ucp_dt_length(datatype, count, buffer, NULL); - if (ucs_likely(ucp_stream_recv_nb_is_inplace(ep_ext, dt_length))) { - status = ucp_stream_process_rdesc_inplace(ucp_stream_rdesc_get(ep_ext), - datatype, buffer, count, - dt_length, ep_ext); - *length = dt_length; + attr_mask = param->op_attr_mask & + (UCP_OP_ATTR_FIELD_DATATYPE | UCP_OP_ATTR_FLAG_NO_IMM_CMPL); + if (ucs_likely(attr_mask == 0)) { + datatype = ucp_dt_make_contig(1); + dt_length = count; /* use dt_lendth to suppress coverity false positive */ + if (ucs_likely(ucp_stream_recv_nb_is_inplace(ep_ext, count))) { + status = ucp_stream_process_rdesc_inplace(ucp_stream_rdesc_get(ep_ext), + datatype, buffer, count, + dt_length, ep_ext); + *length = count; goto out_status; } + } else if 
(attr_mask == UCP_OP_ATTR_FIELD_DATATYPE) { + datatype = param->datatype; + if (!UCP_DT_IS_GENERIC(datatype)) { + dt_length = ucp_dt_length(datatype, count, buffer, NULL); + if (ucp_stream_recv_nb_is_inplace(ep_ext, dt_length)) { + status = ucp_stream_process_rdesc_inplace(ucp_stream_rdesc_get(ep_ext), + datatype, buffer, count, + dt_length, ep_ext); + *length = dt_length; + goto out_status; + } + } else { + dt_length = 0; + } } else { - dt_length = 0; /* Suppress warnings of paranoid compilers */ + datatype = ucp_dt_make_contig(1); + dt_length = count; } - req = ucp_request_get(ep->worker); - if (ucs_unlikely(req == NULL)) { - status = UCS_ERR_NO_MEMORY; + if (ucs_unlikely(param->op_attr_mask & UCP_OP_ATTR_FLAG_FORCE_IMM_CMPL)) { + status = UCS_ERR_NO_RESOURCE; goto out_status; } + req = ucp_request_get_param(ep->worker, param, + { + status = UCS_ERR_NO_MEMORY; + goto out_status; + }); + ucp_stream_recv_request_init(req, ep, buffer, count, dt_length, datatype, - cb, (flags & UCP_STREAM_RECV_FLAG_WAITALL) ? 
- UCP_REQUEST_FLAG_STREAM_RECV_WAITALL : 0); + param); /* OK, lets obtain all arrived data which matches the recv size */ while ((req->recv.stream.offset < req->recv.length) && @@ -333,7 +383,7 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_stream_recv_nb, } out_put_request: - ucp_request_put(req); + ucp_request_put_param(param, req); out_status: req = UCS_STATUS_PTR(status); @@ -398,11 +448,11 @@ ucp_stream_am_data_process(ucp_worker_t *worker, ucp_ep_ext_proto_t *ep_ext, rdesc_tmp.length); } else { /* slowpath */ - rdesc = (ucp_recv_desc_t *)am_data - 1; - rdesc->length = rdesc_tmp.length; - rdesc->payload_offset = rdesc_tmp.payload_offset + sizeof(*rdesc); - rdesc->priv_length = 0; - rdesc->flags = UCP_RECV_DESC_FLAG_UCT_DESC; + rdesc = (ucp_recv_desc_t *)am_data - 1; + rdesc->length = rdesc_tmp.length; + rdesc->payload_offset = rdesc_tmp.payload_offset + sizeof(*rdesc); + rdesc->uct_desc_offset = UCP_WORKER_HEADROOM_PRIV_SIZE; + rdesc->flags = UCP_RECV_DESC_FLAG_UCT_DESC; } ucp_ep_from_ext_proto(ep_ext)->flags |= UCP_EP_FLAG_STREAM_HAS_DATA; @@ -424,18 +474,33 @@ void ucp_stream_ep_init(ucp_ep_h ep) void ucp_stream_ep_cleanup(ucp_ep_h ep) { + ucp_ep_ext_proto_t* ep_ext; + ucp_request_t *req; size_t length; void *data; - if (ep->worker->context->config.features & UCP_FEATURE_STREAM) { - while ((data = ucp_stream_recv_data_nb_nolock(ep, &length)) != NULL) { - ucs_assert_always(!UCS_PTR_IS_ERR(data)); - ucp_stream_data_release(ep, data); - } + if (!(ep->worker->context->config.features & UCP_FEATURE_STREAM)) { + return; + } - if (ucp_stream_ep_is_queued(ucp_ep_ext_proto(ep))) { - ucp_stream_ep_dequeue(ucp_ep_ext_proto(ep)); - } + /* drop unmatched data */ + while ((data = ucp_stream_recv_data_nb_nolock(ep, &length)) != NULL) { + ucs_assert_always(!UCS_PTR_IS_ERR(data)); + ucp_stream_data_release(ep, data); + } + + ep_ext = ucp_ep_ext_proto(ep); + + if (ucp_stream_ep_is_queued(ep_ext)) { + ucp_stream_ep_dequeue(ep_ext); + } + + /* cancel not completed requests */ + 
ucs_assert(!ucp_stream_ep_has_data(ep_ext)); + while (!ucs_queue_is_empty(&ep_ext->stream.match_q)) { + req = ucs_queue_head_elem_non_empty(&ep_ext->stream.match_q, + ucp_request_t, recv.queue); + ucp_request_complete_stream_recv(req, ep_ext, UCS_ERR_CANCELED); } } @@ -499,8 +564,8 @@ static void ucp_stream_am_dump(ucp_worker_h worker, uct_am_trace_type_t type, p = buffer + strlen(buffer); ucs_assert(hdr->ep_ptr != 0); - ucp_dump_payload(worker->context, p, buffer + max - p, data + hdr_len, - length - hdr_len); + ucp_dump_payload(worker->context, p, buffer + max - p, + UCS_PTR_BYTE_OFFSET(data, hdr_len), length - hdr_len); } UCP_DEFINE_AM(UCP_FEATURE_STREAM, UCP_AM_ID_STREAM_DATA, ucp_stream_am_handler, diff --git a/src/ucp/stream/stream_send.c b/src/ucp/stream/stream_send.c index fe67fb95806..3e3b43fe471 100644 --- a/src/ucp/stream/stream_send.c +++ b/src/ucp/stream/stream_send.c @@ -4,17 +4,26 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include #include #include -#include #include #include #include #include +#define UCP_STREAM_SEND_CHECK_STATUS(_ep, _status, _ret, _done) \ + if (ucs_likely((_status) != UCS_ERR_NO_RESOURCE)) { \ + _ret = UCS_STATUS_PTR(_status); /* UCS_OK also goes here */ \ + _done; \ + } + static UCS_F_ALWAYS_INLINE ucs_status_t ucp_stream_send_am_short(ucp_ep_t *ep, const void *buffer, size_t length) { @@ -26,7 +35,7 @@ ucp_stream_send_am_short(ucp_ep_t *ep, const void *buffer, size_t length) static void ucp_stream_send_req_init(ucp_request_t* req, ucp_ep_h ep, const void* buffer, uintptr_t datatype, - size_t count, uint16_t flags) + size_t count, uint32_t flags) { req->flags = flags; req->send.ep = ep; @@ -37,15 +46,18 @@ static void ucp_stream_send_req_init(ucp_request_t* req, ucp_ep_h ep, req->send.length = ucp_dt_length(req->send.datatype, count, req->send.buffer, &req->send.state.dt); - ucp_memory_type_detect_mds(ep->worker->context, (void *)buffer, - req->send.length, 
&req->send.mem_type); - VALGRIND_MAKE_MEM_UNDEFINED(&req->send.tag, sizeof(req->send.tag)); + req->send.mem_type = ucp_memory_type_detect(ep->worker->context, + (void*)buffer, + req->send.length); + VALGRIND_MAKE_MEM_UNDEFINED(&req->send.msg_proto.tag, + sizeof(req->send.msg_proto.tag)); } static UCS_F_ALWAYS_INLINE ucs_status_ptr_t ucp_stream_send_req(ucp_request_t *req, size_t count, const ucp_ep_msg_config_t* msg_config, - ucp_send_callback_t cb, const ucp_proto_t *proto) + const ucp_request_param_t *param, + const ucp_request_send_proto_t *proto) { size_t zcopy_thresh = ucp_proto_get_zcopy_threshold(req, msg_config, count, SIZE_MAX); @@ -65,13 +77,10 @@ ucp_stream_send_req(ucp_request_t *req, size_t count, */ status = ucp_request_send(req, 0); if (req->flags & UCP_REQUEST_FLAG_COMPLETED) { - ucs_trace_req("releasing send request %p, returning status %s", req, - ucs_status_string(status)); - ucp_request_put(req); - return UCS_STATUS_PTR(status); + ucp_request_imm_cmpl_param(param, req, status, send); } - ucp_request_set_callback(req, send.cb, cb) + ucp_request_set_send_callback_param(param, req, send); ucs_trace_req("returning send request %p", req); return req + 1; } @@ -81,17 +90,51 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_stream_send_nb, ucp_ep_h ep, const void *buffer, size_t count, uintptr_t datatype, ucp_send_callback_t cb, unsigned flags) { + ucp_request_param_t param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_DATATYPE | + UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_FLAGS, + .cb.send = (ucp_send_nbx_callback_t)cb, + .flags = flags, + .datatype = datatype + }; + + return ucp_stream_send_nbx(ep, buffer, count, ¶m); +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_stream_send_nbx_am_short(ucp_ep_t *ep, const void *buffer, size_t length) +{ + if (ucs_likely((ssize_t)length <= ucp_ep_config(ep)->am.max_short)) { + return UCS_PROFILE_CALL(ucp_stream_send_am_short, ep, buffer, length); + } + + return UCS_ERR_NO_RESOURCE; +} + 
+UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_stream_send_nbx, + (ep, buffer, count, param), + ucp_ep_h ep, const void *buffer, size_t count, + const ucp_request_param_t *param) +{ + ucp_datatype_t datatype; ucp_request_t *req; size_t length; ucs_status_t status; ucs_status_ptr_t ret; + uint32_t attr_mask; + uint32_t flags; UCP_CONTEXT_CHECK_FEATURE_FLAGS(ep->worker->context, UCP_FEATURE_STREAM, return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - ucs_trace_req("stream_send_nb buffer %p count %zu to %s cb %p flags %u", - buffer, count, ucp_ep_peer_name(ep), cb, flags); + flags = ucp_request_param_flags(param); + + ucs_trace_req("stream_send_nbx buffer %p count %zu to %s cb %p flags %u", + buffer, count, ucp_ep_peer_name(ep), + param->op_attr_mask & UCP_OP_ATTR_FIELD_CALLBACK ? + param->cb.send : NULL, flags); if (ucs_unlikely(flags != 0)) { ret = UCS_STATUS_PTR(UCS_ERR_NOT_IMPLEMENTED); @@ -104,29 +147,41 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_stream_send_nb, goto out; } - if (ucs_likely(UCP_DT_IS_CONTIG(datatype)) && - ucp_memory_type_cache_is_empty(ep->worker->context)) { - length = ucp_contig_dt_length(datatype, count); - if (ucs_likely((ssize_t)length <= ucp_ep_config(ep)->am.max_short)) { - status = UCS_PROFILE_CALL(ucp_stream_send_am_short, ep, buffer, - length); - if (ucs_likely(status != UCS_ERR_NO_RESOURCE)) { - UCP_EP_STAT_TAG_OP(ep, EAGER); - ret = UCS_STATUS_PTR(status); /* UCS_OK also goes here */ - goto out; + if (ucp_memory_type_cache_is_empty(ep->worker->context)) { + attr_mask = param->op_attr_mask & + (UCP_OP_ATTR_FIELD_DATATYPE | UCP_OP_ATTR_FLAG_NO_IMM_CMPL); + if (ucs_likely(attr_mask == 0)) { + status = ucp_stream_send_nbx_am_short(ep, buffer, count); + UCP_STREAM_SEND_CHECK_STATUS(ep, status, ret, goto out); + datatype = ucp_dt_make_contig(1); + } else if (attr_mask == UCP_OP_ATTR_FIELD_DATATYPE) { + datatype = param->datatype; + if (UCP_DT_IS_CONTIG(datatype)) { + length = 
ucp_contig_dt_length(datatype, count); + status = ucp_stream_send_nbx_am_short(ep, buffer, length); + UCP_STREAM_SEND_CHECK_STATUS(ep, status, ret, goto out); } + } else { + datatype = ucp_dt_make_contig(1); } + } else { + datatype = ucp_request_param_datatype(param); } - req = ucp_request_get(ep->worker); - if (ucs_unlikely(req == NULL)) { - ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + if (ucs_unlikely(param->op_attr_mask & UCP_OP_ATTR_FLAG_FORCE_IMM_CMPL)) { + ret = UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE); goto out; } + req = ucp_request_get_param(ep->worker, param, + { + ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + goto out; + }); + ucp_stream_send_req_init(req, ep, buffer, datatype, count, flags); - ret = ucp_stream_send_req(req, count, &ucp_ep_config(ep)->am, cb, + ret = ucp_stream_send_req(req, count, &ucp_ep_config(ep)->am, param, ucp_ep_config(ep)->stream.proto); out: @@ -184,10 +239,10 @@ static size_t ucp_stream_pack_am_first_dt(void *dest, void *arg) size_t length; hdr->ep_ptr = ucp_request_get_dest_ep_ptr(req); - length = ucp_ep_config(req->send.ep)->am.max_bcopy - sizeof(*hdr); + length = ucs_min(ucp_ep_config(req->send.ep)->am.max_bcopy - sizeof(*hdr), + req->send.length); ucs_assert(req->send.state.dt.offset == 0); - ucs_assert(req->send.length > length); return sizeof(*hdr) + ucp_dt_pack(req->send.ep->worker, req->send.datatype, req->send.mem_type, hdr + 1, req->send.buffer, &req->send.state.dt, length); @@ -212,7 +267,6 @@ static ucs_status_t ucp_stream_bcopy_multi(uct_pending_req_t *self) ucs_status_t status = ucp_do_am_bcopy_multi(self, UCP_AM_ID_STREAM_DATA, UCP_AM_ID_STREAM_DATA, - sizeof(ucp_stream_am_hdr_t), ucp_stream_pack_am_first_dt, ucp_stream_pack_am_middle_dt, 0); if (status == UCS_OK) { @@ -248,14 +302,12 @@ static ucs_status_t ucp_stream_eager_zcopy_multi(uct_pending_req_t *self) ucp_proto_am_zcopy_req_complete, 0); } -const ucp_proto_t ucp_stream_am_proto = { +const ucp_request_send_proto_t ucp_stream_am_proto = { .contig_short = 
ucp_stream_contig_am_short, .bcopy_single = ucp_stream_bcopy_single, .bcopy_multi = ucp_stream_bcopy_multi, .zcopy_single = ucp_stream_eager_zcopy_single, .zcopy_multi = ucp_stream_eager_zcopy_multi, .zcopy_completion = ucp_proto_am_zcopy_completion, - .only_hdr_size = sizeof(ucp_stream_am_hdr_t), - .first_hdr_size = sizeof(ucp_stream_am_hdr_t), - .mid_hdr_size = sizeof(ucp_stream_am_hdr_t) + .only_hdr_size = sizeof(ucp_stream_am_hdr_t) }; diff --git a/src/ucp/tag/eager.h b/src/ucp/tag/eager.h index a7656bfd467..7c7034272f0 100644 --- a/src/ucp/tag/eager.h +++ b/src/ucp/tag/eager.h @@ -14,7 +14,6 @@ #include #include #include -#include /* @@ -62,12 +61,12 @@ typedef struct { } UCS_S_PACKED ucp_eager_sync_first_hdr_t; -extern const ucp_proto_t ucp_tag_eager_proto; -extern const ucp_proto_t ucp_tag_eager_sync_proto; +extern const ucp_request_send_proto_t ucp_tag_eager_proto; +extern const ucp_request_send_proto_t ucp_tag_eager_sync_proto; void ucp_tag_eager_sync_send_ack(ucp_worker_h worker, void *hdr, uint16_t recv_flags); -void ucp_tag_eager_sync_completion(ucp_request_t *req, uint16_t flag, +void ucp_tag_eager_sync_completion(ucp_request_t *req, uint32_t flag, ucs_status_t status); void ucp_tag_eager_zcopy_completion(uct_completion_t *self, ucs_status_t status); diff --git a/src/ucp/tag/eager_rcv.c b/src/ucp/tag/eager_rcv.c index 9e643990697..f828705fa01 100644 --- a/src/ucp/tag/eager_rcv.c +++ b/src/ucp/tag/eager_rcv.c @@ -1,9 +1,13 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "eager.h" #include "tag_match.inl" #include "offload.h" @@ -96,17 +100,19 @@ ucp_eager_tagged_handler(void *arg, void *data, size_t length, unsigned am_flags if (flags & UCP_RECV_DESC_FLAG_EAGER_ONLY) { req->recv.tag.info.length = recv_len; - status = ucp_request_recv_data_unpack(req, data + hdr_len, recv_len, - 0, 1); + status = ucp_request_recv_data_unpack(req, + UCS_PTR_BYTE_OFFSET(data, hdr_len), + recv_len, 0, 1); ucp_request_complete_tag_recv(req, status); } else { eagerf_hdr = data; req->recv.tag.info.length = req->recv.tag.remaining = eagerf_hdr->total_len; - status = ucp_tag_request_process_recv_data(req, data + hdr_len, - recv_len, 0, 0); - ucs_assert(status == UCS_INPROGRESS); + status = ucp_tag_request_process_recv_data(req, + UCS_PTR_BYTE_OFFSET(data, hdr_len), + recv_len, 0, 0, flags); + ucs_assert((status == UCS_OK) || (status == UCS_INPROGRESS)); ucp_tag_frag_list_process_queue(&worker->tm, req, eagerf_hdr->msg_id UCS_STATS_ARG(UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_EXP)); @@ -143,14 +149,14 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_eager_first_handler, sizeof(ucp_eager_first_hdr_t), 0); } -UCS_PROFILE_FUNC(ucs_status_t, ucp_eager_middle_handler, - (arg, data, length, am_flags), - void *arg, void *data, size_t length, unsigned am_flags) +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_eager_common_middle_handler(ucp_worker_t *worker, void *data, size_t length, + uint16_t hdr_len, unsigned tl_flags, + uint16_t flags, uint16_t priv_length) { - ucp_worker_h worker = arg; ucp_eager_middle_hdr_t *hdr = data; + ucp_recv_desc_t *rdesc = NULL; ucp_tag_frag_match_t *matchq; - ucp_recv_desc_t *rdesc; ucp_request_t *req; ucs_status_t status; size_t recv_len; @@ -158,6 +164,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_eager_middle_handler, int ret; iter = kh_put(ucp_tag_frag_hash, &worker->tm.frag_hash, hdr->msg_id, &ret); + ucs_assert(ret >= 0); matchq = &kh_value(&worker->tm.frag_hash, iter); if (ret != 0) { 
/* initialize a previously empty hash entry */ @@ -166,20 +173,35 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_eager_middle_handler, if (ucp_tag_frag_match_is_unexp(matchq)) { /* add new received descriptor to the queue */ - status = ucp_recv_desc_init(worker, data, length, 0, am_flags, - sizeof(*hdr), UCP_RECV_DESC_FLAG_EAGER, 0, - &rdesc); - if (!UCS_STATUS_IS_ERR(status)) { + status = ucp_recv_desc_init(worker, data, length, 0, tl_flags, + hdr_len, flags, priv_length, &rdesc); + if (ucs_likely(!UCS_STATUS_IS_ERR(status))) { ucp_tag_frag_match_add_unexp(matchq, rdesc, hdr->offset); + } else if (ucs_queue_is_empty(&matchq->unexp_q)) { + /* If adding the first fragment to the unexpected queue fails, + * remove the element from the hash. Otherwise hash would contain an + * empty queue, which is not allowed, because queue implementation + * relies on the address of its head for certain operations (e.g. + * ucs_queue_is_empty). And khash may change address of its elements + * during resize (provoked by kh_put). */ + kh_del(ucp_tag_frag_hash, &worker->tm.frag_hash, iter); } } else { + /* If fragment is expected, the corresponding element must be present + * in the hash (added in ucp_tag_frag_list_process_queue). */ + ucs_assert(ret == 0); + /* hash entry contains a request, copy data to user buffer */ req = matchq->exp_req; - recv_len = length - sizeof(*hdr); + recv_len = length - hdr_len; UCP_WORKER_STAT_EAGER_CHUNK(worker, EXP); - status = ucp_tag_request_process_recv_data(req, data + sizeof(*hdr), - recv_len, hdr->offset, 0); + + /* Need to use hdr_len rather than sizeof(*hdr), because tag offload flow + * can use extended header for sync sends. 
*/ + status = ucp_tag_request_process_recv_data(req, + UCS_PTR_BYTE_OFFSET(data, hdr_len), + recv_len, hdr->offset, 0, flags); if (status != UCS_INPROGRESS) { /* request completed, delete hash entry */ kh_del(ucp_tag_frag_hash, &worker->tm.frag_hash, iter); @@ -188,9 +210,23 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_eager_middle_handler, status = UCS_OK; } + /* If hash contains queue of unexpected fragments, it should not be empty */ + ucs_assert(!ucp_tag_frag_match_is_unexp(matchq) || + !ucs_queue_is_empty(&matchq->unexp_q)); + return status; } +UCS_PROFILE_FUNC(ucs_status_t, ucp_eager_middle_handler, + (arg, data, length, am_flags), + void *arg, void *data, size_t length, unsigned am_flags) +{ + return ucp_eager_common_middle_handler(arg, data, length, + sizeof(ucp_eager_middle_hdr_t), + am_flags, UCP_RECV_DESC_FLAG_EAGER, + 0); +} + UCS_PROFILE_FUNC(ucs_status_t, ucp_eager_sync_only_handler, (arg, data, length, am_flags), void *arg, void *data, size_t length, unsigned am_flags) @@ -248,46 +284,146 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_eager_sync_ack_handler, return UCS_OK; } +#define ucp_tag_eager_offload_priv(_flags, _data, _length, _priv_type) \ + ({ \ + size_t _priv_len = sizeof(_priv_type); \ + typeof(_priv_type) *priv_data; \ + if (ucs_unlikely((_flags) & UCT_CB_PARAM_FLAG_DESC)) { \ + priv_data = UCS_PTR_BYTE_OFFSET(_data, -_priv_len); \ + } else { /* Can not shift back, no headroom */ \ + priv_data = ucs_alloca((_length) + _priv_len); \ + memcpy(UCS_PTR_BYTE_OFFSET(priv_data, _priv_len), _data, (_length)); \ + } \ + priv_data; \ + }) + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_tag_offload_eager_first_handler(ucp_worker_h worker, void *data, + size_t length, unsigned tl_flags, + uct_tag_t stag, uint16_t flags, + void **context) +{ + ucp_eager_first_hdr_t *priv; + uint64_t msg_ctx; + int priv_len; + + /* First part of the fragmented message. Pass message id back to UCT, + * so it will be provided with the rest of message fragments. 
Immediate + * data (indicating sync send) is passed with last fragment only, so + * ack will be sent upon receiving of the last fragment. */ + msg_ctx = worker->am_message_id++; + *(uint64_t*)context = msg_ctx; + priv_len = sizeof(*priv); + priv = ucp_tag_eager_offload_priv(tl_flags, data, length, + ucp_eager_first_hdr_t); + priv->super.super.tag = stag; + priv->total_len = SIZE_MAX; /* length is not known at this point */ + priv->msg_id = msg_ctx; + return ucp_eager_tagged_handler(worker, priv, length + priv_len, + tl_flags, flags, priv_len, priv_len); +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_tag_offload_eager_middle_handler(ucp_worker_h worker, void *data, + size_t length, unsigned tl_flags, + uct_tag_t stag, uint64_t imm, + uint16_t flags, void **context) +{ + ucp_offload_last_ssend_hdr_t *l_priv; + ucp_eager_middle_hdr_t *m_priv; + void *tag_priv; + int priv_len; + + /* Last fragment may contain immediate data, indicating that it is + * synchronous send */ + if (!(tl_flags & UCT_CB_PARAM_FLAG_MORE) && imm) { + l_priv = ucp_tag_eager_offload_priv(tl_flags, data, length, + ucp_offload_last_ssend_hdr_t); + priv_len = sizeof(*l_priv); + tag_priv = l_priv; + l_priv->ssend_ack.sender_tag = stag; + l_priv->ssend_ack.ep_ptr = imm; + m_priv = &l_priv->super; + flags |= UCP_RECV_DESC_FLAG_EAGER_SYNC | + UCP_RECV_DESC_FLAG_EAGER_LAST; + } else { + m_priv = ucp_tag_eager_offload_priv(tl_flags, data, length, + ucp_eager_middle_hdr_t); + priv_len = sizeof(*m_priv); + tag_priv = m_priv; + flags |= (tl_flags & UCT_CB_PARAM_FLAG_MORE) ? 
+ 0 : UCP_RECV_DESC_FLAG_EAGER_LAST; + } + + /* Offset is calculated during data processing in the + * ucp_tag_request_process_recv_data function */ + m_priv->offset = 0; + m_priv->msg_id = *(uint64_t*)context; + + return ucp_eager_common_middle_handler(worker, tag_priv, length + priv_len, + priv_len, tl_flags, flags, priv_len); +} + +/* TODO: can handle multi-fragment messages in a more efficient way by saving + * request or some unexp descriptors handle in the context. This would eliminate + * the need for fragments hashing on UCP level. */ UCS_PROFILE_FUNC(ucs_status_t, ucp_tag_offload_unexp_eager, - (arg, data, length, tl_flags, stag, imm), + (arg, data, length, tl_flags, stag, imm, context), void *arg, void *data, size_t length, unsigned tl_flags, - uct_tag_t stag, uint64_t imm) + uct_tag_t stag, uint64_t imm, void **context) { /* Align data with AM protocol. We should add tag before the data. */ ucp_worker_iface_t *wiface = arg; + ucp_worker_t *worker = wiface->worker; uint16_t flags = UCP_RECV_DESC_FLAG_EAGER | - UCP_RECV_DESC_FLAG_EAGER_ONLY | UCP_RECV_DESC_FLAG_EAGER_OFFLOAD; - ucp_eager_sync_hdr_t *hdr; - int hdr_len; + ucp_eager_sync_hdr_t *priv; + int priv_len; UCP_WORKER_STAT_TAG_OFFLOAD(wiface->worker, RX_UNEXP_EGR); - ucp_tag_offload_unexp(wiface, stag, length); + /* Fast path - single-fragment, non-sync eager message */ + if (ucs_likely((tl_flags & UCT_CB_PARAM_FLAG_FIRST) && + !(tl_flags & UCT_CB_PARAM_FLAG_MORE) && + !imm)) { + ucp_tag_offload_unexp(wiface, stag, length); - if (ucs_likely(!imm)) { return ucp_eager_offload_handler(wiface->worker, data, length, tl_flags, - flags, stag); + flags | UCP_RECV_DESC_FLAG_EAGER_ONLY, + stag); } - /* It is a sync send, imm data contains sender uuid */ - hdr_len = sizeof(ucp_eager_sync_hdr_t); - - if (ucs_unlikely(tl_flags & UCT_CB_PARAM_FLAG_DESC)) { - hdr = (ucp_eager_sync_hdr_t*)(UCS_PTR_BYTE_OFFSET(data, -hdr_len)); - } else { - /* Can not shift back, no headroom */ - hdr = ucs_alloca(length + hdr_len); 
- memcpy(UCS_PTR_BYTE_OFFSET(hdr, hdr_len), data, length); + if (!(tl_flags & UCT_CB_PARAM_FLAG_FIRST)) { + /* Either middle or last fragment */ + return ucp_tag_offload_eager_middle_handler(worker, data, length, + tl_flags, stag, imm, flags, + context); } - hdr->super.super.tag = stag; - hdr->req.reqptr = 0ul; - hdr->req.ep_ptr = imm; - flags |= UCP_RECV_DESC_FLAG_EAGER_SYNC; + /* Either first eager fragment or entire sync eager message */ + ucp_tag_offload_unexp(wiface, stag, length); + + if (tl_flags & UCT_CB_PARAM_FLAG_MORE) { + /* First part of the fragmented message */ + return ucp_tag_offload_eager_first_handler(worker, data, length, + tl_flags, stag, flags, + context); + } - return ucp_eager_tagged_handler(wiface->worker, hdr, length + hdr_len, - tl_flags, flags, hdr_len, hdr_len); + /* Sync eager only packet */ + ucs_assert(!(tl_flags & UCT_CB_PARAM_FLAG_MORE)); + ucs_assert(imm); + + flags |= UCP_RECV_DESC_FLAG_EAGER_ONLY | + UCP_RECV_DESC_FLAG_EAGER_SYNC; + priv_len = sizeof(*priv); + priv = ucp_tag_eager_offload_priv(tl_flags, data, length, + ucp_eager_sync_hdr_t); + priv->req.reqptr = 0ul; + priv->req.ep_ptr = imm; + priv->super.super.tag = stag; + return ucp_eager_tagged_handler(worker, priv, length + priv_len, + tl_flags, flags, priv_len, priv_len); } static void ucp_eager_dump(ucp_worker_h worker, uct_am_trace_type_t type, @@ -352,8 +488,8 @@ static void ucp_eager_dump(ucp_worker_h worker, uct_am_trace_type_t type, } p = buffer + strlen(buffer); - ucp_dump_payload(worker->context, p, buffer + max - p, data + header_len, - length - header_len); + ucp_dump_payload(worker->context, p, buffer + max - p, + UCS_PTR_BYTE_OFFSET(data, header_len), length - header_len); } UCP_DEFINE_AM(UCP_FEATURE_TAG, UCP_AM_ID_EAGER_ONLY, ucp_eager_only_handler, diff --git a/src/ucp/tag/eager_snd.c b/src/ucp/tag/eager_snd.c index 15360683ba3..8cd369e941d 100644 --- a/src/ucp/tag/eager_snd.c +++ b/src/ucp/tag/eager_snd.c @@ -1,51 +1,61 @@ /** - * Copyright (C) Mellanox 
Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "eager.h" #include "offload.h" #include -#include #include -/* packing start */ +/* packing start */ + +static UCS_F_ALWAYS_INLINE size_t +ucp_tag_pack_eager_common(ucp_request_t *req, void *dest, + size_t length, size_t hdr_length, + int UCS_V_UNUSED is_first) +{ + size_t packed_length; + + ucs_assert((length + hdr_length) <= + ucp_ep_get_max_bcopy(req->send.ep, req->send.lane)); + ucs_assert(!is_first || (req->send.state.dt.offset == 0)); + + packed_length = ucp_dt_pack(req->send.ep->worker, req->send.datatype, + req->send.mem_type, dest, req->send.buffer, + &req->send.state.dt, length); + return packed_length + hdr_length; +} static size_t ucp_tag_pack_eager_only_dt(void *dest, void *arg) { ucp_eager_hdr_t *hdr = dest; ucp_request_t *req = arg; - size_t length; - hdr->super.tag = req->send.tag.tag; + hdr->super.tag = req->send.msg_proto.tag.tag; - ucs_assert(req->send.state.dt.offset == 0); - length = ucp_dt_pack(req->send.ep->worker, req->send.datatype, - req->send.mem_type, hdr + 1, req->send.buffer, - &req->send.state.dt, req->send.length); - ucs_assert(length == req->send.length); - return sizeof(*hdr) + length; + return ucp_tag_pack_eager_common(req, hdr + 1, req->send.length, + sizeof(*hdr), 1); } static size_t ucp_tag_pack_eager_sync_only_dt(void *dest, void *arg) { ucp_eager_sync_hdr_t *hdr = dest; ucp_request_t *req = arg; - size_t length; - hdr->super.super.tag = req->send.tag.tag; + hdr->super.super.tag = req->send.msg_proto.tag.tag; hdr->req.ep_ptr = ucp_request_get_dest_ep_ptr(req); hdr->req.reqptr = (uintptr_t)req; - ucs_assert(req->send.state.dt.offset == 0); - length = ucp_dt_pack(req->send.ep->worker, req->send.datatype, - req->send.mem_type, hdr + 1, req->send.buffer, - &req->send.state.dt, req->send.length); - 
ucs_assert(length == req->send.length); - return sizeof(*hdr) + length; + return ucp_tag_pack_eager_common(req, hdr + 1, req->send.length, + sizeof(*hdr), 1); } static size_t ucp_tag_pack_eager_first_dt(void *dest, void *arg) @@ -58,15 +68,12 @@ static size_t ucp_tag_pack_eager_first_dt(void *dest, void *arg) length = ucp_ep_get_max_bcopy(req->send.ep, req->send.lane) - sizeof(*hdr); - hdr->super.super.tag = req->send.tag.tag; + length = ucs_min(length, req->send.length); + hdr->super.super.tag = req->send.msg_proto.tag.tag; hdr->total_len = req->send.length; - hdr->msg_id = req->send.tag.message_id; + hdr->msg_id = req->send.msg_proto.message_id; - ucs_assert(req->send.state.dt.offset == 0); - ucs_assert(req->send.length > length); - return sizeof(*hdr) + ucp_dt_pack(req->send.ep->worker, req->send.datatype, - req->send.mem_type, hdr + 1, req->send.buffer, - &req->send.state.dt, length); + return ucp_tag_pack_eager_common(req, hdr + 1, length, sizeof(*hdr), 1); } static size_t ucp_tag_pack_eager_sync_first_dt(void *dest, void *arg) @@ -77,19 +84,17 @@ static size_t ucp_tag_pack_eager_sync_first_dt(void *dest, void *arg) ucs_assert(req->send.lane == ucp_ep_get_am_lane(req->send.ep)); - length = ucp_ep_get_max_bcopy(req->send.ep, req->send.lane) - - sizeof(*hdr); - hdr->super.super.super.tag = req->send.tag.tag; - hdr->super.total_len = req->send.length; - hdr->req.ep_ptr = ucp_request_get_dest_ep_ptr(req); - hdr->super.msg_id = req->send.tag.message_id; - hdr->req.reqptr = (uintptr_t)req; - - ucs_assert(req->send.state.dt.offset == 0); - ucs_assert(req->send.length > length); - return sizeof(*hdr) + ucp_dt_pack(req->send.ep->worker, req->send.datatype, - req->send.mem_type, hdr + 1, req->send.buffer, - &req->send.state.dt, length); + length = ucp_ep_get_max_bcopy(req->send.ep, + req->send.lane) - + sizeof(*hdr); + length = ucs_min(length, req->send.length); + hdr->super.super.super.tag = req->send.msg_proto.tag.tag; + hdr->super.total_len = req->send.length; + 
hdr->req.ep_ptr = ucp_request_get_dest_ep_ptr(req); + hdr->super.msg_id = req->send.msg_proto.message_id; + hdr->req.reqptr = (uintptr_t)req; + + return ucp_tag_pack_eager_common(req, hdr + 1, length, sizeof(*hdr), 1); } static size_t ucp_tag_pack_eager_middle_dt(void *dest, void *arg) @@ -98,14 +103,13 @@ static size_t ucp_tag_pack_eager_middle_dt(void *dest, void *arg) ucp_request_t *req = arg; size_t length; - length = ucs_min(ucp_ep_get_max_bcopy(req->send.ep, req->send.lane) - - sizeof(*hdr), - req->send.length - req->send.state.dt.offset); - hdr->msg_id = req->send.tag.message_id; - hdr->offset = req->send.state.dt.offset; - return sizeof(*hdr) + ucp_dt_pack(req->send.ep->worker, req->send.datatype, - req->send.mem_type, hdr + 1, req->send.buffer, - &req->send.state.dt, length); + length = ucs_min(ucp_ep_get_max_bcopy(req->send.ep, req->send.lane) - + sizeof(*hdr), + req->send.length - req->send.state.dt.offset); + hdr->msg_id = req->send.msg_proto.message_id; + hdr->offset = req->send.state.dt.offset; + + return ucp_tag_pack_eager_common(req, hdr + 1, length, sizeof(*hdr), 0); } /* eager */ @@ -118,7 +122,8 @@ static ucs_status_t ucp_tag_eager_contig_short(uct_pending_req_t *self) req->send.lane = ucp_ep_get_am_lane(ep); status = uct_ep_am_short(ep->uct_eps[req->send.lane], UCP_AM_ID_EAGER_ONLY, - req->send.tag.tag, req->send.buffer, req->send.length); + req->send.msg_proto.tag.tag, req->send.buffer, + req->send.length); if (status != UCS_OK) { return status; } @@ -144,7 +149,6 @@ static ucs_status_t ucp_tag_eager_bcopy_multi(uct_pending_req_t *self) ucs_status_t status = ucp_do_am_bcopy_multi(self, UCP_AM_ID_EAGER_FIRST, UCP_AM_ID_EAGER_MIDDLE, - sizeof(ucp_eager_middle_hdr_t), ucp_tag_pack_eager_first_dt, ucp_tag_pack_eager_middle_dt, 1); if (status == UCS_OK) { @@ -162,7 +166,7 @@ static ucs_status_t ucp_tag_eager_zcopy_single(uct_pending_req_t *self) ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); ucp_eager_hdr_t hdr; - 
hdr.super.tag = req->send.tag.tag; + hdr.super.tag = req->send.msg_proto.tag.tag; return ucp_do_am_zcopy_single(self, UCP_AM_ID_EAGER_ONLY, &hdr, sizeof(hdr), ucp_proto_am_zcopy_req_complete); } @@ -173,10 +177,10 @@ static ucs_status_t ucp_tag_eager_zcopy_multi(uct_pending_req_t *self) ucp_eager_first_hdr_t first_hdr; ucp_eager_middle_hdr_t middle_hdr; - first_hdr.super.super.tag = req->send.tag.tag; + first_hdr.super.super.tag = req->send.msg_proto.tag.tag; first_hdr.total_len = req->send.length; - first_hdr.msg_id = req->send.tag.message_id; - middle_hdr.msg_id = req->send.tag.message_id; + first_hdr.msg_id = req->send.msg_proto.message_id; + middle_hdr.msg_id = req->send.msg_proto.message_id; middle_hdr.offset = req->send.state.dt.offset; return ucp_do_am_zcopy_multi(self, @@ -189,21 +193,19 @@ static ucs_status_t ucp_tag_eager_zcopy_multi(uct_pending_req_t *self) ucs_status_t ucp_tag_send_start_rndv(uct_pending_req_t *self); -const ucp_proto_t ucp_tag_eager_proto = { +const ucp_request_send_proto_t ucp_tag_eager_proto = { .contig_short = ucp_tag_eager_contig_short, .bcopy_single = ucp_tag_eager_bcopy_single, .bcopy_multi = ucp_tag_eager_bcopy_multi, .zcopy_single = ucp_tag_eager_zcopy_single, .zcopy_multi = ucp_tag_eager_zcopy_multi, .zcopy_completion = ucp_proto_am_zcopy_completion, - .only_hdr_size = sizeof(ucp_eager_hdr_t), - .first_hdr_size = sizeof(ucp_eager_first_hdr_t), - .mid_hdr_size = sizeof(ucp_eager_hdr_t) + .only_hdr_size = sizeof(ucp_eager_hdr_t) }; /* eager sync */ -void ucp_tag_eager_sync_completion(ucp_request_t *req, uint16_t flag, +void ucp_tag_eager_sync_completion(ucp_request_t *req, uint32_t flag, ucs_status_t status) { static const uint16_t all_completed = UCP_REQUEST_FLAG_LOCAL_COMPLETED | @@ -225,8 +227,6 @@ static ucs_status_t ucp_tag_eager_sync_bcopy_single(uct_pending_req_t *self) ucp_request_send_generic_dt_finish(req); ucp_tag_eager_sync_completion(req, UCP_REQUEST_FLAG_LOCAL_COMPLETED, UCS_OK); - } else if (status == 
UCP_STATUS_PENDING_SWITCH) { - status = UCS_OK; } return status; } @@ -236,7 +236,6 @@ static ucs_status_t ucp_tag_eager_sync_bcopy_multi(uct_pending_req_t *self) ucs_status_t status = ucp_do_am_bcopy_multi(self, UCP_AM_ID_EAGER_SYNC_FIRST, UCP_AM_ID_EAGER_MIDDLE, - sizeof(ucp_eager_middle_hdr_t), ucp_tag_pack_eager_sync_first_dt, ucp_tag_pack_eager_middle_dt, 1); if (status == UCS_OK) { @@ -253,10 +252,19 @@ static ucs_status_t ucp_tag_eager_sync_bcopy_multi(uct_pending_req_t *self) void ucp_tag_eager_sync_zcopy_req_complete(ucp_request_t *req, ucs_status_t status) { + ucs_assert(req->send.state.uct_comp.count == 0); + ucp_request_send_buffer_dereg(req); /* TODO register+lane change */ + ucp_tag_eager_sync_completion(req, UCP_REQUEST_FLAG_LOCAL_COMPLETED, + status); +} + +void ucp_tag_eager_sync_zcopy_completion(uct_completion_t *self, + ucs_status_t status) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, + send.state.uct_comp); if (req->send.state.dt.offset == req->send.length) { - ucp_request_send_buffer_dereg(req); /* TODO register+lane change */ - ucp_tag_eager_sync_completion(req, UCP_REQUEST_FLAG_LOCAL_COMPLETED, - status); + ucp_tag_eager_sync_zcopy_req_complete(req, status); } else if (status != UCS_OK) { ucs_fatal("error handling is not supported with tag-sync protocol"); } @@ -267,7 +275,7 @@ static ucs_status_t ucp_tag_eager_sync_zcopy_single(uct_pending_req_t *self) ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); ucp_eager_sync_hdr_t hdr; - hdr.super.super.tag = req->send.tag.tag; + hdr.super.super.tag = req->send.msg_proto.tag.tag; hdr.req.ep_ptr = ucp_request_get_dest_ep_ptr(req); hdr.req.reqptr = (uintptr_t)req; @@ -281,12 +289,12 @@ static ucs_status_t ucp_tag_eager_sync_zcopy_multi(uct_pending_req_t *self) ucp_eager_sync_first_hdr_t first_hdr; ucp_eager_middle_hdr_t middle_hdr; - first_hdr.super.super.super.tag = req->send.tag.tag; + first_hdr.super.super.super.tag = req->send.msg_proto.tag.tag; 
first_hdr.super.total_len = req->send.length; first_hdr.req.ep_ptr = ucp_request_get_dest_ep_ptr(req); first_hdr.req.reqptr = (uintptr_t)req; - first_hdr.super.msg_id = req->send.tag.message_id; - middle_hdr.msg_id = req->send.tag.message_id; + first_hdr.super.msg_id = req->send.msg_proto.message_id; + middle_hdr.msg_id = req->send.msg_proto.message_id; middle_hdr.offset = req->send.state.dt.offset; return ucp_do_am_zcopy_multi(self, @@ -297,24 +305,14 @@ static ucs_status_t ucp_tag_eager_sync_zcopy_multi(uct_pending_req_t *self) ucp_tag_eager_sync_zcopy_req_complete, 1); } -void ucp_tag_eager_sync_zcopy_completion(uct_completion_t *self, ucs_status_t status) -{ - ucp_request_t *req; - - req = ucs_container_of(self, ucp_request_t, send.state.uct_comp); - ucp_tag_eager_sync_zcopy_req_complete(req, status); -} - -const ucp_proto_t ucp_tag_eager_sync_proto = { +const ucp_request_send_proto_t ucp_tag_eager_sync_proto = { .contig_short = NULL, .bcopy_single = ucp_tag_eager_sync_bcopy_single, .bcopy_multi = ucp_tag_eager_sync_bcopy_multi, .zcopy_single = ucp_tag_eager_sync_zcopy_single, .zcopy_multi = ucp_tag_eager_sync_zcopy_multi, .zcopy_completion = ucp_tag_eager_sync_zcopy_completion, - .only_hdr_size = sizeof(ucp_eager_sync_hdr_t), - .first_hdr_size = sizeof(ucp_eager_sync_first_hdr_t), - .mid_hdr_size = sizeof(ucp_eager_hdr_t) + .only_hdr_size = sizeof(ucp_eager_sync_hdr_t) }; void ucp_tag_eager_sync_send_ack(ucp_worker_h worker, void *hdr, uint16_t recv_flags) @@ -330,28 +328,23 @@ void ucp_tag_eager_sync_send_ack(ucp_worker_h worker, void *hdr, uint16_t recv_f reqhdr = &((ucp_eager_sync_first_hdr_t*)hdr)->req; /* first */ } - req = ucp_request_get(worker); + if (recv_flags & UCP_RECV_DESC_FLAG_EAGER_OFFLOAD) { + ucp_tag_offload_sync_send_ack(worker, reqhdr->ep_ptr, + ((ucp_eager_sync_hdr_t*)hdr)->super.super.tag, + recv_flags); + return; + } + + ucs_assert(reqhdr->reqptr != 0); + req = ucp_proto_ssend_ack_request_alloc(worker, reqhdr->ep_ptr); if (req == NULL) { 
ucs_fatal("could not allocate request"); } - req->flags = 0; - req->send.ep = ucp_worker_get_ep_by_ptr(worker, reqhdr->ep_ptr); - req->send.uct.func = ucp_proto_progress_am_bcopy_single; - req->send.proto.comp_cb = ucp_request_put; - req->send.proto.status = UCS_OK; + req->send.proto.am_id = UCP_AM_ID_EAGER_SYNC_ACK; + req->send.proto.remote_request = reqhdr->reqptr; ucs_trace_req("send_sync_ack req %p ep %p", req, req->send.ep); - if (recv_flags & UCP_RECV_DESC_FLAG_EAGER_OFFLOAD) { - ucs_assert(recv_flags & UCP_RECV_DESC_FLAG_EAGER_ONLY); - req->send.proto.am_id = UCP_AM_ID_OFFLOAD_SYNC_ACK; - req->send.proto.sender_tag = ((ucp_eager_sync_hdr_t*)hdr)->super.super.tag; - } else { - ucs_assert(reqhdr->reqptr != 0); - req->send.proto.am_id = UCP_AM_ID_EAGER_SYNC_ACK; - req->send.proto.remote_request = reqhdr->reqptr; - } - ucp_request_send(req, 0); } diff --git a/src/ucp/tag/offload.c b/src/ucp/tag/offload.c index 3fe7fe7f961..240f6daf450 100644 --- a/src/ucp/tag/offload.c +++ b/src/ucp/tag/offload.c @@ -1,13 +1,17 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "offload.h" #include "eager.h" #include "rndv.h" -#include + #include #include #include @@ -16,7 +20,7 @@ #include -int ucp_tag_offload_iface_activate(ucp_worker_iface_t *iface) +void ucp_tag_offload_iface_activate(ucp_worker_iface_t *iface) { ucp_worker_t *worker = iface->worker; ucp_context_t *context = worker->context; @@ -26,7 +30,8 @@ int ucp_tag_offload_iface_activate(ucp_worker_iface_t *iface) ucs_assert(worker->tm.offload.zcopy_thresh == SIZE_MAX); ucs_assert(worker->tm.offload.iface == NULL); - worker->tm.offload.thresh = context->config.ext.tm_thresh; + worker->tm.offload.thresh = ucs_max(context->config.ext.tm_thresh, + iface->attr.cap.tag.recv.min_recv); worker->tm.offload.zcopy_thresh = context->config.ext.tm_max_bb_size; /* Cache active offload iface. Can use it if this will be the only @@ -41,8 +46,6 @@ int ucp_tag_offload_iface_activate(ucp_worker_iface_t *iface) iface->flags |= UCP_WORKER_IFACE_FLAG_OFFLOAD_ACTIVATED; ucs_debug("Activate tag offload iface %p", iface); - - return 1; } static UCS_F_ALWAYS_INLINE ucp_worker_iface_t* @@ -136,6 +139,7 @@ UCS_PROFILE_FUNC_VOID(ucp_tag_offload_rndv_cb, ucs_status_t status) { ucp_request_t *req = ucs_container_of(self, ucp_request_t, recv.uct_ctx); + void *header_host_copy; UCP_WORKER_STAT_TAG_OFFLOAD(req->recv.worker, MATCHED_SW_RNDV); @@ -147,7 +151,19 @@ UCS_PROFILE_FUNC_VOID(ucp_tag_offload_rndv_cb, } ucs_assert(header_length >= sizeof(ucp_rndv_rts_hdr_t)); - ucp_rndv_matched(req->recv.worker, req, header); + + if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(req->recv.mem_type)) { + ucp_rndv_matched(req->recv.worker, req, header); + } else { + /* SW rendezvous request is stored in the user buffer (temporarily) + when matched. If user buffer allocated on GPU memory, need to "pack" + it to the host memory staging buffer for further processing. 
*/ + header_host_copy = ucs_alloca(header_length); + ucp_mem_type_pack(req->recv.worker, header_host_copy, header, + header_length, req->recv.mem_type); + ucp_rndv_matched(req->recv.worker, req, header_host_copy); + } + ucp_tag_offload_release_buf(req, 0); } @@ -187,7 +203,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_tag_offload_unexp_rndv, dummy_rts->size = length; ucp_rkey_packed_copy(worker->context, UCS_BIT(md_index), - UCT_MD_MEM_TYPE_HOST, dummy_rts + 1, uct_rkeys); + UCS_MEMORY_TYPE_HOST, dummy_rts + 1, uct_rkeys); UCP_WORKER_STAT_TAG_OFFLOAD(worker, RX_UNEXP_RNDV); ucp_rndv_process_rts(worker, dummy_rts, dummy_rts_size, 0); @@ -234,7 +250,7 @@ UCS_PROFILE_FUNC_VOID(ucp_tag_offload_cancel, (worker, req, mode), } } -static UCS_F_ALWAYS_INLINE int +static UCS_F_ALWAYS_INLINE ucs_status_t ucp_tag_offload_do_post(ucp_request_t *req) { ucp_worker_t *worker = req->recv.worker; @@ -243,7 +259,7 @@ ucp_tag_offload_do_post(ucp_request_t *req) ucp_mem_desc_t *rdesc = NULL; ucp_worker_iface_t *wiface; ucs_status_t status; - ucp_rsc_index_t mdi; + ucp_md_index_t mdi; uct_iov_t iov; wiface = ucp_tag_offload_iface(worker, req->recv.tag.tag); @@ -254,7 +270,10 @@ ucp_tag_offload_do_post(ucp_request_t *req) mdi = context->tl_rscs[wiface->rsc_index].md_index; - if (ucs_unlikely(length >= worker->tm.offload.zcopy_thresh)) { + /* Do not use bounce buffer for receives to GPU memory to avoid + * cost of h2d transfers (i.e. cuda_copy from staging to dest memory). */ + if ((length >= worker->tm.offload.zcopy_thresh) || + !UCP_MEM_IS_ACCESSIBLE_FROM_CPU(req->recv.mem_type)) { if (length > wiface->attr.cap.tag.recv.max_zcopy) { /* Post maximum allowed length. If sender sends smaller message * (which is allowed per MPI standard), max recv should fit it. 
@@ -270,7 +289,9 @@ ucp_tag_offload_do_post(ucp_request_t *req) req->recv.length, req->recv.datatype, &req->recv.state, req->recv.mem_type, req, UCT_MD_MEM_FLAG_HIDE_ERRORS); - if (status != UCS_OK) { + if ((status != UCS_OK) || !req->recv.state.dt.contig.md_map) { + /* Can't register this buffer on the offload iface */ + UCP_WORKER_STAT_TAG_OFFLOAD(worker, BLOCK_MEM_REG); return status; } @@ -278,7 +299,7 @@ ucp_tag_offload_do_post(ucp_request_t *req) iov.buffer = (void*)req->recv.buffer; iov.memh = req->recv.state.dt.contig.memh[0]; } else { - rdesc = ucp_worker_mpool_get(worker); + rdesc = ucp_worker_mpool_get(&worker->reg_mp); if (rdesc == NULL) { return UCS_ERR_NO_MEMORY; } @@ -300,6 +321,9 @@ ucp_tag_offload_do_post(ucp_request_t *req) req->recv.tag.tag_mask, &iov, 1, &req->recv.uct_ctx); if (status != UCS_OK) { + ucs_assert((status == UCS_ERR_NO_RESOURCE) || + (status == UCS_ERR_EXCEEDS_LIMIT) || + (status == UCS_ERR_ALREADY_EXISTS)); /* No more matching entries in the transport. 
* TODO keep registration in case SW RNDV protocol will be used */ ucp_tag_offload_release_buf(req, 1); @@ -439,7 +463,8 @@ static ucs_status_t ucp_tag_offload_eager_short(uct_pending_req_t *self) req->send.lane = ucp_ep_get_tag_lane(ep); status = uct_ep_tag_eager_short(ep->uct_eps[req->send.lane], - req->send.tag.tag, req->send.buffer, + req->send.msg_proto.tag.tag, + req->send.buffer, req->send.length); if (status == UCS_OK) { ucp_request_complete_send(req, UCS_OK); @@ -457,10 +482,10 @@ ucp_do_tag_offload_bcopy(uct_pending_req_t *self, uint64_t imm_data, req->send.lane = ucp_ep_get_tag_lane(ep); packed_len = uct_ep_tag_eager_bcopy(ep->uct_eps[req->send.lane], - req->send.tag.tag, imm_data, - pack_cb, req, 0); + req->send.msg_proto.tag.tag, + imm_data, pack_cb, req, 0); if (packed_len < 0) { - return packed_len; + return (ucs_status_t)packed_len; } return UCS_OK; } @@ -483,7 +508,8 @@ ucp_do_tag_offload_zcopy(uct_pending_req_t *self, uint64_t imm_data, req->send.buffer, req->send.datatype, req->send.length, ucp_ep_md_index(ep, req->send.lane), NULL); - status = uct_ep_tag_eager_zcopy(ep->uct_eps[req->send.lane], req->send.tag.tag, + status = uct_ep_tag_eager_zcopy(ep->uct_eps[req->send.lane], + req->send.msg_proto.tag.tag, imm_data, iov, iovcnt, 0, &req->send.state.uct_comp); if (status == UCS_OK) { @@ -525,14 +551,19 @@ ucs_status_t ucp_tag_offload_sw_rndv(uct_pending_req_t *self) ucs_assert((UCP_DT_IS_CONTIG(req->send.datatype) && (req->send.length > ucp_ep_config(ep)->tag.offload.max_rndv_zcopy)) || - !UCP_DT_IS_CONTIG(req->send.datatype)); + !UCP_DT_IS_CONTIG(req->send.datatype) || + !(ep->worker->context->tl_mds[ucp_ep_md_index(ep, req->send.lane)].attr.cap. 
+ reg_mem_types & UCS_BIT(req->send.mem_type)) || + ep->worker->context->config.ext.tm_sw_rndv); /* send RTS to allow fallback to SW RNDV on receiver */ rndv_hdr_len = sizeof(ucp_rndv_rts_hdr_t) + ucp_ep_config(ep)->tag.rndv.rkey_size; rndv_rts_hdr = ucs_alloca(rndv_hdr_len); packed_len = ucp_tag_rndv_rts_pack(rndv_rts_hdr, req); - ucs_assert((rndv_rts_hdr->address != 0) || !UCP_DT_IS_CONTIG(req->send.datatype)); - return uct_ep_tag_rndv_request(ep->uct_eps[req->send.lane], req->send.tag.tag, + ucs_assert((rndv_rts_hdr->address != 0) || !UCP_DT_IS_CONTIG(req->send.datatype) || + !ucp_rndv_is_get_zcopy(req, ep->worker->context)); + return uct_ep_tag_rndv_request(ep->uct_eps[req->send.lane], + req->send.msg_proto.tag.tag, rndv_rts_hdr, packed_len, 0); } @@ -551,7 +582,7 @@ ucs_status_t ucp_tag_offload_rndv_zcopy(uct_pending_req_t *self) size_t max_iov = ucp_ep_config(ep)->tag.eager.max_iov; uct_iov_t *iov = ucs_alloca(max_iov * sizeof(uct_iov_t)); size_t iovcnt = 0; - ucp_rsc_index_t md_index; + ucp_md_index_t md_index; ucp_dt_state_t dt_state; void *rndv_op; @@ -572,8 +603,9 @@ ucs_status_t ucp_tag_offload_rndv_zcopy(uct_pending_req_t *self) req->send.buffer, req->send.datatype, req->send.length, ucp_ep_md_index(ep, req->send.lane), NULL); - rndv_op = uct_ep_tag_rndv_zcopy(ep->uct_eps[req->send.lane], req->send.tag.tag, - &rndv_hdr, sizeof(rndv_hdr), iov, iovcnt, 0, + rndv_op = uct_ep_tag_rndv_zcopy(ep->uct_eps[req->send.lane], + req->send.msg_proto.tag.tag, &rndv_hdr, + sizeof(rndv_hdr), iov, iovcnt, 0, &req->send.state.uct_comp); if (UCS_PTR_IS_ERR(rndv_op)) { return UCS_PTR_STATUS(rndv_op); @@ -597,33 +629,47 @@ void ucp_tag_offload_cancel_rndv(ucp_request_t *req) if (status != UCS_OK) { ucs_error("Failed to cancel tag rndv op %s", ucs_status_string(status)); } + + req->flags &= ~UCP_REQUEST_FLAG_OFFLOADED; } ucs_status_t ucp_tag_offload_start_rndv(ucp_request_t *sreq) { + ucp_ep_t *ep = sreq->send.ep; + ucp_context_t *context = ep->worker->context; + 
ucp_md_index_t mdi = ucp_ep_md_index(ep, sreq->send.lane); + uct_md_attr_t *md_attr = &context->tl_mds[mdi].attr; ucs_status_t status; - ucp_ep_t *ep = sreq->send.ep; /* should be set by ucp_tag_send_req_init() */ ucs_assert(sreq->send.lane == ucp_ep_get_tag_lane(ep)); - if (UCP_DT_IS_CONTIG(sreq->send.datatype)) { - status = ucp_request_send_buffer_reg_lane(sreq, sreq->send.lane); - if (status != UCS_OK) { - return status; - } - } - if (UCP_DT_IS_CONTIG(sreq->send.datatype) && - (sreq->send.length <= ucp_ep_config(ep)->tag.offload.max_rndv_zcopy)) { + !context->config.ext.tm_sw_rndv && + (sreq->send.length <= ucp_ep_config(ep)->tag.offload.max_rndv_zcopy) && + (md_attr->cap.reg_mem_types & UCS_BIT(sreq->send.mem_type))) { ucp_request_send_state_reset(sreq, ucp_tag_offload_rndv_zcopy_completion, UCP_REQUEST_SEND_PROTO_RNDV_GET); + /* Register send buffer with tag lane, because tag offload rndv + * protocol will perform RDMA_READ on it (if it arrives expectedly) */ + status = ucp_request_send_buffer_reg_lane(sreq, sreq->send.lane, 0); + if (status != UCS_OK) { + return status; + } + /* contiguous buffer, offload can be used, but only a single lane */ sreq->send.uct.func = ucp_tag_offload_rndv_zcopy; } else { ucp_request_send_state_reset(sreq, NULL, UCP_REQUEST_SEND_PROTO_RNDV_GET); + /* RNDV will be performed by the SW - can register with SW RNDV lanes + * to get multirail benefits */ + status = ucp_tag_rndv_reg_send_buffer(sreq); + if (status != UCS_OK) { + return status; + } + /* offload enabled but can't be used */ sreq->send.uct.func = ucp_tag_offload_sw_rndv; } @@ -631,23 +677,21 @@ ucs_status_t ucp_tag_offload_start_rndv(ucp_request_t *sreq) return UCS_OK; } -const ucp_proto_t ucp_tag_offload_proto = { +const ucp_request_send_proto_t ucp_tag_offload_proto = { .contig_short = ucp_tag_offload_eager_short, .bcopy_single = ucp_tag_offload_eager_bcopy, .bcopy_multi = NULL, .zcopy_single = ucp_tag_offload_eager_zcopy, .zcopy_multi = NULL, .zcopy_completion = 
ucp_proto_am_zcopy_completion, - .only_hdr_size = 0, - .first_hdr_size = 0, - .mid_hdr_size = 0 + .only_hdr_size = 0 }; /* Eager sync */ static UCS_F_ALWAYS_INLINE void ucp_tag_offload_sync_posted(ucp_worker_t *worker, ucp_request_t *req) { - req->send.tag_offload.ssend_tag = req->send.tag.tag; + req->send.tag_offload.ssend_tag = req->send.msg_proto.tag.tag; ucs_queue_push(&worker->tm.offload.sync_reqs, &req->send.tag_offload.queue); } @@ -682,14 +726,33 @@ static ucs_status_t ucp_tag_offload_eager_sync_zcopy(uct_pending_req_t *self) return status; } -const ucp_proto_t ucp_tag_offload_sync_proto = { +void ucp_tag_offload_sync_send_ack(ucp_worker_h worker, uintptr_t ep_ptr, + ucp_tag_t stag, uint16_t recv_flags) +{ + ucp_request_t *req; + + ucs_assert(recv_flags & UCP_RECV_DESC_FLAG_EAGER_OFFLOAD); + + req = ucp_proto_ssend_ack_request_alloc(worker, ep_ptr); + if (req == NULL) { + ucs_fatal("could not allocate request"); + } + + req->send.proto.am_id = UCP_AM_ID_OFFLOAD_SYNC_ACK; + req->send.proto.sender_tag = stag; + + ucs_trace_req("tag_offload send_sync_ack ep 0x%lx tag %"PRIx64"", + ep_ptr, stag); + + ucp_request_send(req, 0); +} + +const ucp_request_send_proto_t ucp_tag_offload_sync_proto = { .contig_short = NULL, .bcopy_single = ucp_tag_offload_eager_sync_bcopy, .bcopy_multi = NULL, .zcopy_single = ucp_tag_offload_eager_sync_zcopy, .zcopy_multi = NULL, .zcopy_completion = ucp_tag_eager_sync_zcopy_completion, - .only_hdr_size = 0, - .first_hdr_size = 0, - .mid_hdr_size = 0 + .only_hdr_size = 0 }; diff --git a/src/ucp/tag/offload.h b/src/ucp/tag/offload.h index fbdc4be7503..39cb97c348f 100644 --- a/src/ucp/tag/offload.h +++ b/src/ucp/tag/offload.h @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -7,9 +7,9 @@ #ifndef UCP_TAG_OFFLOAD_H_ #define UCP_TAG_OFFLOAD_H_ +#include #include #include -#include #include @@ -37,9 +37,18 @@ typedef struct { } UCS_S_PACKED ucp_offload_ssend_hdr_t; -extern const ucp_proto_t ucp_tag_offload_proto; +/** + * Header for multi-fragmented sync send acknowledgment + * (carried by last fragment) + */ +typedef struct { + ucp_eager_middle_hdr_t super; + ucp_offload_ssend_hdr_t ssend_ack; +} UCS_S_PACKED ucp_offload_last_ssend_hdr_t; + -extern const ucp_proto_t ucp_tag_offload_sync_proto; +extern const ucp_request_send_proto_t ucp_tag_offload_proto; +extern const ucp_request_send_proto_t ucp_tag_offload_sync_proto; ucs_status_t ucp_tag_offload_rndv_zcopy(uct_pending_req_t *self); @@ -50,7 +59,8 @@ void ucp_tag_offload_cancel_rndv(ucp_request_t *req); ucs_status_t ucp_tag_offload_start_rndv(ucp_request_t *sreq); ucs_status_t ucp_tag_offload_unexp_eager(void *arg, void *data, size_t length, - unsigned flags, uct_tag_t stag, uint64_t imm); + unsigned flags, uct_tag_t stag, + uint64_t imm, void **context); ucs_status_t ucp_tag_offload_unexp_rndv(void *arg, unsigned flags, uint64_t stag, @@ -62,15 +72,15 @@ void ucp_tag_offload_cancel(ucp_worker_t *worker, ucp_request_t *req, unsigned m int ucp_tag_offload_post(ucp_request_t *req, ucp_request_queue_t *req_queue); +void ucp_tag_offload_sync_send_ack(ucp_worker_h worker, uintptr_t ep_ptr, + ucp_tag_t stag, uint16_t recv_flags); + /** * @brief Activate tag offload interface * * @param [in] wiface UCP worker interface. 
- * - * @return 0 - if tag offloading is disabled in the configuration - * 1 - wiface interface is activated (if it was inactive before) */ -int ucp_tag_offload_iface_activate(ucp_worker_iface_t *wiface); +void ucp_tag_offload_iface_activate(ucp_worker_iface_t *wiface); static UCS_F_ALWAYS_INLINE void ucp_tag_offload_try_post(ucp_worker_t *worker, ucp_request_t *req, @@ -126,21 +136,26 @@ ucp_tag_offload_unexp(ucp_worker_iface_t *wiface, ucp_tag_t tag, size_t length) ++wiface->proxy_recv_count; if (ucs_unlikely(!(wiface->flags & UCP_WORKER_IFACE_FLAG_OFFLOAD_ACTIVATED))) { - if (!ucp_tag_offload_iface_activate(wiface)) { - return; - } + ucp_tag_offload_iface_activate(wiface); } + /* Need to hash all tags of messages arriving to offload-capable interface + if more than one interface is activated on the worker. This is needed to + avoid unwanted postings of receive buffers (those, which are expected to + arrive from offload incapable iface) to the HW. */ if (ucs_unlikely((length >= worker->tm.offload.thresh) && (worker->num_active_ifaces > 1))) { tag_key = worker->context->config.tag_sender_mask & tag; + hash_it = kh_get(ucp_tag_offload_hash, &worker->tm.offload.tag_hash, + tag_key); + if (ucs_likely(hash_it != kh_end(&worker->tm.offload.tag_hash))) { + return; + } + hash_it = kh_put(ucp_tag_offload_hash, &worker->tm.offload.tag_hash, tag_key, &ret); - - /* khash returns 1 or 2 if key is not present and value can be set */ - if (ret > 0) { - kh_value(&worker->tm.offload.tag_hash, hash_it) = wiface; - } + ucs_assertv((ret == 1) || (ret == 2), "ret=%d", ret); + kh_value(&worker->tm.offload.tag_hash, hash_it) = wiface; } } diff --git a/src/ucp/tag/probe.c b/src/ucp/tag/probe.c index ec80139fa7c..a3cee88ccd6 100644 --- a/src/ucp/tag/probe.c +++ b/src/ucp/tag/probe.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "eager.h" #include "rndv.h" #include "tag_match.inl" @@ -12,10 +16,10 @@ #include #include - -ucp_tag_message_h ucp_tag_probe_nb(ucp_worker_h worker, ucp_tag_t tag, - ucp_tag_t tag_mask, int remove, - ucp_tag_recv_info_t *info) +UCS_PROFILE_FUNC(ucp_tag_message_h, ucp_tag_probe_nb, + (worker, tag, tag_mask, rem, info), + ucp_worker_h worker, ucp_tag_t tag, ucp_tag_t tag_mask, + int rem, ucp_tag_recv_info_t *info) { ucp_context_h UCS_V_UNUSED context = worker->context; ucp_recv_desc_t *rdesc; @@ -26,9 +30,9 @@ ucp_tag_message_h ucp_tag_probe_nb(ucp_worker_h worker, ucp_tag_t tag, UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); ucs_trace_req("probe_nb tag %"PRIx64"/%"PRIx64" remove=%d", tag, tag_mask, - remove); + rem); - rdesc = ucp_tag_unexp_search(&worker->tm, tag, tag_mask, remove, "probe"); + rdesc = ucp_tag_unexp_search(&worker->tm, tag, tag_mask, rem, "probe"); if (rdesc != NULL) { flags = rdesc->flags; info->sender_tag = ucp_rdesc_get_tag(rdesc); diff --git a/src/ucp/tag/rndv.c b/src/ucp/tag/rndv.c index 919f68d4b3c..860a941e7b5 100644 --- a/src/ucp/tag/rndv.c +++ b/src/ucp/tag/rndv.c @@ -1,43 +1,80 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rndv.h" #include "tag_match.inl" - #include "offload.h" -#include + #include #include -static int ucp_rndv_is_get_zcopy(ucp_request_t *sreq, ucp_rndv_mode_t rndv_mode) +static int ucp_rndv_is_recv_pipeline_needed(ucp_request_t *rndv_req, + const ucp_rndv_rts_hdr_t *rndv_rts_hdr, + ucs_memory_type_t mem_type, + int is_get_zcopy_failed) { - return ((rndv_mode == UCP_RNDV_MODE_GET_ZCOPY) || - ((rndv_mode == UCP_RNDV_MODE_AUTO) && - (UCP_MEM_IS_HOST(sreq->send.mem_type) || - UCP_MEM_IS_ROCM(sreq->send.mem_type)))); -} - -static int ucp_rndv_is_pipeline_needed(ucp_request_t *sreq) { + const ucp_ep_config_t *ep_config = ucp_ep_config(rndv_req->send.ep); + ucp_context_h context = rndv_req->send.ep->worker->context; + int found = 0; + ucp_md_index_t md_index; uct_md_attr_t *md_attr; - unsigned md_index; + uint64_t mem_types; + int i; + + for (i = 0; + (i < UCP_MAX_LANES) && + (ep_config->key.rma_bw_lanes[i] != UCP_NULL_LANE); i++) { + md_index = ep_config->md_index[ep_config->key.rma_bw_lanes[i]]; + if (context->tl_mds[md_index].attr.cap.access_mem_type == UCS_MEMORY_TYPE_HOST) { + found = 1; + break; + } + } - if (UCP_MEM_IS_HOST(sreq->send.mem_type)) { + /* no host bw lanes for pipeline staging */ + if (!found) { return 0; } - if (sreq->send.ep->worker->context->config.ext.rndv_mode == - UCP_RNDV_MODE_PUT_ZCOPY) { - return 0; + if (is_get_zcopy_failed) { + return 1; } - md_index = ucp_ep_md_index(sreq->send.ep, sreq->send.lane); - md_attr = &sreq->send.ep->worker->context->tl_mds[md_index].attr; + /* disqualify recv side pipeline if + * a mem_type bw lane exist AND + * lane can do RMA on remote mem_type + */ + mem_types = UCS_BIT(mem_type); + if (rndv_rts_hdr->address) { + mem_types |= UCS_BIT(ucp_rkey_packed_mem_type(rndv_rts_hdr + 1)); + } - /* check if lane support only mem type */ - return md_attr->cap.reg_mem_types & UCS_BIT(UCT_MD_MEM_TYPE_HOST); + ucs_for_each_bit(md_index, 
ep_config->key.rma_bw_md_map) { + md_attr = &context->tl_mds[md_index].attr; + if (ucs_test_all_flags(md_attr->cap.reg_mem_types, mem_types)) { + return 0; + } + } + + return 1; +} + +static int ucp_rndv_is_put_pipeline_needed(uintptr_t remote_address, + size_t length, size_t min_get_zcopy, + size_t max_get_zcopy, + int is_get_zcopy_failed) +{ + /* fallback to PUT pipeline if remote mem type is non-HOST memory OR + * can't do GET ZCOPY */ + return ((remote_address == 0) || (max_get_zcopy == 0) || + (length < min_get_zcopy) || is_get_zcopy_failed); } size_t ucp_tag_rndv_rts_pack(void *dest, void *arg) @@ -47,14 +84,14 @@ size_t ucp_tag_rndv_rts_pack(void *dest, void *arg) ucp_worker_h worker = sreq->send.ep->worker; ssize_t packed_rkey_size; - rndv_rts_hdr->super.tag = sreq->send.tag.tag; + rndv_rts_hdr->super.tag = sreq->send.msg_proto.tag.tag; rndv_rts_hdr->sreq.reqptr = (uintptr_t)sreq; rndv_rts_hdr->sreq.ep_ptr = ucp_request_get_dest_ep_ptr(sreq); rndv_rts_hdr->size = sreq->send.length; /* Pack remote keys (which can be empty list) */ if (UCP_DT_IS_CONTIG(sreq->send.datatype) && - ucp_rndv_is_get_zcopy(sreq, worker->context->config.ext.rndv_mode)) { + ucp_rndv_is_get_zcopy(sreq, worker->context)) { /* pack rkey, ask target to do get_zcopy */ rndv_rts_hdr->address = (uintptr_t)sreq->send.buffer; packed_rkey_size = ucp_rkey_pack_uct(worker->context, @@ -64,8 +101,11 @@ size_t ucp_tag_rndv_rts_pack(void *dest, void *arg) rndv_rts_hdr + 1); if (packed_rkey_size < 0) { ucs_fatal("failed to pack rendezvous remote key: %s", - ucs_status_string(packed_rkey_size)); + ucs_status_string((ucs_status_t)packed_rkey_size)); } + + ucs_assert(packed_rkey_size <= + ucp_ep_config(sreq->send.ep)->tag.rndv.rkey_size); } else { rndv_rts_hdr->address = 0; packed_rkey_size = 0; @@ -77,8 +117,13 @@ size_t ucp_tag_rndv_rts_pack(void *dest, void *arg) UCS_PROFILE_FUNC(ucs_status_t, ucp_proto_progress_rndv_rts, (self), uct_pending_req_t *self) { + ucp_request_t *sreq = 
ucs_container_of(self, ucp_request_t, send.uct); + size_t packed_rkey_size; + /* send the RTS. the pack_cb will pack all the necessary fields in the RTS */ - return ucp_do_am_bcopy_single(self, UCP_AM_ID_RNDV_RTS, ucp_tag_rndv_rts_pack); + packed_rkey_size = ucp_ep_config(sreq->send.ep)->tag.rndv.rkey_size; + return ucp_do_am_single(self, UCP_AM_ID_RNDV_RTS, ucp_tag_rndv_rts_pack, + sizeof(ucp_rndv_rts_hdr_t) + packed_rkey_size); } static size_t ucp_tag_rndv_rtr_pack(void *dest, void *arg) @@ -94,6 +139,9 @@ static size_t ucp_tag_rndv_rtr_pack(void *dest, void *arg) /* Pack remote keys (which can be empty list) */ if (UCP_DT_IS_CONTIG(rreq->recv.datatype)) { rndv_rtr_hdr->address = (uintptr_t)rreq->recv.buffer; + rndv_rtr_hdr->size = rndv_req->send.rndv_rtr.length; + rndv_rtr_hdr->offset = rndv_req->send.rndv_rtr.offset; + packed_rkey_size = ucp_rkey_pack_uct(rndv_req->send.ep->worker->context, rreq->recv.state.dt.contig.md_map, rreq->recv.state.dt.contig.memh, @@ -104,6 +152,8 @@ static size_t ucp_tag_rndv_rtr_pack(void *dest, void *arg) } } else { rndv_rtr_hdr->address = 0; + rndv_rtr_hdr->size = 0; + rndv_rtr_hdr->offset = 0; packed_rkey_size = 0; } @@ -114,23 +164,52 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_proto_progress_rndv_rtr, (self), uct_pending_req_t *self) { ucp_request_t *rndv_req = ucs_container_of(self, ucp_request_t, send.uct); + size_t packed_rkey_size; ucs_status_t status; /* send the RTR. 
the pack_cb will pack all the necessary fields in the RTR */ - status = ucp_do_am_bcopy_single(self, UCP_AM_ID_RNDV_RTR, ucp_tag_rndv_rtr_pack); + packed_rkey_size = ucp_ep_config(rndv_req->send.ep)->tag.rndv.rkey_size; + status = ucp_do_am_single(self, UCP_AM_ID_RNDV_RTR, ucp_tag_rndv_rtr_pack, + sizeof(ucp_rndv_rtr_hdr_t) + packed_rkey_size); if (status == UCS_OK) { + /* release rndv request */ ucp_request_put(rndv_req); } return status; } -ucs_status_t ucp_tag_send_start_rndv(ucp_request_t *sreq) +ucs_status_t ucp_tag_rndv_reg_send_buffer(ucp_request_t *sreq) { ucp_ep_h ep = sreq->send.ep; ucp_md_map_t md_map; ucs_status_t status; + if (UCP_DT_IS_CONTIG(sreq->send.datatype) && + ucp_rndv_is_get_zcopy(sreq, ep->worker->context)) { + + /* register a contiguous buffer for rma_get */ + md_map = ucp_ep_config(ep)->key.rma_bw_md_map; + + /* Pass UCT_MD_MEM_FLAG_HIDE_ERRORS flag, because registration may fail + * if md does not support send memory type (e.g. CUDA memory). In this + * case RTS will be sent with empty key, and sender will fallback to + * PUT or pipeline protocols. 
*/ + status = ucp_request_send_buffer_reg(sreq, md_map, + UCT_MD_MEM_FLAG_HIDE_ERRORS); + if (status != UCS_OK) { + return status; + } + } + + return UCS_OK; +} + +ucs_status_t ucp_tag_send_start_rndv(ucp_request_t *sreq) +{ + ucp_ep_h ep = sreq->send.ep; + ucs_status_t status; + ucp_trace_req(sreq, "start_rndv to %s buffer %p length %zu", ucp_ep_peer_name(ep), sreq->send.buffer, sreq->send.length); @@ -143,46 +222,80 @@ ucs_status_t ucp_tag_send_start_rndv(ucp_request_t *sreq) if (ucp_ep_is_tag_offload_enabled(ucp_ep_config(ep))) { status = ucp_tag_offload_start_rndv(sreq); - if (status != UCS_OK) { - return status; - } } else { - if (UCP_DT_IS_CONTIG(sreq->send.datatype) && - ucp_rndv_is_get_zcopy(sreq, ep->worker->context->config.ext.rndv_mode)) { - /* register a contiguous buffer for rma_get */ - md_map = ucp_ep_config(ep)->key.rma_bw_md_map; - status = ucp_request_send_buffer_reg(sreq, md_map); - if (status != UCS_OK) { - return status; - } - } - ucs_assert(sreq->send.lane == ucp_ep_get_am_lane(ep)); sreq->send.uct.func = ucp_proto_progress_rndv_rts; + status = ucp_tag_rndv_reg_send_buffer(sreq); } - return UCS_OK; + return status; +} + +static UCS_F_ALWAYS_INLINE size_t +ucp_rndv_adjust_zcopy_length(size_t min_zcopy, size_t max_zcopy, size_t align, + size_t send_length, size_t offset, size_t length) +{ + size_t result_length, tail; + + ucs_assert(length > 0); + + /* ensure that the current length is over min_zcopy */ + result_length = ucs_max(length, min_zcopy); + + /* ensure that the current length is less than max_zcopy */ + result_length = ucs_min(result_length, max_zcopy); + + /* ensure that tail (rest of message) is over min_zcopy */ + ucs_assertv(send_length >= (offset + result_length), + "send_length=%zu, offset=%zu, length=%zu", + send_length, offset, result_length); + tail = send_length - (offset + result_length); + if (ucs_unlikely((tail != 0) && (tail < min_zcopy))) { + /* ok, tail is less zcopy minimal & could not be processed as + * standalone 
operation */ + /* check if we have room to increase current part and not + * step over max_zcopy */ + if (result_length < (max_zcopy - tail)) { + /* if we can increase length by min_zcopy - let's do it to + * avoid small tail (we have limitation on minimal get zcopy) */ + result_length += tail; + } else { + /* reduce current length by align or min_zcopy value + * to process it on next round */ + ucs_assert(result_length > ucs_max(min_zcopy, align)); + result_length -= ucs_max(min_zcopy, align); + } + } + + ucs_assertv(result_length >= min_zcopy, "length=%zu, min_zcopy=%zu", + result_length, min_zcopy); + ucs_assertv(((send_length - (offset + result_length)) == 0) || + ((send_length - (offset + result_length)) >= min_zcopy), + "send_length=%zu, offset=%zu, length=%zu, min_zcopy=%zu", + send_length, offset, result_length, min_zcopy); + + return result_length; } -static void ucp_rndv_complete_send(ucp_request_t *sreq) +static void ucp_rndv_complete_send(ucp_request_t *sreq, ucs_status_t status) { ucp_request_send_generic_dt_finish(sreq); ucp_request_send_buffer_dereg(sreq); - ucp_request_complete_send(sreq, UCS_OK); + ucp_request_complete_send(sreq, status); } static void ucp_rndv_req_send_ats(ucp_request_t *rndv_req, ucp_request_t *rreq, - uintptr_t remote_request) + uintptr_t remote_request, ucs_status_t status) { ucp_trace_req(rndv_req, "send ats remote_request 0x%lx", remote_request); UCS_PROFILE_REQUEST_EVENT(rreq, "send_ats", 0); - rndv_req->send.lane = ucp_ep_get_am_lane(rndv_req->send.ep); - rndv_req->send.uct.func = ucp_proto_progress_am_bcopy_single; - rndv_req->send.proto.am_id = UCP_AM_ID_RNDV_ATS; - rndv_req->send.proto.status = UCS_OK; + rndv_req->send.lane = ucp_ep_get_am_lane(rndv_req->send.ep); + rndv_req->send.uct.func = ucp_proto_progress_am_single; + rndv_req->send.proto.am_id = UCP_AM_ID_RNDV_ATS; + rndv_req->send.proto.status = status; rndv_req->send.proto.remote_request = remote_request; - rndv_req->send.proto.comp_cb = ucp_request_put; + 
rndv_req->send.proto.comp_cb = ucp_request_put; ucp_request_send(rndv_req, 0); } @@ -210,7 +323,7 @@ static void ucp_rndv_send_atp(ucp_request_t *sreq, uintptr_t remote_request) ucp_rkey_destroy(sreq->send.rndv_put.rkey); sreq->send.lane = ucp_ep_get_am_lane(sreq->send.ep); - sreq->send.uct.func = ucp_proto_progress_am_bcopy_single; + sreq->send.uct.func = ucp_proto_progress_am_single; sreq->send.proto.am_id = UCP_AM_ID_RNDV_ATP; sreq->send.proto.status = UCS_OK; sreq->send.proto.remote_request = remote_request; @@ -219,13 +332,49 @@ static void ucp_rndv_send_atp(ucp_request_t *sreq, uintptr_t remote_request) ucp_request_send(sreq, 0); } +UCS_PROFILE_FUNC_VOID(ucp_rndv_complete_frag_rma_put_zcopy, (fsreq), + ucp_request_t *fsreq) +{ + ucp_request_t *sreq = fsreq->send.proto.sreq; + + sreq->send.state.dt.offset += fsreq->send.length; + + /* delete fragments send request */ + ucp_request_put(fsreq); + + /* complete send request after put completions of all fragments */ + if (sreq->send.state.dt.offset == sreq->send.length) { + ucp_rndv_complete_rma_put_zcopy(sreq); + } +} + +static void ucp_rndv_send_frag_atp(ucp_request_t *fsreq, uintptr_t remote_request) +{ + ucp_trace_req(fsreq, "send frag atp remote_request 0x%lx", remote_request); + UCS_PROFILE_REQUEST_EVENT(fsreq, "send_frag_atp", 0); + + /* destroy rkey before it gets overridden by ATP protocol data */ + ucp_rkey_destroy(fsreq->send.rndv_put.rkey); + + fsreq->send.lane = ucp_ep_get_am_lane(fsreq->send.ep); + fsreq->send.uct.func = ucp_proto_progress_am_single; + fsreq->send.proto.sreq = fsreq->send.rndv_put.sreq; + fsreq->send.proto.am_id = UCP_AM_ID_RNDV_ATP; + fsreq->send.proto.status = UCS_OK; + fsreq->send.proto.remote_request = remote_request; + fsreq->send.proto.comp_cb = ucp_rndv_complete_frag_rma_put_zcopy; + + ucp_request_send(fsreq, 0); +} + static void ucp_rndv_zcopy_recv_req_complete(ucp_request_t *req, ucs_status_t status) { ucp_request_recv_buffer_dereg(req); ucp_request_complete_tag_recv(req, 
status); } -static void ucp_rndv_complete_rma_get_zcopy(ucp_request_t *rndv_req) +static void ucp_rndv_complete_rma_get_zcopy(ucp_request_t *rndv_req, + ucs_status_t status) { ucp_request_t *rreq = rndv_req->send.rndv_get.rreq; @@ -233,14 +382,22 @@ static void ucp_rndv_complete_rma_get_zcopy(ucp_request_t *rndv_req) "rndv_req=%p offset=%zu length=%zu", rndv_req, rndv_req->send.state.dt.offset, rndv_req->send.length); - ucp_trace_req(rndv_req, "rndv_get completed"); + ucp_trace_req(rndv_req, "rndv_get completed with status %s", + ucs_status_string(status)); UCS_PROFILE_REQUEST_EVENT(rreq, "complete_rndv_get", 0); ucp_rkey_destroy(rndv_req->send.rndv_get.rkey); ucp_request_send_buffer_dereg(rndv_req); - ucp_rndv_req_send_ats(rndv_req, rreq, rndv_req->send.rndv_get.remote_request); - ucp_rndv_zcopy_recv_req_complete(rreq, UCS_OK); + if (status == UCS_OK) { + ucp_rndv_req_send_ats(rndv_req, rreq, + rndv_req->send.rndv_get.remote_request, UCS_OK); + } else { + /* if completing RNDV with the error, just release RNDV request */ + ucp_request_put(rndv_req); + } + + ucp_rndv_zcopy_recv_req_complete(rreq, status); } static void ucp_rndv_recv_data_init(ucp_request_t *rreq, size_t size) @@ -250,7 +407,8 @@ static void ucp_rndv_recv_data_init(ucp_request_t *rreq, size_t size) } static void ucp_rndv_req_send_rtr(ucp_request_t *rndv_req, ucp_request_t *rreq, - uintptr_t sender_reqptr) + uintptr_t sender_reqptr, size_t recv_length, + size_t offset) { ucp_trace_req(rndv_req, "send rtr remote sreq 0x%lx rreq %p", sender_reqptr, rreq); @@ -259,66 +417,40 @@ static void ucp_rndv_req_send_rtr(ucp_request_t *rndv_req, ucp_request_t *rreq, rndv_req->send.uct.func = ucp_proto_progress_rndv_rtr; rndv_req->send.rndv_rtr.remote_request = sender_reqptr; rndv_req->send.rndv_rtr.rreq = rreq; + rndv_req->send.rndv_rtr.length = recv_length; + rndv_req->send.rndv_rtr.offset = offset; ucp_request_send(rndv_req, 0); } -static void ucp_rndv_get_lanes_count(ucp_request_t *req) +static 
ucp_lane_index_t +ucp_rndv_get_zcopy_get_lane(ucp_request_t *rndv_req, uct_rkey_t *uct_rkey) { - ucp_ep_h ep = req->send.ep; - ucp_lane_map_t map = 0; - uct_rkey_t uct_rkey; - ucp_lane_index_t lane; + ucp_lane_index_t lane_idx; + ucp_ep_config_t *ep_config; + ucp_rkey_h rkey; + uint8_t rkey_index; - if (ucs_likely(req->send.rndv_get.lane_count != 0)) { - return; /* already resolved */ - } - - while ((lane = ucp_rkey_get_rma_bw_lane(req->send.rndv_get.rkey, ep, req->send.mem_type, - &uct_rkey, map)) != UCP_NULL_LANE) { - req->send.rndv_get.lane_count++; - map |= UCS_BIT(lane); + if (ucs_unlikely(!rndv_req->send.rndv_get.lanes_map_all)) { + return UCP_NULL_LANE; } - req->send.rndv_get.lane_count = ucs_min(req->send.rndv_get.lane_count, - ep->worker->context->config.ext.max_rndv_lanes); + lane_idx = ucs_ffs64_safe(rndv_req->send.rndv_get.lanes_map_avail); + ucs_assert(lane_idx < UCP_MAX_LANES); + rkey = rndv_req->send.rndv_get.rkey; + rkey_index = rndv_req->send.rndv_get.rkey_index[lane_idx]; + *uct_rkey = (rkey_index != UCP_NULL_RESOURCE) ? + rkey->tl_rkey[rkey_index].rkey.rkey : UCT_INVALID_RKEY; + ep_config = ucp_ep_config(rndv_req->send.ep); + return ep_config->tag.rndv.get_zcopy_lanes[lane_idx]; } -static ucp_lane_index_t ucp_rndv_get_next_lane(ucp_request_t *rndv_req, uct_rkey_t *uct_rkey) +static void ucp_rndv_get_zcopy_next_lane(ucp_request_t *rndv_req) { - /* get lane and mask it for next iteration. - * next time this lane will not be selected & we continue - * with another lane. After all lanes are masked - reset mask - * to zero & start from scratch. 
this way allows to enumerate - * all lanes */ - ucp_ep_h ep = rndv_req->send.ep; - ucp_lane_index_t lane; - - lane = ucp_rkey_get_rma_bw_lane(rndv_req->send.rndv_get.rkey, ep, rndv_req->send.mem_type, - uct_rkey, rndv_req->send.rndv_get.lanes_map); - - if ((lane == UCP_NULL_LANE) && (rndv_req->send.rndv_get.lanes_map != 0)) { - /* lanes_map != 0 - no more lanes (but BW lanes are exist because map - * is not NULL - we found at least one lane on previous iteration). - * reset used lanes map to NULL and iterate it again */ - rndv_req->send.rndv_get.lanes_map = 0; - lane = ucp_rkey_get_rma_bw_lane(rndv_req->send.rndv_get.rkey, ep, rndv_req->send.mem_type, - uct_rkey, rndv_req->send.rndv_get.lanes_map); + rndv_req->send.rndv_get.lanes_map_avail &= rndv_req->send.rndv_get.lanes_map_avail - 1; + if (!rndv_req->send.rndv_get.lanes_map_avail) { + rndv_req->send.rndv_get.lanes_map_avail = rndv_req->send.rndv_get.lanes_map_all; } - - if (ucs_unlikely(lane == UCP_NULL_LANE)) { - /* there are no BW lanes */ - return UCP_NULL_LANE; - } - - rndv_req->send.rndv_get.lanes_map |= UCS_BIT(lane); - /* in case if masked too much lanes - reset mask to zero - * to select first lane next time */ - if (ucs_popcount(rndv_req->send.rndv_get.lanes_map) >= - ep->worker->context->config.ext.max_rndv_lanes) { - rndv_req->send.rndv_get.lanes_map = 0; - } - return lane; } UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_rma_get_zcopy, (self), @@ -330,7 +462,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_rma_get_zcopy, (self), const size_t max_iovcnt = 1; uct_iface_attr_t* attrs; ucs_status_t status; - size_t offset, length, ucp_mtu, remainder, align, chunk; + size_t offset, length, ucp_mtu, remaining, align, chunk; uct_iov_t iov[max_iovcnt]; size_t iovcnt; ucp_rsc_index_t rsc_index; @@ -338,14 +470,11 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_rma_get_zcopy, (self), uct_rkey_t uct_rkey; size_t min_zcopy; size_t max_zcopy; - size_t tail; int pending_add_res; ucp_lane_index_t lane; - 
ucp_rndv_get_lanes_count(rndv_req); - /* Figure out which lane to use for get operation */ - rndv_req->send.lane = lane = ucp_rndv_get_next_lane(rndv_req, &uct_rkey); + rndv_req->send.lane = lane = ucp_rndv_get_zcopy_get_lane(rndv_req, &uct_rkey); if (lane == UCP_NULL_LANE) { /* If can't perform get_zcopy - switch to active-message. @@ -354,11 +483,17 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_rma_get_zcopy, (self), ucp_rkey_destroy(rndv_req->send.rndv_get.rkey); ucp_rndv_recv_data_init(rndv_req->send.rndv_get.rreq, rndv_req->send.length); + /* Update statistics counters from get_zcopy to rtr */ + UCP_WORKER_STAT_RNDV(ep->worker, GET_ZCOPY, -1); + UCP_WORKER_STAT_RNDV(ep->worker, SEND_RTR, +1); ucp_rndv_req_send_rtr(rndv_req, rndv_req->send.rndv_get.rreq, - rndv_req->send.rndv_get.remote_request); + rndv_req->send.rndv_get.remote_request, + rndv_req->send.length, 0ul); return UCS_OK; } + ucs_assert_always(rndv_req->send.rndv_get.lanes_count > 0); + if (!rndv_req->send.mdesc) { status = ucp_send_request_add_reg_lane(rndv_req, lane); ucs_assert_always(status == UCS_OK); @@ -372,43 +507,25 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_rma_get_zcopy, (self), max_zcopy = config->tag.rndv.max_get_zcopy; offset = rndv_req->send.state.dt.offset; - remainder = (uintptr_t)rndv_req->send.buffer % align; + remaining = (uintptr_t)rndv_req->send.buffer % align; - if ((offset == 0) && (remainder > 0) && (rndv_req->send.length > ucp_mtu)) { - length = ucp_mtu - remainder; + if ((offset == 0) && (remaining > 0) && (rndv_req->send.length > ucp_mtu)) { + length = ucp_mtu - remaining; } else { - chunk = ucs_align_up((size_t)(ucs_min(rndv_req->send.length / - rndv_req->send.rndv_get.lane_count, - max_zcopy) * config->tag.rndv.scale[lane]), + chunk = ucs_align_up((size_t)(rndv_req->send.length / + rndv_req->send.rndv_get.lanes_count + * config->tag.rndv.scale[lane]), align); length = ucs_min(chunk, rndv_req->send.length - offset); } - /* ensure that tail (rest of message) 
is over min_zcopy */ - tail = rndv_req->send.length - (offset + length); - if (ucs_unlikely(tail && (tail < min_zcopy))) { - /* ok, tail is less get zcopy minimal & could not be processed as - * standalone operation */ - /* check if we have room to increase current part and not - * step over max_zcopy */ - if (length < (max_zcopy - tail)) { - /* if we can encrease length by min_zcopy - let's do it to - * avoid small tail (we have limitation on minimal get zcopy) */ - length += tail; - } else { - /* reduce current length by align or min_zcopy value - * to process it on next round */ - ucs_assert(length > ucs_max(min_zcopy, align)); - length -= ucs_max(min_zcopy, align); - } - } - - ucs_assert(length >= min_zcopy); - ucs_assert((rndv_req->send.length - (offset + length) == 0) || - (rndv_req->send.length - (offset + length) >= min_zcopy)); + length = ucp_rndv_adjust_zcopy_length(min_zcopy, max_zcopy, align, + rndv_req->send.length, offset, + length); ucs_trace_data("req %p: offset %zu remainder %zu rma-get to %p len %zu lane %d", - rndv_req, offset, remainder, rndv_req->send.buffer + offset, + rndv_req, offset, remaining, + UCS_PTR_BYTE_OFFSET(rndv_req->send.buffer, offset), length, lane); state = rndv_req->send.state.dt; @@ -431,12 +548,13 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_rma_get_zcopy, (self), status); if (rndv_req->send.state.dt.offset == rndv_req->send.length) { if (rndv_req->send.state.uct_comp.count == 0) { - ucp_rndv_complete_rma_get_zcopy(rndv_req); + rndv_req->send.state.uct_comp.func(&rndv_req->send.state.uct_comp, status); } return UCS_OK; } else if (!UCS_STATUS_IS_ERR(status)) { /* in case if not all chunks are transmitted - return in_progress * status */ + ucp_rndv_get_zcopy_next_lane(rndv_req); return UCS_INPROGRESS; } else { if (status == UCS_ERR_NO_RESOURCE) { @@ -463,7 +581,7 @@ UCS_PROFILE_FUNC_VOID(ucp_rndv_get_completion, (self, status), send.state.uct_comp); if (rndv_req->send.state.dt.offset == rndv_req->send.length) { - 
ucp_rndv_complete_rma_get_zcopy(rndv_req); + ucp_rndv_complete_rma_get_zcopy(rndv_req, status); } } @@ -477,10 +595,93 @@ static void ucp_rndv_put_completion(uct_completion_t *self, ucs_status_t status) } } -static void ucp_rndv_req_send_rma_get(ucp_request_t *rndv_req, ucp_request_t *rreq, - const ucp_rndv_rts_hdr_t *rndv_rts_hdr) +static void ucp_rndv_req_init_get_zcopy_lane_map(ucp_request_t *rndv_req) { + ucp_ep_h ep = rndv_req->send.ep; + ucp_ep_config_t *ep_config = ucp_ep_config(ep); + ucp_context_h context = ep->worker->context; + ucs_memory_type_t mem_type = rndv_req->send.mem_type; + ucp_rkey_h rkey = rndv_req->send.rndv_get.rkey; + ucp_lane_map_t lane_map; + ucp_lane_index_t lane, lane_idx; + ucp_md_index_t md_index; + uct_md_attr_t *md_attr; + ucp_md_index_t dst_md_index; + ucp_rsc_index_t rsc_index; + uct_iface_attr_t *iface_attr; + double max_lane_bw, lane_bw; + int i; + + max_lane_bw = 0; + lane_map = 0; + for (i = 0; i < UCP_MAX_LANES; i++) { + lane = ep_config->tag.rndv.get_zcopy_lanes[i]; + if (lane == UCP_NULL_LANE) { + break; /* no more lanes */ + } + + md_index = ep_config->md_index[lane]; + md_attr = &context->tl_mds[md_index].attr; + rsc_index = ep_config->key.lanes[lane].rsc_index; + iface_attr = ucp_worker_iface_get_attr(ep->worker, rsc_index); + lane_bw = ucp_tl_iface_bandwidth(context, &iface_attr->bandwidth); + + if (ucs_unlikely((md_index != UCP_NULL_RESOURCE) && + !(md_attr->cap.flags & UCT_MD_FLAG_NEED_RKEY))) { + /* Lane does not need rkey, can use the lane with invalid rkey */ + if (!rkey || ((mem_type == md_attr->cap.access_mem_type) && + (mem_type == rkey->mem_type))) { + rndv_req->send.rndv_get.rkey_index[i] = UCP_NULL_RESOURCE; + lane_map |= UCS_BIT(i); + max_lane_bw = ucs_max(max_lane_bw, lane_bw); + continue; + } + } + + if (ucs_unlikely((md_index != UCP_NULL_RESOURCE) && + (!(md_attr->cap.reg_mem_types & UCS_BIT(mem_type))))) { + continue; + } + + dst_md_index = ep_config->key.lanes[lane].dst_md_index; + if (rkey && 
ucs_likely(rkey->md_map & UCS_BIT(dst_md_index))) { + /* Return first matching lane */ + rndv_req->send.rndv_get.rkey_index[i] = ucs_bitmap2idx(rkey->md_map, + dst_md_index); + lane_map |= UCS_BIT(i); + max_lane_bw = ucs_max(max_lane_bw, lane_bw); + } + } + + if (ucs_popcount(lane_map) > 1) { + /* remove lanes if bandwidth is too low comparing to the best lane */ + ucs_for_each_bit(lane_idx, lane_map) { + ucs_assert(lane_idx < UCP_MAX_LANES); + lane = ep_config->tag.rndv.get_zcopy_lanes[lane_idx]; + rsc_index = ep_config->key.lanes[lane].rsc_index; + iface_attr = ucp_worker_iface_get_attr(ep->worker, rsc_index); + lane_bw = ucp_tl_iface_bandwidth(context, &iface_attr->bandwidth); + + if ((lane_bw/max_lane_bw) < + (1. / context->config.ext.multi_lane_max_ratio)) { + lane_map &= ~UCS_BIT(lane_idx); + rndv_req->send.rndv_get.rkey_index[lane_idx] = UCP_NULL_RESOURCE; + } + } + } + + rndv_req->send.rndv_get.lanes_map_all = lane_map; + rndv_req->send.rndv_get.lanes_map_avail = lane_map; + rndv_req->send.rndv_get.lanes_count = ucs_popcount(lane_map); +} + +static ucs_status_t ucp_rndv_req_send_rma_get(ucp_request_t *rndv_req, + ucp_request_t *rreq, + const ucp_rndv_rts_hdr_t *rndv_rts_hdr) +{ + ucp_ep_h ep = rndv_req->send.ep; ucs_status_t status; + uct_rkey_t uct_rkey; ucp_trace_req(rndv_req, "start rma_get rreq %p", rreq); @@ -492,22 +693,453 @@ static void ucp_rndv_req_send_rma_get(ucp_request_t *rndv_req, ucp_request_t *rr rndv_req->send.rndv_get.remote_request = rndv_rts_hdr->sreq.reqptr; rndv_req->send.rndv_get.remote_address = rndv_rts_hdr->address; rndv_req->send.rndv_get.rreq = rreq; - rndv_req->send.rndv_get.lanes_map = 0; - rndv_req->send.rndv_get.lane_count = 0; rndv_req->send.datatype = rreq->recv.datatype; - status = ucp_ep_rkey_unpack(rndv_req->send.ep, rndv_rts_hdr + 1, + status = ucp_ep_rkey_unpack(ep, rndv_rts_hdr + 1, &rndv_req->send.rndv_get.rkey); if (status != UCS_OK) { ucs_fatal("failed to unpack rendezvous remote key received from %s: %s", - 
ucp_ep_peer_name(rndv_req->send.ep), ucs_status_string(status)); + ucp_ep_peer_name(ep), ucs_status_string(status)); } ucp_request_send_state_init(rndv_req, ucp_dt_make_contig(1), 0); ucp_request_send_state_reset(rndv_req, ucp_rndv_get_completion, UCP_REQUEST_SEND_PROTO_RNDV_GET); + ucp_rndv_req_init_get_zcopy_lane_map(rndv_req); + + rndv_req->send.lane = ucp_rndv_get_zcopy_get_lane(rndv_req, &uct_rkey); + if (rndv_req->send.lane == UCP_NULL_LANE) { + return UCS_ERR_UNREACHABLE; + } + + UCP_WORKER_STAT_RNDV(ep->worker, GET_ZCOPY, 1); ucp_request_send(rndv_req, 0); + + return UCS_OK; +} + +UCS_PROFILE_FUNC_VOID(ucp_rndv_recv_frag_put_completion, (self, status), + uct_completion_t *self, ucs_status_t status) +{ + ucp_request_t *freq = ucs_container_of(self, ucp_request_t, + send.state.uct_comp); + ucp_request_t *req = freq->send.rndv_put.sreq; + ucp_request_t *rndv_req = (ucp_request_t*)freq->send.rndv_put.remote_request; + + ucs_trace_req("freq:%p: recv_frag_put done. rreq:%p ", freq, req); + + /* release memory descriptor */ + ucs_mpool_put_inline((void *)freq->send.mdesc); + + /* rndv_req is NULL in case of put protocol */ + if (rndv_req != NULL) { + /* pipeline recv get protocol */ + rndv_req->send.state.dt.offset += freq->send.length; + + /* send ATS for fragment get rndv completion */ + if (rndv_req->send.length == rndv_req->send.state.dt.offset) { + ucp_rkey_destroy(rndv_req->send.rndv_get.rkey); + ucp_rndv_req_send_ats(rndv_req, req, + rndv_req->send.rndv_get.remote_request, UCS_OK); + } + } + + req->recv.tag.remaining -= freq->send.length; + if (req->recv.tag.remaining == 0) { + ucp_request_complete_tag_recv(req, UCS_OK); + } + + ucp_request_put(freq); +} + +static UCS_F_ALWAYS_INLINE void +ucp_rndv_init_mem_type_frag_req(ucp_worker_h worker, ucp_request_t *freq, int rndv_op, + uct_completion_callback_t comp_cb, ucp_mem_desc_t *mdesc, + ucs_memory_type_t mem_type, size_t length, + uct_pending_callback_t uct_func) +{ + ucp_ep_h mem_type_ep; + ucp_md_index_t 
md_index; + ucp_lane_index_t mem_type_rma_lane; + + ucp_request_send_state_init(freq, ucp_dt_make_contig(1), 0); + ucp_request_send_state_reset(freq, comp_cb, rndv_op); + + freq->send.buffer = mdesc + 1; + freq->send.length = length; + freq->send.datatype = ucp_dt_make_contig(1); + freq->send.mem_type = mem_type; + freq->send.mdesc = mdesc; + freq->send.uct.func = uct_func; + + if (mem_type != UCS_MEMORY_TYPE_HOST) { + mem_type_ep = worker->mem_type_ep[mem_type]; + mem_type_rma_lane = ucp_ep_config(mem_type_ep)->key.rma_bw_lanes[0]; + md_index = ucp_ep_md_index(mem_type_ep, mem_type_rma_lane); + ucs_assert(mem_type_rma_lane != UCP_NULL_LANE); + + freq->send.lane = mem_type_rma_lane; + freq->send.ep = mem_type_ep; + freq->send.state.dt.dt.contig.memh[0] = ucp_memh2uct(mdesc->memh, md_index); + freq->send.state.dt.dt.contig.md_map = UCS_BIT(md_index); + } +} + +static void +ucp_rndv_recv_frag_put_mem_type(ucp_request_t *rreq, ucp_request_t *rndv_req, + ucp_request_t *freq, ucp_mem_desc_t *mdesc, + size_t length, size_t offset) +{ + + ucs_assert_always(!UCP_MEM_IS_ACCESSIBLE_FROM_CPU(rreq->recv.mem_type)); + + /* PUT on memtype endpoint to stage from + * frag recv buffer to memtype recv buffer + */ + + ucp_rndv_init_mem_type_frag_req(rreq->recv.worker, freq, UCP_REQUEST_SEND_PROTO_RNDV_PUT, + ucp_rndv_recv_frag_put_completion, mdesc, rreq->recv.mem_type, + length, ucp_rndv_progress_rma_put_zcopy); + + freq->send.rndv_put.sreq = rreq; + freq->send.rndv_put.rkey = NULL; + freq->send.rndv_put.remote_request = (uintptr_t)rndv_req; + freq->send.rndv_put.remote_address = (uintptr_t)rreq->recv.buffer + offset; + + ucp_request_send(freq, 0); +} + +static ucs_status_t +ucp_rndv_send_frag_get_mem_type(ucp_request_t *sreq, uintptr_t rreq_ptr, + size_t length, uint64_t remote_address, + ucs_memory_type_t remote_mem_type, ucp_rkey_h rkey, + uint8_t *rkey_index, ucp_lane_map_t lanes_map, + uct_completion_callback_t comp_cb) +{ + ucp_worker_h worker = sreq->send.ep->worker; + 
ucp_request_t *freq; + ucp_mem_desc_t *mdesc; + ucp_lane_index_t i; + + /* GET fragment to stage buffer */ + + freq = ucp_request_get(worker); + if (ucs_unlikely(freq == NULL)) { + ucs_error("failed to allocate fragment receive request"); + return UCS_ERR_NO_MEMORY; + } + + mdesc = ucp_worker_mpool_get(&worker->rndv_frag_mp); + if (ucs_unlikely(mdesc == NULL)) { + ucs_error("failed to allocate fragment memory desc"); + return UCS_ERR_NO_MEMORY; + } + + freq->send.ep = sreq->send.ep; + + ucp_rndv_init_mem_type_frag_req(worker, freq, UCP_REQUEST_SEND_PROTO_RNDV_GET, + comp_cb, mdesc, remote_mem_type, length, + ucp_rndv_progress_rma_get_zcopy); + + freq->send.rndv_get.rkey = rkey; + freq->send.rndv_get.remote_address = remote_address; + freq->send.rndv_get.remote_request = rreq_ptr; + freq->send.rndv_get.rreq = sreq; + freq->send.rndv_get.lanes_map_all = lanes_map; + freq->send.rndv_get.lanes_map_avail = lanes_map; + freq->send.rndv_get.lanes_count = ucs_popcount(lanes_map); + + for (i = 0; i < UCP_MAX_LANES; i++) { + freq->send.rndv_get.rkey_index[i] = rkey_index ? rkey_index[i] + : UCP_NULL_RESOURCE; + } + + + return ucp_request_send(freq, 0); +} + +UCS_PROFILE_FUNC_VOID(ucp_rndv_recv_frag_get_completion, (self, status), + uct_completion_t *self, ucs_status_t status) +{ + ucp_request_t *freq = ucs_container_of(self, ucp_request_t, + send.state.uct_comp); + ucp_request_t *rndv_req = freq->send.rndv_get.rreq; + ucp_request_t *rreq = rndv_req->send.rndv_get.rreq; + + ucs_trace_req("freq:%p: recv_frag_get done. 
rreq:%p length:%ld offset:%ld", + freq, rndv_req, freq->send.length, + freq->send.rndv_get.remote_address - rndv_req->send.rndv_get.remote_address); + + /* fragment GET completed from remote to staging buffer, issue PUT from + * staging buffer to recv buffer */ + ucp_rndv_recv_frag_put_mem_type(rreq, rndv_req, freq, + (ucp_mem_desc_t *)freq->send.buffer -1, + freq->send.length, (freq->send.rndv_get.remote_address - + rndv_req->send.rndv_get.remote_address)); +} + +static ucs_status_t +ucp_rndv_recv_start_get_pipeline(ucp_worker_h worker, ucp_request_t *rndv_req, + ucp_request_t *rreq, uintptr_t remote_request, + const void *rkey_buffer, uint64_t remote_address, + size_t size, size_t base_offset) +{ + ucp_ep_h ep = rndv_req->send.ep; + ucp_ep_config_t *config = ucp_ep_config(ep); + ucp_context_h context = worker->context; + ucs_status_t status; + size_t max_frag_size, offset, length; + size_t min_zcopy, max_zcopy; + + min_zcopy = config->tag.rndv.min_get_zcopy; + max_zcopy = config->tag.rndv.max_get_zcopy; + max_frag_size = ucs_min(context->config.ext.rndv_frag_size, + max_zcopy); + rndv_req->send.rndv_get.remote_request = remote_request; + rndv_req->send.rndv_get.remote_address = remote_address - base_offset; + rndv_req->send.rndv_get.rreq = rreq; + rndv_req->send.length = size; + rndv_req->send.state.dt.offset = 0; + rndv_req->send.mem_type = rreq->recv.mem_type; + + /* Protocol: + * Step 1: GET remote fragment into HOST fragment buffer + * Step 2: PUT from fragment buffer to MEM TYPE destination + * Step 3: Send ATS for RNDV request + */ + + status = ucp_ep_rkey_unpack(rndv_req->send.ep, rkey_buffer, + &rndv_req->send.rndv_get.rkey); + if (ucs_unlikely(status != UCS_OK)) { + ucs_fatal("failed to unpack rendezvous remote key received from %s: %s", + ucp_ep_peer_name(rndv_req->send.ep), ucs_status_string(status)); + } + + ucp_rndv_req_init_get_zcopy_lane_map(rndv_req); + + offset = 0; + while (offset != size) { + length = ucp_rndv_adjust_zcopy_length(min_zcopy, 
max_frag_size, 0, + size, offset, size - offset); + + /* GET remote fragment into HOST fragment buffer */ + ucp_rndv_send_frag_get_mem_type(rndv_req, remote_request, length, + remote_address + offset, UCS_MEMORY_TYPE_HOST, + rndv_req->send.rndv_get.rkey, + rndv_req->send.rndv_get.rkey_index, + rndv_req->send.rndv_get.lanes_map_all, + ucp_rndv_recv_frag_get_completion); + + offset += length; + } + + return UCS_OK; +} + +static void ucp_rndv_send_frag_rtr(ucp_worker_h worker, ucp_request_t *rndv_req, + ucp_request_t *rreq, + const ucp_rndv_rts_hdr_t *rndv_rts_hdr) +{ + size_t max_frag_size = worker->context->config.ext.rndv_frag_size; + int i, num_frags; + size_t frag_size; + size_t offset; + ucp_mem_desc_t *mdesc; + ucp_request_t *freq; + ucp_request_t *frndv_req; + unsigned md_index; + unsigned memh_index; + + ucp_trace_req(rreq, "using rndv pipeline protocol rndv_req %p", rndv_req); + + offset = 0; + num_frags = ucs_div_round_up(rndv_rts_hdr->size, max_frag_size); + + for (i = 0; i < num_frags; i++) { + frag_size = ucs_min(max_frag_size, (rndv_rts_hdr->size - offset)); + + /* internal fragment recv request allocated on receiver side to receive + * put fragment from sender and to perform a put to recv buffer */ + freq = ucp_request_get(worker); + if (freq == NULL) { + ucs_fatal("failed to allocate fragment receive request"); + } + + /* internal rndv request to send RTR */ + frndv_req = ucp_request_get(worker); + if (frndv_req == NULL) { + ucs_fatal("failed to allocate fragment rendezvous reply"); + } + + /* allocate fragment recv buffer desc*/ + mdesc = ucp_worker_mpool_get(&worker->rndv_frag_mp); + if (mdesc == NULL) { + ucs_fatal("failed to allocate fragment memory buffer"); + } + + freq->recv.buffer = mdesc + 1; + freq->recv.datatype = ucp_dt_make_contig(1); + freq->recv.mem_type = UCS_MEMORY_TYPE_HOST; + freq->recv.length = frag_size; + freq->recv.state.dt.contig.md_map = 0; + freq->recv.frag.rreq = rreq; + freq->recv.frag.offset = offset; + freq->flags |= 
UCP_REQUEST_FLAG_RNDV_FRAG; + + memh_index = 0; + ucs_for_each_bit(md_index, + (ucp_ep_config(rndv_req->send.ep)->key.rma_bw_md_map & + mdesc->memh->md_map)) { + freq->recv.state.dt.contig.memh[memh_index++] = ucp_memh2uct(mdesc->memh, md_index); + freq->recv.state.dt.contig.md_map |= UCS_BIT(md_index); + } + ucs_assert(memh_index <= UCP_MAX_OP_MDS); + + frndv_req->send.ep = rndv_req->send.ep; + frndv_req->send.pending_lane = UCP_NULL_LANE; + + ucp_rndv_req_send_rtr(frndv_req, freq, rndv_rts_hdr->sreq.reqptr, + freq->recv.length, offset); + offset += frag_size; + } + + /* release original rndv reply request */ + ucp_request_put(rndv_req); +} + +static UCS_F_ALWAYS_INLINE int +ucp_rndv_is_rkey_ptr(const ucp_rndv_rts_hdr_t *rndv_rts_hdr, ucp_ep_h ep, + ucs_memory_type_t recv_mem_type, ucp_rndv_mode_t rndv_mode) +{ + const ucp_ep_config_t *ep_config = ucp_ep_config(ep); + + return /* must have remote address */ + (rndv_rts_hdr->address != 0) && + /* remote key must be on a memory domain for which we support rkey_ptr */ + (ucp_rkey_packed_md_map(rndv_rts_hdr + 1) & + ep_config->tag.rndv.rkey_ptr_dst_mds) && + /* rendezvous mode must not be forced to put/get */ + (rndv_mode == UCP_RNDV_MODE_AUTO) && + /* need local memory access for data unpack */ + UCP_MEM_IS_ACCESSIBLE_FROM_CPU(recv_mem_type); +} + +static unsigned ucp_rndv_progress_rkey_ptr(void *arg) +{ + ucp_worker_h worker = (ucp_worker_h)arg; + ucp_request_t *rndv_req = ucs_queue_head_elem_non_empty(&worker->rkey_ptr_reqs, + ucp_request_t, + send.rkey_ptr.queue_elem); + ucp_request_t *rreq = rndv_req->send.rkey_ptr.rreq; + size_t seg_size = ucs_min(worker->context->config.ext.rkey_ptr_seg_size, + rndv_req->send.length - rreq->recv.state.offset); + ucs_status_t status; + size_t offset, new_offset; + int last; + + offset = rreq->recv.state.offset; + new_offset = offset + seg_size; + last = new_offset == rndv_req->send.length; + status = ucp_request_recv_data_unpack(rreq, + rndv_req->send.buffer + offset, + 
seg_size, offset, last); + if (ucs_unlikely(status != UCS_OK) || last) { + ucs_queue_pull_non_empty(&worker->rkey_ptr_reqs); + ucp_request_complete_tag_recv(rreq, status); + ucp_rkey_destroy(rndv_req->send.rkey_ptr.rkey); + ucp_rndv_req_send_ats(rndv_req, rreq, + rndv_req->send.rkey_ptr.remote_request, status); + if (ucs_queue_is_empty(&worker->rkey_ptr_reqs)) { + uct_worker_progress_unregister_safe(worker->uct, + &worker->rkey_ptr_cb_id); + } + } else { + rreq->recv.state.offset = new_offset; + } + + return 1; +} + +static void ucp_rndv_do_rkey_ptr(ucp_request_t *rndv_req, ucp_request_t *rreq, + const ucp_rndv_rts_hdr_t *rndv_rts_hdr) +{ + ucp_ep_h ep = rndv_req->send.ep; + const ucp_ep_config_t *ep_config = ucp_ep_config(ep); + ucp_worker_h worker = rreq->recv.worker; + ucp_md_index_t dst_md_index = 0; + ucp_lane_index_t i, lane; + ucs_status_t status; + unsigned rkey_index; + void *local_ptr; + ucp_rkey_h rkey; + + ucp_trace_req(rndv_req, "start rkey_ptr rndv rreq %p", rreq); + + status = ucp_ep_rkey_unpack(ep, rndv_rts_hdr + 1, &rkey); + if (status != UCS_OK) { + ucs_fatal("failed to unpack rendezvous remote key received from %s: %s", + ucp_ep_peer_name(ep), ucs_status_string(status)); + } + + /* Find a lane which is capable of accessing the destination memory */ + lane = UCP_NULL_LANE; + for (i = 0; i < ep_config->key.num_lanes; ++i) { + dst_md_index = ep_config->key.lanes[i].dst_md_index; + if (UCS_BIT(dst_md_index) & rkey->md_map) { + lane = i; + break; + } + } + + if (ucs_unlikely(lane == UCP_NULL_LANE)) { + /* We should be able to find a lane, because ucp_rndv_is_rkey_ptr() + * already checked that (rkey->md_map & ep_config->rkey_ptr_dst_mds) != 0 + */ + ucs_fatal("failed to find a lane to access remote memory domains 0x%lx", + rkey->md_map); + } + + rkey_index = ucs_bitmap2idx(rkey->md_map, dst_md_index); + status = uct_rkey_ptr(rkey->tl_rkey[rkey_index].cmpt, + &rkey->tl_rkey[rkey_index].rkey, + rndv_rts_hdr->address, &local_ptr); + if (status != UCS_OK) 
{ + ucp_request_complete_tag_recv(rreq, status); + ucp_rkey_destroy(rkey); + ucp_rndv_req_send_ats(rndv_req, rreq, rndv_rts_hdr->sreq.reqptr, status); + return; + } + + rreq->recv.state.offset = 0; + + ucp_trace_req(rndv_req, "obtained a local pointer to remote buffer: %p", + local_ptr); + rndv_req->send.buffer = local_ptr; + rndv_req->send.length = rndv_rts_hdr->size; + rndv_req->send.rkey_ptr.rkey = rkey; + rndv_req->send.rkey_ptr.remote_request = rndv_rts_hdr->sreq.reqptr; + rndv_req->send.rkey_ptr.rreq = rreq; + + UCP_WORKER_STAT_RNDV(ep->worker, RKEY_PTR, 1); + + ucs_queue_push(&worker->rkey_ptr_reqs, &rndv_req->send.rkey_ptr.queue_elem); + uct_worker_progress_register_safe(worker->uct, + ucp_rndv_progress_rkey_ptr, + rreq->recv.worker, + UCS_CALLBACKQ_FLAG_FAST, + &worker->rkey_ptr_cb_id); +} + +static UCS_F_ALWAYS_INLINE int +ucp_rndv_test_zcopy_scheme_support(size_t length, size_t min_zcopy, + size_t max_zcopy, int split) +{ + return /* is the current message greater than the minimal GET/PUT Zcopy? */ + (length >= min_zcopy) && + /* is the current message less than the maximal GET/PUT Zcopy? */ + ((length <= max_zcopy) || + /* or can the message be split? 
*/ split); } UCS_PROFILE_FUNC_VOID(ucp_rndv_matched, (worker, rreq, rndv_rts_hdr), @@ -517,6 +1149,9 @@ UCS_PROFILE_FUNC_VOID(ucp_rndv_matched, (worker, rreq, rndv_rts_hdr), ucp_rndv_mode_t rndv_mode; ucp_request_t *rndv_req; ucp_ep_h ep; + ucp_ep_config_t *ep_config; + ucs_status_t status; + int is_get_zcopy_failed; UCS_ASYNC_BLOCK(&worker->async); @@ -539,6 +1174,7 @@ UCS_PROFILE_FUNC_VOID(ucp_rndv_matched, (worker, rreq, rndv_rts_hdr), rndv_req->flags = 0; rndv_req->send.mdesc = NULL; rndv_req->send.pending_lane = UCP_NULL_LANE; + is_get_zcopy_failed = 0; ucp_trace_req(rreq, "rndv matched remote {address 0x%"PRIx64" size %zu sreq 0x%lx}" @@ -549,34 +1185,82 @@ UCS_PROFILE_FUNC_VOID(ucp_rndv_matched, (worker, rreq, rndv_rts_hdr), ucp_trace_req(rndv_req, "rndv truncated remote size %zu local size %zu rreq %p", rndv_rts_hdr->size, rreq->recv.length, rreq); - ucp_rndv_req_send_ats(rndv_req, rreq, rndv_rts_hdr->sreq.reqptr); + ucp_rndv_req_send_ats(rndv_req, rreq, rndv_rts_hdr->sreq.reqptr, UCS_OK); ucp_request_recv_generic_dt_finish(rreq); ucp_rndv_zcopy_recv_req_complete(rreq, UCS_ERR_MESSAGE_TRUNCATED); goto out; } - /* if the receive side is not connected yet then the RTS was received on a stub ep */ - ep = rndv_req->send.ep; - + ep = rndv_req->send.ep; + ep_config = ucp_ep_config(ep); rndv_mode = worker->context->config.ext.rndv_mode; + + if (ucp_rndv_is_rkey_ptr(rndv_rts_hdr, ep, rreq->recv.mem_type, rndv_mode)) { + ucp_rndv_do_rkey_ptr(rndv_req, rreq, rndv_rts_hdr); + goto out; + } + if (UCP_DT_IS_CONTIG(rreq->recv.datatype)) { - if (rndv_rts_hdr->address && (rndv_mode != UCP_RNDV_MODE_PUT_ZCOPY)) { + if ((rndv_rts_hdr->address != 0) && + ucp_rndv_test_zcopy_scheme_support(rndv_rts_hdr->size, + ep_config->tag.rndv.min_get_zcopy, + ep_config->tag.rndv.max_get_zcopy, + ep_config->tag.rndv.get_zcopy_split)) { /* try to fetch the data with a get_zcopy operation */ - ucp_rndv_req_send_rma_get(rndv_req, rreq, rndv_rts_hdr); - goto out; - } else if (rndv_mode != 
UCP_RNDV_MODE_GET_ZCOPY) { + status = ucp_rndv_req_send_rma_get(rndv_req, rreq, rndv_rts_hdr); + if (status == UCS_OK) { + goto out; + } + + /* fallback to non get zcopy protocol */ + ucp_rkey_destroy(rndv_req->send.rndv_get.rkey); + is_get_zcopy_failed = 1; + } + + if (rndv_mode == UCP_RNDV_MODE_AUTO) { + /* check if we need pipelined memtype staging */ + if (UCP_MEM_IS_CUDA(rreq->recv.mem_type) && + ucp_rndv_is_recv_pipeline_needed(rndv_req, + rndv_rts_hdr, + rreq->recv.mem_type, + is_get_zcopy_failed)) { + ucp_rndv_recv_data_init(rreq, rndv_rts_hdr->size); + if (ucp_rndv_is_put_pipeline_needed(rndv_rts_hdr->address, + rndv_rts_hdr->size, + ep_config->tag.rndv.min_get_zcopy, + ep_config->tag.rndv.max_get_zcopy, + is_get_zcopy_failed)) { + /* send FRAG RTR for sender to PUT the fragment. */ + ucp_rndv_send_frag_rtr(worker, rndv_req, rreq, rndv_rts_hdr); + } else { + /* sender address is present. do GET pipeline */ + ucp_rndv_recv_start_get_pipeline(worker, rndv_req, rreq, + rndv_rts_hdr->sreq.reqptr, + rndv_rts_hdr + 1, + rndv_rts_hdr->address, + rndv_rts_hdr->size, 0); + } + goto out; + } + } + + if ((rndv_mode == UCP_RNDV_MODE_PUT_ZCOPY) || + UCP_MEM_IS_CUDA(rreq->recv.mem_type)) { /* put protocol is allowed - register receive buffer memory for rma */ - ucp_request_recv_buffer_reg(rreq, ucp_ep_config(ep)->key.rma_bw_md_map, - ucs_min(rreq->recv.length, rndv_rts_hdr->size)); + ucs_assert(rndv_rts_hdr->size <= rreq->recv.length); + ucp_request_recv_buffer_reg(rreq, ep_config->key.rma_bw_md_map, + rndv_rts_hdr->size); } } /* The sender didn't specify its address in the RTS, or the rndv mode was - * configured to put - send an RTR and the sender will send the data with - * active message or put_zcopy. */ + * configured to PUT, or GET rndv mode is unsupported - send an RTR and + * the sender will send the data with active message or put_zcopy. 
*/ ucp_rndv_recv_data_init(rreq, rndv_rts_hdr->size); - ucp_rndv_req_send_rtr(rndv_req, rreq, rndv_rts_hdr->sreq.reqptr); + UCP_WORKER_STAT_RNDV(ep->worker, SEND_RTR, 1); + ucp_rndv_req_send_rtr(rndv_req, rreq, rndv_rts_hdr->sreq.reqptr, + rndv_rts_hdr->size, 0ul); out: UCS_ASYNC_UNBLOCK(&worker->async); @@ -599,7 +1283,7 @@ ucs_status_t ucp_rndv_process_rts(void *arg, void *data, size_t length, as unexpected */ ucp_tag_offload_try_cancel(worker, rreq, UCP_TAG_OFFLOAD_CANCEL_FORCE); - UCP_WORKER_STAT_RNDV(worker, EXP); + UCP_WORKER_STAT_RNDV(worker, EXP, 1); status = UCS_OK; } else { status = ucp_recv_desc_init(worker, data, length, 0, tl_flags, @@ -632,20 +1316,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_ats_handler, if (sreq->flags & UCP_REQUEST_FLAG_OFFLOADED) { ucp_tag_offload_cancel_rndv(sreq); } - ucp_rndv_complete_send(sreq); - return UCS_OK; -} - -UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_atp_handler, - (arg, data, length, flags), - void *arg, void *data, size_t length, unsigned flags) -{ - ucp_reply_hdr_t *rep_hdr = data; - ucp_request_t *rreq = (ucp_request_t*) rep_hdr->reqptr; - - /* dereg the original recv request and set it to complete */ - UCS_PROFILE_REQUEST_EVENT(rreq, "rndv_atp_recv", 0); - ucp_rndv_zcopy_recv_req_complete(rreq, UCS_OK); + ucp_rndv_complete_send(sreq, rep_hdr->status); return UCS_OK; } @@ -656,7 +1327,7 @@ static size_t ucp_rndv_pack_data(void *dest, void *arg) size_t length, offset; offset = sreq->send.state.dt.offset; - hdr->rreq_ptr = sreq->send.tag.rreq_ptr; + hdr->rreq_ptr = sreq->send.msg_proto.tag.rreq_ptr; hdr->offset = offset; length = ucs_min(sreq->send.length - offset, ucp_ep_get_max_bcopy(sreq->send.ep, sreq->send.lane) - sizeof(*hdr)); @@ -670,11 +1341,9 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_am_bcopy, (self), uct_pending_req_t *self) { ucp_request_t *sreq = ucs_container_of(self, ucp_request_t, send.uct); - ucp_ep_t *ep = sreq->send.ep; + ucp_ep_t *ep = sreq->send.ep; ucs_status_t status; - sreq->send.lane 
= ucp_ep_get_am_lane(ep); - if (sreq->send.length <= ucp_ep_config(ep)->am.max_bcopy - sizeof(ucp_rndv_data_hdr_t)) { /* send a single bcopy message */ status = ucp_do_am_bcopy_single(self, UCP_AM_ID_RNDV_DATA, @@ -682,12 +1351,11 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_am_bcopy, (self), } else { status = ucp_do_am_bcopy_multi(self, UCP_AM_ID_RNDV_DATA, UCP_AM_ID_RNDV_DATA, - sizeof(ucp_rndv_data_hdr_t), ucp_rndv_pack_data, ucp_rndv_pack_data, 1); } if (status == UCS_OK) { - ucp_rndv_complete_send(sreq); + ucp_rndv_complete_send(sreq, UCS_OK); } else if (status == UCP_STATUS_PENDING_SWITCH) { status = UCS_OK; } @@ -702,14 +1370,14 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_rma_put_zcopy, (self), const size_t max_iovcnt = 1; ucp_ep_h ep = sreq->send.ep; ucs_status_t status; - size_t offset, ucp_mtu, align, remainder, length; + size_t offset, ucp_mtu, align, remaining, length; uct_iface_attr_t *attrs; uct_iov_t iov[max_iovcnt]; size_t iovcnt; ucp_dt_state_t state; if (!sreq->send.mdesc) { - status = ucp_request_send_buffer_reg_lane(sreq, sreq->send.lane); + status = ucp_request_send_buffer_reg_lane(sreq, sreq->send.lane, 0); ucs_assert_always(status == UCS_OK); } @@ -719,10 +1387,10 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_rma_put_zcopy, (self), ucp_mtu = attrs->cap.put.align_mtu; offset = sreq->send.state.dt.offset; - remainder = (uintptr_t)sreq->send.buffer % align; + remaining = (uintptr_t)sreq->send.buffer % align; - if ((offset == 0) && (remainder > 0) && (sreq->send.length > ucp_mtu)) { - length = ucp_mtu - remainder; + if ((offset == 0) && (remaining > 0) && (sreq->send.length > ucp_mtu)) { + length = ucp_mtu - remaining; } else { length = ucs_min(sreq->send.length - offset, ucp_ep_config(ep)->tag.rndv.max_put_zcopy); @@ -730,7 +1398,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_rma_put_zcopy, (self), ucs_trace_data("req %p: offset %zu remainder %zu. 
read to %p len %zu", sreq, offset, (uintptr_t)sreq->send.buffer % align, - (void*)sreq->send.buffer + offset, length); + UCS_PTR_BYTE_OFFSET(sreq->send.buffer, offset), length); state = sreq->send.state.dt; ucp_dt_iov_copy_uct(ep->worker->context, iov, &iovcnt, max_iovcnt, &state, @@ -759,6 +1427,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_rma_put_zcopy, (self), static void ucp_rndv_am_zcopy_send_req_complete(ucp_request_t *req, ucs_status_t status) { + ucs_assert(req->send.state.uct_comp.count == 0); ucp_request_send_buffer_dereg(req); ucp_request_complete_send(req, status); } @@ -780,7 +1449,7 @@ static ucs_status_t ucp_rndv_progress_am_zcopy_single(uct_pending_req_t *self) ucp_request_t *sreq = ucs_container_of(self, ucp_request_t, send.uct); ucp_rndv_data_hdr_t hdr; - hdr.rreq_ptr = sreq->send.tag.rreq_ptr; + hdr.rreq_ptr = sreq->send.msg_proto.tag.rreq_ptr; hdr.offset = 0; return ucp_do_am_zcopy_single(self, UCP_AM_ID_RNDV_DATA, &hdr, sizeof(hdr), ucp_rndv_am_zcopy_send_req_complete); @@ -791,7 +1460,7 @@ static ucs_status_t ucp_rndv_progress_am_zcopy_multi(uct_pending_req_t *self) ucp_request_t *sreq = ucs_container_of(self, ucp_request_t, send.uct); ucp_rndv_data_hdr_t hdr; - hdr.rreq_ptr = sreq->send.tag.rreq_ptr; + hdr.rreq_ptr = sreq->send.msg_proto.tag.rreq_ptr; hdr.offset = sreq->send.state.dt.offset; return ucp_do_am_zcopy_multi(self, UCP_AM_ID_RNDV_DATA, @@ -801,104 +1470,180 @@ static ucs_status_t ucp_rndv_progress_am_zcopy_multi(uct_pending_req_t *self) ucp_rndv_am_zcopy_send_req_complete, 1); } -UCS_PROFILE_FUNC_VOID(ucp_rndv_frag_put_completion, (self, status), +UCS_PROFILE_FUNC_VOID(ucp_rndv_send_frag_put_completion, (self, status), uct_completion_t *self, ucs_status_t status) { - ucp_request_t *frag_req = ucs_container_of(self, ucp_request_t, send.state.uct_comp); - ucp_request_t *sreq = frag_req->send.rndv_put.sreq; + ucp_request_t *freq = ucs_container_of(self, ucp_request_t, send.state.uct_comp); + ucp_request_t *req = 
freq->send.rndv_put.sreq; - ucs_mpool_put_inline((void *)frag_req->send.mdesc); - sreq->send.state.dt.offset += frag_req->send.length; - sreq->send.state.uct_comp.count--; - if (0 == sreq->send.state.uct_comp.count) { - ucp_rndv_send_atp(sreq, sreq->send.rndv_put.remote_request); + /* release memory descriptor */ + if (freq->send.mdesc) { + ucs_mpool_put_inline((void *)freq->send.mdesc); + } + + req->send.state.dt.offset += freq->send.length; + ucs_assert(req->send.state.dt.offset <= req->send.length); + + /* send ATP for last fragment of the rndv request */ + if (req->send.length == req->send.state.dt.offset) { + ucp_rndv_send_frag_atp(req, req->send.rndv_put.remote_request); } - ucp_request_put(frag_req); + + ucp_request_put(freq); } -UCS_PROFILE_FUNC_VOID(ucp_rndv_frag_get_completion, (self, status), +UCS_PROFILE_FUNC_VOID(ucp_rndv_put_pipeline_frag_get_completion, (self, status), uct_completion_t *self, ucs_status_t status) { - ucp_request_t *frag_req = ucs_container_of(self, ucp_request_t, send.state.uct_comp); - ucp_request_t *sreq = frag_req->send.rndv_get.rreq; - size_t offset = frag_req->send.rndv_get.remote_address - (uint64_t)(sreq->send.buffer); + ucp_request_t *freq = ucs_container_of(self, ucp_request_t, send.state.uct_comp); + ucp_request_t *fsreq = freq->send.rndv_get.rreq; - frag_req->send.ep = sreq->send.ep; - ucp_request_send_state_reset(frag_req, ucp_rndv_frag_put_completion, + /* get completed on memtype endpoint to stage on host. 
send put request to receiver*/ + ucp_request_send_state_reset(freq, ucp_rndv_send_frag_put_completion, UCP_REQUEST_SEND_PROTO_RNDV_PUT); - frag_req->send.uct.func = ucp_rndv_progress_rma_put_zcopy; - frag_req->send.rndv_put.sreq = sreq; - frag_req->send.rndv_put.rkey = sreq->send.rndv_put.rkey; - frag_req->send.rndv_put.uct_rkey = sreq->send.rndv_put.uct_rkey; - frag_req->send.rndv_put.remote_address = sreq->send.rndv_put.remote_address + offset; - frag_req->send.lane = sreq->send.lane; - frag_req->send.state.dt.dt.contig.md_map = 0; - - ucp_request_send(frag_req, 0); + freq->send.rndv_put.remote_address = fsreq->send.rndv_put.remote_address + + (freq->send.rndv_get.remote_address - (uint64_t)fsreq->send.buffer); + freq->send.ep = fsreq->send.ep; + freq->send.uct.func = ucp_rndv_progress_rma_put_zcopy; + freq->send.rndv_put.sreq = fsreq; + freq->send.rndv_put.rkey = fsreq->send.rndv_put.rkey; + freq->send.rndv_put.uct_rkey = fsreq->send.rndv_put.uct_rkey; + freq->send.lane = fsreq->send.lane; + freq->send.state.dt.dt.contig.md_map = 0; + + ucp_request_send(freq, 0); } -static ucs_status_t ucp_rndv_pipeline(ucp_request_t *sreq, ucp_rndv_rtr_hdr_t *rndv_rtr_hdr) +static ucs_status_t ucp_rndv_send_start_put_pipeline(ucp_request_t *sreq, + ucp_rndv_rtr_hdr_t *rndv_rtr_hdr) { - ucp_worker_h worker = sreq->send.ep->worker; - ucp_ep_h pipeline_ep = worker->mem_type_ep[sreq->send.mem_type]; - ucp_mem_desc_t *mdesc; - ucp_request_t *frag_req; - ucp_rsc_index_t md_index; - ucs_status_t status; - int i, num_frags; - size_t frag_size, length; - size_t offset; + ucp_ep_h ep = sreq->send.ep; + ucp_ep_config_t *config = ucp_ep_config(ep); + ucp_worker_h worker = sreq->send.ep->worker; + ucp_context_h context = worker->context; + const uct_md_attr_t *md_attr; + ucp_request_t *freq; + ucp_request_t *fsreq; + ucp_md_index_t md_index; + size_t max_frag_size, rndv_size, length; + size_t offset, rndv_base_offset; + size_t min_zcopy, max_zcopy; + + ucp_trace_req(sreq, "using put rndv 
pipeline protocol"); + + /* Protocol: + * Step 1: GET fragment from send buffer to HOST fragment buffer + * Step 2: PUT from fragment HOST buffer to remote HOST fragment buffer + * Step 3: send ATP for each fragment request + */ + + /* check if lane supports host memory, to stage sends through host memory */ + md_attr = ucp_ep_md_attr(sreq->send.ep, sreq->send.lane); + if (!(md_attr->cap.reg_mem_types & UCS_BIT(UCS_MEMORY_TYPE_HOST))) { + return UCS_ERR_UNSUPPORTED; + } - md_index = ucp_ep_md_index(pipeline_ep, - ucp_ep_config(pipeline_ep)->key.rma_bw_lanes[0]); + min_zcopy = config->tag.rndv.min_put_zcopy; + max_zcopy = config->tag.rndv.max_put_zcopy; + rndv_size = ucs_min(rndv_rtr_hdr->size, sreq->send.length); + max_frag_size = ucs_min(context->config.ext.rndv_frag_size, max_zcopy); + rndv_base_offset = rndv_rtr_hdr->offset; - frag_size = worker->context->config.ext.rndv_frag_size; - num_frags = (sreq->send.length + frag_size - 1) / frag_size; - sreq->send.state.uct_comp.count = num_frags; - sreq->send.state.dt.offset = 0; - sreq->send.rndv_put.remote_request = rndv_rtr_hdr->rreq_ptr; - sreq->send.rndv_put.remote_address = rndv_rtr_hdr->address; + /* initialize send req state on first fragment rndv request */ + if (rndv_base_offset == 0) { + ucp_request_send_state_reset(sreq, NULL, UCP_REQUEST_SEND_PROTO_RNDV_PUT); + } - offset = 0; - for (i = 0; i < num_frags; i++) { - length = (i != (num_frags - 1)) ? 
frag_size : (sreq->send.length - offset); + /* internal send request allocated on sender side to handle send fragments for RTR */ + fsreq = ucp_request_get(worker); + if (fsreq == NULL) { + ucs_fatal("failed to allocate fragment receive request"); + } - frag_req = ucp_request_get(worker); - if (frag_req == NULL) { - status = UCS_ERR_NO_MEMORY; - goto out; - } + ucp_request_send_state_init(fsreq, ucp_dt_make_contig(1), 0); + fsreq->send.buffer = UCS_PTR_BYTE_OFFSET(sreq->send.buffer, + rndv_base_offset); + fsreq->send.length = rndv_size; + fsreq->send.mem_type = sreq->send.mem_type; + fsreq->send.ep = sreq->send.ep; + fsreq->send.lane = sreq->send.lane; + fsreq->send.rndv_put.rkey = sreq->send.rndv_put.rkey; + fsreq->send.rndv_put.uct_rkey = sreq->send.rndv_put.uct_rkey; + fsreq->send.rndv_put.remote_request = rndv_rtr_hdr->rreq_ptr; + fsreq->send.rndv_put.remote_address = rndv_rtr_hdr->address; + fsreq->send.rndv_put.sreq = sreq; + fsreq->send.state.dt.offset = 0; - mdesc = ucs_mpool_get_inline(&worker->rndv_frag_mp); - if (mdesc == NULL) { - status = UCS_ERR_NO_MEMORY; - goto out; + offset = 0; + while (offset != rndv_size) { + length = ucp_rndv_adjust_zcopy_length(min_zcopy, max_frag_size, 0, + rndv_size, offset, rndv_size - offset); + + if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(sreq->send.mem_type)) { + /* sbuf is in host, directly do put */ + freq = ucp_request_get(worker); + if (ucs_unlikely(freq == NULL)) { + ucs_error("failed to allocate fragment receive request"); + return UCS_ERR_NO_MEMORY; + } + + ucp_request_send_state_reset(freq, ucp_rndv_send_frag_put_completion, + UCP_REQUEST_SEND_PROTO_RNDV_PUT); + md_index = ucp_ep_md_index(sreq->send.ep, + sreq->send.lane); + freq->send.ep = fsreq->send.ep; + freq->send.buffer = UCS_PTR_BYTE_OFFSET(fsreq->send.buffer, + offset); + freq->send.datatype = ucp_dt_make_contig(1); + freq->send.mem_type = UCS_MEMORY_TYPE_HOST; + freq->send.state.dt.dt.contig.memh[0] = + ucp_memh_map2uct(sreq->send.state.dt.dt.contig.memh, + 
sreq->send.state.dt.dt.contig.md_map, md_index); + freq->send.state.dt.dt.contig.md_map = UCS_BIT(md_index); + freq->send.length = length; + freq->send.uct.func = ucp_rndv_progress_rma_put_zcopy; + freq->send.rndv_put.sreq = fsreq; + freq->send.rndv_put.rkey = fsreq->send.rndv_put.rkey; + freq->send.rndv_put.uct_rkey = fsreq->send.rndv_put.uct_rkey; + freq->send.rndv_put.remote_address = rndv_rtr_hdr->address + offset; + freq->send.rndv_put.remote_request = rndv_rtr_hdr->rreq_ptr; + freq->send.lane = fsreq->send.lane; + freq->send.mdesc = NULL; + + ucp_request_send(freq, 0); + } else { + ucp_rndv_send_frag_get_mem_type(fsreq, 0, length, + (uint64_t)UCS_PTR_BYTE_OFFSET(fsreq->send.buffer, offset), + fsreq->send.mem_type, NULL, NULL, UCS_BIT(0), + ucp_rndv_put_pipeline_frag_get_completion); } - ucp_request_send_state_init(frag_req, ucp_dt_make_contig(1), 0); - ucp_request_send_state_reset(frag_req, ucp_rndv_frag_get_completion, - UCP_REQUEST_SEND_PROTO_RNDV_GET); - frag_req->send.ep = pipeline_ep; - frag_req->send.buffer = mdesc + 1; - frag_req->send.datatype = ucp_dt_make_contig(1); - frag_req->send.mem_type = sreq->send.mem_type; - frag_req->send.state.dt.dt.contig.memh[0]= ucp_memh2uct(mdesc->memh, md_index); - frag_req->send.state.dt.dt.contig.md_map = UCS_BIT(md_index); - frag_req->send.length = length; - frag_req->send.uct.func = ucp_rndv_progress_rma_get_zcopy; - frag_req->send.rndv_get.rkey = NULL; - frag_req->send.rndv_get.remote_address = (uint64_t)(sreq->send.buffer + offset); - frag_req->send.rndv_get.lanes_map = 0; - frag_req->send.rndv_get.lane_count = 0; - frag_req->send.rndv_get.rreq = sreq; - frag_req->send.mdesc = mdesc; - - ucp_request_send(frag_req, 0); offset += length; } + + return UCS_OK; +} + +UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_atp_handler, + (arg, data, length, flags), + void *arg, void *data, size_t length, unsigned flags) +{ + ucp_reply_hdr_t *rep_hdr = data; + ucp_request_t *req = (ucp_request_t*) rep_hdr->reqptr; + + if (req->flags & 
UCP_REQUEST_FLAG_RNDV_FRAG) { + /* received ATP for frag RTR request */ + ucs_assert(req->recv.frag.rreq != NULL); + UCS_PROFILE_REQUEST_EVENT(req, "rndv_frag_atp_recv", 0); + ucp_rndv_recv_frag_put_mem_type(req->recv.frag.rreq, NULL, req, + ((ucp_mem_desc_t*) req->recv.buffer - 1), + req->recv.length, req->recv.frag.offset); + } else { + UCS_PROFILE_REQUEST_EVENT(req, "rndv_atp_recv", 0); + ucp_rndv_zcopy_recv_req_complete(req, UCS_OK); + } + return UCS_OK; - out: - return status;; } UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_rtr_handler, @@ -908,7 +1653,10 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_rtr_handler, ucp_rndv_rtr_hdr_t *rndv_rtr_hdr = data; ucp_request_t *sreq = (ucp_request_t*)rndv_rtr_hdr->sreq_ptr; ucp_ep_h ep = sreq->send.ep; + ucp_ep_config_t *ep_config = ucp_ep_config(ep); + ucp_context_h context = ep->worker->context; ucs_status_t status; + int is_pipeline_rndv; ucp_trace_req(sreq, "received rtr address 0x%lx remote rreq 0x%lx", rndv_rtr_hdr->address, rndv_rtr_hdr->rreq_ptr); @@ -928,11 +1676,37 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_rtr_handler, ucp_ep_peer_name(ep), ucs_status_string(status)); } - sreq->send.lane = ucp_rkey_get_rma_bw_lane(sreq->send.rndv_put.rkey, ep, - sreq->send.mem_type, - &sreq->send.rndv_put.uct_rkey, 0); + is_pipeline_rndv = ((!UCP_MEM_IS_ACCESSIBLE_FROM_CPU(sreq->send.mem_type) || + (sreq->send.length != rndv_rtr_hdr->size)) && + (context->config.ext.rndv_mode != UCP_RNDV_MODE_PUT_ZCOPY)); + + sreq->send.lane = ucp_rkey_find_rma_lane(ep->worker->context, ep_config, + (is_pipeline_rndv ? + sreq->send.rndv_put.rkey->mem_type : + sreq->send.mem_type), + ep_config->tag.rndv.put_zcopy_lanes, + sreq->send.rndv_put.rkey, 0, + &sreq->send.rndv_put.uct_rkey); if (sreq->send.lane != UCP_NULL_LANE) { - if (!ucp_rndv_is_pipeline_needed(sreq)) { + /* + * Try pipeline protocol for non-host memory, if PUT_ZCOPY protocol is + * not explicitly required. If pipeline is UNSUPPORTED, fallback to + * PUT_ZCOPY anyway. 
+ */ + if (is_pipeline_rndv) { + status = ucp_rndv_send_start_put_pipeline(sreq, rndv_rtr_hdr); + if (status != UCS_ERR_UNSUPPORTED) { + return status; + } + /* If we get here, it means that RNDV pipeline protocol is + * unsupported and we have to use PUT_ZCOPY RNDV scheme instead */ + } + + if ((context->config.ext.rndv_mode != UCP_RNDV_MODE_GET_ZCOPY) && + ucp_rndv_test_zcopy_scheme_support(sreq->send.length, + ep_config->tag.rndv.min_put_zcopy, + ep_config->tag.rndv.max_put_zcopy, + ep_config->tag.rndv.put_zcopy_split)) { ucp_request_send_state_reset(sreq, ucp_rndv_put_completion, UCP_REQUEST_SEND_PROTO_RNDV_PUT); sreq->send.uct.func = ucp_rndv_progress_rma_put_zcopy; @@ -941,37 +1715,39 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_rtr_handler, sreq->send.mdesc = NULL; goto out_send; } else { - return ucp_rndv_pipeline(sreq, rndv_rtr_hdr); + ucp_rkey_destroy(sreq->send.rndv_put.rkey); } } else { ucp_rkey_destroy(sreq->send.rndv_put.rkey); } } + ucp_trace_req(sreq, "using rdnv_data protocol"); + /* switch to AM */ - sreq->send.tag.rreq_ptr = rndv_rtr_hdr->rreq_ptr; + sreq->send.msg_proto.tag.rreq_ptr = rndv_rtr_hdr->rreq_ptr; if (UCP_DT_IS_CONTIG(sreq->send.datatype) && (sreq->send.length >= - ucp_ep_config(ep)->am.mem_type_zcopy_thresh[sreq->send.mem_type])) + ep_config->am.mem_type_zcopy_thresh[sreq->send.mem_type])) { - status = ucp_request_send_buffer_reg_lane(sreq, ucp_ep_get_am_lane(ep)); + status = ucp_request_send_buffer_reg_lane(sreq, ucp_ep_get_am_lane(ep), 0); ucs_assert_always(status == UCS_OK); ucp_request_send_state_reset(sreq, ucp_rndv_am_zcopy_completion, UCP_REQUEST_SEND_PROTO_ZCOPY_AM); if ((sreq->send.length + sizeof(ucp_rndv_data_hdr_t)) <= - ucp_ep_config(ep)->am.max_zcopy) { + ep_config->am.max_zcopy) { sreq->send.uct.func = ucp_rndv_progress_am_zcopy_single; } else { - sreq->send.uct.func = ucp_rndv_progress_am_zcopy_multi; - sreq->send.tag.am_bw_index = 1; + sreq->send.uct.func = ucp_rndv_progress_am_zcopy_multi; + 
sreq->send.msg_proto.am_bw_index = 1; } } else { ucp_request_send_state_reset(sreq, NULL, UCP_REQUEST_SEND_PROTO_BCOPY_AM); - sreq->send.uct.func = ucp_rndv_progress_am_bcopy; - sreq->send.tag.am_bw_index = 1; + sreq->send.uct.func = ucp_rndv_progress_am_bcopy; + sreq->send.msg_proto.am_bw_index = 1; } out_send: @@ -987,11 +1763,13 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_data_handler, ucp_request_t *rreq = (ucp_request_t*) rndv_data_hdr->rreq_ptr; size_t recv_len; + ucs_assert(!(rreq->flags & UCP_REQUEST_FLAG_RNDV_FRAG)); + recv_len = length - sizeof(*rndv_data_hdr); UCS_PROFILE_REQUEST_EVENT(rreq, "rndv_data_recv", recv_len); (void)ucp_tag_request_process_recv_data(rreq, rndv_data_hdr + 1, recv_len, - rndv_data_hdr->offset, 1); + rndv_data_hdr->offset, 1, 0); return UCS_OK; } diff --git a/src/ucp/tag/rndv.h b/src/ucp/tag/rndv.h index 0eb34d3f3e4..449c923c8bc 100644 --- a/src/ucp/tag/rndv.h +++ b/src/ucp/tag/rndv.h @@ -12,7 +12,6 @@ #include #include #include -#include /* @@ -33,6 +32,8 @@ typedef struct { uintptr_t sreq_ptr; /* request on the rndv initiator side - sender */ uintptr_t rreq_ptr; /* request on the rndv receiver side */ uint64_t address; /* holds the address of the data buffer on the receiver's side */ + size_t size; /* size of the data to receive */ + size_t offset; /* offset of the data in the recv buffer */ /* packed rkeys follow */ } UCS_S_PACKED ucp_rndv_rtr_hdr_t; @@ -52,9 +53,22 @@ void ucp_rndv_matched(ucp_worker_h worker, ucp_request_t *req, ucs_status_t ucp_rndv_progress_rma_get_zcopy(uct_pending_req_t *self); +ucs_status_t ucp_rndv_progress_rma_put_zcopy(uct_pending_req_t *self); + ucs_status_t ucp_rndv_process_rts(void *arg, void *data, size_t length, unsigned tl_flags); size_t ucp_tag_rndv_rts_pack(void *dest, void *arg); +ucs_status_t ucp_tag_rndv_reg_send_buffer(ucp_request_t *sreq); + +static UCS_F_ALWAYS_INLINE int +ucp_rndv_is_get_zcopy(ucp_request_t *req, ucp_context_h context) +{ + return ((context->config.ext.rndv_mode == 
UCP_RNDV_MODE_GET_ZCOPY) || + ((context->config.ext.rndv_mode == UCP_RNDV_MODE_AUTO) && + (!UCP_MEM_IS_CUDA(req->send.mem_type) || + (req->send.length < context->config.ext.rndv_pipeline_send_thresh)))); +} + #endif diff --git a/src/ucp/tag/tag_match.c b/src/ucp/tag/tag_match.c index e8bc72f0688..03dc78727dd 100644 --- a/src/ucp/tag/tag_match.c +++ b/src/ucp/tag/tag_match.c @@ -1,9 +1,13 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "tag_match.inl" #include @@ -45,12 +49,20 @@ ucs_status_t ucp_tag_match_init(ucp_tag_match_t *tm) tm->offload.thresh = SIZE_MAX; tm->offload.zcopy_thresh = SIZE_MAX; tm->offload.iface = NULL; - tm->am.message_id = ucs_generate_uuid(0); return UCS_OK; } void ucp_tag_match_cleanup(ucp_tag_match_t *tm) { + ucp_recv_desc_t *rdesc, *tmp_rdesc; + + ucs_list_for_each_safe(rdesc, tmp_rdesc, &tm->unexpected.all, + tag_list[UCP_RDESC_ALL_LIST]) { + ucs_warn("unexpected tag-receive descriptor %p was not matched", rdesc); + ucp_tag_unexp_remove(rdesc); + ucp_recv_desc_release(rdesc); + } + kh_destroy_inplace(ucp_tag_offload_hash, &tm->offload.tag_hash); kh_destroy_inplace(ucp_tag_frag_hash, &tm->frag_hash); ucs_free(tm->unexpected.hash); @@ -62,7 +74,7 @@ int ucp_tag_unexp_is_empty(ucp_tag_match_t *tm) return ucs_list_is_empty(&tm->unexpected.all); } -void ucp_tag_exp_remove(ucp_tag_match_t *tm, ucp_request_t *req) +int ucp_tag_exp_remove(ucp_tag_match_t *tm, ucp_request_t *req) { ucp_request_queue_t *req_queue = ucp_tag_exp_get_req_queue(tm, req); ucs_queue_iter_t iter; @@ -72,11 +84,14 @@ void ucp_tag_exp_remove(ucp_tag_match_t *tm, ucp_request_t *req) if (qreq == req) { ucp_tag_offload_try_cancel(req->recv.worker, req, 0); ucp_tag_exp_delete(req, tm, req_queue, iter); - return; + return 1; } } - ucs_bug("expected request not found"); + 
ucs_assert(!(req->flags & UCP_REQUEST_FLAG_COMPLETED)); + ucs_trace_req("can't remove req %p (already matched)", req); + + return 0; } static inline uint64_t ucp_tag_exp_req_seq(ucs_queue_iter_t iter) @@ -159,6 +174,7 @@ void ucp_tag_frag_list_process_queue(ucp_tag_match_t *tm, ucp_request_t *req, /* if we completed the request, delete hash entry */ if (status != UCS_INPROGRESS) { kh_del(ucp_tag_frag_hash, &tm->frag_hash, iter); + return; } } diff --git a/src/ucp/tag/tag_match.h b/src/ucp/tag/tag_match.h index 9aa2c033c64..b653658cbf8 100644 --- a/src/ucp/tag/tag_match.h +++ b/src/ucp/tag/tag_match.h @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -46,8 +46,8 @@ typedef struct { * Hash table entry for tag message fragments */ typedef union { - ucs_queue_head_t unexp_q; /* Queue of unexpected descriptors */ - ucp_request_t *exp_req; /* Expected request */ + ucs_queue_head_t unexp_q; /* Queue of unexpected descriptors */ + ucp_request_t *exp_req; /* Expected request */ } ucp_tag_frag_match_t; @@ -98,10 +98,6 @@ typedef struct ucp_tag_match { 'thresh' configuration. */ } offload; - struct { - uint64_t message_id; /* Unique ID for active messages */ - } am; - } ucp_tag_match_t; @@ -109,7 +105,7 @@ ucs_status_t ucp_tag_match_init(ucp_tag_match_t *tm); void ucp_tag_match_cleanup(ucp_tag_match_t *tm); -void ucp_tag_exp_remove(ucp_tag_match_t *tm, ucp_request_t *req); +int ucp_tag_exp_remove(ucp_tag_match_t *tm, ucp_request_t *req); int ucp_tag_unexp_is_empty(ucp_tag_match_t *tm); diff --git a/src/ucp/tag/tag_match.inl b/src/ucp/tag/tag_match.inl index 94ba43525b6..6452265c089 100644 --- a/src/ucp/tag/tag_match.inl +++ b/src/ucp/tag/tag_match.inl @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. 
ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -10,6 +10,7 @@ #include "tag_match.h" #include "eager.h" +#include #include #include #include @@ -168,7 +169,7 @@ ucp_tag_unexp_list_next(ucp_recv_desc_t *rdesc, int i_list) */ static UCS_F_ALWAYS_INLINE ucp_recv_desc_t* ucp_tag_unexp_search(ucp_tag_match_t *tm, ucp_tag_t tag, uint64_t tag_mask, - int remove, const char *title) + int rem, const char *title) { ucp_recv_desc_t *rdesc; ucs_list_link_t *list; @@ -200,7 +201,7 @@ ucp_tag_unexp_search(ucp_tag_match_t *tm, ucp_tag_t tag, uint64_t tag_mask, ucs_trace_req("matched unexp rdesc " UCP_RECV_DESC_FMT " to " "%s tag %"PRIx64"/%"PRIx64, UCP_RECV_DESC_ARG(rdesc), title, tag, tag_mask); - if (remove) { + if (rem) { ucp_tag_unexp_remove(rdesc); } return rdesc; @@ -212,29 +213,123 @@ ucp_tag_unexp_search(ucp_tag_match_t *tm, ucp_tag_t tag, uint64_t tag_mask, return NULL; } +static UCS_F_ALWAYS_INLINE void +ucp_tag_recv_request_release_non_contig_buffer(ucp_request_t *req) +{ + ucs_assert(!UCP_DT_IS_CONTIG(req->recv.datatype)); + ucs_free(req->recv.tag.non_contig_buf); + req->recv.tag.non_contig_buf = NULL; +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_request_recv_offload_data(ucp_request_t *req, const void *data, + size_t length, unsigned recv_flags) +{ + ucs_status_t status = UCS_OK; + size_t offset; + ucp_offload_last_ssend_hdr_t *priv; + + /* Should be used in multi-fragmented flow only */ + ucs_assert(!(recv_flags & UCP_RECV_DESC_FLAG_EAGER_ONLY)); + + if (ucs_test_all_flags(recv_flags, UCP_RECV_DESC_FLAG_EAGER_LAST | + UCP_RECV_DESC_FLAG_EAGER_SYNC)) { + priv = (ucp_offload_last_ssend_hdr_t*)UCS_PTR_BYTE_OFFSET(data, + -sizeof(*priv)); + ucp_tag_offload_sync_send_ack(req->recv.worker, priv->ssend_ack.ep_ptr, + priv->ssend_ack.sender_tag, recv_flags); + } + + if (ucs_unlikely(req->status != UCS_OK)) { + return req->status; + } + + /* There is no correct offset in middle headers with tag offload flow. 
+ * All fragments are always in order - can calculate offset by + * subtraction of already received data. + * NOTE: total length of unexpected eager offload message is not known + * until last fragment arrives, so it is initialized to SIZE_MAX. */ + offset = SIZE_MAX - req->recv.tag.remaining; + + if (ucs_unlikely(req->recv.length < (length + offset))) { + /* We have to release non-contig buffer only in case of + * this is not the first segment and the datatype is + * non-contig */ + if ((offset != 0) && !UCP_DT_IS_CONTIG(req->recv.datatype)) { + ucp_tag_recv_request_release_non_contig_buffer(req); + } + return ucp_request_recv_msg_truncated(req, length, offset); + } + + if (UCP_DT_IS_CONTIG(req->recv.datatype)) { + ucp_request_unpack_contig(req, + UCS_PTR_BYTE_OFFSET(req->recv.buffer, offset), + data, length); + } else { + /* For non-contig data need to assemble the whole message + * before calling unpack. */ + if (offset == 0) { + req->recv.tag.non_contig_buf = ucs_malloc(req->recv.length, + "tag gen buffer"); + if (ucs_unlikely(req->recv.tag.non_contig_buf == NULL)) { + return UCS_ERR_NO_MEMORY; + } + } + + ucp_request_unpack_contig(req, + UCS_PTR_BYTE_OFFSET(req->recv.tag.non_contig_buf, + offset), + data, length); + } + + if (recv_flags & UCP_RECV_DESC_FLAG_EAGER_LAST) { + /* Need to update recv info length. In tag offload protocol we do not + * know the total message length until the last fragment arrives. 
*/ + req->recv.tag.info.length = offset + length; + + if (!UCP_DT_IS_CONTIG(req->recv.datatype)) { + status = ucp_request_recv_data_unpack(req, req->recv.tag.non_contig_buf, + req->recv.tag.info.length, + 0, 1); + ucp_tag_recv_request_release_non_contig_buffer(req); + } + } + + return status; +} + /* * process data, complete receive if done * @return UCS_OK/ERR - completed, UCS_INPROGRESS - not completed */ static UCS_F_ALWAYS_INLINE ucs_status_t ucp_tag_request_process_recv_data(ucp_request_t *req, const void *data, - size_t length, size_t offset, int dereg) + size_t length, size_t offset, int dereg, + unsigned recv_flags) { ucs_status_t status; int last; - last = req->recv.tag.remaining == length; + if (recv_flags & UCP_RECV_DESC_FLAG_EAGER_OFFLOAD) { + req->status = ucp_request_recv_offload_data(req, data, length, + recv_flags); - /* process data only if the request is not in error state */ - if (req->status == UCS_OK) { - status = ucp_request_recv_data_unpack(req, data, length, offset, last); - if (status != UCS_OK) { - req->status = status; + last = recv_flags & UCP_RECV_DESC_FLAG_EAGER_LAST; + } else { + last = req->recv.tag.remaining == length; + + /* process data only if the request is not in error state */ + if (ucs_likely(req->status == UCS_OK)) { + req->status = ucp_request_recv_data_unpack(req, data, length, + offset, last); } + ucs_assertv(req->recv.tag.remaining >= length, + "req->recv.tag.remaining=%zu length=%zu", + req->recv.tag.remaining, length); } - ucs_assert(req->recv.tag.remaining >= length); req->recv.tag.remaining -= length; + if (last) { status = req->status; if (dereg) { @@ -257,8 +352,9 @@ ucp_tag_recv_request_process_rdesc(ucp_request_t *req, ucp_recv_desc_t *rdesc, hdr_len = rdesc->payload_offset; recv_len = rdesc->length - hdr_len; - status = ucp_tag_request_process_recv_data(req, (void*)(rdesc + 1) + hdr_len, - recv_len, offset, 0); + status = ucp_tag_request_process_recv_data(req, + UCS_PTR_BYTE_OFFSET(rdesc + 1, hdr_len), + 
recv_len, offset, 0, rdesc->flags); ucp_recv_desc_release(rdesc); return status; } diff --git a/src/ucp/tag/tag_recv.c b/src/ucp/tag/tag_recv.c index d8a3ea20eb5..1851c3e4632 100644 --- a/src/ucp/tag/tag_recv.c +++ b/src/ucp/tag/tag_recv.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "eager.h" #include "rndv.h" #include "tag_match.inl" @@ -15,31 +19,18 @@ #include -static UCS_F_ALWAYS_INLINE void -ucp_tag_recv_request_completed(ucp_request_t *req, ucs_status_t status, - ucp_tag_recv_info_t *info, const char *function) -{ - ucs_trace_req("%s returning completed request %p (%p) stag 0x%"PRIx64" len %zu, %s", - function, req, req + 1, info->sender_tag, info->length, - ucs_status_string(status)); - - req->status = status; - if ((req->flags |= UCP_REQUEST_FLAG_COMPLETED) & UCP_REQUEST_FLAG_RELEASED) { - ucp_request_put(req); - } - UCS_PROFILE_REQUEST_EVENT(req, "complete_recv", 0); -} - -static UCS_F_ALWAYS_INLINE void +static UCS_F_ALWAYS_INLINE ucs_status_ptr_t ucp_tag_recv_common(ucp_worker_h worker, void *buffer, size_t count, uintptr_t datatype, ucp_tag_t tag, ucp_tag_t tag_mask, - ucp_request_t *req, uint16_t req_flags, ucp_tag_recv_callback_t cb, - ucp_recv_desc_t *rdesc, const char *debug_name) + ucp_request_t *req, ucp_recv_desc_t *rdesc, + const ucp_request_param_t *param, const char *debug_name) { unsigned common_flags = UCP_REQUEST_FLAG_RECV | UCP_REQUEST_FLAG_EXPECTED; + uint32_t req_flags = (param->op_attr_mask & UCP_OP_ATTR_FIELD_CALLBACK) ? 
+ UCP_REQUEST_FLAG_CALLBACK : 0; ucp_eager_first_hdr_t *eagerf_hdr; ucp_request_queue_t *req_queue; - uct_memory_type_t mem_type; + ucs_memory_type_t mem_type; size_t hdr_len, recv_len; ucs_status_t status; uint64_t msg_id; @@ -59,24 +50,32 @@ ucp_tag_recv_common(ucp_worker_h worker, void *buffer, size_t count, ucp_tag_eager_sync_send_ack(worker, rdesc + 1, rdesc->flags); } - req->flags = UCP_REQUEST_FLAG_RECV | req_flags; + req->flags = UCP_REQUEST_FLAG_COMPLETED | + UCP_REQUEST_FLAG_RECV; hdr_len = rdesc->payload_offset; recv_len = rdesc->length - hdr_len; req->recv.tag.info.sender_tag = ucp_rdesc_get_tag(rdesc); req->recv.tag.info.length = recv_len; - - ucp_memory_type_detect_mds(worker->context, buffer, recv_len, &mem_type); + mem_type = ucp_memory_type_detect(worker->context, + buffer, recv_len); status = ucp_dt_unpack_only(worker, buffer, count, datatype, mem_type, - (void*)(rdesc + 1) + hdr_len, recv_len, 1); + UCS_PTR_BYTE_OFFSET(rdesc + 1, hdr_len), + recv_len, 1); ucp_recv_desc_release(rdesc); - if (req_flags & UCP_REQUEST_FLAG_CALLBACK) { - cb(req + 1, status, &req->recv.tag.info); - } - ucp_tag_recv_request_completed(req, status, &req->recv.tag.info, - debug_name); - return; + req->status = status; + UCS_PROFILE_REQUEST_EVENT(req, "complete_recv", 0); + + ucp_request_imm_cmpl_param(param, req, status, recv, + &req->recv.tag.info); + } + + /* TODO: allocate request only in case if flag + * UCP_OP_ATTR_FLAG_FORCE_IMM_CMPL is not set */ + if (ucs_unlikely(param->op_attr_mask & UCP_OP_ATTR_FLAG_FORCE_IMM_CMPL)) { + ucp_request_put_param(param, req); + return UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE); } /* Initialize receive request */ @@ -94,13 +93,15 @@ ucp_tag_recv_common(ucp_worker_h worker, void *buffer, size_t count, req->flags = common_flags | req_flags; req->recv.length = ucp_dt_length(datatype, count, buffer, &req->recv.state); - - ucp_memory_type_detect_mds(worker->context, buffer, req->recv.length, &mem_type); - - req->recv.mem_type = mem_type; + 
req->recv.mem_type = ucp_memory_type_detect(worker->context, buffer, + req->recv.length); req->recv.tag.tag = tag; req->recv.tag.tag_mask = tag_mask; - req->recv.tag.cb = cb; + if (param->op_attr_mask & UCP_OP_ATTR_FIELD_CALLBACK) { + req->recv.tag.cb = param->cb.recv; + req->user_data = param->user_data; + } + if (ucs_log_is_enabled(UCS_LOG_LEVEL_TRACE_REQ)) { req->recv.tag.info.sender_tag = 0; } @@ -118,15 +119,15 @@ ucp_tag_recv_common(ucp_worker_h worker, void *buffer, size_t count, ucs_trace_req("%s returning expected request %p (%p)", debug_name, req, req + 1); - return; + return req + 1; } /* Check rendezvous case */ if (ucs_unlikely(rdesc->flags & UCP_RECV_DESC_FLAG_RNDV)) { ucp_rndv_matched(worker, req, (void*)(rdesc + 1)); - UCP_WORKER_STAT_RNDV(worker, UNEXP); + UCP_WORKER_STAT_RNDV(worker, UNEXP, 1); ucp_recv_desc_release(rdesc); - return; + return req + 1; } if (ucs_unlikely(rdesc->flags & UCP_RECV_DESC_FLAG_EAGER_SYNC)) { @@ -144,11 +145,12 @@ ucp_tag_recv_common(ucp_worker_h worker, void *buffer, size_t count, UCP_WORKER_STAT_EAGER_CHUNK(worker, UNEXP); msg_id = eagerf_hdr->msg_id; status = ucp_tag_recv_request_process_rdesc(req, rdesc, 0); - ucs_assert(status == UCS_INPROGRESS); + ucs_assert((status == UCS_OK) || (status == UCS_INPROGRESS)); /* process additional fragments */ ucp_tag_frag_list_process_queue(&worker->tm, req, msg_id UCS_STATS_ARG(UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_UNEXP)); + return req + 1; } UCS_PROFILE_FUNC(ucs_status_t, ucp_tag_recv_nbr, @@ -157,20 +159,17 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_tag_recv_nbr, uintptr_t datatype, ucp_tag_t tag, ucp_tag_t tag_mask, void *request) { - ucp_request_t *req = (ucp_request_t *)request - 1; - ucp_recv_desc_t *rdesc; - - UCP_CONTEXT_CHECK_FEATURE_FLAGS(worker->context, UCP_FEATURE_TAG, - return UCS_ERR_INVALID_PARAM); - UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); - - rdesc = ucp_tag_unexp_search(&worker->tm, tag, tag_mask, 1, "recv_nbr"); - ucp_tag_recv_common(worker, buffer, count, 
datatype, tag, tag_mask, - req, UCP_REQUEST_DEBUG_FLAG_EXTERNAL, NULL, rdesc, - "recv_nbr"); - - UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); - return UCS_OK; + ucp_request_param_t param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_DATATYPE | + UCP_OP_ATTR_FIELD_REQUEST | + UCP_OP_ATTR_FLAG_NO_IMM_CMPL, + .request = request, + .datatype = datatype + }; + ucs_status_ptr_t status; + + status = ucp_tag_recv_nbx(worker, buffer, count, tag, tag_mask, ¶m); + return UCS_PTR_IS_ERR(status) ? UCS_PTR_STATUS(status) : UCS_OK; } UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_recv_nb, @@ -178,25 +177,45 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_recv_nb, ucp_worker_h worker, void *buffer, size_t count, uintptr_t datatype, ucp_tag_t tag, ucp_tag_t tag_mask, ucp_tag_recv_callback_t cb) +{ + ucp_request_param_t param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_DATATYPE | + UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FLAG_NO_IMM_CMPL, + .cb.recv = (ucp_tag_recv_nbx_callback_t)cb, + .datatype = datatype + }; + + return ucp_tag_recv_nbx(worker, buffer, count, tag, tag_mask, ¶m); +} + +UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_recv_nbx, + (worker, buffer, count, tag, tag_mask, param), + ucp_worker_h worker, void *buffer, size_t count, + ucp_tag_t tag, ucp_tag_t tag_mask, + const ucp_request_param_t *param) { ucp_recv_desc_t *rdesc; ucs_status_ptr_t ret; ucp_request_t *req; + ucp_datatype_t datatype; UCP_CONTEXT_CHECK_FEATURE_FLAGS(worker->context, UCP_FEATURE_TAG, return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); - req = ucp_request_get(worker); - if (ucs_likely(req != NULL)) { - rdesc = ucp_tag_unexp_search(&worker->tm, tag, tag_mask, 1, "recv_nb"); - ucp_tag_recv_common(worker, buffer, count, datatype, tag, tag_mask, req, - UCP_REQUEST_FLAG_CALLBACK, cb, rdesc,"recv_nb"); - ret = req + 1; - } else { - ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); - } + datatype = (param->op_attr_mask & UCP_OP_ATTR_FIELD_DATATYPE) ? 
+ param->datatype : ucp_dt_make_contig(1); + + req = ucp_request_get_param(worker, param, + {ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + goto out;}); + + rdesc = ucp_tag_unexp_search(&worker->tm, tag, tag_mask, 1, "recv_nbx"); + ret = ucp_tag_recv_common(worker, buffer, count, datatype, tag, tag_mask, req, + rdesc, param, "recv_nbx"); +out: UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); return ret; } @@ -207,6 +226,11 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_msg_recv_nb, uintptr_t datatype, ucp_tag_message_h message, ucp_tag_recv_callback_t cb) { + ucp_request_param_t param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FLAG_NO_IMM_CMPL, + .cb.recv = (ucp_tag_recv_nbx_callback_t)cb + }; ucp_recv_desc_t *rdesc = message; ucs_status_ptr_t ret; ucp_request_t *req; @@ -217,10 +241,9 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_msg_recv_nb, req = ucp_request_get(worker); if (ucs_likely(req != NULL)) { - ucp_tag_recv_common(worker, buffer, count, datatype, - ucp_rdesc_get_tag(rdesc), UCP_TAG_MASK_FULL, req, - UCP_REQUEST_FLAG_CALLBACK, cb, rdesc, "msg_recv_nb"); - ret = req + 1; + ret = ucp_tag_recv_common(worker, buffer, count, datatype, + ucp_rdesc_get_tag(rdesc), UCP_TAG_MASK_FULL, + req, rdesc, ¶m, "msg_recv_nb"); } else { ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); } diff --git a/src/ucp/tag/tag_send.c b/src/ucp/tag/tag_send.c index 0547b743d9d..3a4f51ca164 100644 --- a/src/ucp/tag/tag_send.c +++ b/src/ucp/tag/tag_send.c @@ -4,7 +4,11 @@ * See file LICENSE for terms. 
*/ -#include "tag_match.h" +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "tag_match.inl" #include "eager.h" #include "rndv.h" @@ -16,6 +20,13 @@ #include +#define UCP_TAG_SEND_CHECK_STATUS(_status, _ret, _done) \ + if (ucs_likely((_status) != UCS_ERR_NO_RESOURCE)) { \ + _ret = UCS_STATUS_PTR(_status); /* UCS_OK also goes here */ \ + _done; \ + } + + static UCS_F_ALWAYS_INLINE size_t ucp_tag_get_rndv_threshold(const ucp_request_t *req, size_t count, size_t max_iov, size_t rndv_rma_thresh, @@ -44,19 +55,30 @@ ucp_tag_get_rndv_threshold(const ucp_request_t *req, size_t count, static UCS_F_ALWAYS_INLINE ucs_status_ptr_t ucp_tag_send_req(ucp_request_t *req, size_t dt_count, const ucp_ep_msg_config_t* msg_config, - size_t rndv_rma_thresh, size_t rndv_am_thresh, - ucp_send_callback_t cb, const ucp_proto_t *proto, - int enable_zcopy) + const ucp_request_param_t *param, + const ucp_request_send_proto_t *proto) { - size_t rndv_thresh = ucp_tag_get_rndv_threshold(req, dt_count, - msg_config->max_iov, - rndv_rma_thresh, - rndv_am_thresh); - ssize_t max_short = ucp_proto_get_short_max(req, msg_config); + ssize_t max_short = ucp_proto_get_short_max(req, msg_config); ucs_status_t status; size_t zcopy_thresh; + size_t rndv_thresh; + size_t rndv_rma_thresh; + size_t rndv_am_thresh; + + if ((param->op_attr_mask & UCP_OP_ATTR_FLAG_FAST_CMPL) && + ucs_likely(UCP_MEM_IS_ACCESSIBLE_FROM_CPU(req->send.mem_type))) { + rndv_rma_thresh = ucp_ep_config(req->send.ep)->tag.rndv_send_nbr.rma_thresh; + rndv_am_thresh = ucp_ep_config(req->send.ep)->tag.rndv_send_nbr.am_thresh; + } else { + rndv_rma_thresh = ucp_ep_config(req->send.ep)->tag.rndv.rma_thresh; + rndv_am_thresh = ucp_ep_config(req->send.ep)->tag.rndv.am_thresh; + } + + rndv_thresh = ucp_tag_get_rndv_threshold(req, dt_count, msg_config->max_iov, + rndv_rma_thresh, rndv_am_thresh); - if (enable_zcopy || ucs_unlikely(!UCP_MEM_IS_HOST(req->send.mem_type))) { + if (!(param->op_attr_mask & UCP_OP_ATTR_FLAG_FAST_CMPL) || + 
ucs_unlikely(!UCP_MEM_IS_ACCESSIBLE_FROM_CPU(req->send.mem_type))) { zcopy_thresh = ucp_proto_get_zcopy_threshold(req, msg_config, dt_count, rndv_thresh); } else { @@ -67,7 +89,8 @@ ucp_tag_send_req(ucp_request_t *req, size_t dt_count, "buffer=%p length=%zu max_short=%zd rndv_thresh=%zu " "zcopy_thresh=%zu zcopy_enabled=%d", req, req->send.datatype, req->send.buffer, req->send.length, - max_short, rndv_thresh, zcopy_thresh, enable_zcopy); + max_short, rndv_thresh, zcopy_thresh, + !(param->op_attr_mask & UCP_OP_ATTR_FLAG_FAST_CMPL)); status = ucp_request_send_start(req, max_short, zcopy_thresh, rndv_thresh, dt_count, msg_config, proto); @@ -94,23 +117,16 @@ ucp_tag_send_req(ucp_request_t *req, size_t dt_count, /* * Start the request. - * If it is completed immediately, release the request and return the status. + * If it is completed immediately and this completion is allowed, + * release the request and return the status. * Otherwise, return the request. */ status = ucp_request_send(req, 0); if (req->flags & UCP_REQUEST_FLAG_COMPLETED) { - ucs_trace_req("releasing send request %p, returning status %s", req, - ucs_status_string(status)); - if (enable_zcopy) { - ucp_request_put(req); - } - return UCS_STATUS_PTR(status); - } - - if (enable_zcopy) { - ucp_request_set_callback(req, send.cb, cb) + ucp_request_imm_cmpl_param(param, req, status, send); } + ucp_request_set_send_callback_param(param, req, send); ucs_trace_req("returning send request %p", req); return req + 1; } @@ -118,43 +134,46 @@ ucp_tag_send_req(ucp_request_t *req, size_t dt_count, static UCS_F_ALWAYS_INLINE void ucp_tag_send_req_init(ucp_request_t* req, ucp_ep_h ep, const void* buffer, uintptr_t datatype, size_t count, ucp_tag_t tag, - uint16_t flags) + uint32_t flags) { - req->flags = flags; - req->send.ep = ep; - req->send.buffer = (void*)buffer; - req->send.datatype = datatype; - req->send.tag.tag = tag; + req->flags = flags | UCP_REQUEST_FLAG_SEND_TAG; + req->send.ep = ep; + req->send.buffer = 
(void*)buffer; + req->send.datatype = datatype; + req->send.msg_proto.tag.tag = tag; ucp_request_send_state_init(req, datatype, count); req->send.length = ucp_dt_length(req->send.datatype, count, req->send.buffer, &req->send.state.dt); - ucp_memory_type_detect_mds(ep->worker->context, (void *)buffer, - req->send.length, &req->send.mem_type); + req->send.mem_type = ucp_memory_type_detect(ep->worker->context, + (void*)buffer, + req->send.length); req->send.lane = ucp_ep_config(ep)->tag.lane; req->send.pending_lane = UCP_NULL_LANE; } +static UCS_F_ALWAYS_INLINE int +ucp_tag_eager_is_inline(ucp_ep_h ep, const ucp_memtype_thresh_t *max_eager_short, + ssize_t length) +{ + return (ucs_likely(length <= max_eager_short->memtype_off) || + (length <= max_eager_short->memtype_on && + ucp_memory_type_cache_is_empty(ep->worker->context))); +} + static UCS_F_ALWAYS_INLINE ucs_status_t -ucp_tag_send_inline(ucp_ep_h ep, const void *buffer, size_t count, - uintptr_t datatype, ucp_tag_t tag) +ucp_tag_send_inline(ucp_ep_h ep, const void *buffer, size_t length, ucp_tag_t tag) { ucs_status_t status; - size_t length; - if (ucs_unlikely(!UCP_DT_IS_CONTIG(datatype))) { - return UCS_ERR_NO_RESOURCE; - } - - length = ucp_contig_dt_length(datatype, count); - - if ((ssize_t)length <= ucp_ep_config(ep)->tag.max_eager_short && - ucp_memory_type_cache_is_empty(ep->worker->context)) { + if (ucp_tag_eager_is_inline(ep, &ucp_ep_config(ep)->tag.max_eager_short, + length)) { UCS_STATIC_ASSERT(sizeof(ucp_tag_t) == sizeof(ucp_eager_hdr_t)); UCS_STATIC_ASSERT(sizeof(ucp_tag_t) == sizeof(uint64_t)); status = uct_ep_am_short(ucp_ep_get_am_uct_ep(ep), UCP_AM_ID_EAGER_ONLY, tag, buffer, length); - } else if ((ssize_t)length <= ucp_ep_config(ep)->tag.offload.max_eager_short) { + } else if (ucp_tag_eager_is_inline(ep, &ucp_ep_config(ep)->tag.offload.max_eager_short, + length)) { UCS_STATIC_ASSERT(sizeof(ucp_tag_t) == sizeof(uct_tag_t)); status = uct_ep_tag_eager_short(ucp_ep_get_tag_uct_ep(ep), tag, buffer, 
length); @@ -169,102 +188,132 @@ ucp_tag_send_inline(ucp_ep_h ep, const void *buffer, size_t count, return status; } - UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nb, (ep, buffer, count, datatype, tag, cb), ucp_ep_h ep, const void *buffer, size_t count, uintptr_t datatype, ucp_tag_t tag, ucp_send_callback_t cb) { - ucs_status_t status; - ucp_request_t *req; - ucs_status_ptr_t ret; + ucp_request_param_t param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_DATATYPE | UCP_OP_ATTR_FIELD_CALLBACK, + .cb.send = (ucp_send_nbx_callback_t)cb, + .datatype = datatype + }; - UCP_CONTEXT_CHECK_FEATURE_FLAGS(ep->worker->context, UCP_FEATURE_TAG, - return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); - UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - - ucs_trace_req("send_nb buffer %p count %zu tag %"PRIx64" to %s cb %p", - buffer, count, tag, ucp_ep_peer_name(ep), cb); + return ucp_tag_send_nbx(ep, buffer, count, tag, ¶m); +} - status = UCS_PROFILE_CALL(ucp_tag_send_inline, ep, buffer, count, - datatype, tag); - if (ucs_likely(status != UCS_ERR_NO_RESOURCE)) { - ret = UCS_STATUS_PTR(status); /* UCS_OK also goes here */ - goto out; +UCS_PROFILE_FUNC(ucs_status_t, ucp_tag_send_nbr, + (ep, buffer, count, datatype, tag, request), + ucp_ep_h ep, const void *buffer, size_t count, + uintptr_t datatype, ucp_tag_t tag, void *request) +{ + ucp_request_param_t param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_DATATYPE | UCP_OP_ATTR_FIELD_REQUEST | + UCP_OP_ATTR_FLAG_FAST_CMPL, + .datatype = datatype, + .request = request + }; + ucs_status_ptr_t status; + + status = ucp_tag_send_nbx(ep, buffer, count, tag, ¶m); + if (ucs_likely(status == UCS_OK)) { + return UCS_OK; } - req = ucp_request_get(ep->worker); - if (req == NULL) { - ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); - goto out; + if (ucs_unlikely(UCS_PTR_IS_ERR(status))) { + return UCS_PTR_STATUS(status); } + return UCS_INPROGRESS; +} - ucp_tag_send_req_init(req, ep, buffer, datatype, count, tag, 0); +UCS_PROFILE_FUNC(ucs_status_ptr_t, 
ucp_tag_send_sync_nb, + (ep, buffer, count, datatype, tag, cb), + ucp_ep_h ep, const void *buffer, size_t count, + uintptr_t datatype, ucp_tag_t tag, ucp_send_callback_t cb) +{ + ucp_request_param_t param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_DATATYPE | UCP_OP_ATTR_FIELD_CALLBACK, + .cb.send = (ucp_send_nbx_callback_t)cb, + .datatype = datatype + }; - ret = ucp_tag_send_req(req, count, &ucp_ep_config(ep)->tag.eager, - ucp_ep_config(ep)->tag.rndv.rma_thresh, - ucp_ep_config(ep)->tag.rndv.am_thresh, - cb, ucp_ep_config(ep)->tag.proto, 1); -out: - UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); - return ret; + return ucp_tag_send_sync_nbx(ep, buffer, count, tag, ¶m); } -UCS_PROFILE_FUNC(ucs_status_t, ucp_tag_send_nbr, - (ep, buffer, count, datatype, tag, request), +UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nbx, + (ep, buffer, count, tag, param), ucp_ep_h ep, const void *buffer, size_t count, - uintptr_t datatype, ucp_tag_t tag, void *request) + ucp_tag_t tag, const ucp_request_param_t *param) { - ucp_request_t *req = (ucp_request_t *)request - 1; ucs_status_t status; + ucp_request_t *req; ucs_status_ptr_t ret; + uintptr_t datatype; + uint32_t attr_mask; UCP_CONTEXT_CHECK_FEATURE_FLAGS(ep->worker->context, UCP_FEATURE_TAG, - return UCS_ERR_INVALID_PARAM); + return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - ucs_trace_req("send_nbr buffer %p count %zu tag %"PRIx64" to %s req %p", - buffer, count, tag, ucp_ep_peer_name(ep), request); + ucs_trace_req("send_nbx buffer %p count %zu tag %"PRIx64" to %s", + buffer, count, tag, ucp_ep_peer_name(ep)); + + attr_mask = param->op_attr_mask & + (UCP_OP_ATTR_FIELD_DATATYPE | UCP_OP_ATTR_FLAG_NO_IMM_CMPL); + + if (ucs_likely(attr_mask == 0)) { + status = UCS_PROFILE_CALL(ucp_tag_send_inline, ep, buffer, count, tag); + UCP_TAG_SEND_CHECK_STATUS(status, ret, goto out); + datatype = ucp_dt_make_contig(1); + } else if (attr_mask == UCP_OP_ATTR_FIELD_DATATYPE) { + datatype = 
param->datatype; + if (ucs_likely(UCP_DT_IS_CONTIG(datatype))) { + status = UCS_PROFILE_CALL(ucp_tag_send_inline, ep, buffer, + ucp_contig_dt_length(datatype, count), tag); + UCP_TAG_SEND_CHECK_STATUS(status, ret, goto out); + } + } else { + datatype = ucp_dt_make_contig(1); + } - status = UCS_PROFILE_CALL(ucp_tag_send_inline, ep, buffer, count, - datatype, tag); - if (ucs_likely(status != UCS_ERR_NO_RESOURCE)) { - UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); - return status; + if (ucs_unlikely(param->op_attr_mask & UCP_OP_ATTR_FLAG_FORCE_IMM_CMPL)) { + ret = UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE); + goto out; } - ucp_tag_send_req_init(req, ep, buffer, datatype, count, tag, 0); + req = ucp_request_get_param(ep->worker, param, + {ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + goto out;}); + ucp_tag_send_req_init(req, ep, buffer, datatype, count, tag, 0); ret = ucp_tag_send_req(req, count, &ucp_ep_config(ep)->tag.eager, - ucp_ep_config(ep)->tag.rndv_send_nbr.rma_thresh, - ucp_ep_config(ep)->tag.rndv_send_nbr.am_thresh, - NULL, ucp_ep_config(ep)->tag.proto, 0); - + param, ucp_ep_config(ep)->tag.proto); +out: UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); - - if (ucs_unlikely(UCS_PTR_IS_ERR(ret))) { - return UCS_PTR_STATUS(ret); - } - return UCS_INPROGRESS; + return ret; } -UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_sync_nb, - (ep, buffer, count, datatype, tag, cb), +UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_sync_nbx, + (ep, buffer, count, tag, param), ucp_ep_h ep, const void *buffer, size_t count, - uintptr_t datatype, ucp_tag_t tag, ucp_send_callback_t cb) + ucp_tag_t tag, const ucp_request_param_t *param) { + ucs_status_t status; ucp_request_t *req; ucs_status_ptr_t ret; - ucs_status_t status; + uintptr_t datatype; UCP_CONTEXT_CHECK_FEATURE_FLAGS(ep->worker->context, UCP_FEATURE_TAG, return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - ucs_trace_req("send_sync_nb buffer %p count %zu tag %"PRIx64" to %s cb 
%p", - buffer, count, tag, ucp_ep_peer_name(ep), cb); + ucs_trace_req("send_sync_nbx buffer %p count %zu tag %"PRIx64" to %s", + buffer, count, tag, ucp_ep_peer_name(ep)); - if (ucp_ep_config(ep)->key.err_mode == UCP_ERR_HANDLING_MODE_PEER) { + datatype = (param->op_attr_mask & UCP_OP_ATTR_FIELD_DATATYPE) ? + param->datatype : ucp_dt_make_contig(1); + + if (!ucp_ep_config_test_rndv_support(ucp_ep_config(ep))) { ret = UCS_STATUS_PTR(UCS_ERR_UNSUPPORTED); goto out; } @@ -275,19 +324,14 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_sync_nb, goto out; } - req = ucp_request_get(ep->worker); - if (req == NULL) { - ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); - goto out; - } + req = ucp_request_get_param(ep->worker, param, + {ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + goto out;}); ucp_tag_send_req_init(req, ep, buffer, datatype, count, tag, UCP_REQUEST_FLAG_SYNC); - ret = ucp_tag_send_req(req, count, &ucp_ep_config(ep)->tag.eager, - ucp_ep_config(ep)->tag.rndv.rma_thresh, - ucp_ep_config(ep)->tag.rndv.am_thresh, - cb, ucp_ep_config(ep)->tag.sync_proto, 1); + param, ucp_ep_config(ep)->tag.sync_proto); out: UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); return ret; diff --git a/src/ucp/wireup/address.c b/src/ucp/wireup/address.c index f7f12278cb3..9ab8735f2fe 100644 --- a/src/ucp/wireup/address.c +++ b/src/ucp/wireup/address.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "address.h" #include "wireup_ep.h" @@ -17,7 +21,7 @@ /* * Packed address layout: * - * [ uuid(64bit) | worker_name(string) ] + * [ header(8bit) | uuid(64bit) | worker_name(string) ] * [ device1_md_index | device1_address(var) ] * [ tl1_name_csum(string) | tl1_info | tl1_address(var) ] * [ tl2_name_csum(string) | tl2_info | tl2_address(var) ] @@ -25,25 +29,30 @@ * [ device2_md_index | device2_address(var) ] * ... * - * * worker_name is packed if ENABLE_DEBUG is set. - * * In unified mode tl_info contains just rsc_index. 
For last address in the - * tl address list, it will have LAST flag set. + * * Worker name is packed if UCX_ADDRESS_DEBUG_INFO is enabled. + * * In unified mode tl_info contains just rsc_index and iface latency overhead. + * For last address in the tl address list, it will have LAST flag set. + * * For ep address, lane index contains the LAST flag. * * In non unified mode tl_info contains iface attributes. LAST flag is set in * iface address length. * * If a device does not have tl addresses, it's md_index will have the flag * EMPTY. * * If the address list is empty, then it will contain only a single md_index * which equals to UCP_NULL_RESOURCE. - * + * * For non-unified mode, ep address contains length with flags. Multiple ep + * addresses could be present and the last one is marked with the flag + * UCP_ADDRESS_FLAG_LAST. For unified mode, there could not be more than one + * ep address. + * * For any mode, ep address is followed by a lane index. */ typedef struct { - const char *dev_name; size_t dev_addr_len; uint64_t tl_bitmap; ucp_rsc_index_t rsc_index; ucp_rsc_index_t tl_count; + unsigned num_paths; size_t tl_addrs_size; } ucp_address_packed_device_t; @@ -52,38 +61,77 @@ typedef struct { float overhead; float bandwidth; float lat_ovh; - uint32_t prio_cap_flags; /* 8 lsb: prio, 22 msb: cap flags, 2 hsb: amo */ + uint32_t prio_cap_flags; /* 8 lsb : prio + * 22 msb: + * - iface flags + * - iface event flags + * 2 hsb : + * - amo32 + * - amo64 */ } ucp_address_packed_iface_attr_t; -#define UCT_ADDRESS_FLAG_ATOMIC32 UCS_BIT(30) /* 32bit atomic operations */ -#define UCT_ADDRESS_FLAG_ATOMIC64 UCS_BIT(31) /* 64bit atomic operations */ -#define UCP_ADDRESS_FLAG_LAST 0x80 /* Last address in the list */ -#define UCP_ADDRESS_FLAG_EP_ADDR 0x40 /* Indicates that ep addr is packed - right after iface addr */ -#define UCP_ADDRESS_FLAG_LEN_MASK ~(UCP_ADDRESS_FLAG_EP_ADDR | \ - UCP_ADDRESS_FLAG_LAST) +/* In unified mode we pack resource index instead of iface attrs to the 
address, + * so the peer can get all attrs from the local device with the same resource + * index. + * Also we send information which depends on device NUMA locality, + * which may be different on peers (processes which do address pack + * and address unpack): + * - latency overhead + * - Indication whether resource can be used for atomics or not (packed to the + * signed bit of lat_ovh). + * + * TODO: Revise/fix this when NUMA locality is exposed in UCP. + * */ +typedef struct { + ucp_rsc_index_t rsc_index; + float lat_ovh; +} ucp_address_unified_iface_attr_t; -#define UCP_ADDRESS_FLAG_EMPTY 0x80 /* Device without TL addresses */ -#define UCP_ADDRESS_FLAG_MD_ALLOC 0x40 /* MD can register */ -#define UCP_ADDRESS_FLAG_MD_REG 0x20 /* MD can allocate */ -#define UCP_ADDRESS_FLAG_MD_MASK ~(UCP_ADDRESS_FLAG_EMPTY | \ - UCP_ADDRESS_FLAG_MD_ALLOC | \ - UCP_ADDRESS_FLAG_MD_REG) -static size_t ucp_address_worker_name_size(ucp_worker_h worker) -{ -#if ENABLE_DEBUG_DATA - return strlen(ucp_worker_get_name(worker)) + 1; -#else - return 0; -#endif -} +#define UCP_ADDRESS_FLAG_ATOMIC32 UCS_BIT(30) /* 32bit atomic operations */ +#define UCP_ADDRESS_FLAG_ATOMIC64 UCS_BIT(31) /* 64bit atomic operations */ + +#define UCP_ADDRESS_FLAG_LAST 0x80u /* Last address in the list */ +#define UCP_ADDRESS_FLAG_HAS_EP_ADDR 0x40u /* For iface address: + Indicates that ep addr is packed + right after iface addr */ +#define UCP_ADDRESS_FLAG_HAVE_PATHS 0x40u /* For device address: + Indicates that number of paths on the + device is packed right after device + address, otherwise number of paths + defaults to 1. 
*/ +#define UCP_ADDRESS_FLAG_LEN_MASK (UCS_MASK(8) ^ \ + (UCP_ADDRESS_FLAG_HAS_EP_ADDR | \ + UCP_ADDRESS_FLAG_HAVE_PATHS | \ + UCP_ADDRESS_FLAG_LAST)) + +#define UCP_ADDRESS_FLAG_MD_EMPTY_DEV 0x80u /* Device without TL addresses */ +#define UCP_ADDRESS_FLAG_MD_ALLOC 0x40u /* MD can register */ +#define UCP_ADDRESS_FLAG_MD_REG 0x20u /* MD can allocate */ +#define UCP_ADDRESS_FLAG_MD_MASK (UCS_MASK(8) ^ \ + (UCP_ADDRESS_FLAG_MD_EMPTY_DEV | \ + UCP_ADDRESS_FLAG_MD_ALLOC | \ + UCP_ADDRESS_FLAG_MD_REG)) + +#define UCP_ADDRESS_HEADER_VERSION_MASK UCS_MASK(4) /* Version - 4 bits */ +#define UCP_ADDRESS_HEADER_FLAG_DEBUG_INFO UCS_BIT(4) /* Address has debug info */ + +/* Enumeration of UCP address versions. + * Every release which changes the address binary format must bump this number. + */ +enum { + UCP_ADDRESS_VERSION_V1 = 0, + UCP_ADDRESS_VERSION_LAST, + UCP_ADDRESS_VERSION_CURRENT = UCP_ADDRESS_VERSION_LAST - 1 +}; + static size_t ucp_address_iface_attr_size(ucp_worker_t *worker) { return ucp_worker_unified_mode(worker) ? 
- sizeof(ucp_rsc_index_t) : sizeof(ucp_address_packed_iface_attr_t); + sizeof(ucp_address_unified_iface_attr_t) : + sizeof(ucp_address_packed_iface_attr_t); } static uint64_t ucp_worker_iface_can_connect(uct_iface_attr_t *attrs) @@ -95,67 +143,63 @@ static uint64_t ucp_worker_iface_can_connect(uct_iface_attr_t *attrs) /* Pack a string and return a pointer to storage right after the string */ static void* ucp_address_pack_worker_name(ucp_worker_h worker, void *dest) { -#if ENABLE_DEBUG_DATA - const char *s = ucp_worker_get_name(worker); - size_t length = strlen(s); + const char *s; + size_t length; + s = ucp_worker_get_name(worker); + length = strlen(s); ucs_assert(length <= UINT8_MAX); *(uint8_t*)dest = length; - memcpy(dest + 1, s, length); - return dest + 1 + length; -#else - return dest; -#endif + memcpy(UCS_PTR_TYPE_OFFSET(dest, uint8_t), s, length); + return UCS_PTR_BYTE_OFFSET(UCS_PTR_TYPE_OFFSET(dest, uint8_t), length); } /* Unpack a string and return pointer to next storage byte */ -static const void* ucp_address_unpack_worker_name(const void *src, char *s, size_t max) +static const void* +ucp_address_unpack_worker_name(const void *src, char *s) { -#if ENABLE_DEBUG_DATA size_t length, avail; - ucs_assert(max >= 1); length = *(const uint8_t*)src; - avail = ucs_min(length, max - 1); - memcpy(s, src + 1, avail); + avail = ucs_min(length, UCP_WORKER_NAME_MAX - 1); + memcpy(s, UCS_PTR_TYPE_OFFSET(src, uint8_t), avail); s[avail] = '\0'; - return src + length + 1; -#else - s[0] = '\0'; - return src; -#endif + return UCS_PTR_TYPE_OFFSET(UCS_PTR_BYTE_OFFSET(src, length), uint8_t); } static ucp_address_packed_device_t* -ucp_address_get_device(const char *name, ucp_address_packed_device_t *devices, +ucp_address_get_device(ucp_context_h context, ucp_rsc_index_t rsc_index, + ucp_address_packed_device_t *devices, ucp_rsc_index_t *num_devices_p) { + const ucp_tl_resource_desc_t *tl_rsc = context->tl_rscs; ucp_address_packed_device_t *dev; for (dev = devices; dev < devices 
+ *num_devices_p; ++dev) { - if (!strcmp(name, dev->dev_name)) { + if ((tl_rsc[rsc_index].md_index == tl_rsc[dev->rsc_index].md_index) && + !strcmp(tl_rsc[rsc_index].tl_rsc.dev_name, + tl_rsc[dev->rsc_index].tl_rsc.dev_name)) { goto out; } } dev = &devices[(*num_devices_p)++]; memset(dev, 0, sizeof(*dev)); - dev->dev_name = name; out: return dev; } static ucs_status_t -ucp_address_gather_devices(ucp_worker_h worker, uint64_t tl_bitmap, int has_ep, - ucp_address_packed_device_t **devices_p, +ucp_address_gather_devices(ucp_worker_h worker, ucp_ep_h ep, uint64_t tl_bitmap, + uint64_t flags, ucp_address_packed_device_t **devices_p, ucp_rsc_index_t *num_devices_p) { ucp_context_h context = worker->context; ucp_address_packed_device_t *dev, *devices; uct_iface_attr_t *iface_attr; ucp_rsc_index_t num_devices; - ucp_rsc_index_t i; - uint64_t mask; + ucp_rsc_index_t rsc_index; + ucp_lane_index_t lane; devices = ucs_calloc(context->num_tls, sizeof(*devices), "packed_devices"); if (devices == NULL) { @@ -163,37 +207,56 @@ ucp_address_gather_devices(ucp_worker_h worker, uint64_t tl_bitmap, int has_ep, } num_devices = 0; - ucs_for_each_bit(i, context->tl_bitmap) { - mask = UCS_BIT(i); - - if (!(mask & tl_bitmap)) { + tl_bitmap &= context->tl_bitmap; + ucs_for_each_bit(rsc_index, tl_bitmap) { + iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); + if (!ucp_worker_iface_can_connect(iface_attr)) { continue; } - iface_attr = ucp_worker_iface_get_attr(worker, i); + dev = ucp_address_get_device(context, rsc_index, devices, &num_devices); - if (!ucp_worker_iface_can_connect(iface_attr)) { - continue; + if (flags & UCP_ADDRESS_PACK_FLAG_EP_ADDR) { + ucs_assert(ep != NULL); + /* Each lane which matches the resource index adds an ep address + * entry. The length and flags is packed in non-unified mode only. 
+ */ + ucs_for_each_bit(lane, ucp_ep_config(ep)->p2p_lanes) { + if (ucp_ep_get_rsc_index(ep, lane) == rsc_index) { + dev->tl_addrs_size += !ucp_worker_unified_mode(worker); + dev->tl_addrs_size += iface_attr->ep_addr_len; + dev->tl_addrs_size += sizeof(uint8_t); /* lane index */ + } + } } - dev = ucp_address_get_device(context->tl_rscs[i].tl_rsc.dev_name, - devices, &num_devices); + dev->tl_addrs_size += sizeof(uint16_t); /* tl name checksum */ - if (!(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) && has_ep) { - /* ep address (its length will be packed in non-unified mode only) */ - dev->tl_addrs_size += iface_attr->ep_addr_len; - dev->tl_addrs_size += !ucp_worker_unified_mode(worker); + if (flags & UCP_ADDRESS_PACK_FLAG_IFACE_ADDR) { + /* iface address (its length will be packed in non-unified mode only) */ + dev->tl_addrs_size += iface_attr->iface_addr_len; + dev->tl_addrs_size += !ucp_worker_unified_mode(worker); /* if addr length */ + dev->tl_addrs_size += ucp_address_iface_attr_size(worker); + } else { + dev->tl_addrs_size += 1; /* 0-value for valid unpacking */ } - dev->tl_addrs_size += sizeof(uint16_t); /* tl name checksum */ + if (flags & UCP_ADDRESS_PACK_FLAG_DEVICE_ADDR) { + dev->dev_addr_len = iface_attr->device_addr_len; + } else { + dev->dev_addr_len = 0; + } + + if (iface_attr->dev_num_paths > UINT8_MAX) { + ucs_error("only up to %d paths are supported by address pack (got: %u)", + UINT8_MAX, iface_attr->dev_num_paths); + ucs_free(devices); + return UCS_ERR_UNSUPPORTED; + } - /* iface address (its length will be packed in non-unified mode only) */ - dev->tl_addrs_size += iface_attr->iface_addr_len; - dev->tl_addrs_size += !ucp_worker_unified_mode(worker); /* if addr length */ - dev->tl_addrs_size += ucp_address_iface_attr_size(worker); - dev->rsc_index = i; - dev->dev_addr_len = iface_attr->device_addr_len; - dev->tl_bitmap |= mask; + dev->rsc_index = rsc_index; + dev->tl_bitmap |= UCS_BIT(rsc_index); + dev->num_paths = 
iface_attr->dev_num_paths; } *devices_p = devices; @@ -203,180 +266,250 @@ ucp_address_gather_devices(ucp_worker_h worker, uint64_t tl_bitmap, int has_ep, static size_t ucp_address_packed_size(ucp_worker_h worker, const ucp_address_packed_device_t *devices, - ucp_rsc_index_t num_devices) + ucp_rsc_index_t num_devices, + uint64_t pack_flags) { + size_t size = 0; const ucp_address_packed_device_t *dev; - size_t size; - size = sizeof(uint64_t) + ucp_address_worker_name_size(worker); + /* header: version and flags */ + size += 1; + + if (pack_flags & UCP_ADDRESS_PACK_FLAG_WORKER_UUID) { + size += sizeof(uint64_t); + } + + if ((worker->context->config.ext.address_debug_info) && + (pack_flags & UCP_ADDRESS_PACK_FLAG_WORKER_NAME)) { + size += strlen(ucp_worker_get_name(worker)) + 1; + } if (num_devices == 0) { size += 1; /* NULL md_index */ } else { - for (dev = devices; dev < devices + num_devices; ++dev) { + for (dev = devices; dev < (devices + num_devices); ++dev) { size += 1; /* device md_index */ size += 1; /* device address length */ - size += dev->dev_addr_len; /* device address */ + if (pack_flags & UCP_ADDRESS_PACK_FLAG_DEVICE_ADDR) { + size += dev->dev_addr_len; /* device address */ + } + if (dev->num_paths > 1) { + size += 1; /* number of paths */ + } size += dev->tl_addrs_size; /* transport addresses */ } } return size; } -static void ucp_address_memchek(void *ptr, size_t size, - const uct_tl_resource_desc_t *rsc) +static void ucp_address_memcheck(ucp_context_h context, void *ptr, size_t size, + ucp_rsc_index_t rsc_index) { + void *undef_ptr; undef_ptr = (void*)VALGRIND_CHECK_MEM_IS_DEFINED(ptr, size); if (undef_ptr != NULL) { ucs_error(UCT_TL_RESOURCE_DESC_FMT " address contains undefined bytes at offset %zd", - UCT_TL_RESOURCE_DESC_ARG(rsc), undef_ptr - ptr); + UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[rsc_index].tl_rsc), + UCS_PTR_BYTE_DIFF(ptr, undef_ptr)); } } -static ucs_status_t -ucp_address_pack_ep_address(ucp_ep_h ep, ucp_rsc_index_t tl_index, - 
uct_ep_addr_t *addr) +static uint32_t ucp_address_pack_flags(uint64_t input_flags, + uint64_t cap_mask, + uint8_t output_start_bit) { - ucp_lane_index_t lane; + uint32_t result_flags = 0; + uint32_t packed_flag; + uint8_t cap_index; - for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) { - if (ucp_ep_get_rsc_index(ep, lane) == tl_index) { - /* - * If this is a wireup endpoint, it will return the underlying next_ep - * address, and the length will be correct because the resource index - * is of the next_ep. - */ - return uct_ep_get_address(ep->uct_eps[lane], addr); + ucs_assert((ucs_popcount(cap_mask) + output_start_bit) < 32); + packed_flag = UCS_BIT(output_start_bit); + + ucs_for_each_bit(cap_index, cap_mask) { + if (input_flags & UCS_BIT(cap_index)) { + result_flags |= packed_flag; + } + + packed_flag <<= 1; + } + + return result_flags; +} + +static uint64_t ucp_address_unpack_flags(uint32_t input_flags, + uint64_t cap_mask, + uint8_t input_start_bit) +{ + uint64_t result_flags = 0; + uint32_t packed_flag; + uint8_t cap_index; + + ucs_assert((ucs_popcount(cap_mask) + input_start_bit) < 32); + packed_flag = UCS_BIT(input_start_bit); + + ucs_for_each_bit(cap_index, cap_mask) { + if (input_flags & packed_flag) { + result_flags |= UCS_BIT(cap_index); } + + packed_flag <<= 1; } - ucs_bug("provided ucp_ep without required transport"); - return UCS_ERR_INVALID_ADDR; + return result_flags; } static int ucp_address_pack_iface_attr(ucp_worker_h worker, void *ptr, - ucp_rsc_index_t index, + ucp_rsc_index_t rsc_index, const uct_iface_attr_t *iface_attr, int enable_atomics) { - ucp_address_packed_iface_attr_t *packed; - uint32_t packed_flag; - uint64_t cap_flags; - uint64_t bit; + ucp_address_packed_iface_attr_t *packed; + ucp_address_unified_iface_attr_t *unified; + + /* check if at least one of bandwidth values is 0 */ + if ((iface_attr->bandwidth.dedicated * iface_attr->bandwidth.shared) != 0) { + ucs_error("Incorrect bandwidth value: one of bandwidth dedicated/shared must 
be zero"); + return -1; + } + if (ucp_worker_unified_mode(worker)) { /* In unified mode all workers have the same transports and tl bitmap. * Just send rsc index, so the remote peer could fetch iface attributes - * from its local iface. */ - *(ucp_rsc_index_t*)ptr = index; - return sizeof(ucp_rsc_index_t); + * from its local iface. Also send latency overhead, because it + * depends on device NUMA locality. */ + unified = ptr; + unified->rsc_index = rsc_index; + unified->lat_ovh = enable_atomics ? -iface_attr->latency.c : + iface_attr->latency.c; + + return sizeof(*unified); } - packed = ptr; - cap_flags = iface_attr->cap.flags; - + packed = ptr; packed->prio_cap_flags = ((uint8_t)iface_attr->priority); packed->overhead = iface_attr->overhead; - packed->bandwidth = iface_attr->bandwidth; - packed->lat_ovh = iface_attr->latency.overhead; - - /* Keep only the bits defined by UCP_ADDRESS_IFACE_FLAGS, to shrink address. */ - packed_flag = UCS_BIT(8); - bit = 1; - while (UCP_ADDRESS_IFACE_FLAGS & ~(bit - 1)) { - if (UCP_ADDRESS_IFACE_FLAGS & bit) { - if (cap_flags & bit) { - packed->prio_cap_flags |= packed_flag; - } - packed_flag <<= 1; - } - bit <<= 1; - } + packed->bandwidth = iface_attr->bandwidth.dedicated - iface_attr->bandwidth.shared; + packed->lat_ovh = iface_attr->latency.c; + + ucs_assert((ucs_popcount(UCP_ADDRESS_IFACE_FLAGS) + + ucs_popcount(UCP_ADDRESS_IFACE_EVENT_FLAGS)) <= 22); + + /* Keep only the bits defined by UCP_ADDRESS_IFACE_FLAGS + * to shrink address. */ + packed->prio_cap_flags |= + ucp_address_pack_flags(iface_attr->cap.flags, + UCP_ADDRESS_IFACE_FLAGS, 8); + + /* Keep only the bits defined by UCP_ADDRESS_IFACE_EVENT_FLAGS + * to shrink address. 
*/ + packed->prio_cap_flags |= + ucp_address_pack_flags(iface_attr->cap.event_flags, + UCP_ADDRESS_IFACE_EVENT_FLAGS, + 8 + ucs_popcount(UCP_ADDRESS_IFACE_FLAGS)); if (enable_atomics) { if (ucs_test_all_flags(iface_attr->cap.atomic32.op_flags, UCP_ATOMIC_OP_MASK) && ucs_test_all_flags(iface_attr->cap.atomic32.fop_flags, UCP_ATOMIC_FOP_MASK)) { - packed->prio_cap_flags |= UCT_ADDRESS_FLAG_ATOMIC32; + packed->prio_cap_flags |= UCP_ADDRESS_FLAG_ATOMIC32; } if (ucs_test_all_flags(iface_attr->cap.atomic64.op_flags, UCP_ATOMIC_OP_MASK) && ucs_test_all_flags(iface_attr->cap.atomic64.fop_flags, UCP_ATOMIC_FOP_MASK)) { - packed->prio_cap_flags |= UCT_ADDRESS_FLAG_ATOMIC64; + packed->prio_cap_flags |= UCP_ADDRESS_FLAG_ATOMIC64; } } return sizeof(*packed); } -static int +static ucs_status_t ucp_address_unpack_iface_attr(ucp_worker_t *worker, ucp_address_iface_attr_t *iface_attr, - const void *ptr) + const void *ptr, unsigned unpack_flags, + size_t *size_p) { const ucp_address_packed_iface_attr_t *packed; + const ucp_address_unified_iface_attr_t *unified; ucp_worker_iface_t *wiface; - uint32_t packed_flag; ucp_rsc_index_t rsc_idx; - uint64_t bit; if (ucp_worker_unified_mode(worker)) { - /* Address contains resources index, not iface attrs. - * Just take iface attrs from the local resource. */ - rsc_idx = (*(ucp_rsc_index_t*)ptr) & UCP_ADDRESS_FLAG_LEN_MASK; - wiface = ucp_worker_iface(worker, rsc_idx); - iface_attr->cap_flags = wiface->attr.cap.flags; - iface_attr->priority = wiface->attr.priority; - iface_attr->overhead = wiface->attr.overhead; - iface_attr->bandwidth = wiface->attr.bandwidth; - iface_attr->lat_ovh = wiface->attr.latency.overhead; - if (worker->atomic_tls & UCS_BIT(rsc_idx)) { + /* Address contains resources index and iface latency overhead + * (not all iface attrs). 
*/ + unified = ptr; + rsc_idx = unified->rsc_index & UCP_ADDRESS_FLAG_LEN_MASK; + iface_attr->lat_ovh = fabs(unified->lat_ovh); + if (!(worker->context->tl_bitmap & UCS_BIT(rsc_idx))) { + if (!(unpack_flags & UCP_ADDRESS_PACK_FLAG_NO_TRACE)) { + ucs_error("failed to unpack address, resource[%d] is not valid", + rsc_idx); + } + return UCS_ERR_INVALID_ADDR; + } + + /* Just take the rest of iface attrs from the local resource. */ + wiface = ucp_worker_iface(worker, rsc_idx); + iface_attr->cap_flags = wiface->attr.cap.flags; + iface_attr->event_flags = wiface->attr.cap.event_flags; + iface_attr->priority = wiface->attr.priority; + iface_attr->overhead = wiface->attr.overhead; + iface_attr->bandwidth = wiface->attr.bandwidth; + if (signbit(unified->lat_ovh)) { iface_attr->atomic.atomic32.op_flags = wiface->attr.cap.atomic32.op_flags; iface_attr->atomic.atomic32.fop_flags = wiface->attr.cap.atomic32.fop_flags; iface_attr->atomic.atomic64.op_flags = wiface->attr.cap.atomic64.op_flags; iface_attr->atomic.atomic64.fop_flags = wiface->attr.cap.atomic64.fop_flags; } - return sizeof(rsc_idx); - } - - packed = ptr; - iface_attr->cap_flags = 0; - iface_attr->priority = packed->prio_cap_flags & UCS_MASK(8); - iface_attr->overhead = packed->overhead; - iface_attr->bandwidth = packed->bandwidth; - iface_attr->lat_ovh = packed->lat_ovh; - - packed_flag = UCS_BIT(8); - bit = 1; - while (UCP_ADDRESS_IFACE_FLAGS & ~(bit - 1)) { - if (UCP_ADDRESS_IFACE_FLAGS & bit) { - if (packed->prio_cap_flags & packed_flag) { - iface_attr->cap_flags |= bit; - } - packed_flag <<= 1; - } - bit <<= 1; + + *size_p = sizeof(*unified); + return UCS_OK; } - if (packed->prio_cap_flags & UCT_ADDRESS_FLAG_ATOMIC32) { + packed = ptr; + iface_attr->priority = packed->prio_cap_flags & UCS_MASK(8); + iface_attr->overhead = packed->overhead; + iface_attr->bandwidth.dedicated = ucs_max(0.0, packed->bandwidth); + iface_attr->bandwidth.shared = ucs_max(0.0, -packed->bandwidth); + iface_attr->lat_ovh = packed->lat_ovh; 
+ + /* Unpack iface flags */ + iface_attr->cap_flags = + ucp_address_unpack_flags(packed->prio_cap_flags, + UCP_ADDRESS_IFACE_FLAGS, 8); + + /* Unpack iface event flags */ + iface_attr->event_flags = + ucp_address_unpack_flags(packed->prio_cap_flags, + UCP_ADDRESS_IFACE_EVENT_FLAGS, + 8 + ucs_popcount(UCP_ADDRESS_IFACE_FLAGS)); + + /* Unpack iface 32-bit atomic operations */ + if (packed->prio_cap_flags & UCP_ADDRESS_FLAG_ATOMIC32) { iface_attr->atomic.atomic32.op_flags |= UCP_ATOMIC_OP_MASK; iface_attr->atomic.atomic32.fop_flags |= UCP_ATOMIC_FOP_MASK; } - if (packed->prio_cap_flags & UCT_ADDRESS_FLAG_ATOMIC64) { + + /* Unpack iface 64-bit atomic operations */ + if (packed->prio_cap_flags & UCP_ADDRESS_FLAG_ATOMIC64) { iface_attr->atomic.atomic64.op_flags |= UCP_ATOMIC_OP_MASK; iface_attr->atomic.atomic64.fop_flags |= UCP_ATOMIC_FOP_MASK; } - return sizeof(*packed); + *size_p = sizeof(*packed); + return UCS_OK; } -static const void* -ucp_address_iface_flags_ptr(ucp_worker_h worker, const void *attr_ptr, int attr_len) +static void* +ucp_address_iface_flags_ptr(ucp_worker_h worker, void *attr_ptr, int attr_len) { if (ucp_worker_unified_mode(worker)) { /* In unified mode, rsc_index is packed instead of attrs. Address flags * will be packed in the end of rsc_index byte. 
*/ + UCS_STATIC_ASSERT(ucs_offsetof(ucp_address_unified_iface_attr_t, + rsc_index) == 0); return attr_ptr; } @@ -392,118 +525,163 @@ ucp_address_pack_length(ucp_worker_h worker, void *ptr, size_t addr_length) return ptr; } - ucs_assert(addr_length < UINT8_MAX); + ucs_assertv(addr_length <= UCP_ADDRESS_FLAG_LEN_MASK, "addr_length=%zu", + addr_length); *(uint8_t*)ptr = addr_length; - return UCS_PTR_BYTE_OFFSET(ptr, 1); + return UCS_PTR_TYPE_OFFSET(ptr, uint8_t); } static const void* ucp_address_unpack_length(ucp_worker_h worker, const void* flags_ptr, const void *ptr, - size_t *addr_length, int is_ep_addr) + size_t *addr_length, int is_ep_addr, int *is_last_iface) { ucp_rsc_index_t rsc_index; uct_iface_attr_t *attr; + const ucp_address_unified_iface_attr_t *unified; + + /* Caller should not use *is_last_iface for ep address, because for ep + * address last flag is part of lane index */ + ucs_assert(!is_ep_addr || is_last_iface == NULL); if (ucp_worker_unified_mode(worker)) { /* In unified mode: - * - flags are packed with rsc index + * - flags are packed with rsc index in ucp_address_unified_iface_attr_t * - iface and ep addr lengths are not packed, need to take them from * local iface attrs */ - rsc_index = (*(ucp_rsc_index_t*)flags_ptr) & UCP_ADDRESS_FLAG_LEN_MASK; - attr = &ucp_worker_iface(worker, rsc_index)->attr; + unified = flags_ptr; + rsc_index = unified->rsc_index & UCP_ADDRESS_FLAG_LEN_MASK; + attr = ucp_worker_iface_get_attr(worker, rsc_index); + + ucs_assert(&unified->rsc_index == flags_ptr); if (is_ep_addr) { - *addr_length = ((*(uint8_t*)flags_ptr) & UCP_ADDRESS_FLAG_EP_ADDR) ? 
- attr->ep_addr_len : 0; + *addr_length = attr->ep_addr_len; } else { - *addr_length = attr->iface_addr_len; + *addr_length = attr->iface_addr_len; + *is_last_iface = unified->rsc_index & UCP_ADDRESS_FLAG_LAST; } return ptr; } - if (is_ep_addr && !((*(uint8_t*)flags_ptr) & UCP_ADDRESS_FLAG_EP_ADDR)) { - /* No ep address packed */ - *addr_length = 0; - return ptr; + if (!is_ep_addr) { + *is_last_iface = *(uint8_t*)ptr & UCP_ADDRESS_FLAG_LAST; } - *addr_length = (*(uint8_t*)ptr) & UCP_ADDRESS_FLAG_LEN_MASK; + *addr_length = *(uint8_t*)ptr & UCP_ADDRESS_FLAG_LEN_MASK; - return UCS_PTR_BYTE_OFFSET(ptr, 1); + return UCS_PTR_TYPE_OFFSET(ptr, uint8_t); } static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, void *buffer, size_t size, - uint64_t tl_bitmap, unsigned *order, + uint64_t tl_bitmap, unsigned pack_flags, + const ucp_lane_index_t *lanes2remote, const ucp_address_packed_device_t *devices, ucp_rsc_index_t num_devices) { - ucp_context_h context = worker->context; + ucp_context_h context = worker->context; + uint64_t md_flags_pack_mask = (UCT_MD_FLAG_REG | UCT_MD_FLAG_ALLOC); const ucp_address_packed_device_t *dev; + uint8_t *address_header_p; uct_iface_attr_t *iface_attr; - ucp_rsc_index_t md_index; + ucp_md_index_t md_index; ucp_worker_iface_t *wiface; + ucp_rsc_index_t rsc_index; + ucp_lane_index_t lane, remote_lane; + uint64_t dev_tl_bitmap; + unsigned num_ep_addrs; ucs_status_t status; - ucp_rsc_index_t i; size_t iface_addr_len; size_t ep_addr_len; uint64_t md_flags; - unsigned index; + uint8_t *ep_lane_ptr; + void *flags_ptr; + unsigned addr_index; int attr_len; void *ptr; - const void *flags_ptr; + int enable_amo; - ptr = buffer; - index = 0; + ptr = buffer; + addr_index = 0; + address_header_p = ptr; + *address_header_p = UCP_ADDRESS_VERSION_CURRENT; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); - *(uint64_t*)ptr = worker->uuid; - ptr += sizeof(uint64_t); - ptr = ucp_address_pack_worker_name(worker, ptr); + if (pack_flags & 
UCP_ADDRESS_PACK_FLAG_WORKER_UUID) { + *(uint64_t*)ptr = worker->uuid; + ptr = UCS_PTR_TYPE_OFFSET(ptr, worker->uuid); + } + + if (worker->context->config.ext.address_debug_info) { + /* Add debug information to the packed address, and set the corresponding + * flag in address header. + */ + *address_header_p |= UCP_ADDRESS_HEADER_FLAG_DEBUG_INFO; + + if (pack_flags & UCP_ADDRESS_PACK_FLAG_WORKER_NAME) { + ptr = ucp_address_pack_worker_name(worker, ptr); + } + } if (num_devices == 0) { *((uint8_t*)ptr) = UCP_NULL_RESOURCE; - ++ptr; + ptr = UCS_PTR_TYPE_OFFSET(ptr, UCP_NULL_RESOURCE); goto out; } - for (dev = devices; dev < devices + num_devices; ++dev) { + for (dev = devices; dev < (devices + num_devices); ++dev) { + + dev_tl_bitmap = context->tl_bitmap & dev->tl_bitmap; /* MD index */ md_index = context->tl_rscs[dev->rsc_index].md_index; - md_flags = context->tl_mds[md_index].attr.cap.flags; - ucs_assert_always(!(md_index & ~UCP_ADDRESS_FLAG_MD_MASK)); + md_flags = context->tl_mds[md_index].attr.cap.flags & md_flags_pack_mask; + ucs_assertv_always(md_index <= UCP_ADDRESS_FLAG_MD_MASK, + "md_index=%d", md_index); *(uint8_t*)ptr = md_index | - ((dev->tl_bitmap == 0) ? UCP_ADDRESS_FLAG_EMPTY : 0) | - ((md_flags & UCT_MD_FLAG_ALLOC) ? UCP_ADDRESS_FLAG_MD_ALLOC : 0) | - ((md_flags & UCT_MD_FLAG_REG) ? UCP_ADDRESS_FLAG_MD_REG : 0); - ++ptr; + ((dev_tl_bitmap == 0) ? UCP_ADDRESS_FLAG_MD_EMPTY_DEV : 0) | + ((md_flags & UCT_MD_FLAG_ALLOC) ? UCP_ADDRESS_FLAG_MD_ALLOC : 0) | + ((md_flags & UCT_MD_FLAG_REG) ? UCP_ADDRESS_FLAG_MD_REG : 0); + ptr = UCS_PTR_TYPE_OFFSET(ptr, md_index); /* Device address length */ - ucs_assert(dev->dev_addr_len < UCP_ADDRESS_FLAG_LAST); - *(uint8_t*)ptr = dev->dev_addr_len | ((dev == (devices + num_devices - 1)) ? 
- UCP_ADDRESS_FLAG_LAST : 0); - ++ptr; - - /* Device address */ - wiface = ucp_worker_iface(worker, dev->rsc_index); - status = uct_iface_get_device_address(wiface->iface, (uct_device_addr_t*)ptr); - if (status != UCS_OK) { - return status; + *(uint8_t*)ptr = (dev == (devices + num_devices - 1)) ? + UCP_ADDRESS_FLAG_LAST : 0; + if (pack_flags & UCP_ADDRESS_PACK_FLAG_DEVICE_ADDR) { + ucs_assert(dev->dev_addr_len <= UCP_ADDRESS_FLAG_LEN_MASK); + *(uint8_t*)ptr |= dev->dev_addr_len; } - ucp_address_memchek(ptr, dev->dev_addr_len, - &context->tl_rscs[dev->rsc_index].tl_rsc); - ptr += dev->dev_addr_len; + /* Device number of paths flag and value */ + ucs_assert(dev->num_paths >= 1); + ucs_assert(dev->num_paths <= UINT8_MAX); - ucs_for_each_bit(i, context->tl_bitmap) { + if (dev->num_paths > 1) { + *(uint8_t*)ptr |= UCP_ADDRESS_FLAG_HAVE_PATHS; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); + *(uint8_t*)ptr = dev->num_paths; + } + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); - if (!(UCS_BIT(i) & dev->tl_bitmap)) { - continue; + /* Device address */ + if (pack_flags & UCP_ADDRESS_PACK_FLAG_DEVICE_ADDR) { + wiface = ucp_worker_iface(worker, dev->rsc_index); + status = uct_iface_get_device_address(wiface->iface, + (uct_device_addr_t*)ptr); + if (status != UCS_OK) { + return status; } - wiface = ucp_worker_iface(worker, i); + ucp_address_memcheck(context, ptr, dev->dev_addr_len, dev->rsc_index); + ptr = UCS_PTR_BYTE_OFFSET(ptr, dev->dev_addr_len); + } + + flags_ptr = NULL; + ucs_for_each_bit(rsc_index, dev_tl_bitmap) { + + wiface = ucp_worker_iface(worker, rsc_index); iface_attr = &wiface->attr; if (!ucp_worker_iface_can_connect(iface_attr)) { @@ -511,78 +689,146 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, } /* Transport name checksum */ - *(uint16_t*)ptr = context->tl_rscs[i].tl_name_csum; - ptr += sizeof(uint16_t); + *(uint16_t*)ptr = context->tl_rscs[rsc_index].tl_name_csum; + ptr = UCS_PTR_TYPE_OFFSET(ptr, + 
context->tl_rscs[rsc_index].tl_name_csum); /* Transport information */ - attr_len = ucp_address_pack_iface_attr(worker, ptr, i, iface_attr, - worker->atomic_tls & UCS_BIT(i)); - ucp_address_memchek(ptr, attr_len, - &context->tl_rscs[dev->rsc_index].tl_rsc); - - iface_addr_len = iface_attr->iface_addr_len; - flags_ptr = ucp_address_iface_flags_ptr(worker, ptr, attr_len); - ptr += attr_len; - ucs_assert(iface_addr_len < UCP_ADDRESS_FLAG_EP_ADDR); - - /* Pack iface address */ - ptr = ucp_address_pack_length(worker, ptr, iface_addr_len); - status = uct_iface_get_address(wiface->iface, (uct_iface_addr_t*)ptr); - if (status != UCS_OK) { - return status; + enable_amo = worker->atomic_tls & UCS_BIT(rsc_index); + attr_len = ucp_address_pack_iface_attr(worker, ptr, rsc_index, + iface_attr, enable_amo); + if (attr_len < 0) { + return UCS_ERR_INVALID_ADDR; } - ucp_address_memchek(ptr, iface_addr_len, - &context->tl_rscs[dev->rsc_index].tl_rsc); - ptr += iface_addr_len; - if (i == ucs_ilog2(dev->tl_bitmap)) { - *(uint8_t*)flags_ptr |= UCP_ADDRESS_FLAG_LAST; - } + ucp_address_memcheck(context, ptr, attr_len, rsc_index); - /* Pack ep address if present */ - if (!(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) && - (ep != NULL)) { + if (pack_flags & UCP_ADDRESS_PACK_FLAG_IFACE_ADDR) { + iface_addr_len = iface_attr->iface_addr_len; + } else { + iface_addr_len = 0; + } - ep_addr_len = iface_attr->ep_addr_len; - *(uint8_t*)flags_ptr |= UCP_ADDRESS_FLAG_EP_ADDR; + flags_ptr = ucp_address_iface_flags_ptr(worker, ptr, attr_len); + ptr = UCS_PTR_BYTE_OFFSET(ptr, attr_len); - ptr = ucp_address_pack_length(worker, ptr, ep_addr_len); - status = ucp_address_pack_ep_address(ep, i, ptr); + /* Pack iface address */ + ptr = ucp_address_pack_length(worker, ptr, iface_addr_len); + if (pack_flags & UCP_ADDRESS_PACK_FLAG_IFACE_ADDR) { + status = uct_iface_get_address(wiface->iface, + (uct_iface_addr_t*)ptr); if (status != UCS_OK) { return status; } - ucp_address_memchek(ptr, ep_addr_len, - 
&context->tl_rscs[dev->rsc_index].tl_rsc); - ptr += ep_addr_len; + + ucp_address_memcheck(context, ptr, iface_addr_len, rsc_index); + ptr = UCS_PTR_BYTE_OFFSET(ptr, iface_addr_len); } - /* Save the address index of this transport */ - if (order != NULL) { - order[ucs_bitmap2idx(tl_bitmap, i)] = index; + /* Pack ep address if present: iterate over all lanes which use the + * current resource (rsc_index) and pack their addresses. The last + * one is marked with UCP_ADDRESS_FLAG_LAST in its length field. + */ + num_ep_addrs = 0; + if (pack_flags & UCP_ADDRESS_PACK_FLAG_EP_ADDR) { + ucs_assert(ep != NULL); + ep_addr_len = iface_attr->ep_addr_len; + ep_lane_ptr = NULL; + + ucs_for_each_bit(lane, ucp_ep_config(ep)->p2p_lanes) { + if (ucp_ep_get_rsc_index(ep, lane) != rsc_index) { + continue; + } + + /* pack ep address length and save pointer to flags */ + ptr = ucp_address_pack_length(worker, ptr, ep_addr_len); + + /* pack ep address */ + status = uct_ep_get_address(ep->uct_eps[lane], ptr); + if (status != UCS_OK) { + return status; + } + + ucp_address_memcheck(context, ptr, ep_addr_len, rsc_index); + ptr = UCS_PTR_BYTE_OFFSET(ptr, ep_addr_len); + + /* pack ep lane index, and save the pointer for lane index + * of last ep in 'ep_last_ptr' to set UCP_ADDRESS_FLAG_LAST. + */ + remote_lane = (lanes2remote == NULL) ? 
lane : + lanes2remote[lane]; + ucs_assertv(remote_lane <= UCP_ADDRESS_FLAG_LEN_MASK, + "remote_lane=%d", remote_lane); + ep_lane_ptr = ptr; + *ep_lane_ptr = remote_lane; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); + + if (!(pack_flags & UCP_ADDRESS_PACK_FLAG_NO_TRACE)) { + ucs_trace("pack addr[%d].ep_addr[%d] : len %zu lane %d->%d", + addr_index, num_ep_addrs, ep_addr_len, lane, + remote_lane); + } + + ++num_ep_addrs; + } + + if (num_ep_addrs > 0) { + /* set LAST flag for the last ep address */ + ucs_assert(ep_lane_ptr != NULL); + *ep_lane_ptr |= UCP_ADDRESS_FLAG_LAST; + /* indicate that the iface has ep address */ + *(uint8_t*)flags_ptr |= UCP_ADDRESS_FLAG_HAS_EP_ADDR; + } } - ucs_trace("pack addr[%d] : "UCT_TL_RESOURCE_DESC_FMT - " md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64" bw %e ovh %e " - "lat_ovh: %e dev_priority %d", - index, - UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[i].tl_rsc), - md_flags, iface_attr->cap.flags, - iface_attr->bandwidth, - iface_attr->overhead, - iface_attr->latency.overhead, - iface_attr->priority); - ++index; + ucs_assert((num_ep_addrs > 0) || + !(*(uint8_t*)flags_ptr & UCP_ADDRESS_FLAG_HAS_EP_ADDR)); + + if (!(pack_flags & UCP_ADDRESS_PACK_FLAG_NO_TRACE)) { + ucs_trace("pack addr[%d] : "UCT_TL_RESOURCE_DESC_FMT" " + "eps %u md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64" bw %e + %e/n ovh %e " + "lat_ovh %e dev_priority %d a32 0x%lx/0x%lx a64 0x%lx/0x%lx", + addr_index, + UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[rsc_index].tl_rsc), + num_ep_addrs, md_flags, iface_attr->cap.flags, + iface_attr->bandwidth.dedicated, + iface_attr->bandwidth.shared, + iface_attr->overhead, + iface_attr->latency.c, + iface_attr->priority, + iface_attr->cap.atomic32.op_flags, + iface_attr->cap.atomic32.fop_flags, + iface_attr->cap.atomic64.op_flags, + iface_attr->cap.atomic64.fop_flags); + } + + ++addr_index; + ucs_assert(addr_index <= UCP_MAX_RESOURCES); + } + + /* flags_ptr is a valid pointer to the flags set to the last entry + * during the above loop So, set the 
LAST flag for the flags_ptr + * from the last iteration */ + if (flags_ptr != NULL) { + ucs_assert(dev_tl_bitmap != 0); + *(uint8_t*)flags_ptr |= UCP_ADDRESS_FLAG_LAST; + } else { + /* cppcheck-suppress internalAstError */ + ucs_assert(dev_tl_bitmap == 0); } } out: - ucs_assertv(buffer + size == ptr, "buffer=%p size=%zu ptr=%p ptr-buffer=%zd", - buffer, size, ptr, ptr - buffer); + ucs_assertv(UCS_PTR_BYTE_OFFSET(buffer, size) == ptr, + "buffer=%p size=%zu ptr=%p ptr-buffer=%zd", + buffer, size, ptr, UCS_PTR_BYTE_DIFF(buffer, ptr)); return UCS_OK; } -ucs_status_t ucp_address_pack(ucp_worker_h worker, ucp_ep_h ep, uint64_t tl_bitmap, - unsigned *order, size_t *size_p, void **buffer_p) +ucs_status_t ucp_address_pack(ucp_worker_h worker, ucp_ep_h ep, + uint64_t tl_bitmap, unsigned pack_flags, + const ucp_lane_index_t *lanes2remote, + size_t *size_p, void **buffer_p) { ucp_address_packed_device_t *devices; ucp_rsc_index_t num_devices; @@ -590,15 +836,19 @@ ucs_status_t ucp_address_pack(ucp_worker_h worker, ucp_ep_h ep, uint64_t tl_bitm void *buffer; size_t size; + if (ep == NULL) { + pack_flags &= ~UCP_ADDRESS_PACK_FLAG_EP_ADDR; + } + /* Collect all devices we want to pack */ - status = ucp_address_gather_devices(worker, tl_bitmap, ep != NULL, + status = ucp_address_gather_devices(worker, ep, tl_bitmap, pack_flags, &devices, &num_devices); if (status != UCS_OK) { goto out; } /* Calculate packed size */ - size = ucp_address_packed_size(worker, devices, num_devices); + size = ucp_address_packed_size(worker, devices, num_devices, pack_flags); /* Allocate address */ buffer = ucs_malloc(size, "ucp_address"); @@ -610,8 +860,8 @@ ucs_status_t ucp_address_pack(ucp_worker_h worker, ucp_ep_h ep, uint64_t tl_bitm memset(buffer, 0, size); /* Pack the address */ - status = ucp_address_do_pack(worker, ep, buffer, size, tl_bitmap, order, - devices, num_devices); + status = ucp_address_do_pack(worker, ep, buffer, size, tl_bitmap, pack_flags, + lanes2remote, devices, num_devices); if 
(status != UCS_OK) { ucs_free(buffer); goto out_free_devices; @@ -630,14 +880,18 @@ ucs_status_t ucp_address_pack(ucp_worker_h worker, ucp_ep_h ep, uint64_t tl_bitm } ucs_status_t ucp_address_unpack(ucp_worker_t *worker, const void *buffer, + unsigned unpack_flags, ucp_unpacked_address_t *unpacked_address) { ucp_address_entry_t *address_list, *address; + uint8_t address_header, address_version; + ucp_address_entry_ep_addr_t *ep_addr; + int last_dev, last_tl, last_ep_addr; const uct_device_addr_t *dev_addr; ucp_rsc_index_t dev_index; - ucp_rsc_index_t md_index; - unsigned address_count; - int last_dev, last_tl; + ucp_md_index_t md_index; + unsigned dev_num_paths; + ucs_status_t status; int empty_dev; uint64_t md_flags; size_t dev_addr_len; @@ -646,59 +900,46 @@ ucs_status_t ucp_address_unpack(ucp_worker_t *worker, const void *buffer, size_t attr_len; uint8_t md_byte; const void *ptr; - const void *aptr; const void *flags_ptr; - ptr = buffer; - unpacked_address->uuid = *(uint64_t*)ptr; - ptr += sizeof(uint64_t); - - aptr = ucp_address_unpack_worker_name(ptr, unpacked_address->name, - sizeof(unpacked_address->name)); - - address_count = 0; - - /* Count addresses */ - ptr = aptr; - do { - if (*(uint8_t*)ptr == UCP_NULL_RESOURCE) { - break; - } - - /* md_index */ - empty_dev = (*(uint8_t*)ptr) & UCP_ADDRESS_FLAG_EMPTY; - ++ptr; - - /* device address length */ - dev_addr_len = (*(uint8_t*)ptr) & ~UCP_ADDRESS_FLAG_LAST; - last_dev = (*(uint8_t*)ptr) & UCP_ADDRESS_FLAG_LAST; - ++ptr; + /* Initialize the unpacked address to empty */ + unpacked_address->address_count = 0; + unpacked_address->address_list = NULL; - ptr += dev_addr_len; + ptr = buffer; + address_header = *(const uint8_t *)ptr; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); - last_tl = empty_dev; - while (!last_tl) { - ptr += sizeof(uint16_t); /* tl_name_csum */ - attr_len = ucp_address_iface_attr_size(worker); - flags_ptr = ucp_address_iface_flags_ptr(worker, ptr, attr_len); - ptr += attr_len; - ptr = 
ucp_address_unpack_length(worker, flags_ptr, ptr, - &iface_addr_len, 0); - ptr += iface_addr_len; - ptr = ucp_address_unpack_length(worker, flags_ptr, ptr, - &ep_addr_len, 1); - ptr += ep_addr_len; - last_tl = (*(uint8_t*)flags_ptr) & UCP_ADDRESS_FLAG_LAST; + /* Check address version */ + address_version = address_header & UCP_ADDRESS_HEADER_VERSION_MASK; + if (address_version != UCP_ADDRESS_VERSION_CURRENT) { + ucs_error("address version mismatch: expected %u, actual %u", + UCP_ADDRESS_VERSION_CURRENT, address_version); + return UCS_ERR_UNREACHABLE; + } - ++address_count; - ucs_assert(address_count <= UCP_MAX_RESOURCES); - } + if (unpack_flags & UCP_ADDRESS_PACK_FLAG_WORKER_UUID) { + unpacked_address->uuid = *(uint64_t*)ptr; + ptr = UCS_PTR_TYPE_OFFSET(ptr, unpacked_address->uuid); + } else { + unpacked_address->uuid = 0; + } - } while (!last_dev); + if ((address_header & UCP_ADDRESS_HEADER_FLAG_DEBUG_INFO) && + (unpack_flags & UCP_ADDRESS_PACK_FLAG_WORKER_NAME)) { + ptr = ucp_address_unpack_worker_name(ptr, unpacked_address->name); + } else { + ucs_strncpy_safe(unpacked_address->name, UCP_WIREUP_EMPTY_PEER_NAME, + sizeof(unpacked_address->name)); + } + /* Empty address list */ + if (*(uint8_t*)ptr == UCP_NULL_RESOURCE) { + return UCS_OK; + } /* Allocate address list */ - address_list = ucs_calloc(address_count, sizeof(*address_list), + address_list = ucs_calloc(UCP_MAX_RESOURCES, sizeof(*address_list), "ucp_address_list"); if (address_list == NULL) { ucs_error("failed to allocate address list"); @@ -706,71 +947,125 @@ ucs_status_t ucp_address_unpack(ucp_worker_t *worker, const void *buffer, } /* Unpack addresses */ - address = address_list; - ptr = aptr; + address = address_list; dev_index = 0; - do { - if (*(uint8_t*)ptr == UCP_NULL_RESOURCE) { - break; - } + do { /* md_index */ md_byte = (*(uint8_t*)ptr); md_index = md_byte & UCP_ADDRESS_FLAG_MD_MASK; md_flags = (md_byte & UCP_ADDRESS_FLAG_MD_ALLOC) ? 
UCT_MD_FLAG_ALLOC : 0; md_flags |= (md_byte & UCP_ADDRESS_FLAG_MD_REG) ? UCT_MD_FLAG_REG : 0; - empty_dev = md_byte & UCP_ADDRESS_FLAG_EMPTY; - ++ptr; + empty_dev = md_byte & UCP_ADDRESS_FLAG_MD_EMPTY_DEV; + ptr = UCS_PTR_TYPE_OFFSET(ptr, md_byte); /* device address length */ - dev_addr_len = (*(uint8_t*)ptr) & ~UCP_ADDRESS_FLAG_LAST; + dev_addr_len = (*(uint8_t*)ptr) & UCP_ADDRESS_FLAG_LEN_MASK; last_dev = (*(uint8_t*)ptr) & UCP_ADDRESS_FLAG_LAST; - ++ptr; + if ((*(uint8_t*)ptr) & UCP_ADDRESS_FLAG_HAVE_PATHS) { + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); + dev_num_paths = *(uint8_t*)ptr; + } else { + dev_num_paths = 1; + } + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); dev_addr = ptr; - - ptr += dev_addr_len; + ptr = UCS_PTR_BYTE_OFFSET(ptr, dev_addr_len); last_tl = empty_dev; while (!last_tl) { + if (address >= &address_list[UCP_MAX_RESOURCES]) { + if (!(unpack_flags & UCP_ADDRESS_PACK_FLAG_NO_TRACE)) { + ucs_error("failed to parse address: number of addresses" + "exceeds %d", UCP_MAX_RESOURCES); + } + goto err_free; + } + /* tl_name_csum */ address->tl_name_csum = *(uint16_t*)ptr; - ptr += sizeof(uint16_t); + ptr = UCS_PTR_TYPE_OFFSET(ptr, address->tl_name_csum); - address->dev_addr = (dev_addr_len > 0) ? dev_addr : NULL; - address->md_index = md_index; - address->dev_index = dev_index; - address->md_flags = md_flags; + address->dev_addr = (dev_addr_len > 0) ? dev_addr : NULL; + address->md_index = md_index; + address->dev_index = dev_index; + address->md_flags = md_flags; + address->dev_num_paths = dev_num_paths; - attr_len = ucp_address_unpack_iface_attr(worker, &address->iface_attr, ptr); - flags_ptr = ucp_address_iface_flags_ptr(worker, ptr, attr_len); - ptr += attr_len; - ptr = ucp_address_unpack_length(worker, flags_ptr, ptr, - &iface_addr_len, 0); - address->iface_addr = (iface_addr_len > 0) ? 
ptr : NULL; + status = ucp_address_unpack_iface_attr(worker, &address->iface_attr, + ptr, unpack_flags, &attr_len); + if (status != UCS_OK) { + goto err_free; + } - ptr += iface_addr_len; + flags_ptr = ucp_address_iface_flags_ptr(worker, (void*)ptr, attr_len); + ptr = UCS_PTR_BYTE_OFFSET(ptr, attr_len); ptr = ucp_address_unpack_length(worker, flags_ptr, ptr, - &ep_addr_len, 1); - address->ep_addr = (ep_addr_len > 0) ? ptr : NULL; - ptr += ep_addr_len; - last_tl = (*(uint8_t*)flags_ptr) & UCP_ADDRESS_FLAG_LAST; - - ucs_trace("unpack addr[%d] : md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64" bw %e ovh %e " - "lat_ovh %e dev_priority %d", - (int)(address - address_list), - address->md_flags, address->iface_attr.cap_flags, - address->iface_attr.bandwidth, address->iface_attr.overhead, - address->iface_attr.lat_ovh, - address->iface_attr.priority); + &iface_addr_len, 0, &last_tl); + address->iface_addr = (iface_addr_len > 0) ? ptr : NULL; + address->num_ep_addrs = 0; + ptr = UCS_PTR_BYTE_OFFSET(ptr, iface_addr_len); + + last_ep_addr = !(*(uint8_t*)flags_ptr & UCP_ADDRESS_FLAG_HAS_EP_ADDR); + while (!last_ep_addr) { + if (address->num_ep_addrs >= UCP_MAX_LANES) { + if (!(unpack_flags & UCP_ADDRESS_PACK_FLAG_NO_TRACE)) { + ucs_error("failed to parse address: number of ep addresses" + "exceeds %d", UCP_MAX_LANES); + } + goto err_free; + } + + ep_addr = &address->ep_addrs[address->num_ep_addrs++]; + ptr = ucp_address_unpack_length(worker, flags_ptr, ptr, + &ep_addr_len, 1, NULL); + ep_addr->addr = ptr; + ptr = UCS_PTR_BYTE_OFFSET(ptr, ep_addr_len); + + ep_addr->lane = *(uint8_t*)ptr & UCP_ADDRESS_FLAG_LEN_MASK; + last_ep_addr = *(uint8_t*)ptr & UCP_ADDRESS_FLAG_LAST; + + if (!(unpack_flags & UCP_ADDRESS_PACK_FLAG_NO_TRACE)) { + ucs_trace("unpack addr[%d].ep_addr[%d] : len %zu lane %d", + (int)(address - address_list), + (int)(ep_addr - address->ep_addrs), + ep_addr_len, ep_addr->lane); + } + + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); + } + + if (!(unpack_flags & 
UCP_ADDRESS_PACK_FLAG_NO_TRACE)) { + ucs_trace("unpack addr[%d] : eps %u md_flags 0x%"PRIx64 + " tl_iface_flags 0x%"PRIx64" tl_event_flags 0x%"PRIx64 + " bw %e + %e/n ovh %e lat_ovh %e dev_priority %d a32 " + "0x%lx/0x%lx a64 0x%lx/0x%lx", + (int)(address - address_list), address->num_ep_addrs, + address->md_flags, address->iface_attr.cap_flags, + address->iface_attr.event_flags, + address->iface_attr.bandwidth.dedicated, + address->iface_attr.bandwidth.shared, + address->iface_attr.overhead, + address->iface_attr.lat_ovh, + address->iface_attr.priority, + address->iface_attr.atomic.atomic32.op_flags, + address->iface_attr.atomic.atomic32.fop_flags, + address->iface_attr.atomic.atomic64.op_flags, + address->iface_attr.atomic.atomic64.fop_flags); + } + ++address; } ++dev_index; } while (!last_dev); - unpacked_address->address_count = address_count; + unpacked_address->address_count = address - address_list; unpacked_address->address_list = address_list; return UCS_OK; -} +err_free: + ucs_free(address_list); + return UCS_ERR_INVALID_PARAM; +} diff --git a/src/ucp/wireup/address.h b/src/ucp/wireup/address.h index 0f018a89b87..dc2c40a5996 100644 --- a/src/ucp/wireup/address.h +++ b/src/ucp/wireup/address.h @@ -11,6 +11,7 @@ #include #include +#include /* Which iface flags would be packed in the address */ @@ -28,37 +29,67 @@ enum { UCT_IFACE_FLAG_GET_ZCOPY | UCT_IFACE_FLAG_TAG_EAGER_BCOPY | UCT_IFACE_FLAG_TAG_RNDV_ZCOPY | - UCT_IFACE_FLAG_EVENT_RECV | - UCT_IFACE_FLAG_EVENT_RECV_SIG | UCT_IFACE_FLAG_PENDING }; +/* Which iface event flags would be packed in the address */ +enum { + UCP_ADDRESS_IFACE_EVENT_FLAGS = UCP_WORKER_UCT_RECV_EVENT_CAP_FLAGS +}; + + +enum { + UCP_ADDRESS_PACK_FLAG_WORKER_UUID = UCS_BIT(0), /* Add worker UUID */ + UCP_ADDRESS_PACK_FLAG_WORKER_NAME = UCS_BIT(1), /* Pack worker name */ + UCP_ADDRESS_PACK_FLAG_DEVICE_ADDR = UCS_BIT(2), /* Pack device addresses */ + UCP_ADDRESS_PACK_FLAG_IFACE_ADDR = UCS_BIT(3), /* Pack interface addresses */ + 
UCP_ADDRESS_PACK_FLAG_EP_ADDR = UCS_BIT(4), /* Pack endpoint addresses */ + + UCP_ADDRESS_PACK_FLAG_LAST, + + /* A bitmap of all flags: UCP_ADDRESS_PACK_FLAG_LAST is the last bit plus 1, + * so UCP_ADDRESS_PACK_FLAG_LAST<<1 is the next bit plus 2. If we subtract 3 + * we get the next bit minus 1. + */ + UCP_ADDRESS_PACK_FLAGS_ALL = (UCP_ADDRESS_PACK_FLAG_LAST << 1) - 3, + + UCP_ADDRESS_PACK_FLAG_NO_TRACE = UCS_BIT(16) /* Suppress debug tracing */ +}; + + /** * Remote interface attributes. */ struct ucp_address_iface_attr { uint64_t cap_flags; /* Interface capability flags */ + uint64_t event_flags; /* Interface event capability flags */ double overhead; /* Interface performance - overhead */ - double bandwidth; /* Interface performance - bandwidth */ + uct_ppn_bandwidth_t bandwidth; /* Interface performance - bandwidth */ int priority; /* Priority of device */ double lat_ovh; /* Latency overhead */ ucp_tl_iface_atomic_flags_t atomic; /* Atomic operations */ }; +typedef struct ucp_address_entry_ep_addr { + ucp_lane_index_t lane; /* Lane index (local or remote) */ + const uct_ep_addr_t *addr; /* Pointer to ep address */ +} ucp_address_entry_ep_addr_t; /** * Address entry. 
*/ struct ucp_address_entry { - const uct_device_addr_t *dev_addr; /* Points to device address */ - const uct_iface_addr_t *iface_addr; /* Interface address, NULL if not available */ - const uct_ep_addr_t *ep_addr; /* Endpoint address, NULL if not available */ - ucp_address_iface_attr_t iface_attr; /* Interface attributes information */ - uint64_t md_flags; /* MD reg/alloc flags */ - uint16_t tl_name_csum; /* Checksum of transport name */ - ucp_rsc_index_t md_index; /* Memory domain index */ - ucp_rsc_index_t dev_index; /* Device index */ + const uct_device_addr_t *dev_addr; /* Points to device address */ + const uct_iface_addr_t *iface_addr; /* Interface address, NULL if not available */ + unsigned num_ep_addrs; /* How many endpoint address are in ep_addrs */ + ucp_address_entry_ep_addr_t ep_addrs[UCP_MAX_LANES]; /* Endpoint addresses */ + ucp_address_iface_attr_t iface_attr; /* Interface attributes information */ + uint64_t md_flags; /* MD reg/alloc flags */ + unsigned dev_num_paths; /* Number of paths on the device */ + uint16_t tl_name_csum; /* Checksum of transport name */ + ucp_md_index_t md_index; /* Memory domain index */ + ucp_rsc_index_t dev_index; /* Device index */ }; @@ -73,6 +104,18 @@ struct ucp_unpacked_address { }; +/* Iterate over entries in an unpacked address */ +#define ucp_unpacked_address_for_each(_elem, _unpacked_address) \ + for (_elem = (_unpacked_address)->address_list; \ + _elem < (_unpacked_address)->address_list + (_unpacked_address)->address_count; \ + ++_elem) + + +/* Return the index of a specific entry in an unpacked address */ +#define ucp_unpacked_address_index(_unpacked_address, _ae) \ + ((int)((_ae) - (_unpacked_address)->address_list)) + + /** * Pack multiple addresses into a buffer, of resources specified in rsc_bitmap. * For every resource in rcs_bitmap: @@ -80,22 +123,25 @@ struct ucp_unpacked_address { * - if iface is CONNECT_TO_EP, and ep != NULL, and it has a uct_ep on this * resource, pack endpoint address. 
* - * @param [in] worker Worker object whose interface addresses to pack. - * @param [in] ep Endpoint object whose uct_ep addresses to pack. + * @param [in] worker Worker object whose interface addresses to pack. + * @param [in] ep Endpoint object whose uct_ep addresses to pack. * Can be set to NULL, to take addresses only from worker. - * @param [in] tl_bitmap Specifies the resources whose transport address - * (ep or iface) should be packed. - * @param [out] order If != NULL, filled with the order of addresses as they - * were packed. For example: first entry in the array is - * the address index of the first transport specified - * by tl_bitmap. The array should be large enough to - * hold all transports specified by tl_bitmap. - * @param [out] size_p Filled with buffer size. - * @param [out] buffer_p Filled with pointer to packed buffer. It should be - * released by ucs_free(). + * @param [in] tl_bitmap Specifies the resources whose transport address + * (ep or iface) should be packed. + * @param [in] pack_flags UCP_ADDRESS_PACK_FLAG_xx flags to specify address + * format. + * @param [in] lanes2remote If NULL, the lane index in each packed ep address + * will be the local lane index. Otherwise, specifies + * which lane index should be packed in the ep address + * for each local lane. + * @param [out] size_p Filled with buffer size. + * @param [out] buffer_p Filled with pointer to packed buffer. It should be + * released by ucs_free(). */ -ucs_status_t ucp_address_pack(ucp_worker_h worker, ucp_ep_h ep, uint64_t tl_bitmap, - unsigned *order, size_t *size_p, void **buffer_p); +ucs_status_t ucp_address_pack(ucp_worker_h worker, ucp_ep_h ep, + uint64_t tl_bitmap, unsigned pack_flags, + const ucp_lane_index_t *lanes2remote, + size_t *size_p, void **buffer_p); /** @@ -103,6 +149,9 @@ ucs_status_t ucp_address_pack(ucp_worker_h worker, ucp_ep_h ep, uint64_t tl_bitm * * @param [in] worker Worker object. * @param [in] buffer Buffer with data to unpack. 
+ * @param [in] unpack_flags UCP_ADDRESS_PACK_FLAG_xx flags to specify + * address format, must be the same as the address + * which was packed by @ref ucp_address_pack. * @param [out] unpacked_address Filled with remote address data. * * @note Entries in the address list could point into the data buffer, so it @@ -112,6 +161,7 @@ ucs_status_t ucp_address_pack(ucp_worker_h worker, ucp_ep_h ep, uint64_t tl_bitm * by ucs_free(). */ ucs_status_t ucp_address_unpack(ucp_worker_h worker, const void *buffer, + unsigned unpack_flags, ucp_unpacked_address_t *unpacked_address); diff --git a/src/ucp/wireup/ep_match.c b/src/ucp/wireup/ep_match.c index b08d160dc0f..193cc26d609 100644 --- a/src/ucp/wireup/ep_match.c +++ b/src/ucp/wireup/ep_match.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include #include @@ -13,37 +17,6 @@ __KHASH_IMPL(ucp_ep_match, static UCS_F_MAYBE_UNUSED inline, uint64_t, ucp_ep_match_entry_t, 1, kh_int64_hash_func, kh_int64_hash_equal); - -#define ucp_ep_match_list_for_each(_elem, _head, _member) \ - for (_elem = ucs_container_of((_head)->next, typeof(*_elem), _member); \ - (_elem) != ucs_container_of(NULL, typeof(*_elem), _member); \ - _elem = ucs_container_of((_elem)->_member.next, typeof(*_elem), _member)) - -static inline void ucp_ep_match_list_add_tail(ucs_list_link_t *head, - ucs_list_link_t *elem) -{ - ucs_list_link_t *last; - - last = head->prev; - elem->next = NULL; - head->prev = elem; - - if (last == NULL) { - elem->prev = NULL; - head->next = elem; - } else { - elem->prev = last; - last->next = elem; - } -} - -static inline void ucp_ep_match_list_del(ucs_list_link_t *head, - ucs_list_link_t *elem) -{ - (elem->prev ? elem->prev : head)->next = elem->next; - (elem->next ? 
elem->next : head)->prev = elem->prev; -} - void ucp_ep_match_init(ucp_ep_match_ctx_t *match_ctx) { kh_init_inplace(ucp_ep_match, &match_ctx->hash); @@ -55,11 +28,11 @@ void ucp_ep_match_cleanup(ucp_ep_match_ctx_t *match_ctx) uint64_t dest_uuid; kh_foreach(&match_ctx->hash, dest_uuid, entry, { - if (entry.exp_ep_q.next != NULL) { + if (!ucs_hlist_is_empty(&entry.exp_ep_q)) { ucs_warn("match_ctx %p: uuid 0x%"PRIx64" expected queue is not empty", match_ctx, dest_uuid); } - if (entry.unexp_ep_q.next != NULL) { + if (!ucs_hlist_is_empty(&entry.unexp_ep_q)) { ucs_warn("match_ctx %p: uuid 0x%"PRIx64" unexpected queue is not empty", match_ctx, dest_uuid); } @@ -80,10 +53,8 @@ ucp_ep_match_entry_get(ucp_ep_match_ctx_t *match_ctx, uint64_t dest_uuid) if (ret != 0) { /* initialize match list on first use */ entry->next_conn_sn = 0; - entry->exp_ep_q.next = NULL; - entry->exp_ep_q.prev = NULL; - entry->unexp_ep_q.next = NULL; - entry->unexp_ep_q.prev = NULL; + ucs_hlist_head_init(&entry->exp_ep_q); + ucs_hlist_head_init(&entry->unexp_ep_q); } return entry; @@ -97,7 +68,7 @@ ucp_ep_conn_sn_t ucp_ep_match_get_next_sn(ucp_ep_match_ctx_t *match_ctx, } static void ucp_ep_match_insert_common(ucp_ep_match_ctx_t *match_ctx, - ucs_list_link_t *list, ucp_ep_h ep, + ucs_hlist_head_t *head, ucp_ep_h ep, uint64_t dest_uuid, const char *title) { /* NOTE: protect union */ @@ -105,7 +76,7 @@ static void ucp_ep_match_insert_common(ucp_ep_match_ctx_t *match_ctx, UCP_EP_FLAG_FLUSH_STATE_VALID | UCP_EP_FLAG_LISTENER))); - ucp_ep_match_list_add_tail(list, &ucp_ep_ext_gen(ep)->ep_match.list); + ucs_hlist_add_tail(head, &ucp_ep_ext_gen(ep)->ep_match.list); ep->flags |= UCP_EP_FLAG_ON_MATCH_CTX; ucp_ep_ext_gen(ep)->ep_match.dest_uuid = dest_uuid; ucs_trace("match_ctx %p: ep %p added as %s uuid 0x%"PRIx64" conn_sn %d", @@ -137,8 +108,8 @@ ucp_ep_match_retrieve_common(ucp_ep_match_ctx_t *match_ctx, uint64_t dest_uuid, ucp_ep_flags_t exp_ep_flags, const char *title) { ucp_ep_match_entry_t *entry; - 
ucs_list_link_t *list; ucp_ep_ext_gen_t *ep_ext; + ucs_hlist_head_t *head; khiter_t iter; ucp_ep_h ep; @@ -148,11 +119,12 @@ ucp_ep_match_retrieve_common(ucp_ep_match_ctx_t *match_ctx, uint64_t dest_uuid, } entry = &kh_value(&match_ctx->hash, iter); - list = is_exp ? &entry->exp_ep_q : &entry->unexp_ep_q; - ucp_ep_match_list_for_each(ep_ext, list, ep_match.list) { + head = is_exp ? &entry->exp_ep_q : &entry->unexp_ep_q; + + ucs_hlist_for_each(ep_ext, head, ep_match.list) { ep = ucp_ep_from_ext_gen(ep_ext); if (ep->conn_sn == conn_sn) { - ucp_ep_match_list_del(list, &ep_ext->ep_match.list); + ucs_hlist_del(head, &ep_ext->ep_match.list); ucs_trace("match_ctx %p: matched %s ep %p by uuid 0x%"PRIx64" conn_sn %d", match_ctx, title, ep, dest_uuid, conn_sn); ucs_assertv(ucs_test_all_flags(ep->flags, @@ -200,10 +172,10 @@ void ucp_ep_match_remove_ep(ucp_ep_match_ctx_t *match_ctx, ucp_ep_h ep) if (ep->flags & UCP_EP_FLAG_DEST_EP) { ucs_trace("match_ctx %p: remove unexpected ep %p", match_ctx, ep); - ucp_ep_match_list_del(&entry->unexp_ep_q, &ep_ext->ep_match.list); + ucs_hlist_del(&entry->unexp_ep_q, &ep_ext->ep_match.list); } else { ucs_trace("match_ctx %p: remove expected ep %p", match_ctx, ep); - ucp_ep_match_list_del(&entry->exp_ep_q, &ep_ext->ep_match.list); + ucs_hlist_del(&entry->exp_ep_q, &ep_ext->ep_match.list); } ep->flags &= ~UCP_EP_FLAG_ON_MATCH_CTX; } diff --git a/src/ucp/wireup/ep_match.h b/src/ucp/wireup/ep_match.h index 6b424d3957e..dd7966d9c11 100644 --- a/src/ucp/wireup/ep_match.h +++ b/src/ucp/wireup/ep_match.h @@ -9,7 +9,7 @@ #include #include -#include +#include /* @@ -17,7 +17,7 @@ */ typedef struct { uint64_t dest_uuid; /* Destination worker UUID */ - ucs_list_link_t list; /* List entry into endpoint + ucs_hlist_link_t list; /* List entry into endpoint matching structure */ } ucp_ep_match_t; @@ -28,9 +28,9 @@ typedef struct { * The expected/unexpected lists are *not* circular */ typedef struct ucp_ep_match_entry { - ucs_list_link_t exp_ep_q; /* 
Endpoints created by API and not + ucs_hlist_head_t exp_ep_q; /* Endpoints created by API and not connected to remote endpoint */ - ucs_list_link_t unexp_ep_q; /* Endpoints created internally as + ucs_hlist_head_t unexp_ep_q; /* Endpoints created internally as connected a to remote endpoints, but not provided to user yet */ ucp_ep_conn_sn_t next_conn_sn; /* Sequence number of matching diff --git a/src/ucp/wireup/select.c b/src/ucp/wireup/select.c index 588cee506a4..094adefbcbc 100644 --- a/src/ucp/wireup/select.c +++ b/src/ucp/wireup/select.c @@ -1,10 +1,16 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2016. ALL RIGHTS RESERVED. + * Copyright (C) Los Alamos National Security, LLC. 2019 ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "wireup.h" +#include "wireup_cm.h" #include "address.h" #include @@ -35,26 +41,14 @@ typedef struct ucp_wireup_atomic_flag { } ucp_wireup_atomic_flag_t; -enum { - UCP_WIREUP_LANE_USAGE_AM = UCS_BIT(0), /* Active messages */ - UCP_WIREUP_LANE_USAGE_AM_BW = UCS_BIT(1), /* High-BW active messages */ - UCP_WIREUP_LANE_USAGE_RMA = UCS_BIT(2), /* Remote memory access */ - UCP_WIREUP_LANE_USAGE_RMA_BW = UCS_BIT(3), /* High-BW remote memory access */ - UCP_WIREUP_LANE_USAGE_AMO = UCS_BIT(4), /* Atomic memory access */ - UCP_WIREUP_LANE_USAGE_TAG = UCS_BIT(5) /* Tag matching offload */ -}; - - typedef struct { - ucp_rsc_index_t rsc_index; - unsigned addr_index; - ucp_lane_index_t proxy_lane; - ucp_rsc_index_t dst_md_index; - uint32_t usage; - double am_bw_score; - double rma_score; - double rma_bw_score; - double amo_score; + ucp_rsc_index_t rsc_index; + unsigned addr_index; + unsigned path_index; + ucp_lane_index_t proxy_lane; + ucp_md_index_t dst_md_index; + ucp_lane_type_mask_t lane_types; + double score[UCP_LANE_TYPE_LAST]; } ucp_wireup_lane_desc_t; @@ -63,14 +57,40 @@ typedef struct { uint64_t local_dev_bitmap; uint64_t remote_dev_bitmap; ucp_md_map_t md_map; - 
uint32_t usage; + ucp_lane_type_t lane_type; unsigned max_lanes; } ucp_wireup_select_bw_info_t; +/** + * Global parameters for lanes selection during UCP wireup procedure + */ +typedef struct { + ucp_ep_h ep; /* UCP Endpoint */ + unsigned ep_init_flags; /* Endpoint init flags */ + uint64_t tl_bitmap; /* TLs bitmap which can be selected */ + const ucp_unpacked_address_t *address; /* Remote addresses */ + int allow_am; /* Shows whether emulation over AM + * is allowed or not for RMA/AMO */ + int show_error; /* Global flag that controls showing + * errors from a selecting transport + * procedure */ +} ucp_wireup_select_params_t; + +/** + * Context for lanes selection during UCP wireup procedure + */ +typedef struct { + ucp_wireup_lane_desc_t lane_descs[UCP_MAX_LANES]; /* Array of active lanes that are + * found during selection */ + ucp_lane_index_t num_lanes; /* Number of active lanes */ + unsigned ucp_ep_init_flags; /* Endpoint init extra flags */ +} ucp_wireup_select_context_t; + static const char *ucp_wireup_md_flags[] = { [ucs_ilog2(UCT_MD_FLAG_ALLOC)] = "memory allocation", [ucs_ilog2(UCT_MD_FLAG_REG)] = "memory registration", + [ucs_ilog2(UCT_MD_FLAG_RKEY_PTR)] = "obtain remote memory pointer" }; static const char *ucp_wireup_iface_flags[] = { @@ -89,9 +109,6 @@ static const char *ucp_wireup_iface_flags[] = { [ucs_ilog2(UCT_IFACE_FLAG_AM_DUP)] = "full reliability", [ucs_ilog2(UCT_IFACE_FLAG_CB_SYNC)] = "sync callback", [ucs_ilog2(UCT_IFACE_FLAG_CB_ASYNC)] = "async callback", - [ucs_ilog2(UCT_IFACE_FLAG_EVENT_SEND_COMP)] = "send completion event", - [ucs_ilog2(UCT_IFACE_FLAG_EVENT_RECV)] = "tag or active message event", - [ucs_ilog2(UCT_IFACE_FLAG_EVENT_RECV_SIG)] = "signaled message event", [ucs_ilog2(UCT_IFACE_FLAG_PENDING)] = "pending", [ucs_ilog2(UCT_IFACE_FLAG_TAG_EAGER_SHORT)] = "tag eager short", [ucs_ilog2(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)] = "tag eager bcopy", @@ -99,6 +116,12 @@ static const char *ucp_wireup_iface_flags[] = { 
[ucs_ilog2(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY)] = "tag rndv zcopy" }; +static const char *ucp_wireup_event_flags[] = { + [ucs_ilog2(UCT_IFACE_FLAG_EVENT_SEND_COMP)] = "send completion event", + [ucs_ilog2(UCT_IFACE_FLAG_EVENT_RECV)] = "tag or active message event", + [ucs_ilog2(UCT_IFACE_FLAG_EVENT_RECV_SIG)] = "signaled message event" +}; + static ucp_wireup_atomic_flag_t ucp_wireup_atomic_desc[] = { [UCT_ATOMIC_OP_ADD] = {.name = "add", .fetch = "fetch-"}, [UCT_ATOMIC_OP_AND] = {.name = "and", .fetch = "fetch-"}, @@ -200,55 +223,61 @@ static int ucp_wireup_check_amo_flags(const uct_tl_resource_desc_t *resource, return 0; } -static int ucp_wireup_is_reachable(ucp_worker_h worker, ucp_rsc_index_t rsc_index, - const ucp_address_entry_t *ae) +static void +ucp_wireup_init_select_info(ucp_context_h context, double score, + unsigned addr_index, ucp_rsc_index_t rsc_index, + uint8_t priority, const char *title, + ucp_wireup_select_info_t *select_info) { - ucp_context_h context = worker->context; - ucp_worker_iface_t *wiface = ucp_worker_iface(worker, rsc_index); + ucs_assert(score >= 0.0); - return (context->tl_rscs[rsc_index].tl_name_csum == ae->tl_name_csum) && - uct_iface_is_reachable(wiface->iface, ae->dev_addr, ae->iface_addr); + select_info->score = score; + select_info->addr_index = addr_index; + select_info->path_index = 0; + select_info->rsc_index = rsc_index; + select_info->priority = priority; } /** * Select a local and remote transport */ static UCS_F_NOINLINE ucs_status_t -ucp_wireup_select_transport(ucp_ep_h ep, const ucp_address_entry_t *address_list, - unsigned address_count, const ucp_wireup_criteria_t *criteria, +ucp_wireup_select_transport(const ucp_wireup_select_params_t *select_params, + const ucp_wireup_criteria_t *criteria, uint64_t tl_bitmap, uint64_t remote_md_map, - uint64_t local_dev_bitmap, uint64_t remote_dev_bitmap, - int show_error, ucp_rsc_index_t *rsc_index_p, - unsigned *dst_addr_index_p, double *score_p) + uint64_t local_dev_bitmap, + 
uint64_t remote_dev_bitmap, + int show_error, + ucp_wireup_select_info_t *select_info) { - ucp_worker_h worker = ep->worker; - ucp_context_h context = worker->context; + ucp_ep_h ep = select_params->ep; + ucp_worker_h worker = ep->worker; + ucp_context_h context = worker->context; + ucp_wireup_select_info_t sinfo = {0}; + int found = 0; + unsigned addr_index; uct_tl_resource_desc_t *resource; const ucp_address_entry_t *ae; ucp_rsc_index_t rsc_index; - double score, best_score; char tls_info[256]; char *p, *endp; uct_iface_attr_t *iface_attr; uct_md_attr_t *md_attr; uint64_t addr_index_map; - unsigned addr_index; - int reachable; - int found; - uint8_t priority, best_score_priority; - float epsilon; /* a small value to overcome float imprecision */ - - found = 0; - best_score = 0.0; - best_score_priority = 0; - p = tls_info; - endp = tls_info + sizeof(tls_info) - 1; - tls_info[0] = '\0'; + int is_reachable; + double score; + uint8_t priority; + + p = tls_info; + endp = tls_info + sizeof(tls_info) - 1; + tls_info[0] = '\0'; + tl_bitmap &= (select_params->tl_bitmap & context->tl_bitmap); + show_error = (select_params->show_error && show_error); /* Check which remote addresses satisfy the criteria */ addr_index_map = 0; - for (ae = address_list; ae < address_list + address_count; ++ae) { - addr_index = ae - address_list; + ucp_unpacked_address_for_each(ae, select_params->address) { + addr_index = ucp_unpacked_address_index(select_params->address, ae); if (!(remote_dev_bitmap & UCS_BIT(ae->dev_index))) { ucs_trace("addr[%d]: not in use, because on device[%d]", addr_index, ae->dev_index); @@ -257,7 +286,8 @@ ucp_wireup_select_transport(ucp_ep_h ep, const ucp_address_entry_t *address_list ucs_trace("addr[%d]: not in use, because on md[%d]", addr_index, ae->md_index); continue; - } else if (!ucs_test_all_flags(ae->md_flags, criteria->remote_md_flags)) { + } else if (!ucs_test_all_flags(ae->md_flags, + criteria->remote_md_flags)) { ucs_trace("addr[%d] %s: no %s", 
addr_index, ucp_find_tl_name_by_csum(context, ae->tl_name_csum), ucp_wireup_get_missing_flag_desc(ae->md_flags, @@ -270,6 +300,8 @@ ucp_wireup_select_transport(ucp_ep_h ep, const ucp_address_entry_t *address_list * ucp packed address */ ucs_assert(ucs_test_all_flags(UCP_ADDRESS_IFACE_FLAGS, criteria->remote_iface_flags)); + ucs_assert(ucs_test_all_flags(UCP_ADDRESS_IFACE_EVENT_FLAGS, + criteria->remote_event_flags)); if (!ucs_test_all_flags(ae->iface_attr.cap_flags, criteria->remote_iface_flags)) { ucs_trace("addr[%d] %s: no %s", addr_index, @@ -280,6 +312,15 @@ ucp_wireup_select_transport(ucp_ep_h ep, const ucp_address_entry_t *address_list continue; } + if (!ucs_test_all_flags(ae->iface_attr.event_flags, criteria->remote_event_flags)) { + ucs_trace("addr[%d] %s: no %s", addr_index, + ucp_find_tl_name_by_csum(context, ae->tl_name_csum), + ucp_wireup_get_missing_flag_desc(ae->iface_attr.event_flags, + criteria->remote_event_flags, + ucp_wireup_event_flags)); + continue; + } + UCP_WIREUP_CHECK_AMO_FLAGS(ae, criteria, context, addr_index, op, 32); UCP_WIREUP_CHECK_AMO_FLAGS(ae, criteria, context, addr_index, op, 64); UCP_WIREUP_CHECK_AMO_FLAGS(ae, criteria, context, addr_index, fop, 32); @@ -298,7 +339,7 @@ ucp_wireup_select_transport(ucp_ep_h ep, const ucp_address_entry_t *address_list * Pick the best local resource to satisfy the criteria. 
* best one has the highest score (from the dedicated score_func) and * has a reachable tl on the remote peer */ - ucs_for_each_bit(rsc_index, context->tl_bitmap) { + ucs_for_each_bit(rsc_index, tl_bitmap) { resource = &context->tl_rscs[rsc_index].tl_rsc; iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); md_attr = &context->tl_mds[context->tl_rscs[rsc_index].md_index].attr; @@ -315,6 +356,9 @@ ucp_wireup_select_transport(ucp_ep_h ep, const ucp_address_entry_t *address_list !ucp_wireup_check_flags(resource, iface_attr->cap.flags, criteria->local_iface_flags, criteria->title, ucp_wireup_iface_flags, p, endp - p) || + !ucp_wireup_check_flags(resource, iface_attr->cap.event_flags, + criteria->local_event_flags, criteria->title, + ucp_wireup_event_flags, p, endp - p) || !ucp_wireup_check_amo_flags(resource, iface_attr->cap.atomic32.op_flags, criteria->local_atomic_flags.atomic32.op_flags, 32, 0, criteria->title, p, endp - p) || @@ -351,46 +395,38 @@ ucp_wireup_select_transport(ucp_ep_h ep, const ucp_address_entry_t *address_list continue; } - reachable = 0; - - for (ae = address_list; ae < address_list + address_count; ++ae) { - if (!(addr_index_map & UCS_BIT(ae - address_list)) || - !ucp_wireup_is_reachable(worker, rsc_index, ae)) - { + is_reachable = 0; + ucp_unpacked_address_for_each(ae, select_params->address) { + addr_index = ucp_unpacked_address_index(select_params->address, ae); + if (!(addr_index_map & UCS_BIT(addr_index)) || + !ucp_wireup_is_reachable(ep, rsc_index, ae)) { /* Must be reachable device address, on same transport */ continue; } - reachable = 1; - - score = criteria->calc_score(context, md_attr, iface_attr, - &ae->iface_attr); - ucs_assert(score >= 0.0); - - priority = iface_attr->priority + ae->iface_attr.priority; - - ucs_trace(UCT_TL_RESOURCE_DESC_FMT "->addr[%zd] : %s score %.2f priority %d", - UCT_TL_RESOURCE_DESC_ARG(resource), ae - address_list, - criteria->title, score, priority); - - /* First comparing score, if score equals to 
current best score, - * comparing priority with the priority of best score */ - epsilon = (score + best_score) * (1e-6); - if (!found || (score > (best_score + epsilon)) || - ((fabs(score - best_score) < epsilon) && (priority > best_score_priority))) { - *rsc_index_p = rsc_index; - *dst_addr_index_p = ae - address_list; - *score_p = score; - best_score = score; - best_score_priority = priority; - found = 1; + score = criteria->calc_score(context, md_attr, iface_attr, + &ae->iface_attr); + priority = iface_attr->priority + ae->iface_attr.priority; + + ucs_trace(UCT_TL_RESOURCE_DESC_FMT "->addr[%u] : %s score %.2f priority %d", + UCT_TL_RESOURCE_DESC_ARG(resource), + addr_index, criteria->title, score, priority); + is_reachable = 1; + + if (!found || (ucp_score_prio_cmp(score, priority, sinfo.score, + sinfo.priority) > 0)) { + ucp_wireup_init_select_info(context, score, addr_index, + rsc_index, priority, + criteria->title, &sinfo); + found = 1; } } - /* If a local resource cannot reach any of the remote addresses, generate - * debug message. - */ - if (!reachable) { + /* If a local resource cannot reach any of the remote addresses, + * generate debug message. 
*/ + if (!is_reachable) { + ucs_trace(UCT_TL_RESOURCE_DESC_FMT" : unreachable ", + UCT_TL_RESOURCE_DESC_ARG(resource)); snprintf(p, endp - p, UCT_TL_RESOURCE_DESC_FMT" - %s, ", UCT_TL_RESOURCE_DESC_ARG(resource), ucs_status_string(UCS_ERR_UNREACHABLE)); @@ -399,7 +435,6 @@ ucp_wireup_select_transport(ucp_ep_h ep, const ucp_address_entry_t *address_list } out: - if (p >= tls_info + 2) { *(p - 2) = '\0'; /* trim last "," */ } @@ -407,17 +442,21 @@ ucp_wireup_select_transport(ucp_ep_h ep, const ucp_address_entry_t *address_list if (!found) { if (show_error) { ucs_error("no %s transport to %s: %s", criteria->title, - ucp_ep_peer_name(ep), tls_info); + select_params->address->name, tls_info); } + return UCS_ERR_UNREACHABLE; } ucs_trace("ep %p: selected for %s: " UCT_TL_RESOURCE_DESC_FMT " md[%d]" " -> '%s' address[%d],md[%d] score %.2f", ep, criteria->title, - UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[*rsc_index_p].tl_rsc), - context->tl_rscs[*rsc_index_p].md_index, - ucp_ep_peer_name(ep), *dst_addr_index_p, - address_list[*dst_addr_index_p].md_index, best_score); + UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[sinfo.rsc_index].tl_rsc), + context->tl_rscs[sinfo.rsc_index].md_index, ucp_ep_peer_name(ep), + sinfo.addr_index, + select_params->address->address_list[sinfo.addr_index].md_index, + sinfo.score); + + *select_info = sinfo; return UCS_OK; } @@ -425,34 +464,38 @@ static inline double ucp_wireup_tl_iface_latency(ucp_context_h context, const uct_iface_attr_t *iface_attr, const ucp_address_iface_attr_t *remote_iface_attr) { - return ucs_max(iface_attr->latency.overhead, remote_iface_attr->lat_ovh) + - (iface_attr->latency.growth * context->config.est_num_eps); + return ucs_max(iface_attr->latency.c, remote_iface_attr->lat_ovh) + + (iface_attr->latency.m * context->config.est_num_eps); } -static UCS_F_NOINLINE void -ucp_wireup_add_lane_desc(ucp_wireup_lane_desc_t *lane_descs, - ucp_lane_index_t *num_lanes_p, ucp_rsc_index_t rsc_index, - unsigned addr_index, 
ucp_rsc_index_t dst_md_index, - double score, uint32_t usage, int is_proxy) +static UCS_F_NOINLINE ucs_status_t +ucp_wireup_add_lane_desc(const ucp_wireup_select_info_t *select_info, + ucp_md_index_t dst_md_index, + ucp_lane_type_t lane_type, int is_proxy, + ucp_wireup_select_context_t *select_ctx) { ucp_wireup_lane_desc_t *lane_desc; ucp_lane_index_t lane, proxy_lane; + ucp_lane_type_t lane_type_iter; int proxy_changed; /* Add a new lane, but try to reuse already added lanes which are selected * on the same transport resources. */ proxy_changed = 0; - for (lane_desc = lane_descs; lane_desc < lane_descs + (*num_lanes_p); ++lane_desc) { - if ((lane_desc->rsc_index == rsc_index) && - (lane_desc->addr_index == addr_index)) + for (lane_desc = select_ctx->lane_descs; + lane_desc < select_ctx->lane_descs + select_ctx->num_lanes; ++lane_desc) { + if ((lane_desc->rsc_index == select_info->rsc_index) && + (lane_desc->addr_index == select_info->addr_index) && + (lane_desc->path_index == select_info->path_index)) { - lane = lane_desc - lane_descs; + lane = lane_desc - select_ctx->lane_descs; ucs_assertv_always(dst_md_index == lane_desc->dst_md_index, "lane[%d].dst_md_index=%d, dst_md_index=%d", lane, lane_desc->dst_md_index, dst_md_index); - ucs_assertv_always(!(lane_desc->usage & usage), "lane[%d]=0x%x |= 0x%x", - lane, lane_desc->usage, usage); + ucs_assertv_always(!(lane_desc->lane_types & UCS_BIT(lane_type)), + "lane[%d]=0x%x |= 0x%x", lane, lane_desc->lane_types, + lane_type); if (is_proxy && (lane_desc->proxy_lane == UCP_NULL_LANE)) { /* New lane is a proxy, and found existing non-proxy lane with * same resource. So that lane should be used by the proxy. @@ -464,198 +507,234 @@ ucp_wireup_add_lane_desc(ucp_wireup_lane_desc_t *lane_descs, * could use the new lane. It also means we should be able to * add our new lane. 
*/ - lane_desc->proxy_lane = *num_lanes_p; + lane_desc->proxy_lane = select_ctx->num_lanes; proxy_changed = 1; } else if (!is_proxy && (lane_desc->proxy_lane == UCP_NULL_LANE)) { /* Found non-proxy lane with same resource - don't add */ ucs_assert_always(!proxy_changed); - lane_desc->usage |= usage; + lane_desc->lane_types |= UCS_BIT(lane_type); goto out_update_score; } } } /* If a proxy cannot find other lane with same resource, proxy to self */ - proxy_lane = is_proxy ? (*num_lanes_p) : UCP_NULL_LANE; + proxy_lane = is_proxy ? select_ctx->num_lanes : UCP_NULL_LANE; out_add_lane: - lane_desc = &lane_descs[*num_lanes_p]; - ++(*num_lanes_p); + if (select_ctx->num_lanes >= UCP_MAX_LANES) { + ucs_error("cannot add %s lane - reached limit (%d)", + ucp_lane_type_info[lane_type].short_name, + select_ctx->num_lanes); + return UCS_ERR_EXCEEDS_LIMIT; + } + + lane_desc = &select_ctx->lane_descs[select_ctx->num_lanes]; + ++select_ctx->num_lanes; - lane_desc->rsc_index = rsc_index; - lane_desc->addr_index = addr_index; + lane_desc->rsc_index = select_info->rsc_index; + lane_desc->addr_index = select_info->addr_index; + lane_desc->path_index = select_info->path_index; lane_desc->proxy_lane = proxy_lane; lane_desc->dst_md_index = dst_md_index; - lane_desc->usage = usage; - lane_desc->am_bw_score = 0.0; - lane_desc->rma_score = 0.0; - lane_desc->rma_bw_score = 0.0; - lane_desc->amo_score = 0.0; + lane_desc->lane_types = UCS_BIT(lane_type); + for (lane_type_iter = 0; lane_type_iter < UCP_LANE_TYPE_LAST; + ++lane_type_iter) { + lane_desc->score[lane_type_iter] = 0.0; + } out_update_score: - if (usage & UCP_WIREUP_LANE_USAGE_AM_BW) { - lane_desc->am_bw_score = score; - } - if (usage & UCP_WIREUP_LANE_USAGE_RMA) { - lane_desc->rma_score = score; - } - if (usage & UCP_WIREUP_LANE_USAGE_RMA_BW) { - lane_desc->rma_bw_score = score; - } - if (usage & UCP_WIREUP_LANE_USAGE_AMO) { - lane_desc->amo_score = score; + lane_desc->score[lane_type] = select_info->score; + return UCS_OK; +} + 
+static int ucp_wireup_is_lane_proxy(ucp_worker_h worker, + ucp_rsc_index_t rsc_index, + uint64_t remote_event_flags) +{ + return ucp_worker_is_tl_2iface(worker, rsc_index) && + ((remote_event_flags & UCP_WORKER_UCT_RECV_EVENT_CAP_FLAGS) == + UCT_IFACE_FLAG_EVENT_RECV_SIG); +} + +static UCS_F_NOINLINE ucs_status_t +ucp_wireup_add_lane(const ucp_wireup_select_params_t *select_params, + const ucp_wireup_select_info_t *select_info, + ucp_lane_type_t lane_type, + ucp_wireup_select_context_t *select_ctx) +{ + int is_proxy = 0; + ucp_md_index_t dst_md_index; + uint64_t remote_event_flags; + + if ((lane_type == UCP_LANE_TYPE_AM) || (lane_type == UCP_LANE_TYPE_AM_BW) || + (lane_type == UCP_LANE_TYPE_TAG)) { + /* If the remote side is not p2p and has only signaled-am wakeup, it may + * deactivate its interface and wait for signaled active message to wake up. + * Use a proxy lane which would send the first active message as signaled to + * make sure the remote interface will indeed wake up. */ + remote_event_flags = select_params->address->address_list + [select_info->addr_index].iface_attr.event_flags; + is_proxy = ucp_wireup_is_lane_proxy(select_params->ep->worker, + select_info->rsc_index, + remote_event_flags); } + + dst_md_index = select_params->address->address_list + [select_info->addr_index].md_index; + return ucp_wireup_add_lane_desc(select_info, dst_md_index, lane_type, + is_proxy, select_ctx); } -#define UCP_WIREUP_COMPARE_SCORE(_elem1, _elem2, _arg, _token) \ - ({ \ - const ucp_lane_index_t *lane1 = (_elem1); \ - const ucp_lane_index_t *lane2 = (_elem2); \ - const ucp_wireup_lane_desc_t *lanes = (_arg); \ - double score1, score2; \ - \ - score1 = (*lane1 == UCP_NULL_LANE) ? 0.0 : lanes[*lane1]._token##_score; \ - score2 = (*lane2 == UCP_NULL_LANE) ? 0.0 : lanes[*lane2]._token##_score; \ - /* sort from highest score to lowest */ \ - (score1 < score2) ? 1 : ((score1 > score2) ? 
-1 : 0); \ - }) +static int ucp_wireup_compare_score(const void *elem1, const void *elem2, + void *arg, ucp_lane_type_t lane_type) +{ + const ucp_lane_index_t *lane1 = elem1; + const ucp_lane_index_t *lane2 = elem2; + const ucp_wireup_lane_desc_t *lanes = arg; + double score1, score2; + + score1 = (*lane1 == UCP_NULL_LANE) ? 0.0 : lanes[*lane1].score[lane_type]; + score2 = (*lane2 == UCP_NULL_LANE) ? 0.0 : lanes[*lane2].score[lane_type]; + + /* sort from highest score to lowest */ + return (score1 < score2) ? 1 : ((score1 > score2) ? -1 : 0); +} static int ucp_wireup_compare_lane_am_bw_score(const void *elem1, const void *elem2, void *arg) { - return UCP_WIREUP_COMPARE_SCORE(elem1, elem2, arg, am_bw); + return ucp_wireup_compare_score(elem1, elem2, arg, UCP_LANE_TYPE_AM_BW); } static int ucp_wireup_compare_lane_rma_score(const void *elem1, const void *elem2, void *arg) { - return UCP_WIREUP_COMPARE_SCORE(elem1, elem2, arg, rma); + return ucp_wireup_compare_score(elem1, elem2, arg, UCP_LANE_TYPE_RMA); } static int ucp_wireup_compare_lane_rma_bw_score(const void *elem1, const void *elem2, void *arg) { - return UCP_WIREUP_COMPARE_SCORE(elem1, elem2, arg, rma_bw); + return ucp_wireup_compare_score(elem1, elem2, arg, UCP_LANE_TYPE_RMA_BW); } static int ucp_wireup_compare_lane_amo_score(const void *elem1, const void *elem2, void *arg) { - return UCP_WIREUP_COMPARE_SCORE(elem1, elem2, arg, amo); + return ucp_wireup_compare_score(elem1, elem2, arg, UCP_LANE_TYPE_AMO); } -static uint64_t ucp_wireup_unset_tl_by_md(ucp_ep_h ep, uint64_t tl_bitmap, - ucp_rsc_index_t rsc_index) +static void +ucp_wireup_unset_tl_by_md(const ucp_wireup_select_params_t *sparams, + const ucp_wireup_select_info_t *sinfo, + uint64_t *tl_bitmap, uint64_t *remote_md_map) { - ucp_context_h context = ep->worker->context; - ucp_rsc_index_t md_index = context->tl_rscs[rsc_index].md_index; + ucp_context_h context = sparams->ep->worker->context; + const ucp_address_entry_t *ae = &sparams->address-> + 
address_list[sinfo->addr_index]; + ucp_md_index_t md_index = context->tl_rscs[sinfo->rsc_index].md_index; + ucp_md_index_t dst_md_index = ae->md_index; ucp_rsc_index_t i; + *remote_md_map &= ~UCS_BIT(dst_md_index); + ucs_for_each_bit(i, context->tl_bitmap) { if (context->tl_rscs[i].md_index == md_index) { - tl_bitmap &= ~UCS_BIT(i); + *tl_bitmap &= ~UCS_BIT(i); } } - - return tl_bitmap; } static UCS_F_NOINLINE ucs_status_t -ucp_wireup_add_memaccess_lanes(ucp_ep_h ep, unsigned address_count, - const ucp_address_entry_t *address_list, - ucp_wireup_lane_desc_t *lane_descs, - ucp_lane_index_t *num_lanes_p, +ucp_wireup_add_memaccess_lanes(const ucp_wireup_select_params_t *select_params, const ucp_wireup_criteria_t *criteria, - uint64_t tl_bitmap, uint32_t usage, - int select_best, int show_error) + uint64_t tl_bitmap, ucp_lane_type_t lane_type, + ucp_wireup_select_context_t *select_ctx) { - ucp_wireup_criteria_t mem_criteria = *criteria; - ucp_address_entry_t *address_list_copy; - ucp_rsc_index_t rsc_index, dst_md_index; - size_t address_list_size; - double score, reg_score; + ucp_wireup_criteria_t mem_criteria = *criteria; + ucp_wireup_select_info_t select_info = {0}; + int show_error = !select_params->allow_am; + double reg_score; uint64_t remote_md_map; - unsigned addr_index; ucs_status_t status; char title[64]; - remote_md_map = -1; - - /* Create a copy of the address list */ - address_list_size = sizeof(*address_list_copy) * address_count; - address_list_copy = ucs_malloc(address_list_size, "rma address list"); - if (address_list_copy == NULL) { - status = UCS_ERR_NO_MEMORY; - goto out; - } - - memcpy(address_list_copy, address_list, address_list_size); + remote_md_map = UINT64_MAX; /* Select best transport which can reach registered memory */ snprintf(title, sizeof(title), criteria->title, "registered"); mem_criteria.title = title; mem_criteria.remote_md_flags = UCT_MD_FLAG_REG | criteria->remote_md_flags; - status = ucp_wireup_select_transport(ep, 
address_list_copy, address_count, - &mem_criteria, tl_bitmap, remote_md_map, - -1, -1, show_error, - &rsc_index, &addr_index, &score); + status = ucp_wireup_select_transport(select_params, &mem_criteria, + tl_bitmap, remote_md_map, + UINT64_MAX, UINT64_MAX, + show_error, &select_info); if (status != UCS_OK) { - goto out_free_address_list; + goto out; } - dst_md_index = address_list_copy[addr_index].md_index; - reg_score = score; + reg_score = select_info.score; /* Add to the list of lanes and remove all occurrences of the remote md - * from the address list, to avoid selecting the same remote md again.*/ - ucp_wireup_add_lane_desc(lane_descs, num_lanes_p, rsc_index, addr_index, - dst_md_index, score, usage, 0); - remote_md_map &= ~UCS_BIT(dst_md_index); - tl_bitmap = ucp_wireup_unset_tl_by_md(ep, tl_bitmap, rsc_index); - - /* Select additional transports which can access allocated memory, but only - * if their scores are better. We need this because a remote memory block can - * be potentially allocated using one of them, and we might get better performance - * than the transports which support only registered remote memory. - */ - if (select_best) { - snprintf(title, sizeof(title), criteria->title, "allocated"); - mem_criteria.title = title; - mem_criteria.remote_md_flags = UCT_MD_FLAG_ALLOC | criteria->remote_md_flags; - } else if (ep->worker->context->tl_rscs[rsc_index].tl_rsc.dev_type == UCT_DEVICE_TYPE_SHM) { - /* special case for SHM: do not try to lookup additional lanes when - * SHM transport detected (another transport will be significantly - * slower) */ - goto out_free_address_list; + * from the address list, to avoid selecting the same remote md again. 
*/ + status = ucp_wireup_add_lane(select_params, &select_info, lane_type, + select_ctx); + if (status != UCS_OK) { + goto out; } - while (address_count > 0) { - status = ucp_wireup_select_transport(ep, address_list_copy, address_count, - &mem_criteria, tl_bitmap, remote_md_map, - -1, -1, 0, &rsc_index, - &addr_index, &score); + ucp_wireup_unset_tl_by_md(select_params, &select_info, &tl_bitmap, + &remote_md_map); + + /* Select additional transports which can access allocated memory, but + * only if their scores are better. We need this because a remote memory + * block can be potentially allocated using one of them, and we might get + * better performance than the transports which support only registered + * remote memory. */ + snprintf(title, sizeof(title), criteria->title, "allocated"); + mem_criteria.title = title; + mem_criteria.remote_md_flags = UCT_MD_FLAG_ALLOC | + criteria->remote_md_flags; + + for (;;) { + status = ucp_wireup_select_transport(select_params, &mem_criteria, + tl_bitmap, remote_md_map, + UINT64_MAX, UINT64_MAX, 0, + &select_info); + /* Break if: */ + /* - transport selection wasn't OK */ if ((status != UCS_OK) || - (select_best && (score <= reg_score))) { + /* - the selected transport is worse than + * the transport selected above */ + (ucp_score_cmp(select_info.score, reg_score) <= 0)) { break; } - /* Add lane description and remove all occurrences of the remote md */ - dst_md_index = address_list_copy[addr_index].md_index; - ucp_wireup_add_lane_desc(lane_descs, num_lanes_p, rsc_index, addr_index, - dst_md_index, score, usage, 0); - remote_md_map &= ~UCS_BIT(dst_md_index); - tl_bitmap = ucp_wireup_unset_tl_by_md(ep, tl_bitmap, rsc_index); + /* Add lane description and remove all occurrences of the remote md. 
*/ + status = ucp_wireup_add_lane(select_params, &select_info, lane_type, + select_ctx); + if (status != UCS_OK) { + goto out; + } + + ucp_wireup_unset_tl_by_md(select_params, &select_info, &tl_bitmap, + &remote_md_map); } status = UCS_OK; -out_free_address_list: - ucs_free(address_list_copy); out: - return select_best ? status : UCS_OK; + if ((status != UCS_OK) && select_params->allow_am) { + /* using emulation over active messages */ + select_ctx->ucp_ep_init_flags |= UCP_EP_INIT_CREATE_AM_LANE; + status = UCS_OK; + } + + return status; } -static uint64_t ucp_ep_get_context_features(ucp_ep_h ep) +static uint64_t ucp_ep_get_context_features(const ucp_ep_h ep) { return ep->worker->context->config.features; } @@ -668,25 +747,20 @@ static double ucp_wireup_rma_score_func(ucp_context_h context, /* best for 4k messages */ return 1e-3 / (ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr) + iface_attr->overhead + - (4096.0 / ucs_min(iface_attr->bandwidth, remote_iface_attr->bandwidth))); -} - -static int ucp_wireup_ep_params_is_err_mode_peer(const ucp_ep_params_t *params) -{ - return (params->field_mask & UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE) && - (params->err_mode == UCP_ERR_HANDLING_MODE_PEER); + (4096.0 / ucs_min(ucp_tl_iface_bandwidth(context, &iface_attr->bandwidth), + ucp_tl_iface_bandwidth(context, &remote_iface_attr->bandwidth)))); } -static void ucp_wireup_fill_ep_params_criteria(ucp_wireup_criteria_t *criteria, - const ucp_ep_params_t *params) +static void ucp_wireup_fill_peer_err_criteria(ucp_wireup_criteria_t *criteria, + unsigned ep_init_flags) { - if (ucp_wireup_ep_params_is_err_mode_peer(params)) { + if (ep_init_flags & UCP_EP_INIT_ERR_MODE_PEER_FAILURE) { criteria->local_iface_flags |= UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE; } } static void ucp_wireup_fill_aux_criteria(ucp_wireup_criteria_t *criteria, - const ucp_ep_params_t *params) + unsigned ep_init_flags) { criteria->title = "auxiliary"; criteria->local_md_flags = 0; @@ -697,10 +771,12 
@@ static void ucp_wireup_fill_aux_criteria(ucp_wireup_criteria_t *criteria, criteria->remote_iface_flags = UCT_IFACE_FLAG_CONNECT_TO_IFACE | UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_CB_ASYNC; + criteria->local_event_flags = 0; + criteria->remote_event_flags = 0; criteria->calc_score = ucp_wireup_aux_score_func; criteria->tl_rsc_flags = UCP_TL_RSC_FLAG_AUX; /* Can use aux transports */ - ucp_wireup_fill_ep_params_criteria(criteria, params); + ucp_wireup_fill_peer_err_criteria(criteria, ep_init_flags); } static void ucp_wireup_clean_amo_criteria(ucp_wireup_criteria_t *criteria) @@ -711,27 +787,65 @@ static void ucp_wireup_clean_amo_criteria(ucp_wireup_criteria_t *criteria) sizeof(criteria->local_atomic_flags)); } -static int ucp_wireup_allow_am_emulation_layer(const ucp_ep_params_t *params, - unsigned ep_init_flags) +/** + * Check whether emulation over AM is allowed for RMA/AMO lanes + */ +static int ucp_wireup_allow_am_emulation_layer(unsigned ep_init_flags) { - return !(ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE) && - /* disable emulation layer if err handling is required due to lack of - * keep alive protocol */ - !ucp_wireup_ep_params_is_err_mode_peer(params); + if (ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE) { + return 0; + } + + /* disable emulation layer if err handling is required due to lack of + * keep alive protocol, unless we have CM which handles disconnect + */ + if ((ep_init_flags & UCP_EP_INIT_ERR_MODE_PEER_FAILURE) && + !ucp_ep_init_flags_has_cm(ep_init_flags)) { + return 0; + } + + return 1; +} + +static unsigned +ucp_wireup_ep_init_flags(const ucp_wireup_select_params_t *select_params, + const ucp_wireup_select_context_t *select_ctx) +{ + return select_params->ep_init_flags | select_ctx->ucp_ep_init_flags; +} + +static ucs_status_t +ucp_wireup_add_cm_lane(const ucp_wireup_select_params_t *select_params, + ucp_wireup_select_context_t *select_ctx) +{ + ucp_wireup_select_info_t select_info; + + if 
(!ucp_ep_init_flags_has_cm(select_params->ep_init_flags)) { + return UCS_OK; + } + + select_info.priority = 0; /**< Currently we have only 1 CM + implementation */ + select_info.rsc_index = UCP_NULL_RESOURCE; /**< RSC doesn't matter for CM */ + select_info.addr_index = 0; /**< This makes sense only for transport + lanes */ + select_info.score = 0.; /**< TODO: when we have > 1 CM implementation */ + select_info.path_index = 0; /**< Only one lane per CM device */ + + /* server is not a proxy because it can create all lanes connected */ + return ucp_wireup_add_lane_desc(&select_info, select_info.rsc_index, + UCP_LANE_TYPE_CM, 0, select_ctx); } -static ucs_status_t ucp_wireup_add_rma_lanes(ucp_ep_h ep, const ucp_ep_params_t *params, - unsigned ep_init_flags, unsigned address_count, - const ucp_address_entry_t *address_list, - ucp_wireup_lane_desc_t *lane_descs, - ucp_lane_index_t *num_lanes_p, - int *need_am) +static ucs_status_t +ucp_wireup_add_rma_lanes(const ucp_wireup_select_params_t *select_params, + ucp_wireup_select_context_t *select_ctx) { ucp_wireup_criteria_t criteria = {0}; - ucs_status_t status; - int allow_am; + unsigned ep_init_flags = ucp_wireup_ep_init_flags(select_params, + select_ctx); - if (!(ucp_ep_get_context_features(ep) & UCP_FEATURE_RMA) && + if (!(ucp_ep_get_context_features(select_params->ep) & UCP_FEATURE_RMA) && !(ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE)) { return UCS_OK; } @@ -748,23 +862,14 @@ static ucs_status_t ucp_wireup_add_rma_lanes(ucp_ep_h ep, const ucp_ep_params_t criteria.local_iface_flags = criteria.remote_iface_flags | UCT_IFACE_FLAG_PENDING; } - criteria.calc_score = ucp_wireup_rma_score_func; - criteria.tl_rsc_flags = 0; - ucp_wireup_fill_ep_params_criteria(&criteria, params); - - allow_am = ucp_wireup_allow_am_emulation_layer(params, ep_init_flags); - status = ucp_wireup_add_memaccess_lanes(ep, address_count, address_list, - lane_descs, num_lanes_p, &criteria, - -1, UCP_WIREUP_LANE_USAGE_RMA, 1, - !allow_am); - if (status 
== UCS_OK) { - return status; /* using transport RMA operations */ - } else if (allow_am) { - *need_am = 1; /* using emulation over active messages */ - return UCS_OK; - } else { - return status; - } + criteria.remote_event_flags = 0; + criteria.local_event_flags = 0; + criteria.calc_score = ucp_wireup_rma_score_func; + criteria.tl_rsc_flags = 0; + ucp_wireup_fill_peer_err_criteria(&criteria, ep_init_flags); + + return ucp_wireup_add_memaccess_lanes(select_params, &criteria, UINT64_MAX, + UCP_LANE_TYPE_RMA, select_ctx); } double ucp_wireup_amo_score_func(ucp_context_h context, @@ -777,23 +882,20 @@ double ucp_wireup_amo_score_func(ucp_context_h context, iface_attr->overhead); } -static ucs_status_t ucp_wireup_add_amo_lanes(ucp_ep_h ep, const ucp_ep_params_t *params, - unsigned ep_init_flags, - unsigned address_count, - const ucp_address_entry_t *address_list, - ucp_wireup_lane_desc_t *lane_descs, - ucp_lane_index_t *num_lanes_p, - int *need_am) +static ucs_status_t +ucp_wireup_add_amo_lanes(const ucp_wireup_select_params_t *select_params, + ucp_wireup_select_context_t *select_ctx) { - ucp_worker_h worker = ep->worker; + ucp_worker_h worker = select_params->ep->worker; ucp_context_h context = worker->context; ucp_wireup_criteria_t criteria = {0}; + unsigned ep_init_flags = ucp_wireup_ep_init_flags(select_params, + select_ctx); ucp_rsc_index_t rsc_index; - ucs_status_t status; uint64_t tl_bitmap; - int allow_am; - if (!ucs_test_flags(context->config.features, UCP_FEATURE_AMO32, UCP_FEATURE_AMO64) || + if (!ucs_test_flags(context->config.features, + UCP_FEATURE_AMO32, UCP_FEATURE_AMO64) || (ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE)) { return UCS_OK; } @@ -801,36 +903,27 @@ static ucs_status_t ucp_wireup_add_amo_lanes(ucp_ep_h ep, const ucp_ep_params_t ucp_context_uct_atomic_iface_flags(context, &criteria.remote_atomic_flags); criteria.title = "atomic operations on %s memory"; - criteria.local_iface_flags = criteria.remote_iface_flags | - UCT_IFACE_FLAG_PENDING; + 
criteria.local_iface_flags = UCT_IFACE_FLAG_PENDING; + criteria.remote_iface_flags = 0; + criteria.local_event_flags = 0; + criteria.remote_event_flags = 0; criteria.local_atomic_flags = criteria.remote_atomic_flags; criteria.calc_score = ucp_wireup_amo_score_func; - ucp_wireup_fill_ep_params_criteria(&criteria, params); + ucp_wireup_fill_peer_err_criteria(&criteria, ep_init_flags); /* We can use only non-p2p resources or resources which are explicitly * selected for atomics. Otherwise, the remote peer would not be able to * connect back on p2p transport. */ tl_bitmap = worker->atomic_tls; - for (rsc_index = 0; rsc_index < context->num_tls; ++rsc_index) { - if (!ucp_worker_is_tl_p2p(worker, rsc_index)) { + ucs_for_each_bit(rsc_index, context->tl_bitmap) { + if (ucp_worker_is_tl_2iface(worker, rsc_index)) { tl_bitmap |= UCS_BIT(rsc_index); } } - allow_am = ucp_wireup_allow_am_emulation_layer(params, ep_init_flags); - status = ucp_wireup_add_memaccess_lanes(ep, address_count, address_list, - lane_descs, num_lanes_p, &criteria, - tl_bitmap, UCP_WIREUP_LANE_USAGE_AMO, - 1, !allow_am); - if (status == UCS_OK) { - return status; /* using transport AMO operations */ - } else if (allow_am) { - *need_am = 1; /* using emulation over active messages */ - return UCS_OK; - } else { - return status; - } + return ucp_wireup_add_memaccess_lanes(select_params, &criteria, tl_bitmap, + UCP_LANE_TYPE_AMO, select_ctx); } static double ucp_wireup_am_score_func(ucp_context_h context, @@ -852,43 +945,40 @@ static double ucp_wireup_rma_bw_score_func(ucp_context_h context, * a size which is likely to be used for high-bw memory access protocol, for * how long it would take to transfer it with a certain transport. 
*/ return 1 / ((UCP_WIREUP_RMA_BW_TEST_MSG_SIZE / - ucs_min(iface_attr->bandwidth, remote_iface_attr->bandwidth)) + + ucs_min(ucp_tl_iface_bandwidth(context, &iface_attr->bandwidth), + ucp_tl_iface_bandwidth(context, &remote_iface_attr->bandwidth))) + ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr) + - iface_attr->overhead + md_attr->reg_cost.overhead + - (UCP_WIREUP_RMA_BW_TEST_MSG_SIZE * md_attr->reg_cost.growth)); -} - -static int ucp_wireup_is_lane_proxy(ucp_ep_h ep, ucp_rsc_index_t rsc_index, - uint64_t remote_cap_flags) -{ - return !ucp_worker_is_tl_p2p(ep->worker, rsc_index) && - ((remote_cap_flags & UCP_WORKER_UCT_RECV_EVENT_CAP_FLAGS) == - UCT_IFACE_FLAG_EVENT_RECV_SIG); + iface_attr->overhead + + ucs_linear_func_apply(md_attr->reg_cost, + UCP_WIREUP_RMA_BW_TEST_MSG_SIZE)); } -static inline int ucp_wireup_is_am_required(ucp_ep_h ep, - const ucp_ep_params_t *params, - unsigned ep_init_flags, - ucp_wireup_lane_desc_t *lane_descs, - int num_lanes_p) +static inline int +ucp_wireup_is_am_required(const ucp_wireup_select_params_t *select_params, + const ucp_wireup_select_context_t *select_ctx) { + ucp_ep_h ep = select_params->ep; + unsigned ep_init_flags = ucp_wireup_ep_init_flags(select_params, + select_ctx); ucp_lane_index_t lane; /* Check if we need active messages from the configurations, for wireup. 
* If not, check if am is required due to p2p transports */ - if ((ep_init_flags & UCP_EP_CREATE_AM_LANE) || - (params->field_mask & UCP_EP_PARAM_FIELD_SOCK_ADDR)) { + if (ep_init_flags & UCP_EP_INIT_CREATE_AM_LANE) { return 1; } if (!(ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE) && - (ucp_ep_get_context_features(ep) & (UCP_FEATURE_TAG | UCP_FEATURE_STREAM))) { + (ucp_ep_get_context_features(ep) & (UCP_FEATURE_TAG | + UCP_FEATURE_STREAM | + UCP_FEATURE_AM))) { return 1; } - for (lane = 0; lane < num_lanes_p; ++lane) { - if (ucp_worker_is_tl_p2p(ep->worker, lane_descs[lane].rsc_index)) { + for (lane = 0; lane < select_ctx->num_lanes; ++lane) { + if (!ucp_worker_is_tl_2iface(ep->worker, + select_ctx->lane_descs[lane].rsc_index)) { return 1; } } @@ -896,58 +986,60 @@ static inline int ucp_wireup_is_am_required(ucp_ep_h ep, return 0; } -static ucs_status_t ucp_wireup_add_am_lane(ucp_ep_h ep, const ucp_ep_params_t *params, - unsigned ep_init_flags, unsigned address_count, - const ucp_address_entry_t *address_list, - ucp_wireup_lane_desc_t *lane_descs, - ucp_lane_index_t *num_lanes_p, - double *am_score, - ucp_err_handling_mode_t err_mode) +static ucs_status_t +ucp_wireup_add_am_lane(const ucp_wireup_select_params_t *select_params, + ucp_wireup_select_info_t *am_info, + ucp_wireup_select_context_t *select_ctx) { + ucp_worker_h worker = select_params->ep->worker; + uint64_t tl_bitmap = select_params->tl_bitmap; ucp_wireup_criteria_t criteria = {0}; - ucp_rsc_index_t rsc_index; + const uct_iface_attr_t *iface_attr; ucs_status_t status; - unsigned addr_index; - int is_proxy; - if (!ucp_wireup_is_am_required(ep, params, ep_init_flags, lane_descs, - *num_lanes_p)) { + if (!ucp_wireup_is_am_required(select_params, select_ctx)) { + memset(am_info, 0, sizeof(*am_info)); return UCS_OK; } /* Select one lane for active messages */ - criteria.title = "active messages"; - criteria.remote_iface_flags = UCT_IFACE_FLAG_AM_BCOPY | - UCT_IFACE_FLAG_CB_SYNC; - criteria.local_iface_flags = 
UCT_IFACE_FLAG_AM_BCOPY; - criteria.calc_score = ucp_wireup_am_score_func; - ucp_wireup_fill_ep_params_criteria(&criteria, params); - - if (ucs_test_all_flags(ucp_ep_get_context_features(ep), UCP_FEATURE_TAG | - UCP_FEATURE_WAKEUP)) { - criteria.local_iface_flags |= UCP_WORKER_UCT_UNSIG_EVENT_CAP_FLAGS; - } - - status = ucp_wireup_select_transport(ep, address_list, address_count, &criteria, - -1, -1, -1, -1, 1, &rsc_index, &addr_index, - am_score); - if (status != UCS_OK) { - return status; - } + for (;;) { + criteria.title = "active messages"; + criteria.remote_iface_flags = UCT_IFACE_FLAG_AM_BCOPY | + UCT_IFACE_FLAG_CB_SYNC; + criteria.local_iface_flags = UCT_IFACE_FLAG_AM_BCOPY; + criteria.remote_event_flags = 0; + criteria.local_event_flags = 0; + criteria.calc_score = ucp_wireup_am_score_func; + ucp_wireup_fill_peer_err_criteria(&criteria, + ucp_wireup_ep_init_flags(select_params, + select_ctx)); + + if (ucs_test_all_flags(ucp_ep_get_context_features(select_params->ep), + UCP_FEATURE_TAG | UCP_FEATURE_WAKEUP)) { + criteria.local_event_flags = UCP_WORKER_UCT_UNSIG_EVENT_CAP_FLAGS; + } - /* If the remote side is not p2p and has only signaled-am wakeup, it may - * deactivate its interface and wait for signaled active message to wake up. - * Use a proxy lane which would send the first active message as signaled to - * make sure the remote interface will indeed wake up. 
- */ - is_proxy = ucp_wireup_is_lane_proxy(ep, rsc_index, - address_list[addr_index].iface_attr.cap_flags); + status = ucp_wireup_select_transport(select_params, &criteria, tl_bitmap, + UINT64_MAX, UINT64_MAX, UINT64_MAX, + 1, am_info); + if (status != UCS_OK) { + return status; + } - ucp_wireup_add_lane_desc(lane_descs, num_lanes_p, rsc_index, addr_index, - address_list[addr_index].md_index, *am_score, - UCP_WIREUP_LANE_USAGE_AM, is_proxy); + /* If max_bcopy is too small, try again */ + iface_attr = ucp_worker_iface_get_attr(worker, am_info->rsc_index); + if (iface_attr->cap.am.max_bcopy < UCP_MIN_BCOPY) { + ucs_debug("ep %p: rsc_index[%d] am.max_bcopy is too small: %zu, " + "expected: >= %d", select_params->ep, am_info->rsc_index, + iface_attr->cap.am.max_bcopy, UCP_MIN_BCOPY); + tl_bitmap &= ~UCS_BIT(am_info->rsc_index); + continue; + } - return UCS_OK; + return ucp_wireup_add_lane(select_params, am_info, UCP_LANE_TYPE_AM, + select_ctx); + } } static double ucp_wireup_am_bw_score_func(ucp_context_h context, @@ -957,98 +1049,114 @@ static double ucp_wireup_am_bw_score_func(ucp_context_h context, { /* best single MTU bandwidth */ double size = iface_attr->cap.am.max_bcopy; - double time = (size / ucs_min(iface_attr->bandwidth, - remote_iface_attr->bandwidth)) + + double t = (size / ucs_min(ucp_tl_iface_bandwidth(context, &iface_attr->bandwidth), + ucp_tl_iface_bandwidth(context, &remote_iface_attr->bandwidth))) + iface_attr->overhead + remote_iface_attr->overhead + ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr); - return size / time * 1e-5; + return size / t * 1e-5; } -static int ucp_wireup_is_ep_single_lane(ucp_ep_h ep, ucp_rsc_index_t rsc_index) +static unsigned +ucp_wireup_add_bw_lanes(const ucp_wireup_select_params_t *select_params, + const ucp_wireup_select_bw_info_t *bw_info, + uint64_t tl_bitmap, ucp_lane_index_t excl_lane, + ucp_wireup_select_context_t *select_ctx) { - return (ep->worker->context->tl_rscs[rsc_index].tl_rsc.dev_type == 
UCT_DEVICE_TYPE_SHM) || - (ep->worker->context->tl_rscs[rsc_index].tl_rsc.dev_type == UCT_DEVICE_TYPE_SELF); -} - -static ucs_status_t ucp_wireup_add_bw_lanes(ucp_ep_h ep, - unsigned address_count, - const ucp_address_entry_t *address_list, - const ucp_wireup_select_bw_info_t *bw_info, - int allow_proxy, uint64_t tl_bitmap, - ucp_wireup_lane_desc_t *lane_descs, - ucp_lane_index_t *num_lanes_p) -{ - ucp_context_h context = ep->worker->context; + ucp_ep_h ep = select_params->ep; + ucp_context_h context = ep->worker->context; + ucp_wireup_select_info_t sinfo = {0}; + unsigned local_dev_count[UCP_MAX_RESOURCES] = {0}; + unsigned remote_dev_count[UCP_MAX_RESOURCES] = {0}; + const uct_iface_attr_t *iface_attr; + const ucp_address_entry_t *ae; ucs_status_t status; - int num_lanes; + unsigned num_lanes; uint64_t local_dev_bitmap; uint64_t remote_dev_bitmap; + ucp_rsc_index_t dev_index; ucp_md_map_t md_map; ucp_rsc_index_t rsc_index; unsigned addr_index; - double score; - int is_proxy; - status = UCS_ERR_UNREACHABLE; - num_lanes = 0; - md_map = bw_info->md_map; - local_dev_bitmap = bw_info->local_dev_bitmap; - remote_dev_bitmap = bw_info->remote_dev_bitmap; + num_lanes = 0; + md_map = bw_info->md_map; + local_dev_bitmap = bw_info->local_dev_bitmap; + remote_dev_bitmap = bw_info->remote_dev_bitmap; /* lookup for requested number of lanes or limit of MD map * (we have to limit MD's number to avoid malloc in * memory registration) */ while ((num_lanes < bw_info->max_lanes) && (ucs_popcount(md_map) < UCP_MAX_OP_MDS)) { - status = ucp_wireup_select_transport(ep, address_list, address_count, - &bw_info->criteria, tl_bitmap, -1, - local_dev_bitmap, remote_dev_bitmap, - 0, &rsc_index, &addr_index, &score); - if (status != UCS_OK) { - break; - } + if (excl_lane == UCP_NULL_LANE) { + status = ucp_wireup_select_transport(select_params, &bw_info->criteria, + tl_bitmap, UINT64_MAX, + local_dev_bitmap, remote_dev_bitmap, + 0, &sinfo); + if (status != UCS_OK) { + break; + } - is_proxy = 
allow_proxy && - ucp_wireup_is_lane_proxy(ep, rsc_index, - address_list[addr_index].iface_attr.cap_flags); + rsc_index = sinfo.rsc_index; + addr_index = sinfo.addr_index; + dev_index = context->tl_rscs[rsc_index].dev_index; + sinfo.path_index = local_dev_count[dev_index]; + status = ucp_wireup_add_lane(select_params, &sinfo, + bw_info->lane_type, select_ctx); + if (status != UCS_OK) { + break; + } - ucp_wireup_add_lane_desc(lane_descs, num_lanes_p, rsc_index, addr_index, - address_list[addr_index].md_index, score, - bw_info->usage, is_proxy); - md_map |= UCS_BIT(context->tl_rscs[rsc_index].md_index); - num_lanes++; + num_lanes++; + } else { + /* disqualify/count lane_desc_idx */ + addr_index = select_ctx->lane_descs[excl_lane].addr_index; + rsc_index = select_ctx->lane_descs[excl_lane].rsc_index; + dev_index = context->tl_rscs[rsc_index].dev_index; + excl_lane = UCP_NULL_LANE; + } - local_dev_bitmap &= ~UCS_BIT(context->tl_rscs[rsc_index].dev_index); - remote_dev_bitmap &= ~UCS_BIT(address_list[addr_index].dev_index); + /* Count how many times the LOCAL device is used */ + iface_attr = ucp_worker_iface_get_attr(ep->worker, rsc_index); + ++local_dev_count[dev_index]; + if (local_dev_count[dev_index] >= iface_attr->dev_num_paths) { + /* exclude local device if reached max concurrency level */ + local_dev_bitmap &= ~UCS_BIT(dev_index); + } - if (ucp_wireup_is_ep_single_lane(ep, rsc_index)) { - /* special case for SHM: do not try to lookup additional lanes when - * SHM transport detected (another transport will be significantly - * slower) */ - break; + /* Count how many times the REMOTE device is used */ + ae = &select_params->address->address_list[addr_index]; + ++remote_dev_count[ae->dev_index]; + if (remote_dev_count[ae->dev_index] >= ae->dev_num_paths) { + /* exclude remote device if reached max concurrency level */ + remote_dev_bitmap &= ~UCS_BIT(ae->dev_index); } + + md_map |= UCS_BIT(context->tl_rscs[rsc_index].md_index); } - return UCS_OK; + return num_lanes; 
} -static ucs_status_t ucp_wireup_add_am_bw_lanes(ucp_ep_h ep, const ucp_ep_params_t *params, - unsigned ep_init_flags, unsigned address_count, - const ucp_address_entry_t *address_list, - ucp_wireup_lane_desc_t *lane_descs, - ucp_lane_index_t *num_lanes_p) +static ucs_status_t +ucp_wireup_add_am_bw_lanes(const ucp_wireup_select_params_t *select_params, + ucp_wireup_select_context_t *select_ctx) { - ucp_context_h context = ep->worker->context; + ucp_ep_h ep = select_params->ep; + ucp_context_h context = ep->worker->context; + unsigned ep_init_flags = ucp_wireup_ep_init_flags(select_params, + select_ctx); + + ucp_lane_index_t lane_desc_idx, am_lane; ucp_wireup_select_bw_info_t bw_info; - ucp_lane_index_t lane_desc_idx; - ucp_rsc_index_t rsc_index; - unsigned addr_index; + unsigned num_am_bw_lanes; - /* Check if we need active messages, for wireup */ - if (!(ucp_ep_get_context_features(ep) & UCP_FEATURE_TAG) || - (ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE) || - (ep->worker->context->config.ext.max_eager_lanes < 2)) { + /* Check if we need active message BW lanes */ + if (!(ucp_ep_get_context_features(ep) & (UCP_FEATURE_TAG | + UCP_FEATURE_AM)) || + (ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE) || + (context->config.ext.max_eager_lanes < 2)) { return UCS_OK; } @@ -1059,113 +1167,184 @@ static ucs_status_t ucp_wireup_add_am_bw_lanes(ucp_ep_h ep, const ucp_ep_params_ bw_info.criteria.remote_iface_flags = UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_CB_SYNC; bw_info.criteria.local_iface_flags = UCT_IFACE_FLAG_AM_BCOPY; + bw_info.criteria.remote_event_flags = 0; + bw_info.criteria.local_event_flags = 0; bw_info.criteria.calc_score = ucp_wireup_am_bw_score_func; bw_info.criteria.tl_rsc_flags = 0; ucp_wireup_clean_amo_criteria(&bw_info.criteria); - ucp_wireup_fill_ep_params_criteria(&bw_info.criteria, params); + ucp_wireup_fill_peer_err_criteria(&bw_info.criteria, ep_init_flags); - if (ucs_test_all_flags(ucp_ep_get_context_features(ep), UCP_FEATURE_TAG | - UCP_FEATURE_WAKEUP)) { 
- bw_info.criteria.local_iface_flags |= UCP_WORKER_UCT_UNSIG_EVENT_CAP_FLAGS; + if (ucs_test_all_flags(ucp_ep_get_context_features(ep), + UCP_FEATURE_TAG | UCP_FEATURE_WAKEUP)) { + bw_info.criteria.local_event_flags = UCP_WORKER_UCT_UNSIG_EVENT_CAP_FLAGS; } - bw_info.local_dev_bitmap = -1; - bw_info.remote_dev_bitmap = -1; + bw_info.local_dev_bitmap = UINT64_MAX; + bw_info.remote_dev_bitmap = UINT64_MAX; bw_info.md_map = 0; - bw_info.max_lanes = ep->worker->context->config.ext.max_eager_lanes - 1; - bw_info.usage = UCP_WIREUP_LANE_USAGE_AM_BW; + bw_info.max_lanes = context->config.ext.max_eager_lanes - 1; + bw_info.lane_type = UCP_LANE_TYPE_AM_BW; /* am_bw_lane[0] is am_lane, so don't re-select it here */ - for (lane_desc_idx = 0; lane_desc_idx < *num_lanes_p; ++lane_desc_idx) { - if (lane_descs[lane_desc_idx].usage & UCP_WIREUP_LANE_USAGE_AM) { - addr_index = lane_descs[lane_desc_idx].addr_index; - rsc_index = lane_descs[lane_desc_idx].rsc_index; - bw_info.md_map |= UCS_BIT(context->tl_rscs[rsc_index].md_index); - bw_info.local_dev_bitmap &= ~UCS_BIT(context->tl_rscs[rsc_index].dev_index); - bw_info.remote_dev_bitmap &= ~UCS_BIT(address_list[addr_index].dev_index); - if (ucp_wireup_is_ep_single_lane(ep, rsc_index)) { - /* if AM lane is SELF or SHMEM - then do not use more lanes */ - return UCS_OK; - } else { - break; /* do not continue searching due to we found - AM lane (and there is only one lane) */ - } + am_lane = UCP_NULL_LANE; + for (lane_desc_idx = 0; lane_desc_idx < select_ctx->num_lanes; ++lane_desc_idx) { + if (select_ctx->lane_descs[lane_desc_idx].lane_types & + UCS_BIT(UCP_LANE_TYPE_AM)) { + /* do not continue searching since we found AM lane (and there is + * only one AM lane) */ + am_lane = lane_desc_idx; + break; } } - return ucp_wireup_add_bw_lanes(ep, address_count, address_list, &bw_info, 1, - -1, lane_descs, num_lanes_p); + num_am_bw_lanes = ucp_wireup_add_bw_lanes(select_params, &bw_info, UINT64_MAX, + am_lane, select_ctx); + return ((am_lane 
!= UCP_NULL_LANE) || (num_am_bw_lanes > 0)) ? UCS_OK : + UCS_ERR_UNREACHABLE; +} + +static uint64_t ucp_wireup_get_rma_bw_iface_flags(ucp_rndv_mode_t rndv_mode) +{ + switch (rndv_mode) { + case UCP_RNDV_MODE_AUTO: + return (UCT_IFACE_FLAG_GET_ZCOPY | UCT_IFACE_FLAG_PUT_ZCOPY); + case UCP_RNDV_MODE_GET_ZCOPY: + return UCT_IFACE_FLAG_GET_ZCOPY; + case UCP_RNDV_MODE_PUT_ZCOPY: + return UCT_IFACE_FLAG_PUT_ZCOPY; + default: + return 0; + } } -static ucs_status_t ucp_wireup_add_rma_bw_lanes(ucp_ep_h ep, - const ucp_ep_params_t *params, - unsigned ep_init_flags, unsigned address_count, - const ucp_address_entry_t *address_list, - ucp_wireup_lane_desc_t *lane_descs, - ucp_lane_index_t *num_lanes_p) +static ucs_status_t +ucp_wireup_add_rma_bw_lanes(const ucp_wireup_select_params_t *select_params, + ucp_wireup_select_context_t *select_ctx) { + ucp_ep_h ep = select_params->ep; + ucp_context_h context = ep->worker->context; + unsigned ep_init_flags = ucp_wireup_ep_init_flags(select_params, + select_ctx); + uint64_t iface_rma_flags = 0; + ucp_rndv_mode_t rndv_modes[] = { + context->config.ext.rndv_mode, + UCP_RNDV_MODE_GET_ZCOPY, + UCP_RNDV_MODE_PUT_ZCOPY + }; ucp_wireup_select_bw_info_t bw_info; - uct_memory_type_t mem_type; + ucs_memory_type_t mem_type; + size_t added_lanes; + uint64_t md_reg_flag; + uint8_t i; if (ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE) { - bw_info.criteria.remote_md_flags = 0; - bw_info.criteria.local_md_flags = 0; + md_reg_flag = 0; } else if (ucp_ep_get_context_features(ep) & UCP_FEATURE_TAG) { /* if needed for RNDV, need only access for remote registered memory */ - bw_info.criteria.remote_md_flags = UCT_MD_FLAG_REG; - bw_info.criteria.local_md_flags = UCT_MD_FLAG_REG; + md_reg_flag = UCT_MD_FLAG_REG; } else { return UCS_OK; } - bw_info.criteria.title = "high-bw remote memory access"; - bw_info.criteria.remote_iface_flags = UCT_IFACE_FLAG_GET_ZCOPY | - UCT_IFACE_FLAG_PUT_ZCOPY; - bw_info.criteria.local_iface_flags = bw_info.criteria.remote_iface_flags 
| - UCT_IFACE_FLAG_PENDING; + bw_info.criteria.remote_iface_flags = 0; + bw_info.criteria.local_iface_flags = UCT_IFACE_FLAG_PENDING; + bw_info.criteria.remote_event_flags = 0; + bw_info.criteria.local_event_flags = 0; bw_info.criteria.calc_score = ucp_wireup_rma_bw_score_func; bw_info.criteria.tl_rsc_flags = 0; + bw_info.criteria.remote_md_flags = md_reg_flag; ucp_wireup_clean_amo_criteria(&bw_info.criteria); - ucp_wireup_fill_ep_params_criteria(&bw_info.criteria, params); + ucp_wireup_fill_peer_err_criteria(&bw_info.criteria, ep_init_flags); if (ucs_test_all_flags(ucp_ep_get_context_features(ep), UCP_FEATURE_TAG | UCP_FEATURE_WAKEUP)) { - bw_info.criteria.local_iface_flags |= UCP_WORKER_UCT_UNSIG_EVENT_CAP_FLAGS; + bw_info.criteria.local_event_flags = UCP_WORKER_UCT_UNSIG_EVENT_CAP_FLAGS; } - bw_info.local_dev_bitmap = -1; - bw_info.remote_dev_bitmap = -1; + bw_info.local_dev_bitmap = UINT64_MAX; + bw_info.remote_dev_bitmap = UINT64_MAX; bw_info.md_map = 0; - bw_info.max_lanes = ep->worker->context->config.ext.max_rndv_lanes; - bw_info.usage = UCP_WIREUP_LANE_USAGE_RMA_BW; - for (mem_type = 0; mem_type < UCT_MD_MEM_TYPE_LAST; mem_type++) { - if (!ep->worker->context->mem_type_tls[mem_type]) { - continue; + /* check rkey_ptr */ + if (!(ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE) && + (context->config.ext.rndv_mode == UCP_RNDV_MODE_AUTO)) { + + /* We require remote memory registration and local ability to obtain + * a pointer to the remote key. Only one is needed since we are doing + * memory copy on the CPU. + * Allow selecting additional lanes in case the remote memory will not be + * registered with this memory domain, i.e with GPU memory. 
+ */ + bw_info.lane_type = UCP_LANE_TYPE_RKEY_PTR; + bw_info.criteria.title = "obtain remote memory pointer"; + bw_info.criteria.local_md_flags = UCT_MD_FLAG_RKEY_PTR; + bw_info.max_lanes = 1; + + ucp_wireup_add_bw_lanes(select_params, &bw_info, + context->mem_type_access_tls[UCS_MEMORY_TYPE_HOST], + UCP_NULL_LANE, select_ctx); + } + + /* First checked RNDV mode has to be a mode specified in config */ + bw_info.lane_type = UCP_LANE_TYPE_RMA_BW; + bw_info.criteria.title = "high-bw remote memory access"; + bw_info.criteria.local_md_flags = md_reg_flag; + bw_info.max_lanes = context->config.ext.max_rndv_lanes; + ucs_assert(rndv_modes[0] == context->config.ext.rndv_mode); + + /* RNDV protocol can't mix different schemes, i.e. wireup has to + * select lanes with the same iface flags depends on a requested + * RNDV scheme. + * First of all, try to select lanes with RNDV scheme requested + * by user. If no lanes were selected and RNDV scheme in the + * configuration is AUTO, try other schemes. 
*/ + UCS_STATIC_ASSERT(UCS_MEMORY_TYPE_HOST == 0); + for (i = 0; i < ucs_array_size(rndv_modes); i++) { + /* Remove the previous iface RMA flags */ + bw_info.criteria.remote_iface_flags &= ~iface_rma_flags; + bw_info.criteria.local_iface_flags &= ~iface_rma_flags; + + iface_rma_flags = ucp_wireup_get_rma_bw_iface_flags(rndv_modes[i]); + + /* Set the new iface RMA flags */ + bw_info.criteria.remote_iface_flags |= iface_rma_flags; + bw_info.criteria.local_iface_flags |= iface_rma_flags; + + added_lanes = 0; + + for (mem_type = UCS_MEMORY_TYPE_HOST; + mem_type < UCS_MEMORY_TYPE_LAST; mem_type++) { + if (!context->mem_type_access_tls[mem_type]) { + continue; + } + + added_lanes += ucp_wireup_add_bw_lanes(select_params, &bw_info, + context->mem_type_access_tls[mem_type], + UCP_NULL_LANE, select_ctx); } - ucp_wireup_add_bw_lanes(ep, address_count, address_list, &bw_info, 0, - ep->worker->context->mem_type_tls[mem_type], - lane_descs, num_lanes_p); + if (added_lanes /* There are selected lanes */ || + /* There are no selected lanes, but a user requested + * the exact RNDV scheme, so there is no other choice */ + (context->config.ext.rndv_mode != UCP_RNDV_MODE_AUTO)) { + break; + } } return UCS_OK; } /* Lane for transport offloaded tag interface */ -static ucs_status_t ucp_wireup_add_tag_lane(ucp_ep_h ep, unsigned address_count, - const ucp_address_entry_t *address_list, - ucp_wireup_lane_desc_t *lane_descs, - ucp_lane_index_t *num_lanes_p, - double am_score, - ucp_err_handling_mode_t err_mode) +static ucs_status_t +ucp_wireup_add_tag_lane(const ucp_wireup_select_params_t *select_params, + const ucp_wireup_select_info_t *am_info, + ucp_err_handling_mode_t err_mode, + ucp_wireup_select_context_t *select_ctx) { - ucp_wireup_criteria_t criteria = {0}; - ucp_rsc_index_t rsc_index; + ucp_ep_h ep = select_params->ep; + ucp_wireup_criteria_t criteria = {0}; + ucp_wireup_select_info_t select_info = {0}; ucs_status_t status; - unsigned addr_index; - double score; - int is_proxy; if 
(!(ucp_ep_get_context_features(ep) & UCP_FEATURE_TAG) || /* TODO: remove check below when UCP_ERR_HANDLING_MODE_PEER supports @@ -1183,40 +1362,32 @@ static ucs_status_t ucp_wireup_add_tag_lane(ucp_ep_h ep, unsigned address_count, UCT_IFACE_FLAG_TAG_RNDV_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY | UCT_IFACE_FLAG_PENDING; + criteria.remote_event_flags = 0; criteria.calc_score = ucp_wireup_am_score_func; - if (ucs_test_all_flags(ucp_ep_get_context_features(ep), UCP_FEATURE_WAKEUP)) { - criteria.local_iface_flags |= UCP_WORKER_UCT_UNSIG_EVENT_CAP_FLAGS; + if (ucs_test_all_flags(ucp_ep_get_context_features(ep), + UCP_FEATURE_WAKEUP)) { + criteria.local_event_flags = UCP_WORKER_UCT_UNSIG_EVENT_CAP_FLAGS; } /* Do not add tag offload lane, if selected tag lane score is lower * than AM score. In this case AM will be used for tag macthing. */ - status = ucp_wireup_select_transport(ep, address_list, address_count, &criteria, - -1, -1, -1, -1, 0, &rsc_index, &addr_index, - &score); - if ((status != UCS_OK) || (am_score > score)) { - goto out; + status = ucp_wireup_select_transport(select_params, &criteria, + UINT64_MAX, UINT64_MAX, UINT64_MAX, + UINT64_MAX, 0, &select_info); + if ((status == UCS_OK) && + (ucp_score_cmp(select_info.score, + am_info->score) >= 0)) { + return ucp_wireup_add_lane(select_params, &select_info, + UCP_LANE_TYPE_TAG, select_ctx); } - /* If the remote side is not p2p and has only signaled wakeup, it may - * deactivate its interface and wait for signaled tag message to wake up. - * Use a proxy lane which would send the first tag message as signaled to - * make sure the remote interface will indeed wake up. 
- */ - is_proxy = ucp_wireup_is_lane_proxy(ep, rsc_index, - address_list[addr_index].iface_attr.cap_flags); - - ucp_wireup_add_lane_desc(lane_descs, num_lanes_p, rsc_index, addr_index, - address_list[addr_index].md_index, score, - UCP_WIREUP_LANE_USAGE_TAG, is_proxy); - -out: return UCS_OK; } static ucp_lane_index_t ucp_wireup_select_wireup_msg_lane(ucp_worker_h worker, - const ucp_ep_params_t *ep_params, + unsigned ep_init_flags, const ucp_address_entry_t *address_list, const ucp_wireup_lane_desc_t *lane_descs, ucp_lane_index_t num_lanes) @@ -1230,7 +1401,7 @@ ucp_wireup_select_wireup_msg_lane(ucp_worker_h worker, ucp_lane_index_t lane; unsigned addr_index; - ucp_wireup_fill_aux_criteria(&criteria, ep_params); + ucp_wireup_fill_aux_criteria(&criteria, ep_init_flags); for (lane = 0; lane < num_lanes; ++lane) { rsc_index = lane_descs[lane].rsc_index; addr_index = lane_descs[lane].addr_index; @@ -1243,144 +1414,161 @@ ucp_wireup_select_wireup_msg_lane(ucp_worker_h worker, attrs->cap.flags, criteria.local_iface_flags, criteria.title, ucp_wireup_iface_flags, NULL, 0) && + ucp_wireup_check_flags(resource, + attrs->cap.event_flags, + criteria.local_event_flags, criteria.title, + ucp_wireup_event_flags, NULL, 0) && ucp_wireup_check_flags(resource, address_list[addr_index].iface_attr.cap_flags, criteria.remote_iface_flags, criteria.title, - ucp_wireup_iface_flags, NULL, 0)) - { - return lane; - } else if (ucp_worker_is_tl_p2p(worker, rsc_index)) { - p2p_lane = lane; - } + ucp_wireup_iface_flags, NULL, 0) && + ucp_wireup_check_flags(resource, + address_list[addr_index].iface_attr.event_flags, + criteria.remote_event_flags, criteria.title, + ucp_wireup_event_flags, NULL, 0)) + { + return lane; + } else if (ucp_worker_is_tl_p2p(worker, rsc_index)) { + p2p_lane = lane; + } } return p2p_lane; } -static uint64_t -ucp_wireup_get_reachable_mds(ucp_worker_h worker, unsigned address_count, - const ucp_address_entry_t *address_list) +static UCS_F_NOINLINE void 
+ucp_wireup_select_params_init(ucp_wireup_select_params_t *select_params, + ucp_ep_h ep, unsigned ep_init_flags, + const ucp_unpacked_address_t *remote_address, + uint64_t tl_bitmap, int show_error) { - ucp_context_h context = worker->context; - uint64_t reachable_mds = 0; - const ucp_address_entry_t *ae; - ucp_rsc_index_t rsc_index; - - for (rsc_index = 0; rsc_index < context->num_tls; ++rsc_index) { - for (ae = address_list; ae < address_list + address_count; ++ae) { - if (ucp_wireup_is_reachable(worker, rsc_index, ae)) { - reachable_mds |= UCS_BIT(ae->md_index); - } - } - } - - return reachable_mds; + select_params->ep = ep; + select_params->ep_init_flags = ep_init_flags; + select_params->tl_bitmap = tl_bitmap; + select_params->address = remote_address; + select_params->allow_am = + ucp_wireup_allow_am_emulation_layer(ep_init_flags); + select_params->show_error = show_error; } -ucs_status_t ucp_wireup_select_lanes(ucp_ep_h ep, const ucp_ep_params_t *params, - unsigned ep_init_flags, unsigned address_count, - const ucp_address_entry_t *address_list, - uint8_t *addr_indices, - ucp_ep_config_key_t *key) +static UCS_F_NOINLINE ucs_status_t +ucp_wireup_search_lanes(const ucp_wireup_select_params_t *select_params, + ucp_err_handling_mode_t err_mode, + ucp_wireup_select_context_t *select_ctx) { - ucp_worker_h worker = ep->worker; - ucp_context_h context = worker->context; - double am_score = 0.0; - ucp_wireup_lane_desc_t lane_descs[UCP_MAX_LANES]; - ucp_rsc_index_t rsc_index; - ucp_md_index_t md_index; - ucp_lane_index_t lane; - ucp_lane_index_t i; + ucp_wireup_select_info_t am_info; ucs_status_t status; - int need_am = 0; - memset(lane_descs, 0, sizeof(lane_descs)); - ucp_ep_config_key_reset(key); - ucp_ep_config_key_set_params(key, params); + memset(select_ctx, 0, sizeof(*select_ctx)); - status = ucp_wireup_add_rma_lanes(ep, params, ep_init_flags, address_count, - address_list, lane_descs, &key->num_lanes, - &need_am); + status = ucp_wireup_add_cm_lane(select_params, 
select_ctx); if (status != UCS_OK) { return status; } - status = ucp_wireup_add_amo_lanes(ep, params, ep_init_flags, address_count, - address_list, lane_descs, &key->num_lanes, - &need_am); + status = ucp_wireup_add_rma_lanes(select_params, select_ctx); if (status != UCS_OK) { return status; } - if (need_am) { - ep_init_flags |= UCP_EP_CREATE_AM_LANE; + status = ucp_wireup_add_amo_lanes(select_params, select_ctx); + if (status != UCS_OK) { + return status; } - status = ucp_wireup_add_am_lane(ep, params, ep_init_flags, address_count, - address_list, lane_descs, &key->num_lanes, - &am_score, key->err_mode); + /* Add AM lane only after RMA/AMO was selected to be aware + * about whether they need emulation over AM or not */ + status = ucp_wireup_add_am_lane(select_params, &am_info, select_ctx); if (status != UCS_OK) { return status; } - status = ucp_wireup_add_rma_bw_lanes(ep, params, ep_init_flags, address_count, - address_list, lane_descs, &key->num_lanes); + status = ucp_wireup_add_rma_bw_lanes(select_params, select_ctx); if (status != UCS_OK) { return status; } - status = ucp_wireup_add_tag_lane(ep, address_count, address_list, - lane_descs, &key->num_lanes, am_score, - key->err_mode); + status = ucp_wireup_add_tag_lane(select_params, &am_info, err_mode, + select_ctx); if (status != UCS_OK) { return status; } /* call ucp_wireup_add_am_bw_lanes after ucp_wireup_add_am_lane to * allow exclude AM lane from AM_BW list */ - status = ucp_wireup_add_am_bw_lanes(ep, params, ep_init_flags, address_count, - address_list, lane_descs, &key->num_lanes); + status = ucp_wireup_add_am_bw_lanes(select_params, select_ctx); if (status != UCS_OK) { return status; } /* User should not create endpoints unless requested communication features */ - if (key->num_lanes == 0) { + if (select_ctx->num_lanes == 0) { ucs_error("No transports selected to %s (features: 0x%lx)", - ucp_ep_peer_name(ep), ucp_ep_get_context_features(ep)); + select_params->address->name, + 
ucp_ep_get_context_features(select_params->ep)); return UCS_ERR_UNREACHABLE; } + return UCS_OK; +} + +static UCS_F_NOINLINE void +ucp_wireup_construct_lanes(const ucp_wireup_select_params_t *select_params, + ucp_wireup_select_context_t *select_ctx, + unsigned *addr_indices, ucp_ep_config_key_t *key) +{ + ucp_ep_h ep = select_params->ep; + ucp_worker_h worker = ep->worker; + ucp_context_h context = worker->context; + ucp_rsc_index_t rsc_index; + ucp_md_index_t md_index; + ucp_lane_index_t lane; + ucp_lane_index_t i; + + key->num_lanes = select_ctx->num_lanes; /* Construct the endpoint configuration key: * - arrange lane description in the EP configuration * - create remote MD bitmap * - if AM lane exists and fits for wireup messages, select it for this purpose. */ for (lane = 0; lane < key->num_lanes; ++lane) { - ucs_assert(lane_descs[lane].usage != 0); - key->lanes[lane].rsc_index = lane_descs[lane].rsc_index; - key->lanes[lane].proxy_lane = lane_descs[lane].proxy_lane; - key->lanes[lane].dst_md_index = lane_descs[lane].dst_md_index; - addr_indices[lane] = lane_descs[lane].addr_index; - - if (lane_descs[lane].usage & UCP_WIREUP_LANE_USAGE_AM) { + ucs_assert(select_ctx->lane_descs[lane].lane_types != 0); + key->lanes[lane].rsc_index = select_ctx->lane_descs[lane].rsc_index; + key->lanes[lane].proxy_lane = select_ctx->lane_descs[lane].proxy_lane; + key->lanes[lane].dst_md_index = select_ctx->lane_descs[lane].dst_md_index; + key->lanes[lane].path_index = select_ctx->lane_descs[lane].path_index; + key->lanes[lane].lane_types = select_ctx->lane_descs[lane].lane_types; + addr_indices[lane] = select_ctx->lane_descs[lane].addr_index; + + if (select_ctx->lane_descs[lane].lane_types & UCS_BIT(UCP_LANE_TYPE_CM)) { + ucs_assert(key->cm_lane == UCP_NULL_LANE); + key->cm_lane = lane; + /* CM lane can't be shared with TL lane types */ + ucs_assert(ucs_popcount(select_ctx->lane_descs[lane].lane_types) == 1); + continue; + } + if (select_ctx->lane_descs[lane].lane_types & 
UCS_BIT(UCP_LANE_TYPE_AM)) { ucs_assert(key->am_lane == UCP_NULL_LANE); key->am_lane = lane; } - if ((lane_descs[lane].usage & UCP_WIREUP_LANE_USAGE_AM_BW) && + if ((select_ctx->lane_descs[lane].lane_types & UCS_BIT(UCP_LANE_TYPE_AM_BW)) && (lane < UCP_MAX_LANES - 1)) { key->am_bw_lanes[lane + 1] = lane; } - if (lane_descs[lane].usage & UCP_WIREUP_LANE_USAGE_RMA) { + if (select_ctx->lane_descs[lane].lane_types & UCS_BIT(UCP_LANE_TYPE_RMA)) { key->rma_lanes[lane] = lane; } - if (lane_descs[lane].usage & UCP_WIREUP_LANE_USAGE_RMA_BW) { + if (select_ctx->lane_descs[lane].lane_types & UCS_BIT(UCP_LANE_TYPE_RMA_BW)) { key->rma_bw_lanes[lane] = lane; } - if (lane_descs[lane].usage & UCP_WIREUP_LANE_USAGE_AMO) { + if (select_ctx->lane_descs[lane].lane_types & UCS_BIT(UCP_LANE_TYPE_RKEY_PTR)) { + ucs_assert(key->rkey_ptr_lane == UCP_NULL_LANE); + key->rkey_ptr_lane = lane; + } + if (select_ctx->lane_descs[lane].lane_types & UCS_BIT(UCP_LANE_TYPE_AMO)) { key->amo_lanes[lane] = lane; } - if (lane_descs[lane].usage & UCP_WIREUP_LANE_USAGE_TAG) { + if (select_ctx->lane_descs[lane].lane_types & UCS_BIT(UCP_LANE_TYPE_TAG)) { ucs_assert(key->tag_lane == UCP_NULL_LANE); key->tag_lane = lane; } @@ -1388,31 +1576,32 @@ ucs_status_t ucp_wireup_select_lanes(ucp_ep_h ep, const ucp_ep_params_t *params, /* Sort AM, RMA and AMO lanes according to score */ ucs_qsort_r(key->am_bw_lanes + 1, UCP_MAX_LANES - 1, sizeof(ucp_lane_index_t), - ucp_wireup_compare_lane_am_bw_score, lane_descs); + ucp_wireup_compare_lane_am_bw_score, select_ctx->lane_descs); ucs_qsort_r(key->rma_lanes, UCP_MAX_LANES, sizeof(ucp_lane_index_t), - ucp_wireup_compare_lane_rma_score, lane_descs); + ucp_wireup_compare_lane_rma_score, select_ctx->lane_descs); ucs_qsort_r(key->rma_bw_lanes, UCP_MAX_LANES, sizeof(ucp_lane_index_t), - ucp_wireup_compare_lane_rma_bw_score, lane_descs); + ucp_wireup_compare_lane_rma_bw_score, select_ctx->lane_descs); ucs_qsort_r(key->amo_lanes, UCP_MAX_LANES, sizeof(ucp_lane_index_t), - 
ucp_wireup_compare_lane_amo_score, lane_descs); - - /* Get all reachable MDs from full remote address list */ - key->reachable_md_map = ucp_wireup_get_reachable_mds(worker, address_count, - address_list); - - /* Select lane for wireup messages */ - key->wireup_lane = ucp_wireup_select_wireup_msg_lane(worker, params, - address_list, - lane_descs, - key->num_lanes); + ucp_wireup_compare_lane_amo_score, select_ctx->lane_descs); + + if (!ucp_ep_init_flags_has_cm(select_params->ep_init_flags)) { + /* Select lane for wireup messages */ + key->wireup_lane = + ucp_wireup_select_wireup_msg_lane(worker, + ucp_wireup_ep_init_flags(select_params, + select_ctx), + select_params->address->address_list, + select_ctx->lane_descs, + key->num_lanes); + } /* add to map first UCP_MAX_OP_MDS fastest MD's */ for (i = 0; (key->rma_bw_lanes[i] != UCP_NULL_LANE) && (ucs_popcount(key->rma_bw_md_map) < UCP_MAX_OP_MDS); i++) { lane = key->rma_bw_lanes[i]; - rsc_index = lane_descs[lane].rsc_index; - md_index = worker->context->tl_rscs[rsc_index].md_index; + rsc_index = select_ctx->lane_descs[lane].rsc_index; + md_index = context->tl_rscs[rsc_index].md_index; /* Pack remote key only if needed for RMA. * FIXME a temporary workaround to prevent the ugni uct from using rndv. 
*/ @@ -1422,10 +1611,53 @@ ucs_status_t ucp_wireup_select_lanes(ucp_ep_h ep, const ucp_ep_params_t *params, } } + if ((key->rkey_ptr_lane != UCP_NULL_LANE) && + (ucs_popcount(key->rma_bw_md_map) < UCP_MAX_OP_MDS)) { + rsc_index = select_ctx->lane_descs[key->rkey_ptr_lane].rsc_index; + md_index = context->tl_rscs[rsc_index].md_index; + key->rma_bw_md_map |= UCS_BIT(md_index); + } + /* use AM lane first for eager AM transport to simplify processing single/middle * msg packets */ key->am_bw_lanes[0] = key->am_lane; +} + +ucs_status_t +ucp_wireup_select_lanes(ucp_ep_h ep, unsigned ep_init_flags, uint64_t tl_bitmap, + const ucp_unpacked_address_t *remote_address, + unsigned *addr_indices, ucp_ep_config_key_t *key) +{ + ucp_worker_h worker = ep->worker; + uint64_t scalable_tl_bitmap = worker->scalable_tl_bitmap & tl_bitmap; + ucp_wireup_select_context_t select_ctx; + ucp_wireup_select_params_t select_params; + ucs_status_t status; + + if (scalable_tl_bitmap) { + ucp_wireup_select_params_init(&select_params, ep, ep_init_flags, + remote_address, scalable_tl_bitmap, 0); + status = ucp_wireup_search_lanes(&select_params, key->err_mode, + &select_ctx); + if (status == UCS_OK) { + goto out; + } + /* If the transport selection based on the scalable TL bitmap wasn't + * successful, repeat the selection procedure with full TL bitmap in + * order to select best transports based on their scores only */ + } + + ucp_wireup_select_params_init(&select_params, ep, ep_init_flags, + remote_address, tl_bitmap, 1); + status = ucp_wireup_search_lanes(&select_params, key->err_mode, + &select_ctx); + if (status != UCS_OK) { + return status; + } + +out: + ucp_wireup_construct_lanes(&select_params, &select_ctx, addr_indices, key); return UCS_OK; } @@ -1439,55 +1671,58 @@ static double ucp_wireup_aux_score_func(ucp_context_h context, iface_attr->overhead + remote_iface_attr->overhead)); } -ucs_status_t ucp_wireup_select_aux_transport(ucp_ep_h ep, - const ucp_ep_params_t *params, - const 
ucp_address_entry_t *address_list, - unsigned address_count, - ucp_rsc_index_t *rsc_index_p, - unsigned *addr_index_p) +ucs_status_t +ucp_wireup_select_aux_transport(ucp_ep_h ep, unsigned ep_init_flags, + const ucp_unpacked_address_t *remote_address, + ucp_wireup_select_info_t *select_info) { ucp_wireup_criteria_t criteria = {0}; - double score; - - ucp_wireup_fill_aux_criteria(&criteria, params); - return ucp_wireup_select_transport(ep, address_list, address_count, - &criteria, -1, -1, -1, -1, 1, rsc_index_p, - addr_index_p, &score); + ucp_wireup_select_params_t select_params; + + ucp_wireup_select_params_init(&select_params, ep, ep_init_flags, + remote_address, UINT64_MAX, 1); + ucp_wireup_fill_aux_criteria(&criteria, ep_init_flags); + return ucp_wireup_select_transport(&select_params, &criteria, + UINT64_MAX, UINT64_MAX, UINT64_MAX, + UINT64_MAX, 1, select_info); } -ucs_status_t ucp_wireup_select_sockaddr_transport(ucp_ep_h ep, - const ucp_ep_params_t *params, - ucp_rsc_index_t *rsc_index_p) +ucs_status_t +ucp_wireup_select_sockaddr_transport(const ucp_context_h context, + const ucs_sock_addr_t *sockaddr, + ucp_rsc_index_t *rsc_index_p) { - ucp_worker_h worker = ep->worker; - ucp_context_h context = worker->context; char saddr_str[UCS_SOCKADDR_STRING_LEN]; ucp_tl_resource_desc_t *resource; ucp_rsc_index_t tl_id; ucp_md_index_t md_index; uct_md_h md; + int i; - for (tl_id = 0; tl_id < context->num_tls; ++tl_id) { + /* Go over the sockaddr transports priority array and try to use the transports + * one by one for the client side */ + for (i = 0; i < context->config.num_sockaddr_tls; i++) { + tl_id = context->config.sockaddr_tl_ids[i]; resource = &context->tl_rscs[tl_id]; - if (!(resource->flags & UCP_TL_RSC_FLAG_SOCKADDR)) { - continue; - } - md_index = resource->md_index; md = context->tl_mds[md_index].md; + ucs_assert(context->tl_mds[md_index].attr.cap.flags & UCT_MD_FLAG_SOCKADDR); - if (uct_md_is_sockaddr_accessible(md, ¶ms->sockaddr, + /* The client selects 
the transport for sockaddr according to the + * configuration. We rely on the server having this transport available + * as well */ + if (uct_md_is_sockaddr_accessible(md, sockaddr, UCT_SOCKADDR_ACC_REMOTE)) { - /* TODO use score to prefer best tl rather than using first one */ *rsc_index_p = tl_id; + ucs_debug("sockaddr transport selected: %s", resource->tl_rsc.tl_name); return UCS_OK; } ucs_debug("md %s cannot reach %s", context->tl_mds[md_index].rsc.md_name, - ucs_sockaddr_str(params->sockaddr.addr, saddr_str, + ucs_sockaddr_str(sockaddr->addr, saddr_str, sizeof(saddr_str))); } diff --git a/src/ucp/wireup/signaling_ep.c b/src/ucp/wireup/signaling_ep.c index 4459276603b..7fe876f273f 100644 --- a/src/ucp/wireup/signaling_ep.c +++ b/src/ucp/wireup/signaling_ep.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "wireup.h" #include @@ -22,7 +26,7 @@ static size_t ucp_signaling_ep_pack_short(void *dest, void *arg) ucp_signaling_ep_pack_ctx_t *ctx = arg; *(uint64_t*)dest = ctx->header; - memcpy(dest + sizeof(uint64_t), ctx->payload, ctx->length); + memcpy(UCS_PTR_BYTE_OFFSET(dest, sizeof(uint64_t)), ctx->payload, ctx->length); return sizeof(uint64_t) + ctx->length; } @@ -46,6 +50,9 @@ ucp_signaling_ep_am_short(uct_ep_h ep, uint8_t id, uint64_t header, ctx.payload = payload; ctx.length = length; + ucp_assert_memtype(proxy_ep->ucp_ep->worker->context, ctx.payload, + ctx.length, UCS_MEMORY_TYPE_HOST); + packed_size = uct_ep_am_bcopy(proxy_ep->uct_ep, id, ucp_signaling_ep_pack_short, &ctx, UCT_SEND_FLAG_SIGNALED); @@ -99,6 +106,9 @@ ucp_signaling_ep_tag_eager_short(uct_ep_h ep, uct_tag_t tag, const void *data, ctx.payload = data; ctx.length = length; + ucp_assert_memtype(proxy_ep->ucp_ep->worker->context, ctx.payload, + ctx.length, UCS_MEMORY_TYPE_HOST); + packed_size = uct_ep_tag_eager_bcopy(proxy_ep->uct_ep, tag, 0, ucp_signaling_ep_pack_tag_short, &ctx, UCT_SEND_FLAG_SIGNALED); diff --git 
a/src/ucp/wireup/wireup.c b/src/ucp/wireup/wireup.c index 310e29d227a..f63ae8940c2 100644 --- a/src/ucp/wireup/wireup.c +++ b/src/ucp/wireup/wireup.c @@ -4,8 +4,13 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "wireup.h" #include "address.h" +#include "wireup_cm.h" #include "wireup_ep.h" #include @@ -30,6 +35,48 @@ static size_t ucp_wireup_msg_pack(void *dest, void *arg) return sizeof(ucp_wireup_msg_t) + req->send.length; } +static const char* ucp_wireup_msg_str(uint8_t msg_type) +{ + switch (msg_type) { + case UCP_WIREUP_MSG_PRE_REQUEST: + return "PRE_REQ"; + case UCP_WIREUP_MSG_REQUEST: + return "REQ"; + case UCP_WIREUP_MSG_REPLY: + return "REP"; + case UCP_WIREUP_MSG_ACK: + return "ACK"; + default: + return ""; + } +} + +static ucp_lane_index_t ucp_wireup_get_msg_lane(ucp_ep_h ep, uint8_t msg_type) +{ + ucp_context_h context = ep->worker->context; + ucp_ep_config_t *ep_config = ucp_ep_config(ep); + ucp_lane_index_t lane = UCP_NULL_LANE; + + if (msg_type != UCP_WIREUP_MSG_ACK) { + /* for request/response, try wireup_lane first */ + lane = ep_config->key.wireup_lane; + } + + if (lane == UCP_NULL_LANE) { + /* fallback to active messages lane */ + lane = ep_config->key.am_lane; + } + + if (lane == UCP_NULL_LANE) { + ucs_fatal("ep %p to %s: could not find a lane to send CONN_%s%s", + ep, ucp_ep_peer_name(ep), ucp_wireup_msg_str(msg_type), + context->config.ext.unified_mode ? + ". try to set UCX_UNIFIED_MODE=n." 
: ""); + } + + return lane; +} + ucs_status_t ucp_wireup_msg_progress(uct_pending_req_t *self) { ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); @@ -48,11 +95,7 @@ ucs_status_t ucp_wireup_msg_progress(uct_pending_req_t *self) } /* send the active message */ - if (req->send.wireup.type == UCP_WIREUP_MSG_ACK) { - req->send.lane = ucp_ep_get_am_lane(ep); - } else { - req->send.lane = ucp_ep_get_wireup_msg_lane(ep); - } + req->send.lane = ucp_wireup_get_msg_lane(ep, req->send.wireup.type); am_flags = 0; if ((req->send.wireup.type == UCP_WIREUP_MSG_REQUEST) || @@ -67,7 +110,8 @@ ucs_status_t ucp_wireup_msg_progress(uct_pending_req_t *self) ucp_wireup_msg_pack, req, am_flags); if (packed_len < 0) { if (packed_len != UCS_ERR_NO_RESOURCE) { - ucs_error("failed to send wireup: %s", ucs_status_string(packed_len)); + ucs_error("failed to send wireup: %s", + ucs_status_string((ucs_status_t)packed_len)); } return (ucs_status_t)packed_len; } @@ -93,13 +137,6 @@ ucs_status_t ucp_wireup_msg_progress(uct_pending_req_t *self) return UCS_OK; } -static unsigned ucp_wireup_address_index(const unsigned *order, - uint64_t tl_bitmap, - ucp_rsc_index_t tl_index) -{ - return order[ucs_bitmap2idx(tl_bitmap, tl_index)]; -} - static inline int ucp_wireup_is_ep_needed(ucp_ep_h ep) { return (ep != NULL) && !(ep->flags & UCP_EP_FLAG_LISTENER); @@ -108,18 +145,15 @@ static inline int ucp_wireup_is_ep_needed(ucp_ep_h ep) /* * @param [in] rsc_tli Resource index for every lane. 
*/ -static ucs_status_t ucp_wireup_msg_send(ucp_ep_h ep, uint8_t type, - uint64_t tl_bitmap, - const ucp_rsc_index_t *rsc_tli) +static ucs_status_t +ucp_wireup_msg_send(ucp_ep_h ep, uint8_t type, uint64_t tl_bitmap, + const ucp_lane_index_t *lanes2remote) { - ucp_rsc_index_t rsc_index; - ucp_lane_index_t lane; ucp_request_t* req; ucs_status_t status; void *address; - unsigned *order = ucs_alloca(ep->worker->context->num_tls * sizeof(*order)); - ucs_assert(ep->cfg_index != (uint8_t)-1); + ucs_assert(ep->cfg_index != UCP_NULL_CFG_INDEX); /* We cannot allocate from memory pool because it's not thread safe * and this function may be called from any thread @@ -146,8 +180,10 @@ static ucs_status_t ucp_wireup_msg_send(ucp_ep_h ep, uint8_t type, ucp_request_send_state_init(req, ucp_dt_make_contig(1), 0); /* pack all addresses */ - status = ucp_address_pack(ep->worker, ucp_wireup_is_ep_needed(ep) ? ep : NULL, - tl_bitmap, order, &req->send.length, &address); + status = ucp_address_pack(ep->worker, + ucp_wireup_is_ep_needed(ep) ? ep : NULL, + tl_bitmap, UCP_ADDRESS_PACK_FLAGS_ALL, + lanes2remote, &req->send.length, &address); if (status != UCS_OK) { ucs_free(req); ucs_error("failed to pack address: %s", ucs_status_string(status)); @@ -156,30 +192,115 @@ static ucs_status_t ucp_wireup_msg_send(ucp_ep_h ep, uint8_t type, req->send.buffer = address; - /* send the indices addresses that should be connected by remote side */ + ucp_request_send(req, 0); + return UCS_OK; +} + +static uint64_t ucp_wireup_get_ep_tl_bitmap(ucp_ep_h ep, ucp_lane_map_t lane_map) +{ + uint64_t tl_bitmap = 0; + ucp_lane_index_t lane; + + ucs_for_each_bit(lane, lane_map) { + ucs_assert(lane < UCP_MAX_LANES); + tl_bitmap |= UCS_BIT(ucp_ep_get_rsc_index(ep, lane)); + } + + return tl_bitmap; +} + +/* + * Select remote ep address for every remote address entry (because there + * could be multiple ep addresses per entry). 
This selection is used to create + * 'lanes2remote' mapping with the remote lane index for each local lane. + */ +static void +ucp_wireup_match_p2p_lanes(ucp_ep_h ep, + const ucp_unpacked_address_t *remote_address, + const unsigned *addr_indices, + ucp_lane_index_t *lanes2remote) +{ + const ucp_address_entry_t *address; + unsigned address_index; + ucp_lane_index_t lane, remote_lane; + unsigned *ep_addr_indexes; + unsigned ep_addr_index; + uint64_t UCS_V_UNUSED used_remote_lanes; + + /* Initialize the counters of ep address index for each address entry */ + ep_addr_indexes = ucs_alloca(sizeof(ep_addr_index) * + remote_address->address_count); + for (address_index = 0; address_index < remote_address->address_count; + ++address_index) { + ep_addr_indexes[address_index] = 0; + } + + /* Initialize lanes2remote array */ for (lane = 0; lane < UCP_MAX_LANES; ++lane) { - rsc_index = rsc_tli[lane]; - if (rsc_index != UCP_NULL_RESOURCE) { - req->send.wireup.tli[lane] = ucp_wireup_address_index(order, - tl_bitmap, - rsc_index); - } else { - req->send.wireup.tli[lane] = -1; - } + lanes2remote[lane] = UCP_NULL_LANE; } - ucp_request_send(req, 0); - return UCS_OK; + used_remote_lanes = 0; + for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) { + if (!ucp_ep_is_lane_p2p(ep, lane)) { + continue; + } + + /* Select next remote ep address within the address_index as specified + * by addr_indices argument + */ + address_index = addr_indices[lane]; + address = &remote_address->address_list[address_index]; + ep_addr_index = ep_addr_indexes[address_index]++; + ucs_assertv(ep_addr_index < address->num_ep_addrs, + "ep_addr_index=%u num_ep_addrs=%u", + ep_addr_index, address->num_ep_addrs); + remote_lane = address->ep_addrs[ep_addr_index].lane; + lanes2remote[lane] = remote_lane; + + if (used_remote_lanes & UCS_BIT(remote_lane)) { + ucs_fatal("ep %p: remote lane %d is used more than once", ep, + remote_lane); + } + used_remote_lanes |= UCS_BIT(remote_lane); + + ucs_trace("ep %p: 
lane[%d]->remote_lane[%d] (address[%d].ep_address[%d])", + ep, lane, remote_lane, address_index, ep_addr_index); + } } -static ucs_status_t ucp_wireup_connect_local(ucp_ep_h ep, const uint8_t *tli, - unsigned address_count, - const ucp_address_entry_t *address_list) +static ucs_status_t +ucp_wireup_find_remote_p2p_addr(ucp_ep_h ep, ucp_lane_index_t remote_lane, + const ucp_unpacked_address_t *remote_address, + const uct_ep_addr_t **ep_addr_p, + const uct_device_addr_t **dev_addr_p) { const ucp_address_entry_t *address; - ucp_lane_index_t lane; + unsigned ep_addr_index; + + ucp_unpacked_address_for_each(address, remote_address) { + for (ep_addr_index = 0; ep_addr_index < address->num_ep_addrs; + ++ep_addr_index) { + if (remote_lane == address->ep_addrs[ep_addr_index].lane) { + *ep_addr_p = address->ep_addrs[ep_addr_index].addr; + *dev_addr_p = address->dev_addr; + return UCS_OK; + } + } + } + + return UCS_ERR_UNREACHABLE; +} + +ucs_status_t +ucp_wireup_connect_local(ucp_ep_h ep, + const ucp_unpacked_address_t *remote_address, + const ucp_lane_index_t *lanes2remote) +{ + ucp_lane_index_t lane, remote_lane; + const uct_device_addr_t *dev_addr; + const uct_ep_addr_t *ep_addr; ucs_status_t status; - ucp_md_map_t UCS_V_UNUSED md_map; ucs_trace("ep %p: connect local transports", ep); @@ -188,9 +309,17 @@ static ucs_status_t ucp_wireup_connect_local(ucp_ep_h ep, const uint8_t *tli, continue; } - address = &address_list[tli[lane]]; - status = uct_ep_connect_to_ep(ep->uct_eps[lane], address->dev_addr, - address->ep_addr); + remote_lane = (lanes2remote == NULL) ? 
lane : lanes2remote[lane]; + + status = ucp_wireup_find_remote_p2p_addr(ep, remote_lane, remote_address, + &ep_addr, &dev_addr); + if (status != UCS_OK) { + ucs_error("ep %p: no remote ep address for lane[%d]->remote_lane[%d]", + ep, lane, remote_lane); + return status; + } + + status = uct_ep_connect_to_ep(ep->uct_eps[lane], dev_addr, ep_addr); if (status != UCS_OK) { return status; } @@ -199,7 +328,7 @@ static ucs_status_t ucp_wireup_connect_local(ucp_ep_h ep, const uint8_t *tli, return UCS_OK; } -static void ucp_wireup_remote_connected(ucp_ep_h ep) +void ucp_wireup_remote_connected(ucp_ep_h ep) { ucp_lane_index_t lane; @@ -225,14 +354,12 @@ static void ucp_wireup_remote_connected(ucp_ep_h ep) static ucs_status_t ucp_wireup_init_lanes_by_request(ucp_worker_h worker, ucp_ep_h ep, - const ucp_ep_params_t *params, - unsigned ep_init_flags, unsigned address_count, - const ucp_address_entry_t *address_list, - uint8_t *addr_indices) + unsigned ep_init_flags, + const ucp_unpacked_address_t *remote_address, + unsigned *addr_indices) { - ucs_status_t status = ucp_wireup_init_lanes(ep, params, ep_init_flags, - address_count, address_list, - addr_indices); + ucs_status_t status = ucp_wireup_init_lanes(ep, ep_init_flags, UINT64_MAX, + remote_address, addr_indices); if (status == UCS_OK) { return UCS_OK; } @@ -246,8 +373,8 @@ static UCS_F_NOINLINE void ucp_wireup_process_pre_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, const ucp_unpacked_address_t *remote_address) { - uint8_t addr_indices[UCP_MAX_LANES]; - ucp_ep_params_t params; + unsigned ep_init_flags = UCP_EP_INIT_CREATE_AM_LANE; + unsigned addr_indices[UCP_MAX_LANES]; ucs_status_t status; ucp_ep_h ep; @@ -263,15 +390,13 @@ ucp_wireup_process_pre_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, ucp_ep_update_dest_ep_ptr(ep, msg->src_ep_ptr); ucp_ep_flush_state_reset(ep); - params.field_mask = UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; - params.err_mode = ucp_ep_config(ep)->key.err_mode; + if 
(ucp_ep_config(ep)->key.err_mode == UCP_ERR_HANDLING_MODE_PEER) { + ep_init_flags |= UCP_EP_INIT_ERR_MODE_PEER_FAILURE; + } /* initialize transport endpoints */ - status = ucp_wireup_init_lanes_by_request(worker, ep, ¶ms, - UCP_EP_CREATE_AM_LANE, - remote_address->address_count, - remote_address->address_list, - addr_indices); + status = ucp_wireup_init_lanes_by_request(worker, ep, ep_init_flags, + remote_address, addr_indices); if (status != UCS_OK) { return; } @@ -290,11 +415,8 @@ ucp_wireup_process_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, uint64_t tl_bitmap = 0; int send_reply = 0; unsigned ep_init_flags = 0; - ucp_rsc_index_t rsc_tli[UCP_MAX_LANES]; - uint8_t addr_indices[UCP_MAX_LANES]; - ucp_lane_index_t lane, remote_lane; - ucp_rsc_index_t rsc_index; - ucp_ep_params_t params; + ucp_rsc_index_t lanes2remote[UCP_MAX_LANES]; + unsigned addr_indices[UCP_MAX_LANES]; ucs_status_t status; ucp_ep_flags_t listener_flag; ucp_ep_h ep; @@ -313,14 +435,14 @@ ucp_wireup_process_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, * flush state) should be valid until user's callback invoking */ ucp_ep_flush_state_reset(ep); } - ep_init_flags |= UCP_EP_CREATE_AM_LANE; + ep_init_flags |= UCP_EP_INIT_CREATE_AM_LANE; } else { ep = ucp_ep_match_retrieve_exp(&worker->ep_match_ctx, remote_uuid, msg->conn_sn ^ (remote_uuid == worker->uuid)); if (ep == NULL) { /* Create a new endpoint if does not exist */ - status = ucp_ep_new(worker, remote_address->name, "remote-request", - &ep); + status = ucp_worker_create_ep(worker, remote_address->name, + "remote-request", &ep); if (status != UCS_OK) { return; } @@ -349,9 +471,6 @@ ucp_wireup_process_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, } } - params.field_mask = UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; - params.err_mode = msg->err_mode; - if (ep->flags & UCP_EP_FLAG_LISTENER) { /* If this is an ep on a listener (server) that received a partial * worker address from the client, then the following lanes 
initialization @@ -361,15 +480,19 @@ ucp_wireup_process_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, ucp_ep_cleanup_lanes(ep); } + if (msg->err_mode == UCP_ERR_HANDLING_MODE_PEER) { + ep_init_flags |= UCP_EP_INIT_ERR_MODE_PEER_FAILURE; + } + /* Initialize lanes (possible destroy existing lanes) */ - status = ucp_wireup_init_lanes_by_request(worker, ep, ¶ms, ep_init_flags, - remote_address->address_count, - remote_address->address_list, - addr_indices); + status = ucp_wireup_init_lanes_by_request(worker, ep, ep_init_flags, + remote_address, addr_indices); if (status != UCS_OK) { return; } + ucp_wireup_match_p2p_lanes(ep, remote_address, addr_indices, lanes2remote); + /* Send a reply if remote side does not have ep_ptr (active-active flow) or * there are p2p lanes (client-server flow) */ @@ -377,13 +500,13 @@ ucp_wireup_process_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, /* Connect p2p addresses to remote endpoint */ if (!(ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED)) { - status = ucp_wireup_connect_local(ep, addr_indices, - remote_address->address_count, - remote_address->address_list); + status = ucp_wireup_connect_local(ep, remote_address, lanes2remote); if (status != UCS_OK) { return; } + tl_bitmap = ucp_wireup_get_ep_tl_bitmap(ep, + ucp_ep_config(ep)->p2p_lanes); ep->flags |= UCP_EP_FLAG_LOCAL_CONNECTED; ucs_assert(send_reply); @@ -401,24 +524,9 @@ ucp_wireup_process_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, * (so that address packing would be correct) */ ep->flags &= ~UCP_EP_FLAG_LISTENER; - /* Construct the list that tells the remote side with which address we - * have connected to each of its lanes. - */ - memset(rsc_tli, -1, sizeof(rsc_tli)); - for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) { - rsc_index = ucp_ep_get_rsc_index(ep, lane); - for (remote_lane = 0; remote_lane < UCP_MAX_LANES; ++remote_lane) { - /* If 'lane' has connected to 'remote_lane' ... 
*/ - if (addr_indices[lane] == msg->tli[remote_lane]) { - ucs_assert(ucp_worker_is_tl_p2p(worker, rsc_index)); - rsc_tli[remote_lane] = rsc_index; - tl_bitmap |= UCS_BIT(rsc_index); - } - } - } - ucs_trace("ep %p: sending wireup reply", ep); - status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_REPLY, tl_bitmap, rsc_tli); + status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_REPLY, tl_bitmap, + lanes2remote); if (status != UCS_OK) { return; } @@ -476,9 +584,13 @@ ucp_wireup_process_reply(ucp_worker_h worker, const ucp_wireup_msg_t *msg, /* Connect p2p addresses to remote endpoint */ if (!(ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED)) { - status = ucp_wireup_connect_local(ep, msg->tli, - remote_address->address_count, - remote_address->address_list); + + /* + * In the wireup reply message, the lane indexes specify which + * **receiver** ep lane should be connected to a given ep address. So we + * don't pass 'lanes2remote' mapping, and use local lanes directly. + */ + status = ucp_wireup_connect_local(ep, remote_address, NULL); if (status != UCS_OK) { return; } @@ -530,11 +642,20 @@ static ucs_status_t ucp_wireup_msg_handler(void *arg, void *data, ucp_worker_h worker = arg; ucp_wireup_msg_t *msg = data; ucp_unpacked_address_t remote_address; + ucp_ep_h ep UCS_V_UNUSED; ucs_status_t status; UCS_ASYNC_BLOCK(&worker->async); - status = ucp_address_unpack(worker, msg + 1, &remote_address); + if (msg->dest_ep_ptr != 0) { + ep = ucp_worker_get_ep_by_ptr(worker, msg->dest_ep_ptr); + /* Current CM connection establishment does not use extra wireup + messages */ + ucs_assert(!ucp_ep_has_cm_lane(ep)); + } + + status = ucp_address_unpack(worker, msg + 1, UCP_ADDRESS_PACK_FLAGS_ALL, + &remote_address); if (status != UCS_OK) { ucs_error("failed to unpack address: %s", ucs_status_string(status)); goto out; @@ -560,8 +681,8 @@ static ucs_status_t ucp_wireup_msg_handler(void *arg, void *data, return UCS_OK; } -static void ucp_wireup_assign_lane(ucp_ep_h ep, ucp_lane_index_t lane, - uct_ep_h 
uct_ep, const char *info) +void ucp_wireup_assign_lane(ucp_ep_h ep, ucp_lane_index_t lane, uct_ep_h uct_ep, + const char *info) { /* If ep already exists, it's a wireup proxy, and we need to update its * next_ep instead of replacing it. @@ -590,40 +711,32 @@ static uct_ep_h ucp_wireup_extract_lane(ucp_ep_h ep, ucp_lane_index_t lane) } } -static ucs_status_t ucp_wireup_connect_lane(ucp_ep_h ep, - const ucp_ep_params_t *params, - ucp_lane_index_t lane, - unsigned address_count, - const ucp_address_entry_t *address_list, - unsigned addr_index) +static ucs_status_t +ucp_wireup_connect_lane_to_iface(ucp_ep_h ep, ucp_lane_index_t lane, + unsigned path_index, + ucp_worker_iface_t *wiface, + const ucp_address_entry_t *address) { - ucp_worker_h worker = ep->worker; - ucp_rsc_index_t rsc_index = ucp_ep_get_rsc_index(ep, lane); - ucp_lane_index_t proxy_lane = ucp_ep_get_proxy_lane(ep, lane); - ucp_worker_iface_t *wiface = ucp_worker_iface(worker, rsc_index); + ucp_lane_index_t proxy_lane = ucp_ep_get_proxy_lane(ep, lane); uct_ep_params_t uct_ep_params; uct_ep_h uct_ep; ucs_status_t status; - ucs_trace("ep %p: connect lane[%d]", ep, lane); + ucs_assert(wiface->attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); - /* - * if the selected transport can be connected directly to the remote - * interface, just create a connected UCT endpoint. 
- */ - if ((wiface->attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) && - ((ep->uct_eps[lane] == NULL) || ucp_wireup_ep_test(ep->uct_eps[lane]))) - { + if ((ep->uct_eps[lane] == NULL) || ucp_wireup_ep_test(ep->uct_eps[lane])) { if ((proxy_lane == UCP_NULL_LANE) || (proxy_lane == lane)) { /* create an endpoint connected to the remote interface */ - ucs_trace("ep %p: connect uct_ep[%d] to addr[%d]", ep, lane, - addr_index); - uct_ep_params.field_mask = UCT_EP_PARAM_FIELD_IFACE | - UCT_EP_PARAM_FIELD_DEV_ADDR | - UCT_EP_PARAM_FIELD_IFACE_ADDR; + ucs_trace("ep %p: connect uct_ep[%d] to addr %p", ep, lane, + address); + uct_ep_params.field_mask = UCT_EP_PARAM_FIELD_IFACE | + UCT_EP_PARAM_FIELD_DEV_ADDR | + UCT_EP_PARAM_FIELD_IFACE_ADDR | + UCT_EP_PARAM_FIELD_PATH_INDEX; uct_ep_params.iface = wiface->iface; - uct_ep_params.dev_addr = address_list[addr_index].dev_addr; - uct_ep_params.iface_addr = address_list[addr_index].iface_addr; + uct_ep_params.dev_addr = address->dev_addr; + uct_ep_params.iface_addr = address->iface_addr; + uct_ep_params.path_index = path_index; status = uct_ep_create(&uct_ep_params, &uct_ep); if (status != UCS_OK) { /* coverity[leaked_storage] */ @@ -637,49 +750,96 @@ static ucs_status_t ucp_wireup_connect_lane(ucp_ep_h ep, return UCS_OK; } - /* - * create a wireup endpoint which will start connection establishment - * protocol using an auxiliary transport. 
- */ - if (wiface->attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) { + return UCS_ERR_UNREACHABLE; +} - /* For now, p2p transports have no reason to have proxy */ - ucs_assert_always(proxy_lane == UCP_NULL_LANE); +static ucs_status_t +ucp_wireup_connect_lane_to_ep(ucp_ep_h ep, unsigned ep_init_flags, + ucp_lane_index_t lane, unsigned path_index, + ucp_rsc_index_t rsc_index, + ucp_worker_iface_t *wiface, + const ucp_unpacked_address_t *remote_address) +{ + int connect_aux; + uct_ep_h uct_ep; + ucs_status_t status; - /* If ep already exists, it's a wireup proxy, and we need to start - * auxiliary wireup. - */ - if (ep->uct_eps[lane] == NULL) { - status = ucp_wireup_ep_create(ep, &uct_ep); - if (status != UCS_OK) { - /* coverity[leaked_storage] */ - return status; - } + /* p2p transports have no reason to have proxy because the progress is + enabled on both sides */ + ucs_assert_always(ucp_ep_get_proxy_lane(ep, lane) == UCP_NULL_LANE); - ucs_trace("ep %p: assign uct_ep[%d]=%p wireup", ep, lane, uct_ep); - ep->uct_eps[lane] = uct_ep; - } else { - uct_ep = ep->uct_eps[lane]; + if (ep->uct_eps[lane] == NULL) { + status = ucp_wireup_ep_create(ep, &uct_ep); + if (status != UCS_OK) { + /* coverity[leaked_storage] */ + return status; } - ucs_trace("ep %p: connect uct_ep[%d]=%p to addr[%d] wireup", ep, lane, - uct_ep, addr_index); - status = ucp_wireup_ep_connect(ep->uct_eps[lane], params, rsc_index, - lane == ucp_ep_get_wireup_msg_lane(ep), - address_count, address_list); + ucs_trace("ep %p: assign uct_ep[%d]=%p wireup", ep, lane, uct_ep); + ep->uct_eps[lane] = uct_ep; + } else { + uct_ep = ep->uct_eps[lane]; + ucs_assert(ucp_wireup_ep_test(uct_ep)); + } + + if (!(ep_init_flags & UCP_EP_INIT_CM_WIREUP_CLIENT)) { + ucs_trace("ep %p: connect uct_ep[%d]=%p to remote addr %p wireup", ep, + lane, uct_ep, remote_address); + connect_aux = !ucp_ep_init_flags_has_cm(ep_init_flags) && + (lane == ucp_ep_get_wireup_msg_lane(ep)); + status = ucp_wireup_ep_connect(ep->uct_eps[lane], 
ep_init_flags, + rsc_index, path_index, connect_aux, + remote_address); if (status != UCS_OK) { return status; } + } - ucp_worker_iface_progress_ep(wiface); + ucp_worker_iface_progress_ep(wiface); - return UCS_OK; - } + return UCS_OK; +} - return UCS_ERR_UNREACHABLE; +ucs_status_t +ucp_wireup_connect_lane(ucp_ep_h ep, unsigned ep_init_flags, + ucp_lane_index_t lane, unsigned path_index, + const ucp_unpacked_address_t *remote_address, + unsigned addr_index) +{ + ucp_worker_h worker = ep->worker; + ucp_rsc_index_t rsc_index; + ucp_worker_iface_t *wiface; + ucp_address_entry_t *address; + + ucs_trace("ep %p: connect lane[%d]", ep, lane); + + ucs_assert(lane != ucp_ep_get_cm_lane(ep)); + + ucs_assert_always(remote_address != NULL); + ucs_assert_always(remote_address->address_list != NULL); + ucs_assert_always(addr_index <= remote_address->address_count); + + rsc_index = ucp_ep_get_rsc_index(ep, lane); + wiface = ucp_worker_iface(worker, rsc_index); + + /* + * create a wireup endpoint which will start connection establishment + * protocol using an auxiliary transport. 
+ */ + if (ucp_ep_config(ep)->p2p_lanes & UCS_BIT(lane)) { + return ucp_wireup_connect_lane_to_ep(ep, ep_init_flags, lane, + path_index, rsc_index, wiface, + remote_address); + } else if (ucp_worker_is_tl_2iface(worker, rsc_index)) { + address = &remote_address->address_list[addr_index]; + return ucp_wireup_connect_lane_to_iface(ep, lane, path_index, wiface, + address); + } else { + return UCS_ERR_UNREACHABLE; + } } -static ucs_status_t ucp_wireup_resolve_proxy_lanes(ucp_ep_h ep) +ucs_status_t ucp_wireup_resolve_proxy_lanes(ucp_ep_h ep) { ucp_lane_index_t lane, proxy_lane; uct_iface_attr_t *iface_attr; @@ -737,7 +897,7 @@ static ucs_status_t ucp_wireup_resolve_proxy_lanes(ucp_ep_h ep) static void ucp_wireup_print_config(ucp_context_h context, const ucp_ep_config_key_t *key, const char *title, - uint8_t *addr_indices, + const unsigned *addr_indices, ucs_log_level_t log_level) { char lane_info[128] = {0}; @@ -759,34 +919,124 @@ static void ucp_wireup_print_config(ucp_context_h context, } } -ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, const ucp_ep_params_t *params, - unsigned ep_init_flags, unsigned address_count, - const ucp_address_entry_t *address_list, - uint8_t *addr_indices) +int ucp_wireup_is_reachable(ucp_ep_h ep, ucp_rsc_index_t rsc_index, + const ucp_address_entry_t *ae) +{ + ucp_context_h context = ep->worker->context; + ucp_worker_iface_t *wiface = ucp_worker_iface(ep->worker, rsc_index); + + return (context->tl_rscs[rsc_index].tl_name_csum == ae->tl_name_csum) && + (ucp_ep_has_cm_lane(ep) || /* assume reachability is checked by CM */ + uct_iface_is_reachable(wiface->iface, ae->dev_addr, ae->iface_addr)); +} + +static void +ucp_wireup_get_reachable_mds(ucp_ep_h ep, + const ucp_unpacked_address_t *remote_address, + ucp_ep_config_key_t *key) +{ + ucp_context_h context = ep->worker->context; + const ucp_ep_config_key_t *prev_config_key; + ucp_rsc_index_t ae_cmpts[UCP_MAX_MDS]; /* component index for each address entry */ + const ucp_address_entry_t *ae; + 
ucp_rsc_index_t cmpt_index; + ucp_rsc_index_t rsc_index; + ucp_md_index_t dst_md_index; + ucp_md_map_t ae_dst_md_map, dst_md_map; + ucp_md_map_t prev_dst_md_map; + unsigned num_dst_mds; + + ae_dst_md_map = 0; + ucs_for_each_bit(rsc_index, context->tl_bitmap) { + ucp_unpacked_address_for_each(ae, remote_address) { + if (ucp_wireup_is_reachable(ep, rsc_index, ae)) { + ae_dst_md_map |= UCS_BIT(ae->md_index); + dst_md_index = context->tl_rscs[rsc_index].md_index; + ae_cmpts[ae->md_index] = context->tl_mds[dst_md_index].cmpt_index; + } + } + } + + if (ep->cfg_index == UCP_NULL_CFG_INDEX) { + prev_config_key = NULL; + prev_dst_md_map = 0; + } else { + prev_config_key = &ucp_ep_config(ep)->key; + prev_dst_md_map = prev_config_key->reachable_md_map; + } + + /* merge with previous configuration */ + dst_md_map = ae_dst_md_map | prev_dst_md_map; + num_dst_mds = 0; + ucs_for_each_bit(dst_md_index, dst_md_map) { + cmpt_index = UCP_NULL_RESOURCE; + /* remote md is reachable by the provided address */ + if (UCS_BIT(dst_md_index) & ae_dst_md_map) { + cmpt_index = ae_cmpts[dst_md_index]; + } + /* remote md is reachable by previous ep configuration */ + if (UCS_BIT(dst_md_index) & prev_dst_md_map) { + ucs_assert(prev_config_key != NULL); + cmpt_index = ucp_ep_config_get_dst_md_cmpt(prev_config_key, dst_md_index); + if (UCS_BIT(dst_md_index) & ae_dst_md_map) { + /* we expect previous configuration will not conflict with the + * new one + */ + ucs_assert_always(cmpt_index == ae_cmpts[dst_md_index]); + } + } + ucs_assert_always(cmpt_index != UCP_NULL_RESOURCE); + key->dst_md_cmpts[num_dst_mds++] = cmpt_index; + } + ucs_assert(num_dst_mds == ucs_popcount(dst_md_map)); + + key->reachable_md_map = dst_md_map; +} + +ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, unsigned ep_init_flags, + uint64_t local_tl_bitmap, + const ucp_unpacked_address_t *remote_address, + unsigned *addr_indices) { ucp_worker_h worker = ep->worker; + uint64_t tl_bitmap = local_tl_bitmap & 
worker->context->tl_bitmap; ucp_ep_config_key_t key; - uint16_t new_cfg_index; + ucp_ep_cfg_index_t new_cfg_index; ucp_lane_index_t lane; ucs_status_t status; char str[32]; + ucp_wireup_ep_t *cm_wireup_ep; + + ucs_assert(tl_bitmap != 0); ucs_trace("ep %p: initialize lanes", ep); - status = ucp_wireup_select_lanes(ep, params, ep_init_flags, address_count, - address_list, addr_indices, &key); + ucp_ep_config_key_reset(&key); + ucp_ep_config_key_set_err_mode(&key, ep_init_flags); + + status = ucp_wireup_select_lanes(ep, ep_init_flags, tl_bitmap, + remote_address, addr_indices, &key); if (status != UCS_OK) { return status; } - key.reachable_md_map |= ucp_ep_config(ep)->key.reachable_md_map; + /* Get all reachable MDs from full remote address list and join with + * current ep configuration + */ + key.dst_md_cmpts = ucs_alloca(sizeof(*key.dst_md_cmpts) * UCP_MAX_MDS); + ucp_wireup_get_reachable_mds(ep, remote_address, &key); + + /* Load new configuration */ + status = ucp_worker_get_ep_config(worker, &key, 1, &new_cfg_index); + if (status != UCS_OK) { + return status; + } - new_cfg_index = ucp_worker_get_ep_config(worker, &key); if (ep->cfg_index == new_cfg_index) { return UCS_OK; /* No change */ } - if ((ep->cfg_index != 0) && !ucp_ep_is_sockaddr_stub(ep)) { + if ((ep->cfg_index != UCP_NULL_CFG_INDEX) && !ucp_ep_is_sockaddr_stub(ep)) { /* * TODO handle a case where we have to change lanes and reconfigure the ep: * @@ -799,9 +1049,13 @@ ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, const ucp_ep_params_t *params, */ ucs_debug("cannot reconfigure ep %p from [%d] to [%d]", ep, ep->cfg_index, new_cfg_index); - return UCS_OK; /* No change */ + ucp_wireup_print_config(worker->context, &ucp_ep_config(ep)->key, "old", + NULL, UCS_LOG_LEVEL_ERROR); + ucp_wireup_print_config(worker->context, &key, "new", NULL, UCS_LOG_LEVEL_ERROR); + ucs_fatal("endpoint reconfiguration not supported yet"); } + cm_wireup_ep = ucp_ep_get_cm_wireup_ep(ep); ep->cfg_index = new_cfg_index; ep->am_lane 
= key.am_lane; @@ -811,8 +1065,15 @@ ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, const ucp_ep_params_t *params, /* establish connections on all underlying endpoints */ for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) { - status = ucp_wireup_connect_lane(ep, params, lane, address_count, - address_list, addr_indices[lane]); + if (ucp_ep_get_cm_lane(ep) == lane) { + /* restore the cm lane after reconfiguration */ + ep->uct_eps[lane] = &cm_wireup_ep->super.super; + continue; + } + + status = ucp_wireup_connect_lane(ep, ep_init_flags, lane, + key.lanes[lane].path_index, + remote_address, addr_indices[lane]); if (status != UCS_OK) { return status; } @@ -833,23 +1094,13 @@ ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, const ucp_ep_params_t *params, ucs_status_t ucp_wireup_send_request(ucp_ep_h ep) { - ucp_worker_h worker = ep->worker; - ucp_rsc_index_t rsc_tli[UCP_MAX_LANES]; ucp_rsc_index_t rsc_index; - uint64_t tl_bitmap = 0; - ucp_lane_index_t lane; ucs_status_t status; + uint64_t tl_bitmap; - for (lane = 0; lane < UCP_MAX_LANES; ++lane) { - if (lane < ucp_ep_num_lanes(ep)) { - rsc_index = ucp_ep_get_rsc_index(ep, lane); - rsc_tli[lane] = ucp_worker_is_tl_p2p(worker, rsc_index) ? 
rsc_index : - UCP_NULL_RESOURCE; - tl_bitmap |= UCS_BIT(rsc_index); - } else { - rsc_tli[lane] = UCP_NULL_RESOURCE; - } - } + ucs_assert(!ucp_ep_has_cm_lane(ep)); + + tl_bitmap = ucp_wireup_get_ep_tl_bitmap(ep, UCS_MASK(ucp_ep_num_lanes(ep))); /* TODO make sure such lane would exist */ rsc_index = ucp_wireup_ep_get_aux_rsc_index( @@ -859,7 +1110,7 @@ ucs_status_t ucp_wireup_send_request(ucp_ep_h ep) } ucs_debug("ep %p: send wireup request (flags=0x%x)", ep, ep->flags); - status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_REQUEST, tl_bitmap, rsc_tli); + status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_REQUEST, tl_bitmap, NULL); ep->flags |= UCP_EP_FLAG_CONNECT_REQ_QUEUED; @@ -878,10 +1129,11 @@ static void ucp_wireup_connect_remote_purge_cb(uct_pending_req_t *self, void *ar ucs_status_t ucp_wireup_send_pre_request(ucp_ep_h ep) { + uint64_t tl_bitmap = UINT64_MAX; /* pack full worker address */ ucp_rsc_index_t rsc_tli[UCP_MAX_LANES]; - uint64_t tl_bitmap = -1; /* pack full worker address */ ucs_status_t status; + ucs_assert(!ucp_ep_has_cm_lane(ep)); ucs_assert(ep->flags & UCP_EP_FLAG_LISTENER); ucs_assert(!(ep->flags & UCP_EP_FLAG_CONNECT_PRE_REQ_QUEUED)); memset(rsc_tli, UCP_NULL_RESOURCE, sizeof(rsc_tli)); @@ -902,6 +1154,8 @@ ucs_status_t ucp_wireup_connect_remote(ucp_ep_h ep, ucp_lane_index_t lane) ucs_trace("ep %p: connect lane %d to remote peer", ep, lane); + ucs_assert(lane != UCP_NULL_LANE); + UCS_ASYNC_BLOCK(&ep->worker->async); /* checking again, with lock held, if already connected or connection is @@ -956,10 +1210,11 @@ ucs_status_t ucp_wireup_connect_remote(ucp_ep_h ep, ucp_lane_index_t lane) (req->send.uct.func == ucp_wireup_msg_progress) || (req->send.uct.func == ucp_wireup_ep_progress_pending) ? 
UCT_CB_FLAG_ASYNC : 0); - ucs_assert(status == UCS_OK); /* because it's a wireup proxy */ + if (status != UCS_OK) { + ucs_fatal("wireup proxy function must always return UCS_OK"); + } } - status = UCS_OK; goto out_unlock; err_destroy_wireup_ep: @@ -980,13 +1235,15 @@ static void ucp_wireup_msg_dump(ucp_worker_h worker, uct_am_trace_type_t type, ucp_unpacked_address_t unpacked_address; const ucp_address_entry_t *ae; ucp_tl_resource_desc_t *rsc; - ucp_lane_index_t lane; - unsigned addr_index; + unsigned ep_addr_index; ucs_status_t status; char *p, *end; ucp_rsc_index_t tl; - status = ucp_address_unpack(worker, msg + 1, &unpacked_address); + status = ucp_address_unpack(worker, msg + 1, + UCP_ADDRESS_PACK_FLAGS_ALL | + UCP_ADDRESS_PACK_FLAG_NO_TRACE, + &unpacked_address); if (status != UCS_OK) { strncpy(unpacked_address.name, "", UCP_WORKER_NAME_MAX); unpacked_address.uuid = 0; @@ -999,16 +1256,16 @@ static void ucp_wireup_msg_dump(ucp_worker_h worker, uct_am_trace_type_t type, snprintf(p, end - p, "WIREUP %s [%s uuid 0x%"PRIx64" src_ep 0x%lx dst_ep 0x%lx conn_sn %d]", - (msg->type == UCP_WIREUP_MSG_PRE_REQUEST ) ? "PRE_REQ" : - (msg->type == UCP_WIREUP_MSG_REQUEST ) ? "REQ" : - (msg->type == UCP_WIREUP_MSG_REPLY ) ? "REP" : - (msg->type == UCP_WIREUP_MSG_ACK ) ? 
"ACK" : "", - unpacked_address.name, unpacked_address.uuid, msg->src_ep_ptr, - msg->dest_ep_ptr, msg->conn_sn); + ucp_wireup_msg_str(msg->type), unpacked_address.name, + unpacked_address.uuid, msg->src_ep_ptr, msg->dest_ep_ptr, + msg->conn_sn); p += strlen(p); - for (addr_index = 0; addr_index < unpacked_address.address_count; ++addr_index) { - ae = &unpacked_address.address_list[addr_index]; + if (unpacked_address.address_list == NULL) { + return; /* No addresses were unpacked */ + } + + ucp_unpacked_address_for_each(ae, &unpacked_address) { ucs_for_each_bit(tl, context->tl_bitmap) { rsc = &context->tl_rscs[tl]; if (ae->tl_name_csum == rsc->tl_name_csum) { @@ -1021,16 +1278,44 @@ static void ucp_wireup_msg_dump(ucp_worker_h worker, uct_am_trace_type_t type, snprintf(p, end - p, "/md[%d]", ae->md_index); p += strlen(p); - for (lane = 0; lane < UCP_MAX_LANES; ++lane) { - if (msg->tli[lane] == addr_index) { - snprintf(p, end - p, "/lane[%d]", lane); - p += strlen(p); - } + for (ep_addr_index = 0; ep_addr_index < ae->num_ep_addrs; + ++ep_addr_index) { + snprintf(p, end - p, "/lane[%d]", ae->ep_addrs[ep_addr_index].lane); + p += strlen(p); } } ucs_free(unpacked_address.address_list); } -UCP_DEFINE_AM(-1, UCP_AM_ID_WIREUP, ucp_wireup_msg_handler, +int ucp_worker_iface_is_tl_p2p(const uct_iface_attr_t *iface_attr) +{ + return !!(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP); +} + +static ucp_err_handling_mode_t +ucp_ep_params_err_handling_mode(const ucp_ep_params_t *params) +{ + return (params->field_mask & UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE) ? 
+ params->err_mode : UCP_ERR_HANDLING_MODE_NONE; +} + +unsigned ucp_ep_init_flags(const ucp_worker_h worker, + const ucp_ep_params_t *params) +{ + unsigned flags = ucp_cm_ep_init_flags(worker, params); + + if (!ucp_worker_sockaddr_is_cm_proto(worker) && + (params->field_mask & UCP_EP_PARAM_FIELD_SOCK_ADDR)) { + flags |= UCP_EP_INIT_CREATE_AM_LANE; + } + + if (ucp_ep_params_err_handling_mode(params) == UCP_ERR_HANDLING_MODE_PEER) { + flags |= UCP_EP_INIT_ERR_MODE_PEER_FAILURE; + } + + return flags; +} + +UCP_DEFINE_AM(UINT64_MAX, UCP_AM_ID_WIREUP, ucp_wireup_msg_handler, ucp_wireup_msg_dump, UCT_CB_FLAG_ASYNC); diff --git a/src/ucp/wireup/wireup.h b/src/ucp/wireup/wireup.h index cc23838f8de..8c971422179 100644 --- a/src/ucp/wireup/wireup.h +++ b/src/ucp/wireup/wireup.h @@ -15,6 +15,11 @@ #include +/* Peer name to show when we don't have debug information, or the name was not + * packed in the worker address */ +#define UCP_WIREUP_EMPTY_PEER_NAME "" + + /** * Wireup message types */ @@ -31,11 +36,13 @@ enum { * Criteria for transport selection. */ typedef struct { - const char *title; /* Name of the criteria for debugging */ - uint64_t local_md_flags; /* Required local MD flags */ - uint64_t remote_md_flags; /* Required remote MD flags */ - uint64_t local_iface_flags; /* Required local interface flags */ - uint64_t remote_iface_flags;/* Required remote interface flags */ + const char *title; /* Name of the criteria for debugging */ + uint64_t local_md_flags; /* Required local MD flags */ + uint64_t remote_md_flags; /* Required remote MD flags */ + uint64_t local_iface_flags; /* Required local interface flags */ + uint64_t remote_iface_flags; /* Required remote interface flags */ + uint64_t local_event_flags; /* Required local event flags */ + uint64_t remote_event_flags; /* Required remote event flags */ /** * Calculates score of a potential transport. 
@@ -67,32 +74,34 @@ typedef struct ucp_wireup_msg { ucp_ep_conn_sn_t conn_sn; /* Connection sequence number */ uintptr_t src_ep_ptr; /* Endpoint of source */ uintptr_t dest_ep_ptr; /* Endpoint of destination (0 - invalid) */ - - /* REQUEST - which p2p lanes must be connected - * REPLY - which p2p lanes have been connected - */ - uint8_t tli[UCP_MAX_LANES]; - /* packed addresses follow */ } UCS_S_PACKED ucp_wireup_msg_t; +typedef struct { + double score; + unsigned addr_index; + unsigned path_index; + ucp_rsc_index_t rsc_index; + uint8_t priority; +} ucp_wireup_select_info_t; + + ucs_status_t ucp_wireup_send_request(ucp_ep_h ep); ucs_status_t ucp_wireup_send_pre_request(ucp_ep_h ep); ucs_status_t ucp_wireup_connect_remote(ucp_ep_h ep, ucp_lane_index_t lane); -ucs_status_t ucp_wireup_select_aux_transport(ucp_ep_h ep, - const ucp_ep_params_t *params, - const ucp_address_entry_t *address_list, - unsigned address_count, - ucp_rsc_index_t *rsc_index_p, - unsigned *addr_index_p); +ucs_status_t +ucp_wireup_select_aux_transport(ucp_ep_h ep, unsigned ep_init_flags, + const ucp_unpacked_address_t *remote_address, + ucp_wireup_select_info_t *select_info); -ucs_status_t ucp_wireup_select_sockaddr_transport(ucp_ep_h ep, - const ucp_ep_params_t *params, - ucp_rsc_index_t *rsc_index_p); +ucs_status_t +ucp_wireup_select_sockaddr_transport(const ucp_context_h context, + const ucs_sock_addr_t *sockaddr, + ucp_rsc_index_t *rsc_index_p); double ucp_wireup_amo_score_func(ucp_context_h context, const uct_md_attr_t *md_attr, @@ -103,26 +112,94 @@ ucs_status_t ucp_wireup_msg_progress(uct_pending_req_t *self); int ucp_wireup_msg_ack_cb_pred(const ucs_callbackq_elem_t *elem, void *arg); -ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, const ucp_ep_params_t *params, - unsigned ep_init_flags, unsigned address_count, - const ucp_address_entry_t *address_list, - uint8_t *addr_indices); +int ucp_wireup_is_reachable(ucp_ep_h ep, ucp_rsc_index_t rsc_index, + const ucp_address_entry_t *ae); + 
+ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, unsigned ep_init_flags, + uint64_t local_tl_bitmap, + const ucp_unpacked_address_t *remote_address, + unsigned *addr_indices); -ucs_status_t ucp_wireup_select_lanes(ucp_ep_h ep, const ucp_ep_params_t *params, - unsigned ep_init_flags, unsigned address_count, - const ucp_address_entry_t *address_list, - uint8_t *addr_indices, - ucp_ep_config_key_t *key); +ucs_status_t +ucp_wireup_select_lanes(ucp_ep_h ep, unsigned ep_init_flags, uint64_t tl_bitmap, + const ucp_unpacked_address_t *remote_address, + unsigned *addr_indices, ucp_ep_config_key_t *key); ucs_status_t ucp_signaling_ep_create(ucp_ep_h ucp_ep, uct_ep_h uct_ep, int is_owner, uct_ep_h *signaling_ep); -static inline int ucp_worker_is_tl_p2p(ucp_worker_h worker, ucp_rsc_index_t rsc_index) +/** + * Check if interface with @a iface_attr supports point to pont connections. + * + * @param [in] iface_attr iface attributes. + * + * @return 1 if iface supports point to pont connections, otherwise 0. + */ +int ucp_worker_iface_is_tl_p2p(const uct_iface_attr_t *iface_attr); + +void ucp_wireup_assign_lane(ucp_ep_h ep, ucp_lane_index_t lane, uct_ep_h uct_ep, + const char *info); + +ucs_status_t +ucp_wireup_connect_lane(ucp_ep_h ep, unsigned ep_init_flags, + ucp_lane_index_t lane, unsigned path_index, + const ucp_unpacked_address_t *remote_address, + unsigned addr_index); + +ucs_status_t ucp_wireup_resolve_proxy_lanes(ucp_ep_h ep); + +void ucp_wireup_remote_connected(ucp_ep_h ep); + +/** + * Check if TL supports point to pont connections. + * + * @param [in] worker UCP worker. + * @param [in] rsc_index resource index. + * + * @return 1 if TL supports point to pont connections, otherwise 0. + */ +static inline int ucp_worker_is_tl_p2p(ucp_worker_h worker, + ucp_rsc_index_t rsc_index) +{ + return ucp_worker_iface_is_tl_p2p(ucp_worker_iface_get_attr(worker, + rsc_index)); +} + +/** + * Check if TL supports connection to interface. + * + * @param [in] worker UCP worker. 
+ * @param [in] rsc_index resource index. + * + * @return 1 if TL supports connection to interface, otherwise 0. + */ +static inline int ucp_worker_is_tl_2iface(ucp_worker_h worker, + ucp_rsc_index_t rsc_index) { - uint64_t flags = ucp_worker_iface_get_attr(worker, rsc_index)->cap.flags; + return !!(ucp_worker_iface_get_attr(worker, rsc_index)->cap.flags & + UCT_IFACE_FLAG_CONNECT_TO_IFACE); +} - return (flags & UCT_IFACE_FLAG_CONNECT_TO_EP) && - !(flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); +/** + * Check if TL supports connection to sockaddr. + * + * @param [in] worker UCP worker. + * @param [in] rsc_index resource index. + * + * @return 1 if TL supports connection to sockaddr, otherwise 0. + */ +static inline UCS_F_MAYBE_UNUSED int +ucp_worker_is_tl_2sockaddr(ucp_worker_h worker, ucp_rsc_index_t rsc_index) +{ + return !!(ucp_worker_iface_get_attr(worker, rsc_index)->cap.flags & + UCT_IFACE_FLAG_CONNECT_TO_SOCKADDR); } +unsigned ucp_ep_init_flags(const ucp_worker_h worker, + const ucp_ep_params_t *params); + +ucs_status_t +ucp_wireup_connect_local(ucp_ep_h ep, + const ucp_unpacked_address_t *remote_address, + const ucp_lane_index_t *lanes2remote); #endif diff --git a/src/ucp/wireup/wireup_cm.c b/src/ucp/wireup/wireup_cm.c new file mode 100644 index 00000000000..cbc94524e00 --- /dev/null +++ b/src/ucp/wireup/wireup_cm.c @@ -0,0 +1,1025 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "wireup_cm.h" +#include +#include +#include +#include +#include +#include + + +unsigned +ucp_cm_ep_init_flags(const ucp_worker_h worker, const ucp_ep_params_t *params) +{ + if (!ucp_worker_sockaddr_is_cm_proto(worker)) { + return 0; + } + + if (params->field_mask & UCP_EP_PARAM_FIELD_SOCK_ADDR) { + return UCP_EP_INIT_CM_WIREUP_CLIENT; + } + + if (params->field_mask & UCP_EP_PARAM_FIELD_CONN_REQUEST) { + return UCP_EP_INIT_CM_WIREUP_SERVER; + } + + return 0; +} + +int ucp_ep_init_flags_has_cm(unsigned ep_init_flags) +{ + return !!(ep_init_flags & (UCP_EP_INIT_CM_WIREUP_CLIENT | + UCP_EP_INIT_CM_WIREUP_SERVER)); +} + +static ucs_status_t +ucp_cm_ep_client_initial_config_get(ucp_ep_h ucp_ep, const char *dev_name, + ucp_ep_config_key_t *key) +{ + ucp_worker_h worker = ucp_ep->worker; + uint64_t addr_pack_flags = UCP_ADDRESS_PACK_FLAG_DEVICE_ADDR | + UCP_ADDRESS_PACK_FLAG_IFACE_ADDR; + ucp_wireup_ep_t *wireup_ep = ucp_ep_get_cm_wireup_ep(ucp_ep); + uint64_t tl_bitmap = ucp_context_dev_tl_bitmap(worker->context, + dev_name); + void *ucp_addr; + size_t ucp_addr_size; + ucp_unpacked_address_t unpacked_addr; + unsigned addr_indices[UCP_MAX_RESOURCES]; + ucs_status_t status; + + ucs_assert_always(wireup_ep != NULL); + + /* Construct local dummy address for lanes selection taking an assumption + * that server has the transports which are the best from client's + * perspective. 
*/ + status = ucp_address_pack(worker, NULL, tl_bitmap, addr_pack_flags, NULL, + &ucp_addr_size, &ucp_addr); + if (status != UCS_OK) { + goto out; + } + + status = ucp_address_unpack(worker, ucp_addr, addr_pack_flags, + &unpacked_addr); + if (status != UCS_OK) { + goto free_ucp_addr; + } + + ucs_assert(unpacked_addr.address_count <= UCP_MAX_RESOURCES); + ucp_ep_config_key_reset(key); + ucp_ep_config_key_set_err_mode(key, wireup_ep->ep_init_flags); + status = ucp_wireup_select_lanes(ucp_ep, wireup_ep->ep_init_flags, + tl_bitmap, &unpacked_addr, addr_indices, + key); + + ucs_free(unpacked_addr.address_list); +free_ucp_addr: + ucs_free(ucp_addr); +out: + return status; +} + +static void ucp_cm_priv_data_pack(ucp_wireup_sockaddr_data_t *sa_data, + ucp_ep_h ep, ucp_rsc_index_t dev_index, + const ucp_address_t *addr, size_t addr_size) +{ + ucs_assert((int)ucp_ep_config(ep)->key.err_mode <= UINT8_MAX); + ucs_assert(dev_index != UCP_NULL_RESOURCE); + + sa_data->ep_ptr = (uintptr_t)ep; + sa_data->err_mode = ucp_ep_config(ep)->key.err_mode; + sa_data->addr_mode = UCP_WIREUP_SA_DATA_CM_ADDR; + sa_data->dev_index = dev_index; + memcpy(sa_data + 1, addr, addr_size); +} + +static ssize_t ucp_cm_client_priv_pack_cb(void *arg, + const uct_cm_ep_priv_data_pack_args_t + *pack_args, void *priv_data) +{ + ucp_wireup_sockaddr_data_t *sa_data = priv_data; + ucp_ep_h ep = arg; + ucp_worker_h worker = ep->worker; + uct_cm_h cm = worker->cms[/*cm_idx = */ 0].cm; + ucp_rsc_index_t dev_index = UCP_NULL_RESOURCE; + ucp_ep_config_key_t key; + uint64_t tl_bitmap; + uct_ep_h tl_ep; + ucp_wireup_ep_t *cm_wireup_ep; + uct_cm_attr_t cm_attr; + uct_ep_params_t tl_ep_params; + void* ucp_addr; + size_t ucp_addr_size; + ucs_status_t status; + ucp_lane_index_t lane_idx; + ucp_rsc_index_t rsc_idx; + const char *dev_name; + ucp_ep_h tmp_ep; + + UCS_ASYNC_BLOCK(&worker->async); + + ucs_assert_always(pack_args->field_mask & + UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME); + + dev_name = 
pack_args->dev_name; + + status = ucp_cm_ep_client_initial_config_get(ep, dev_name, &key); + if (status != UCS_OK) { + goto out; + } + + /* At this point the ep has only CM lane */ + ucs_assert((ucp_ep_num_lanes(ep) == 1) && ucp_ep_has_cm_lane(ep)); + cm_wireup_ep = ucp_ep_get_cm_wireup_ep(ep); + ucs_assert(cm_wireup_ep != NULL); + + /* Create tmp ep which will hold local tl addresses until connect + * event arrives, to avoid asynchronous ep reconfiguration. */ + status = ucp_ep_create_base(worker, "tmp_cm", "tmp cm client", &tmp_ep); + if (status != UCS_OK) { + goto out; + } + cm_wireup_ep->tmp_ep = tmp_ep; + + status = ucp_worker_get_ep_config(worker, &key, 0, &tmp_ep->cfg_index); + if (status != UCS_OK) { + goto out; + } + + cm_attr.field_mask = UCT_CM_ATTR_FIELD_MAX_CONN_PRIV; + status = uct_cm_query(cm, &cm_attr); + if (status != UCS_OK) { + goto out; + } + + tl_bitmap = 0; + for (lane_idx = 0; lane_idx < ucp_ep_num_lanes(tmp_ep); ++lane_idx) { + if (lane_idx == ucp_ep_get_cm_lane(tmp_ep)) { + continue; + } + + rsc_idx = ucp_ep_get_rsc_index(tmp_ep, lane_idx); + if (rsc_idx == UCP_NULL_RESOURCE) { + continue; + } + + status = ucp_wireup_ep_create(tmp_ep, &tmp_ep->uct_eps[lane_idx]); + if (status != UCS_OK) { + goto out; + } + + ucs_assert((dev_index == UCP_NULL_RESOURCE) || + (dev_index == worker->context->tl_rscs[rsc_idx].dev_index)); + dev_index = worker->context->tl_rscs[rsc_idx].dev_index; + + tl_bitmap |= UCS_BIT(rsc_idx); + if (ucp_ep_config(tmp_ep)->p2p_lanes & UCS_BIT(lane_idx)) { + tl_ep_params.field_mask = UCT_EP_PARAM_FIELD_IFACE | + UCT_EP_PARAM_FIELD_PATH_INDEX; + tl_ep_params.iface = ucp_worker_iface(worker, rsc_idx)->iface; + tl_ep_params.path_index = ucp_ep_get_path_index(tmp_ep, lane_idx); + status = uct_ep_create(&tl_ep_params, &tl_ep); + if (status != UCS_OK) { + /* coverity[leaked_storage] */ + goto out; + } + + ucp_wireup_ep_set_next_ep(tmp_ep->uct_eps[lane_idx], tl_ep); + } else { + ucs_assert(ucp_worker_is_tl_2iface(worker, rsc_idx)); + 
} + } + + /* Don't pack the device address to reduce address size, it will be + * delivered by uct_cm_listener_conn_request_callback_t in + * uct_cm_remote_data_t */ + status = ucp_address_pack(worker, tmp_ep, tl_bitmap, + UCP_ADDRESS_PACK_FLAG_IFACE_ADDR | + UCP_ADDRESS_PACK_FLAG_EP_ADDR, + NULL, &ucp_addr_size, &ucp_addr); + if (status != UCS_OK) { + goto out; + } + + if (cm_attr.max_conn_priv < (sizeof(*sa_data) + ucp_addr_size)) { + ucs_error("CM private data buffer is to small to pack UCP endpoint info, " + "ep %p/%p service data %lu, address length %lu, cm %p max_conn_priv %lu", + ep, tmp_ep, sizeof(*sa_data), ucp_addr_size, cm, + cm_attr.max_conn_priv); + status = UCS_ERR_BUFFER_TOO_SMALL; + goto free_addr; + } + + ucs_debug("client ep %p created on device %s idx %d, tl_bitmap 0x%zx", ep, + dev_name, dev_index, tl_bitmap); + /* Pass real ep (not tmp_ep), because only its pointer and err_mode is + * taken from the config. */ + ucp_cm_priv_data_pack(sa_data, ep, dev_index, ucp_addr, ucp_addr_size); + +free_addr: + ucs_free(ucp_addr); +out: + if (status == UCS_OK) { + ep->flags |= UCP_EP_FLAG_LOCAL_CONNECTED; + } else { + ucp_worker_set_ep_failed(worker, ep, + &ucp_ep_get_cm_wireup_ep(ep)->super.super, + ucp_ep_get_cm_lane(ep), status); + } + + UCS_ASYNC_UNBLOCK(&worker->async); + /* coverity[leaked_storage] */ + return (status == UCS_OK) ? 
(sizeof(*sa_data) + ucp_addr_size) : status; +} + +static void +ucp_cm_client_connect_prog_arg_free(ucp_cm_client_connect_progress_arg_t *arg) +{ + ucs_free(arg->sa_data); + ucs_free(arg->dev_addr); + ucs_free(arg); +} + +static void ucp_cm_client_restore_ep(ucp_wireup_ep_t *wireup_cm_ep, + ucp_ep_h ucp_ep) +{ + ucp_ep_h tmp_ep = wireup_cm_ep->tmp_ep; + ucp_wireup_ep_t *w_ep; + ucp_lane_index_t lane_idx; + + for (lane_idx = 0; lane_idx < ucp_ep_num_lanes(tmp_ep); ++lane_idx) { + if (tmp_ep->uct_eps[lane_idx] != NULL) { + ucs_assert(ucp_ep->uct_eps[lane_idx] == NULL); + ucp_ep->uct_eps[lane_idx] = tmp_ep->uct_eps[lane_idx]; + w_ep = ucs_derived_of(ucp_ep->uct_eps[lane_idx], ucp_wireup_ep_t); + w_ep->super.ucp_ep = ucp_ep; + } + } + + ucp_ep_delete(tmp_ep); /* not needed anymore */ + wireup_cm_ep->tmp_ep = NULL; +} + +/* + * The main thread progress part of connection establishment on client side + */ +static unsigned ucp_cm_client_connect_progress(void *arg) +{ + ucp_cm_client_connect_progress_arg_t *progress_arg = arg; + ucp_ep_h ucp_ep = progress_arg->ucp_ep; + ucp_worker_h worker = ucp_ep->worker; + ucp_context_h context = worker->context; + uct_ep_h uct_cm_ep = ucp_ep_get_cm_uct_ep(ucp_ep); + ucp_wireup_ep_t *wireup_ep; + ucp_unpacked_address_t addr; + uint64_t tl_bitmap; + ucp_rsc_index_t dev_index; + ucp_rsc_index_t rsc_index; + unsigned addr_idx; + unsigned addr_indices[UCP_MAX_RESOURCES]; + ucs_status_t status; + + UCS_ASYNC_BLOCK(&worker->async); + + wireup_ep = ucp_ep_get_cm_wireup_ep(ucp_ep); + ucs_assert(wireup_ep != NULL); + ucs_assert(wireup_ep->ep_init_flags & UCP_EP_INIT_CM_WIREUP_CLIENT); + + status = ucp_address_unpack(worker, progress_arg->sa_data + 1, + UCP_ADDRESS_PACK_FLAG_IFACE_ADDR | + UCP_ADDRESS_PACK_FLAG_EP_ADDR, &addr); + if (status != UCS_OK) { + goto out; + } + + if (addr.address_count == 0) { + status = UCS_ERR_UNREACHABLE; + goto out_free_addr; + } + + for (addr_idx = 0; addr_idx < addr.address_count; ++addr_idx) { + 
addr.address_list[addr_idx].dev_addr = progress_arg->dev_addr; + addr.address_list[addr_idx].dev_index = progress_arg->sa_data->dev_index; + } + + ucs_assert(addr.address_count <= UCP_MAX_RESOURCES); + ucp_ep_update_dest_ep_ptr(ucp_ep, progress_arg->sa_data->ep_ptr); + + /* Get tl bitmap from tmp_ep, because it contains initial configuration. */ + tl_bitmap = ucp_ep_get_tl_bitmap(wireup_ep->tmp_ep); + ucs_assert(tl_bitmap != 0); + rsc_index = ucs_ffs64(tl_bitmap); + dev_index = context->tl_rscs[rsc_index].dev_index; + + /* Restore initial configuration from tmp_ep created for packing local + * addresses. */ + ucp_cm_client_restore_ep(wireup_ep, ucp_ep); + +#ifdef ENABLE_ASSERT + ucs_for_each_bit(rsc_index, tl_bitmap) { + ucs_assert(dev_index == context->tl_rscs[rsc_index].dev_index); + } +#endif + + tl_bitmap = ucp_context_dev_idx_tl_bitmap(context, dev_index); + status = ucp_wireup_init_lanes(ucp_ep, wireup_ep->ep_init_flags, + tl_bitmap, &addr, addr_indices); + if (status != UCS_OK) { + goto out_free_addr; + } + + status = ucp_wireup_connect_local(ucp_ep, &addr, NULL); + if (status != UCS_OK) { + goto out_free_addr; + } + + status = uct_cm_client_ep_conn_notify(uct_cm_ep); + if (status != UCS_OK) { + /* connection can't be established by UCT, no need to disconnect */ + ucp_ep->flags &= ~UCP_EP_FLAG_LOCAL_CONNECTED; + goto out_free_addr; + } + + ucp_wireup_remote_connected(ucp_ep); + +out_free_addr: + ucs_free(addr.address_list); +out: + if (status != UCS_OK) { + ucp_worker_set_ep_failed(worker, ucp_ep, &wireup_ep->super.super, + ucp_ep_get_cm_lane(ucp_ep), status); + } + + UCS_ASYNC_UNBLOCK(&worker->async); + ucp_cm_client_connect_prog_arg_free(progress_arg); + return 1; +} + +static ucs_status_t +ucp_cm_remote_data_check(const uct_cm_remote_data_t *remote_data) +{ + if (ucs_test_all_flags(remote_data->field_mask, + UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR | + UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR_LENGTH | + UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA | + 
UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA_LENGTH)) { + return UCS_OK; + } + + ucs_error("incompatible client server connection establishment protocol"); + return UCS_ERR_UNSUPPORTED; +} + +/* + * Async callback on a client side which notifies that server is connected. + */ +static void ucp_cm_client_connect_cb(uct_ep_h uct_cm_ep, void *arg, + const uct_cm_ep_client_connect_args_t + *connect_args) +{ + ucp_ep_h ucp_ep = (ucp_ep_h)arg; + ucp_worker_h worker = ucp_ep->worker; + uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL; + ucp_cm_client_connect_progress_arg_t *progress_arg; + const uct_cm_remote_data_t *remote_data; + ucs_status_t status; + + ucs_assert_always(ucs_test_all_flags(connect_args->field_mask, + (UCT_CM_EP_CLIENT_CONNECT_ARGS_FIELD_REMOTE_DATA | + UCT_CM_EP_CLIENT_CONNECT_ARGS_FIELD_STATUS))); + + remote_data = connect_args->remote_data; + status = connect_args->status; + + if (status != UCS_OK) { + /* connection can't be established by UCT, no need to disconnect */ + ucp_ep->flags &= ~UCP_EP_FLAG_LOCAL_CONNECTED; + goto err_out; + } + + status = ucp_cm_remote_data_check(remote_data); + if (status != UCS_OK) { + goto err_out; + } + + progress_arg = ucs_malloc(sizeof(*progress_arg), + "ucp_cm_client_connect_progress_arg_t"); + if (progress_arg == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err_out; + } + + progress_arg->sa_data = ucs_malloc(remote_data->conn_priv_data_length, + "sa data"); + if (progress_arg->sa_data == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err_free_arg; + } + + progress_arg->dev_addr = ucs_malloc(remote_data->dev_addr_length, + "device address"); + if (progress_arg->dev_addr == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err_free_sa_data; + } + + progress_arg->ucp_ep = ucp_ep; + memcpy(progress_arg->dev_addr, remote_data->dev_addr, + remote_data->dev_addr_length); + memcpy(progress_arg->sa_data, remote_data->conn_priv_data, + remote_data->conn_priv_data_length); + + uct_worker_progress_register_safe(worker->uct, + 
ucp_cm_client_connect_progress, + progress_arg, UCS_CALLBACKQ_FLAG_ONESHOT, + &prog_id); + ucp_worker_signal_internal(ucp_ep->worker); + return; + +err_free_sa_data: + ucs_free(progress_arg->sa_data); +err_free_arg: + ucs_free(progress_arg); +err_out: + UCS_ASYNC_BLOCK(&worker->async); + ucp_worker_set_ep_failed(worker, ucp_ep, uct_cm_ep, + ucp_ep_get_cm_lane(ucp_ep), status); + UCS_ASYNC_UNBLOCK(&worker->async); +} + +/* + * Internal flush completion callback which is a part of close protocol, + * this flush was initiated by remote peer in disconnect callback on CM lane. + */ +static void ucp_ep_cm_disconnect_flushed_cb(ucp_request_t *req) +{ + ucp_ep_h ucp_ep = req->send.ep; + /* the EP can be closed/destroyed from err callback */ + ucs_async_context_t *async = &ucp_ep->worker->async; + + UCS_ASYNC_BLOCK(async); + if (req->status == UCS_OK) { + ucs_assert(ucp_ep_is_cm_local_connected(ucp_ep)); + ucp_ep_cm_disconnect_cm_lane(ucp_ep); + } else if (ucp_ep->flags & UCP_EP_FLAG_FAILED) { + ucs_assert(!ucp_ep_is_cm_local_connected(ucp_ep)); + } else { + /* 1) ucp_ep_close(force) is called from err callback which was invoked + on remote connection reset + TODO: remove this case when IB flush cancel is fixed (#4743), + moving QP to err state should move UCP EP to error state, + then ucp_worker_set_ep_failed disconnects CM lane + 2) transport err is also possible on flush + */ + ucs_assert((req->status == UCS_ERR_CANCELED) || + (req->status == UCS_ERR_ENDPOINT_TIMEOUT)); + } + + ucs_assert(!(req->flags & UCP_REQUEST_FLAG_CALLBACK)); + ucp_request_put(req); + UCS_ASYNC_UNBLOCK(async); +} + +static unsigned ucp_ep_cm_remote_disconnect_progress(void *arg) +{ + ucp_ep_h ucp_ep = arg; + void *req; + ucs_status_t status; + + ucs_trace("ep %p: flags 0x%x cm_remote_disconnect_progress", ucp_ep, + ucp_ep->flags); + + ucs_assert(ucp_ep_get_cm_uct_ep(ucp_ep) != NULL); + + ucs_assert(ucp_ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED); + if (ucs_test_all_flags(ucp_ep->flags, 
UCP_EP_FLAG_CLOSED | + UCP_EP_FLAG_CLOSE_REQ_VALID)) { + ucp_request_complete_send(ucp_ep_ext_gen(ucp_ep)->close_req.req, UCS_OK); + return 1; + } + + if (ucp_ep->flags & UCP_EP_FLAG_CLOSED) { + /* the ep is closed by API but close req is not valid yet (checked + * above), it will be set later from scheduled + * @ref ucp_ep_close_flushed_callback */ + ucs_debug("ep %p: ep closed but request is not set, waiting for the flush callback", + ucp_ep); + return 1; + } + + /* + * TODO: set the ucp_ep to error state to prevent user from sending more + * ops. + */ + ucs_assert(ucp_ep->flags & UCP_EP_FLAG_FLUSH_STATE_VALID); + ucs_assert(!(ucp_ep->flags & UCP_EP_FLAG_CLOSED)); + req = ucp_ep_flush_internal(ucp_ep, UCT_FLUSH_FLAG_LOCAL, 0, + &ucp_request_null_param, NULL, + ucp_ep_cm_disconnect_flushed_cb, + "cm_disconnected_cb"); + if (req == NULL) { + /* flush is successfully completed in place, notify remote peer + * that we are disconnected, the EP will be destroyed from API call */ + ucp_ep_cm_disconnect_cm_lane(ucp_ep); + } else if (UCS_PTR_IS_ERR(req)) { + status = UCS_PTR_STATUS(req); + ucs_error("ucp_ep_flush_internal completed with error: %s", + ucs_status_string(status)); + goto err; + } + + return 1; + +err: + ucp_worker_set_ep_failed(ucp_ep->worker, ucp_ep, + ucp_ep_get_cm_uct_ep(ucp_ep), + ucp_ep_get_cm_lane(ucp_ep), status); + return 1; +} + +static unsigned ucp_ep_cm_disconnect_progress(void *arg) +{ + ucp_ep_h ucp_ep = arg; + uct_ep_h uct_cm_ep = ucp_ep_get_cm_uct_ep(ucp_ep); + ucs_async_context_t *async = &ucp_ep->worker->async; + ucp_request_t *close_req; + + UCS_ASYNC_BLOCK(async); + + ucs_trace("ep %p: got remote disconnect, cm_ep %p, flags 0x%x", ucp_ep, + uct_cm_ep, ucp_ep->flags); + ucs_assert(ucp_ep_get_cm_uct_ep(ucp_ep) == uct_cm_ep); + + ucp_ep->flags &= ~UCP_EP_FLAG_REMOTE_CONNECTED; + + if (ucp_ep->flags & UCP_EP_FLAG_FAILED) { + /* - ignore close event on failed ep, since all lanes are destroyed in + generic err flow + - close request is valid 
only if all lanes are flushed, transport + error is unexpected */ + ucs_assert(!(ucp_ep->flags & UCP_EP_FLAG_CLOSE_REQ_VALID)); + } else if (ucp_ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED) { + /* if the EP is local connected, need to flush it from main thread first */ + ucp_ep_cm_remote_disconnect_progress(ucp_ep); + ucp_ep_invoke_err_cb(ucp_ep, UCS_ERR_CONNECTION_RESET); + } else if (ucp_ep->flags & UCP_EP_FLAG_CLOSE_REQ_VALID) { + /* if the EP is not local connected, the EP has been closed and flushed, + CM lane is disconnected, complete close request and destroy EP */ + ucs_assert(ucp_ep->flags & UCP_EP_FLAG_CLOSED); + close_req = ucp_ep_ext_gen(ucp_ep)->close_req.req; + ucp_ep_local_disconnect_progress(close_req); + } else { + ucs_warn("ep %p: unexpected state on disconnect, flags: 0x%u", + ucp_ep, ucp_ep->flags); + } + + UCS_ASYNC_UNBLOCK(async); + return 1; +} + +static void ucp_cm_disconnect_cb(uct_ep_h uct_cm_ep, void *arg) +{ + ucp_ep_h ucp_ep = arg; + uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL; + + ucs_trace("ep %p: CM remote disconnect callback invoked, flags 0x%x", + ucp_ep, ucp_ep->flags); + + uct_worker_progress_register_safe(ucp_ep->worker->uct, + ucp_ep_cm_disconnect_progress, + ucp_ep, UCS_CALLBACKQ_FLAG_ONESHOT, + &prog_id); + ucp_worker_signal_internal(ucp_ep->worker); +} + +ucs_status_t ucp_ep_client_cm_connect_start(ucp_ep_h ucp_ep, + const ucp_ep_params_t *params) +{ + ucp_wireup_ep_t *wireup_ep = ucp_ep_get_cm_wireup_ep(ucp_ep); + ucp_worker_h worker = ucp_ep->worker; + uct_ep_h cm_ep; + uct_ep_params_t cm_lane_params; + ucs_status_t status; + + wireup_ep->ep_init_flags = ucp_ep_init_flags(ucp_ep->worker, params); + + cm_lane_params.field_mask = UCT_EP_PARAM_FIELD_CM | + UCT_EP_PARAM_FIELD_USER_DATA | + UCT_EP_PARAM_FIELD_SOCKADDR | + UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS | + UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB | + UCT_EP_PARAM_FIELD_SOCKADDR_CONNECT_CB_CLIENT | + UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB; + + cm_lane_params.user_data = 
ucp_ep; + cm_lane_params.sockaddr = ¶ms->sockaddr; + cm_lane_params.sockaddr_cb_flags = UCT_CB_FLAG_ASYNC; + cm_lane_params.sockaddr_pack_cb = ucp_cm_client_priv_pack_cb; + cm_lane_params.sockaddr_cb_client = ucp_cm_client_connect_cb; + cm_lane_params.disconnect_cb = ucp_cm_disconnect_cb; + ucs_assert_always(ucp_worker_num_cm_cmpts(worker) == 1); + cm_lane_params.cm = worker->cms[0].cm; + + status = uct_ep_create(&cm_lane_params, &cm_ep); + if (status != UCS_OK) { + /* coverity[leaked_storage] */ + return status; + } + + ucp_wireup_ep_set_next_ep(&wireup_ep->super.super, cm_ep); + ucp_ep_flush_state_reset(ucp_ep); + + return UCS_OK; +} + +static unsigned ucp_cm_server_conn_request_progress(void *arg) +{ + ucp_conn_request_h conn_request = arg; + ucp_listener_h listener = conn_request->listener; + ucp_worker_h worker = listener->worker; + ucp_ep_h ep; + + ucs_trace_func("listener %p, connect request %p", listener, conn_request); + + if (listener->conn_cb) { + listener->conn_cb(conn_request, listener->arg); + return 1; + } + + UCS_ASYNC_BLOCK(&worker->async); + ucp_ep_create_server_accept(worker, conn_request, &ep); + UCS_ASYNC_UNBLOCK(&worker->async); + return 1; +} + +void ucp_cm_server_conn_request_cb(uct_listener_h listener, void *arg, + const uct_cm_listener_conn_request_args_t + *conn_req_args) +{ + ucp_listener_h ucp_listener = arg; + uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL; + ucp_conn_request_h ucp_conn_request; + uct_conn_request_h conn_request; + const uct_cm_remote_data_t *remote_data; + ucs_status_t status; + + ucs_assert_always(ucs_test_all_flags(conn_req_args->field_mask, + (UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CONN_REQUEST | + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_REMOTE_DATA | + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_DEV_NAME | + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CLIENT_ADDR))); + + conn_request = conn_req_args->conn_request; + remote_data = conn_req_args->remote_data; + + status = ucp_cm_remote_data_check(remote_data); + if 
(status != UCS_OK) { + goto err_reject; + } + + ucp_conn_request = ucs_malloc(ucs_offsetof(ucp_conn_request_t, sa_data) + + remote_data->conn_priv_data_length, + "ucp_conn_request_h"); + if (ucp_conn_request == NULL) { + ucs_error("failed to allocate connect request, rejecting connection " + "request %p on TL listener %p", + conn_request, listener); + goto err_reject; + } + + ucp_conn_request->remote_dev_addr = ucs_malloc(remote_data->dev_addr_length, + "remote device address"); + if (ucp_conn_request->remote_dev_addr == NULL) { + ucs_error("failed to allocate device address, rejecting connection " + "request %p on TL listener %p", + conn_request, listener); + goto err_free_ucp_conn_request; + } + + ucp_conn_request->listener = ucp_listener; + ucp_conn_request->uct.listener = listener; + ucp_conn_request->uct_req = conn_request; + + status = ucs_sockaddr_copy((struct sockaddr *)&ucp_conn_request->client_address, + conn_req_args->client_address.addr); + if (status != UCS_OK) { + goto err_free_remote_dev_addr; + } + + ucs_strncpy_safe(ucp_conn_request->dev_name, conn_req_args->dev_name, + UCT_DEVICE_NAME_MAX); + memcpy(ucp_conn_request->remote_dev_addr, remote_data->dev_addr, + remote_data->dev_addr_length); + memcpy(&ucp_conn_request->sa_data, remote_data->conn_priv_data, + remote_data->conn_priv_data_length); + + uct_worker_progress_register_safe(ucp_listener->worker->uct, + ucp_cm_server_conn_request_progress, + ucp_conn_request, + UCS_CALLBACKQ_FLAG_ONESHOT, &prog_id); + + /* If the worker supports the UCP_FEATURE_WAKEUP feature, signal the user so + * that he can wake-up on this event */ + ucp_worker_signal_internal(ucp_listener->worker); + return; + +err_free_remote_dev_addr: + ucs_free(ucp_conn_request->remote_dev_addr); +err_free_ucp_conn_request: + ucs_free(ucp_conn_request); +err_reject: + status = uct_listener_reject(listener, conn_request); + if (status != UCS_OK) { + ucs_warn("failed to reject connect request %p on listener %p", + conn_request, 
listener); + } +} + +ucs_status_t +ucp_ep_cm_server_create_connected(ucp_worker_h worker, unsigned ep_init_flags, + const ucp_unpacked_address_t *remote_addr, + ucp_conn_request_h conn_request, + ucp_ep_h *ep_p) +{ + uint64_t tl_bitmap = ucp_context_dev_tl_bitmap(worker->context, + conn_request->dev_name); + ucp_ep_h ep; + ucs_status_t status; + + /* Create and connect TL part */ + status = ucp_ep_create_to_worker_addr(worker, tl_bitmap, remote_addr, + ep_init_flags, + "conn_request on uct_listener", &ep); + if (status != UCS_OK) { + ucs_warn("server ep %p failed to connect to worker address on device %s, tl_bitmap 0x%zx, status %s", + ep, conn_request->dev_name, tl_bitmap, + ucs_status_string(status)); + uct_listener_reject(conn_request->uct.listener, conn_request->uct_req); + goto out; + } + + status = ucp_wireup_connect_local(ep, remote_addr, NULL); + if (status != UCS_OK) { + ucs_warn("server ep %p failed to connect to remote address on device %s, tl_bitmap 0x%zx, status %s", + ep, conn_request->dev_name, tl_bitmap, + ucs_status_string(status)); + uct_listener_reject(conn_request->uct.listener, conn_request->uct_req); + ucp_ep_destroy_internal(ep); + goto out; + } + + status = ucp_ep_cm_connect_server_lane(ep, conn_request->uct.listener, + conn_request->uct_req); + if (status != UCS_OK) { + ucs_warn("server ep %p failed to connect CM lane on device %s, tl_bitmap 0x%zx, status %s", + ep, conn_request->dev_name, tl_bitmap, + ucs_status_string(status)); + ucp_ep_destroy_internal(ep); + goto out; + } + + ep->flags |= UCP_EP_FLAG_LISTENER; + ucp_ep_ext_gen(ep)->listener = conn_request->listener; + ucp_ep_update_dest_ep_ptr(ep, conn_request->sa_data.ep_ptr); + ucp_listener_schedule_accept_cb(ep); + *ep_p = ep; + +out: + ucs_free(conn_request->remote_dev_addr); + ucs_free(conn_request); + + return status; +} + +static ssize_t ucp_cm_server_priv_pack_cb(void *arg, + const uct_cm_ep_priv_data_pack_args_t + *pack_args, void *priv_data) +{ + ucp_wireup_sockaddr_data_t 
*sa_data = priv_data; + ucp_ep_h ep = arg; + ucp_worker_h worker = ep->worker; + uint64_t tl_bitmap; + uct_cm_attr_t cm_attr; + void* ucp_addr; + size_t ucp_addr_size; + ucp_rsc_index_t rsc_index; + ucp_rsc_index_t dev_index; + ucs_status_t status; + + UCS_ASYNC_BLOCK(&worker->async); + + tl_bitmap = ucp_ep_get_tl_bitmap(ep); + /* make sure that all lanes are created on correct device */ + ucs_assert_always(pack_args->field_mask & + UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME); + ucs_assert(!(tl_bitmap & ~ucp_context_dev_tl_bitmap(worker->context, + pack_args->dev_name))); + + status = ucp_address_pack(worker, ep, tl_bitmap, + UCP_ADDRESS_PACK_FLAG_IFACE_ADDR | + UCP_ADDRESS_PACK_FLAG_EP_ADDR, NULL, + &ucp_addr_size, &ucp_addr); + if (status != UCS_OK) { + goto out; + } + + cm_attr.field_mask = UCT_CM_ATTR_FIELD_MAX_CONN_PRIV; + ucs_assert(ucp_worker_num_cm_cmpts(worker) == 1); + status = uct_cm_query(worker->cms[0].cm, &cm_attr); + if (status != UCS_OK) { + goto out; + } + + if (cm_attr.max_conn_priv < (sizeof(*sa_data) + ucp_addr_size)) { + status = UCS_ERR_BUFFER_TOO_SMALL; + goto free_addr; + } + + rsc_index = ucs_ffs64_safe(tl_bitmap); + ucs_assert(rsc_index != UCP_NULL_RESOURCE); + dev_index = worker->context->tl_rscs[rsc_index].dev_index; + ucp_cm_priv_data_pack(sa_data, ep, dev_index, ucp_addr, ucp_addr_size); + +free_addr: + ucs_free(ucp_addr); +out: + if (status == UCS_OK) { + ep->flags |= UCP_EP_FLAG_LOCAL_CONNECTED; + } else { + ucp_worker_set_ep_failed(worker, ep, + &ucp_ep_get_cm_wireup_ep(ep)->super.super, + ucp_ep_get_cm_lane(ep), status); + } + + UCS_ASYNC_UNBLOCK(&worker->async); + + return (status == UCS_OK) ? 
(sizeof(*sa_data) + ucp_addr_size) : status; +} + +/* + * The main thread progress part of connection establishment on server side + */ +static unsigned ucp_cm_server_conn_notify_progress(void *arg) +{ + ucp_ep_h ucp_ep = arg; + + UCS_ASYNC_BLOCK(&ucp_ep->worker->async); + ucp_wireup_remote_connected(ucp_ep); + UCS_ASYNC_UNBLOCK(&ucp_ep->worker->async); + return 1; +} + +/* + * Async callback on a server side which notifies that client is connected. + */ +static void ucp_cm_server_conn_notify_cb(uct_ep_h ep, void *arg, + const uct_cm_ep_server_conn_notify_args_t + *notify_args) +{ + ucp_ep_h ucp_ep = arg; + uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL; + ucp_lane_index_t cm_lane; + ucs_status_t status; + + ucs_assert_always(notify_args->field_mask & + UCT_CM_EP_SERVER_CONN_NOTIFY_ARGS_FIELD_STATUS); + + status = notify_args->status; + + if (status == UCS_OK) { + uct_worker_progress_register_safe(ucp_ep->worker->uct, + ucp_cm_server_conn_notify_progress, + ucp_ep, UCS_CALLBACKQ_FLAG_ONESHOT, + &prog_id); + ucp_worker_signal_internal(ucp_ep->worker); + } else { + /* if reject is arrived on server side, then UCT does something wrong */ + ucs_assert(status != UCS_ERR_REJECTED); + cm_lane = ucp_ep_get_cm_lane(ucp_ep); + ucp_worker_set_ep_failed(ucp_ep->worker, ucp_ep, + ucp_ep->uct_eps[cm_lane], cm_lane, status); + } +} + +ucs_status_t ucp_ep_cm_connect_server_lane(ucp_ep_h ep, + uct_listener_h uct_listener, + uct_conn_request_h uct_conn_req) +{ + ucp_worker_h worker = ep->worker; + ucp_lane_index_t lane = ucp_ep_get_cm_lane(ep); + uct_ep_params_t uct_ep_params; + uct_ep_h uct_ep; + ucs_status_t status; + + ucs_assert(lane != UCP_NULL_LANE); + ucs_assert(ep->uct_eps[lane] == NULL); + + /* TODO: split CM and wireup lanes */ + status = ucp_wireup_ep_create(ep, &ep->uct_eps[lane]); + if (status != UCS_OK) { + ucs_warn("server ep %p failed to create wireup CM lane, status %s", + ep, ucs_status_string(status)); + uct_listener_reject(uct_listener, uct_conn_req); + 
return status; + } + + /* create a server side CM endpoint */ + ucs_trace("ep %p: uct_ep[%d]", ep, lane); + uct_ep_params.field_mask = UCT_EP_PARAM_FIELD_CM | + UCT_EP_PARAM_FIELD_CONN_REQUEST | + UCT_EP_PARAM_FIELD_USER_DATA | + UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS | + UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB | + UCT_EP_PARAM_FIELD_SOCKADDR_NOTIFY_CB_SERVER | + UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB; + + ucs_assertv_always(ucp_worker_num_cm_cmpts(worker) == 1, + "multiple CMs are not supported"); + uct_ep_params.cm = worker->cms[0].cm; + uct_ep_params.user_data = ep; + uct_ep_params.conn_request = uct_conn_req; + uct_ep_params.sockaddr_cb_flags = UCT_CB_FLAG_ASYNC; + uct_ep_params.sockaddr_pack_cb = ucp_cm_server_priv_pack_cb; + uct_ep_params.sockaddr_cb_server = ucp_cm_server_conn_notify_cb; + uct_ep_params.disconnect_cb = ucp_cm_disconnect_cb; + + status = uct_ep_create(&uct_ep_params, &uct_ep); + if (status != UCS_OK) { + /* coverity[leaked_storage] */ + return status; + } + + ucp_wireup_ep_set_next_ep(ep->uct_eps[lane], uct_ep); + return UCS_OK; +} + +void ucp_ep_cm_disconnect_cm_lane(ucp_ep_h ucp_ep) +{ + uct_ep_h uct_cm_ep = ucp_ep_get_cm_uct_ep(ucp_ep); + ucs_status_t status; + + ucs_assert_always(uct_cm_ep != NULL); + /* No reason to try disconnect twice */ + ucs_assert(ucp_ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED); + ucs_assert(!(ucp_ep->flags & UCP_EP_FLAG_FAILED)); + + ucp_ep->flags &= ~UCP_EP_FLAG_LOCAL_CONNECTED; + /* this will invoke @ref ucp_cm_disconnect_cb on remote side */ + status = uct_ep_disconnect(uct_cm_ep, 0); + if (status != UCS_OK) { + ucs_warn("failed to disconnect CM lane %p of ep %p, %s", ucp_ep, + uct_cm_ep, ucs_status_string(status)); + } +} + +ucp_request_t* ucp_ep_cm_close_request_get(ucp_ep_h ep) +{ + ucp_request_t *request = ucp_request_get(ep->worker); + + if (request == NULL) { + ucs_error("failed to allocate close request for ep %p", ep); + return NULL; + } + + request->status = UCS_OK; + request->flags = 0; + request->send.ep = 
ep; + request->send.flush.uct_flags = UCT_FLUSH_FLAG_LOCAL; + + return request; +} + +static int ucp_cm_cbs_remove_filter(const ucs_callbackq_elem_t *elem, void *arg) +{ + ucp_cm_client_connect_progress_arg_t *client_connect_arg; + + if (elem->cb == ucp_cm_client_connect_progress) { + client_connect_arg = elem->arg; + if (client_connect_arg->ucp_ep == arg) { + ucp_cm_client_connect_prog_arg_free(client_connect_arg); + return 1; + } else { + return 0; + } + } else if ((elem->cb == ucp_ep_cm_disconnect_progress) || + (elem->cb == ucp_cm_server_conn_notify_progress)) { + return arg == elem->arg; + } else { + return 0; + } +} + +void ucp_ep_cm_slow_cbq_cleanup(ucp_ep_h ep) +{ + ucs_callbackq_remove_if(&ep->worker->uct->progress_q, + ucp_cm_cbs_remove_filter, ep); +} diff --git a/src/ucp/wireup/wireup_cm.h b/src/ucp/wireup/wireup_cm.h new file mode 100644 index 00000000000..1c42516b11c --- /dev/null +++ b/src/ucp/wireup/wireup_cm.h @@ -0,0 +1,50 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifndef WIREUP_CM_H_ +#define WIREUP_CM_H_ + +#include +#include +#include + + +typedef struct ucp_cm_client_connect_progress_arg { + ucp_ep_h ucp_ep; + ucp_wireup_sockaddr_data_t *sa_data; + uct_device_addr_t *dev_addr; +} ucp_cm_client_connect_progress_arg_t; + + +unsigned ucp_cm_ep_init_flags(const ucp_worker_h worker, + const ucp_ep_params_t *params); + +int ucp_ep_init_flags_has_cm(unsigned ep_init_flags); + +ucs_status_t ucp_ep_cm_connect_server_lane(ucp_ep_h ep, + uct_listener_h uct_listener, + uct_conn_request_h uct_conn_req); + +ucs_status_t ucp_ep_client_cm_connect_start(ucp_ep_h ucp_ep, + const ucp_ep_params_t *params); + +void ucp_cm_server_conn_request_cb(uct_listener_h listener, void *arg, + const uct_cm_listener_conn_request_args_t + *conn_req_args); + +ucs_status_t +ucp_ep_cm_server_create_connected(ucp_worker_h worker, unsigned ep_init_flags, + const ucp_unpacked_address_t *remote_addr, + ucp_conn_request_h conn_request, + ucp_ep_h *ep_p); + +void ucp_ep_cm_disconnect_cm_lane(ucp_ep_h ucp_ep); + +ucp_request_t* ucp_ep_cm_close_request_get(ucp_ep_h ep); + +void ucp_ep_cm_slow_cbq_cleanup(ucp_ep_h ep); + +#endif /* WIREUP_CM_H_ */ diff --git a/src/ucp/wireup/wireup_ep.c b/src/ucp/wireup/wireup_ep.c index 77e26c52975..53fdc20cd98 100644 --- a/src/ucp/wireup/wireup_ep.c +++ b/src/ucp/wireup/wireup_ep.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "wireup_ep.h" #include "wireup.h" @@ -13,6 +17,7 @@ #include #include #include +#include #include #include @@ -32,7 +37,8 @@ static ucs_status_t ucp_wireup_ep_connect_to_ep(uct_ep_h uct_ep, const uct_device_addr_t *dev_addr, const uct_ep_addr_t *ep_addr) { - ucp_wireup_ep_t *wireup_ep = ucs_derived_of(uct_ep, ucp_wireup_ep_t); + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); + wireup_ep->flags |= UCP_WIREUP_EP_FLAG_LOCAL_CONNECTED; return uct_ep_connect_to_ep(wireup_ep->super.uct_ep, dev_addr, ep_addr); } @@ -132,7 +138,7 @@ ucs_status_t ucp_wireup_ep_progress_pending(uct_pending_req_t *self) status = req->func(req); if (status == UCS_OK) { - ucs_atomic_add32(&wireup_ep->pending_count, -1); + ucs_atomic_sub32(&wireup_ep->pending_count, 1); ucs_free(proxy_req); } return status; @@ -146,7 +152,7 @@ ucp_wireup_ep_pending_req_release(uct_pending_req_t *self, void *arg) ucp_wireup_ep_t *wireup_ep = proxy_req->send.proxy.wireup_ep; ucp_request_t *req; - ucs_atomic_add32(&wireup_ep->pending_count, -1); + ucs_atomic_sub32(&wireup_ep->pending_count, 1); if (proxy_req->send.proxy.req->func == ucp_wireup_msg_progress) { req = ucs_container_of(proxy_req->send.proxy.req, ucp_request_t, @@ -162,7 +168,7 @@ static ucs_status_t ucp_wireup_ep_pending_add(uct_ep_h uct_ep, uct_pending_req_t *req, unsigned flags) { - ucp_wireup_ep_t *wireup_ep = ucs_derived_of(uct_ep, ucp_wireup_ep_t); + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); ucp_ep_h ucp_ep = wireup_ep->super.ucp_ep; ucp_worker_h worker = ucp_ep->worker; ucp_request_t *proxy_req; @@ -198,6 +204,7 @@ static ucs_status_t ucp_wireup_ep_pending_add(uct_ep_h uct_ep, } out: UCS_ASYNC_UNBLOCK(&worker->async); + /* coverity[leaked_storage] */ return status; } @@ -205,7 +212,7 @@ static void ucp_wireup_ep_pending_purge(uct_ep_h uct_ep, uct_pending_purge_callback_t cb, void *arg) { - ucp_wireup_ep_t *wireup_ep = ucs_derived_of(uct_ep, ucp_wireup_ep_t); + 
ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); ucp_worker_h worker; uct_pending_req_t *req; ucp_request_t *ucp_req; @@ -232,7 +239,7 @@ static ssize_t ucp_wireup_ep_am_bcopy(uct_ep_h uct_ep, uint8_t id, uct_pack_callback_t pack_cb, void *arg, unsigned flags) { - ucp_wireup_ep_t *wireup_ep = ucs_derived_of(uct_ep, ucp_wireup_ep_t); + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); if (id == UCP_AM_ID_WIREUP) { return uct_ep_am_bcopy(ucp_wireup_ep_get_msg_ep(wireup_ep), @@ -247,32 +254,29 @@ UCS_CLASS_DEFINE_NAMED_NEW_FUNC(ucp_wireup_ep_create, ucp_wireup_ep_t, uct_ep_t, ucp_ep_h); ucs_status_t -ucp_wireup_ep_connect_aux(ucp_wireup_ep_t *wireup_ep, - const ucp_ep_params_t *params, unsigned address_count, - const ucp_address_entry_t *address_list) +ucp_wireup_ep_connect_aux(ucp_wireup_ep_t *wireup_ep, unsigned ep_init_flags, + const ucp_unpacked_address_t *remote_address) { - ucp_ep_h ucp_ep = wireup_ep->super.ucp_ep; - ucp_worker_h worker = ucp_ep->worker; + ucp_ep_h ucp_ep = wireup_ep->super.ucp_ep; + ucp_worker_h worker = ucp_ep->worker; + ucp_wireup_select_info_t select_info = {0}; uct_ep_params_t uct_ep_params; const ucp_address_entry_t *aux_addr; ucp_worker_iface_t *wiface; - ucp_rsc_index_t rsc_index; - unsigned aux_addr_index; ucs_status_t status; /* select an auxiliary transport which would be used to pass connection * establishment messages. 
*/ - status = ucp_wireup_select_aux_transport(ucp_ep, params, address_list, - address_count, &rsc_index, - &aux_addr_index); + status = ucp_wireup_select_aux_transport(ucp_ep, ep_init_flags, + remote_address, &select_info); if (status != UCS_OK) { return status; } - wireup_ep->aux_rsc_index = rsc_index; - aux_addr = &address_list[aux_addr_index]; - wiface = ucp_worker_iface(worker, rsc_index); + wireup_ep->aux_rsc_index = select_info.rsc_index; + aux_addr = &remote_address->address_list[select_info.addr_index]; + wiface = ucp_worker_iface(worker, select_info.rsc_index); /* create auxiliary endpoint connected to the remote iface. */ uct_ep_params.field_mask = UCT_EP_PARAM_FIELD_IFACE | @@ -291,14 +295,15 @@ ucp_wireup_ep_connect_aux(ucp_wireup_ep_t *wireup_ep, ucs_debug("ep %p: wireup_ep %p created aux_ep %p to %s using " UCT_TL_RESOURCE_DESC_FMT, ucp_ep, wireup_ep, wireup_ep->aux_ep, ucp_ep_peer_name(ucp_ep), - UCT_TL_RESOURCE_DESC_ARG(&worker->context->tl_rscs[rsc_index].tl_rsc)); + UCT_TL_RESOURCE_DESC_ARG(&worker->context->tl_rscs[select_info.rsc_index].tl_rsc)); + return UCS_OK; } static ucs_status_t ucp_wireup_ep_flush(uct_ep_h uct_ep, unsigned flags, uct_completion_t *comp) { - ucp_wireup_ep_t *wireup_ep = ucs_derived_of(uct_ep, ucp_wireup_ep_t); + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); if (flags & UCT_FLUSH_FLAG_CANCEL) { if (wireup_ep->aux_ep) { @@ -313,37 +318,38 @@ static ucs_status_t ucp_wireup_ep_flush(uct_ep_h uct_ep, unsigned flags, UCS_CLASS_INIT_FUNC(ucp_wireup_ep_t, ucp_ep_h ucp_ep) { static uct_iface_ops_t ops = { - .ep_connect_to_ep = ucp_wireup_ep_connect_to_ep, - .ep_flush = ucp_wireup_ep_flush, - .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(ucp_wireup_ep_t), - .ep_pending_add = ucp_wireup_ep_pending_add, - .ep_pending_purge = ucp_wireup_ep_pending_purge, - .ep_put_short = (void*)ucs_empty_function_return_no_resource, - .ep_put_bcopy = (void*)ucp_wireup_ep_bcopy_send_func, - .ep_put_zcopy = (void*)ucs_empty_function_return_no_resource, 
- .ep_get_short = (void*)ucs_empty_function_return_no_resource, - .ep_get_bcopy = (void*)ucs_empty_function_return_no_resource, - .ep_get_zcopy = (void*)ucs_empty_function_return_no_resource, - .ep_am_short = (void*)ucs_empty_function_return_no_resource, - .ep_am_bcopy = ucp_wireup_ep_am_bcopy, - .ep_am_zcopy = (void*)ucs_empty_function_return_no_resource, - .ep_tag_eager_short = (void*)ucs_empty_function_return_no_resource, - .ep_tag_eager_bcopy = (void*)ucp_wireup_ep_bcopy_send_func, - .ep_tag_eager_zcopy = (void*)ucs_empty_function_return_no_resource, - .ep_tag_rndv_zcopy = (void*)ucs_empty_function_return_ptr_no_resource, - .ep_tag_rndv_request = (void*)ucs_empty_function_return_no_resource, - .ep_atomic64_post = (void*)ucs_empty_function_return_no_resource, - .ep_atomic64_fetch = (void*)ucs_empty_function_return_no_resource, - .ep_atomic_cswap64 = (void*)ucs_empty_function_return_no_resource, - .ep_atomic32_post = (void*)ucs_empty_function_return_no_resource, - .ep_atomic32_fetch = (void*)ucs_empty_function_return_no_resource, - .ep_atomic_cswap32 = (void*)ucs_empty_function_return_no_resource + .ep_connect_to_ep = ucp_wireup_ep_connect_to_ep, + .ep_flush = ucp_wireup_ep_flush, + .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(ucp_wireup_ep_t), + .ep_pending_add = ucp_wireup_ep_pending_add, + .ep_pending_purge = ucp_wireup_ep_pending_purge, + .ep_put_short = (uct_ep_put_short_func_t)ucs_empty_function_return_no_resource, + .ep_put_bcopy = (uct_ep_put_bcopy_func_t)ucp_wireup_ep_bcopy_send_func, + .ep_put_zcopy = (uct_ep_put_zcopy_func_t)ucs_empty_function_return_no_resource, + .ep_get_short = (uct_ep_get_short_func_t)ucs_empty_function_return_no_resource, + .ep_get_bcopy = (uct_ep_get_bcopy_func_t)ucs_empty_function_return_no_resource, + .ep_get_zcopy = (uct_ep_get_zcopy_func_t)ucs_empty_function_return_no_resource, + .ep_am_short = (uct_ep_am_short_func_t)ucs_empty_function_return_no_resource, + .ep_am_bcopy = ucp_wireup_ep_am_bcopy, + .ep_am_zcopy = 
(uct_ep_am_zcopy_func_t)ucs_empty_function_return_no_resource, + .ep_tag_eager_short = (uct_ep_tag_eager_short_func_t)ucs_empty_function_return_no_resource, + .ep_tag_eager_bcopy = (uct_ep_tag_eager_bcopy_func_t)ucp_wireup_ep_bcopy_send_func, + .ep_tag_eager_zcopy = (uct_ep_tag_eager_zcopy_func_t)ucs_empty_function_return_no_resource, + .ep_tag_rndv_zcopy = (uct_ep_tag_rndv_zcopy_func_t)ucs_empty_function_return_ptr_no_resource, + .ep_tag_rndv_request = (uct_ep_tag_rndv_request_func_t)ucs_empty_function_return_no_resource, + .ep_atomic64_post = (uct_ep_atomic64_post_func_t)ucs_empty_function_return_no_resource, + .ep_atomic64_fetch = (uct_ep_atomic64_fetch_func_t)ucs_empty_function_return_no_resource, + .ep_atomic_cswap64 = (uct_ep_atomic_cswap64_func_t)ucs_empty_function_return_no_resource, + .ep_atomic32_post = (uct_ep_atomic32_post_func_t)ucs_empty_function_return_no_resource, + .ep_atomic32_fetch = (uct_ep_atomic32_fetch_func_t)ucs_empty_function_return_no_resource, + .ep_atomic_cswap32 = (uct_ep_atomic_cswap32_func_t)ucs_empty_function_return_no_resource }; UCS_CLASS_CALL_SUPER_INIT(ucp_proxy_ep_t, &ops, ucp_ep, NULL, 0); self->aux_ep = NULL; self->sockaddr_ep = NULL; + self->tmp_ep = NULL; self->aux_rsc_index = UCP_NULL_RESOURCE; self->sockaddr_rsc_index = UCP_NULL_RESOURCE; self->pending_count = 0; @@ -380,6 +386,10 @@ static UCS_CLASS_CLEANUP_FUNC(ucp_wireup_ep_t) uct_ep_destroy(self->sockaddr_ep); } + if (self->tmp_ep != NULL) { + ucp_ep_disconnected(self->tmp_ep, 1); + } + UCS_ASYNC_BLOCK(&worker->async); --worker->flush_ops_count; UCS_ASYNC_UNBLOCK(&worker->async); @@ -389,7 +399,7 @@ UCS_CLASS_DEFINE(ucp_wireup_ep_t, ucp_proxy_ep_t); ucp_rsc_index_t ucp_wireup_ep_get_aux_rsc_index(uct_ep_h uct_ep) { - ucp_wireup_ep_t *wireup_ep = ucs_derived_of(uct_ep, ucp_wireup_ep_t); + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); if (!ucp_wireup_ep_test(uct_ep)) { return UCP_NULL_RESOURCE; @@ -402,24 +412,28 @@ ucp_rsc_index_t 
ucp_wireup_ep_get_aux_rsc_index(uct_ep_h uct_ep) return wireup_ep->aux_rsc_index; } -ucs_status_t ucp_wireup_ep_connect(uct_ep_h uct_ep, const ucp_ep_params_t *params, - ucp_rsc_index_t rsc_index, int connect_aux, - unsigned address_count, - const ucp_address_entry_t *address_list) +ucs_status_t ucp_wireup_ep_connect(uct_ep_h uct_ep, unsigned ep_init_flags, + ucp_rsc_index_t rsc_index, + unsigned path_index, int connect_aux, + const ucp_unpacked_address_t *remote_address) { - ucp_wireup_ep_t *wireup_ep = ucs_derived_of(uct_ep, ucp_wireup_ep_t); + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); + ucp_ep_h ucp_ep = wireup_ep->super.ucp_ep; + ucp_worker_h worker = ucp_ep->worker; uct_ep_params_t uct_ep_params; - ucp_ep_h ucp_ep = wireup_ep->super.ucp_ep; - ucp_worker_h worker = ucp_ep->worker; ucs_status_t status; uct_ep_h next_ep; - ucs_assert(ucp_wireup_ep_test(uct_ep)); + ucs_assert(wireup_ep != NULL); - uct_ep_params.field_mask = UCT_EP_PARAM_FIELD_IFACE; + uct_ep_params.field_mask = UCT_EP_PARAM_FIELD_IFACE | + UCT_EP_PARAM_FIELD_PATH_INDEX; + uct_ep_params.path_index = path_index; uct_ep_params.iface = ucp_worker_iface(worker, rsc_index)->iface; status = uct_ep_create(&uct_ep_params, &next_ep); if (status != UCS_OK) { + /* make Coverity happy */ + ucs_assert(next_ep == NULL); goto err; } @@ -431,8 +445,8 @@ ucs_status_t ucp_wireup_ep_connect(uct_ep_h uct_ep, const ucp_ep_params_t *param /* we need to create an auxiliary transport only for active messages */ if (connect_aux) { - status = ucp_wireup_ep_connect_aux(wireup_ep, params, address_count, - address_list); + status = ucp_wireup_ep_connect_aux(wireup_ep, ep_init_flags, + remote_address); if (status != UCS_OK) { goto err_destroy_next_ep; } @@ -473,7 +487,8 @@ static ucs_status_t ucp_wireup_ep_pack_sockaddr_aux_tls(ucp_worker_h worker, } if (found_supported_tl) { - status = ucp_address_pack(worker, NULL, tl_bitmap, NULL, + status = ucp_address_pack(worker, NULL, tl_bitmap, + UCP_ADDRESS_PACK_FLAGS_ALL, 
NULL, address_length_p, (void**)address_p); } else { ucs_error("no supported sockaddr auxiliary transports found for %s", dev_name); @@ -484,33 +499,43 @@ static ucs_status_t ucp_wireup_ep_pack_sockaddr_aux_tls(ucp_worker_h worker, return status; } -ssize_t ucp_wireup_ep_sockaddr_fill_private_data(void *arg, const char *dev_name, - void *priv_data) +ssize_t ucp_wireup_ep_sockaddr_fill_private_data(void *arg, + const uct_cm_ep_priv_data_pack_args_t + *pack_args, void *priv_data) { - ucp_wireup_client_data_t *client_data = priv_data; - ucp_wireup_ep_t *wireup_ep = arg; - ucp_ep_h ucp_ep = wireup_ep->super.ucp_ep; - ucp_rsc_index_t sockaddr_rsc = wireup_ep->sockaddr_rsc_index; - ucp_worker_h worker = ucp_ep->worker; - ucp_context_h context = worker->context; + ucp_wireup_sockaddr_data_t *sa_data = priv_data; + ucp_wireup_ep_t *wireup_ep = arg; + ucp_ep_h ucp_ep = wireup_ep->super.ucp_ep; + ucp_rsc_index_t sockaddr_rsc = wireup_ep->sockaddr_rsc_index; + ucp_worker_h worker = ucp_ep->worker; + ucp_context_h context = worker->context; size_t address_length, conn_priv_len; ucp_address_t *worker_address, *rsc_address; uct_iface_attr_t *attrs; ucs_status_t status; uint64_t tl_bitmap; char aux_tls_str[64]; + const char *dev_name; + + ucs_assert_always(pack_args->field_mask & + UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME); - status = ucp_address_pack(worker, NULL, -1, NULL, &address_length, - (void**)&worker_address); + dev_name = pack_args->dev_name; + + status = ucp_address_pack(worker, NULL, UINT64_MAX, + UCP_ADDRESS_PACK_FLAGS_ALL, NULL, + &address_length, (void**)&worker_address); if (status != UCS_OK) { goto err; } - conn_priv_len = sizeof(*client_data) + address_length; + conn_priv_len = sizeof(*sa_data) + address_length; /* pack client data */ - client_data->err_mode = ucp_ep_config(ucp_ep)->key.err_mode; - client_data->ep_ptr = (uintptr_t)ucp_ep; + ucs_assert((int)ucp_ep_config(ucp_ep)->key.err_mode <= UINT8_MAX); + sa_data->err_mode = 
ucp_ep_config(ucp_ep)->key.err_mode; + sa_data->ep_ptr = (uintptr_t)ucp_ep; + sa_data->dev_index = UCP_NULL_RESOURCE; /* Not used */ attrs = ucp_worker_iface_get_attr(worker, sockaddr_rsc); @@ -519,13 +544,14 @@ ssize_t ucp_wireup_ep_sockaddr_fill_private_data(void *arg, const char *dev_name /* since the full worker address is too large to fit into the trasnport's * private data, try to pack sockaddr aux tls to pass in the address */ - status = ucp_wireup_ep_pack_sockaddr_aux_tls(worker, dev_name, &tl_bitmap, - &rsc_address, &address_length); + status = ucp_wireup_ep_pack_sockaddr_aux_tls(worker, dev_name, + &tl_bitmap, &rsc_address, + &address_length); if (status != UCS_OK) { goto err_free_address; } - conn_priv_len = sizeof(*client_data) + address_length; + conn_priv_len = sizeof(*sa_data) + address_length; /* check the private data length limitation again, now with partial * resources packed (and not the entire worker address) */ @@ -543,8 +569,8 @@ ssize_t ucp_wireup_ep_sockaddr_fill_private_data(void *arg, const char *dev_name goto err_free_address; } - client_data->is_full_addr = 0; - memcpy(client_data + 1, rsc_address, address_length); + sa_data->addr_mode = UCP_WIREUP_SA_DATA_PARTIAL_ADDR; + memcpy(sa_data + 1, rsc_address, address_length); ucp_ep->flags |= UCP_EP_FLAG_SOCKADDR_PARTIAL_ADDR; ucs_free(rsc_address); @@ -552,14 +578,13 @@ ssize_t ucp_wireup_ep_sockaddr_fill_private_data(void *arg, const char *dev_name ucs_trace("sockaddr tl ("UCT_TL_RESOURCE_DESC_FMT") sending partial address: " "(%s transports) (len=%zu) to server. 
" "total client priv data len: %zu", - context->tl_rscs[sockaddr_rsc].tl_rsc.tl_name, dev_name, + UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[sockaddr_rsc].tl_rsc), ucp_tl_bitmap_str(context, tl_bitmap, aux_tls_str, sizeof(aux_tls_str)), address_length, conn_priv_len); - } else { - client_data->is_full_addr = 1; - memcpy(client_data + 1, worker_address, address_length); + sa_data->addr_mode = UCP_WIREUP_SA_DATA_FULL_ADDR; + memcpy(sa_data + 1, worker_address, address_length); } ucp_worker_release_address(worker, worker_address); @@ -574,7 +599,7 @@ ssize_t ucp_wireup_ep_sockaddr_fill_private_data(void *arg, const char *dev_name ucs_status_t ucp_wireup_ep_connect_to_sockaddr(uct_ep_h uct_ep, const ucp_ep_params_t *params) { - ucp_wireup_ep_t *wireup_ep = ucs_derived_of(uct_ep, ucp_wireup_ep_t); + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); ucp_ep_h ucp_ep = wireup_ep->super.ucp_ep; ucp_worker_h worker = ucp_ep->worker; char saddr_str[UCS_SOCKADDR_STRING_LEN]; @@ -585,7 +610,9 @@ ucs_status_t ucp_wireup_ep_connect_to_sockaddr(uct_ep_h uct_ep, ucs_assert(ucp_wireup_ep_test(uct_ep)); - status = ucp_wireup_select_sockaddr_transport(ucp_ep, params, &sockaddr_rsc); + status = ucp_wireup_select_sockaddr_transport(worker->context, + ¶ms->sockaddr, + &sockaddr_rsc); if (status != UCS_OK) { goto out; } @@ -620,9 +647,9 @@ ucs_status_t ucp_wireup_ep_connect_to_sockaddr(uct_ep_h uct_ep, void ucp_wireup_ep_set_next_ep(uct_ep_h uct_ep, uct_ep_h next_ep) { - ucp_wireup_ep_t *wireup_ep = ucs_derived_of(uct_ep, ucp_wireup_ep_t); + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); - ucs_assert(ucp_wireup_ep_test(uct_ep)); + ucs_assert(wireup_ep != NULL); ucs_assert(wireup_ep->super.uct_ep == NULL); wireup_ep->flags |= UCP_WIREUP_EP_FLAG_LOCAL_CONNECTED; ucp_proxy_ep_set_uct_ep(&wireup_ep->super, next_ep, 1); @@ -630,10 +657,10 @@ void ucp_wireup_ep_set_next_ep(uct_ep_h uct_ep, uct_ep_h next_ep) uct_ep_h ucp_wireup_ep_extract_next_ep(uct_ep_h uct_ep) { - ucp_wireup_ep_t 
*wireup_ep = ucs_derived_of(uct_ep, ucp_wireup_ep_t); + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); uct_ep_h next_ep; - ucs_assert_always(ucp_wireup_ep_test(uct_ep)); + ucs_assert_always(wireup_ep != NULL); next_ep = wireup_ep->super.uct_ep; wireup_ep->super.uct_ep = NULL; return next_ep; @@ -641,13 +668,15 @@ uct_ep_h ucp_wireup_ep_extract_next_ep(uct_ep_h uct_ep) void ucp_wireup_ep_remote_connected(uct_ep_h uct_ep) { - ucp_wireup_ep_t *wireup_ep = ucs_derived_of(uct_ep, ucp_wireup_ep_t); - ucp_ep_h ucp_ep = wireup_ep->super.ucp_ep; + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); + ucp_ep_h ucp_ep; - ucs_assert(ucp_wireup_ep_test(uct_ep)); + ucs_assert(wireup_ep != NULL); ucs_assert(wireup_ep->super.uct_ep != NULL); ucs_assert(wireup_ep->flags & UCP_WIREUP_EP_FLAG_LOCAL_CONNECTED); + ucp_ep = wireup_ep->super.ucp_ep; + ucs_trace("ep %p: wireup ep %p is remote-connected", ucp_ep, wireup_ep); wireup_ep->flags |= UCP_WIREUP_EP_FLAG_READY; uct_worker_progress_register_safe(ucp_ep->worker->uct, @@ -664,13 +693,12 @@ int ucp_wireup_ep_test(uct_ep_h uct_ep) int ucp_wireup_ep_is_owner(uct_ep_h uct_ep, uct_ep_h owned_ep) { - ucp_wireup_ep_t *wireup_ep; + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); - if (!ucp_wireup_ep_test(uct_ep)) { + if (wireup_ep == NULL) { return 0; } - wireup_ep = ucs_derived_of(uct_ep, ucp_wireup_ep_t); return (wireup_ep->aux_ep == owned_ep) || (wireup_ep->sockaddr_ep == owned_ep) || (wireup_ep->super.uct_ep == owned_ep); @@ -678,9 +706,9 @@ int ucp_wireup_ep_is_owner(uct_ep_h uct_ep, uct_ep_h owned_ep) void ucp_wireup_ep_disown(uct_ep_h uct_ep, uct_ep_h owned_ep) { - ucp_wireup_ep_t *wireup_ep = ucs_derived_of(uct_ep, ucp_wireup_ep_t); + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); - ucs_assert_always(ucp_wireup_ep_test(uct_ep)); + ucs_assert_always(wireup_ep != NULL); if (wireup_ep->aux_ep == owned_ep) { wireup_ep->aux_ep = NULL; } else if (wireup_ep->sockaddr_ep == owned_ep) { @@ -689,3 +717,9 @@ void 
ucp_wireup_ep_disown(uct_ep_h uct_ep, uct_ep_h owned_ep) ucp_proxy_ep_extract(uct_ep); } } + +ucp_wireup_ep_t *ucp_wireup_ep(uct_ep_h uct_ep) +{ + return ucp_wireup_ep_test(uct_ep) ? + ucs_derived_of(uct_ep, ucp_wireup_ep_t) : NULL; +} diff --git a/src/ucp/wireup/wireup_ep.h b/src/ucp/wireup/wireup_ep.h index c5616a48ec4..efa3eb60f17 100644 --- a/src/ucp/wireup/wireup_ep.h +++ b/src/ucp/wireup/wireup_ep.h @@ -34,11 +34,13 @@ struct ucp_wireup_ep { ucs_queue_head_t pending_q; /**< Queue of pending operations */ uct_ep_h aux_ep; /**< Used to wireup the "real" endpoint */ uct_ep_h sockaddr_ep; /**< Used for client-server wireup */ + ucp_ep_h tmp_ep; /**< Used by the client for local tls setup */ ucp_rsc_index_t aux_rsc_index; /**< Index of auxiliary transport */ ucp_rsc_index_t sockaddr_rsc_index; /**< Index of sockaddr transport */ volatile uint32_t pending_count; /**< Number of pending wireup operations */ volatile uint32_t flags; /**< Connection state flags */ uct_worker_cb_id_t progress_id; /**< ID of progress function */ + unsigned ep_init_flags; /**< UCP wireup EP init flags */ }; @@ -60,23 +62,25 @@ ucp_rsc_index_t ucp_wireup_ep_get_aux_rsc_index(uct_ep_h uct_ep); * After this function is called, it would be possible to send wireup messages * on this endpoint, if connect_aux is 1. * - * @param [in] uct_ep Stub endpoint to connect. - * @param [in] rsc_index Resource of the real transport. - * @param [in] connect_aux Whether to connect the auxiliary transport, for - * sending + * @param [in] uct_ep Stub endpoint to connect. + * @param [in] ucp_ep_init_flags Initial flags of UCP EP. + * @param [in] rsc_index Resource of the real transport. + * @param [in] path_index Path index the transport endpoint should use. + * @param [in] connect_aux Whether to connect the auxiliary transport, + * for sending. + * @param [in] remote_address Remote address connect to. 
*/ -ucs_status_t ucp_wireup_ep_connect(uct_ep_h uct_ep, const ucp_ep_params_t *params, - ucp_rsc_index_t rsc_index, int connect_aux, - unsigned address_count, - const ucp_address_entry_t *address_list); +ucs_status_t ucp_wireup_ep_connect(uct_ep_h uct_ep, unsigned ucp_ep_init_flags, + ucp_rsc_index_t rsc_index, + unsigned path_index, int connect_aux, + const ucp_unpacked_address_t *remote_address); ucs_status_t ucp_wireup_ep_connect_to_sockaddr(uct_ep_h uct_ep, const ucp_ep_params_t *params); -ucs_status_t ucp_wireup_ep_connect_aux(ucp_wireup_ep_t *wireup_ep, - const ucp_ep_params_t *params, - unsigned address_count, - const ucp_address_entry_t *address_list); +ucs_status_t +ucp_wireup_ep_connect_aux(ucp_wireup_ep_t *wireup_ep, unsigned ep_init_flags, + const ucp_unpacked_address_t *remote_address); void ucp_wireup_ep_set_next_ep(uct_ep_h uct_ep, uct_ep_h next_ep); @@ -92,4 +96,6 @@ void ucp_wireup_ep_disown(uct_ep_h uct_ep, uct_ep_h owned_ep); ucs_status_t ucp_wireup_ep_progress_pending(uct_pending_req_t *self); +ucp_wireup_ep_t *ucp_wireup_ep(uct_ep_h uct_ep); + #endif diff --git a/src/ucs/Makefile.am b/src/ucs/Makefile.am index 7e8153a4fd0..b612ddeb54f 100644 --- a/src/ucs/Makefile.am +++ b/src/ucs/Makefile.am @@ -16,6 +16,10 @@ libucs_ladir = $(includedir)/ucs libucs_la_LIBADD = $(LIBM) $(top_builddir)/src/ucm/libucm.la nobase_dist_libucs_la_HEADERS = \ + arch/aarch64/bitops.h \ + arch/ppc64/bitops.h \ + arch/x86_64/bitops.h \ + arch/bitops.h \ algorithm/crc.h \ algorithm/qsort_r.h \ async/async_fwd.h \ @@ -23,44 +27,53 @@ nobase_dist_libucs_la_HEADERS = \ config/parser.h \ config/types.h \ datastruct/callbackq.h \ - datastruct/list_types.h \ + datastruct/hlist.h \ + datastruct/khash.h \ + datastruct/linear_func.h \ datastruct/list.h \ datastruct/mpool.h \ datastruct/pgtable.h \ datastruct/queue_types.h \ datastruct/strided_alloc.h \ + datastruct/string_buffer.h \ + datastruct/string_set.h \ + debug/log_def.h \ memory/rcache.h \ + memory/memory_type.h \ 
memory/memtype_cache.h \ profile/profile_defs.h \ profile/profile_off.h \ profile/profile_on.h \ stats/stats_fwd.h \ stats/libstats.h \ + sys/event_set.h \ sys/compiler_def.h\ sys/math.h \ sys/preprocessor.h \ sys/string.h \ sys/sock.h \ + sys/topo.h \ + sys/stubs.h \ time/time_def.h \ type/class.h \ type/init_once.h \ type/spinlock.h \ type/status.h \ type/thread_mode.h \ - type/cpu_set.h + type/cpu_set.h \ + arch/x86_64/global_opts.h \ + arch/aarch64/global_opts.h \ + arch/ppc64/global_opts.h \ + arch/global_opts.h noinst_HEADERS = \ - arch/aarch64/bitops.h \ arch/aarch64/cpu.h \ arch/generic/atomic.h \ arch/generic/cpu.h \ - arch/ppc64/bitops.h \ arch/ppc64/cpu.h \ arch/x86_64/atomic.h \ - arch/x86_64/bitops.h \ arch/x86_64/cpu.h \ arch/atomic.h \ - arch/bitops.h \ arch/cpu.h \ datastruct/arbiter.h \ datastruct/frag_list.h \ @@ -70,7 +83,7 @@ noinst_HEADERS = \ datastruct/queue.h \ datastruct/sglib.h \ datastruct/sglib_wrapper.h \ - datastruct/khash.h \ + datastruct/conn_match.h \ debug/assert.h \ debug/debug.h \ debug/log.h \ @@ -83,6 +96,8 @@ noinst_HEADERS = \ sys/compiler.h \ sys/module.h \ sys/sys.h \ + sys/iovec.h \ + sys/iovec.inl \ time/time.h \ time/timerq.h \ time/timer_wheel.h \ @@ -96,8 +111,12 @@ libucs_la_SOURCES = \ algorithm/crc.c \ algorithm/qsort_r.c \ arch/aarch64/cpu.c \ + arch/aarch64/global_opts.c \ arch/ppc64/timebase.c \ + arch/ppc64/global_opts.c \ arch/x86_64/cpu.c \ + arch/x86_64/global_opts.c \ + arch/cpu.c \ async/async.c \ async/signal.c \ async/pipe.c \ @@ -113,27 +132,40 @@ libucs_la_SOURCES = \ datastruct/pgtable.c \ datastruct/ptr_array.c \ datastruct/strided_alloc.c \ + datastruct/string_buffer.c \ + datastruct/string_set.c \ + datastruct/conn_match.c \ debug/assert.c \ debug/debug.c \ debug/log.c \ debug/memtrack.c \ + memory/memory_type.c \ + memory/memtype_cache.c \ memory/numa.c \ memory/rcache.c \ - memory/memtype_cache.c \ profile/profile.c \ stats/stats.c \ + sys/event_set.c \ sys/init.c \ sys/math.c \ sys/module.c \ 
sys/string.c \ sys/sys.c \ + sys/iovec.c \ sys/sock.c \ + sys/topo.c \ + sys/stubs.c \ time/time.c \ time/timer_wheel.c \ time/timerq.c \ type/class.c \ - type/spinlock.c \ - type/status.c + type/status.c \ + type/init_once.c + +if HAVE_AARCH64_THUNDERX2 +libucs_la_SOURCES += \ + arch/aarch64/memcpy_thunderx2.S +endif if HAVE_STATS libucs_la_SOURCES += \ diff --git a/src/ucs/algorithm/crc.c b/src/ucs/algorithm/crc.c index 2b4f6fdf113..20965cfb760 100644 --- a/src/ucs/algorithm/crc.c +++ b/src/ucs/algorithm/crc.c @@ -4,39 +4,55 @@ * See file LICENSE for terms. */ -#include "crc.h" +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include #include +/* CRC-16-CCITT */ +#define UCS_CRC16_POLY 0x8408u + +/* CRC-32 (ISO 3309) */ +#define UCS_CRC32_POLY 0xedb88320l + +#define UCS_CRC_CALC(_width, _buffer, _size, _crc) \ + do { \ + const uint8_t *end = (const uint8_t*)(UCS_PTR_BYTE_OFFSET(_buffer, _size)); \ + const uint8_t *p; \ + uint8_t bit; \ + \ + if ((_size) != 0) { \ + for (p = (_buffer); p < end; ++p) { \ + (_crc) ^= *p; \ + for (bit = 0; bit < 8; ++bit) { \ + (_crc) = ((_crc) >> 1) ^ (-(int)((_crc) & 1) & \ + UCS_CRC ## _width ## _POLY); \ + } \ + } \ + } \ + (_crc) = ~(_crc); \ + } while (0) + + uint16_t ucs_crc16(const void *buffer, size_t size) { - const uint8_t *p; - uint16_t result; - uint8_t data; - int bit; - - if (size == 0) { - return 0; - } - - result = -1; - for (p = buffer; p < (const uint8_t*)(buffer + size); ++p) { - data = *p; - for (bit = 0; bit < 8; ++bit) { - result >>= 1; - if ((result ^ data) & 1) { - result = result ^ 0x8048; - } - data >>= 1; - } - }; - - result = ((result & 0xff) << 8) | ((result >> 8) & 0xff); - return ~result; + uint16_t crc = UINT16_MAX; + UCS_CRC_CALC(16, buffer, size, crc); + return crc; } uint16_t ucs_crc16_string(const char *s) { - return ucs_crc16((char*)s, strlen(s)); + return ucs_crc16((const char*)s, strlen(s)); +} + +uint32_t ucs_crc32(uint32_t prev_crc, const void *buffer, size_t size) +{ + uint32_t crc = 
~prev_crc; + UCS_CRC_CALC(32, buffer, size, crc); + return crc; } diff --git a/src/ucs/algorithm/crc.h b/src/ucs/algorithm/crc.h index 1e76471aaca..277d81b88a1 100644 --- a/src/ucs/algorithm/crc.h +++ b/src/ucs/algorithm/crc.h @@ -7,11 +7,11 @@ #ifndef UCS_ALGORITHM_CRC_H_ #define UCS_ALGORITHM_CRC_H_ +#include + #include #include -#include - BEGIN_C_DECLS /** @file crc.h */ @@ -19,8 +19,8 @@ BEGIN_C_DECLS /** * Calculate CRC16 of an arbitrary buffer. * - * @param [in] buffer Buffer to compute crc for. - * @param [in] size Buffer size. + * @param [in] buffer Buffer to compute crc for. + * @param [in] size Buffer size. * * @return crc16() function of the buffer. */ @@ -29,9 +29,25 @@ uint16_t ucs_crc16(const void *buffer, size_t size); /** * Calculate CRC16 of a NULL-terminated string. + * + * @param [in] s NULL-terminated string to compute crc for. + * + * @return crc16() function of the string. */ uint16_t ucs_crc16_string(const char *s); + +/** + * Calculate CRC32 of an arbitrary buffer. + * + * @param [in] prev_crc Intitial CRC value. + * @param [in] buffer Buffer to compute crc for. + * @param [in] size Buffer size. + * + * @return crc32() function of the buffer. + */ +uint32_t ucs_crc32(uint32_t prev_crc, const void *buffer, size_t size); + END_C_DECLS #endif diff --git a/src/ucs/algorithm/qsort_r.c b/src/ucs/algorithm/qsort_r.c index 1566d33dd34..41116e7ec3f 100644 --- a/src/ucs/algorithm/qsort_r.c +++ b/src/ucs/algorithm/qsort_r.c @@ -32,6 +32,10 @@ * SUCH DAMAGE. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "qsort_r.h" #include diff --git a/src/ucs/arch/aarch64/bitops.h b/src/ucs/arch/aarch64/bitops.h index a93739ce8f9..05f93611994 100644 --- a/src/ucs/arch/aarch64/bitops.h +++ b/src/ucs/arch/aarch64/bitops.h @@ -8,27 +8,28 @@ #ifndef UCS_AARCH64_BITOPS_H_ #define UCS_AARCH64_BITOPS_H_ -#include +#include +#include +#include -static inline unsigned __ucs_ilog2_u32(uint32_t n) +static UCS_F_ALWAYS_INLINE unsigned __ucs_ilog2_u32(uint32_t n) { int bit; asm ("clz %w0, %w1" : "=r" (bit) : "r" (n)); return 31 - bit; } -static inline unsigned __ucs_ilog2_u64(uint64_t n) +static UCS_F_ALWAYS_INLINE unsigned __ucs_ilog2_u64(uint64_t n) { int64_t bit; asm ("clz %0, %1" : "=r" (bit) : "r" (n)); return 63 - bit; } -static inline unsigned ucs_ffs64(uint64_t n) +static UCS_F_ALWAYS_INLINE unsigned ucs_ffs64(uint64_t n) { return __ucs_ilog2_u64(n & -n); } - #endif diff --git a/src/ucs/arch/aarch64/cpu.c b/src/ucs/arch/aarch64/cpu.c index 4d8c8308854..a95b02597ef 100644 --- a/src/ucs/arch/aarch64/cpu.c +++ b/src/ucs/arch/aarch64/cpu.c @@ -6,6 +6,10 @@ #if defined(__aarch64__) +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include diff --git a/src/ucs/arch/aarch64/cpu.h b/src/ucs/arch/aarch64/cpu.h index f054367ec19..4c3d10a28d4 100644 --- a/src/ucs/arch/aarch64/cpu.h +++ b/src/ucs/arch/aarch64/cpu.h @@ -1,7 +1,7 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. -* Copyright (C) ARM Ltd. 2016-2017. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. +* Copyright (C) ARM Ltd. 2016-2020. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -11,11 +11,13 @@ #include "config.h" #include +#include #include #include #include #include -#if __ARM_NEON +#include +#ifdef __ARM_NEON #include #endif @@ -29,14 +31,41 @@ BEGIN_C_DECLS /** * Assume the worst - weak memory ordering. */ -#define ucs_memory_bus_fence() asm volatile ("dsb sy" ::: "memory"); -#define ucs_memory_bus_store_fence() asm volatile ("dsb st" ::: "memory"); -#define ucs_memory_bus_load_fence() asm volatile ("dsb ld" ::: "memory"); -#define ucs_memory_bus_wc_flush() -#define ucs_memory_cpu_fence() asm volatile ("dmb ish" ::: "memory"); -#define ucs_memory_cpu_store_fence() asm volatile ("dmb ishst" ::: "memory"); -#define ucs_memory_cpu_load_fence() asm volatile ("dmb ishld" ::: "memory"); -#define ucs_memory_cpu_wc_fence() asm volatile ("dmb st" ::: "memory"); + +#define ucs_aarch64_dmb(_op) asm volatile ("dmb " #_op ::: "memory") +#define ucs_aarch64_isb(_op) asm volatile ("isb " #_op ::: "memory") +#define ucs_aarch64_dsb(_op) asm volatile ("dsb " #_op ::: "memory") + +/* The macro is used to serialize stores across Normal NC (or Device) and WB + * memory, (see Arm Spec, B2.7.2). Based on recent changes in Linux kernel: + * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=22ec71615d824f4f11d38d0e55a88d8956b7e45f + * + * The underlying barrier code was changed to use lighter weight DMB instead + * of DSB. The barrier used for synchronization of access between write back + * and device mapped memory (PCIe BAR). + */ +#define ucs_memory_bus_fence() ucs_aarch64_dmb(oshsy) +#define ucs_memory_bus_store_fence() ucs_aarch64_dmb(oshst) +#define ucs_memory_bus_load_fence() ucs_aarch64_dmb(oshld) + +/* The macro is used to flush all pending stores from write combining buffer. + * Some uarch "auto" flush the stores once cache line is full (no need for additional barrier). 
+ */ +#if defined(HAVE_AARCH64_THUNDERX2) +#define ucs_memory_bus_cacheline_wc_flush() +#else +/* The macro is used to flush stores to Normal NC or Device memory */ +#define ucs_memory_bus_cacheline_wc_flush() ucs_aarch64_dmb(oshst) +#endif + +#define ucs_memory_cpu_fence() ucs_aarch64_dmb(ish) +#define ucs_memory_cpu_store_fence() ucs_aarch64_dmb(ishst) +#define ucs_memory_cpu_load_fence() ucs_aarch64_dmb(ishld) + +/* The macro is used to serialize stores to Normal NC or Device memory + * (see Arm Spec, B2.7.2) + */ +#define ucs_memory_cpu_wc_fence() ucs_aarch64_dmb(oshst) /* @@ -57,6 +86,11 @@ typedef struct ucs_aarch64_cpuid { void ucs_aarch64_cpuid(ucs_aarch64_cpuid_t *cpuid); +#if defined(HAVE_AARCH64_THUNDERX2) +extern void *__memcpy_thunderx2(void *, const void *, size_t); +#endif + + #if HAVE_HW_TIMER static inline uint64_t ucs_arch_read_hres_clock(void) { @@ -85,11 +119,27 @@ static inline ucs_cpu_model_t ucs_arch_get_cpu_model() return UCS_CPU_MODEL_ARM_AARCH64; } +static inline ucs_cpu_vendor_t ucs_arch_get_cpu_vendor() +{ + ucs_aarch64_cpuid_t cpuid; + ucs_aarch64_cpuid(&cpuid); + + if ((cpuid.implementer == 0x46) && (cpuid.architecture == 8)) { + return UCS_CPU_VENDOR_FUJITSU_ARM; + } + + return UCS_CPU_VENDOR_GENERIC_ARM; +} + static inline int ucs_arch_get_cpu_flag() { return UCS_CPU_FLAG_UNKNOWN; } +static inline void ucs_cpu_init() +{ +} + static inline void ucs_arch_wait_mem(void *address) { unsigned long tmp; @@ -111,14 +161,19 @@ static inline void ucs_arch_clear_cache(void *start, void *end) uintptr_t ptr; unsigned icache; unsigned dcache; + unsigned dic; + unsigned idc; unsigned ctr_el0; /* Get cache line size, using ctr_el0 register * * Bits Name Function * ***************************** - * [31] - Reserved, res1. - * [30:28] - Reserved, res0. + * [31] - Reserved, RES1. + * [30] - Reserved, RES0. + * [29] DIC Instruction cache invalidation requirements for data to instruction + * coherence. 
+ * [28] IDC Data cache clean requirements for instruction to data coherence. * [27:24] CWG Cache Write-Back granule. Log2 of the number of words of the * maximum size of memory that can be overwritten as a result of * the eviction of a cache entry that has had a memory location @@ -147,20 +202,60 @@ static inline void ucs_arch_clear_cache(void *start, void *end) asm volatile ("mrs\t%0, ctr_el0":"=r" (ctr_el0)); icache = sizeof(int) << (ctr_el0 & 0xf); dcache = sizeof(int) << ((ctr_el0 >> 16) & 0xf); + dic = (ctr_el0 >> 29) & 0x1; + idc = (ctr_el0 >> 28) & 0x1; - for (ptr = ucs_align_down((uintptr_t)start, dcache); ptr < (uintptr_t)end; ptr += dcache) { - asm volatile ("dc cvau, %0" :: "r" (ptr) : "memory"); + /* + * Check if Data cache clean to the Point of Unification is required for instruction to + * data coherence + */ + if (idc == 0) { + for (ptr = ucs_align_down((uintptr_t)start, dcache); ptr < (uintptr_t)end; ptr += dcache) { + asm volatile ("dc cvau, %0" :: "r" (ptr) : "memory"); + } } - asm volatile ("dsb ish" ::: "memory"); - for (ptr = ucs_align_down((uintptr_t)start, icache); ptr < (uintptr_t)end; ptr += icache) { - asm volatile ("ic ivau, %0" :: "r" (ptr) : "memory"); + /* + * Check if Instruction cache invalidation to the Point of Unification is required for + * data to instruction coherence. 
+ */ + if (dic == 0) { + ucs_aarch64_dsb(ish); + for (ptr = ucs_align_down((uintptr_t)start, icache); ptr < (uintptr_t)end; ptr += icache) { + asm volatile ("ic ivau, %0" :: "r" (ptr) : "memory"); + } } - asm volatile ("dsb ish; isb" ::: "memory"); + ucs_aarch64_dsb(ish); + ucs_aarch64_isb(); #endif } #endif +static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len) +{ +#if defined(HAVE_AARCH64_THUNDERX2) + return __memcpy_thunderx2(dst, src,len); +#else + return memcpy(dst, src, len); +#endif +} + +static UCS_F_ALWAYS_INLINE void +ucs_memcpy_nontemporal(void *dst, const void *src, size_t len) +{ +#if defined(HAVE_AARCH64_THUNDERX2) + __memcpy_thunderx2(dst, src,len); +#else + memcpy(dst, src, len); +#endif + +} + +static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes) +{ + return UCS_ERR_UNSUPPORTED; +} + static inline void ucs_arch_clear_cache(void *start, void *end) { uintptr_t ptr; @@ -173,6 +268,7 @@ static inline void ucs_arch_clear_cache(void *start, void *end) } asm volatile ("dsb ish" ::: "memory"); } + // copy from ucs/arch/x86_64/cpu.h for compiling successfully. static inline void ucs_arch_writeback_cache(void *start, void *end) { @@ -183,7 +279,6 @@ static inline void ucs_arch_writeback_cache(void *start, void *end) #endif } - END_C_DECLS #endif diff --git a/src/ucs/arch/aarch64/global_opts.c b/src/ucs/arch/aarch64/global_opts.c new file mode 100644 index 00000000000..649dfe4e5f5 --- /dev/null +++ b/src/ucs/arch/aarch64/global_opts.c @@ -0,0 +1,24 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#if defined(__aarch64__) + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include + +ucs_config_field_t ucs_arch_global_opts_table[] = { + {NULL} +}; + +void ucs_arch_print_memcpy_limits(ucs_arch_global_opts_t *config) +{ +} + +#endif diff --git a/src/ucs/arch/aarch64/global_opts.h b/src/ucs/arch/aarch64/global_opts.h new file mode 100644 index 00000000000..e46026aa985 --- /dev/null +++ b/src/ucs/arch/aarch64/global_opts.h @@ -0,0 +1,23 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifndef UCS_AARCH64_GLOBAL_OPTS_H_ +#define UCS_AARCH64_GLOBAL_OPTS_H_ + +#include + +BEGIN_C_DECLS + +#define UCS_ARCH_GLOBAL_OPTS_INITALIZER {} + +/* built-in memcpy config */ +typedef struct ucs_arch_global_opts { + char dummy; +} ucs_arch_global_opts_t; + +END_C_DECLS + +#endif diff --git a/src/ucs/arch/aarch64/memcpy_thunderx2.S b/src/ucs/arch/aarch64/memcpy_thunderx2.S new file mode 100644 index 00000000000..c2b619b35fb --- /dev/null +++ b/src/ucs/arch/aarch64/memcpy_thunderx2.S @@ -0,0 +1,437 @@ +/* Copyright (c) 2018, Marvell Technology Group Ltd. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ ********************************************************************** + * Alternativ ely, you may choose to be licensed under the terms of the + * following license: + j + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp2 x6 +#define tmp3 x7 +#define tmp3w w7 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define A_hw w7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l src +#define E_h count +#define F_l srcend +#define F_h dst +#define G_l count +#define G_h dst +#define tmp1 x14 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 +#define I_q q16 +#define J_q q17 + +#define A_v v0 +#define B_v v1 +#define C_v v2 +#define D_v v3 +#define E_v v4 +#define F_v v5 +#define G_v v6 +#define H_v v7 +#define I_v v16 +#define J_v v17 + +#ifndef MEMCPY_NAME +#define MEMCPY_NAME __memcpy_thunderx2 +#endif + +/* Local label name for asm code. */ +#ifndef L +# define L(name) .L##name +#endif + +/* Copies are split into 3 main cases: small copies of up to 16 bytes, + medium copies of 17..96 bytes which are fully unrolled. Large copies + of more than 96 bytes align the destination and use load-and-merge + approach in the case src and dst addresses are unaligned not evenly, + so that, loads and stores are always aligned. + Large copies use an unrolled loop processing 64 bytes per iteration. + The current optimized memcpy implementation is not compatible with + memmove and is separated from it completely. 
+ + memcpy implementation below is not compatible with memmove + because of pipelined loads/stores, which are faster, but they + can't be used in the case of overlapping memmove arrays */ + +#define MEMCPY_PREFETCH_LDR 640 + + .globl MEMCPY_NAME + .type MEMCPY_NAME,%function + .p2align 6 + +MEMCPY_NAME: + .cfi_startproc + add srcend, src, count + cmp count, 16 + b.ls L(memcopy16) + ldr A_q, [src], #16 + add dstend, dstin, count + and tmp1, src, 15 + cmp count, 96 + b.hi L(memcopy_long) + + /* Medium copies: 17..96 bytes. */ + ldr E_q, [srcend, -16] + cmp count, 64 + b.gt L(memcpy_copy96) + cmp count, 48 + b.le L(bytes_17_to_48) + /* 49..64 bytes */ + ldp B_q, C_q, [src] + str E_q, [dstend, -16] + stp A_q, B_q, [dstin] + str C_q, [dstin, 32] + ret + +L(bytes_17_to_48): + /* 17..48 bytes*/ + cmp count, 32 + b.gt L(bytes_32_to_48) + /* 17..32 bytes*/ + str A_q, [dstin] + str E_q, [dstend, -16] + ret + +L(bytes_32_to_48): + /* 32..48 */ + ldr B_q, [src] + str A_q, [dstin] + str E_q, [dstend, -16] + str B_q, [dstin, 16] + ret + + .p2align 4 + /* Small copies: 0..16 bytes. */ +L(memcopy16): + cmp count, 8 + b.lo L(bytes_0_to_8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + add dstend, dstin, count + str A_l, [dstin] + str A_h, [dstend, -8] + ret + .p2align 4 + +L(bytes_0_to_8): + tbz count, 2, L(bytes_0_to_3) + ldr A_lw, [src] + ldr A_hw, [srcend, -4] + add dstend, dstin, count + str A_lw, [dstin] + str A_hw, [dstend, -4] + ret + + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. */ +L(bytes_0_to_3): + cbz count, L(end) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb A_hw, [srcend, -1] + add dstend, dstin, count + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] +L(end): ret + + .p2align 4 + +L(memcpy_copy96): + /* Copying 65..96 bytes. A_q (first 16 bytes) and + E_q(last 16 bytes) are already loaded. 
+ + The size is large enough to benefit from aligned + loads */ + bic src, src, 15 + ldp B_q, C_q, [src] + str A_q, [dstin] + /* Loaded 64 bytes, second 16-bytes chunk can be + overlapping with the first chunk by tmp1 bytes. + Stored 16 bytes. */ + sub dst, dstin, tmp1 + add count, count, tmp1 + /* The range of count being [65..96] becomes [65..111] + after tmp [0..15] gets added to it, + count now is +48 */ + cmp count, 80 + b.gt L(copy96_medium) + ldr D_q, [src, 32] + stp B_q, C_q, [dst, 16] + str E_q, [dstend, -16] + str D_q, [dst, 48] + ret + + .p2align 4 +L(copy96_medium): + ldp D_q, A_q, [src, 32] + str B_q, [dst, 16] + cmp count, 96 + b.gt L(copy96_large) + str E_q, [dstend, -16] + stp C_q, D_q, [dst, 32] + str A_q, [dst, 64] + ret + +L(copy96_large): + ldr F_q, [src, 64] + stp C_q, D_q, [dst, 32] + str E_q, [dstend, -16] + stp A_q, F_q, [dst, 64] + ret + + .p2align 4 +L(memcopy_long): + bic src, src, 15 + ldp B_q, C_q, [src], #32 + str A_q, [dstin] + sub dst, dstin, tmp1 + add count, count, tmp1 + add dst, dst, 16 + and tmp1, dst, 15 + ldp D_q, E_q, [src], #32 + str B_q, [dst], #16 + + /* Already loaded 64+16 bytes. 
Check if at + least 64 more bytes left */ + subs count, count, 64+64+16 + b.lt L(loop128_exit2) + cmp count, MEMCPY_PREFETCH_LDR + 64 + 32 + b.lt L(loop128) + cbnz tmp1, L(dst_unaligned) + sub count, count, MEMCPY_PREFETCH_LDR + 64 + 32 + + .p2align 4 + +L(loop128_prefetch): + str C_q, [dst], #16 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + str D_q, [dst], #16 + ldp F_q, G_q, [src], #32 + str E_q, [dst], #16 + ldp H_q, A_q, [src], #32 + str F_q, [dst], #16 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + str G_q, [dst], #16 + ldp B_q, C_q, [src], #32 + str H_q, [dst], #16 + ldp D_q, E_q, [src], #32 + stp A_q, B_q, [dst], #32 + subs count, count, 128 + b.ge L(loop128_prefetch) + +L(preloop128): + add count, count, MEMCPY_PREFETCH_LDR + 64 + 32 + .p2align 4 +L(loop128): + ldp F_q, G_q, [src], #32 + str C_q, [dst], #16 + ldp B_q, A_q, [src], #32 + str D_q, [dst], #16 + stp E_q, F_q, [dst], #32 + stp G_q, B_q, [dst], #32 + subs count, count, 64 + b.lt L(loop128_exit1) +L(loop128_proceed): + ldp B_q, C_q, [src], #32 + str A_q, [dst], #16 + ldp D_q, E_q, [src], #32 + str B_q, [dst], #16 + subs count, count, 64 + b.ge L(loop128) + + .p2align 4 +L(loop128_exit2): + stp C_q, D_q, [dst], #32 + str E_q, [dst], #16 + b L(copy_long_check32); + +L(loop128_exit1): + /* A_q is still not stored and 0..63 bytes left, + so, count is -64..-1. + Check if less than 32 bytes left (count < -32) */ + str A_q, [dst], #16 +L(copy_long_check32): + cmn count, 64 + b.eq L(copy_long_done) + cmn count, 32 + b.le L(copy_long_last32) + ldp B_q, C_q, [src] + stp B_q, C_q, [dst] + +L(copy_long_last32): + ldp F_q, G_q, [srcend, -32] + stp F_q, G_q, [dstend, -32] + +L(copy_long_done): + ret + +L(dst_unaligned): + /* For the unaligned store case the code loads two + aligned chunks and then merges them using ext + instruction. This can be up to 30% faster than + the the simple unaligned store access. + + Current state: tmp1 = dst % 16; C_q, D_q, E_q + contains data yet to be stored. 
src and dst points + to next-to-be-processed data. A_q, B_q contains + data already stored before, count = bytes left to + be load decremented by 64. + + The control is passed here if at least 64 bytes left + to be loaded. The code does two aligned loads and then + extracts (16-tmp1) bytes from the first register and + tmp1 bytes from the next register forming the value + for the aligned store. + + As ext instruction can only have it's index encoded + as immediate. 15 code chunks process each possible + index value. Computed goto is used to reach the + required code. */ + + /* Store the 16 bytes to dst and align dst for further + operations, several bytes will be stored at this + address once more */ + str C_q, [dst], #16 + ldp F_q, G_q, [src], #32 + bic dst, dst, 15 + adrp tmp2, L(ext_table) + add tmp2, tmp2, :lo12:L(ext_table) + add tmp2, tmp2, tmp1, LSL #2 + ldr tmp3w, [tmp2] + add tmp2, tmp2, tmp3w, SXTW + br tmp2 + +#define EXT_CHUNK(shft) \ +.p2align 4 ;\ +L(ext_size_ ## shft):;\ + ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\ + ext B_v.16b, D_v.16b, E_v.16b, 16-shft;\ + subs count, count, 32;\ + b.ge 2f;\ +1:;\ + stp A_q, B_q, [dst], #32;\ + ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\ + ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\ + stp H_q, I_q, [dst], #16;\ + add dst, dst, tmp1;\ + str G_q, [dst], #16;\ + b L(copy_long_check32);\ +2:;\ + stp A_q, B_q, [dst], #32;\ + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR];\ + ldp D_q, J_q, [src], #32;\ + ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\ + ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\ + mov C_v.16b, G_v.16b;\ + stp H_q, I_q, [dst], #32;\ + ldp F_q, G_q, [src], #32;\ + ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\ + ext B_v.16b, D_v.16b, J_v.16b, 16-shft;\ + mov E_v.16b, J_v.16b;\ + subs count, count, 64;\ + b.ge 2b;\ + b 1b;\ + +EXT_CHUNK(1) +EXT_CHUNK(2) +EXT_CHUNK(3) +EXT_CHUNK(4) +EXT_CHUNK(5) +EXT_CHUNK(6) +EXT_CHUNK(7) +EXT_CHUNK(8) +EXT_CHUNK(9) +EXT_CHUNK(10) +EXT_CHUNK(11) +EXT_CHUNK(12) +EXT_CHUNK(13) +EXT_CHUNK(14) 
+EXT_CHUNK(15) + + .cfi_endproc + .size MEMCPY_NAME,.-MEMCPY_NAME + + .section .rodata + .p2align 4 + +L(ext_table): + /* The first entry is for the alignment of 0 and is never + actually used (could be any value). */ + .word 0 + .word L(ext_size_1) -. + .word L(ext_size_2) -. + .word L(ext_size_3) -. + .word L(ext_size_4) -. + .word L(ext_size_5) -. + .word L(ext_size_6) -. + .word L(ext_size_7) -. + .word L(ext_size_8) -. + .word L(ext_size_9) -. + .word L(ext_size_10) -. + .word L(ext_size_11) -. + .word L(ext_size_12) -. + .word L(ext_size_13) -. + .word L(ext_size_14) -. + .word L(ext_size_15) -. diff --git a/src/ucs/arch/atomic.h b/src/ucs/arch/atomic.h index 0caea9b1f3b..99e53ca5d0f 100644 --- a/src/ucs/arch/atomic.h +++ b/src/ucs/arch/atomic.h @@ -55,6 +55,18 @@ return __sync_fetch_and_or(ptr, value); \ } +#define UCS_DEFINE_ATOMIC_SUB(wordsize, suffix) \ + static inline void ucs_atomic_sub##wordsize(volatile uint##wordsize##_t *ptr, \ + uint##wordsize##_t value) { \ + ucs_atomic_add##wordsize(ptr, (uint##wordsize##_t)-value); \ + } + +#define UCS_DEFINE_ATOMIC_FSUB(wordsize, suffix) \ + static inline uint##wordsize##_t ucs_atomic_fsub##wordsize(volatile uint##wordsize##_t *ptr, \ + uint##wordsize##_t value) { \ + return ucs_atomic_fadd##wordsize(ptr, (uint##wordsize##_t)-value); \ + } + /* * Define atomic functions */ @@ -68,6 +80,16 @@ UCS_DEFINE_ATOMIC_FADD(16, w); UCS_DEFINE_ATOMIC_FADD(32, l); UCS_DEFINE_ATOMIC_FADD(64, q); +UCS_DEFINE_ATOMIC_SUB(8, b); +UCS_DEFINE_ATOMIC_SUB(16, w); +UCS_DEFINE_ATOMIC_SUB(32, l); +UCS_DEFINE_ATOMIC_SUB(64, q); + +UCS_DEFINE_ATOMIC_FSUB(8, b); +UCS_DEFINE_ATOMIC_FSUB(16, w); +UCS_DEFINE_ATOMIC_FSUB(32, l); +UCS_DEFINE_ATOMIC_FSUB(64, q); + UCS_DEFINE_ATOMIC_AND(8, b); UCS_DEFINE_ATOMIC_AND(16, w); UCS_DEFINE_ATOMIC_AND(32, l); diff --git a/src/ucs/arch/bitops.h b/src/ucs/arch/bitops.h index af7bb93392d..10a86b53ca4 100644 --- a/src/ucs/arch/bitops.h +++ b/src/ucs/arch/bitops.h @@ -7,6 +7,10 @@ #ifndef UCS_ARCH_BITOPS_H 
#define UCS_ARCH_BITOPS_H +#include +#include + +BEGIN_C_DECLS #if defined(__x86_64__) # include "x86_64/bitops.h" @@ -18,6 +22,7 @@ # error "Unsupported architecture" #endif + #define ucs_ilog2(_n) \ ( \ __builtin_constant_p(_n) ? ( \ @@ -93,9 +98,16 @@ __ucs_ilog2_u64((uint64_t)(_n)) \ ) +#define ucs_ilog2_or0(_n) \ + ( ((_n) == 0) ? 0 : ucs_ilog2(_n) ) + /* Returns the number of 1-bits in x */ #define ucs_popcount(_n) \ - ((sizeof(_n) <= 4) ? __builtin_popcount((uint32_t)(_n)) : __builtin_popcountl(_n)) + ((sizeof(_n) <= 4) ? __builtin_popcount((uint32_t)(_n)) : \ + __builtin_popcountl(_n)) + +/* On some arch ffs64(0) returns 0, on other -1, let's unify this */ +#define ucs_ffs64_safe(_val) ((_val) ? ucs_ffs64(_val) : 64) /* Returns the number of trailing 0-bits in x, starting at the least * significant bit position. If x is 0, the result is undefined. @@ -107,4 +119,6 @@ #define ucs_bitmap2idx(_map, _idx) \ ucs_popcount((_map) & (UCS_MASK(_idx))) +END_C_DECLS + #endif diff --git a/src/ucs/arch/cpu.c b/src/ucs/arch/cpu.c new file mode 100644 index 00000000000..6d9ebbafeae --- /dev/null +++ b/src/ucs/arch/cpu.c @@ -0,0 +1,155 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include +#include +#include + +#define UCS_CPU_CACHE_FILE_FMT "/sys/devices/system/cpu/cpu%d/cache/index%d/%s" +#define UCS_CPU_CACHE_LEVEL_FILE "level" +#define UCS_CPU_CACHE_TYPE_FILE "type" +#define UCS_CPU_CACHE_SIZE_FILE "size" + +/* cache size array. 
index - cache type (ucs_cpu_cache_type_t), value - cache value, + * 0 means cache is not supported */ +static size_t ucs_cpu_cache_size[UCS_CPU_CACHE_LAST] = {0}; + +static ucs_init_once_t ucs_cache_read_once = UCS_INIT_ONCE_INITIALIZER; + +/* cache datatypes */ +struct { /* sysfs entries for system cache sizes */ + int level; + const char *type; +} const ucs_cpu_cache_sysfs_name[] = { + [UCS_CPU_CACHE_L1d] = {.level = 1, .type = "Data"}, + [UCS_CPU_CACHE_L1i] = {.level = 1, .type = "Instruction"}, + [UCS_CPU_CACHE_L2] = {.level = 2, .type = "Unified"}, + [UCS_CPU_CACHE_L3] = {.level = 3, .type = "Unified"} +}; + +const ucs_cpu_builtin_memcpy_t ucs_cpu_builtin_memcpy[UCS_CPU_VENDOR_LAST] = { + [UCS_CPU_VENDOR_UNKNOWN] = { + .min = UCS_MEMUNITS_INF, + .max = UCS_MEMUNITS_INF + }, + [UCS_CPU_VENDOR_INTEL] = { + .min = 1 * UCS_KBYTE, + .max = 8 * UCS_MBYTE + }, + /* TODO: investigate why `rep movsb` is slow for shared buffers + * on some AMD configurations */ + [UCS_CPU_VENDOR_AMD] = { + .min = UCS_MEMUNITS_INF, + .max = UCS_MEMUNITS_INF + }, + [UCS_CPU_VENDOR_GENERIC_ARM] = { + .min = UCS_MEMUNITS_INF, + .max = UCS_MEMUNITS_INF + }, + [UCS_CPU_VENDOR_GENERIC_PPC] = { + .min = UCS_MEMUNITS_INF, + .max = UCS_MEMUNITS_INF + }, + [UCS_CPU_VENDOR_FUJITSU_ARM] = { + .min = UCS_MEMUNITS_INF, + .max = UCS_MEMUNITS_INF + } +}; + +const size_t ucs_cpu_est_bcopy_bw[UCS_CPU_VENDOR_LAST] = { + [UCS_CPU_VENDOR_UNKNOWN] = 5800 * UCS_MBYTE, + [UCS_CPU_VENDOR_INTEL] = 5800 * UCS_MBYTE, + [UCS_CPU_VENDOR_AMD] = 5008 * UCS_MBYTE, + [UCS_CPU_VENDOR_GENERIC_ARM] = 5800 * UCS_MBYTE, + [UCS_CPU_VENDOR_GENERIC_PPC] = 5800 * UCS_MBYTE, + [UCS_CPU_VENDOR_FUJITSU_ARM] = 5800 * UCS_MBYTE +}; + +static void ucs_sysfs_get_cache_size() +{ + char type_str[32]; /* Data/Instruction/Unified */ + char size_str[32]; /* memunits */ + int cache_index; + int cpu; + long level; + ssize_t file_size; + ucs_cpu_cache_type_t cache_type; + ucs_status_t status; + + cpu = ucs_get_first_cpu(); + + for (cache_index 
= 0;; cache_index++) { + file_size = ucs_read_file_str(type_str, sizeof(type_str), 1, + UCS_CPU_CACHE_FILE_FMT, cpu, + cache_index, UCS_CPU_CACHE_TYPE_FILE); + if (file_size < 0) { + return; /* no more files */ + } + + ucs_strtrim(type_str); + status = ucs_read_file_number(&level, 1, UCS_CPU_CACHE_FILE_FMT, + cpu, cache_index, UCS_CPU_CACHE_LEVEL_FILE); + if (status != UCS_OK) { + return; /* no more files */ + } + + /* ok, we found valid directory, let's try to read cache size */ + file_size = ucs_read_file_str(size_str, sizeof(size_str), 1, UCS_CPU_CACHE_FILE_FMT, + cpu, cache_index, UCS_CPU_CACHE_SIZE_FILE); + if (file_size < 0) { + return; /* no more files */ + } + + /* now lookup for cache size entry */ + for (cache_type = UCS_CPU_CACHE_L1d; cache_type < UCS_CPU_CACHE_LAST; cache_type++) { + if ((ucs_cpu_cache_sysfs_name[cache_type].level == level) && + !strcasecmp(ucs_cpu_cache_sysfs_name[cache_type].type, type_str)) { + if (ucs_cpu_cache_size[cache_type] != 0) { + break; + } + + status = ucs_str_to_memunits(ucs_strtrim(size_str), + &ucs_cpu_cache_size[cache_type]); + if (status != UCS_OK) { + ucs_cpu_cache_size[cache_type] = 0; /* reset cache value */ + } + } + } + } +} + +size_t ucs_cpu_get_cache_size(ucs_cpu_cache_type_t type) +{ + ucs_status_t status; + + if (type >= UCS_CPU_CACHE_LAST) { + return 0; + } + + UCS_INIT_ONCE(&ucs_cache_read_once) { + UCS_STATIC_ASSERT(ucs_array_size(ucs_cpu_cache_size) == UCS_CPU_CACHE_LAST); + /* try first CPU-specific algorithm */ + status = ucs_arch_get_cache_size(ucs_cpu_cache_size); + if (status != UCS_OK) { + /* read rest of caches from sysfs */ + ucs_sysfs_get_cache_size(); + } + } + + return ucs_cpu_cache_size[type]; +} + +double ucs_cpu_get_memcpy_bw() +{ + return ucs_cpu_est_bcopy_bw[ucs_arch_get_cpu_vendor()]; +} diff --git a/src/ucs/arch/cpu.h b/src/ucs/arch/cpu.h index 38fe2051ae7..245b205d891 100644 --- a/src/ucs/arch/cpu.h +++ b/src/ucs/arch/cpu.h @@ -1,7 +1,7 @@ /** * Copyright (C) Mellanox Technologies Ltd. 
2001-2015. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -14,7 +14,9 @@ #endif #include +#include +BEGIN_C_DECLS /* CPU models */ typedef enum ucs_cpu_model { @@ -27,6 +29,8 @@ typedef enum ucs_cpu_model { UCS_CPU_MODEL_INTEL_BROADWELL, UCS_CPU_MODEL_INTEL_SKYLAKE, UCS_CPU_MODEL_ARM_AARCH64, + UCS_CPU_MODEL_AMD_NAPLES, + UCS_CPU_MODEL_AMD_ROME, UCS_CPU_MODEL_LAST } ucs_cpu_model_t; @@ -49,6 +53,35 @@ typedef enum ucs_cpu_flag { } ucs_cpu_flag_t; +/* CPU vendors */ +typedef enum ucs_cpu_vendor { + UCS_CPU_VENDOR_UNKNOWN, + UCS_CPU_VENDOR_INTEL, + UCS_CPU_VENDOR_AMD, + UCS_CPU_VENDOR_GENERIC_ARM, + UCS_CPU_VENDOR_GENERIC_PPC, + UCS_CPU_VENDOR_FUJITSU_ARM, + UCS_CPU_VENDOR_LAST +} ucs_cpu_vendor_t; + + +/* CPU cache types */ +typedef enum ucs_cpu_cache_type { + UCS_CPU_CACHE_L1d, /**< L1 data cache */ + UCS_CPU_CACHE_L1i, /**< L1 instruction cache */ + UCS_CPU_CACHE_L2, /**< L2 cache */ + UCS_CPU_CACHE_L3, /**< L3 cache */ + UCS_CPU_CACHE_LAST +} ucs_cpu_cache_type_t; + + +/* Built-in memcpy settings */ +typedef struct ucs_cpu_builtin_memcpy { + size_t min; + size_t max; +} ucs_cpu_builtin_memcpy_t; + + /* System constants */ #define UCS_SYS_POINTER_SIZE (sizeof(void*)) #define UCS_SYS_PARAGRAPH_SIZE 16 @@ -72,6 +105,25 @@ static inline void ucs_clear_cache(void *start, void *end); #define UCS_SYS_CACHE_LINE_SIZE UCS_ARCH_CACHE_LINE_SIZE #endif +/* Array of default built-in memcpy settings for different CPU architectures */ +extern const ucs_cpu_builtin_memcpy_t ucs_cpu_builtin_memcpy[UCS_CPU_VENDOR_LAST]; + +#if HAVE___CLEAR_CACHE +/* libc routine declaration */ +void __clear_cache(void* beg, void* end); +#endif + +/** + * Get size of CPU cache. + * + * @param type Cache type. + * @param value Filled with the cache size. 
+ * + * @return Cache size value or 0 if cache is not supported or can't be read. + */ +size_t ucs_cpu_get_cache_size(ucs_cpu_cache_type_t type); + + /** * Clear processor data and instruction caches, intended for * self-modifying code. @@ -82,9 +134,6 @@ static inline void ucs_clear_cache(void *start, void *end); static inline void ucs_clear_cache(void *start, void *end) { #if HAVE___CLEAR_CACHE - /* do not allow global declaration of compiler intrinsic */ - void __clear_cache(void* beg, void* end); - __clear_cache(start, end); #else ucs_arch_clear_cache(start, end); @@ -102,4 +151,20 @@ static inline void ucs_writeback_cache(void *start, void *end) ucs_arch_writeback_cache(start, end); } +/** + * Get memory copy bandwidth. + * + * @return Memory copy bandwidth estimation based on CPU used. + */ +double ucs_cpu_get_memcpy_bw(); + + +static inline int ucs_cpu_prefer_relaxed_order() +{ + return ucs_arch_get_cpu_vendor() == UCS_CPU_VENDOR_FUJITSU_ARM; +} + + +END_C_DECLS + #endif diff --git a/src/ucs/arch/global_opts.h b/src/ucs/arch/global_opts.h new file mode 100644 index 00000000000..8786f130290 --- /dev/null +++ b/src/ucs/arch/global_opts.h @@ -0,0 +1,26 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifndef UCS_ARCH_GLOBAL_OPTS_H +#define UCS_ARCH_GLOBAL_OPTS_H + +#include + +#if defined(__x86_64__) +# include "x86_64/global_opts.h" +#elif defined(__powerpc64__) +# include "ppc64/global_opts.h" +#elif defined(__aarch64__) +# include "aarch64/global_opts.h" +#else +# error "Unsupported architecture" +#endif + +extern ucs_config_field_t ucs_arch_global_opts_table[]; + +void ucs_arch_print_memcpy_limits(ucs_arch_global_opts_t *config); + +#endif diff --git a/src/ucs/arch/ppc64/bitops.h b/src/ucs/arch/ppc64/bitops.h index 93a35fcf9cb..1d8a2817576 100644 --- a/src/ucs/arch/ppc64/bitops.h +++ b/src/ucs/arch/ppc64/bitops.h @@ -7,24 +7,25 @@ #ifndef UCS_PPC64_BITOPS_H_ #define UCS_PPC64_BITOPS_H_ +#include #include -static inline unsigned __ucs_ilog2_u32(uint32_t n) +static UCS_F_ALWAYS_INLINE unsigned __ucs_ilog2_u32(uint32_t n) { int bit; asm ("cntlzw %0,%1" : "=r" (bit) : "r" (n)); return 31 - bit; } -static inline unsigned __ucs_ilog2_u64(uint64_t n) +static UCS_F_ALWAYS_INLINE unsigned __ucs_ilog2_u64(uint64_t n) { int bit; asm ("cntlzd %0,%1" : "=r" (bit) : "r" (n)); return 63 - bit; } -static inline unsigned ucs_ffs64(uint64_t n) +static UCS_F_ALWAYS_INLINE unsigned ucs_ffs64(uint64_t n) { return __ucs_ilog2_u64(n & -n); } diff --git a/src/ucs/arch/ppc64/cpu.h b/src/ucs/arch/ppc64/cpu.h index 24e94243396..995f7854cce 100644 --- a/src/ucs/arch/ppc64/cpu.h +++ b/src/ucs/arch/ppc64/cpu.h @@ -1,7 +1,7 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2013. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016-2017. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -11,12 +11,14 @@ #define UCS_PPC64_CPU_H_ #include +#include #ifdef HAVE_SYS_PLATFORM_PPC_H # include #endif #include #include #include +#include BEGIN_C_DECLS @@ -28,7 +30,7 @@ BEGIN_C_DECLS #define ucs_memory_bus_fence() asm volatile ("sync"::: "memory") #define ucs_memory_bus_store_fence() ucs_memory_bus_fence() #define ucs_memory_bus_load_fence() ucs_memory_bus_fence() -#define ucs_memory_bus_wc_flush() +#define ucs_memory_bus_cacheline_wc_flush() #define ucs_memory_cpu_fence() ucs_memory_bus_fence() #define ucs_memory_cpu_store_fence() asm volatile ("lwsync \n" \ ::: "memory") @@ -54,11 +56,20 @@ static inline ucs_cpu_model_t ucs_arch_get_cpu_model() return UCS_CPU_MODEL_UNKNOWN; } +static inline ucs_cpu_vendor_t ucs_arch_get_cpu_vendor() +{ + return UCS_CPU_VENDOR_GENERIC_PPC; +} + static inline int ucs_arch_get_cpu_flag() { return UCS_CPU_FLAG_UNKNOWN; } +static inline void ucs_cpu_init() +{ +} + double ucs_arch_get_clocks_per_sec(); #define ucs_arch_wait_mem ucs_arch_generic_wait_mem @@ -75,6 +86,22 @@ static inline void ucs_arch_writeback_cache(void *start, void *end) ucs_memory_cpu_fence(); } +static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len) +{ + return memcpy(dst, src, len); +} + +static UCS_F_ALWAYS_INLINE void +ucs_memcpy_nontemporal(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); +} + +static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes) +{ + return UCS_ERR_UNSUPPORTED; +} + END_C_DECLS #endif diff --git a/src/ucs/arch/ppc64/global_opts.c b/src/ucs/arch/ppc64/global_opts.c new file mode 100644 index 00000000000..e2734378d0d --- /dev/null +++ b/src/ucs/arch/ppc64/global_opts.c @@ -0,0 +1,24 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#if defined(__powerpc64__) + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include + +ucs_config_field_t ucs_arch_global_opts_table[] = { + {NULL} +}; + +void ucs_arch_print_memcpy_limits(ucs_arch_global_opts_t *config) +{ +} + +#endif diff --git a/src/ucs/arch/ppc64/global_opts.h b/src/ucs/arch/ppc64/global_opts.h new file mode 100644 index 00000000000..225e4e5e896 --- /dev/null +++ b/src/ucs/arch/ppc64/global_opts.h @@ -0,0 +1,25 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + + +#ifndef UCS_PPC64_GLOBAL_OPTS_H_ +#define UCS_PPC64_GLOBAL_OPTS_H_ + +#include + +BEGIN_C_DECLS + +#define UCS_ARCH_GLOBAL_OPTS_INITALIZER {} + +/* built-in memcpy config */ +typedef struct ucs_arch_global_opts { + char dummy; +} ucs_arch_global_opts_t; + +END_C_DECLS + +#endif + diff --git a/src/ucs/arch/ppc64/timebase.c b/src/ucs/arch/ppc64/timebase.c index 8f3a6dd3365..b751addb648 100644 --- a/src/ucs/arch/ppc64/timebase.c +++ b/src/ucs/arch/ppc64/timebase.c @@ -5,6 +5,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #ifdef HAVE_SYS_PLATFORM_PPC_H # include diff --git a/src/ucs/arch/x86_64/bitops.h b/src/ucs/arch/x86_64/bitops.h index 76a34db5385..71f3c818585 100644 --- a/src/ucs/arch/x86_64/bitops.h +++ b/src/ucs/arch/x86_64/bitops.h @@ -7,10 +7,11 @@ #ifndef UCS_X86_64_BITOPS_H_ #define UCS_X86_64_BITOPS_H_ +#include #include -static inline unsigned ucs_ffs64(uint64_t n) +static UCS_F_ALWAYS_INLINE unsigned ucs_ffs64(uint64_t n) { uint64_t result; asm("bsfq %1,%0" @@ -19,7 +20,7 @@ static inline unsigned ucs_ffs64(uint64_t n) return result; } -static inline unsigned __ucs_ilog2_u32(uint32_t n) +static UCS_F_ALWAYS_INLINE unsigned __ucs_ilog2_u32(uint32_t n) { uint32_t result; asm("bsrl %1,%0" @@ -28,7 +29,7 @@ static inline unsigned __ucs_ilog2_u32(uint32_t n) return result; } -static inline unsigned __ucs_ilog2_u64(uint64_t n) +static UCS_F_ALWAYS_INLINE unsigned __ucs_ilog2_u64(uint64_t n) { uint64_t result; asm("bsrq %1,%0" diff --git a/src/ucs/arch/x86_64/cpu.c b/src/ucs/arch/x86_64/cpu.c index 6e8f185319f..605a4d6a577 100644 --- a/src/ucs/arch/x86_64/cpu.c +++ b/src/ucs/arch/x86_64/cpu.c @@ -1,40 +1,212 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. +* Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ #if defined(__x86_64__) +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include #include #include +#include +#define X86_CPUID_GENUINEINTEL "GenuntelineI" /* GenuineIntel in magic notation */ +#define X86_CPUID_AUTHENTICAMD "AuthcAMDenti" /* AuthenticAMD in magic notation */ #define X86_CPUID_GET_MODEL 0x00000001u #define X86_CPUID_GET_BASE_VALUE 0x00000000u #define X86_CPUID_GET_EXTD_VALUE 0x00000007u #define X86_CPUID_GET_MAX_VALUE 0x80000000u #define X86_CPUID_INVARIANT_TSC 0x80000007u +#define X86_CPUID_GET_CACHE_INFO 0x00000002u +#define X86_CPUID_GET_LEAF4_INFO 0x00000004u + +#define X86_CPU_CACHE_RESERVED 0x80000000 +#define X86_CPU_CACHE_TAG_L1_ONLY 0x40 +#define X86_CPU_CACHE_TAG_LEAF4 0xff + +#if defined (__SSE4_1__) +#define _mm_load(a) _mm_stream_load_si128((__m128i *) (a)) +#define _mm_store(a,v) _mm_storeu_si128((__m128i *) (a), (v)) +#endif + + +typedef enum ucs_x86_cpu_cache_type { + X86_CPU_CACHE_TYPE_DATA = 1, + X86_CPU_CACHE_TYPE_INSTRUCTION = 2, + X86_CPU_CACHE_TYPE_UNIFIED = 3 +} ucs_x86_cpu_cache_type_t; + +/* CPU version */ +typedef union ucs_x86_cpu_version { + struct { + unsigned stepping : 4; + unsigned model : 4; + unsigned family : 4; + unsigned type : 2; + unsigned unused : 2; + unsigned ext_model : 4; + unsigned ext_family : 8; + }; + uint32_t reg; +} UCS_S_PACKED ucs_x86_cpu_version_t; + +/* cache datatypes */ +typedef struct ucs_x86_cpu_cache_info { + unsigned level; + ucs_x86_cpu_cache_type_t type; +} UCS_S_PACKED ucs_x86_cpu_cache_info_t; + +typedef union ucs_x86_cache_line_reg_info { + uint32_t reg; + struct { + unsigned size : 12; + unsigned partitions : 10; + unsigned associativity : 10; + }; + struct { + unsigned type : 5; + unsigned level : 3; + }; +} UCS_S_PACKED ucs_x86_cache_line_reg_info_t; + +typedef union ucs_x86_cpu_registers { + struct { + union { + uint32_t eax; + uint8_t max_iter; /* leaf 2 - max iterations */ + }; + union { + struct { + uint32_t ebx; + uint32_t ecx; + uint32_t edx; + }; + char 
id[sizeof(uint32_t) * 3]; /* leaf 0 - CPU ID */ + }; + }; + union { + uint32_t value; + uint8_t tag[sizeof(uint32_t)]; + } reg[4]; /* leaf 2 tags */ +} UCS_S_PACKED ucs_x86_cpu_registers; + +typedef struct ucs_x86_cpu_cache_size_codes { + ucs_cpu_cache_type_t type; + size_t size; +} ucs_x86_cpu_cache_size_codes_t; ucs_ternary_value_t ucs_arch_x86_enable_rdtsc = UCS_TRY; +static const ucs_x86_cpu_cache_info_t x86_cpu_cache[] = { + [UCS_CPU_CACHE_L1d] = {.level = 1, .type = X86_CPU_CACHE_TYPE_DATA}, + [UCS_CPU_CACHE_L1i] = {.level = 1, .type = X86_CPU_CACHE_TYPE_INSTRUCTION}, + [UCS_CPU_CACHE_L2] = {.level = 2, .type = X86_CPU_CACHE_TYPE_UNIFIED}, + [UCS_CPU_CACHE_L3] = {.level = 3, .type = X86_CPU_CACHE_TYPE_UNIFIED} +}; + +static const ucs_x86_cpu_cache_size_codes_t ucs_x86_cpu_cache_size_codes[] = { + [0x06] = {.type = UCS_CPU_CACHE_L1i, .size = 8192 }, + [0x08] = {.type = UCS_CPU_CACHE_L1i, .size = 16384 }, + [0x09] = {.type = UCS_CPU_CACHE_L1i, .size = 32768 }, + [0x0a] = {.type = UCS_CPU_CACHE_L1d, .size = 8192 }, + [0x0c] = {.type = UCS_CPU_CACHE_L1d, .size = 16384 }, + [0x0d] = {.type = UCS_CPU_CACHE_L1d, .size = 16384 }, + [0x0e] = {.type = UCS_CPU_CACHE_L1d, .size = 24576 }, + [0x21] = {.type = UCS_CPU_CACHE_L2, .size = 262144 }, + [0x22] = {.type = UCS_CPU_CACHE_L3, .size = 524288 }, + [0x23] = {.type = UCS_CPU_CACHE_L3, .size = 1048576 }, + [0x25] = {.type = UCS_CPU_CACHE_L3, .size = 2097152 }, + [0x29] = {.type = UCS_CPU_CACHE_L3, .size = 4194304 }, + [0x2c] = {.type = UCS_CPU_CACHE_L1d, .size = 32768 }, + [0x30] = {.type = UCS_CPU_CACHE_L1i, .size = 32768 }, + [0x39] = {.type = UCS_CPU_CACHE_L2, .size = 131072 }, + [0x3a] = {.type = UCS_CPU_CACHE_L2, .size = 196608 }, + [0x3b] = {.type = UCS_CPU_CACHE_L2, .size = 131072 }, + [0x3c] = {.type = UCS_CPU_CACHE_L2, .size = 262144 }, + [0x3d] = {.type = UCS_CPU_CACHE_L2, .size = 393216 }, + [0x3e] = {.type = UCS_CPU_CACHE_L2, .size = 524288 }, + [0x3f] = {.type = UCS_CPU_CACHE_L2, .size = 262144 }, + [0x41] = 
{.type = UCS_CPU_CACHE_L2, .size = 131072 }, + [0x42] = {.type = UCS_CPU_CACHE_L2, .size = 262144 }, + [0x43] = {.type = UCS_CPU_CACHE_L2, .size = 524288 }, + [0x44] = {.type = UCS_CPU_CACHE_L2, .size = 1048576 }, + [0x45] = {.type = UCS_CPU_CACHE_L2, .size = 2097152 }, + [0x46] = {.type = UCS_CPU_CACHE_L3, .size = 4194304 }, + [0x47] = {.type = UCS_CPU_CACHE_L3, .size = 8388608 }, + [0x48] = {.type = UCS_CPU_CACHE_L2, .size = 3145728 }, + [0x49] = {.type = UCS_CPU_CACHE_L2, .size = 4194304 }, + [0x4a] = {.type = UCS_CPU_CACHE_L3, .size = 6291456 }, + [0x4b] = {.type = UCS_CPU_CACHE_L3, .size = 8388608 }, + [0x4c] = {.type = UCS_CPU_CACHE_L3, .size = 12582912 }, + [0x4d] = {.type = UCS_CPU_CACHE_L3, .size = 16777216 }, + [0x4e] = {.type = UCS_CPU_CACHE_L2, .size = 6291456 }, + [0x60] = {.type = UCS_CPU_CACHE_L1d, .size = 16384 }, + [0x66] = {.type = UCS_CPU_CACHE_L1d, .size = 8192 }, + [0x67] = {.type = UCS_CPU_CACHE_L1d, .size = 16384 }, + [0x68] = {.type = UCS_CPU_CACHE_L1d, .size = 32768 }, + [0x78] = {.type = UCS_CPU_CACHE_L2, .size = 1048576 }, + [0x79] = {.type = UCS_CPU_CACHE_L2, .size = 131072 }, + [0x7a] = {.type = UCS_CPU_CACHE_L2, .size = 262144 }, + [0x7b] = {.type = UCS_CPU_CACHE_L2, .size = 524288 }, + [0x7c] = {.type = UCS_CPU_CACHE_L2, .size = 1048576 }, + [0x7d] = {.type = UCS_CPU_CACHE_L2, .size = 2097152 }, + [0x7f] = {.type = UCS_CPU_CACHE_L2, .size = 524288 }, + [0x80] = {.type = UCS_CPU_CACHE_L2, .size = 524288 }, + [0x82] = {.type = UCS_CPU_CACHE_L2, .size = 262144 }, + [0x83] = {.type = UCS_CPU_CACHE_L2, .size = 524288 }, + [0x84] = {.type = UCS_CPU_CACHE_L2, .size = 1048576 }, + [0x85] = {.type = UCS_CPU_CACHE_L2, .size = 2097152 }, + [0x86] = {.type = UCS_CPU_CACHE_L2, .size = 524288 }, + [0x87] = {.type = UCS_CPU_CACHE_L2, .size = 1048576 }, + [0xd0] = {.type = UCS_CPU_CACHE_L3, .size = 524288 }, + [0xd1] = {.type = UCS_CPU_CACHE_L3, .size = 1048576 }, + [0xd2] = {.type = UCS_CPU_CACHE_L3, .size = 2097152 }, + [0xd6] = {.type = 
UCS_CPU_CACHE_L3, .size = 1048576 }, + [0xd7] = {.type = UCS_CPU_CACHE_L3, .size = 2097152 }, + [0xd8] = {.type = UCS_CPU_CACHE_L3, .size = 4194304 }, + [0xdc] = {.type = UCS_CPU_CACHE_L3, .size = 2097152 }, + [0xdd] = {.type = UCS_CPU_CACHE_L3, .size = 4194304 }, + [0xde] = {.type = UCS_CPU_CACHE_L3, .size = 8388608 }, + [0xe2] = {.type = UCS_CPU_CACHE_L3, .size = 2097152 }, + [0xe3] = {.type = UCS_CPU_CACHE_L3, .size = 4194304 }, + [0xe4] = {.type = UCS_CPU_CACHE_L3, .size = 8388608 }, + [0xea] = {.type = UCS_CPU_CACHE_L3, .size = 12582912 }, + [0xeb] = {.type = UCS_CPU_CACHE_L3, .size = 18874368 }, + [0xec] = {.type = UCS_CPU_CACHE_L3, .size = 25165824 } +}; + + static UCS_F_NOOPTIMIZE inline void ucs_x86_cpuid(uint32_t level, - uint32_t *a, uint32_t *b, - uint32_t *c, uint32_t *d) + uint32_t *a, uint32_t *b, + uint32_t *c, uint32_t *d) +{ + asm volatile ("cpuid\n\t" + : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d) + : "0"(level)); +} + +static UCS_F_NOOPTIMIZE inline void ucs_x86_cpuid_ecx(uint32_t level, uint32_t ecx, + uint32_t *a, uint32_t *b, + uint32_t *c, uint32_t *d) { - asm volatile ("cpuid\n\t" - : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d) - : "0" (level)); + asm volatile("cpuid" + : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d) + : "0"(level), "2"(ecx)); } /* This allows the CPU detection to work with assemblers not supporting * the xgetbv mnemonic. These include clang and some BSD versions. 
*/ #define ucs_x86_xgetbv(_index, _eax, _edx) \ - asm volatile (".byte 0x0f, 0x01, 0xd0" : "=a"(_eax), "=d"(_edx) : "c" (_index)) + asm volatile (".byte 0x0f, 0x01, 0xd0" : "=a"(_eax), "=d"(_edx) : "c" (_index)) static int ucs_x86_invariant_tsc() { @@ -148,24 +320,22 @@ double ucs_arch_get_clocks_per_sec() ucs_cpu_model_t ucs_arch_get_cpu_model() { - uint32_t _eax, _ebx, _ecx, _edx; + ucs_x86_cpu_version_t version; + uint32_t _ebx, _ecx, _edx; uint32_t model, family; - uint32_t ext_model, ext_family; /* Get CPU model/family */ - ucs_x86_cpuid(X86_CPUID_GET_MODEL, &_eax, &_ebx, &_ecx, &_edx); + ucs_x86_cpuid(X86_CPUID_GET_MODEL, ucs_unaligned_ptr(&version.reg), &_ebx, &_ecx, &_edx); - model = (_eax >> 4) & UCS_MASK(8 - 4 ); - family = (_eax >> 8) & UCS_MASK(12 - 8 ); - ext_model = (_eax >> 16) & UCS_MASK(20 - 16); - ext_family = (_eax >> 20) & UCS_MASK(28 - 20); + model = version.model; + family = version.family; /* Adjust family/model */ if (family == 0xf) { - family += ext_family; + family += version.ext_family; } - if (family == 0x6 || family == 0xf) { - model = (ext_model << 4) | model; + if ((family == 0x6) || (family == 0xf) || (family == 0x17)) { + model = (version.ext_model << 4) | model; } /* Check known CPUs */ @@ -203,6 +373,14 @@ ucs_cpu_model_t ucs_arch_get_cpu_model() } } + if (family == 0x17) { + switch (model) { + case 0x29: + return UCS_CPU_MODEL_AMD_NAPLES; + case 0x31: + return UCS_CPU_MODEL_AMD_ROME; + } + } return UCS_CPU_MODEL_UNKNOWN; } @@ -260,9 +438,6 @@ int ucs_arch_get_cpu_flag() if ((result & UCS_CPU_FLAG_AVX) && (_ebx & (1 << 5))) { result |= UCS_CPU_FLAG_AVX2; } - if (_ebx & (1 << 24)) { - result |= UCS_CPU_FLAG_CLWB; - } } cpu_flag = result; } @@ -270,4 +445,211 @@ int ucs_arch_get_cpu_flag() return cpu_flag; } +ucs_cpu_vendor_t ucs_arch_get_cpu_vendor() +{ + ucs_x86_cpu_registers reg; + + ucs_x86_cpuid(X86_CPUID_GET_BASE_VALUE, + ucs_unaligned_ptr(®.eax), ucs_unaligned_ptr(®.ebx), + ucs_unaligned_ptr(®.ecx), ucs_unaligned_ptr(®.edx)); 
+ if (!memcmp(reg.id, X86_CPUID_GENUINEINTEL, sizeof(X86_CPUID_GENUINEINTEL) - 1)) { + return UCS_CPU_VENDOR_INTEL; + } else if (!memcmp(reg.id, X86_CPUID_AUTHENTICAMD, sizeof(X86_CPUID_AUTHENTICAMD) - 1)) { + return UCS_CPU_VENDOR_AMD; + } + + return UCS_CPU_VENDOR_UNKNOWN; +} + +#if ENABLE_BUILTIN_MEMCPY +static size_t ucs_cpu_memcpy_thresh(size_t user_val, size_t auto_val) +{ + if (user_val != UCS_MEMUNITS_AUTO) { + return user_val; + } + + if (((ucs_arch_get_cpu_vendor() == UCS_CPU_VENDOR_INTEL) && + (ucs_arch_get_cpu_model() >= UCS_CPU_MODEL_INTEL_HASWELL)) || + (ucs_arch_get_cpu_vendor() == UCS_CPU_VENDOR_AMD)) { + return auto_val; + } else { + return UCS_MEMUNITS_INF; + } +} +#endif + +void ucs_cpu_init() +{ +#if ENABLE_BUILTIN_MEMCPY + ucs_global_opts.arch.builtin_memcpy_min = + ucs_cpu_memcpy_thresh(ucs_global_opts.arch.builtin_memcpy_min, + ucs_cpu_builtin_memcpy[ucs_arch_get_cpu_vendor()].min); + ucs_global_opts.arch.builtin_memcpy_max = + ucs_cpu_memcpy_thresh(ucs_global_opts.arch.builtin_memcpy_max, + ucs_cpu_builtin_memcpy[ucs_arch_get_cpu_vendor()].max); +#endif +} + +ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes) +{ + ucs_x86_cache_line_reg_info_t cache_info; + ucs_x86_cache_line_reg_info_t line_info; + ucs_x86_cpu_registers reg; + uint32_t sets; + uint32_t i, t, r, l4; + uint32_t max_iter; + size_t c; + int level1_only; /* level 1 cache only supported */ + int tag; + int cache_count; + ucs_cpu_cache_type_t type; + + /* Get CPU ID and vendor - it will reset cache iteration sequence */ + if (ucs_arch_get_cpu_vendor() != UCS_CPU_VENDOR_INTEL) { + return UCS_ERR_UNSUPPORTED; + } + + ucs_x86_cpuid(X86_CPUID_GET_BASE_VALUE, + ucs_unaligned_ptr(®.eax), ucs_unaligned_ptr(®.ebx), + ucs_unaligned_ptr(®.ecx), ucs_unaligned_ptr(®.edx)); + if (reg.eax < X86_CPUID_GET_CACHE_INFO) { + return UCS_ERR_UNSUPPORTED; + } + + level1_only = 0; + cache_count = 0; + + for (i = 0, max_iter = 1; i < max_iter; i++) { + ucs_x86_cpuid(X86_CPUID_GET_CACHE_INFO, + 
ucs_unaligned_ptr(®.eax), ucs_unaligned_ptr(®.ebx), + ucs_unaligned_ptr(®.ecx), ucs_unaligned_ptr(®.edx)); + + if (i == 0) { /* on first iteration get max iteration number */ + max_iter = reg.max_iter; + reg.max_iter = 0; /* mask iteration register from processing */ + } + + for (r = 0; r < ucs_array_size(reg.reg); r++) { + if (ucs_test_all_flags(reg.reg[r].value, X86_CPU_CACHE_RESERVED)) { + continue; + } + + for (t = 0; (t < ucs_array_size(reg.reg[r].tag)) && (reg.reg[r].tag[t] != 0); t++) { + tag = reg.reg[r].tag[t]; + + switch(tag) { + case X86_CPU_CACHE_TAG_L1_ONLY: + level1_only = 1; + break; + case X86_CPU_CACHE_TAG_LEAF4: + for (l4 = 0; cache_count < UCS_CPU_CACHE_LAST; l4++) { + ucs_x86_cpuid_ecx(X86_CPUID_GET_LEAF4_INFO, l4, + ucs_unaligned_ptr(&cache_info.reg), + ucs_unaligned_ptr(&line_info.reg), + &sets, ucs_unaligned_ptr(®.edx)); + + if (cache_info.type == 0) { + /* we are done - nothing found, go to next register */ + break; + } + + for (c = 0; c < UCS_CPU_CACHE_LAST; c++) { + if ((cache_info.level == x86_cpu_cache[c].level) && + (cache_info.type == x86_cpu_cache[c].type)) { + /* found it */ + /* cache entry is not updated yet */ + /* and cache level is 1 or all levels are supported */ + if (!((cache_sizes[c] == 0) && + ((x86_cpu_cache[c].level == 1) || !level1_only))) { + break; + } + + cache_sizes[c] = (line_info.associativity + 1) * + (line_info.partitions + 1) * + (line_info.size + 1) * + (sets + 1); + cache_count++; + } + } + } + return cache_count == UCS_CPU_CACHE_LAST ? 
UCS_OK : UCS_ERR_UNSUPPORTED; + default: + if ((tag >= ucs_array_size(ucs_x86_cpu_cache_size_codes)) || + (ucs_x86_cpu_cache_size_codes[tag].size != 0)) { + break; /* tag is out of table or in empty entry */ + } + + type = ucs_x86_cpu_cache_size_codes[tag].type; + if (cache_sizes[type] != 0) { /* cache is filled already */ + break; + } + + cache_sizes[type] = ucs_x86_cpu_cache_size_codes[tag].size; + cache_count++; + break; + } + } + } + } + + return cache_count == UCS_CPU_CACHE_LAST ? UCS_OK : UCS_ERR_UNSUPPORTED; +} + +void ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len) +{ +#if defined (__SSE4_1__) + /* Copy unaligned portion of src */ + if ((uintptr_t)src & 15) { + uintptr_t aligned = (uintptr_t)src & ~15; + uintptr_t misalign = (uintptr_t)src & 15; + uintptr_t copy = ucs_min(len, 16 - misalign); + + __m128i tmp = _mm_load(aligned); + memcpy(dst, UCS_PTR_BYTE_OFFSET(&tmp, misalign), copy); + + src = UCS_PTR_BYTE_OFFSET(src, copy); + dst = UCS_PTR_BYTE_OFFSET(dst, copy); + len -= copy; + } + + /* Copy 64 bytes at a time */ + while (len >= 64) { + __m128i *S = (__m128i *)src; + __m128i *D = (__m128i *)dst; + __m128i tmp[4]; + + tmp[0] = _mm_load(S + 0); + tmp[1] = _mm_load(S + 1); + tmp[2] = _mm_load(S + 2); + tmp[3] = _mm_load(S + 3); + + _mm_store(D + 0, tmp[0]); + _mm_store(D + 1, tmp[1]); + _mm_store(D + 2, tmp[2]); + _mm_store(D + 3, tmp[3]); + + src = UCS_PTR_BYTE_OFFSET(src, 64); + dst = UCS_PTR_BYTE_OFFSET(dst, 64); + len -= 64; + } + + /* Copy 16 bytes at a time */ + while (len >= 16) { + _mm_store(dst, _mm_load(src)); + + src = UCS_PTR_BYTE_OFFSET(src, 16); + dst = UCS_PTR_BYTE_OFFSET(dst, 16); + len -= 16; + } + + /* Copy any remaining bytes */ + if (len) { + __m128i tmp = _mm_load(src); + memcpy(dst, &tmp, len); + } +#else + memcpy(dst, src, len); +#endif +} + #endif diff --git a/src/ucs/arch/x86_64/cpu.h b/src/ucs/arch/x86_64/cpu.h index 24b06d91832..687ae225164 100644 --- a/src/ucs/arch/x86_64/cpu.h +++ 
b/src/ucs/arch/x86_64/cpu.h @@ -1,7 +1,7 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2013. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016-2017. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -13,8 +13,10 @@ #include #include #include +#include #include #include +#include #ifdef __SSE4_1__ # include @@ -36,7 +38,7 @@ BEGIN_C_DECLS #define ucs_memory_bus_fence() asm volatile ("mfence"::: "memory") #define ucs_memory_bus_store_fence() asm volatile ("sfence" ::: "memory") #define ucs_memory_bus_load_fence() asm volatile ("lfence" ::: "memory") -#define ucs_memory_bus_wc_flush() +#define ucs_memory_bus_cacheline_wc_flush() #define ucs_memory_cpu_fence() ucs_compiler_fence() #define ucs_memory_cpu_store_fence() ucs_compiler_fence() #define ucs_memory_cpu_load_fence() ucs_compiler_fence() @@ -49,6 +51,10 @@ double ucs_x86_init_tsc_freq(); ucs_cpu_model_t ucs_arch_get_cpu_model() UCS_F_NOOPTIMIZE; ucs_cpu_flag_t ucs_arch_get_cpu_flag() UCS_F_NOOPTIMIZE; +ucs_cpu_vendor_t ucs_arch_get_cpu_vendor(); +void ucs_cpu_init(); +ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes); +void ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len); static inline int ucs_arch_x86_rdtsc_enabled() { @@ -96,6 +102,31 @@ static inline void ucs_arch_writeback_cache(void *start, void *end) #endif } +static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len) +{ +#if ENABLE_BUILTIN_MEMCPY + if (ucs_unlikely((len > ucs_global_opts.arch.builtin_memcpy_min) && + (len < ucs_global_opts.arch.builtin_memcpy_max))) { + asm volatile ("rep movsb" + : "=D" (dst), + "=S" (src), + "=c" (len) + : "0" (dst), + "1" (src), + "2" (len) + : "memory"); + return dst; + } +#endif + return memcpy(dst, src, len); +} + +static UCS_F_ALWAYS_INLINE void +ucs_memcpy_nontemporal(void *dst, const void 
*src, size_t len) +{ + ucs_x86_memcpy_sse_movntdqa(dst, src, len); +} + END_C_DECLS #endif diff --git a/src/ucs/arch/x86_64/global_opts.c b/src/ucs/arch/x86_64/global_opts.c new file mode 100644 index 00000000000..6dca6047561 --- /dev/null +++ b/src/ucs/arch/x86_64/global_opts.c @@ -0,0 +1,44 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#if defined(__x86_64__) + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include + +ucs_config_field_t ucs_arch_global_opts_table[] = { +#if ENABLE_BUILTIN_MEMCPY + {"BUILTIN_MEMCPY_MIN", "auto", + "Minimal threshold of buffer length for using built-in memcpy.", + ucs_offsetof(ucs_arch_global_opts_t, builtin_memcpy_min), UCS_CONFIG_TYPE_MEMUNITS}, + + {"BUILTIN_MEMCPY_MAX", "auto", + "Maximal threshold of buffer length for using built-in memcpy.", + ucs_offsetof(ucs_arch_global_opts_t, builtin_memcpy_max), UCS_CONFIG_TYPE_MEMUNITS}, +#endif + {NULL} +}; + + +void ucs_arch_print_memcpy_limits(ucs_arch_global_opts_t *config) +{ +#if ENABLE_BUILTIN_MEMCPY + char min_thresh_str[32]; + char max_thresh_str[32]; + + ucs_config_sprintf_memunits(min_thresh_str, sizeof(min_thresh_str), + &config->builtin_memcpy_min, NULL); + ucs_config_sprintf_memunits(max_thresh_str, sizeof(max_thresh_str), + &config->builtin_memcpy_max, NULL); + printf("# Using built-in memcpy() for size %s..%s\n", min_thresh_str, max_thresh_str); +#endif +} + +#endif diff --git a/src/ucs/arch/x86_64/global_opts.h b/src/ucs/arch/x86_64/global_opts.h new file mode 100644 index 00000000000..54892aebdcd --- /dev/null +++ b/src/ucs/arch/x86_64/global_opts.h @@ -0,0 +1,29 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifndef UCS_X86_64_GLOBAL_OPTS_H_ +#define UCS_X86_64_GLOBAL_OPTS_H_ + +#include + +#include + +BEGIN_C_DECLS + +#define UCS_ARCH_GLOBAL_OPTS_INITALIZER { \ + .builtin_memcpy_min = UCS_MEMUNITS_AUTO, \ + .builtin_memcpy_max = UCS_MEMUNITS_AUTO \ +} + +/* built-in memcpy config */ +typedef struct ucs_arch_global_opts { + size_t builtin_memcpy_min; + size_t builtin_memcpy_max; +} ucs_arch_global_opts_t; + +END_C_DECLS + +#endif diff --git a/src/ucs/async/async.c b/src/ucs/async/async.c index 7e2ff640ff2..b15dfce0f22 100644 --- a/src/ucs/async/async.c +++ b/src/ucs/async/async.c @@ -4,19 +4,29 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "async_int.h" #include #include #include -#include +#include + +#define UCS_ASYNC_TIMER_ID_MIN 1000000u +#define UCS_ASYNC_TIMER_ID_MAX 2000000u -#define UCS_ASYNC_TIMER_ID_MIN 1000000u -#define UCS_ASYNC_TIMER_ID_MAX 2000000u +#define UCS_ASYNC_HANDLER_FMT "%p [id=%d ref %d] %s()" +#define UCS_ASYNC_HANDLER_ARG(_h) (_h), (_h)->id, (_h)->refcount, \ + ucs_debug_get_symbol_name((_h)->cb) -#define UCS_ASYNC_HANDLER_FMT "%p [id=%d] %s()" -#define UCS_ASYNC_HANDLER_ARG(_h) (_h), (_h)->id, ucs_debug_get_symbol_name((_h)->cb) +#define UCS_ASYNC_HANDLER_CALLER_NULL ((pthread_t)-1) + +#define UCS_ASYNC_MISSED_QUEUE_SHIFT 32 +#define UCS_ASYNC_MISSED_QUEUE_MASK UCS_MASK(UCS_ASYNC_MISSED_QUEUE_SHIFT) /* Hash table for all event and timer handlers */ KHASH_MAP_INIT_INT(ucs_async_handler, ucs_async_handler_t *); @@ -87,6 +97,18 @@ static inline int ucs_async_handler_kh_is_end(khiter_t hash_it) return hash_it == kh_end(&ucs_async_global_context.handlers); } +static inline uint64_t ucs_async_missed_event_pack(int id, int events) +{ + return ((uint64_t)id << UCS_ASYNC_MISSED_QUEUE_SHIFT) | (uint32_t)events; +} + +static inline void ucs_async_missed_event_unpack(uint64_t value, int *id_p, + int *events_p) +{ + *id_p = value >> UCS_ASYNC_MISSED_QUEUE_SHIFT; + *events_p = value & 
UCS_ASYNC_MISSED_QUEUE_MASK; +} + static void ucs_async_handler_hold(ucs_async_handler_t *handler) { ucs_atomic_add32(&handler->refcount, 1); @@ -140,7 +162,7 @@ static ucs_async_handler_t *ucs_async_handler_extract(int id) /* decrement reference count and release the handler if reached 0 */ static void ucs_async_handler_put(ucs_async_handler_t *handler) { - if (ucs_atomic_fadd32(&handler->refcount, -1) > 1) { + if (ucs_atomic_fsub32(&handler->refcount, 1) > 1) { return; } @@ -153,9 +175,9 @@ static void ucs_async_handler_put(ucs_async_handler_t *handler) static ucs_status_t ucs_async_handler_add(int min_id, int max_id, ucs_async_handler_t *handler) { + khiter_t hash_it = 0; int hash_extra_status; ucs_status_t status; - khiter_t hash_it; int i, id; pthread_rwlock_wrlock(&ucs_async_global_context.handlers_lock); @@ -202,31 +224,48 @@ static ucs_status_t ucs_async_handler_add(int min_id, int max_id, return status; } -static ucs_status_t ucs_async_handler_dispatch(ucs_async_handler_t *handler) +static void ucs_async_handler_invoke(ucs_async_handler_t *handler, int events) +{ + ucs_trace_async("calling async handler " UCS_ASYNC_HANDLER_FMT, + UCS_ASYNC_HANDLER_ARG(handler)); + + /* track call count to allow removing the handler synchronously from itself + * the handler must always be called with async context blocked, so no need + * for atomic operations here. 
+ */ + ucs_assert(handler->caller == UCS_ASYNC_HANDLER_CALLER_NULL); + handler->caller = pthread_self(); + handler->cb(handler->id, events, handler->arg); + handler->caller = UCS_ASYNC_HANDLER_CALLER_NULL; +} + +static ucs_status_t ucs_async_handler_dispatch(ucs_async_handler_t *handler, + int events) { ucs_async_context_t *async; ucs_async_mode_t mode; ucs_status_t status; + uint64_t value; mode = handler->mode; async = handler->async; - if (async != NULL) { - async->last_wakeup = ucs_get_time(); - } + if (async == NULL) { - ucs_trace_async("calling async handler " UCS_ASYNC_HANDLER_FMT, - UCS_ASYNC_HANDLER_ARG(handler)); - handler->cb(handler->id, handler->arg); - } else if (ucs_async_method_call(mode, context_try_block, async)) { - ucs_trace_async("calling async handler " UCS_ASYNC_HANDLER_FMT, - UCS_ASYNC_HANDLER_ARG(handler)); - handler->cb(handler->id, handler->arg); + ucs_async_handler_invoke(handler, events); + return UCS_OK; + } + + async->last_wakeup = ucs_get_time(); + if (ucs_async_method_call(mode, context_try_block, async)) { + ucs_async_handler_invoke(handler, events); ucs_async_method_call(mode, context_unblock, async); - } else /* async != NULL */ { + } else { ucs_trace_async("missed " UCS_ASYNC_HANDLER_FMT ", last_wakeup %lu", UCS_ASYNC_HANDLER_ARG(handler), async->last_wakeup); if (ucs_atomic_cswap32(&handler->missed, 0, 1) == 0) { - status = ucs_mpmc_queue_push(&async->missed, handler->id); + /* save both the handler_id and events */ + value = ucs_async_missed_event_pack(handler->id, events); + status = ucs_mpmc_queue_push(&async->missed, value); if (status != UCS_OK) { ucs_fatal("Failed to push event %d to miss queue: %s", handler->id, ucs_status_string(status)); @@ -237,19 +276,20 @@ static ucs_status_t ucs_async_handler_dispatch(ucs_async_handler_t *handler) return UCS_OK; } -ucs_status_t ucs_async_dispatch_handlers(int *events, size_t count) +ucs_status_t ucs_async_dispatch_handlers(int *handler_ids, size_t count, + int events) { 
ucs_status_t status = UCS_OK, tmp_status; ucs_async_handler_t *handler; - for (; count > 0; --count, ++events) { - handler = ucs_async_handler_get(*events); + for (; count > 0; --count, ++handler_ids) { + handler = ucs_async_handler_get(*handler_ids); if (handler == NULL) { - ucs_trace_async("handler for %d not found - ignoring", *events); + ucs_trace_async("handler for %d not found - ignoring", *handler_ids); continue; } - tmp_status = ucs_async_handler_dispatch(handler); + tmp_status = ucs_async_handler_dispatch(handler, events); if (tmp_status != UCS_OK) { status = tmp_status; } @@ -276,7 +316,8 @@ ucs_status_t ucs_async_dispatch_timerq(ucs_timer_queue_t *timerq, } }) - return ucs_async_dispatch_handlers(expired_timers, num_timers); + return ucs_async_dispatch_handlers(expired_timers, num_timers, + UCS_ASYNC_EVENT_DUMMY); } ucs_status_t ucs_async_context_init(ucs_async_context_t *async, ucs_async_mode_t mode) @@ -342,9 +383,8 @@ void ucs_async_context_cleanup(ucs_async_context_t *async) pthread_rwlock_rdlock(&ucs_async_global_context.handlers_lock); kh_foreach_value(&ucs_async_global_context.handlers, handler, { if (async == handler->async) { - ucs_warn("async %p handler "UCS_ASYNC_HANDLER_FMT" %s() not released", - async, UCS_ASYNC_HANDLER_ARG(handler), - ucs_debug_get_symbol_name(handler->cb)); + ucs_warn("async %p handler "UCS_ASYNC_HANDLER_FMT" not released", + async, UCS_ASYNC_HANDLER_ARG(handler)); } }); ucs_warn("releasing async context with %d handlers", async->num_handlers); @@ -380,7 +420,7 @@ ucs_async_alloc_handler(int min_id, int max_id, ucs_async_mode_t mode, /* Limit amount of handlers per context */ if (async != NULL) { - if (ucs_atomic_fadd32(&async->num_handlers, +1) >= ucs_global_opts.async_max_events) { + if (ucs_atomic_fadd32(&async->num_handlers, 1) >= ucs_global_opts.async_max_events) { status = UCS_ERR_EXCEEDS_LIMIT; goto err_dec_num_handlers; } @@ -394,6 +434,7 @@ ucs_async_alloc_handler(int min_id, int max_id, ucs_async_mode_t mode, 
handler->mode = mode; handler->events = events; + handler->caller = UCS_ASYNC_HANDLER_CALLER_NULL; handler->cb = cb; handler->arg = arg; handler->async = async; @@ -414,7 +455,7 @@ ucs_async_alloc_handler(int min_id, int max_id, ucs_async_mode_t mode, ucs_free(handler); err_dec_num_handlers: if (async != NULL) { - ucs_atomic_add32(&async->num_handlers, -1); + ucs_atomic_add32(&async->num_handlers, (uint32_t)-1); } err: return status; @@ -482,7 +523,7 @@ ucs_status_t ucs_async_add_timer(ucs_async_mode_t mode, ucs_time_t interval, return status; } -ucs_status_t ucs_async_remove_handler(int id, int sync) +ucs_status_t ucs_async_remove_handler(int id, int is_sync) { ucs_async_handler_t *handler; ucs_status_t status; @@ -513,11 +554,14 @@ ucs_status_t ucs_async_remove_handler(int id, int sync) } if (handler->async != NULL) { - ucs_atomic_add32(&handler->async->num_handlers, -1); + ucs_atomic_add32(&handler->async->num_handlers, (uint32_t)-1); } - if (sync) { - while (handler->refcount > 1) { + if (is_sync) { + int called = (pthread_self() == handler->caller); + ucs_trace("waiting for " UCS_ASYNC_HANDLER_FMT " completion (called=%d)", + UCS_ASYNC_HANDLER_ARG(handler), called); + while ((handler->refcount - called) > 1) { /* TODO use pthread_cond / futex to reduce CPU usage while waiting * for the async handler to complete */ sched_yield(); @@ -537,7 +581,10 @@ ucs_status_t ucs_async_modify_handler(int fd, int events) return UCS_ERR_INVALID_PARAM; } + ucs_async_method_call_all(block); handler = ucs_async_handler_get(fd); + ucs_async_method_call_all(unblock); + if (handler == NULL) { return UCS_ERR_NO_ELEM; } @@ -553,8 +600,9 @@ ucs_status_t ucs_async_modify_handler(int fd, int events) void __ucs_async_poll_missed(ucs_async_context_t *async) { ucs_async_handler_t *handler; + int handler_id, events; ucs_status_t status; - uint32_t value; + uint64_t value; ucs_trace_async("miss handler"); @@ -568,20 +616,17 @@ void __ucs_async_poll_missed(ucs_async_context_t *async) } 
ucs_async_method_call_all(block); - handler = ucs_async_handler_get(value); + UCS_ASYNC_BLOCK(async); + + ucs_async_missed_event_unpack(value, &handler_id, &events); + handler = ucs_async_handler_get(handler_id); if (handler != NULL) { - ucs_trace_async("calling missed async handler " UCS_ASYNC_HANDLER_FMT, - UCS_ASYNC_HANDLER_ARG(handler)); - if (handler->async) { - UCS_ASYNC_BLOCK(handler->async); - } + ucs_assert(handler->async == async); handler->missed = 0; - handler->cb(handler->id, handler->arg); - if (handler->async) { - UCS_ASYNC_UNBLOCK(handler->async); - } + ucs_async_handler_invoke(handler, events); ucs_async_handler_put(handler); } + UCS_ASYNC_UNBLOCK(async); ucs_async_method_call_all(unblock); } } @@ -608,7 +653,8 @@ void ucs_async_poll(ucs_async_context_t *async) pthread_rwlock_unlock(&ucs_async_global_context.handlers_lock); for (i = 0; i < n; ++i) { - ucs_async_handler_dispatch(handlers[i]); + /* dispatch the handler with all the registered events */ + ucs_async_handler_dispatch(handlers[i], handlers[i]->events); ucs_async_handler_put(handlers[i]); } } @@ -630,8 +676,8 @@ void ucs_async_global_cleanup() { int num_elems = kh_size(&ucs_async_global_context.handlers); if (num_elems != 0) { - ucs_info("async handler table is not empty during exit (contains %d elems)", - num_elems); + ucs_debug("async handler table is not empty during exit (contains %d elems)", + num_elems); } ucs_async_method_call_all(cleanup); kh_destroy_inplace(ucs_async_handler, &ucs_async_global_context.handlers); diff --git a/src/ucs/async/async.h b/src/ucs/async/async.h index 02188659bf4..e324574f4cc 100644 --- a/src/ucs/async/async.h +++ b/src/ucs/async/async.h @@ -20,6 +20,9 @@ BEGIN_C_DECLS /** @file async.h */ +#define UCS_ASYNC_EVENT_DUMMY 0 + + /** * Async event context. Manages timer and fd notifications. */ @@ -53,7 +56,8 @@ void ucs_async_global_cleanup(); * This can be used to ensure safe event delivery. * * @param async Event context to initialize. 
- * @param mode Either to use signals or epoll threads to wait. + * @param mode Indicates whether to use signals or polling threads + * for waiting. * * @return Error code as defined by @ref ucs_status_t. */ @@ -97,7 +101,7 @@ static inline int ucs_async_check_miss(ucs_async_context_t *async) #define UCS_ASYNC_BLOCK(_async) \ do { \ if ((_async)->mode == UCS_ASYNC_MODE_THREAD_SPINLOCK) { \ - ucs_spin_lock(&(_async)->thread.spinlock); \ + ucs_recursive_spin_lock(&(_async)->thread.spinlock); \ } else if ((_async)->mode == UCS_ASYNC_MODE_THREAD_MUTEX) { \ (void)pthread_mutex_lock(&(_async)->thread.mutex); \ } else if ((_async)->mode == UCS_ASYNC_MODE_SIGNAL) { \ @@ -116,7 +120,7 @@ static inline int ucs_async_check_miss(ucs_async_context_t *async) #define UCS_ASYNC_UNBLOCK(_async) \ do { \ if ((_async)->mode == UCS_ASYNC_MODE_THREAD_SPINLOCK) { \ - ucs_spin_unlock(&(_async)->thread.spinlock); \ + ucs_recursive_spin_unlock(&(_async)->thread.spinlock); \ } else if ((_async)->mode == UCS_ASYNC_MODE_THREAD_MUTEX) { \ (void)pthread_mutex_unlock(&(_async)->thread.mutex); \ } else if ((_async)->mode == UCS_ASYNC_MODE_SIGNAL) { \ diff --git a/src/ucs/async/async_fwd.h b/src/ucs/async/async_fwd.h index 1e19fde02ab..ca81dc7975d 100644 --- a/src/ucs/async/async_fwd.h +++ b/src/ucs/async/async_fwd.h @@ -10,6 +10,7 @@ #include #include #include +#include BEGIN_C_DECLS @@ -24,9 +25,10 @@ typedef struct ucs_async_context ucs_async_context_t; * Async event callback. * * @param id Event id (timer or file descriptor). + * @param events The events that triggered the callback. * @param arg User-defined argument. */ -typedef void (*ucs_async_event_cb_t)(int id, void *arg); +typedef void (*ucs_async_event_cb_t)(int id, int events, void *arg); /** @@ -37,7 +39,7 @@ typedef void (*ucs_async_event_cb_t)(int id, void *arg); * * @param mode Thread or signal. * @param event_fd File descriptor to set handler for. - * @param events Events to wait on (POLLxx/EPOLLxx bits). 
+ * @param events Events to wait on (UCS_EVENT_SET_EVxxx bits). * @param cb Callback function to execute. * @param arg Argument to callback. * @param async Async context to which events are delivered. @@ -77,8 +79,9 @@ ucs_status_t ucs_async_add_timer(ucs_async_mode_t mode, ucs_time_t interval, * * @param id Timer/FD to remove. * @param sync If nonzero, wait until the handler for this event is not - * running anymore. Cannot be used in the context of the event - * handler itself because it would deadlock. + * running anymore. If called from the context of the callback, + * the handler will be removed immediately after the current + * callback returns. * * @return Error code as defined by @ref ucs_status_t. */ @@ -91,7 +94,7 @@ ucs_status_t ucs_async_remove_handler(int id, int sync); * Modify events mask for an existing event handler (event file). * * @param fd File descriptor modify events for. - * @param events New set of events to wait on (POLLxx/EPOLLxx bits). + * @param events New set of events to wait on (UCS_EVENT_SET_EVxxx bits). * * @return Error code as defined by @ref ucs_status_t. */ @@ -105,7 +108,8 @@ ucs_status_t ucs_async_modify_handler(int fd, int events); * Allocate and initialize an asynchronous execution context. * This can be used to ensure safe event delivery. * - * @param mode Either to use signals or epoll threads to wait. + * @param mode Indicates whether to use signals or polling threads + * for waiting. * @param async_p Event context pointer to initialize. * * @return Error code as defined by @ref ucs_status_t. 
diff --git a/src/ucs/async/async_int.h b/src/ucs/async/async_int.h index ff6b2329d0e..1e03f4dba72 100644 --- a/src/ucs/async/async_int.h +++ b/src/ucs/async/async_int.h @@ -19,6 +19,7 @@ struct ucs_async_handler { int id; /* Event/Timer ID */ ucs_async_mode_t mode; /* Event delivery mode */ int events; /* Bitmap of events */ + pthread_t caller; /* Thread which invokes the callback */ ucs_async_event_cb_t cb; /* Callback function */ void *arg; /* Callback argument */ ucs_async_context_t *async; /* Async context for the handler. Can be NULL */ @@ -30,10 +31,11 @@ struct ucs_async_handler { /** * Dispatch event coming from async context. * - * @param id Array of event IDs to dispatch. - * @param count Number of events + * @param handler_ids Array of handler IDs to dispatch. + * @param count Number of events + * @param events Events to pass to the handler */ -ucs_status_t ucs_async_dispatch_handlers(int *events, size_t count); +ucs_status_t ucs_async_dispatch_handlers(int *handler_ids, size_t count, int events); /** diff --git a/src/ucs/async/pipe.c b/src/ucs/async/pipe.c index b6679d40186..d3a6a411f1f 100644 --- a/src/ucs/async/pipe.c +++ b/src/ucs/async/pipe.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "pipe.h" #include @@ -22,8 +26,8 @@ ucs_status_t ucs_async_pipe_create(ucs_async_pipe_t *p) } /* Set pipe to non blocking */ - if (ucs_sys_fcntl_modfl(pipefds[0], O_NONBLOCK, 0) != UCS_OK || - ucs_sys_fcntl_modfl(pipefds[1], O_NONBLOCK, 0) != UCS_OK) + if ((ucs_sys_fcntl_modfl(pipefds[0], O_NONBLOCK, 0) != UCS_OK) || + (ucs_sys_fcntl_modfl(pipefds[1], O_NONBLOCK, 0) != UCS_OK)) { goto err_close_pipe; } diff --git a/src/ucs/async/pipe.h b/src/ucs/async/pipe.h index d81c0090f4b..5b0b6349c36 100644 --- a/src/ucs/async/pipe.h +++ b/src/ucs/async/pipe.h @@ -9,6 +9,7 @@ #include +BEGIN_C_DECLS /** * A pipe for event signaling. 
@@ -42,4 +43,6 @@ static inline int ucs_async_pipe_rfd(ucs_async_pipe_t *p) { return p->read_fd; } +END_C_DECLS + #endif diff --git a/src/ucs/async/signal.c b/src/ucs/async/signal.c index 7784fd4fbe7..30562907077 100644 --- a/src/ucs/async/signal.c +++ b/src/ucs/async/signal.c @@ -115,7 +115,13 @@ ucs_async_signal_sys_timer_create(int uid, pid_t tid, timer_t *sys_timer_id) ev.sigev_notify = SIGEV_THREAD_ID; ev.sigev_signo = ucs_global_opts.async_signo; ev.sigev_value.sival_int = uid; /* user parameter to timer */ +#if defined(HAVE_SIGEVENT_SIGEV_UN_TID) ev._sigev_un._tid = tid; /* target thread */ +#elif defined(HAVE_SIGEVENT_SIGEV_NOTIFY_THREAD_ID) + ev.sigev_notify_thread_id = tid; /* target thread */ +#else +#error "Port me" +#endif ret = timer_create(CLOCK_REALTIME, &ev, &timer); if (ret < 0) { ucs_error("failed to create an interval timer: %m"); @@ -175,6 +181,29 @@ static ucs_status_t ucs_async_signal_dispatch_timer(int uid) return ucs_async_dispatch_timerq(&timer->timerq, ucs_get_time()); } +static inline int ucs_signal_map_to_events(int si_code) +{ + int events; + + switch (si_code) { + case POLL_IN: + case POLL_MSG: + case POLL_PRI: + events = UCS_EVENT_SET_EVREAD; + return events; + case POLL_OUT: + events = UCS_EVENT_SET_EVWRITE; + return events; + case POLL_HUP: + case POLL_ERR: + events = UCS_EVENT_SET_EVERR; + return events; + default: + ucs_warn("unexpected si_code %d", si_code); + return UCS_ASYNC_EVENT_DUMMY; + } +} + static void ucs_async_signal_handler(int signo, siginfo_t *siginfo, void *arg) { ucs_assert(signo == ucs_global_opts.async_signo); @@ -182,8 +211,8 @@ static void ucs_async_signal_handler(int signo, siginfo_t *siginfo, void *arg) /* Check event code */ switch (siginfo->si_code) { case SI_TIMER: - ucs_trace_async("timer signal uid=%d", siginfo->si_int); - ucs_async_signal_dispatch_timer(siginfo->si_int); + ucs_trace_async("timer signal uid=%d", siginfo->si_value.sival_int); + 
ucs_async_signal_dispatch_timer(siginfo->si_value.sival_int); return; case POLL_IN: case POLL_OUT: @@ -192,7 +221,8 @@ static void ucs_async_signal_handler(int signo, siginfo_t *siginfo, void *arg) case POLL_MSG: case POLL_PRI: ucs_trace_async("async signal handler called for fd %d", siginfo->si_fd); - ucs_async_dispatch_handlers(&siginfo->si_fd, 1); + ucs_async_dispatch_handlers(&siginfo->si_fd, 1, + ucs_signal_map_to_events(siginfo->si_code)); return; default: ucs_warn("signal handler called with unexpected event code %d, ignoring", @@ -203,13 +233,13 @@ static void ucs_async_signal_handler(int signo, siginfo_t *siginfo, void *arg) static void ucs_async_signal_allow(int allow) { - sigset_t sigset; + sigset_t sig_set; ucs_trace_func("enable=%d tid=%d", allow, ucs_get_tid()); - sigemptyset(&sigset); - sigaddset(&sigset, ucs_global_opts.async_signo); - pthread_sigmask(allow ? SIG_UNBLOCK : SIG_BLOCK, &sigset, NULL); + sigemptyset(&sig_set); + sigaddset(&sig_set, ucs_global_opts.async_signo); + pthread_sigmask(allow ? 
SIG_UNBLOCK : SIG_BLOCK, &sig_set, NULL); } static void ucs_async_signal_block_all() @@ -243,7 +273,9 @@ static ucs_status_t ucs_async_signal_install_handler() new_action.sa_sigaction = ucs_async_signal_handler; sigemptyset(&new_action.sa_mask); new_action.sa_flags = SA_RESTART|SA_SIGINFO; +#if HAVE_SIGACTION_SA_RESTORER new_action.sa_restorer = NULL; +#endif ret = sigaction(ucs_global_opts.async_signo, &new_action, &ucs_async_signal_global_context.prev_sighandler); if (ret < 0) { @@ -310,20 +342,20 @@ static ucs_status_t ucs_async_signal_modify_event_fd(ucs_async_context_t *async, int event_fd, int events) { ucs_status_t status; - int add, remove; + int add, rm; UCS_ASYNC_SIGNAL_CHECK_THREAD(async); if (events) { - add = O_ASYNC; /* Enable notifications */ - remove = 0; + add = O_ASYNC; /* Enable notifications */ + rm = 0; } else { - add = 0; /* Disable notifications */ - remove = O_ASYNC; + add = 0; /* Disable notifications */ + rm = O_ASYNC; } - ucs_trace_async("fcntl(fd=%d, add=0x%x, remove=0x%x)", event_fd, add, remove); - status = ucs_sys_fcntl_modfl(event_fd, add, remove); + ucs_trace_async("fcntl(fd=%d, add=0x%x, remove=0x%x)", event_fd, add, rm); + status = ucs_sys_fcntl_modfl(event_fd, add, rm); if (status != UCS_OK) { ucs_error("fcntl F_SETFL failed: %m"); return UCS_ERR_IO_ERROR; @@ -576,7 +608,7 @@ static void ucs_async_signal_global_init() static void ucs_async_signal_global_cleanup() { if (ucs_async_signal_global_context.event_count != 0) { - ucs_info("signal handler not removed (%d events remaining)", + ucs_warn("signal handler not removed (%d events remaining)", ucs_async_signal_global_context.event_count); } pthread_mutex_destroy(&ucs_async_signal_global_context.timers_lock); diff --git a/src/ucs/async/thread.c b/src/ucs/async/thread.c index 98cec90b789..74f7f8a7886 100644 --- a/src/ucs/async/thread.c +++ b/src/ucs/async/thread.c @@ -14,7 +14,8 @@ #include #include -#include +#include +#include #define UCS_ASYNC_EPOLL_MAX_EVENTS 16 @@ -22,12 
+23,12 @@ typedef struct ucs_async_thread { - ucs_async_pipe_t wakeup; - int epfd; - ucs_timer_queue_t timerq; - pthread_t thread_id; - int stop; - uint32_t refcnt; + ucs_async_pipe_t wakeup; + ucs_sys_event_set_t *event_set; + ucs_timer_queue_t timerq; + pthread_t thread_id; + int stop; + uint32_t refcnt; } ucs_async_thread_t; @@ -38,6 +39,12 @@ typedef struct ucs_async_thread_global_context { } ucs_async_thread_global_context_t; +typedef struct ucs_async_thread_callback_arg { + ucs_async_thread_t *thread; + int *is_missed; +} ucs_async_thread_callback_arg_t; + + static ucs_async_thread_global_context_t ucs_async_thread_global_context = { .thread = NULL, .use_count = 0, @@ -52,28 +59,54 @@ static void ucs_async_thread_hold(ucs_async_thread_t *thread) static void ucs_async_thread_put(ucs_async_thread_t *thread) { - if (ucs_atomic_fadd32(&thread->refcnt, -1) == 1) { - close(thread->epfd); + if (ucs_atomic_fsub32(&thread->refcnt, 1) == 1) { + ucs_event_set_cleanup(thread->event_set); ucs_async_pipe_destroy(&thread->wakeup); ucs_timerq_cleanup(&thread->timerq); ucs_free(thread); } } +static void ucs_async_thread_ev_handler(void *callback_data, int event, + void *arg) +{ + ucs_async_thread_callback_arg_t *cb_arg = (void*)arg; + int fd = (int)(uintptr_t)callback_data; + ucs_status_t status; + + ucs_trace_async("ucs_async_thread_ev_handler(fd=%d, event=%d)", + fd, event); + + if (fd == ucs_async_pipe_rfd(&cb_arg->thread->wakeup)) { + ucs_trace_async("progress thread woken up"); + ucs_async_pipe_drain(&cb_arg->thread->wakeup); + return; + } + + status = ucs_async_dispatch_handlers(&fd, 1, event); + if (status == UCS_ERR_NO_PROGRESS) { + *cb_arg->is_missed = 1; + } +} + static void *ucs_async_thread_func(void *arg) { ucs_async_thread_t *thread = arg; - struct epoll_event events[UCS_ASYNC_EPOLL_MAX_EVENTS]; ucs_time_t last_time, curr_time, timer_interval, time_spent; - int i, nready, is_missed, timeout_ms; + int is_missed, timeout_ms; ucs_status_t status; - int fd; + 
unsigned num_events; + ucs_async_thread_callback_arg_t cb_arg; - is_missed = 0; - curr_time = ucs_get_time(); - last_time = ucs_get_time(); + is_missed = 0; + curr_time = ucs_get_time(); + last_time = ucs_get_time(); + cb_arg.thread = thread; + cb_arg.is_missed = &is_missed; while (!thread->stop) { + num_events = ucs_min(UCS_ASYNC_EPOLL_MAX_EVENTS, + ucs_sys_event_set_max_wait_events); /* If we didn't get the lock, give other threads priority */ if (is_missed) { @@ -90,31 +123,13 @@ static void *ucs_async_thread_func(void *arg) timeout_ms = ucs_time_to_msec(timer_interval - ucs_min(time_spent, timer_interval)); } - nready = epoll_wait(thread->epfd, events, UCS_ASYNC_EPOLL_MAX_EVENTS, - timeout_ms); - if ((nready < 0) && (errno != EINTR)) { - ucs_fatal("epoll_wait() failed: %m"); - } - ucs_trace_async("epoll_wait(epfd=%d, timeout=%d) returned %d", - thread->epfd, timeout_ms, nready); - - /* Check ready files */ - if (nready > 0) { - for (i = 0; i < nready; ++i) { - fd = events[i].data.fd; - - /* Check wakeup pipe */ - if (fd == ucs_async_pipe_rfd(&thread->wakeup)) { - ucs_trace_async("progress thread woken up"); - ucs_async_pipe_drain(&thread->wakeup); - continue; - } - - status = ucs_async_dispatch_handlers(&fd, 1); - if (status == UCS_ERR_NO_PROGRESS) { - is_missed = 1; - } - } + + status = ucs_event_set_wait(thread->event_set, + &num_events, timeout_ms, + ucs_async_thread_ev_handler, + (void*)&cb_arg); + if (UCS_STATUS_IS_ERR(status)) { + ucs_fatal("ucs_event_set_wait() failed: %d", status); } /* Check timers */ @@ -136,7 +151,6 @@ static void *ucs_async_thread_func(void *arg) static ucs_status_t ucs_async_thread_start(ucs_async_thread_t **thread_p) { ucs_async_thread_t *thread; - struct epoll_event event; ucs_status_t status; int wakeup_rfd; int ret; @@ -171,40 +185,34 @@ static ucs_status_t ucs_async_thread_start(ucs_async_thread_t **thread_p) goto err_timerq_cleanup; } - /* Create epoll set the thread will wait on */ - thread->epfd = epoll_create(1); - if 
(thread->epfd < 0) { - ucs_error("epoll_create() failed: %m"); - status = UCS_ERR_IO_ERROR; + status = ucs_event_set_create(&thread->event_set); + if (status != UCS_OK) { goto err_close_pipe; } - /* Add wakeup pipe to epoll set */ + /* Store file descriptor into void * storage without memory allocation. */ wakeup_rfd = ucs_async_pipe_rfd(&thread->wakeup); - memset(&event, 0, sizeof(event)); - event.events = EPOLLIN; - event.data.fd = wakeup_rfd; - ret = epoll_ctl(thread->epfd, EPOLL_CTL_ADD, wakeup_rfd, &event); - if (ret < 0) { - ucs_error("epoll_ctl(epfd=%d, ADD, fd=%d) failed: %m",thread->epfd, - wakeup_rfd); + status = ucs_event_set_add(thread->event_set, wakeup_rfd, + UCS_EVENT_SET_EVREAD, + (void *)(uintptr_t)wakeup_rfd); + if (status != UCS_OK) { status = UCS_ERR_IO_ERROR; - goto err_close_epfd; + goto err_free_event_set; } ret = pthread_create(&thread->thread_id, NULL, ucs_async_thread_func, thread); if (ret != 0) { ucs_error("pthread_create() returned %d: %m", ret); status = UCS_ERR_IO_ERROR; - goto err_close_epfd; + goto err_free_event_set; } ucs_async_thread_global_context.thread = thread; status = UCS_OK; goto out_unlock; -err_close_epfd: - close(thread->epfd); +err_free_event_set: + ucs_event_set_cleanup(thread->event_set); err_close_pipe: ucs_async_pipe_destroy(&thread->wakeup); err_timerq_cleanup: @@ -248,22 +256,27 @@ static void ucs_async_thread_stop() static ucs_status_t ucs_async_thread_spinlock_init(ucs_async_context_t *async) { - return ucs_spinlock_init(&async->thread.spinlock); + return ucs_recursive_spinlock_init(&async->thread.spinlock, 0); } static void ucs_async_thread_spinlock_cleanup(ucs_async_context_t *async) { - ucs_spinlock_destroy(&async->thread.spinlock); + ucs_status_t status; + + status = ucs_recursive_spinlock_destroy(&async->thread.spinlock); + if (status != UCS_OK) { + ucs_warn("ucs_recursive_spinlock_destroy() failed (%d)", status); + } } static int ucs_async_thread_spinlock_try_block(ucs_async_context_t *async) { - return 
ucs_spin_trylock(&async->thread.spinlock); + return ucs_recursive_spin_trylock(&async->thread.spinlock); } static void ucs_async_thread_spinlock_unblock(ucs_async_context_t *async) { - ucs_spin_unlock(&async->thread.spinlock); + ucs_recursive_spin_unlock(&async->thread.spinlock); } static ucs_status_t ucs_async_thread_mutex_init(ucs_async_context_t *async) @@ -295,22 +308,18 @@ static ucs_status_t ucs_async_thread_add_event_fd(ucs_async_context_t *async, int event_fd, int events) { ucs_async_thread_t *thread; - struct epoll_event event; ucs_status_t status; - int ret; status = ucs_async_thread_start(&thread); if (status != UCS_OK) { goto err; } - memset(&event, 0, sizeof(event)); - event.events = events; - event.data.fd = event_fd; - ret = epoll_ctl(thread->epfd, EPOLL_CTL_ADD, event_fd, &event); - if (ret < 0) { - ucs_error("epoll_ctl(epfd=%d, ADD, fd=%d) failed: %m", thread->epfd, - event_fd); + /* Store file descriptor into void * storage without memory allocation. */ + status = ucs_event_set_add(thread->event_set, event_fd, + (ucs_event_set_type_t)events, + (void *)(uintptr_t)event_fd); + if (status != UCS_OK) { status = UCS_ERR_IO_ERROR; goto err_removed; } @@ -328,13 +337,11 @@ static ucs_status_t ucs_async_thread_remove_event_fd(ucs_async_context_t *async, int event_fd) { ucs_async_thread_t *thread = ucs_async_thread_global_context.thread; - int ret; + ucs_status_t status; - ret = epoll_ctl(thread->epfd, EPOLL_CTL_DEL, event_fd, NULL); - if (ret < 0) { - ucs_error("epoll_ctl(epfd=%d, DEL, fd=%d) failed: %m", thread->epfd, - event_fd); - return UCS_ERR_INVALID_PARAM; + status = ucs_event_set_del(thread->event_set, event_fd); + if (status != UCS_OK) { + return status; } ucs_async_thread_stop(); @@ -344,21 +351,10 @@ static ucs_status_t ucs_async_thread_remove_event_fd(ucs_async_context_t *async, static ucs_status_t ucs_async_thread_modify_event_fd(ucs_async_context_t *async, int event_fd, int events) { - ucs_async_thread_t *thread = 
ucs_async_thread_global_context.thread; - struct epoll_event event; - int ret; - - memset(&event, 0, sizeof(event)); - event.events = events; - event.data.fd = event_fd; - ret = epoll_ctl(thread->epfd, EPOLL_CTL_MOD, event_fd, &event); - if (ret < 0) { - ucs_error("epoll_ctl(epfd=%d, ADD, fd=%d) failed: %m", thread->epfd, - event_fd); - return UCS_ERR_IO_ERROR; - } - - return UCS_OK; + /* Store file descriptor into void * storage without memory allocation. */ + return ucs_event_set_mod(ucs_async_thread_global_context.thread->event_set, + event_fd, (ucs_event_set_type_t)events, + (void *)(uintptr_t)event_fd); } static int ucs_async_thread_mutex_try_block(ucs_async_context_t *async) @@ -415,8 +411,8 @@ static ucs_status_t ucs_async_thread_remove_timer(ucs_async_context_t *async, static void ucs_async_signal_global_cleanup() { if (ucs_async_thread_global_context.thread != NULL) { - ucs_info("async thread still running (use count %d)", - ucs_async_thread_global_context.use_count); + ucs_debug("async thread still running (use count %u)", + ucs_async_thread_global_context.use_count); } } diff --git a/src/ucs/async/thread.h b/src/ucs/async/thread.h index bba59872d12..73c48536712 100644 --- a/src/ucs/async/thread.h +++ b/src/ucs/async/thread.h @@ -13,8 +13,8 @@ typedef struct ucs_async_thread_context { union { - ucs_spinlock_t spinlock; - pthread_mutex_t mutex; + ucs_recursive_spinlock_t spinlock; + pthread_mutex_t mutex; }; } ucs_async_thread_context_t; diff --git a/src/ucs/config/global_opts.c b/src/ucs/config/global_opts.c index 0cba043ccb4..0260fd3ec42 100644 --- a/src/ucs/config/global_opts.c +++ b/src/ucs/config/global_opts.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "global_opts.h" #include @@ -11,13 +15,16 @@ #include #include #include +#include #include ucs_global_opts_t ucs_global_opts = { - .log_level = UCS_LOG_LEVEL_WARN, + .log_component = {UCS_LOG_LEVEL_WARN, "UCX"}, .log_print_enable = 0, .log_file = "", + .log_file_size = SIZE_MAX, + .log_file_rotate = 0, .log_buffer_size = 1024, .log_data_size = 0, .mpool_fifo = 0, @@ -41,7 +48,8 @@ ucs_global_opts_t ucs_global_opts = { .stats_format = UCS_STATS_FULL, .rcache_check_pfn = 0, .module_dir = UCX_MODULE_DIR, /* defined in Makefile.am */ - .module_log_level = UCS_LOG_LEVEL_TRACE + .module_log_level = UCS_LOG_LEVEL_TRACE, + .arch = UCS_ARCH_GLOBAL_OPTS_INITALIZER }; static const char *ucs_handle_error_modes[] = { @@ -61,7 +69,8 @@ static ucs_config_field_t ucs_global_opts_table[] = { "UCS logging level. Messages with a level higher or equal to the selected " "will be printed.\n" "Possible values are: fatal, error, warn, info, debug, trace, data, func, poll.", - ucs_offsetof(ucs_global_opts_t, log_level), UCS_CONFIG_TYPE_ENUM(ucs_log_level_names)}, + ucs_offsetof(ucs_global_opts_t, log_component), + UCS_CONFIG_TYPE_LOG_COMP}, {"LOG_FILE", "", "If not empty, UCS will print log messages to the specified file instead of stdout.\n" @@ -71,6 +80,16 @@ static ucs_config_field_t ucs_global_opts_table[] = { ucs_offsetof(ucs_global_opts_t, log_file), UCS_CONFIG_TYPE_STRING}, + {"LOG_FILE_SIZE", "inf", + "The maximal size of log file. The maximal log file size has to be >= LOG_BUFFER.", + ucs_offsetof(ucs_global_opts_t, log_file_size), UCS_CONFIG_TYPE_MEMUNITS}, + + {"LOG_FILE_ROTATE", "0", + "The maximal number of backup log files that could be created to save logs\n" + "after the previous ones (if any) are completely filled. 
The value has to be\n" + "less than the maximal signed integer value.", + ucs_offsetof(ucs_global_opts_t, log_file_rotate), UCS_CONFIG_TYPE_UINT}, + {"LOG_BUFFER", "1024", "Buffer size for a single log message.", ucs_offsetof(ucs_global_opts_t, log_buffer_size), UCS_CONFIG_TYPE_MEMUNITS}, @@ -138,7 +157,7 @@ static ucs_config_field_t ucs_global_opts_table[] = { "Signal number used for async signaling.", ucs_offsetof(ucs_global_opts_t, async_signo), UCS_CONFIG_TYPE_SIGNO}, -#if ENABLE_STATS +#ifdef ENABLE_STATS {"STATS_DEST", "", "Destination to send statistics to. If the value is empty, statistics are\n" "not reported. Possible values are:\n" @@ -176,7 +195,7 @@ static ucs_config_field_t ucs_global_opts_table[] = { #endif -#if ENABLE_MEMTRACK +#ifdef ENABLE_MEMTRACK {"MEMTRACK_DEST", "", "Destination to output memory tracking report to. If the value is empty,\n" "results are not reported. Possible values are:\n" @@ -193,7 +212,7 @@ static ucs_config_field_t ucs_global_opts_table[] = { ucs_offsetof(ucs_global_opts_t, profile_mode), UCS_CONFIG_TYPE_BITMAP(ucs_profile_mode_names)}, - {"PROFILE_FILE", "", + {"PROFILE_FILE", "ucx_%h_%p.prof", "File name to dump profiling data to.\n" "Substitutions: %h: host, %p: pid, %c: cpu, %t: time, %u: user, %e: exe.\n", ucs_offsetof(ucs_global_opts_t, profile_file), UCS_CONFIG_TYPE_STRING}, @@ -202,10 +221,11 @@ static ucs_config_field_t ucs_global_opts_table[] = { "Maximal size of profiling log. 
New records will replace old records.", ucs_offsetof(ucs_global_opts_t, profile_log_size), UCS_CONFIG_TYPE_MEMUNITS}, - {"RCACHE_CHECK_PFN", "n", - "Registration cache to check that the physical page frame number of a found\n" - "memory region was not changed since the time the region was registered.\n", - ucs_offsetof(ucs_global_opts_t, rcache_check_pfn), UCS_CONFIG_TYPE_BOOL}, + {"RCACHE_CHECK_PFN", "0", + "Registration cache to check that the physical pages frame number of a found\n" + "memory region were not changed since the time the region was registered.\n" + "Number of pages to check, 0 - disable checking.", + ucs_offsetof(ucs_global_opts_t, rcache_check_pfn), UCS_CONFIG_TYPE_UINT}, {"MODULE_DIR", UCX_MODULE_DIR, "Directory to search for loadable modules", @@ -215,6 +235,10 @@ static ucs_config_field_t ucs_global_opts_table[] = { "Logging level for module loader\n", ucs_offsetof(ucs_global_opts_t, module_log_level), UCS_CONFIG_TYPE_ENUM(ucs_log_level_names)}, + {"", "", NULL, + ucs_offsetof(ucs_global_opts_t, arch), + UCS_CONFIG_TYPE_TABLE(ucs_arch_global_opts_table)}, + {NULL} }; UCS_CONFIG_REGISTER_TABLE(ucs_global_opts_table, "UCS global", NULL, @@ -226,7 +250,7 @@ void ucs_global_opts_init() ucs_status_t status; status = ucs_config_parser_fill_opts(&ucs_global_opts, ucs_global_opts_table, - NULL, NULL, 1); + UCS_DEFAULT_ENV_PREFIX, NULL, 1); if (status != UCS_OK) { ucs_fatal("failed to parse global configuration - aborting"); } @@ -257,5 +281,6 @@ void ucs_global_opts_release() void ucs_global_opts_print(FILE *stream, ucs_config_print_flags_t print_flags) { ucs_config_parser_print_opts(stream, "Global configuration", &ucs_global_opts, - ucs_global_opts_table, NULL, print_flags); + ucs_global_opts_table, NULL, + UCS_DEFAULT_ENV_PREFIX, print_flags); } diff --git a/src/ucs/config/global_opts.h b/src/ucs/config/global_opts.h index a41184d0a33..03d54b20b19 100644 --- a/src/ucs/config/global_opts.h +++ b/src/ucs/config/global_opts.h @@ -12,6 +12,7 @@ 
#include #include #include +#include #include #include @@ -26,97 +27,105 @@ BEGIN_C_DECLS */ typedef struct { - /* Log level above which log messages will be printed */ - ucs_log_level_t log_level; + /* Log level above which log messages will be printed for default component */ + ucs_log_component_config_t log_component; /* Log file */ - char *log_file; + char *log_file; + + /* Maximal log file size */ + size_t log_file_size; + + /* Maximal backup log files count that could be created by log infrastructure */ + unsigned log_file_rotate; /* Size of log buffer for one message */ - size_t log_buffer_size; + size_t log_buffer_size; /* Maximal amount of packet data to print per packet */ - size_t log_data_size; + size_t log_data_size; /* Enable ucs_print() output */ - int log_print_enable; + int log_print_enable; /* Enable FIFO behavior for memory pool, instead of LIFO. Useful for * debugging because object pointers are not recycled. */ - int mpool_fifo; + int mpool_fifo; /* Handle errors mode */ - unsigned handle_errors; + unsigned handle_errors; /* Error signals */ UCS_CONFIG_ARRAY_FIELD(int, signals) error_signals; /* If not empty, send mail notifications to that address in case of error */ - char *error_mail_to; + char *error_mail_to; /* Footer for error report mail notification */ - char *error_mail_footer; + char *error_mail_footer; /* If not NULL, attach gdb to the process in case of error */ - char *gdb_command; + char *gdb_command; /* Signal number which causes to enter debug mode */ - unsigned debug_signo; + unsigned debug_signo; /* Log level to trigger error handling */ - ucs_log_level_t log_level_trigger; + ucs_log_level_t log_level_trigger; /* Issue warning about UCX_ env vars which were not used by config parser */ - int warn_unused_env_vars; + int warn_unused_env_vars; /* Max. 
events per context, will be removed in the future */ - unsigned async_max_events; + unsigned async_max_events; /* Destination for statistics: udp:host:port / file:path / stdout */ - char *stats_dest; + char *stats_dest; /* Trigger to dump statistics */ - char *stats_trigger; + char *stats_trigger; /* Named pipe file path for tuning. */ - char *tuning_path; + char *tuning_path; /* Number of performance stall loops to perform */ - size_t perf_stall_loops; + size_t perf_stall_loops; /* Signal number used by async handler (for signal mode) */ - unsigned async_signo; + unsigned async_signo; /* Destination for detailed memory tracking results: none / stdout / stderr */ - char *memtrack_dest; + char *memtrack_dest; /* Profiling mode */ - unsigned profile_mode; + unsigned profile_mode; /* Profiling output file name */ - char *profile_file; + char *profile_file; /* Limit for profiling log size */ - size_t profile_log_size; + size_t profile_log_size; /* Counters to be included in statistics summary */ - ucs_config_names_array_t stats_filter; + ucs_config_names_array_t stats_filter; /* statistics format options */ - ucs_stats_formats_t stats_format; + ucs_stats_formats_t stats_format; - /* registration cache checks if physical page is not moved */ - int rcache_check_pfn; + /* registration cache checks if physical pages are not moved */ + unsigned rcache_check_pfn; /* directory for loadable modules */ - char *module_dir; + char *module_dir; /* log level for module loader code */ - ucs_log_level_t module_log_level; + ucs_log_level_t module_log_level; + /* arch-specific global options */ + ucs_arch_global_opts_t arch; } ucs_global_opts_t; diff --git a/src/ucs/config/parser.c b/src/ucs/config/parser.c index d1b2a82de2a..624a203aaca 100644 --- a/src/ucs/config/parser.c +++ b/src/ucs/config/parser.c @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. 
* * See file LICENSE for terms. */ @@ -22,11 +22,8 @@ #include -/* configuration value which specifies "infinity" for a numeric variable */ -#define UCS_CONFIG_PARSER_NUMERIC_INF_STR "inf" - /* width of titles in docstring */ -#define UCP_CONFIG_PARSER_DOCSTR_WIDTH 10 +#define UCS_CONFIG_PARSER_DOCSTR_WIDTH 10 /* list of prefixes for a configuration variable, used to dump all possible @@ -81,37 +78,20 @@ static int __find_string_in_list(const char *str, const char **list) return -1; } -static size_t ucs_config_parser_quantity_prefix_value(char prefix) -{ - switch (prefix) { - case 'B': - return 1; - case 'K': - return UCS_KBYTE; - case 'M': - return UCS_MBYTE; - case 'G': - return UCS_GBYTE; - case 'T': - return UCS_TBYTE; - default: - return 0; - } -} - int ucs_config_sscanf_string(const char *buf, void *dest, const void *arg) { *((char**)dest) = strdup(buf); return 1; } -int ucs_config_sprintf_string(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_string(char *buf, size_t max, + const void *src, const void *arg) { strncpy(buf, *((char**)src), max); return 1; } -ucs_status_t ucs_config_clone_string(void *src, void *dest, const void *arg) +ucs_status_t ucs_config_clone_string(const void *src, void *dest, const void *arg) { char *new_str = strdup(*(char**)src); if (new_str == NULL) { @@ -132,20 +112,21 @@ int ucs_config_sscanf_int(const char *buf, void *dest, const void *arg) return sscanf(buf, "%i", (unsigned*)dest); } -ucs_status_t ucs_config_clone_int(void *src, void *dest, const void *arg) +ucs_status_t ucs_config_clone_int(const void *src, void *dest, const void *arg) { *(int*)dest = *(int*)src; return UCS_OK; } -int ucs_config_sprintf_int(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_int(char *buf, size_t max, + const void *src, const void *arg) { return snprintf(buf, max, "%i", *(unsigned*)src); } int ucs_config_sscanf_uint(const char *buf, void *dest, const void *arg) { - if (!strcasecmp(buf, 
UCS_CONFIG_PARSER_NUMERIC_INF_STR)) { + if (!strcasecmp(buf, UCS_NUMERIC_INF_STR)) { *(unsigned*)dest = UINT_MAX; return 1; } else { @@ -153,17 +134,18 @@ int ucs_config_sscanf_uint(const char *buf, void *dest, const void *arg) } } -ucs_status_t ucs_config_clone_uint(void *src, void *dest, const void *arg) +ucs_status_t ucs_config_clone_uint(const void *src, void *dest, const void *arg) { *(unsigned*)dest = *(unsigned*)src; return UCS_OK; } -int ucs_config_sprintf_uint(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_uint(char *buf, size_t max, + const void *src, const void *arg) { unsigned value = *(unsigned*)src; if (value == UINT_MAX) { - snprintf(buf, max, UCS_CONFIG_PARSER_NUMERIC_INF_STR); + snprintf(buf, max, UCS_NUMERIC_INF_STR); return 1; } else { return snprintf(buf, max, "%u", value); @@ -175,12 +157,13 @@ int ucs_config_sscanf_ulong(const char *buf, void *dest, const void *arg) return sscanf(buf, "%lu", (unsigned long*)dest); } -int ucs_config_sprintf_ulong(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_ulong(char *buf, size_t max, + const void *src, const void *arg) { return snprintf(buf, max, "%lu", *(unsigned long*)src); } -ucs_status_t ucs_config_clone_ulong(void *src, void *dest, const void *arg) +ucs_status_t ucs_config_clone_ulong(const void *src, void *dest, const void *arg) { *(unsigned long*)dest = *(unsigned long*)src; return UCS_OK; @@ -191,12 +174,13 @@ int ucs_config_sscanf_double(const char *buf, void *dest, const void *arg) return sscanf(buf, "%lf", (double*)dest); } -int ucs_config_sprintf_double(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_double(char *buf, size_t max, + const void *src, const void *arg) { return snprintf(buf, max, "%.3f", *(double*)src); } -ucs_status_t ucs_config_clone_double(void *src, void *dest, const void *arg) +ucs_status_t ucs_config_clone_double(const void *src, void *dest, const void *arg) { *(double*)dest = *(double*)src; 
return UCS_OK; @@ -204,15 +188,26 @@ ucs_status_t ucs_config_clone_double(void *src, void *dest, const void *arg) int ucs_config_sscanf_hex(const char *buf, void *dest, const void *arg) { - if (strncasecmp(buf, "0x", 2) == 0) { + /* Special value: auto */ + if (!strcasecmp(buf, UCS_VALUE_AUTO_STR)) { + *(size_t*)dest = UCS_HEXUNITS_AUTO; + return 1; + } else if (strncasecmp(buf, "0x", 2) == 0) { return (sscanf(buf + 2, "%x", (unsigned int*)dest)); } else { return 0; } } -int ucs_config_sprintf_hex(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_hex(char *buf, size_t max, + const void *src, const void *arg) { + uint16_t val = *(uint16_t*)src; + + if (val == UCS_HEXUNITS_AUTO) { + return snprintf(buf, max, UCS_VALUE_AUTO_STR); + } + return snprintf(buf, max, "0x%x", *(unsigned int*)src); } @@ -229,7 +224,7 @@ int ucs_config_sscanf_bool(const char *buf, void *dest, const void *arg) } } -int ucs_config_sprintf_bool(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_bool(char *buf, size_t max, const void *src, const void *arg) { return snprintf(buf, max, "%c", *(int*)src ? 
'y' : 'n'); } @@ -246,7 +241,8 @@ int ucs_config_sscanf_ternary(const char *buf, void *dest, const void *arg) } } -int ucs_config_sprintf_ternary(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_ternary(char *buf, size_t max, + const void *src, const void *arg) { if (*(int*)src == UCS_TRY) { return snprintf(buf, max, "try"); @@ -255,6 +251,46 @@ int ucs_config_sprintf_ternary(char *buf, size_t max, void *src, const void *arg } } +int ucs_config_sscanf_on_off(const char *buf, void *dest, const void *arg) +{ + if (!strcasecmp(buf, "on") || !strcmp(buf, "1")) { + *(int*)dest = UCS_CONFIG_ON; + return 1; + } else if (!strcasecmp(buf, "off") || !strcmp(buf, "0")) { + *(int*)dest = UCS_CONFIG_OFF; + return 1; + } else { + return 0; + } +} + +int ucs_config_sscanf_on_off_auto(const char *buf, void *dest, const void *arg) +{ + if (!strcasecmp(buf, "try") || + !strcasecmp(buf, "maybe") || + !strcasecmp(buf, "auto")) { + *(int*)dest = UCS_CONFIG_AUTO; + return 1; + } else { + return ucs_config_sscanf_on_off(buf, dest, arg); + } +} + +int ucs_config_sprintf_on_off_auto(char *buf, size_t max, + const void *src, const void *arg) +{ + switch (*(int*)src) { + case UCS_CONFIG_AUTO: + return snprintf(buf, max, "auto"); + case UCS_CONFIG_ON: + return snprintf(buf, max, "on"); + case UCS_CONFIG_OFF: + return snprintf(buf, max, "off"); + default: + return snprintf(buf, max, "%d", *(int*)src); + } +} + int ucs_config_sscanf_enum(const char *buf, void *dest, const void *arg) { int i; @@ -268,7 +304,8 @@ int ucs_config_sscanf_enum(const char *buf, void *dest, const void *arg) return 1; } -int ucs_config_sprintf_enum(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_enum(char *buf, size_t max, + const void *src, const void *arg) { char * const *table = arg; strncpy(buf, table[*(unsigned*)src], max); @@ -285,7 +322,6 @@ static void __print_table_values(char * const *table, char *buf, size_t max) } snprintf(ptr, end - ptr, "]"); - ptr += 
strlen(ptr); *buf = '['; } @@ -295,15 +331,30 @@ void ucs_config_help_enum(char *buf, size_t max, const void *arg) __print_table_values(arg, buf, max); } +ucs_status_t ucs_config_clone_log_comp(const void *src, void *dst, const void *arg) +{ + const ucs_log_component_config_t *src_comp = src; + ucs_log_component_config_t *dst_comp = dst; + + dst_comp->log_level = src_comp->log_level; + ucs_strncpy_safe(dst_comp->name, src_comp->name, sizeof(dst_comp->name)); + + return UCS_OK; +} + int ucs_config_sscanf_bitmap(const char *buf, void *dest, const void *arg) { char *str = strdup(buf); - char *p; + char *p, *saveptr; int ret, i; + if (str == NULL) { + return 0; + } + ret = 1; *((unsigned*)dest) = 0; - p = strtok(str, ","); + p = strtok_r(str, ",", &saveptr); while (p != NULL) { i = __find_string_in_list(p, (const char**)arg); if (i < 0) { @@ -311,31 +362,17 @@ int ucs_config_sscanf_bitmap(const char *buf, void *dest, const void *arg) break; } *((unsigned*)dest) |= UCS_BIT(i); - p = strtok(NULL, ","); + p = strtok_r(NULL, ",", &saveptr); } free(str); return ret; } -int ucs_config_sprintf_bitmap(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_bitmap(char *buf, size_t max, + const void *src, const void *arg) { - char * const *table; - int i, len; - - len = 0; - for (table = arg, i = 0; *table; ++table, ++i) { - if (*((unsigned*)src) & UCS_BIT(i)) { - snprintf(buf + len, max - len, "%s,", *table); - len = strlen(buf); - } - } - - if (len > 0) { - buf[len - 1] = '\0'; /* remove last ',' */ - } else { - buf[0] = '\0'; - } + ucs_flags_str(buf, max, *((unsigned*)src), (const char**)arg); return 1; } @@ -354,7 +391,8 @@ int ucs_config_sscanf_bitmask(const char *buf, void *dest, const void *arg) return ret; } -int ucs_config_sprintf_bitmask(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_bitmask(char *buf, size_t max, + const void *src, const void *arg) { return snprintf(buf, max, "%u", __builtin_popcount(*(unsigned*)src)); } 
@@ -392,7 +430,8 @@ int ucs_config_sscanf_time(const char *buf, void *dest, const void *arg) return 1; } -int ucs_config_sprintf_time(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_time(char *buf, size_t max, + const void *src, const void *arg) { snprintf(buf, max, "%.2fus", *(double*)src * UCS_USEC_PER_SEC); return 1; @@ -408,14 +447,19 @@ int ucs_config_sscanf_bw(const char *buf, void *dest, const void *arg) double value; int num_fields; - num_fields = sscanf(buf, "%lf%16s", &value, str); + if (!strcasecmp(buf, UCS_VALUE_AUTO_STR)) { + *dst = UCS_CONFIG_BW_AUTO; + return 1; + } + + num_fields = sscanf(buf, "%lf%15s", &value, str); if (num_fields < 2) { return 0; } ucs_assert(num_fields == 2); - units = (str[0] == 'b') ? 1 : ucs_config_parser_quantity_prefix_value(str[0]); + units = (str[0] == 'b') ? 1 : ucs_string_quantity_prefix_value(str[0]); if (!units) { return 0; } @@ -445,14 +489,25 @@ int ucs_config_sscanf_bw(const char *buf, void *dest, const void *arg) return 1; } -int ucs_config_sprintf_bw(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_bw(char *buf, size_t max, const void *src, + const void *arg) { - double value = *(double*)src; - size_t len; + static const double max_value = 50000.0; + double value = *(double*)src; + const char **suffix; + + if (UCS_CONFIG_BW_IS_AUTO(value)) { + ucs_strncpy_safe(buf, UCS_VALUE_AUTO_STR, max); + return 1; + } + + suffix = &ucs_memunits_suffixes[0]; + while ((value > max_value) && (*(suffix + 1) != NULL)) { + value /= 1024; + ++suffix; + } - ucs_memunits_to_str((size_t)value, buf, max); - len = strlen(buf); - snprintf(buf + len, max - len, "Bps"); + ucs_snprintf_safe(buf, max, "%.2f%sBps", value, *suffix); return 1; } @@ -474,7 +529,8 @@ int ucs_config_sscanf_bw_spec(const char *buf, void *dest, const void *arg) return dst->name != NULL; } -int ucs_config_sprintf_bw_spec(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_bw_spec(char *buf, 
size_t max, + const void *src, const void *arg) { ucs_config_bw_spec_t *bw = (ucs_config_bw_spec_t*)src; int len; @@ -488,7 +544,7 @@ int ucs_config_sprintf_bw_spec(char *buf, size_t max, void *src, const void *arg return 1; } -ucs_status_t ucs_config_clone_bw_spec(void *src, void *dest, const void *arg) +ucs_status_t ucs_config_clone_bw_spec(const void *src, void *dest, const void *arg) { ucs_config_bw_spec_t *s = (ucs_config_bw_spec_t*)src; ucs_config_bw_spec_t *d = (ucs_config_bw_spec_t*)dest; @@ -522,78 +578,50 @@ int ucs_config_sscanf_signo(const char *buf, void *dest, const void *arg) return ucs_config_sscanf_enum(buf, dest, ucs_signal_names); } -int ucs_config_sprintf_signo(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_signo(char *buf, size_t max, + const void *src, const void *arg) { return ucs_config_sprintf_enum(buf, max, src, ucs_signal_names); } int ucs_config_sscanf_memunits(const char *buf, void *dest, const void *arg) { - char units[3]; - int num_fields; - size_t value; - size_t bytes; - - /* Special value: infinity */ - if (!strcasecmp(buf, UCS_CONFIG_PARSER_NUMERIC_INF_STR)) { - *(size_t*)dest = UCS_CONFIG_MEMUNITS_INF; - return 1; - } - - /* Special value: auto */ - if (!strcasecmp(buf, "auto")) { - *(size_t*)dest = UCS_CONFIG_MEMUNITS_AUTO; - return 1; - } - - memset(units, 0, sizeof(units)); - num_fields = sscanf(buf, "%ld%c%c", &value, &units[0], &units[1]); - if (num_fields == 1) { - bytes = 1; - } else if (num_fields == 2 || num_fields == 3) { - bytes = ucs_config_parser_quantity_prefix_value(toupper(units[0])); - if (!bytes || ((num_fields == 3) && tolower(units[1]) != 'b')) { - return 0; - } - } else { + if (ucs_str_to_memunits(buf, dest) != UCS_OK) { return 0; } - - *(size_t*)dest = value * bytes; return 1; } -int ucs_config_sprintf_memunits(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_memunits(char *buf, size_t max, + const void *src, const void *arg) { - size_t sz = 
*(size_t*)src; - - if (sz == UCS_CONFIG_MEMUNITS_INF) { - snprintf(buf, max, UCS_CONFIG_PARSER_NUMERIC_INF_STR); - } else if (sz == UCS_CONFIG_MEMUNITS_AUTO) { - snprintf(buf, max, "auto"); - } else { - ucs_memunits_to_str(sz, buf, max); - } + ucs_memunits_to_str(*(size_t*)src, buf, max); return 1; } int ucs_config_sscanf_ulunits(const char *buf, void *dest, const void *arg) { /* Special value: auto */ - if (!strcasecmp(buf, "auto")) { - *(size_t*)dest = UCS_CONFIG_ULUNITS_AUTO; + if (!strcasecmp(buf, UCS_VALUE_AUTO_STR)) { + *(unsigned long*)dest = UCS_ULUNITS_AUTO; + return 1; + } else if (!strcasecmp(buf, UCS_NUMERIC_INF_STR)) { + *(unsigned long*)dest = UCS_ULUNITS_INF; return 1; } return ucs_config_sscanf_ulong(buf, dest, arg); } -int ucs_config_sprintf_ulunits(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_ulunits(char *buf, size_t max, + const void *src, const void *arg) { - size_t val = *(size_t*)src; + unsigned long val = *(unsigned long*)src; - if (val == UCS_CONFIG_ULUNITS_AUTO) { - return snprintf(buf, max, "auto"); + if (val == UCS_ULUNITS_AUTO) { + return snprintf(buf, max, UCS_VALUE_AUTO_STR); + } else if (val == UCS_ULUNITS_INF) { + return snprintf(buf, max, UCS_NUMERIC_INF_STR); } return ucs_config_sprintf_ulong(buf, max, src, arg); @@ -639,9 +667,10 @@ int ucs_config_sscanf_range_spec(const char *buf, void *dest, const void *arg) return ret; } -int ucs_config_sprintf_range_spec(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_range_spec(char *buf, size_t max, + const void *src, const void *arg) { - ucs_range_spec_t *range_spec = src; + const ucs_range_spec_t *range_spec = src; if (range_spec->first == range_spec->last) { snprintf(buf, max, "%d", range_spec->first); @@ -652,10 +681,10 @@ int ucs_config_sprintf_range_spec(char *buf, size_t max, void *src, const void * return 1; } -ucs_status_t ucs_config_clone_range_spec(void *src, void *dest, const void *arg) +ucs_status_t 
ucs_config_clone_range_spec(const void *src, void *dest, const void *arg) { - ucs_range_spec_t *src_range_spec = src; - ucs_range_spec_t *dest_ragne_spec = dest; + const ucs_range_spec_t *src_range_spec = src; + ucs_range_spec_t *dest_ragne_spec = dest; dest_ragne_spec->first = src_range_spec->first; dest_ragne_spec->last = src_range_spec->last; @@ -668,17 +697,17 @@ int ucs_config_sscanf_array(const char *buf, void *dest, const void *arg) ucs_config_array_field_t *field = dest; void *temp_field; const ucs_config_array_t *array = arg; - char *dup, *token, *saveptr; + char *str_dup, *token, *saveptr; int ret; unsigned i; - dup = strdup(buf); - if (dup == NULL) { + str_dup = strdup(buf); + if (str_dup == NULL) { return 0; } saveptr = NULL; - token = strtok_r(dup, ",", &saveptr); + token = strtok_r(str_dup, ",", &saveptr); temp_field = ucs_calloc(UCS_CONFIG_ARRAY_MAX, array->elem_size, "config array"); i = 0; while (token != NULL) { @@ -686,7 +715,7 @@ int ucs_config_sscanf_array(const char *buf, void *dest, const void *arg) array->parser.arg); if (!ret) { ucs_free(temp_field); - free(dup); + free(str_dup); return 0; } @@ -699,14 +728,15 @@ int ucs_config_sscanf_array(const char *buf, void *dest, const void *arg) field->data = temp_field; field->count = i; - free(dup); + free(str_dup); return 1; } -int ucs_config_sprintf_array(char *buf, size_t max, void *src, const void *arg) +int ucs_config_sprintf_array(char *buf, size_t max, + const void *src, const void *arg) { - ucs_config_array_field_t *field = src; - const ucs_config_array_t *array = arg; + const ucs_config_array_field_t *field = src; + const ucs_config_array_t *array = arg; size_t offset; unsigned i; int ret; @@ -728,10 +758,11 @@ int ucs_config_sprintf_array(char *buf, size_t max, void *src, const void *arg) return 1; } -ucs_status_t ucs_config_clone_array(void *src, void *dest, const void *arg) +ucs_status_t ucs_config_clone_array(const void *src, void *dest, const void *arg) { - ucs_config_array_field_t 
*dest_array = dest, *src_array = src; - const ucs_config_array_t *array = arg; + const ucs_config_array_field_t *src_array = src; + const ucs_config_array_t *array = arg; + ucs_config_array_field_t *dest_array = dest; ucs_status_t status; unsigned i; @@ -743,7 +774,7 @@ ucs_status_t ucs_config_clone_array(void *src, void *dest, const void *arg) dest_array->count = src_array->count; for (i = 0; i < src_array->count; ++i) { - status = array->parser.clone((char*)src_array->data + i * array->elem_size, + status = array->parser.clone((const char*)src_array->data + i * array->elem_size, (char*)dest_array->data + i * array->elem_size, array->parser.arg); if (status != UCS_OK) { @@ -820,7 +851,7 @@ int ucs_config_sscanf_table(const char *buf, void *dest, const void *arg) return 1; } -ucs_status_t ucs_config_clone_table(void *src, void *dst, const void *arg) +ucs_status_t ucs_config_clone_table(const void *src, void *dst, const void *arg) { return ucs_config_parser_clone_opts(src, dst, (ucs_config_field_t*)arg); } @@ -844,6 +875,11 @@ void ucs_config_help_generic(char *buf, size_t max, const void *arg) strncpy(buf, (char*)arg, max); } +static inline int ucs_config_is_deprecated_field(const ucs_config_field_t *field) +{ + return (field->offset == UCS_CONFIG_DEPRECATED_FIELD_OFFSET); +} + static inline int ucs_config_is_alias_field(const ucs_config_field_t *field) { return (field->dfl_value == NULL); @@ -902,6 +938,11 @@ static void ucs_config_parser_release_field(ucs_config_field_t *field, void *var field->parser.release(var, field->parser.arg); } +static int ucs_config_field_is_last(const ucs_config_field_t *field) +{ + return field->name == NULL; +} + ucs_status_t ucs_config_parser_set_default_values(void *opts, ucs_config_field_t *fields) { @@ -909,8 +950,9 @@ ucs_config_parser_set_default_values(void *opts, ucs_config_field_t *fields) ucs_status_t status; void *var; - for (field = fields; field->name; ++field) { - if (ucs_config_is_alias_field(field)) { + for (field = 
fields; !ucs_config_field_is_last(field); ++field) { + if (ucs_config_is_alias_field(field) || + ucs_config_is_deprecated_field(field)) { continue; } @@ -952,7 +994,7 @@ ucs_config_parser_set_value_internal(void *opts, ucs_config_field_t *fields, prefix_len = (table_prefix == NULL) ? 0 : strlen(table_prefix); count = 0; - for (field = fields; field->name; ++field) { + for (field = fields; !ucs_config_field_is_last(field); ++field) { var = (char*)opts + field->offset; @@ -985,6 +1027,10 @@ ucs_config_parser_set_value_internal(void *opts, ucs_config_field_t *fields, } else if (((table_prefix == NULL) || !strncmp(name, table_prefix, prefix_len)) && !strcmp(name + prefix_len, field->name)) { + if (ucs_config_is_deprecated_field(field)) { + return UCS_ERR_NO_ELEM; + } + ucs_config_parser_release_field(field, var); status = ucs_config_parser_parse_field(field, value, var); if (status != UCS_OK) { @@ -997,12 +1043,14 @@ ucs_config_parser_set_value_internal(void *opts, ucs_config_field_t *fields, return (count == 0) ? 
UCS_ERR_NO_ELEM : UCS_OK; } -static void ucs_config_parser_mark_env_var_used(const char *name) +static void ucs_config_parser_mark_env_var_used(const char *name, int *added) { khiter_t iter; char *key; int ret; + *added = 0; + if (!ucs_global_opts.warn_unused_env_vars) { return; } @@ -1020,7 +1068,22 @@ static void ucs_config_parser_mark_env_var_used(const char *name) goto out; } - kh_put(ucs_config_env_vars, &ucs_config_parser_env_vars, key, &ret); +#ifndef __clang_analyzer__ + /* Exclude this code from Clang examination as it generates + * false-postive warning about potential leak of memory + * pointed to by 'key' variable */ + iter = kh_put(ucs_config_env_vars, &ucs_config_parser_env_vars, key, &ret); + if ((ret <= 0) || (iter == kh_end(&ucs_config_parser_env_vars))) { + ucs_warn("kh_put(key=%s) failed", key); + ucs_free(key); + goto out; + } +#else + ucs_free(key); +#endif + + *added = 1; + out: pthread_mutex_unlock(&ucs_config_parser_env_vars_hash_lock); } @@ -1035,13 +1098,14 @@ static ucs_status_t ucs_config_apply_env_vars(void *opts, ucs_config_field_t *fi const char *env_value; void *var; char buf[256]; + int added; /* Put prefix in the buffer. Later we replace only the variable name part */ snprintf(buf, sizeof(buf) - 1, "%s%s", prefix, table_prefix ? 
table_prefix : ""); prefix_len = strlen(buf); /* Parse environment variables */ - for (field = fields; field->name; ++field) { + for (field = fields; !ucs_config_field_is_last(field); ++field) { var = (char*)opts + field->offset; @@ -1069,9 +1133,20 @@ static ucs_status_t ucs_config_apply_env_vars(void *opts, ucs_config_field_t *fi /* Read and parse environment variable */ strncpy(buf + prefix_len, field->name, sizeof(buf) - prefix_len - 1); env_value = getenv(buf); - if (env_value != NULL) { + if (env_value == NULL) { + continue; + } + + ucs_config_parser_mark_env_var_used(buf, &added); + + if (ucs_config_is_deprecated_field(field)) { + if (added && !ignore_errors) { + ucs_warn("%s is deprecated (set %s%s=n to suppress this warning)", + buf, UCS_DEFAULT_ENV_PREFIX, + UCS_GLOBAL_OPTS_WARN_UNUSED_CONFIG); + } + } else { ucs_config_parser_release_field(field, var); - ucs_config_parser_mark_env_var_used(buf); status = ucs_config_parser_parse_field(field, env_value, var); if (status != UCS_OK) { /* If set to ignore errors, restore the default value */ @@ -1092,13 +1167,38 @@ static ucs_status_t ucs_config_apply_env_vars(void *opts, ucs_config_field_t *fi return UCS_OK; } +/* Find if env_prefix consists of multiple prefixes and returns pointer + * to rightmost in this case, otherwise returns NULL + */ +static ucs_status_t ucs_config_parser_get_sub_prefix(const char *env_prefix, + const char **sub_prefix_p) +{ + size_t len; + + /* env_prefix always has "_" at the end and we want to find the last but one + * "_" in the env_prefix */ + len = strlen(env_prefix); + if (len < 2) { + ucs_error("Invalid value of env_prefix: '%s'", env_prefix); + return UCS_ERR_INVALID_PARAM; + } + + len -= 2; + while ((len > 0) && (env_prefix[len - 1] != '_')) { + len -= 1; + } + *sub_prefix_p = (len > 0) ? 
(env_prefix + len): NULL; + + return UCS_OK; +} + ucs_status_t ucs_config_parser_fill_opts(void *opts, ucs_config_field_t *fields, const char *env_prefix, const char *table_prefix, int ignore_errors) { + const char *sub_prefix = NULL; ucs_status_t status; - char prefix[128]; /* Set default values */ status = ucs_config_parser_set_default_values(opts, fields); @@ -1106,18 +1206,24 @@ ucs_status_t ucs_config_parser_fill_opts(void *opts, ucs_config_field_t *fields, goto err; } - /* Apply environment variables */ - status = ucs_config_apply_env_vars(opts, fields, UCS_CONFIG_PREFIX, - table_prefix, 1, ignore_errors); - if (status != UCS_OK) { - goto err_free; - } - - /* Apply environment variables with custom prefix */ if ((env_prefix != NULL) && (strlen(env_prefix) > 0)) { - snprintf(prefix, sizeof(prefix), "%s%s_", UCS_CONFIG_PREFIX, env_prefix); - status = ucs_config_apply_env_vars(opts, fields, prefix, table_prefix, - 1, ignore_errors); + status = ucs_config_parser_get_sub_prefix(env_prefix, &sub_prefix); + if (status != UCS_OK) { + goto err; + } + + /* Apply environment variables */ + if (sub_prefix != NULL) { + status = ucs_config_apply_env_vars(opts, fields, sub_prefix, table_prefix, + 1, ignore_errors); + if (status != UCS_OK) { + goto err_free; + } + } + + /* Apply environment variables with custom prefix */ + status = ucs_config_apply_env_vars(opts, fields, env_prefix, table_prefix, + 1, ignore_errors); if (status != UCS_OK) { goto err_free; } @@ -1153,13 +1259,13 @@ ucs_status_t ucs_config_parser_get_value(void *opts, ucs_config_field_t *fields, } for (field = fields, status = UCS_ERR_NO_ELEM; - field->name && (status == UCS_ERR_NO_ELEM); ++field) { + !ucs_config_field_is_last(field) && (status == UCS_ERR_NO_ELEM); ++field) { name_len = strlen(field->name); - ucs_trace("compare name \"%s\" with field \"%s\" which is%s subtable", + ucs_trace("compare name \"%s\" with field \"%s\" which is %s subtable", name, field->name, - ucs_config_is_table_field(field) ? 
"" : " NOT"); + ucs_config_is_table_field(field) ? "a" : "NOT a"); if (ucs_config_is_table_field(field) && !strncmp(field->name, name, name_len)) { @@ -1187,12 +1293,13 @@ ucs_status_t ucs_config_parser_clone_opts(const void *src, void *dst, ucs_status_t status; ucs_config_field_t *field; - for (field = fields; field->name; ++field) { - if (ucs_config_is_alias_field(field)) { + for (field = fields; !ucs_config_field_is_last(field); ++field) { + if (ucs_config_is_alias_field(field) || + ucs_config_is_deprecated_field(field)) { continue; } - status = field->parser.clone((char*)src + field->offset, + status = field->parser.clone((const char*)src + field->offset, (char*)dst + field->offset, field->parser.arg); if (status != UCS_OK) { @@ -1209,8 +1316,9 @@ void ucs_config_parser_release_opts(void *opts, ucs_config_field_t *fields) { ucs_config_field_t *field; - for (field = fields; field->name; ++field) { - if (ucs_config_is_alias_field(field)) { + for (field = fields; !ucs_config_field_is_last(field); ++field) { + if (ucs_config_is_alias_field(field) || + ucs_config_is_deprecated_field(field)) { continue; } @@ -1231,7 +1339,7 @@ ucs_config_find_aliased_field(const ucs_config_field_t *fields, const ucs_config_field_t *field, *result; size_t offset; - for (field = fields; field->name; ++field) { + for (field = fields; !ucs_config_field_is_last(field); ++field) { if (field == alias) { /* skip */ continue; @@ -1271,15 +1379,22 @@ ucs_config_parser_print_field(FILE *stream, const void *opts, const char *env_pr ucs_assert(!ucs_list_is_empty(prefix_list)); head = ucs_list_head(prefix_list, ucs_config_parser_prefix_t, list); - field->parser.write(value_buf, sizeof(value_buf) - 1, (char*)opts + field->offset, - field->parser.arg); - field->parser.help(syntax_buf, sizeof(syntax_buf) - 1, field->parser.arg); + if (ucs_config_is_deprecated_field(field)) { + snprintf(value_buf, sizeof(value_buf), " (deprecated)"); + snprintf(syntax_buf, sizeof(syntax_buf), "N/A"); + } else { + 
snprintf(value_buf, sizeof(value_buf), "="); + field->parser.write(value_buf + 1, sizeof(value_buf) - 2, + (char*)opts + field->offset, + field->parser.arg); + field->parser.help(syntax_buf, sizeof(syntax_buf) - 1, field->parser.arg); + } if (flags & UCS_CONFIG_PRINT_DOC) { fprintf(stream, "#\n"); ucs_config_print_doc_line_by_line(field, __print_stream_cb, stream); fprintf(stream, "#\n"); - fprintf(stream, "# %-*s %s\n", UCP_CONFIG_PARSER_DOCSTR_WIDTH, "syntax:", + fprintf(stream, "# %-*s %s\n", UCS_CONFIG_PARSER_DOCSTR_WIDTH, "syntax:", syntax_buf); /* Extra docstring */ @@ -1293,7 +1408,7 @@ ucs_config_parser_print_field(FILE *stream, const void *opts, const char *env_pr /* Parents in configuration hierarchy */ if (prefix_list->next != prefix_list->prev) { - fprintf(stream, "# %-*s", UCP_CONFIG_PARSER_DOCSTR_WIDTH, "inherits:"); + fprintf(stream, "# %-*s", UCS_CONFIG_PARSER_DOCSTR_WIDTH, "inherits:"); ucs_list_for_each(prefix, prefix_list, list) { if (prefix == head) { continue; @@ -1310,7 +1425,7 @@ ucs_config_parser_print_field(FILE *stream, const void *opts, const char *env_pr fprintf(stream, "#\n"); } - fprintf(stream, "%s%s%s=%s\n", env_prefix, head->prefix, name, value_buf); + fprintf(stream, "%s%s%s%s\n", env_prefix, head->prefix, name, value_buf); if (flags & UCS_CONFIG_PRINT_DOC) { fprintf(stream, "\n"); @@ -1320,7 +1435,7 @@ ucs_config_parser_print_field(FILE *stream, const void *opts, const char *env_pr static void ucs_config_parser_print_opts_recurs(FILE *stream, const void *opts, const ucs_config_field_t *fields, - unsigned flags, const char *env_prefix, + unsigned flags, const char *prefix, ucs_list_link_t *prefix_list) { const ucs_config_field_t *field, *aliased_field; @@ -1328,17 +1443,34 @@ ucs_config_parser_print_opts_recurs(FILE *stream, const void *opts, ucs_config_parser_prefix_t inner_prefix; size_t alias_table_offset; - for (field = fields; field->name; ++field) { + for (field = fields; !ucs_config_field_is_last(field); ++field) { if 
(ucs_config_is_table_field(field)) { /* Parse with sub-table prefix. * We start the leaf prefix and continue up the hierarchy. */ - inner_prefix.prefix = field->name; - ucs_list_add_tail(prefix_list, &inner_prefix.list); - ucs_config_parser_print_opts_recurs(stream, opts + field->offset, + /* Do not add the same prefix several times in a sequence. It can + * happen when similiar prefix names were used during config + * table inheritance, e.g. "IB_" -> "RC_" -> "RC_". We check the + * previous entry only, since it is currently impossible if + * something like "RC_" -> "IB_" -> "RC_" will be used. */ + if (ucs_list_is_empty(prefix_list) || + strcmp(ucs_list_tail(prefix_list, + ucs_config_parser_prefix_t, + list)->prefix, field->name)) { + inner_prefix.prefix = field->name; + ucs_list_add_tail(prefix_list, &inner_prefix.list); + } else { + inner_prefix.prefix = NULL; + } + + ucs_config_parser_print_opts_recurs(stream, + UCS_PTR_BYTE_OFFSET(opts, field->offset), field->parser.arg, flags, - env_prefix, prefix_list); - ucs_list_del(&inner_prefix.list); + prefix, prefix_list); + + if (inner_prefix.prefix != NULL) { + ucs_list_del(&inner_prefix.list); + } } else if (ucs_config_is_alias_field(field)) { if (flags & UCS_CONFIG_PRINT_HIDDEN) { aliased_field = @@ -1350,17 +1482,22 @@ ucs_config_parser_print_opts_recurs(FILE *stream, const void *opts, head = ucs_list_head(prefix_list, ucs_config_parser_prefix_t, list); - ucs_config_parser_print_field(stream, opts + alias_table_offset, - env_prefix, prefix_list, + ucs_config_parser_print_field(stream, + UCS_PTR_BYTE_OFFSET(opts, alias_table_offset), + prefix, prefix_list, field->name, aliased_field, flags, "%-*s %s%s%s", - UCP_CONFIG_PARSER_DOCSTR_WIDTH, - "alias of:", env_prefix, + UCS_CONFIG_PARSER_DOCSTR_WIDTH, + "alias of:", prefix, head->prefix, aliased_field->name); } } else { - ucs_config_parser_print_field(stream, opts, env_prefix, prefix_list, + if (ucs_config_is_deprecated_field(field) && + !(flags & 
UCS_CONFIG_PRINT_HIDDEN)) { + continue; + } + ucs_config_parser_print_field(stream, opts, prefix, prefix_list, field->name, field, flags, NULL); } } @@ -1368,7 +1505,7 @@ ucs_config_parser_print_opts_recurs(FILE *stream, const void *opts, void ucs_config_parser_print_opts(FILE *stream, const char *title, const void *opts, ucs_config_field_t *fields, const char *table_prefix, - ucs_config_print_flags_t flags) + const char *prefix, ucs_config_print_flags_t flags) { ucs_config_parser_prefix_t table_prefix_elem; UCS_LIST_HEAD(prefix_list); @@ -1385,7 +1522,7 @@ void ucs_config_parser_print_opts(FILE *stream, const char *title, const void *o table_prefix_elem.prefix = table_prefix ? table_prefix : ""; ucs_list_add_tail(&prefix_list, &table_prefix_elem.list); ucs_config_parser_print_opts_recurs(stream, opts, fields, flags, - UCS_CONFIG_PREFIX, &prefix_list); + prefix, &prefix_list); } if (flags & UCS_CONFIG_PRINT_HEADER) { @@ -1393,7 +1530,8 @@ void ucs_config_parser_print_opts(FILE *stream, const char *title, const void *o } } -void ucs_config_parser_print_all_opts(FILE *stream, ucs_config_print_flags_t flags) +void ucs_config_parser_print_all_opts(FILE *stream, const char *prefix, + ucs_config_print_flags_t flags) { const ucs_config_global_list_entry_t *entry; ucs_status_t status; @@ -1401,13 +1539,19 @@ void ucs_config_parser_print_all_opts(FILE *stream, ucs_config_print_flags_t fla void *opts; ucs_list_for_each(entry, &ucs_config_global_list, list) { + if ((entry->table == NULL) || + (ucs_config_field_is_last(&entry->table[0]))) { + /* don't print title for an empty configuration table */ + continue; + } + opts = ucs_malloc(entry->size, "tmp_opts"); if (opts == NULL) { ucs_error("could not allocate configuration of size %zu", entry->size); continue; } - status = ucs_config_parser_fill_opts(opts, entry->fields, NULL, + status = ucs_config_parser_fill_opts(opts, entry->table, prefix, entry->prefix, 0); if (status != UCS_OK) { ucs_free(opts); @@ -1415,17 +1559,16 @@ 
void ucs_config_parser_print_all_opts(FILE *stream, ucs_config_print_flags_t fla } snprintf(title, sizeof(title), "%s configuration", entry->name); - ucs_config_parser_print_opts(stream, title, opts, entry->fields, - entry->prefix, flags); + ucs_config_parser_print_opts(stream, title, opts, entry->table, + entry->prefix, prefix, flags); - ucs_config_parser_release_opts(opts, entry->fields); + ucs_config_parser_release_opts(opts, entry->table); ucs_free(opts); } } -void ucs_config_parser_warn_unused_env_vars() +static void ucs_config_parser_warn_unused_env_vars(const char *prefix) { - static uint32_t warn_once = 1; char unused_env_vars_names[40]; int num_unused_vars; char **envp, *envstr; @@ -1441,13 +1584,9 @@ void ucs_config_parser_warn_unused_env_vars() return; } - if (!ucs_atomic_cswap32(&warn_once, 1, 0)) { - return; - } - pthread_mutex_lock(&ucs_config_parser_env_vars_hash_lock); - prefix_len = strlen(UCS_CONFIG_PREFIX); + prefix_len = strlen(prefix); p = unused_env_vars_names; endp = p + sizeof(unused_env_vars_names) - 1; *endp = '\0'; @@ -1461,7 +1600,7 @@ void ucs_config_parser_warn_unused_env_vars() } var_name = strtok_r(envstr, "=", &saveptr); - if (!var_name || strncmp(var_name, UCS_CONFIG_PREFIX, prefix_len)) { + if (!var_name || strncmp(var_name, prefix, prefix_len)) { ucs_free(envstr); continue; /* Not UCX */ } @@ -1486,18 +1625,51 @@ void ucs_config_parser_warn_unused_env_vars() p[-1] = '\0'; /* remove trailing comma */ } ucs_warn("unused env variable%s:%s%s (set %s%s=n to suppress this warning)", - num_unused_vars > 1 ? "s" : "", unused_env_vars_names, - truncated ? "..." : "", UCS_CONFIG_PREFIX, + (num_unused_vars > 1) ? "s" : "", unused_env_vars_names, + truncated ? "..." 
: "", UCS_DEFAULT_ENV_PREFIX, UCS_GLOBAL_OPTS_WARN_UNUSED_CONFIG); } pthread_mutex_unlock(&ucs_config_parser_env_vars_hash_lock); } +void ucs_config_parser_warn_unused_env_vars_once(const char *env_prefix) +{ + const char *sub_prefix = NULL; + int added; + ucs_status_t status; + + /* Although env_prefix is not real environment variable put it + * into table anyway to save prefixes which was already checked. + * Need to save both env_prefix and base_prefix */ + ucs_config_parser_mark_env_var_used(env_prefix, &added); + if (!added) { + return; + } + + ucs_config_parser_warn_unused_env_vars(env_prefix); + + status = ucs_config_parser_get_sub_prefix(env_prefix, &sub_prefix); + if (status != UCS_OK) { + return; + } + + if (sub_prefix == NULL) { + return; + } + + ucs_config_parser_mark_env_var_used(sub_prefix, &added); + if (!added) { + return; + } + + ucs_config_parser_warn_unused_env_vars(sub_prefix); +} + size_t ucs_config_memunits_get(size_t config_size, size_t auto_size, size_t max_size) { - if (config_size == UCS_CONFIG_MEMUNITS_AUTO) { + if (config_size == UCS_MEMUNITS_AUTO) { return auto_size; } else { return ucs_min(config_size, max_size); diff --git a/src/ucs/config/parser.h b/src/ucs/config/parser.h index b71921a7453..b67c44df87a 100644 --- a/src/ucs/config/parser.h +++ b/src/ucs/config/parser.h @@ -1,6 +1,6 @@ /* -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -13,10 +13,12 @@ #include #include #include +#include + #include -#define UCS_CONFIG_PREFIX "UCX_" +#define UCS_DEFAULT_ENV_PREFIX "UCX_" #define UCS_CONFIG_ARRAY_MAX 128 BEGIN_C_DECLS @@ -39,8 +41,9 @@ BEGIN_C_DECLS typedef struct ucs_config_parser { int (*read) (const char *buf, void *dest, const void *arg); - int (*write)(char *buf, size_t max, void *src, const void *arg); - ucs_status_t (*clone)(void *src, void *dest, const void *arg); + int (*write)(char *buf, size_t max, + const void *src, const void *arg); + ucs_status_t (*clone)(const void *src, void *dest, const void *arg); void (*release)(void *ptr, const void *arg); void (*help)(char *buf, size_t max, const void *arg); const void *arg; @@ -75,120 +78,140 @@ typedef struct ucs_range_spec { typedef struct ucs_config_global_list_entry { - ucs_list_link_t list; - const char *name; - const char *prefix; - ucs_config_field_t *fields; - size_t size; + const char *name; /* configuration table name */ + const char *prefix; /* configuration prefix */ + ucs_config_field_t *table; /* array of configuration fields */ + size_t size; /* size of config structure */ + ucs_list_link_t list; /* entry in global list */ } ucs_config_global_list_entry_t; typedef struct ucs_config_bw_spec { - char *name; - double bw; + char *name; + double bw; } ucs_config_bw_spec_t; -#define UCS_CONFIG_REGISTER_TABLE(_fields, _name, _prefix, _type) \ - static ucs_config_global_list_entry_t _fields##_config_entry; \ +#define UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY \ + { \ + .name = "", \ + .prefix = "", \ + .table = NULL, \ + .size = 0, \ + } + + +#define UCS_CONFIG_REGISTER_TABLE_ENTRY(_entry) \ UCS_STATIC_INIT { \ - ucs_config_global_list_entry_t *entry = &_fields##_config_entry; \ - extern ucs_list_link_t ucs_config_global_list; \ - entry->fields = _fields; \ - entry->name = _name; \ - entry->prefix = _prefix; \ - entry->size = sizeof(_type); \ - ucs_list_add_tail(&ucs_config_global_list, &entry->list); \ + 
ucs_list_add_tail(&ucs_config_global_list, &(_entry)->list); \ } \ \ UCS_STATIC_CLEANUP { \ - ucs_list_del(&_fields##_config_entry.list); \ + ucs_list_del(&(_entry)->list); \ } +#define UCS_CONFIG_REGISTER_TABLE(_table, _name, _prefix, _type) \ + static ucs_config_global_list_entry_t _table##_config_entry = { \ + .table = _table, \ + .name = _name, \ + .prefix = _prefix, \ + .size = sizeof(_type) \ + }; \ + UCS_CONFIG_REGISTER_TABLE_ENTRY(&_table##_config_entry); + +extern ucs_list_link_t ucs_config_global_list; /* * Parsing and printing different data types */ int ucs_config_sscanf_string(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_string(char *buf, size_t max, void *src, const void *arg); -ucs_status_t ucs_config_clone_string(void *src, void *dest, const void *arg); +int ucs_config_sprintf_string(char *buf, size_t max, const void *src, const void *arg); +ucs_status_t ucs_config_clone_string(const void *src, void *dest, const void *arg); void ucs_config_release_string(void *ptr, const void *arg); int ucs_config_sscanf_int(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_int(char *buf, size_t max, void *src, const void *arg); -ucs_status_t ucs_config_clone_int(void *src, void *dest, const void *arg); +int ucs_config_sprintf_int(char *buf, size_t max, const void *src, const void *arg); +ucs_status_t ucs_config_clone_int(const void *src, void *dest, const void *arg); int ucs_config_sscanf_uint(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_uint(char *buf, size_t max, void *src, const void *arg); -ucs_status_t ucs_config_clone_uint(void *src, void *dest, const void *arg); +int ucs_config_sprintf_uint(char *buf, size_t max, const void *src, const void *arg); +ucs_status_t ucs_config_clone_uint(const void *src, void *dest, const void *arg); int ucs_config_sscanf_ulong(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_ulong(char *buf, size_t max, void *src, const void *arg); 
-ucs_status_t ucs_config_clone_ulong(void *src, void *dest, const void *arg); +int ucs_config_sprintf_ulong(char *buf, size_t max, const void *src, const void *arg); +ucs_status_t ucs_config_clone_ulong(const void *src, void *dest, const void *arg); int ucs_config_sscanf_double(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_double(char *buf, size_t max, void *src, const void *arg); -ucs_status_t ucs_config_clone_double(void *src, void *dest, const void *arg); +int ucs_config_sprintf_double(char *buf, size_t max, const void *src, const void *arg); +ucs_status_t ucs_config_clone_double(const void *src, void *dest, const void *arg); int ucs_config_sscanf_hex(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_hex(char *buf, size_t max, void *src, const void *arg); +int ucs_config_sprintf_hex(char *buf, size_t max, const void *src, const void *arg); int ucs_config_sscanf_bool(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_bool(char *buf, size_t max, void *src, const void *arg); +int ucs_config_sprintf_bool(char *buf, size_t max, const void *src, const void *arg); int ucs_config_sscanf_ternary(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_ternary(char *buf, size_t max, void *src, const void *arg); +int ucs_config_sprintf_ternary(char *buf, size_t max, const void *src, const void *arg); + +int ucs_config_sscanf_on_off(const char *buf, void *dest, const void *arg); + +int ucs_config_sscanf_on_off_auto(const char *buf, void *dest, const void *arg); +int ucs_config_sprintf_on_off_auto(char *buf, size_t max, const void *src, const void *arg); int ucs_config_sscanf_enum(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_enum(char *buf, size_t max, void *src, const void *arg); +int ucs_config_sprintf_enum(char *buf, size_t max, const void *src, const void *arg); void ucs_config_help_enum(char *buf, size_t max, const void *arg); int ucs_config_sscanf_bitmap(const char 
*buf, void *dest, const void *arg); -int ucs_config_sprintf_bitmap(char *buf, size_t max, void *src, const void *arg); +int ucs_config_sprintf_bitmap(char *buf, size_t max, const void *src, const void *arg); void ucs_config_help_bitmap(char *buf, size_t max, const void *arg); int ucs_config_sscanf_bitmask(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_bitmask(char *buf, size_t max, void *src, const void *arg); +int ucs_config_sprintf_bitmask(char *buf, size_t max, const void *src, const void *arg); int ucs_config_sscanf_time(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_time(char *buf, size_t max, void *src, const void *arg); +int ucs_config_sprintf_time(char *buf, size_t max, const void *src, const void *arg); int ucs_config_sscanf_bw(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_bw(char *buf, size_t max, void *src, const void *arg); +int ucs_config_sprintf_bw(char *buf, size_t max, const void *src, const void *arg); int ucs_config_sscanf_bw_spec(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_bw_spec(char *buf, size_t max, void *src, const void *arg); -ucs_status_t ucs_config_clone_bw_spec(void *src, void *dest, const void *arg); +int ucs_config_sprintf_bw_spec(char *buf, size_t max, const void *src, const void *arg); +ucs_status_t ucs_config_clone_bw_spec(const void *src, void *dest, const void *arg); void ucs_config_release_bw_spec(void *ptr, const void *arg); int ucs_config_sscanf_signo(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_signo(char *buf, size_t max, void *src, const void *arg); +int ucs_config_sprintf_signo(char *buf, size_t max, const void *src, const void *arg); int ucs_config_sscanf_memunits(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_memunits(char *buf, size_t max, void *src, const void *arg); +int ucs_config_sprintf_memunits(char *buf, size_t max, const void *src, const void *arg); int 
ucs_config_sscanf_ulunits(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_ulunits(char *buf, size_t max, void *src, const void *arg); +int ucs_config_sprintf_ulunits(char *buf, size_t max, const void *src, const void *arg); int ucs_config_sscanf_range_spec(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_range_spec(char *buf, size_t max, void *src, const void *arg); -ucs_status_t ucs_config_clone_range_spec(void *src, void *dest, const void *arg); +int ucs_config_sprintf_range_spec(char *buf, size_t max, const void *src, const void *arg); +ucs_status_t ucs_config_clone_range_spec(const void *src, void *dest, const void *arg); int ucs_config_sscanf_array(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_array(char *buf, size_t max, void *src, const void *arg); -ucs_status_t ucs_config_clone_array(void *src, void *dest, const void *arg); +int ucs_config_sprintf_array(char *buf, size_t max, const void *src, const void *arg); +ucs_status_t ucs_config_clone_array(const void *src, void *dest, const void *arg); void ucs_config_release_array(void *ptr, const void *arg); void ucs_config_help_array(char *buf, size_t max, const void *arg); int ucs_config_sscanf_table(const char *buf, void *dest, const void *arg); -ucs_status_t ucs_config_clone_table(void *src, void *dest, const void *arg); +ucs_status_t ucs_config_clone_table(const void *src, void *dest, const void *arg); void ucs_config_release_table(void *ptr, const void *arg); void ucs_config_help_table(char *buf, size_t max, const void *arg); +ucs_status_t ucs_config_clone_log_comp(const void *src, void *dst, const void *arg); + void ucs_config_release_nop(void *ptr, const void *arg); void ucs_config_help_generic(char *buf, size_t max, const void *arg); +#define UCS_CONFIG_DEPRECATED_FIELD_OFFSET SIZE_MAX /* Forward declaration of array. Should be in header file. 
*/ #define UCS_CONFIG_DECLARE_ARRAY(_name) \ @@ -217,7 +240,7 @@ void ucs_config_help_generic(char *buf, size_t max, const void *arg); #define UCS_CONFIG_TYPE_ULUNITS {ucs_config_sscanf_ulunits, ucs_config_sprintf_ulunits, \ ucs_config_clone_ulong, ucs_config_release_nop, \ ucs_config_help_generic, \ - "unsigned long: or \"auto\""} + "unsigned long: , \"inf\", or \"auto\""} #define UCS_CONFIG_TYPE_DOUBLE {ucs_config_sscanf_double, ucs_config_sprintf_double, \ ucs_config_clone_double, ucs_config_release_nop, \ @@ -225,7 +248,8 @@ void ucs_config_help_generic(char *buf, size_t max, const void *arg); #define UCS_CONFIG_TYPE_HEX {ucs_config_sscanf_hex, ucs_config_sprintf_hex, \ ucs_config_clone_uint, ucs_config_release_nop, \ - ucs_config_help_generic, "hex representation of a number"} + ucs_config_help_generic, \ + "hex representation of a number or \"auto\""} #define UCS_CONFIG_TYPE_BOOL {ucs_config_sscanf_bool, ucs_config_sprintf_bool, \ ucs_config_clone_int, ucs_config_release_nop, \ @@ -235,6 +259,14 @@ void ucs_config_help_generic(char *buf, size_t max, const void *arg); ucs_config_clone_int, ucs_config_release_nop, \ ucs_config_help_generic, ""} +#define UCS_CONFIG_TYPE_ON_OFF {ucs_config_sscanf_on_off, ucs_config_sprintf_on_off_auto, \ + ucs_config_clone_int, ucs_config_release_nop, \ + ucs_config_help_generic, ""} + +#define UCS_CONFIG_TYPE_ON_OFF_AUTO {ucs_config_sscanf_on_off_auto, ucs_config_sprintf_on_off_auto, \ + ucs_config_clone_int, ucs_config_release_nop, \ + ucs_config_help_generic, ""} + #define UCS_CONFIG_TYPE_ENUM(t) {ucs_config_sscanf_enum, ucs_config_sprintf_enum, \ ucs_config_clone_uint, ucs_config_release_nop, \ ucs_config_help_enum, t} @@ -254,12 +286,16 @@ void ucs_config_help_generic(char *buf, size_t max, const void *arg); #define UCS_CONFIG_TYPE_BW {ucs_config_sscanf_bw, ucs_config_sprintf_bw, \ ucs_config_clone_double, ucs_config_release_nop, \ ucs_config_help_generic, \ - "bandwidth value: [T|G|K]B|b[[p|/]s]"} + "bandwidth value: 
[T|G|M|K]B|b[[p|/]s] or \"auto\""} #define UCS_CONFIG_TYPE_BW_SPEC {ucs_config_sscanf_bw_spec, ucs_config_sprintf_bw_spec, \ ucs_config_clone_bw_spec, ucs_config_release_bw_spec, \ ucs_config_help_generic, \ - "device_name:[T|G|K]B|b[[p|/]s]"} + "device_name:[T|G|M|K]B|b[[p|/]s] or device_name:auto"} + +#define UCS_CONFIG_TYPE_LOG_COMP {ucs_config_sscanf_enum, ucs_config_sprintf_enum, \ + ucs_config_clone_log_comp, ucs_config_release_nop, \ + ucs_config_help_enum, ucs_log_level_names} #define UCS_CONFIG_TYPE_SIGNO {ucs_config_sscanf_signo, ucs_config_sprintf_signo, \ ucs_config_clone_int, ucs_config_release_nop, \ @@ -282,13 +318,28 @@ void ucs_config_help_generic(char *buf, size_t max, const void *arg); ucs_config_clone_range_spec, ucs_config_release_nop, \ ucs_config_help_generic, "numbers range: -"} -/* - * Helpers for using an array of strings. +#define UCS_CONFIG_TYPE_DEPRECATED {(ucs_field_type(ucs_config_parser_t, read)) ucs_empty_function_do_assert, \ + (ucs_field_type(ucs_config_parser_t, write)) ucs_empty_function_do_assert, \ + (ucs_field_type(ucs_config_parser_t, clone)) ucs_empty_function_do_assert, \ + (ucs_field_type(ucs_config_parser_t, release))ucs_empty_function_do_assert, \ + (ucs_field_type(ucs_config_parser_t, help)) ucs_empty_function_do_assert, \ + ""} + +/** + * Helpers for using an array of strings */ #define UCS_CONFIG_TYPE_STRING_ARRAY \ UCS_CONFIG_TYPE_ARRAY(string) -UCS_CONFIG_DECLARE_ARRAY(string); +UCS_CONFIG_DECLARE_ARRAY(string) + + +/** + * Helpers for Bandwidth units (see UCS_CONFIG_TYPE_BW) + */ +#define UCS_CONFIG_BW_AUTO ((double)-2) +#define UCS_CONFIG_BW_IS_AUTO(_value) ((ssize_t)(_value) == UCS_CONFIG_BW_AUTO) + /** * Set default values for options. @@ -305,7 +356,8 @@ ucs_config_parser_set_default_values(void *opts, ucs_config_field_t *fields); * * @param opts User-defined options structure to fill. * @param fields Array of fields which define how to parse. - * @param env_prefix Prefix to add to all environment variables. 
+ * @param env_prefix Prefix to add to all environment variables, + * env_prefix may consist of multiple sub preifxes * @param table_prefix Optional prefix to add to the variables of top-level table. * @param ignore_errors Whether to ignore parsing errors and continue parsing * other fields. @@ -323,7 +375,7 @@ ucs_status_t ucs_config_parser_fill_opts(void *opts, ucs_config_field_t *fields, * @param table Array of fields which define the structure of the options. */ ucs_status_t ucs_config_parser_clone_opts(const void *src, void *dst, - ucs_config_field_t *fields); + ucs_config_field_t *fields); /** * Release the options fields. @@ -341,19 +393,22 @@ void ucs_config_parser_release_opts(void *opts, ucs_config_field_t *fields); * @param opts User-defined options structure. * @param fields Array of fields which define the options. * @param table_prefix Optional prefix to add to the variables of top-level table. + * @param prefix Prefix to add to all environment variables. * @param flags Flags which control the output. */ void ucs_config_parser_print_opts(FILE *stream, const char *title, const void *opts, ucs_config_field_t *fields, const char *table_prefix, - ucs_config_print_flags_t flags); + const char *prefix, ucs_config_print_flags_t flags); /** * Print all options defined in the library - names, values, documentation. * * @param stream Output stream to print to. + * @param prefix Prefix to add to all environment variables. * @param flags Flags which control the output. */ -void ucs_config_parser_print_all_opts(FILE *stream, ucs_config_print_flags_t flags); +void ucs_config_parser_print_all_opts(FILE *stream, const char *prefix, + ucs_config_print_flags_t flags); /** * Read a value from options structure. 
@@ -379,11 +434,14 @@ ucs_status_t ucs_config_parser_set_value(void *opts, ucs_config_field_t *fields, const char *name, const char *value); /** - * Check all UCX_ environment variables have been used so far by the - * configuration parser, issue a warning if not. Called just before program exit. + * Wrapper for `ucs_config_parser_warn_unused_env_vars` + * that ensures that this is called once + * + * @param env_prefix Environment variable prefix. + * env_prefix may consist of multiple sub prefixex */ -void ucs_config_parser_warn_unused_env_vars(); +void ucs_config_parser_warn_unused_env_vars_once(const char *env_prefix); /** * Translate configuration value of "MEMUNITS" type to actual value. diff --git a/src/ucs/config/types.h b/src/ucs/config/types.h index bbce1db157c..4a8659aa690 100644 --- a/src/ucs/config/types.h +++ b/src/ucs/config/types.h @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -7,7 +7,6 @@ #ifndef UCS_CONFIG_TYPES_H #define UCS_CONFIG_TYPES_H - #include #include @@ -18,6 +17,7 @@ typedef enum { UCS_LOG_LEVEL_FATAL, /* Immediate termination */ UCS_LOG_LEVEL_ERROR, /* Error is returned to the user */ UCS_LOG_LEVEL_WARN, /* Something's wrong, but we continue */ + UCS_LOG_LEVEL_DIAG, /* Diagnostics, silent adjustments or internal error handling */ UCS_LOG_LEVEL_INFO, /* Information */ UCS_LOG_LEVEL_DEBUG, /* Low-volume debugging */ UCS_LOG_LEVEL_TRACE, /* High-volume debugging */ @@ -58,6 +58,17 @@ typedef enum ucs_ternary_value { } ucs_ternary_value_t; +/** + * On/Off/Auto logic value. 
+ */ +typedef enum ucs_on_off_auto_value { + UCS_CONFIG_OFF = 0, + UCS_CONFIG_ON = 1, + UCS_CONFIG_AUTO = 2, + UCS_CONFIG_ON_OFF_LAST +} ucs_on_off_auto_value_t; + + /** * Error handling modes */ @@ -80,12 +91,6 @@ typedef enum { } ucs_config_print_flags_t; -#define UCS_CONFIG_MEMUNITS_INF SIZE_MAX -#define UCS_CONFIG_MEMUNITS_AUTO (SIZE_MAX - 1) - -#define UCS_CONFIG_ULUNITS_AUTO (SIZE_MAX - 1) - - /** * Structure type for array configuration. Should be used inside the configuration * structure declaration. @@ -114,4 +119,12 @@ typedef struct ucs_sock_addr { socklen_t addrlen; /**< Address length */ } ucs_sock_addr_t; +/** + * Logging component. + */ +typedef struct ucs_log_component_config { + ucs_log_level_t log_level; + char name[16]; +} ucs_log_component_config_t; + #endif /* TYPES_H_ */ diff --git a/src/ucs/config/ucm_opts.c b/src/ucs/config/ucm_opts.c index 34d0c19a104..663f722b187 100644 --- a/src/ucs/config/ucm_opts.c +++ b/src/ucs/config/ucm_opts.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "parser.h" #include @@ -71,6 +75,11 @@ static ucs_config_field_t ucm_global_config_table[] = { ucs_offsetof(ucm_global_config_t, enable_dynamic_mmap_thresh), UCS_CONFIG_TYPE_BOOL}, + {"DLOPEN_PROCESS_RPATH", "yes", + "Process RPATH section of caller module during dynamic libraries opening.", + ucs_offsetof(ucm_global_config_t, dlopen_process_rpath), + UCS_CONFIG_TYPE_BOOL}, + {NULL} }; @@ -79,5 +88,5 @@ UCS_CONFIG_REGISTER_TABLE(ucm_global_config_table, "UCM", UCM_CONFIG_PREFIX, UCS_STATIC_INIT { (void)ucs_config_parser_fill_opts(&ucm_global_opts, ucm_global_config_table, - NULL, UCM_CONFIG_PREFIX, 0); + UCS_DEFAULT_ENV_PREFIX, UCM_CONFIG_PREFIX, 0); } diff --git a/src/ucs/datastruct/arbiter.c b/src/ucs/datastruct/arbiter.c index 9d26d768bab..ac327863f8c 100644 --- a/src/ucs/datastruct/arbiter.c +++ b/src/ucs/datastruct/arbiter.c @@ -4,33 +4,52 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "arbiter.h" #include #include -#define SENTINEL ((ucs_arbiter_elem_t*)0x1) - void ucs_arbiter_init(ucs_arbiter_t *arbiter) { - arbiter->current = NULL; - UCS_ARBITER_GUARD_INIT(arbiter); + ucs_list_head_init(&arbiter->list); } void ucs_arbiter_group_init(ucs_arbiter_group_t *group) { group->tail = NULL; + UCS_ARBITER_GROUP_GUARD_INIT(group); } void ucs_arbiter_cleanup(ucs_arbiter_t *arbiter) { - ucs_assert(arbiter->current == NULL); + ucs_assert_always(ucs_arbiter_is_empty(arbiter)); } void ucs_arbiter_group_cleanup(ucs_arbiter_group_t *group) { - ucs_assert(group->tail == NULL); + UCS_ARBITER_GROUP_GUARD_CHECK(group); + ucs_assert_always(ucs_arbiter_group_is_empty(group)); +} + +static inline int ucs_arbiter_group_head_is_scheduled(ucs_arbiter_elem_t *head) +{ + return head->list.next != NULL; +} + +static inline void ucs_arbiter_group_head_reset(ucs_arbiter_elem_t *head) +{ + head->list.next = NULL; /* Not scheduled yet */ +} + +static inline void ucs_arbiter_elem_set_scheduled(ucs_arbiter_elem_t *elem, + ucs_arbiter_group_t *group) +{ + elem->group = group; } void ucs_arbiter_group_push_elem_always(ucs_arbiter_group_t *group, @@ -39,48 +58,28 @@ void ucs_arbiter_group_push_elem_always(ucs_arbiter_group_t *group, ucs_arbiter_elem_t *tail = group->tail; if (tail == NULL) { - elem->list.next = NULL; /* Not scheduled yet */ - elem->next = elem; /* Connect to itself */ + /* group is empty */ + ucs_arbiter_group_head_reset(elem); + elem->next = elem; /* Connect to itself */ } else { elem->next = tail->next; /* Point to first element */ tail->next = elem; /* Point previous element to new one */ } - elem->group = group; /* Always point to group */ group->tail = elem; /* Update group tail */ + ucs_arbiter_elem_set_scheduled(elem, group); } -static void ucs_arbiter_group_head_replaced(ucs_arbiter_t *arbiter, - ucs_arbiter_elem_t *old_head, - ucs_arbiter_elem_t *new_head) -{ - ucs_assert(old_head->list.prev 
!= NULL); - ucs_assert(old_head->list.next != NULL); - ucs_assert(old_head != new_head); - - if (old_head->list.next == &old_head->list) { - /* single group which was scheduled */ - ucs_assert(arbiter->current == old_head); - ucs_list_head_init(&new_head->list); - arbiter->current = new_head; - } else { - ucs_list_insert_replace(old_head->list.prev, old_head->list.next, - &new_head->list); - if (arbiter->current == old_head) { - arbiter->current = new_head; - } - } -} - -void ucs_arbiter_group_push_head_elem_always(ucs_arbiter_t *arbiter, - ucs_arbiter_group_t *group, +void ucs_arbiter_group_push_head_elem_always(ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem) { ucs_arbiter_elem_t *tail = group->tail; ucs_arbiter_elem_t *head; - elem->group = group; /* Always point to group */ - elem->list.next = NULL; /* Not scheduled yet */ + UCS_ARBITER_GROUP_GUARD_CHECK(group); + + ucs_arbiter_group_head_reset(elem); + ucs_arbiter_elem_set_scheduled(elem, group); if (tail == NULL) { elem->next = elem; /* Connect to itself */ @@ -92,63 +91,47 @@ void ucs_arbiter_group_push_head_elem_always(ucs_arbiter_t *arbiter, elem->next = head; /* Point to first element */ tail->next = elem; /* Point previous element to new one */ - if (head->list.next != NULL) { - ucs_assert(arbiter != NULL); - ucs_arbiter_group_head_replaced(arbiter, head, elem); - } -} - -void ucs_arbiter_group_head_desched(ucs_arbiter_t *arbiter, - ucs_arbiter_elem_t *head) -{ - ucs_arbiter_elem_t *next; - - if (head->list.next == NULL) { - return; /* Not scheduled */ - } - - /* If this group is the next to be scheduled, skip it */ - if (arbiter->current == head) { - next = ucs_list_next(&head->list, ucs_arbiter_elem_t, list); - arbiter->current = (next == head) ? 
NULL : next; + if (!ucs_arbiter_group_head_is_scheduled(head)) { + return; } - ucs_list_del(&head->list); + ucs_list_replace(&head->list, &elem->list); } void ucs_arbiter_group_purge(ucs_arbiter_t *arbiter, ucs_arbiter_group_t *group, ucs_arbiter_callback_t cb, void *cb_arg) { - ucs_arbiter_elem_t *tail = group->tail; - ucs_arbiter_elem_t *next_group = NULL; - ucs_arbiter_elem_t *prev_group = NULL; + ucs_arbiter_elem_t *tail = group->tail; + ucs_arbiter_elem_t dummy_group_head = {}; ucs_arbiter_elem_t *ptr, *next, *prev; - ucs_arbiter_elem_t *head, *orig_head; ucs_arbiter_cb_result_t result; - int is_scheduled; + ucs_arbiter_elem_t *head; + int sched_group; if (tail == NULL) { return; /* Empty group */ } - orig_head = head = tail->next; - is_scheduled = (head->list.next != NULL); - next = head; - prev = tail; + UCS_ARBITER_GROUP_GUARD_CHECK(group); + + head = tail->next; + next = head; + prev = tail; - if (is_scheduled) { - prev_group = ucs_list_prev(&head->list, ucs_arbiter_elem_t, list); - next_group = ucs_list_next(&head->list, ucs_arbiter_elem_t, list); + sched_group = ucs_arbiter_group_head_is_scheduled(head); + if (sched_group) { + /* put a placeholder on the arbiter queue */ + ucs_list_replace(&head->list, &dummy_group_head.list); } do { ptr = next; next = ptr->next; - /* Can't touch the element if it gets removed. But it can be reused - * later as well, so it's next should be NULL. */ - ptr->next = NULL; - result = cb(arbiter, ptr, cb_arg); + /* Can't touch the element after cb is called if it gets removed. But it + * can be reused later as well, so it's group should be NULL. 
*/ + ucs_arbiter_elem_init(ptr); + result = cb(arbiter, group, ptr, cb_arg); if (result == UCS_ARBITER_CB_RESULT_REMOVE_ELEM) { if (ptr == head) { @@ -156,56 +139,74 @@ void ucs_arbiter_group_purge(ucs_arbiter_t *arbiter, if (ptr == tail) { /* Last element is being removed - mark group as empty */ group->tail = NULL; - /* Break here to keep ptr->next = NULL, otherwise ptr->next - will be set to itself below */ - break; + if (sched_group) { + ucs_list_del(&dummy_group_head.list); + } + /* Break here to avoid further processing of the group */ + return; } } else if (ptr == tail) { group->tail = prev; /* tail->next should point to head, make sure next is head - * (it is assinged 2 lines below) */ + * (it is assigned 2 lines below) */ ucs_assert_always(next == head); } prev->next = next; } else { /* keep the element */ - ptr->next = next; /* Restore next pointer */ - prev = ptr; + ucs_arbiter_elem_set_scheduled(ptr, group); + prev = ptr; } } while (ptr != tail); - if (is_scheduled) { - if (orig_head == prev_group) { - /* this is the only group which was scheduled */ - if (group->tail == NULL) { - /* group became empty - no more groups scheduled */ - arbiter->current = NULL; - } else if (orig_head != head) { - /* keep the group scheduled, but with new head element */ - arbiter->current = head; - ucs_list_head_init(&head->list); - } - } else { - if (group->tail == NULL) { - /* group became empty - deschedule it */ - prev_group->list.next = &next_group->list; - next_group->list.prev = &prev_group->list; - if (arbiter->current == orig_head) { - arbiter->current = next_group; - } - } else if (orig_head != head) { - /* keep the group scheduled, but with new head element */ - ucs_list_insert_replace(&prev_group->list, - &next_group->list, - &head->list); - if (arbiter->current == orig_head) { - arbiter->current = head; - } - } - } - } else if ((orig_head != head) && (group->tail != NULL)) { - /* Mark new head as unscheduled */ - head->list.next = NULL; + 
ucs_assert(group->tail != NULL); + + if (sched_group) { + /* restore group head (could be old or new) instead of the dummy element */ + ucs_list_replace(&dummy_group_head.list, &head->list); + } else { + /* mark the group head (could be old or new) as unscheduled */ + ucs_arbiter_group_head_reset(head); + } +} + +size_t ucs_arbiter_group_num_elems(ucs_arbiter_group_t *group) +{ + ucs_arbiter_elem_t *elem = group->tail; + size_t num_elems; + + if (elem == NULL) { + return 0; + } + + num_elems = 0; + do { + ++num_elems; + elem = elem->next; + } while (elem != group->tail); + + return num_elems; +} + + +int ucs_arbiter_group_is_scheduled(ucs_arbiter_group_t *group) +{ + ucs_arbiter_elem_t *head; + + if (ucs_arbiter_group_is_empty(group)) { + return 0; + } + + head = group->tail->next; + return ucs_arbiter_group_head_is_scheduled(head); +} + +static void +ucs_arbiter_schedule_head_if_not_scheduled(ucs_arbiter_t *arbiter, + ucs_arbiter_elem_t *head) +{ + if (!ucs_arbiter_group_head_is_scheduled(head)) { + ucs_list_add_tail(&arbiter->list, &head->list); } } @@ -213,167 +214,211 @@ void ucs_arbiter_group_schedule_nonempty(ucs_arbiter_t *arbiter, ucs_arbiter_group_t *group) { ucs_arbiter_elem_t *tail = group->tail; - ucs_arbiter_elem_t *current, *head; - - UCS_ARBITER_GUARD_CHECK(arbiter); + ucs_arbiter_elem_t *head; ucs_assert(tail != NULL); head = tail->next; - if (head == NULL) { - /* it means that 1 element group is - * scheduled during dispatch. - * Restore next pointer. 
- */ - head = tail->next = tail; - } + ucs_assert(head != NULL); + ucs_arbiter_schedule_head_if_not_scheduled(arbiter, head); + UCS_ARBITER_GROUP_ARBITER_SET(group, arbiter); +} + +void ucs_arbiter_group_desched_nonempty(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group) +{ + ucs_arbiter_elem_t *head = group->tail->next; - if (head->list.next != NULL) { - return; /* Already scheduled */ + if (!ucs_arbiter_group_head_is_scheduled(head)) { + return; } - current = arbiter->current; - if (current == NULL) { - ucs_list_head_init(&head->list); - arbiter->current = head; + UCS_ARBITER_GROUP_ARBITER_CHECK(group, arbiter); + UCS_ARBITER_GROUP_ARBITER_SET(group, NULL); + ucs_list_del(&head->list); + ucs_arbiter_group_head_reset(head); +} + +static inline void +ucs_arbiter_remove_and_reset_if_scheduled(ucs_arbiter_elem_t *elem) +{ + if (ucs_unlikely(ucs_arbiter_group_head_is_scheduled(elem))) { + ucs_list_del(&elem->list); + ucs_arbiter_group_head_reset(elem); + } +} + +static inline void +ucs_arbiter_group_head_replace(ucs_arbiter_group_t *group, + ucs_arbiter_elem_t *group_head, + ucs_arbiter_elem_t *new_group_head) +{ + /* check if this is really the group head */ + ucs_assert(!ucs_arbiter_group_is_empty(group)); + ucs_assert(group->tail->next == group_head); + + if (group_head->next == group_head) { + group->tail = new_group_head; } else { - ucs_list_insert_before(¤t->list, &head->list); + new_group_head->next = group_head->next; } + group->tail->next = new_group_head; } void ucs_arbiter_dispatch_nonempty(ucs_arbiter_t *arbiter, unsigned per_group, ucs_arbiter_callback_t cb, void *cb_arg) { - ucs_arbiter_elem_t *group_head, *last_elem, *elem, *next_elem; - ucs_list_link_t *elem_list_next; - ucs_arbiter_elem_t *next_group, *prev_group; - ucs_arbiter_group_t *group; + ucs_arbiter_elem_t *group_head; ucs_arbiter_cb_result_t result; unsigned group_dispatch_count; - UCS_LIST_HEAD(resched_groups); + ucs_arbiter_group_t *group; + UCS_LIST_HEAD(resched_list); + 
ucs_arbiter_elem_t dummy; + + ucs_assert(!ucs_list_is_empty(&arbiter->list)); - next_group = arbiter->current; - ucs_assert(next_group != NULL); + ucs_arbiter_group_head_reset(&dummy); do { - group_head = next_group; + group_head = ucs_list_extract_head(&arbiter->list, ucs_arbiter_elem_t, + list); ucs_assert(group_head != NULL); - prev_group = ucs_list_prev(&group_head->list, ucs_arbiter_elem_t, list); - next_group = ucs_list_next(&group_head->list, ucs_arbiter_elem_t, list); - ucs_assert(prev_group != NULL); - ucs_assert(next_group != NULL); - ucs_assert(prev_group->list.next == &group_head->list); - ucs_assert(next_group->list.prev == &group_head->list); + + /* Reset group head to allow the group to be moved to another arbiter by + * the dispatch callback. For example, when a DC endpoint is moved from + * waiting-for-DCI arbiter to waiting-for-TX-resources arbiter. + */ + ucs_arbiter_group_head_reset(group_head); group_dispatch_count = 0; - group = group_head->group; - last_elem = group->tail; - next_elem = group_head; + group = group_head->group; + dummy.group = group; + UCS_ARBITER_GROUP_GUARD_CHECK(group); + + for (;;) { + ucs_assert(group_head->group == group); + ucs_assert(dummy.group == group); + ucs_assert(group_dispatch_count < per_group); + + /* reset the dispatched element here because: + * 1. if the element is removed from the arbiter it must be kept in + * initialized state otherwise push will fail + * 2. we can't reset the element after calling the callback because + * the callback could release the element. + */ + ucs_arbiter_elem_init(group_head); + ucs_assert(!ucs_arbiter_group_head_is_scheduled(group_head)); - do { - elem = next_elem; - next_elem = elem->next; - /* zero pointer to next elem here because: - * - user callback may free() the element - * - push_elem() will fail if next is not NULL - * and elem is reused later. For example in - * rc/ud transports control. 
+ /* replace group head by a dummy element, to allow scheduling more + * elements on this group from the dispatch callback. */ - elem->next = NULL; - elem_list_next = elem->list.next; - elem->list.next = NULL; - - ucs_assert(elem->group == group); - ucs_trace_poll("dispatching arbiter element %p", elem); - UCS_ARBITER_GUARD_ENTER(arbiter); - result = cb(arbiter, elem, cb_arg); - UCS_ARBITER_GUARD_EXIT(arbiter); - ucs_trace_poll("dispatch result %d", result); + ucs_arbiter_group_head_replace(group, group_head, &dummy); + + /* dispatch the element */ + ucs_trace_poll("dispatching arbiter element %p", group_head); + UCS_ARBITER_GROUP_GUARD_ENTER(group); + result = cb(arbiter, group, group_head, cb_arg); + UCS_ARBITER_GROUP_GUARD_EXIT(group); + ucs_trace_poll("dispatch result: %d", result); ++group_dispatch_count; - if (result == UCS_ARBITER_CB_RESULT_REMOVE_ELEM) { - if (elem == last_elem) { - /* Only element */ - group->tail = NULL; /* Group is empty now */ - if (group_head == prev_group) { - next_group = NULL; /* No more groups */ + /* recursive push to head (during dispatch) is not allowed */ + ucs_assert(group->tail->next == &dummy); + + /* element is not removed */ + if (ucs_unlikely(result != UCS_ARBITER_CB_RESULT_REMOVE_ELEM)) { + /* restore group pointer */ + ucs_arbiter_elem_set_scheduled(group_head, group); + + /* the head should not move, since dummy replaces it */ + ucs_assert(!ucs_arbiter_group_head_is_scheduled(group_head)); + + /* replace dummy element by group_head */ + ucs_arbiter_group_head_replace(group, &dummy, group_head); + + if (result == UCS_ARBITER_CB_RESULT_DESCHED_GROUP) { + /* take over a recursively scheduled group */ + if (ucs_unlikely(ucs_arbiter_group_head_is_scheduled(&dummy))) { + ucs_list_replace(&dummy.list, &group_head->list); + UCS_ARBITER_GROUP_ARBITER_SET(group, dummy.group->arbiter); + ucs_arbiter_group_head_reset(&dummy); } else { - /* Remove the group */ - prev_group->list.next = &next_group->list; - next_group->list.prev = 
&prev_group->list; + UCS_ARBITER_GROUP_ARBITER_SET(group, NULL); } } else { - /* Not only element */ - ucs_assert(elem == last_elem->next); /* first element should be removed */ - if (group_head == prev_group) { - next_group = next_elem; /* No more groups, point arbiter - to next element in this group */ - ucs_list_head_init(&next_elem->list); + /* remove a recursively scheduled group, give priority + * to the original order */ + ucs_arbiter_remove_and_reset_if_scheduled(&dummy); + + if (result == UCS_ARBITER_CB_RESULT_NEXT_GROUP) { + /* add to arbiter tail */ + ucs_list_add_tail(&arbiter->list, &group_head->list); + } else if (result == UCS_ARBITER_CB_RESULT_RESCHED_GROUP) { + /* add to resched list */ + ucs_list_add_tail(&resched_list, &group_head->list); + } else if (result == UCS_ARBITER_CB_RESULT_STOP) { + /* exit the outmost loop and make sure that next dispatch() + * will continue from the current group */ + ucs_list_add_head(&arbiter->list, &group_head->list); + goto out; } else { - /* Insert the next element to the arbiter list */ - ucs_list_insert_replace(&prev_group->list, - &next_group->list, - &next_elem->list); + ucs_bug("unexpected return value from arbiter callback"); } - last_elem->next = next_elem; /* Tail points to new head */ } - } else if (result == UCS_ARBITER_CB_RESULT_NEXT_GROUP) { - elem->next = next_elem; - /* avoid infinite loop */ - elem->list.next = elem_list_next; + break; - } else if ((result == UCS_ARBITER_CB_RESULT_DESCHED_GROUP) || - (result == UCS_ARBITER_CB_RESULT_RESCHED_GROUP)) { - elem->next = next_elem; - if (group_head == prev_group) { - next_group = NULL; /* No more groups */ - } else { - prev_group->list.next = &next_group->list; - next_group->list.prev = &prev_group->list; - } - if (result == UCS_ARBITER_CB_RESULT_RESCHED_GROUP) { - ucs_list_add_tail(&resched_groups, &elem->list); - } + } + + /* last element removed */ + if (dummy.next == &dummy) { + group->tail = NULL; /* group is empty now */ + group_head = NULL; /* for 
debugging */ + ucs_arbiter_remove_and_reset_if_scheduled(&dummy); + UCS_ARBITER_GROUP_ARBITER_SET(group, NULL); break; - } else if (result == UCS_ARBITER_CB_RESULT_STOP) { - elem->next = next_elem; - elem->list.next = elem_list_next; - /* make sure that next dispatch() will continue - * from the current group */ - arbiter->current = group_head; - goto out; - } else { - elem->next = next_elem; - elem->list.next = elem_list_next; - ucs_bug("unexpected return value from arbiter callback"); } - } while ((elem != last_elem) && (group_dispatch_count < per_group)); - } while (next_group != NULL); - arbiter->current = NULL; + + /* non-last element removed */ + group_head = dummy.next; /* Update group head */ + group->tail->next = group_head; /* Tail points to new head */ + + if (ucs_unlikely(ucs_arbiter_group_head_is_scheduled(&dummy))) { + /* take over a recursively scheduled group */ + ucs_list_replace(&dummy.list, &group_head->list); + ucs_arbiter_group_head_reset(&dummy); + /* the group is already scheduled, continue to next group */ + break; + } else if (group_dispatch_count >= per_group) { + /* add to arbiter tail and continue to next group */ + ucs_list_add_tail(&arbiter->list, &group_head->list); + break; + } + + /* continue with new group head */ + ucs_arbiter_group_head_reset(group_head); + } + } while (!ucs_list_is_empty(&arbiter->list)); + out: - ucs_list_for_each_safe(elem, next_elem, &resched_groups, list) { - ucs_list_del(&elem->list); - elem->list.next = NULL; - ucs_trace_poll("reschedule group %p", elem->group); - ucs_arbiter_group_schedule_nonempty(arbiter, elem->group); - } + ucs_list_splice_tail(&arbiter->list, &resched_list); } void ucs_arbiter_dump(ucs_arbiter_t *arbiter, FILE *stream) { - ucs_arbiter_elem_t *first_group, *group_head, *elem; + static const int max_groups = 100; + ucs_arbiter_elem_t *group_head, *elem; + int count; fprintf(stream, "-------\n"); - if (arbiter->current == NULL) { + if (ucs_list_is_empty(&arbiter->list)) { fprintf(stream, 
"(empty)\n"); goto out; } - first_group = arbiter->current; - group_head = first_group; - do { + count = 0; + ucs_list_for_each(group_head, &arbiter->list, list) { elem = group_head; - if (group_head == first_group) { + if (ucs_list_head(&arbiter->list, ucs_arbiter_elem_t, list) == group_head) { fprintf(stream, "=> "); } else { fprintf(stream, " * "); @@ -391,8 +436,13 @@ void ucs_arbiter_dump(ucs_arbiter_t *arbiter, FILE *stream) elem = elem->next; } while (elem != group_head); fprintf(stream, "\n"); - group_head = ucs_list_next(&group_head->list, ucs_arbiter_elem_t, list); - } while (group_head != first_group); + ++count; + if (count > max_groups) { + fprintf(stream, "more than %d groups - not printing any more\n", + max_groups); + break; + } + } out: fprintf(stream, "-------\n"); diff --git a/src/ucs/datastruct/arbiter.h b/src/ucs/datastruct/arbiter.h index 2b469446bda..9bd48be084b 100644 --- a/src/ucs/datastruct/arbiter.h +++ b/src/ucs/datastruct/arbiter.h @@ -21,6 +21,11 @@ * - "Element" - a single work element. * - "Group" - queue of work elements which would be dispatched in-order * + * The arbiter contains a double-linked list of the group head elements. The + * next group head to dispatch is the first entry in the list. Whenever a group + * is rescheduled it's moved to the tail of the list. At any point a group head + * can be removed from the "middle" of the list. + * * The groups and elements are arranged like this: * - every arbitrated element points to the group (head). 
* - first element in the group points to previous and next group (list) @@ -36,18 +41,21 @@ * * * Arbiter: - * +=========+ - * | current +-----------------------+ - * +=========+ | - * | - * Elements: | - * | - * +---------------------------------]----------------------------------+ - * | V | - * | +------------+ +------------+ +------------+<--+ - * +-->| list |<-------->| list |<-------->| list | - * +------------+ +------------+ +------------+<--+ - * +->| next +---+ +->| next +---+ + next +---+ + * +=============+ + * | list + + * +======+======+ + * | next | prev | + * +==|===+===|==+ + * | +----------------------------------------------+ + * | | + * +------+ | + * | | + * | | + * Elements: V V + * +------------+ +------------+ +------------+ + * | list |<-------->| list |<-------->| list | + * +------------+ +------------+ +------------+ + * +->| next +---+ +->| next +---+ + next + * | +------------+ | | +------------+ | +------------+ * | | group | | | | group | | | group | * | +------------+ | | +------------+ | +--------+---+ @@ -93,22 +101,31 @@ typedef enum { will start from the group that returned STOP */ } ucs_arbiter_cb_result_t; -#if ENABLE_ASSERT -#define UCS_ARBITER_GUARD int guard -#define UCS_ARBITER_GUARD_INIT(_arbiter) (_arbiter)->guard = 0 -#define UCS_ARBITER_GUARD_ENTER(_arbiter) (_arbiter)->guard++ -#define UCS_ARBITER_GUARD_EXIT(_arbiter) (_arbiter)->guard-- -#define UCS_ARBITER_GUARD_CHECK(_arbiter) \ - ucs_assertv((_arbiter)->guard == 0, \ - "scheduling group from the arbiter callback") +#if UCS_ENABLE_ASSERT +#define UCS_ARBITER_GROUP_GUARD_DEFINE int guard +#define UCS_ARBITER_GROUP_GUARD_INIT(_group) (_group)->guard = 0 +#define UCS_ARBITER_GROUP_GUARD_ENTER(_group) (_group)->guard++ +#define UCS_ARBITER_GROUP_GUARD_EXIT(_group) (_group)->guard-- +#define UCS_ARBITER_GROUP_GUARD_CHECK(_group) \ + ucs_assertv((_group)->guard == 0, \ + "scheduling arbiter group %p while it's being dispatched", _group) +#define 
UCS_ARBITER_GROUP_ARBITER_DEFINE ucs_arbiter_t *arbiter +#define UCS_ARBITER_GROUP_ARBITER_SET(_group, _arbiter) \ + (_group)->arbiter = (_arbiter) #else -#define UCS_ARBITER_GUARD -#define UCS_ARBITER_GUARD_INIT(_arbiter) -#define UCS_ARBITER_GUARD_ENTER(_arbiter) -#define UCS_ARBITER_GUARD_EXIT(_arbiter) -#define UCS_ARBITER_GUARD_CHECK(_arbiter) +#define UCS_ARBITER_GROUP_GUARD_DEFINE +#define UCS_ARBITER_GROUP_GUARD_INIT(_group) +#define UCS_ARBITER_GROUP_GUARD_ENTER(_group) +#define UCS_ARBITER_GROUP_GUARD_EXIT(_group) +#define UCS_ARBITER_GROUP_GUARD_CHECK(_group) +#define UCS_ARBITER_GROUP_ARBITER_DEFINE +#define UCS_ARBITER_GROUP_ARBITER_SET(_group, _arbiter) #endif +#define UCS_ARBITER_GROUP_ARBITER_CHECK(_group, _arbiter) \ + ucs_assertv((_group)->arbiter == (_arbiter), \ + "%p == %p", (_group)->arbiter, _group) + /** * Arbiter callback function. @@ -120,6 +137,7 @@ typedef enum { * @return According to @ref ucs_arbiter_cb_result_t. */ typedef ucs_arbiter_cb_result_t (*ucs_arbiter_callback_t)(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg); @@ -128,8 +146,7 @@ typedef ucs_arbiter_cb_result_t (*ucs_arbiter_callback_t)(ucs_arbiter_t *arbiter * Top-level arbiter. */ struct ucs_arbiter { - ucs_arbiter_elem_t *current; - UCS_ARBITER_GUARD; + ucs_list_link_t list; }; @@ -138,6 +155,8 @@ struct ucs_arbiter { */ struct ucs_arbiter_group { ucs_arbiter_elem_t *tail; + UCS_ARBITER_GROUP_GUARD_DEFINE; + UCS_ARBITER_GROUP_ARBITER_DEFINE; }; @@ -177,10 +196,16 @@ void ucs_arbiter_group_cleanup(ucs_arbiter_group_t *group); */ static inline void ucs_arbiter_elem_init(ucs_arbiter_elem_t *elem) { - elem->next = NULL; + elem->group = NULL; } +/** + * Check if a group is scheduled on an arbiter. 
+ */ +int ucs_arbiter_group_is_scheduled(ucs_arbiter_group_t *group); + + /** * Add a new work element to a group - internal function */ @@ -191,8 +216,7 @@ void ucs_arbiter_group_push_elem_always(ucs_arbiter_group_t *group, /** * Add a new work element to the head of a group - internal function */ -void ucs_arbiter_group_push_head_elem_always(ucs_arbiter_t *arbiter, - ucs_arbiter_group_t *group, +void ucs_arbiter_group_push_head_elem_always(ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem); @@ -209,6 +233,12 @@ void ucs_arbiter_group_purge(ucs_arbiter_t *arbiter, ucs_arbiter_group_t *group, ucs_arbiter_callback_t cb, void *cb_arg); +/** + * @return Number of elements in the group + */ +size_t ucs_arbiter_group_num_elems(ucs_arbiter_group_t *group); + + void ucs_arbiter_dump(ucs_arbiter_t *arbiter, FILE *stream); @@ -218,13 +248,13 @@ void ucs_arbiter_group_schedule_nonempty(ucs_arbiter_t *arbiter, /* Internal function */ -void ucs_arbiter_dispatch_nonempty(ucs_arbiter_t *arbiter, unsigned per_group, - ucs_arbiter_callback_t cb, void *cb_arg); +void ucs_arbiter_group_desched_nonempty(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group); /* Internal function */ -void ucs_arbiter_group_head_desched(ucs_arbiter_t *arbiter, - ucs_arbiter_elem_t *head); +void ucs_arbiter_dispatch_nonempty(ucs_arbiter_t *arbiter, unsigned per_group, + ucs_arbiter_callback_t cb, void *cb_arg); /** @@ -234,7 +264,17 @@ void ucs_arbiter_group_head_desched(ucs_arbiter_t *arbiter, */ static inline int ucs_arbiter_is_empty(ucs_arbiter_t *arbiter) { - return arbiter->current == NULL; + return ucs_list_is_empty(&arbiter->list); +} + + +/** + * @return the last group element. + */ +static inline ucs_arbiter_elem_t* +ucs_arbiter_group_tail(ucs_arbiter_group_t *group) +{ + return group->tail; } @@ -270,16 +310,11 @@ static inline void ucs_arbiter_group_schedule(ucs_arbiter_t *arbiter, * @param [in] arbiter Arbiter object that group on. * @param [in] group Group to deschedule. 
*/ - static inline void ucs_arbiter_group_desched(ucs_arbiter_t *arbiter, ucs_arbiter_group_t *group) { if (ucs_unlikely(!ucs_arbiter_group_is_empty(group))) { - ucs_arbiter_elem_t *head; - - head = group->tail->next; - ucs_arbiter_group_head_desched(arbiter, head); - head->list.next = NULL; + ucs_arbiter_group_desched_nonempty(arbiter, group); } } @@ -287,11 +322,10 @@ static inline void ucs_arbiter_group_desched(ucs_arbiter_t *arbiter, /** * @return Whether the element is queued in an arbiter group. * (an element can't be queued more than once) - * */ static inline int ucs_arbiter_elem_is_scheduled(ucs_arbiter_elem_t *elem) { - return elem->next != NULL; + return elem->group != NULL; } @@ -316,22 +350,18 @@ ucs_arbiter_group_push_elem(ucs_arbiter_group_t *group, /** * Add a new work element to the head of a group if it is not already there * - * @param [in] arbiter Arbiter object the group is on (since we modify the head - * element of a potentially scheduled group). If the group - * is not scheduled, arbiter may be NULL. * @param [in] group Group to add the element to. * @param [in] elem Work element to add. */ static inline void -ucs_arbiter_group_push_head_elem(ucs_arbiter_t *arbiter, - ucs_arbiter_group_t *group, +ucs_arbiter_group_push_head_elem(ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem) { if (ucs_arbiter_elem_is_scheduled(elem)) { return; } - ucs_arbiter_group_push_head_elem_always(arbiter, group, elem); + ucs_arbiter_group_push_head_elem_always(group, elem); } @@ -358,31 +388,13 @@ ucs_arbiter_dispatch(ucs_arbiter_t *arbiter, unsigned per_group, } -/** - * @return Group the element belongs to. 
- */ -static inline ucs_arbiter_group_t* ucs_arbiter_elem_group(ucs_arbiter_elem_t *elem) -{ - return elem->group; -} - - -/** - * @return true if element is the last one in the group - */ -static inline int -ucs_arbiter_elem_is_last(ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem) -{ - return group->tail == elem; -} - /** * @return true if element is the only one in the group */ static inline int -ucs_arbiter_elem_is_only(ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem) +ucs_arbiter_elem_is_only(ucs_arbiter_elem_t *elem) { - return ucs_arbiter_elem_is_last(group, elem) && (elem->next == elem); + return elem->next == elem; } #endif diff --git a/src/ucs/datastruct/callbackq.c b/src/ucs/datastruct/callbackq.c index 60ce8f81437..7a9c97d0fc1 100644 --- a/src/ucs/datastruct/callbackq.c +++ b/src/ucs/datastruct/callbackq.c @@ -5,6 +5,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include #include @@ -22,26 +26,25 @@ typedef struct ucs_callbackq_priv { - ucs_spinlock_t lock; /**< Protects adding / removing */ + ucs_recursive_spinlock_t lock; /**< Protects adding / removing */ - ucs_callbackq_elem_t *slow_elems; /**< Array of slow-path elements */ - unsigned num_slow_elems; /**< Number of slow-path elements */ - unsigned max_slow_elems; /**< Maximal number of slow-path elements */ - unsigned slow_idx; /**< Iterator over slow-path elements */ - int slow_proxy_id; /**< ID of slow-path proxy in fast-path array. - keep track while this moves around. */ + ucs_callbackq_elem_t *slow_elems; /**< Array of slow-path elements */ + unsigned num_slow_elems; /**< Number of slow-path elements */ + unsigned max_slow_elems; /**< Maximal number of slow-path elements */ + int slow_proxy_id; /**< ID of slow-path proxy in fast-path array. + keep track while this moves around. 
*/ - uint64_t fast_remove_mask; /**< Mask of which fast-path elements - should be removed */ - unsigned num_fast_elems; /**< Number of fast-path elements */ + uint64_t fast_remove_mask; /**< Mask of which fast-path elements + should be removed */ + unsigned num_fast_elems; /**< Number of fast-path elements */ /* Lookup table for callback IDs. This allows moving callbacks around in * the arrays, while the user can always use a single ID to remove the * callback in O(1). */ - int free_idx_id; /**< Index of first free item in the list */ - int num_idxs; /**< Size of idxs array */ - unsigned *idxs; /**< ID-to-index lookup */ + int free_idx_id; /**< Index of first free item in the list */ + int num_idxs; /**< Size of idxs array */ + unsigned *idxs; /**< ID-to-index lookup */ } ucs_callbackq_priv_t; @@ -56,12 +59,12 @@ static inline ucs_callbackq_priv_t* ucs_callbackq_priv(ucs_callbackq_t *cbq) static void ucs_callbackq_enter(ucs_callbackq_t *cbq) { - ucs_spin_lock(&ucs_callbackq_priv(cbq)->lock); + ucs_recursive_spin_lock(&ucs_callbackq_priv(cbq)->lock); } static void ucs_callbackq_leave(ucs_callbackq_t *cbq) { - ucs_spin_unlock(&ucs_callbackq_priv(cbq)->lock); + ucs_recursive_spin_unlock(&ucs_callbackq_priv(cbq)->lock); } static void ucs_callbackq_elem_reset(ucs_callbackq_t *cbq, @@ -165,38 +168,6 @@ static unsigned ucs_callbackq_get_fast_idx(ucs_callbackq_t *cbq) return idx; } -static void ucs_callbackq_remove_common(ucs_callbackq_t *cbq, - ucs_callbackq_elem_t *elems, - unsigned idx, unsigned last_idx, - unsigned idx_slow_flag, - uint64_t *remove_mask) -{ - ucs_callbackq_priv_t *priv = ucs_callbackq_priv(cbq); - int id; - - ucs_trace_func("cbq=%p idx=%u last_idx=%u slow_flag=0x%x", cbq, idx, - last_idx, idx_slow_flag); - - ucs_assert(idx <= last_idx); - - /* replace removed with last */ - elems[idx] = elems[last_idx]; - ucs_callbackq_elem_reset(cbq, &elems[last_idx]); - - if (*remove_mask & UCS_BIT(last_idx)) { - /* replaced by marked-for-removal element, still need 
to remove 'idx' */ - *remove_mask &= ~UCS_BIT(last_idx); - } else { - /* replaced by a live element, remove from the mask and update 'idxs' */ - *remove_mask &= ~UCS_BIT(idx); - if (last_idx != idx) { - id = elems[idx].id; - ucs_assert(id != UCS_CALLBACKQ_ID_NULL); - priv->idxs[id] = idx | idx_slow_flag; - } - } -} - static int ucs_callbackq_add_fast(ucs_callbackq_t *cbq, ucs_callback_t cb, void *arg, unsigned flags) { @@ -220,15 +191,33 @@ static int ucs_callbackq_add_fast(ucs_callbackq_t *cbq, ucs_callback_t cb, /* should be called from dispatch thread only */ static void ucs_callbackq_remove_fast(ucs_callbackq_t *cbq, unsigned idx) { - ucs_callbackq_priv_t *priv = ucs_callbackq_priv(cbq); + ucs_callbackq_priv_t *priv = ucs_callbackq_priv(cbq); + ucs_callbackq_elem_t *dst_elem = &cbq->fast_elems[idx]; unsigned last_idx; + int id; ucs_trace_func("cbq=%p idx=%u", cbq, idx); ucs_assert(priv->num_fast_elems > 0); last_idx = --priv->num_fast_elems; - ucs_callbackq_remove_common(cbq, cbq->fast_elems, idx, last_idx, 0, - &priv->fast_remove_mask); + + /* replace removed with last */ + *dst_elem = cbq->fast_elems[last_idx]; + ucs_callbackq_elem_reset(cbq, &cbq->fast_elems[last_idx]); + + if (priv->fast_remove_mask & UCS_BIT(last_idx)) { + /* replaced by marked-for-removal element, still need to remove 'idx' */ + ucs_assert(priv->fast_remove_mask & UCS_BIT(idx)); + priv->fast_remove_mask &= ~UCS_BIT(last_idx); + } else { + /* replaced by a live element, remove from the mask and update 'idxs' */ + priv->fast_remove_mask &= ~UCS_BIT(idx); + if (last_idx != idx) { + id = dst_elem->id; + ucs_assert(id != UCS_CALLBACKQ_ID_NULL); + priv->idxs[id] = idx; + } + } } /* should be called from dispatch thread only */ @@ -333,24 +322,41 @@ static int ucs_callbackq_add_slow(ucs_callbackq_t *cbq, ucs_callback_t cb, static void ucs_callbackq_remove_slow(ucs_callbackq_t *cbq, unsigned idx) { ucs_callbackq_priv_t *priv = ucs_callbackq_priv(cbq); - unsigned last_idx; - uint64_t dummy = 0; 
ucs_trace_func("cbq=%p idx=%u", cbq, idx); - /* When the slow-path proxy callback sees there are no more elements, it - * will disable itself. + /* Mark for removal by ucs_callbackq_purge_slow() */ + ucs_callbackq_elem_reset(cbq, &priv->slow_elems[idx]); +} + +static void ucs_callbackq_purge_slow(ucs_callbackq_t *cbq) +{ + ucs_callbackq_priv_t *priv = ucs_callbackq_priv(cbq); + ucs_callbackq_elem_t *src_elem; + unsigned src_idx, dst_idx; + + ucs_trace_func("cbq=%p", cbq); + + /* + * Copy valid elements from src_idx to dst_idx, essentially rebuilding the + * array of elements in-place, keeping only the valid ones. + * As an optimization, if no elements are actually removed, then src_idx will + * always be equal to dst_idx, so nothing will be actually copied/moved. */ - ucs_assert(priv->num_slow_elems > 0); - last_idx = --priv->num_slow_elems; - ucs_callbackq_remove_common(cbq, priv->slow_elems, idx, last_idx, - UCS_CALLBACKQ_IDX_FLAG_SLOW, &dummy); - - /* Make the slow-path iterator go over the element we moved from the end of - * the array, otherwise it would be skipped. 
*/ - if (idx <= priv->slow_idx) { - priv->slow_idx = idx; + dst_idx = 0; + for (src_idx = 0; src_idx < priv->num_slow_elems; ++src_idx) { + src_elem = &priv->slow_elems[src_idx]; + if (src_elem->id != UCS_CALLBACKQ_ID_NULL) { + ucs_assert(dst_idx <= src_idx); + if (dst_idx != src_idx) { + priv->idxs[src_elem->id] = dst_idx | UCS_CALLBACKQ_IDX_FLAG_SLOW; + priv->slow_elems[dst_idx] = *src_elem; + } + ++dst_idx; + } } + + priv->num_slow_elems = dst_idx; } static unsigned ucs_callbackq_slow_proxy(void *arg) @@ -368,10 +374,11 @@ static unsigned ucs_callbackq_slow_proxy(void *arg) ucs_callbackq_enter(cbq); /* Execute and update slow-path callbacks */ - while ( (slow_idx = priv->slow_idx) < priv->num_slow_elems ) { + for (slow_idx = 0; slow_idx < priv->num_slow_elems; ++slow_idx) { elem = &priv->slow_elems[slow_idx]; - priv->slow_idx++; /* Increment slow_idx here to give the remove functions - an opportunity to rewind it */ + if (elem->id == UCS_CALLBACKQ_ID_NULL) { + continue; + } tmp_elem = *elem; if (elem->flags & UCS_CALLBACKQ_FLAG_FAST) { @@ -395,9 +402,8 @@ static unsigned ucs_callbackq_slow_proxy(void *arg) ucs_callbackq_enter(cbq); } - priv->slow_idx = 0; - ucs_callbackq_purge_fast(cbq); + ucs_callbackq_purge_slow(cbq); /* Disable this proxy if no more work to do */ if (!priv->fast_remove_mask && (priv->num_slow_elems == 0)) { @@ -418,11 +424,10 @@ ucs_status_t ucs_callbackq_init(ucs_callbackq_t *cbq) ucs_callbackq_elem_reset(cbq, &cbq->fast_elems[idx]); } - ucs_spinlock_init(&priv->lock); + ucs_recursive_spinlock_init(&priv->lock, 0); priv->slow_elems = NULL; priv->num_slow_elems = 0; priv->max_slow_elems = 0; - priv->slow_idx = 0; priv->slow_proxy_id = UCS_CALLBACKQ_ID_NULL; priv->fast_remove_mask = 0; priv->num_fast_elems = 0; @@ -437,6 +442,8 @@ void ucs_callbackq_cleanup(ucs_callbackq_t *cbq) ucs_callbackq_priv_t *priv = ucs_callbackq_priv(cbq); ucs_callbackq_disable_proxy(cbq); + ucs_callbackq_purge_fast(cbq); + ucs_callbackq_purge_slow(cbq); if 
((priv->num_fast_elems) > 0 || (priv->num_slow_elems > 0)) { ucs_warn("%d fast-path and %d slow-path callbacks remain in the queue", diff --git a/src/ucs/datastruct/callbackq.h b/src/ucs/datastruct/callbackq.h index dcddcdeea3f..f9364676df3 100644 --- a/src/ucs/datastruct/callbackq.h +++ b/src/ucs/datastruct/callbackq.h @@ -8,7 +8,7 @@ #ifndef UCS_CALLBACKQ_H #define UCS_CALLBACKQ_H -#include +#include #include #include #include diff --git a/src/ucs/datastruct/conn_match.c b/src/ucs/datastruct/conn_match.c new file mode 100644 index 00000000000..659f9767187 --- /dev/null +++ b/src/ucs/datastruct/conn_match.c @@ -0,0 +1,246 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "conn_match.h" + +#include +#include +#include +#include + + +/** + * Maximal length of address + */ +#define UCS_CONN_MATCH_ADDRESS_STR_MAX 128 + + +struct ucs_conn_match_peer { + ucs_hlist_head_t conn_q[UCS_CONN_MATCH_QUEUE_LAST]; /* Connection queues */ + ucs_conn_sn_t next_conn_sn; /* Sequence number of matching + connections, since wireup messages + used for connection establishment + procedure which were sent on different + connections could be provided + out-of-order */ + size_t address_length; /* Length of the addresses used for the + connection between peers */ + char address[0]; +}; + +static UCS_F_ALWAYS_INLINE khint_t +ucs_conn_match_peer_hash(ucs_conn_match_peer_t *peer) +{ + return ucs_crc32(0, &peer->address, peer->address_length); +} + +static UCS_F_ALWAYS_INLINE int +ucs_conn_match_peer_equal(ucs_conn_match_peer_t *peer1, + ucs_conn_match_peer_t *peer2) +{ + return (peer1->address_length == peer2->address_length) && + !memcmp(&peer1->address, &peer2->address, peer1->address_length); +} + +KHASH_IMPL(ucs_conn_match, ucs_conn_match_peer_t*, char, 0, + ucs_conn_match_peer_hash, ucs_conn_match_peer_equal); + + +const static char 
*ucs_conn_match_queue_title[] = { + [UCS_CONN_MATCH_QUEUE_EXP] = "expected", + [UCS_CONN_MATCH_QUEUE_UNEXP] = "unexpected" +}; + + +void ucs_conn_match_init(ucs_conn_match_ctx_t *conn_match_ctx, + size_t address_length, + const ucs_conn_match_ops_t *ops) +{ + kh_init_inplace(ucs_conn_match, &conn_match_ctx->hash); + conn_match_ctx->address_length = address_length; + conn_match_ctx->ops = *ops; +} + +void ucs_conn_match_cleanup(ucs_conn_match_ctx_t *conn_match_ctx) +{ + char address_str[UCS_CONN_MATCH_ADDRESS_STR_MAX]; + ucs_conn_match_peer_t *peer; + unsigned i; + + kh_foreach_key(&conn_match_ctx->hash, peer, { + for (i = 0; i < UCS_CONN_MATCH_QUEUE_LAST; i++) { + if (!ucs_hlist_is_empty(&peer->conn_q[i])) { + ucs_diag("match_ctx %p: %s queue is not empty for %s address", + conn_match_ctx, + ucs_conn_match_queue_title[i], + conn_match_ctx->ops.address_str(&peer->address, address_str, + UCS_CONN_MATCH_ADDRESS_STR_MAX)); + } + } + + ucs_free(peer); + }) + kh_destroy_inplace(ucs_conn_match, &conn_match_ctx->hash); +} + +static ucs_conn_match_peer_t* +ucs_conn_match_peer_alloc(ucs_conn_match_ctx_t *conn_match_ctx, + const void *address) +{ + char address_str[UCS_CONN_MATCH_ADDRESS_STR_MAX]; + ucs_conn_match_peer_t *peer; + + peer = ucs_calloc(1, sizeof(*peer), "conn match peer"); + if (peer == NULL) { + ucs_fatal("match_ctx %p: failed to allocate memory for %s address", + conn_match_ctx, + conn_match_ctx->ops.address_str(address, address_str, + UCS_CONN_MATCH_ADDRESS_STR_MAX)); + } + + peer->address_length = conn_match_ctx->address_length; + memcpy(&peer->address, address, peer->address_length); + + return peer; +} + +static ucs_conn_match_peer_t* +ucs_conn_match_get_conn(ucs_conn_match_ctx_t *conn_match_ctx, + const void *address) +{ + char address_str[UCS_CONN_MATCH_ADDRESS_STR_MAX]; + ucs_conn_match_peer_t *peer; + khiter_t iter; + int ret; + + peer = ucs_conn_match_peer_alloc(conn_match_ctx, address); + iter = kh_put(ucs_conn_match, &conn_match_ctx->hash, peer, 
&ret); + if (ucs_unlikely(ret == UCS_KH_PUT_FAILED)) { + ucs_free(peer); + ucs_fatal("match_ctx %p: kh_put failed for %s", + conn_match_ctx, + conn_match_ctx->ops.address_str(address, address_str, + UCS_CONN_MATCH_ADDRESS_STR_MAX)); + } + + if (ret == UCS_KH_PUT_KEY_PRESENT) { + ucs_free(peer); + return kh_key(&conn_match_ctx->hash, iter); + } + + /* initialize match list on first use */ + peer->next_conn_sn = 0; + ucs_hlist_head_init(&peer->conn_q[UCS_CONN_MATCH_QUEUE_EXP]); + ucs_hlist_head_init(&peer->conn_q[UCS_CONN_MATCH_QUEUE_UNEXP]); + + return peer; +} + +ucs_conn_sn_t ucs_conn_match_get_next_sn(ucs_conn_match_ctx_t *conn_match_ctx, + const void *address) +{ + ucs_conn_match_peer_t *peer = ucs_conn_match_get_conn(conn_match_ctx, + address); + return peer->next_conn_sn++; +} + +void ucs_conn_match_insert(ucs_conn_match_ctx_t *conn_match_ctx, + const void *address, ucs_conn_sn_t conn_sn, + ucs_conn_match_elem_t *conn_match, + ucs_conn_match_queue_type_t conn_queue_type) +{ + ucs_conn_match_peer_t *peer = ucs_conn_match_get_conn(conn_match_ctx, + address); + ucs_hlist_head_t *head = &peer->conn_q[conn_queue_type]; + char UCS_V_UNUSED address_str[UCS_CONN_MATCH_ADDRESS_STR_MAX]; + + ucs_hlist_add_tail(head, &conn_match->list); + ucs_trace("match_ctx %p: conn_match %p added as %s address %s conn_sn %zu", + conn_match_ctx, conn_match, + ucs_conn_match_queue_title[conn_queue_type], + conn_match_ctx->ops.address_str(address, address_str, + UCS_CONN_MATCH_ADDRESS_STR_MAX), + conn_sn); +} + +ucs_conn_match_elem_t * +ucs_conn_match_retrieve(ucs_conn_match_ctx_t *conn_match_ctx, + const void *address, ucs_conn_sn_t conn_sn, + ucs_conn_match_queue_type_t conn_queue_type) +{ + char UCS_V_UNUSED address_str[UCS_CONN_MATCH_ADDRESS_STR_MAX]; + ucs_conn_match_peer_t *peer; + ucs_conn_match_elem_t *elem; + ucs_hlist_head_t *head; + khiter_t iter; + + peer = ucs_conn_match_peer_alloc(conn_match_ctx, address); + iter = kh_get(ucs_conn_match, &conn_match_ctx->hash, peer); + 
ucs_free(peer); + if (iter == kh_end(&conn_match_ctx->hash)) { + goto notfound; /* no hash entry */ + } + + peer = kh_key(&conn_match_ctx->hash, iter); + head = &peer->conn_q[conn_queue_type]; + + ucs_hlist_for_each(elem, head, list) { + if (conn_match_ctx->ops.get_conn_sn(elem) == conn_sn) { + ucs_hlist_del(head, &elem->list); + ucs_trace("match_ctx %p: matched %s conn_match %p by address %s conn_sn %zu", + conn_match_ctx, ucs_conn_match_queue_title[conn_queue_type], elem, + conn_match_ctx->ops.address_str(address, address_str, + UCS_CONN_MATCH_ADDRESS_STR_MAX), + conn_sn); + return elem; + } + } + +notfound: + ucs_trace("match_ctx %p: %s address %s conn_sn %zu not found", + conn_match_ctx, ucs_conn_match_queue_title[conn_queue_type], + conn_match_ctx->ops.address_str(address, address_str, + UCS_CONN_MATCH_ADDRESS_STR_MAX), + conn_sn); + return NULL; +} + +void ucs_conn_match_remove_elem(ucs_conn_match_ctx_t *conn_match_ctx, + ucs_conn_match_elem_t *elem, + ucs_conn_match_queue_type_t conn_queue_type) +{ + const void *address = conn_match_ctx->ops.get_address(elem); + char UCS_V_UNUSED address_str[UCS_CONN_MATCH_ADDRESS_STR_MAX]; + ucs_conn_match_peer_t *peer; + ucs_hlist_head_t *head; + khiter_t iter; + + peer = ucs_conn_match_peer_alloc(conn_match_ctx, address); + iter = kh_get(ucs_conn_match, &conn_match_ctx->hash, peer); + if (iter == kh_end(&conn_match_ctx->hash)) { + ucs_fatal("match_ctx %p: conn_match %p address %s conn_sn %zu " + "wasn't found in hash", conn_match_ctx, elem, + conn_match_ctx->ops.address_str(&address, address_str, + UCS_CONN_MATCH_ADDRESS_STR_MAX), + conn_match_ctx->ops.get_conn_sn(elem)); + } + + ucs_free(peer); + + peer = kh_key(&conn_match_ctx->hash, iter); + head = &peer->conn_q[conn_queue_type]; + + ucs_hlist_del(head, &elem->list); + ucs_trace("match_ctx %p: remove %s conn_match %p address %s conn_sn %zu)", + conn_match_ctx, ucs_conn_match_queue_title[conn_queue_type], + elem, conn_match_ctx->ops.address_str(&address, address_str, + 
UCS_CONN_MATCH_ADDRESS_STR_MAX), + conn_match_ctx->ops.get_conn_sn(elem)); +} diff --git a/src/ucs/datastruct/conn_match.h b/src/ucs/datastruct/conn_match.h new file mode 100644 index 00000000000..da0a25cfac0 --- /dev/null +++ b/src/ucs/datastruct/conn_match.h @@ -0,0 +1,207 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCS_CONN_MATCH_H_ +#define UCS_CONN_MATCH_H_ + +#include +#include + +#include + + +BEGIN_C_DECLS + + +/** + * Connection sequence number + */ +typedef uint64_t ucs_conn_sn_t; + + +/** + * Connection queue type + */ +typedef enum ucs_conn_match_queue_type { + /* Queue type for connections created by API and not + * connected to remote peer */ + UCS_CONN_MATCH_QUEUE_EXP, + /* Queue type for connections created internally as + * connected to remote peer, but not provided to user yet */ + UCS_CONN_MATCH_QUEUE_UNEXP, + /* Number of queues that are used by connection matching */ + UCS_CONN_MATCH_QUEUE_LAST +} ucs_conn_match_queue_type_t; + + +/** + * Structure to embed in a connection entry to support matching with remote + * peer's connections. + */ +typedef struct ucs_conn_match_elem { + ucs_hlist_link_t list; /* List entry into endpoint + matching structure */ +} ucs_conn_match_elem_t; + + +/** + * Function to get the address of the connection between the peers. + * + * @param [in] elem Pointer to the connection matching element. + * + * @return Pointer to the address of the connection between the peers. + */ +typedef const void* +(*ucs_conn_match_get_address_t)(const ucs_conn_match_elem_t *elem); + + +/** + * Function to get the sequence number of the connection between the peers. + * + * @param [in] elem Pointer to the connection matching element. + * + * @return Sequnce number of the given connection between the peers. 
+ */ +typedef ucs_conn_sn_t +(*ucs_conn_match_get_conn_sn_t)(const ucs_conn_match_elem_t *elem); + + +/** + * Function to get string representation of the connection address. + * + * @param [in] address Pointer to the connection address. + * @param [out] str A string filled with the address. + * @param [in] max_size Size of a string (considering '\0'-terminated symbol). + * + * @return A resulted string filled with the address. + */ +typedef const char* +(*ucs_conn_match_address_str_t)(const void *address, + char *str, size_t max_size); + + +/** + * Connection matching operations + */ +typedef struct ucs_conn_match_ops { + ucs_conn_match_get_address_t get_address; + ucs_conn_match_get_conn_sn_t get_conn_sn; + ucs_conn_match_address_str_t address_str; +} ucs_conn_match_ops_t; + + +/** + * Connection peer entry - allows *ordered* matching of connections between + * a pair of connected peers. The expected/unexpected lists are *not* circular + */ +typedef struct ucs_conn_match_peer ucs_conn_match_peer_t; + + +KHASH_TYPE(ucs_conn_match, ucs_conn_match_peer_t*, char) + + +/** + * Context for matching connections + */ +typedef struct ucs_conn_match_ctx { + khash_t(ucs_conn_match) hash; /* Hash of matched connections */ + size_t address_length; /* Length of the addresses used for the + connection between peers */ + ucs_conn_match_ops_t ops; /* User's connection matching operations */ +} ucs_conn_match_ctx_t; + + +/** + * Initialize the connection matching context. + * + * @param [in] conn_match_ctx Pointer to the connection matching context. + * @param [in] address_length Length of the addresses used for the connection + * between peers. + * @param [in] ops Pointer to the user-defined connection matching + * operations. + */ +void ucs_conn_match_init(ucs_conn_match_ctx_t *conn_match_ctx, + size_t address_length, + const ucs_conn_match_ops_t *ops); + + +/** + * Cleanup the connection matching context. 
+ * + * @param [in] conn_match_ctx Pointer to the connection matching context. + */ +void ucs_conn_match_cleanup(ucs_conn_match_ctx_t *conn_match_ctx); + + +/** + * Get the next value of the connection sequence number between two peers. + * + * @param [in] conn_match_ctx Pointer to the connection matching context. + * @param [in] address Pointer to the address of the connection + * between the peers. + * + * @return The next value of the connection sequence number, this value is unique + * for the given connection. + */ +ucs_conn_sn_t ucs_conn_match_get_next_sn(ucs_conn_match_ctx_t *conn_match_ctx, + const void *address); + + +/** + * Insert the connection matching entry to the context. + * + * @param [in] conn_match_ctx Pointer to the connection matching context. + * @param [in] address Pointer to the address of the connection + * between the peers. + * @param [in] conn_sn Connection sequence number of the connection. + * @param [in] elem Pointer to the connection matching structure. + * @param [in] conn_queue_type Connection queue which should be used to insert + * the connection matching element to. + */ +void ucs_conn_match_insert(ucs_conn_match_ctx_t *conn_match_ctx, + const void *address, ucs_conn_sn_t conn_sn, + ucs_conn_match_elem_t *elem, + ucs_conn_match_queue_type_t conn_queue_type); + + +/** + * Retrieve the connection matching entry from the context. + * + * @param [in] conn_match_ctx Pointer to the connection matching context. + * @param [in] address Pointer to the address of the connection + * between the peers. + * @param [in] conn_sn Connection sequence number of the connection. + * @param [in] conn_queue_type Connection queue which should be used to retrieve + * the connection matching element from. + * + * @return Pointer to the found connection matching entry. 
+ */ +ucs_conn_match_elem_t * +ucs_conn_match_retrieve(ucs_conn_match_ctx_t *conn_match_ctx, + const void *address, ucs_conn_sn_t conn_sn, + ucs_conn_match_queue_type_t conn_queue_type); + + +/** + * Remove the connection matching entry from the context. + * + * @param [in] conn_match_ctx Pointer to the connection matching context. + * @param [in] elem Pointer to the connection matching element. + * @param [in] conn_queue_type Connection queue which should be used to remove + * the connection matching element from. + * + * @note Connection @conn_match matching entry must be present in the queue + * pointed by @conn_queue_type. + */ +void ucs_conn_match_remove_elem(ucs_conn_match_ctx_t *conn_match_ctx, + ucs_conn_match_elem_t *elem, + ucs_conn_match_queue_type_t conn_queue_type); + + +END_C_DECLS + + +#endif diff --git a/src/ucs/datastruct/frag_list.c b/src/ucs/datastruct/frag_list.c index 719f2125df5..9590ca54e73 100644 --- a/src/ucs/datastruct/frag_list.c +++ b/src/ucs/datastruct/frag_list.c @@ -5,9 +5,13 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "frag_list.h" -#if ENABLE_STATS +#ifdef ENABLE_STATS static ucs_stats_class_t ucs_frag_list_stats_class = { .name = "frag_list", @@ -37,7 +41,7 @@ ucs_status_t ucs_frag_list_init(ucs_frag_list_sn_t initial_sn, ucs_frag_list_t * ucs_queue_head_init(&frag_list->list); ucs_queue_head_init(&frag_list->ready_list); -#if ENABLE_STATS +#ifdef ENABLE_STATS frag_list->prev_sn = initial_sn; #endif status = UCS_STATS_NODE_ALLOC(&frag_list->stats, &ucs_frag_list_stats_class, diff --git a/src/ucs/datastruct/frag_list.h b/src/ucs/datastruct/frag_list.h index 55169923e06..97fcf95c751 100644 --- a/src/ucs/datastruct/frag_list.h +++ b/src/ucs/datastruct/frag_list.h @@ -90,7 +90,7 @@ typedef struct ucs_frag_list { unsigned elem_count; /* total number of list elements */ unsigned list_count; /* number of independent lists */ int max_holes; /* do not allow insertion if ucs_list_count >= max_holes */ - UCS_STATS_NODE_DECLARE(stats); + UCS_STATS_NODE_DECLARE(stats) #ifdef ENABLE_STATS ucs_frag_list_sn_t prev_sn; /* needed to detect busrts */ #endif @@ -173,7 +173,7 @@ static inline ucs_frag_list_ooo_type_t ucs_frag_list_insert(ucs_frag_list_t *head, ucs_frag_list_elem_t *elem, ucs_frag_list_sn_t sn) { -#if ENABLE_STATS +#ifdef ENABLE_STATS ucs_frag_list_ooo_type_t ret; if (UCS_FRAG_LIST_SN_CMP(sn, >, head->head_sn)) { @@ -188,13 +188,14 @@ ucs_frag_list_insert(ucs_frag_list_t *head, ucs_frag_list_elem_t *elem, } #endif /* in order arrival on empty list - inc sn and do nothing */ + /* cppcheck-suppress syntaxError */ if (ucs_likely(UCS_FRAG_LIST_SN_CMP(sn, ==, head->head_sn + 1) && (head->elem_count == 0))) { head->head_sn = sn; return UCS_FRAG_LIST_INSERT_FAST; } /* return either dup or slow */ -#if ENABLE_STATS +#ifdef ENABLE_STATS ret = ucs_frag_list_insert_slow(head, elem, sn); UCS_STATS_UPDATE_COUNTER(head->stats, UCS_FRAG_LIST_STAT_GAP_OUT, ret != UCS_FRAG_LIST_INSERT_DUP ? 
head->list_count : 0); diff --git a/src/ucs/datastruct/hlist.h b/src/ucs/datastruct/hlist.h new file mode 100644 index 00000000000..1f7c038b735 --- /dev/null +++ b/src/ucs/datastruct/hlist.h @@ -0,0 +1,251 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCS_HLIST_H_ +#define UCS_HLIST_H_ + +#include "list.h" + +#include + + +BEGIN_C_DECLS + +/** + * Detached-head circular list: unlike the basic double-linked list, the head + * element is separate from the list, and it points to first element, or NULL if + * the list is empty. + * It reduces the size of list head from 2 pointers to 1 pointer, and allows + * storing the head element inside a reallocating hash table, but adds some + * overhead to basic list operations. + */ + + +/*** + * List element of a detached-head list. + */ +typedef struct ucs_hlist_link { + ucs_list_link_t list; +} ucs_hlist_link_t; + + +/** + * Head of a circular detached-head list. + */ +typedef struct ucs_hlist_head { + ucs_hlist_link_t *ptr; +} ucs_hlist_head_t; + + +/** + * Initialize a detached-head list. + * + * @param [in] head List head to initialize. + */ +static UCS_F_ALWAYS_INLINE void +ucs_hlist_head_init(ucs_hlist_head_t *head) +{ + head->ptr = NULL; +} + + +/** + * Check if a detached-head list is empty. + * + * @param [in] head List head to check. + * + * @return Whether the list is empty. + */ +static UCS_F_ALWAYS_INLINE int +ucs_hlist_is_empty(const ucs_hlist_head_t *head) +{ + return head->ptr == NULL; +} + + +/** + * Common function to add elements to the list head or tail. + * + * @param [in] head List head to add to. + * @param [in] elem Element to add. + * @param [in] set_head_to_elem Whether to set the list head to the newly added + * element. 
+ */ +static UCS_F_ALWAYS_INLINE void +ucs_hlist_add_common(ucs_hlist_head_t *head, ucs_hlist_link_t *elem, + int set_head_to_elem) +{ + if (head->ptr == NULL) { + head->ptr = elem; + ucs_list_head_init(&elem->list); + } else { + ucs_list_insert_before(&head->ptr->list, &elem->list); + if (set_head_to_elem) { + head->ptr = elem; + } + } +} + + +/** + * Add element to the beginning of a detached-head list. + * + * @param [in] head List head to add to. + * @param [in] elem Element to add. + */ +static UCS_F_ALWAYS_INLINE void +ucs_hlist_add_head(ucs_hlist_head_t *head, ucs_hlist_link_t *elem) +{ + ucs_hlist_add_common(head, elem, 1); +} + + +/** + * Add element to the end of a detached-head list. + * + * @param [in] head List head to add to. + * @param [in] elem Element to add. + */ +static UCS_F_ALWAYS_INLINE void +ucs_hlist_add_tail(ucs_hlist_head_t *head, ucs_hlist_link_t *elem) +{ + ucs_hlist_add_common(head, elem, 0); +} + + +/** + * Remove an element from a detached-head list. + * + * @param [in] head List head to remove from. + * @param [in] elem Element to remove. + * + * @note If the element is not present in the list, this function has undefined + * behavior. + */ +static UCS_F_ALWAYS_INLINE void +ucs_hlist_del(ucs_hlist_head_t *head, ucs_hlist_link_t *elem) +{ + if (ucs_list_is_empty(&elem->list)) { + /* Remove elem if it's not the only one in the list. + * We assume here that head->ptr == elem, but cannot assert() to avoid + * dependency of assert.h */ + head->ptr = NULL; + } else { + if (head->ptr == elem) { + /* removing head of non-empty list, point to next elem */ + head->ptr = ucs_list_next(&elem->list, ucs_hlist_link_t, list); + } + ucs_list_del(&elem->list); + } +} + + +/** + * Remove the first element from a detached-head list, and return it. + * + * @param [in] head List head to remove from. + * + * @return The former list head element, or NULL if the list is empty. 
+ */ +static UCS_F_ALWAYS_INLINE ucs_hlist_link_t* +ucs_hlist_extract_head(ucs_hlist_head_t *head) +{ + ucs_hlist_link_t *elem; + + if (head->ptr == NULL) { + return NULL; + } + + elem = head->ptr; + ucs_hlist_del(head, elem); // TOOD optimize by assuming elem=head->ptr + return elem; +} + + +/** + * Get list head element as the containing type, assuming the list is not empty. + * + * @param _head List head. + * @param _type Containing structure type. + * @param _member List element inside the containing structure. + * + * @note If the list is empty this macro has undefined behavior. + */ +#define ucs_hlist_head_elem(_head, _type, _member) \ + ucs_container_of((_head)->ptr, _type, _member) + + +/** + * Get list next element as the containing type. + * + * @param _elem List element. + * @param _type Containing structure type. + * @param _member List element inside the containing structure. + */ +#define ucs_hlist_next_elem(_elem, _member) \ + ucs_container_of(ucs_list_next(&(_elem)->_member.list, ucs_hlist_link_t, \ + list), \ + typeof(*(_elem)), _member) + + +/** + * Iterate over detached-head list. + * + * @param _elem Variable to hold the current list element + * @param _head Pointer to list head. + * @param _member List element inside the containing structure. + * + * @note The iteration is implemented by first setting the element to NULL, then + * inside 'for' loop condition (which is done before each iteration), we advance + * the element pointer and check for end condition: in the first iteration, + * when elem is NULL, we check that the list is not empty. For subsequent + * iterations, we check that elem has not reached the list head yet. + */ +#define ucs_hlist_for_each(_elem, _head, _member) \ + for (_elem = NULL; \ + (_elem == NULL) ? \ + /* First iteration: check _head->ptr != NULL. 2nd && condition is \ + * actually _elem != NULL which is always expected to be true. 
\ + * We can't check (&_elem->_member != NULL) because some compilers \ + * assume pointer-to-member is never NULL */ \ + (!ucs_hlist_is_empty(_head) && \ + ((_elem = ucs_hlist_head_elem(_head, typeof(*(_elem)), _member)) \ + != NULL)) : \ + /* rest of iterations: check _elem != _head->ptr */ \ + ((_elem = ucs_hlist_next_elem(_elem, _member)) != \ + ucs_hlist_head_elem(_head, typeof(*(_elem)), _member)); \ + ) + + +/** + * Remove the first element from a detached-head list, and return its containing + * type. + * + * @param _head List head to remove from. + * @param _type Type of the structure containing list element. + * @param _member List element inside the containing structure. + */ +#define ucs_list_extract_head_elem(_head, _type, _member) \ + ucs_container_of(ucs_hlist_extract_head(_head), _type, _member) + + +/** + * Iterate over detached-head list, while removing the head element, until the + * list becomes empty. + * + * @param _elem Variable to hold the current list element + * @param _head Pointer to list head. + * @param _member List element inside the containing structure. + */ +#define ucs_hlist_for_each_extract(_elem, _head, _member) \ + for (_elem = ucs_list_extract_head_elem(_head, typeof(*(_elem)), _member); \ + _elem != ucs_container_of(NULL, typeof(*(_elem)), _member); \ + _elem = ucs_list_extract_head_elem(_head, typeof(*(_elem)), _member)) + + +END_C_DECLS + +#endif diff --git a/src/ucs/datastruct/khash.h b/src/ucs/datastruct/khash.h index a9d7dc6fced..cb378966a6c 100644 --- a/src/ucs/datastruct/khash.h +++ b/src/ucs/datastruct/khash.h @@ -129,6 +129,17 @@ int main() { #include #include +/* Clang analyzer thinks that `h->flags` can be NULL, but this is + * the wrong assumption - add `kassert()` to suppress the warning */ +#ifdef __clang_analyzer__ +#include +#define kassert(...) assert(__VA_ARGS__) +#define kmemset_analyzer(P, Z, N) kmemset(P, Z, N) +#else +#define kassert(...) 
+#define kmemset_analyzer(P, Z, N) +#endif + /* compiler specific configuration */ #if UINT_MAX == 0xffffffffu @@ -246,6 +257,9 @@ static const double __ac_HASH_UPPER = 0.77; if (h->n_buckets) { \ khint_t k, i, last, mask, step = 0; \ mask = h->n_buckets - 1; \ + \ + kassert(h->flags != NULL); \ + \ k = __hash_func(key); i = k & mask; \ last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ @@ -271,6 +285,8 @@ static const double __ac_HASH_UPPER = 0.77; khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ if (!new_keys) { kfree(new_flags); return -1; } \ h->keys = new_keys; \ + kmemset_analyzer(h->keys + (h->n_buckets * sizeof(khkey_t)), 0, \ + (new_n_buckets - h->n_buckets) * sizeof(khkey_t)); \ if (kh_is_map) { \ khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ if (!new_vals) { kfree(new_flags); return -1; } \ @@ -330,6 +346,9 @@ static const double __ac_HASH_UPPER = 0.77; *ret = -1; return h->n_buckets; \ } \ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + \ + kassert(h->flags != NULL); \ + \ { \ khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ @@ -507,6 +526,13 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) */ #define kh_put(name, h, k, r) kh_put_##name(h, k, r) +typedef enum ucs_kh_put { + UCS_KH_PUT_FAILED = -1, + UCS_KH_PUT_KEY_PRESENT = 0, + UCS_KH_PUT_BUCKET_EMPTY = 1, + UCS_KH_PUT_BUCKET_CLEAR = 2 +} ucs_kh_put_t; + /*! @function @abstract Retrieve a key from the hash table. @param name Name of the hash table [symbol] diff --git a/src/ucs/datastruct/linear_func.h b/src/ucs/datastruct/linear_func.h new file mode 100644 index 00000000000..296042f2026 --- /dev/null +++ b/src/ucs/datastruct/linear_func.h @@ -0,0 +1,172 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2019. 
ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCS_LINEAR_FUNC_H_ +#define UCS_LINEAR_FUNC_H_ + +#include +#include +#include + + +/** + * A 1d linear function, represented as f(x) = c + x * m. + */ +typedef struct { + double c; /* constant factor */ + double m; /* multiplicative factor */ +} ucs_linear_func_t; + + +/** + * Construct a linear function + * + * @param [in] c Linear function constant factor + * @param [in] m Linear function multiplicative factor + * + * @return A linear function which represents f(x) = c + x * m + */ +static UCS_F_ALWAYS_INLINE ucs_linear_func_t +ucs_linear_func_make(double c, double m) +{ + ucs_linear_func_t result; + + result.c = c; + result.m = m; + + return result; +} + + +/** + * Calculate the linear function value for a specific point. + * + * @param [in] f Linear function to apply. + * @param [in] x Point to apply the function at. + * + * @return f(x) + */ +static UCS_F_ALWAYS_INLINE double +ucs_linear_func_apply(ucs_linear_func_t f, double x) +{ + return f.c + (f.m * x); +} + + +/** + * Sum two linear functions. + * + * @param [in] func1 First function to add. + * @param [in] func2 Second function to add. + * + * @return Linear function representing (func1 + func2) + */ +static UCS_F_ALWAYS_INLINE ucs_linear_func_t +ucs_linear_func_add(ucs_linear_func_t func1, ucs_linear_func_t func2) +{ + return ucs_linear_func_make(func1.c + func2.c, func1.m + func2.m); +} + + +/** + * Subtract one linear function from another. + * + * @param [in] func1 Linear function to subtract from. + * @param [in] func2 Linear function to subtract. + * + * @return Linear function representing (func1 - func2) + */ +static inline ucs_linear_func_t +ucs_linear_func_sub(ucs_linear_func_t func1, ucs_linear_func_t func2) +{ + return ucs_linear_func_make(func1.c - func2.c, func1.m - func2.m); +} + + +/** + * Add one linear function to another in-place. 
+ * + * @param [inout] func1 First linear function to add, and the result of the + * operation + * @param [in] func2 Second function to add. + */ +static inline void +ucs_linear_func_add_inplace(ucs_linear_func_t *func1, ucs_linear_func_t func2) +{ + func1->m += func2.m; + func1->c += func2.c; +} + + +/** + * Substitute the "x" argument of a linear function by another linear function + * and return the composition of the functions. + * + * @param [in] outer Linear function whose "x" argument should be substituted. + * @param [in] inner Linear function to substitute. + * + * @return Linear function representing outer(inner(x)) + */ +static inline ucs_linear_func_t +ucs_linear_func_compose(ucs_linear_func_t outer, ucs_linear_func_t inner) +{ + /* + * let outer(x) = outer.m*x + outer.c, and inner(x) = inner.m*x + inner.c + * then outer(inner(x)) = outer.m(inner.m*x + inner.c) + outer.c = + * (outer.m * inner.m)x + (outer.m*inner.c + outer.c) + */ + return ucs_linear_func_make((outer.m * inner.c) + outer.c, + outer.m * inner.m); +} + + +/** + * Find the intersection point between two linear functions. If the functions + * do not intersect, the result is undefined. + * + * @param [in] func1 First function to intersect. + * @param [in] func2 Second function to intersect. + * @param [out] x_intersect Upon success, set to the X-value of the + * intersection point. + * + * @return UCS_OK if success, UCS_ERR_INVALID_PARAM if the linear functions have + * no intersection, or if their intersection point exceeds the maximal + * double value. + * + */ +static inline ucs_status_t +ucs_linear_func_intersect(ucs_linear_func_t func1, ucs_linear_func_t func2, + double *x_intersect) +{ + double x; + + x = (func2.c - func1.c) / (func1.m - func2.m); + if (isnan(x) || isinf(x)) { + return UCS_ERR_INVALID_PARAM; + } + + *x_intersect = x; + return UCS_OK; +} + + +/* + * Increase the constant of a given linear function by a value of another linear + * function at a specific point. 
+ * + * @param [inout] func Increase the constant of this linear function. + * @param [in] baseline_func Add the value of this linear function. + * @param [in] baseline_x Point at which to take the value of + * @a baseline_func. + */ +static inline void +ucs_linear_func_add_value_at(ucs_linear_func_t *func, + ucs_linear_func_t baseline_func, double baseline_x) +{ + func->c += ucs_linear_func_apply(baseline_func, baseline_x); +} + +#endif diff --git a/src/ucs/datastruct/list.h b/src/ucs/datastruct/list.h index b20848c4351..a67201a26fc 100644 --- a/src/ucs/datastruct/list.h +++ b/src/ucs/datastruct/list.h @@ -8,9 +8,10 @@ #define UCS_LIST_H_ #include -#include +BEGIN_C_DECLS + #define UCS_LIST_INITIALIZER(_prev, _next) \ { (_prev), (_next) } @@ -22,6 +23,15 @@ ucs_list_link_t name = UCS_LIST_INITIALIZER(&(name), &(name)) +/** + * A link in a circular list. + */ +typedef struct ucs_list_link { + struct ucs_list_link *prev; + struct ucs_list_link *next; +} ucs_list_link_t; + + /** * Initialize list head. * @@ -49,6 +59,18 @@ static inline void ucs_list_insert_replace(ucs_list_link_t *prev, next->prev = elem; } +/** + * Replace an element in a list with another element. + * + * @param elem Element in the list to replace. + * @param replacement New element to insert in place of 'elem'. + */ +static inline void ucs_list_replace(ucs_list_link_t *elem, + ucs_list_link_t *replacement) +{ + ucs_list_insert_replace(elem->prev, elem->next, replacement); +} + /** * Insert an item to a list after another item. * @@ -78,10 +100,10 @@ static inline void ucs_list_insert_before(ucs_list_link_t *pos, * * @param link Item to remove. 
*/ -static inline void ucs_list_del(ucs_list_link_t *link) +static inline void ucs_list_del(ucs_list_link_t *elem) { - link->prev->next = link->next; - link->next->prev = link->prev; + elem->prev->next = elem->next; + elem->next->prev = elem->prev; } /** @@ -190,4 +212,6 @@ static inline unsigned long ucs_list_length(ucs_list_link_t *head) ucs_container_of(tmp, _type, _member); \ }) +END_C_DECLS + #endif diff --git a/src/ucs/datastruct/list_types.h b/src/ucs/datastruct/list_types.h deleted file mode 100644 index 631576a2e4c..00000000000 --- a/src/ucs/datastruct/list_types.h +++ /dev/null @@ -1,20 +0,0 @@ -/* -* Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. -* -* See file LICENSE for terms. -*/ - -#ifndef UCS_LIST_TYPES_H_ -#define UCS_LIST_TYPES_H_ - - -/** - * A link in a circular list. - */ -typedef struct ucs_list_link { - struct ucs_list_link *prev; - struct ucs_list_link *next; -} ucs_list_link_t; - - -#endif /* UCS_LIST_TYPES_H_ */ diff --git a/src/ucs/datastruct/mpmc.c b/src/ucs/datastruct/mpmc.c index be3cc54b565..07b49bafc4e 100644 --- a/src/ucs/datastruct/mpmc.c +++ b/src/ucs/datastruct/mpmc.c @@ -5,6 +5,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "mpmc.h" #include @@ -19,13 +23,13 @@ ucs_status_t ucs_mpmc_queue_init(ucs_mpmc_queue_t *mpmc, uint32_t length) mpmc->length = ucs_roundup_pow2(length); mpmc->shift = ucs_ilog2(mpmc->length); - if (mpmc->length >= UCS_BIT(UCS_MPMC_VALID_SHIFT)) { + if (mpmc->shift >= UCS_MPMC_VALID_SHIFT) { return UCS_ERR_INVALID_PARAM; } mpmc->consumer = 0; mpmc->producer = 0; - mpmc->queue = ucs_malloc(sizeof(int) * mpmc->length, "mpmc"); + mpmc->queue = ucs_malloc(sizeof(*mpmc->queue) * mpmc->length, "mpmc"); if (mpmc->queue == NULL) { return UCS_ERR_NO_MEMORY; } @@ -42,12 +46,12 @@ void ucs_mpmc_queue_cleanup(ucs_mpmc_queue_t *mpmc) ucs_free(mpmc->queue); } -static inline uint32_t __ucs_mpmc_queue_valid_bit(ucs_mpmc_queue_t *mpmc, uint32_t location) +static inline uint64_t __ucs_mpmc_queue_valid_bit(ucs_mpmc_queue_t *mpmc, uint32_t location) { - return (location >> mpmc->shift) & 1; + return ((uint64_t)location >> mpmc->shift) & 1; } -ucs_status_t ucs_mpmc_queue_push(ucs_mpmc_queue_t *mpmc, uint32_t value) +ucs_status_t ucs_mpmc_queue_push(ucs_mpmc_queue_t *mpmc, uint64_t value) { uint32_t location; @@ -67,9 +71,10 @@ ucs_status_t ucs_mpmc_queue_push(ucs_mpmc_queue_t *mpmc, uint32_t value) } -ucs_status_t ucs_mpmc_queue_pull(ucs_mpmc_queue_t *mpmc, uint32_t *value_p) +ucs_status_t ucs_mpmc_queue_pull(ucs_mpmc_queue_t *mpmc, uint64_t *value_p) { - uint32_t location, value; + uint32_t location; + uint64_t value; location = mpmc->consumer; if (location == mpmc->producer) { diff --git a/src/ucs/datastruct/mpmc.h b/src/ucs/datastruct/mpmc.h index 6c9df6c3dc4..3b9770f436b 100644 --- a/src/ucs/datastruct/mpmc.h +++ b/src/ucs/datastruct/mpmc.h @@ -10,7 +10,7 @@ #include #include -#define UCS_MPMC_VALID_SHIFT 31 +#define UCS_MPMC_VALID_SHIFT 63 #define UCS_MPMC_VALUE_MAX UCS_BIT(UCS_MPMC_VALID_SHIFT) /** @@ -25,7 +25,7 @@ typedef struct ucs_mpmc_queue { int shift; volatile uint32_t producer; /* Producer index */ volatile 
uint32_t consumer; /* Consumer index */ - uint32_t *queue; /* Array of data */ + uint64_t *queue; /* Array of data */ } ucs_mpmc_queue_t; @@ -49,7 +49,7 @@ void ucs_mpmc_queue_cleanup(ucs_mpmc_queue_t *mpmc); * @param value Value to push. * @return UCS_ERR_EXCEEDS_LIMIT if the queue is full. */ -ucs_status_t ucs_mpmc_queue_push(ucs_mpmc_queue_t *mpmc, uint32_t value); +ucs_status_t ucs_mpmc_queue_push(ucs_mpmc_queue_t *mpmc, uint64_t value); /** @@ -59,7 +59,7 @@ ucs_status_t ucs_mpmc_queue_push(ucs_mpmc_queue_t *mpmc, uint32_t value); * @param UCS_ERR_NO_PROGRESS if there is currently no available item to retrieve, * or another thread removed the current item. */ -ucs_status_t ucs_mpmc_queue_pull(ucs_mpmc_queue_t *mpmc, uint32_t *value_p); +ucs_status_t ucs_mpmc_queue_pull(ucs_mpmc_queue_t *mpmc, uint64_t *value_p); /** diff --git a/src/ucs/datastruct/mpool.c b/src/ucs/datastruct/mpool.c index 821ad62fc80..1b142f8ad23 100644 --- a/src/ucs/datastruct/mpool.c +++ b/src/ucs/datastruct/mpool.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "mpool.h" #include "mpool.inl" #include "queue.h" @@ -23,7 +27,8 @@ static inline ucs_mpool_elem_t *ucs_mpool_chunk_elem(ucs_mpool_data_t *data, ucs_mpool_chunk_t *chunk, unsigned elem_index) { - return chunk->elems + elem_index * ucs_mpool_elem_total_size(data); + return UCS_PTR_BYTE_OFFSET(chunk->elems, + elem_index * ucs_mpool_elem_total_size(data)); } static void ucs_mpool_chunk_leak_check(ucs_mpool_t *mp, ucs_mpool_chunk_t *chunk) @@ -192,7 +197,7 @@ void ucs_mpool_grow(ucs_mpool_t *mp, unsigned num_elems) chunk = ptr; chunk_padding = ucs_padding((uintptr_t)(chunk + 1) + data->align_offset, data->alignment); - chunk->elems = (void*)(chunk + 1) + chunk_padding; + chunk->elems = UCS_PTR_BYTE_OFFSET(chunk + 1, chunk_padding); chunk->num_elems = ucs_min(data->quota, (chunk_size - chunk_padding - sizeof(*chunk)) / ucs_mpool_elem_total_size(data)); @@ -286,11 +291,14 @@ typedef struct ucs_hugetlb_mpool_chunk_hdr { ucs_status_t ucs_mpool_hugetlb_malloc(ucs_mpool_t *mp, size_t *size_p, void **chunk_p) { ucs_hugetlb_mpool_chunk_hdr_t *chunk; + size_t real_size; +#ifdef SHM_HUGETLB void *ptr; ucs_status_t status; - size_t real_size; int shmid; +#endif +#ifdef SHM_HUGETLB ptr = NULL; /* First, try hugetlb */ @@ -302,6 +310,7 @@ ucs_status_t ucs_mpool_hugetlb_malloc(ucs_mpool_t *mp, size_t *size_p, void **ch chunk->hugetlb = 1; goto out_ok; } +#endif /* Fallback to glibc */ real_size = *size_p; diff --git a/src/ucs/datastruct/pgtable.c b/src/ucs/datastruct/pgtable.c index 1ee0d48c22a..cb9dd8b1614 100644 --- a/src/ucs/datastruct/pgtable.c +++ b/src/ucs/datastruct/pgtable.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "pgtable.h" #include @@ -81,6 +85,14 @@ static inline void ucs_pgt_dir_release(ucs_pgtable_t *pgtable, ucs_pgt_dir_t* pg pgtable->pgd_release_cb(pgtable, pgd); } +static inline void ucs_pgt_address_advance(ucs_pgt_addr_t *address_p, + unsigned order) +{ + ucs_assert(order < 64); + /* coverity[large_shift] */ + *address_p += 1ul << order; +} + static void ucs_pgt_entry_dump_recurs(const ucs_pgtable_t *pgtable, unsigned indent, const ucs_pgt_entry_t *pte, unsigned pte_index, ucs_pgt_addr_t base, ucs_pgt_addr_t mask, @@ -223,15 +235,20 @@ static unsigned ucs_pgtable_get_next_page_order(ucs_pgt_addr_t start, ucs_pgt_ad ucs_assertv(ucs_pgt_is_addr_aligned(start), "start=0x%lx", start); ucs_assertv(ucs_pgt_is_addr_aligned(end), "end=0x%lx", end); - if (end - start == 0) { + if ((end == 0) && (start == 0)) { log2_len = UCS_PGT_ADDR_ORDER; /* entire range */ + } else if (end == start) { + log2_len = UCS_PGT_ADDR_SHIFT; } else { log2_len = ucs_ilog2(end - start); + if (start) { + log2_len = ucs_min(ucs_ffs64(start), log2_len); + } } - if (start != 0) { - log2_len = ucs_min(ucs_ffs64(start), log2_len); - } - ucs_assertv(log2_len >= UCS_PGT_ADDR_SHIFT, "log2_len=%u start=0x%lx end=0x%lx", + + ucs_assertv((log2_len >= UCS_PGT_ADDR_SHIFT) && + (log2_len <= UCS_PGT_ADDR_ORDER), + "log2_len=%u start=0x%lx end=0x%lx", log2_len, start, end); /* Order should be: [ADDR_SHIFT + k * ENTRY_SHIFT] */ @@ -250,7 +267,7 @@ static ucs_status_t ucs_pgtable_insert_page(ucs_pgtable_t *pgtable, ucs_pgt_addr_t address, unsigned order, ucs_pgt_region_t *region) { - ucs_pgt_dir_t dummy_pgd; + ucs_pgt_dir_t dummy_pgd = {}; ucs_pgt_entry_t *pte; ucs_pgt_dir_t *pgd; unsigned shift; @@ -362,7 +379,7 @@ static ucs_status_t ucs_pgtable_remove_page(ucs_pgtable_t *pgtable, ucs_pgt_addr_t address, unsigned order, ucs_pgt_region_t *region) { - ucs_pgt_dir_t dummy_dir; + ucs_pgt_dir_t dummy_pgd = {}; ucs_status_t status; 
ucs_pgtable_check_page(address, order); @@ -371,7 +388,7 @@ ucs_pgtable_remove_page(ucs_pgtable_t *pgtable, ucs_pgt_addr_t address, return UCS_ERR_NO_ELEM; } - status = ucs_pgtable_remove_page_recurs(pgtable, address, order, &dummy_dir, + status = ucs_pgtable_remove_page_recurs(pgtable, address, order, &dummy_pgd, &pgtable->root, pgtable->shift, region); if (status != UCS_OK) { @@ -404,7 +421,8 @@ ucs_status_t ucs_pgtable_insert(ucs_pgtable_t *pgtable, ucs_pgt_region_t *region if (status != UCS_OK) { goto err; } - address += 1ul << order; + + ucs_pgt_address_advance(&address, order); } ++pgtable->num_regions; @@ -418,7 +436,7 @@ ucs_status_t ucs_pgtable_insert(ucs_pgtable_t *pgtable, ucs_pgt_region_t *region while (address < end) { order = ucs_pgtable_get_next_page_order(address, end); ucs_pgtable_remove_page(pgtable, address, order, region); - address += 1ul << order; + ucs_pgt_address_advance(&address, order); } return status; } @@ -445,7 +463,8 @@ ucs_status_t ucs_pgtable_remove(ucs_pgtable_t *pgtable, ucs_pgt_region_t *region ucs_assert(address == region->start); /* Cannot be partially removed */ return status; } - address += 1ul << order; + + ucs_pgt_address_advance(&address, order); } ucs_assert(pgtable->num_regions > 0); @@ -550,7 +569,7 @@ void ucs_pgtable_search_range(const ucs_pgtable_t *pgtable, ucs_pgt_addr_t address = ucs_align_down_pow2(from, UCS_PGT_ADDR_ALIGN); ucs_pgt_addr_t end = ucs_align_up_pow2(to, UCS_PGT_ADDR_ALIGN); ucs_pgt_region_t *last; - unsigned order = 0; + unsigned order; /* if the page table is covering only part of the address space, intersect * the range with page table address span */ @@ -562,13 +581,18 @@ void ucs_pgtable_search_range(const ucs_pgtable_t *pgtable, } last = NULL; - while ((address <= to) && (order != UCS_PGT_ADDR_ORDER)) { + while (address <= to) { order = ucs_pgtable_get_next_page_order(address, end); if ((address & pgtable->mask) == pgtable->base) { ucs_pgtable_search_recurs(pgtable, address, order, 
&pgtable->root, pgtable->shift, cb, arg, &last); } - address += 1ul << order; + + if (order == UCS_PGT_ADDR_ORDER) { + break; + } + + ucs_pgt_address_advance(&address, order); } } diff --git a/src/ucs/datastruct/ptr_array.c b/src/ucs/datastruct/ptr_array.c index f2f2c3d8007..691281bddf2 100644 --- a/src/ucs/datastruct/ptr_array.c +++ b/src/ucs/datastruct/ptr_array.c @@ -1,49 +1,48 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2013. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ptr_array.h" #include #include #include #include -#include /* Initial allocation size */ #define UCS_PTR_ARRAY_INITIAL_SIZE 8 -static inline int ucs_ptr_array_is_free(ucs_ptr_array_t *ptr_array, unsigned index) +static UCS_F_ALWAYS_INLINE int +ucs_ptr_array_is_free(ucs_ptr_array_t *ptr_array, unsigned element_index) { - return (index < ptr_array->size) && - __ucs_ptr_array_is_free(ptr_array->start[index]); + return (element_index < ptr_array->size) && + __ucs_ptr_array_is_free(ptr_array->start[element_index]); } -static inline uint32_t ucs_ptr_array_placeholder_get(ucs_ptr_array_elem_t elem) +static UCS_F_ALWAYS_INLINE uint32_t +ucs_ptr_array_size_free_get_free_ahead(ucs_ptr_array_elem_t elem) { ucs_assert(__ucs_ptr_array_is_free(elem)); - return elem >> UCS_PTR_ARRAY_PLCHDR_SHIFT; -} - -static inline void ucs_ptr_array_placeholder_set(ucs_ptr_array_elem_t *elem, - uint32_t placeholder) -{ - *elem = (*elem & ~UCS_PTR_ARRAY_PLCHDR_MASK) | - (((ucs_ptr_array_elem_t)placeholder) << UCS_PTR_ARRAY_PLCHDR_SHIFT); + return elem >> UCS_PTR_ARRAY_FREE_AHEAD_SHIFT; } -static inline unsigned +static UCS_F_ALWAYS_INLINE unsigned ucs_ptr_array_freelist_get_next(ucs_ptr_array_elem_t elem) { ucs_assert(__ucs_ptr_array_is_free(elem)); return (elem & UCS_PTR_ARRAY_NEXT_MASK) >> UCS_PTR_ARRAY_NEXT_SHIFT; } -static inline void +static UCS_F_ALWAYS_INLINE 
void ucs_ptr_array_freelist_set_next(ucs_ptr_array_elem_t *elem, unsigned next) { ucs_assert(next <= UCS_PTR_ARRAY_NEXT_MASK); @@ -51,20 +50,49 @@ ucs_ptr_array_freelist_set_next(ucs_ptr_array_elem_t *elem, unsigned next) (((ucs_ptr_array_elem_t)next) << UCS_PTR_ARRAY_NEXT_SHIFT); } +/** + * Sets all values of a free ptr array element + * + * @param [in/out] elem Pointer to a free element in the ptr array. + * @param [in] free_ahead Number of free consecutive elements ahead. + * @param [in] next Pointer to the next element in the ptr array. + * + * Complexity: O(1) + */ +static UCS_F_ALWAYS_INLINE void +ucs_ptr_array_freelist_element_set(ucs_ptr_array_elem_t *elem, + uint32_t free_ahead, + unsigned next) +{ + ucs_assert(next <= UCS_PTR_ARRAY_NEXT_MASK); + + *elem = UCS_PTR_ARRAY_FLAG_FREE | + (((ucs_ptr_array_elem_t)free_ahead) << UCS_PTR_ARRAY_FREE_AHEAD_SHIFT) | + (((ucs_ptr_array_elem_t)next) << UCS_PTR_ARRAY_NEXT_SHIFT); +} + +static UCS_F_ALWAYS_INLINE void +ucs_ptr_array_freelist_element_set_free_ahead(ucs_ptr_array_elem_t *elem, + uint32_t free_ahead) +{ + ucs_ptr_array_freelist_element_set(elem, free_ahead, + ucs_ptr_array_freelist_get_next(*elem)); +} + static void UCS_F_MAYBE_UNUSED ucs_ptr_array_dump(ucs_ptr_array_t *ptr_array) { -#if ENABLE_ASSERT - ucs_ptr_array_elem_t elem; +#if UCS_ENABLE_ASSERT unsigned i; - ucs_trace_data("ptr_array start %p size %u", ptr_array->start, ptr_array->size); + ucs_trace_data("ptr_array start %p size %u", + ptr_array->start, ptr_array->size); for (i = 0; i < ptr_array->size; ++i) { - elem = ptr_array->start[i]; if (ucs_ptr_array_is_free(ptr_array, i)) { - ucs_trace_data("[%u]= (%u)", i, - ucs_ptr_array_placeholder_get(elem)); + ucs_trace_data("(%u) [%u]= [%u]=", i, + ucs_ptr_array_size_free_get_free_ahead(ptr_array->start[i]), + ucs_ptr_array_freelist_get_next(ptr_array->start[i])); } else { - ucs_trace_data("[%u]=%p", i, (void*)elem); + ucs_trace_data("[%u]=%p", i, (void*)ptr_array->start[i]); } } @@ -84,12 +112,10 @@ 
static void ucs_ptr_array_clear(ucs_ptr_array_t *ptr_array) ptr_array->freelist = UCS_PTR_ARRAY_SENTINEL; } -void ucs_ptr_array_init(ucs_ptr_array_t *ptr_array, uint32_t init_placeholder, - const char *name) +void ucs_ptr_array_init(ucs_ptr_array_t *ptr_array, const char *name) { - ptr_array->init_placeholder = init_placeholder; ucs_ptr_array_clear(ptr_array); -#if ENABLE_MEMTRACK +#ifdef ENABLE_MEMTRACK ucs_snprintf_zero(ptr_array->name, sizeof(ptr_array->name), "%s", name); #endif } @@ -102,7 +128,8 @@ void ucs_ptr_array_cleanup(ucs_ptr_array_t *ptr_array) for (i = 0; i < ptr_array->size; ++i) { if (!ucs_ptr_array_is_free(ptr_array, i)) { ++inuse; - ucs_trace("ptr_array(%p) idx %d is not free during cleanup", ptr_array, i); + ucs_trace("ptr_array(%p) idx %d is not free during cleanup", + ptr_array, i); } } @@ -114,29 +141,22 @@ void ucs_ptr_array_cleanup(ucs_ptr_array_t *ptr_array) ucs_ptr_array_clear(ptr_array); } -static void ucs_ptr_array_grow(ucs_ptr_array_t *ptr_array UCS_MEMTRACK_ARG) +static void ucs_ptr_array_grow(ucs_ptr_array_t *ptr_array, unsigned new_size + UCS_MEMTRACK_ARG) { ucs_ptr_array_elem_t *new_array; - unsigned curr_size, new_size; - unsigned i, next; - - curr_size = ptr_array->size; - if (curr_size == 0) { - new_size = UCS_PTR_ARRAY_INITIAL_SIZE; - } else { - new_size = curr_size * 2; - } + unsigned curr_size, i, next; /* Allocate new array */ new_array = ucs_malloc(new_size * sizeof(ucs_ptr_array_elem_t) UCS_MEMTRACK_VAL); ucs_assert_always(new_array != NULL); + curr_size = ptr_array->size; memcpy(new_array, ptr_array->start, curr_size * sizeof(ucs_ptr_array_elem_t)); /* Link all new array items */ for (i = curr_size; i < new_size; ++i) { - new_array[i] = UCS_PTR_ARRAY_FLAG_FREE; - ucs_ptr_array_placeholder_set(&new_array[i], ptr_array->init_placeholder); - ucs_ptr_array_freelist_set_next(&new_array[i], i + 1); + ucs_ptr_array_freelist_element_set(&new_array[i], new_size - i, + i + 1); } ucs_ptr_array_freelist_set_next(&new_array[new_size - 
1], UCS_PTR_ARRAY_SENTINEL); @@ -147,9 +167,9 @@ static void ucs_ptr_array_grow(ucs_ptr_array_t *ptr_array UCS_MEMTRACK_ARG) next = ptr_array->freelist; do { i = next; - next = ucs_ptr_array_freelist_get_next(ptr_array->start[i]); + next = ucs_ptr_array_freelist_get_next(new_array[i]); } while (next != UCS_PTR_ARRAY_SENTINEL); - ucs_ptr_array_freelist_set_next(&ptr_array->start[i], curr_size); + ucs_ptr_array_freelist_set_next(&new_array[i], curr_size); } /* Switch to new array */ @@ -158,50 +178,187 @@ static void ucs_ptr_array_grow(ucs_ptr_array_t *ptr_array UCS_MEMTRACK_ARG) ptr_array->size = new_size; } -unsigned ucs_ptr_array_insert(ucs_ptr_array_t *ptr_array, void *value, - uint32_t *placeholder_p) +unsigned ucs_ptr_array_insert(ucs_ptr_array_t *ptr_array, void *value) { ucs_ptr_array_elem_t *elem; - unsigned index; + unsigned element_index, new_size; ucs_assert_always(((uintptr_t)value & UCS_PTR_ARRAY_FLAG_FREE) == 0); if (ptr_array->freelist == UCS_PTR_ARRAY_SENTINEL) { - ucs_ptr_array_grow(ptr_array UCS_MEMTRACK_NAME(ptr_array->name)); + new_size = ucs_max(UCS_PTR_ARRAY_INITIAL_SIZE, ptr_array->size * 2); + ucs_ptr_array_grow(ptr_array, new_size UCS_MEMTRACK_NAME(ptr_array->name)); } /* Get the first item on the free list */ - index = ptr_array->freelist; - ucs_assert(index != UCS_PTR_ARRAY_SENTINEL); - elem = &ptr_array->start[index]; + element_index = ptr_array->freelist; + ucs_assert(element_index != UCS_PTR_ARRAY_SENTINEL); + + elem = &ptr_array->start[element_index]; - /* Remove from free list */ + /* Remove from free list and populate */ ptr_array->freelist = ucs_ptr_array_freelist_get_next(*elem); + *elem = (uintptr_t)value; + + return element_index; +} + +void ucs_ptr_array_set(ucs_ptr_array_t *ptr_array, unsigned element_index, + void *new_val) +{ + ucs_ptr_array_elem_t *elem; + unsigned next, free_iter, free_ahead, new_size; + + if (ucs_unlikely(element_index > ptr_array->size)) { + new_size = ucs_max(ptr_array->size * 2, element_index + 1); + 
ucs_ptr_array_grow(ptr_array, new_size UCS_MEMTRACK_NAME(ptr_array->name)); + } else if (!__ucs_ptr_array_is_free(ptr_array->start[element_index])) { + ptr_array->start[element_index] = (uintptr_t)new_val; + return; + } + + next = ucs_ptr_array_freelist_get_next(ptr_array->start[element_index]); + ptr_array->start[element_index] = (uintptr_t)new_val; + + /* update the "next index" in the free list (removing element_index from it) */ + free_iter = ptr_array->freelist; + if (ucs_unlikely(free_iter == element_index)) { + ptr_array->freelist = next; + } else { + while (element_index != + ucs_ptr_array_freelist_get_next(ptr_array->start[free_iter])) { + free_iter = + ucs_ptr_array_freelist_get_next(ptr_array->start[free_iter]); + ucs_assert(free_iter != UCS_PTR_ARRAY_SENTINEL); + } + ucs_ptr_array_freelist_set_next(ptr_array->start + free_iter, next); + } + + /* update the "free-ahead" for the cells before me */ + free_ahead = 1; + elem = ptr_array->start + element_index - 1; + while ((elem >= ptr_array->start) && (__ucs_ptr_array_is_free(*elem))) { + ucs_ptr_array_freelist_element_set_free_ahead(elem, free_ahead); + free_ahead++; + elem--; + } +} + +void ucs_ptr_array_remove(ucs_ptr_array_t *ptr_array, unsigned element_index) +{ + ucs_ptr_array_elem_t *next_elem; + uint32_t size_free_ahead; - /* Populate */ - *placeholder_p = ucs_ptr_array_placeholder_get(*elem); - *elem = (uintptr_t)value; - return index; + ucs_assert_always(!ucs_ptr_array_is_free(ptr_array, element_index)); + + if (ucs_ptr_array_is_free(ptr_array, element_index + 1)) { + next_elem = &ptr_array->start[element_index + 1]; + size_free_ahead = ucs_ptr_array_size_free_get_free_ahead(*next_elem) + 1; + } else { + size_free_ahead = 1; + } + + ucs_ptr_array_freelist_element_set(&ptr_array->start[element_index], + size_free_ahead, ptr_array->freelist); + + /* Make sure the next element is free */ + ucs_assert(__ucs_ptr_array_is_free(ptr_array->start[element_index + size_free_ahead - 1])); + + 
ptr_array->freelist = element_index; +} + +void *ucs_ptr_array_replace(ucs_ptr_array_t *ptr_array, unsigned element_index, + void *new_val) +{ + void *old_elem; + + ucs_assert_always(!ucs_ptr_array_is_free(ptr_array, element_index)); + old_elem = (void*)ptr_array->start[element_index]; + ptr_array->start[element_index] = (uintptr_t)new_val; + return old_elem; +} + + +/* + * Locked interface functions implementation + */ + +ucs_status_t +ucs_ptr_array_locked_init(ucs_ptr_array_locked_t *locked_ptr_array, + const char *name) +{ + ucs_status_t status; + + /* Initialize spinlock */ + status = ucs_recursive_spinlock_init(&locked_ptr_array->lock, 0); + if (status != UCS_OK) { + return status; + } + + /* Call unlocked function */ + ucs_ptr_array_init(&locked_ptr_array->super, name); + + return UCS_OK; +} + +void ucs_ptr_array_locked_cleanup(ucs_ptr_array_locked_t *locked_ptr_array) +{ + ucs_status_t status; + + ucs_recursive_spin_lock(&locked_ptr_array->lock); + /* Call unlocked function */ + ucs_ptr_array_cleanup(&locked_ptr_array->super); + ucs_recursive_spin_unlock(&locked_ptr_array->lock); + + /* Destroy spinlock */ + status = ucs_recursive_spinlock_destroy(&locked_ptr_array->lock); + if (status != UCS_OK) { + ucs_warn("ucs_recursive_spinlock_destroy() - failed (%d)", status); + } } -void ucs_ptr_array_remove(ucs_ptr_array_t *ptr_array, unsigned index, - uint32_t placeholder) +unsigned ucs_ptr_array_locked_insert(ucs_ptr_array_locked_t *locked_ptr_array, + void *value) { - ucs_ptr_array_elem_t *elem = &ptr_array->start[index]; + unsigned element_index; - ucs_assert_always(!ucs_ptr_array_is_free(ptr_array, index)); - *elem = UCS_PTR_ARRAY_FLAG_FREE; - ucs_ptr_array_placeholder_set(elem, placeholder); - ucs_ptr_array_freelist_set_next(elem, ptr_array->freelist); - ptr_array->freelist = index; + ucs_recursive_spin_lock(&locked_ptr_array->lock); + /* Call unlocked function */ + element_index = ucs_ptr_array_insert(&locked_ptr_array->super, value); + 
ucs_recursive_spin_unlock(&locked_ptr_array->lock); + + return element_index; } -void *ucs_ptr_array_replace(ucs_ptr_array_t *ptr_array, unsigned index, void *new_val) +void ucs_ptr_array_locked_set(ucs_ptr_array_locked_t *locked_ptr_array, + unsigned element_index, void *new_val) +{ + ucs_recursive_spin_lock(&locked_ptr_array->lock); + /* Call unlocked function */ + ucs_ptr_array_set(&locked_ptr_array->super, element_index, new_val); + ucs_recursive_spin_unlock(&locked_ptr_array->lock); +} + +void ucs_ptr_array_locked_remove(ucs_ptr_array_locked_t *locked_ptr_array, + unsigned element_index) +{ + ucs_recursive_spin_lock(&locked_ptr_array->lock); + /* Call unlocked function */ + ucs_ptr_array_remove(&locked_ptr_array->super, element_index); + ucs_recursive_spin_unlock(&locked_ptr_array->lock); +} + +void *ucs_ptr_array_locked_replace(ucs_ptr_array_locked_t *locked_ptr_array, + unsigned element_index, void *new_val) { void *old_elem; - ucs_assert_always(!ucs_ptr_array_is_free(ptr_array, index)); - old_elem = (void *)ptr_array->start[index]; - ptr_array->start[index] = (uintptr_t)new_val; + ucs_recursive_spin_lock(&locked_ptr_array->lock); + /* Call unlocked function */ + old_elem = ucs_ptr_array_replace(&locked_ptr_array->super, element_index, + new_val); + ucs_recursive_spin_unlock(&locked_ptr_array->lock); + return old_elem; } + diff --git a/src/ucs/datastruct/ptr_array.h b/src/ucs/datastruct/ptr_array.h index 6d7d38371ad..c57adabe260 100644 --- a/src/ucs/datastruct/ptr_array.h +++ b/src/ucs/datastruct/ptr_array.h @@ -1,5 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2013. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -9,33 +10,64 @@ #include #include - +#include +#include /* * Array element layout: * * 64 32 1 0 * +-----------------+----------------+---+ - * free: | placeholder | next index | 1 | + * free: | free_ahead | next index | 1 | * +-----------------+----------------+---+ * used: | user pointer | 0 | * +-----------------+----------------+---+ * * + * free_ahead is the number of consecutive free elements ahead. + * + * The remove / insert algorithm works as follows: + * On remove of an index: If start[index+1] is free ==> + * start[index].free_elements_ahead = start[index+1].free_elements_ahead+1 + * Then, the removed index is pushed to the HEAD of the freelist. + * NOTE, that if start[index+1] is free ==> It's already in the freelist !!! + * + * On insert, we fetch the first entry of the freelist and we rely on the + * fact that the remove/insert mechanism effectively implements a LIFO + * freelist, i.e. the last item pushed into the freelist will be fetched + * first ==> There is no chance that index+1 will be fetched before index, + * since index+1 was already in the list before index was put into the list. + * + * Therefore, we can rely on the free_size_ahead field to tell how many free + * elements are from any index in the freelist. + * + * To clarify, "free_ahead" is a best-effort optimization, so when it is not + * updated on removal - the for-each code runs slower, but still correctly. + * This decision was made in order to preserve the O(1) performance of + * ucs_ptr_array_remove() - at the expense of ptr_array_for_each() performance. + * If we wanted to favor ptr_array_for_each() we had to update "free_ahead" + * values in all the empty cells before the changed one, a noticeable overhead. + * Instead, the for-each checks if the cell is empty even if it's indicated as + * such by "free_ahead". 
As for insert() - a new cell can be either inserted + * right after an occupied cell (no need to update "free_ahead") or instead of + * a removed cell (so that "free_ahead" already points to it). The resulting + * effect is that "free_ahead" may have "false positives" but never "false + * negatives". Set() is different, because it "messes" with this logic - and + * can create that "false negative". This is why it requires such a complicated + * update of the "free_ahead" (unless the set overwrites an occupied cell). + * */ typedef uint64_t ucs_ptr_array_elem_t; /** * A sparse array of pointers. - * Free slots can hold 32-bit placeholder value. */ typedef struct ucs_ptr_array { - uint32_t init_placeholder; ucs_ptr_array_elem_t *start; unsigned freelist; unsigned size; -#if ENABLE_MEMTRACK +#ifdef ENABLE_MEMTRACK char name[64]; #endif } ucs_ptr_array_t; @@ -44,11 +76,11 @@ typedef struct ucs_ptr_array { /* Flags added to lower bits of the value */ #define UCS_PTR_ARRAY_FLAG_FREE ((unsigned long)0x01) /* Slot is free */ -#define UCS_PTR_ARRAY_PLCHDR_SHIFT 32 -#define UCS_PTR_ARRAY_PLCHDR_MASK (((ucs_ptr_array_elem_t)-1) & ~UCS_MASK(UCS_PTR_ARRAY_PLCHDR_SHIFT)) -#define UCS_PTR_ARRAY_NEXT_SHIFT 1 -#define UCS_PTR_ARRAY_NEXT_MASK (UCS_MASK(UCS_PTR_ARRAY_PLCHDR_SHIFT) & ~UCS_MASK(UCS_PTR_ARRAY_NEXT_SHIFT)) -#define UCS_PTR_ARRAY_SENTINEL (UCS_PTR_ARRAY_NEXT_MASK >> UCS_PTR_ARRAY_NEXT_SHIFT) +#define UCS_PTR_ARRAY_FREE_AHEAD_SHIFT 32 +#define UCS_PTR_ARRAY_FREE_AHEAD_MASK (((ucs_ptr_array_elem_t)-1) & ~UCS_MASK(UCS_PTR_ARRAY_FREE_AHEAD_SHIFT)) +#define UCS_PTR_ARRAY_NEXT_SHIFT 1 +#define UCS_PTR_ARRAY_NEXT_MASK (UCS_MASK(UCS_PTR_ARRAY_FREE_AHEAD_SHIFT) & ~UCS_MASK(UCS_PTR_ARRAY_NEXT_SHIFT)) +#define UCS_PTR_ARRAY_SENTINEL (UCS_PTR_ARRAY_NEXT_MASK >> UCS_PTR_ARRAY_NEXT_SHIFT) #define __ucs_ptr_array_is_free(_elem) \ ((uintptr_t)(_elem) & UCS_PTR_ARRAY_FLAG_FREE) @@ -57,15 +89,18 @@ typedef struct ucs_ptr_array { /** * Initialize the array. 
* - * @param init_placeholder Default placeholder value. + * @param [in] ptr_array Pointer to a ptr array. + * @param [in] name The name of the ptr array. */ -void ucs_ptr_array_init(ucs_ptr_array_t *ptr_array, uint32_t init_placeholder, - const char *name); +void ucs_ptr_array_init(ucs_ptr_array_t *ptr_array, const char *name); /** * Cleanup the array. - * All values should already be removed from it. + * + * @param ptr_array Pointer to a ptr array. + * + * @note All values should already be removed from it. */ void ucs_ptr_array_cleanup(ucs_ptr_array_t *ptr_array); @@ -73,45 +108,77 @@ void ucs_ptr_array_cleanup(ucs_ptr_array_t *ptr_array); /** * Insert a pointer to the array. * - * @param value Pointer to insert. Must be 8-byte aligned. - * @param placeholder Filled with placeholder value. - * @return The index to which the value was inserted. + * @param [in] ptr_array Pointer to a ptr array. + * @param [in] value Pointer to insert. Must be 8-byte aligned. + * + * @return The index to which the value was inserted. * * Complexity: amortized O(1) * - * Note: The array will grow if needed. + * @note The array will grow if needed. + */ +unsigned ucs_ptr_array_insert(ucs_ptr_array_t *ptr_array, void *value); + + +/** + * Set a pointer in the array, overwriting the contents of the slot. + * + * @param [in] ptr_array Pointer to a ptr array. + * @param [in] element_index Index of slot. + * @param [in] new_val Value to put into slot given by index. + * + * Complexity: O(n) */ -unsigned ucs_ptr_array_insert(ucs_ptr_array_t *ptr_array, void *value, - uint32_t *placeholder_p); +void ucs_ptr_array_set(ucs_ptr_array_t *ptr_array, unsigned element_index, + void *new_val); /** * Remove a pointer from the array. * - * @param index Index to remove from. - * @param placeholder Value to put in the free slot. + * @param [in] ptr_array Pointer to a ptr array. + * @param [in] element_index Index to remove from. 
* * Complexity: O(1) */ -void ucs_ptr_array_remove(ucs_ptr_array_t *ptr_array, unsigned index, - uint32_t placeholder); +void ucs_ptr_array_remove(ucs_ptr_array_t *ptr_array, unsigned element_index); + + +/** + * Replace pointer in the array, assuming the slot is occupied. + * + * @param [in] ptr_array Pointer to a ptr array. + * @param [in] element_index Index of slot. + * @param [in] new_val Value to put into slot given by index. + * + * @return Old value of the slot + */ +void *ucs_ptr_array_replace(ucs_ptr_array_t *ptr_array, unsigned element_index, + void *new_val); /** - * Replace pointer in the array - * @param index index of slot - * @param new_val value to put into slot given by index - * @return old value of the slot + * Get the current size of the ptr array + * + * @param [in] ptr_array Pointer to a ptr array. + * + * @return Size of the ptr array. */ -void *ucs_ptr_array_replace(ucs_ptr_array_t *ptr_array, unsigned index, void *new_val); +static UCS_F_ALWAYS_INLINE unsigned +ucs_ptr_array_get_size(ucs_ptr_array_t *ptr_array) +{ + return ptr_array->size; +} /** * Retrieve a value from the array. * - * @param index Index to retrieve the value from. - * @param value Filled with the value. - * @return Whether the value is present and valid. + * @param [in] _ptr_array Pointer to a ptr array. + * @param [in] _index Index to retrieve the value from. + * @param [out] _var Filled with the value. + * + * @return Whether the value is present and valid. * * Complexity: O(1) */ @@ -121,12 +188,249 @@ void *ucs_ptr_array_replace(ucs_ptr_array_t *ptr_array, unsigned index, void *ne !__ucs_ptr_array_is_free(_var = (void*)((_ptr_array)->start[_index]))) +/** + * For-each user function: Calculates how many free elements are ahead. + * + * @param [in] ptr_array Pointer to a ptr array. + * @param [in] element_index Index of slot + * + * @return size_elem - The number of free elements ahead if free, if not 1. 
+ */ +static UCS_F_ALWAYS_INLINE uint32_t +__ucs_ptr_array_for_each_get_step_size(ucs_ptr_array_t *ptr_array, + unsigned element_index) +{ + uint32_t size_elem; + ucs_ptr_array_elem_t elem = ptr_array->start[element_index]; + + if (ucs_unlikely(__ucs_ptr_array_is_free((ucs_ptr_array_elem_t)elem))) { + size_elem = (elem >> UCS_PTR_ARRAY_FREE_AHEAD_SHIFT); + } else { + size_elem = 1; + } + + /* Prefetch the next item */ + ucs_prefetch(&ptr_array->start[element_index + size_elem]); + ucs_assert(size_elem > 0); + + return size_elem; +} + + +/** + * Check if element is free. + * + * @param [in] _elem An element in the ptr array. + * + * @return 1 if the element is free and 0 if it's occupied. + */ +#define __ucs_ptr_array_is_free(_elem) ((uintptr_t)(_elem) & UCS_PTR_ARRAY_FLAG_FREE) + + /** * Iterate over all valid elements in the array. + * + * @param [out] _var Pointer to current array element in the foreach. + * @param [out] _index Index variable to use as iterator (unsigned). + * @param [in] _ptr_array Pointer to a ptr array. */ #define ucs_ptr_array_for_each(_var, _index, _ptr_array) \ - for (_index = 0; _index < (_ptr_array)->size; ++_index) \ - if (!__ucs_ptr_array_is_free(_var = (void*)((_ptr_array)->start[_index]))) \ + for ((_index) = 0; ((_index) < (_ptr_array)->size); \ + (_index) += __ucs_ptr_array_for_each_get_step_size((_ptr_array), (_index))) \ + if ((ucs_likely(!__ucs_ptr_array_is_free( \ + (ucs_ptr_array_elem_t)((_var) = (void *)((_ptr_array)->start[(_index)])))))) + + +/** + * Locked interface + */ + + +/* Locked ptr array */ +typedef struct ucs_ptr_array_locked { + ucs_ptr_array_t super; + ucs_recursive_spinlock_t lock; +} ucs_ptr_array_locked_t; + + +/** + * Locked array init + * + * @param [in] locked_ptr_array Pointer to a locked ptr array. + * @param [in] name The name of the ptr array. + * + * @return Success or failure. 
+ */ +ucs_status_t +ucs_ptr_array_locked_init(ucs_ptr_array_locked_t *locked_ptr_array, + const char *name); + + +/** + * Cleanup the locked array. + * + * @param [in] locked_ptr_array Pointer to a locked ptr array. + * + * @note All values should already be removed from it. + */ +void ucs_ptr_array_locked_cleanup(ucs_ptr_array_locked_t *locked_ptr_array); + + +/** + * Insert a pointer to the locked array. + * + * @param [in] locked_ptr_array Pointer to a locked ptr array. + * @param [in] value Pointer to insert. Must be 8-byte aligned. + * + * @return The index to which the value was inserted. + * + * Complexity: Amortized O(1) + * + * @note The array will grow if needed. + */ +unsigned ucs_ptr_array_locked_insert(ucs_ptr_array_locked_t *locked_ptr_array, + void *value); + + +/** + * Set a pointer in the array, overwriting the contents of the slot. + * + * @param [in] locked_ptr_array Pointer to a locked ptr array. + * @param [in] element_index Index of slot. + * @param [in] new_val Value to put into slot given by index. + * + * Complexity: O(n) + */ +void ucs_ptr_array_locked_set(ucs_ptr_array_locked_t *locked_ptr_array, + unsigned element_index, void *new_val); + + +/** + * Remove a pointer from the locked array. + * + * @param [in] locked_ptr_array Pointer to a locked ptr array. + * @param [in] element_index Index to remove from. + * + * Complexity: O(1) + */ +void ucs_ptr_array_locked_remove(ucs_ptr_array_locked_t *locked_ptr_array, + unsigned element_index); +/** + * Replace pointer in the locked array, assuming the slot is occupied. + * + * @param [in] locked_ptr_array Pointer to a locked ptr array. + * @param [in] element_index Index of slot. + * @param [in] new_val Value to put into slot given by index. + * + * @return Old value of the slot + * + * Complexity: O(1) + */ +void *ucs_ptr_array_locked_replace(ucs_ptr_array_locked_t *locked_ptr_array, + unsigned element_index, void *new_val); + + +/** + * Acquire the ptr_array lock. 
+ * + * @param [in] _locked_ptr_array Pointer to a locked ptr array. + */ +#define ucs_ptr_array_locked_acquire_lock(_locked_ptr_array) \ + ucs_recursive_spin_lock(&(_locked_ptr_array)->lock) + + +/** + * Release the ptr_array lock. + * + * @param [in] _locked_ptr_array Pointer to a locked ptr array. + */ +#define ucs_ptr_array_locked_release_lock(_locked_ptr_array) \ + ucs_recursive_spin_unlock(&(_locked_ptr_array)->lock) + + +/** + * Retrieves a value from the locked array. + * + * @param [in] locked_ptr_array Pointer to a locked ptr array. + * @param [in] element_index Index to retrieve the value from. + * @param [out] var Filled with the value. + * + * @return Whether the value is present and valid. + * + * Complexity: O(1) + */ +static UCS_F_ALWAYS_INLINE int +ucs_ptr_array_locked_lookup(ucs_ptr_array_locked_t *locked_ptr_array, + unsigned element_index, void **var) +{ + int present; + + ucs_ptr_array_locked_acquire_lock(locked_ptr_array); + present = ucs_ptr_array_lookup(&locked_ptr_array->super, element_index, + *var); + ucs_ptr_array_locked_release_lock(locked_ptr_array); + + return present; +} + + +/** + * Get the current size of the locked ptr array + * + * @param [in] locked_ptr_array Pointer to a locked ptr array. + * + * @return Size of the locked ptr array. + */ +static UCS_F_ALWAYS_INLINE unsigned +ucs_ptr_array_locked_get_size(ucs_ptr_array_locked_t *locked_ptr_array) +{ + return ucs_ptr_array_get_size(&locked_ptr_array->super); +} + + +/** + * If foreach locked ptr_array is finalized, releases lock. + * + * @param [in] locked_ptr_array Pointer to a locked ptr array. + * @param [in] element_index The current for loop index. + * + * @return is_continue_loop for the for() loop end condition. 
+ */ +static UCS_F_ALWAYS_INLINE int +__ucx_ptr_array_locked_foreach_finalize(ucs_ptr_array_locked_t *locked_ptr_array, + uint32_t element_index) +{ + if (element_index < locked_ptr_array->super.size) { + return 1; + } + + ucs_ptr_array_locked_release_lock(locked_ptr_array); + return 0; +} + + +/** + * Iterate over all valid elements in the locked array. + * + * Please notice that using break or return are not allowed in + * this implementation. + * Using break or return would require releasing the lock before by calling, + * ucs_ptr_array_locked_release_lock(_locked_ptr_array); + * + * @param [out] _var Pointer to current array element in the foreach. + * @param [out] _index Index variable to use as iterator (unsigned). + * @param [in] _locked_ptr_array Pointer to a locked ptr array. + */ +#define ucs_ptr_array_locked_for_each(_var, _index, _locked_ptr_array) \ + for ((_index) = 0, \ + ucs_ptr_array_locked_acquire_lock(_locked_ptr_array); \ + __ucx_ptr_array_locked_foreach_finalize(_locked_ptr_array, (_index)); \ + (_index) += __ucs_ptr_array_for_each_get_step_size((&(_locked_ptr_array)->super), (_index))) \ + if ((ucs_likely(!__ucs_ptr_array_is_free( \ + (ucs_ptr_array_elem_t)((_var) = \ + (void *)((&(_locked_ptr_array)->super)->start[(_index)])))))) + #endif /* PTR_ARRAY_H_ */ + diff --git a/src/ucs/datastruct/queue.h b/src/ucs/datastruct/queue.h index 5c1860d708a..cd434c6baa5 100644 --- a/src/ucs/datastruct/queue.h +++ b/src/ucs/datastruct/queue.h @@ -20,6 +20,9 @@ */ static inline void ucs_queue_head_init(ucs_queue_head_t *queue) { +#ifdef __clang_analyzer__ + queue->head = (ucs_queue_elem_t*)(void*)queue; +#endif queue->ptail = &queue->head; } @@ -56,7 +59,7 @@ static inline void ucs_queue_push(ucs_queue_head_t *queue, ucs_queue_elem_t *ele { *queue->ptail = elem; queue->ptail = &elem->next; -#if ENABLE_ASSERT +#if UCS_ENABLE_ASSERT elem->next = NULL; /* For sanity check below */ #endif } @@ -102,6 +105,8 @@ static inline ucs_queue_elem_t 
*ucs_queue_pull_non_empty(ucs_queue_head_t *queue */ static inline void ucs_queue_del_iter(ucs_queue_head_t *queue, ucs_queue_iter_t iter) { + ucs_assert((iter != NULL) && (*iter != NULL)); + if (queue->ptail == &(*iter)->next) { queue->ptail = iter; /* deleting the last element */ *iter = NULL; /* make *ptail point to NULL */ @@ -194,9 +199,11 @@ static inline void ucs_queue_splice(ucs_queue_head_t *queue, * @param member Member inside 'elem' which is the queue link. */ #define ucs_queue_for_each(elem, queue, member) \ - for (*(queue)->ptail = NULL, \ + /* we set `ptail` field to queue address to not subtract NULL pointer */ \ + for (*(queue)->ptail = (ucs_queue_elem_t*)(void*)(queue), \ elem = ucs_container_of((queue)->head, typeof(*elem), member); \ - (elem) != ucs_container_of(NULL, typeof(*elem), member); \ + (UCS_PTR_BYTE_OFFSET(elem, ucs_offsetof(typeof(*elem), member)) != \ + (void*)(queue)); \ elem = ucs_container_of(elem->member.next, typeof(*elem), member)) /** diff --git a/src/ucs/datastruct/strided_alloc.c b/src/ucs/datastruct/strided_alloc.c index 9d5c46400b9..a688e12c318 100644 --- a/src/ucs/datastruct/strided_alloc.c +++ b/src/ucs/datastruct/strided_alloc.c @@ -3,6 +3,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "strided_alloc.h" #include "queue.h" @@ -71,7 +75,8 @@ static void ucs_strided_alloc_calc(ucs_strided_alloc_t *sa, size_t *chunk_size, sizeof(ucs_strided_alloc_chunk_t)) / sa->elem_size; } -static void ucs_strided_alloc_grow(ucs_strided_alloc_t *sa UCS_MEMTRACK_ARG) +static ucs_status_t +ucs_strided_alloc_grow(ucs_strided_alloc_t *sa UCS_MEMTRACK_ARG) { size_t chunk_size, elems_per_chunk; ucs_strided_alloc_chunk_t *chunk; @@ -83,18 +88,20 @@ static void ucs_strided_alloc_grow(ucs_strided_alloc_t *sa UCS_MEMTRACK_ARG) chunk = ucs_strided_alloc_chunk_alloc(sa, chunk_size UCS_MEMTRACK_VAL); if (chunk == NULL) { - return; + return UCS_ERR_NO_MEMORY; } chunk_mem = ucs_strided_alloc_chunk_to_mem(chunk); for (i = elems_per_chunk - 1; i >= 0; --i) { - elem = chunk_mem + (i * sa->elem_size); + elem = UCS_PTR_BYTE_OFFSET(chunk_mem, i * sa->elem_size); ucs_strided_alloc_push_to_freelist(sa, elem); } ucs_queue_push(&sa->chunks, &chunk->queue); VALGRIND_MAKE_MEM_NOACCESS(chunk_mem, chunk_size); + + return UCS_OK; } void ucs_strided_alloc_init(ucs_strided_alloc_t *sa, size_t elem_size, @@ -135,15 +142,17 @@ void ucs_strided_alloc_cleanup(ucs_strided_alloc_t *sa) void* ucs_strided_alloc_get(ucs_strided_alloc_t *sa, const char *alloc_name) { ucs_strided_alloc_elem_t *elem; + ucs_status_t status; unsigned i; if (sa->freelist == NULL) { - ucs_strided_alloc_grow(sa UCS_MEMTRACK_VAL); + status = ucs_strided_alloc_grow(sa UCS_MEMTRACK_VAL); + if (status != UCS_OK) { + return NULL; + } } - if (sa->freelist == NULL) { - return NULL; - } + ucs_assert(sa->freelist != NULL); elem = sa->freelist; VALGRIND_MAKE_MEM_DEFINED(elem, sizeof(*elem)); diff --git a/src/ucs/datastruct/strided_alloc.h b/src/ucs/datastruct/strided_alloc.h index c662c04ed71..68b87653c79 100644 --- a/src/ucs/datastruct/strided_alloc.h +++ b/src/ucs/datastruct/strided_alloc.h @@ -59,7 +59,7 @@ BEGIN_C_DECLS * @return Pointer to the desired element */ #define 
ucs_strided_elem_get(_elem, _stride_idx, _wanted_idx) \ - UCS_PTR_BYTE_OFFSET(_elem, UCS_STRIDED_ALLOC_STRIDE * \ + UCS_PTR_BYTE_OFFSET(_elem, (ptrdiff_t)UCS_STRIDED_ALLOC_STRIDE * \ ((ptrdiff_t)(_wanted_idx) - (ptrdiff_t)(_stride_idx))) diff --git a/src/ucs/datastruct/string_buffer.c b/src/ucs/datastruct/string_buffer.c new file mode 100644 index 00000000000..f5b7504c248 --- /dev/null +++ b/src/ucs/datastruct/string_buffer.c @@ -0,0 +1,145 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "string_buffer.h" + +#include +#include +#include +#include +#include +#include + + +#define UCS_STRING_BUFFER_GROW 32 +#define UCS_STRING_BUFFER_ALLOC_NAME "string_buffer" + + +static void ucs_string_buffer_reset(ucs_string_buffer_t *strb) +{ + strb->buffer = NULL; + strb->length = 0; + strb->capacity = 0; +} + +void ucs_string_buffer_init(ucs_string_buffer_t *strb) +{ + ucs_string_buffer_reset(strb); +} + +void ucs_string_buffer_cleanup(ucs_string_buffer_t *strb) +{ + ucs_free(strb->buffer); + ucs_string_buffer_reset(strb); +} + +/* + * Grow the buffer to at least the required size and at least double the + * previous size (to reduce the amortized cost of realloc) + */ +static ucs_status_t ucs_string_buffer_grow(ucs_string_buffer_t *strb, + size_t min_capacity) +{ + size_t new_capacity; + char *new_buffer; + + new_capacity = ucs_max(strb->capacity * 2, min_capacity); + new_buffer = ucs_realloc(strb->buffer, new_capacity, + UCS_STRING_BUFFER_ALLOC_NAME); + if (new_buffer == NULL) { + ucs_error("failed to grow string from %zu to %zu characters", + strb->capacity, new_capacity); + return UCS_ERR_NO_MEMORY; + } + + strb->buffer = new_buffer; + strb->capacity = new_capacity; + /* length stays the same */ + return UCS_OK; +} + +ucs_status_t ucs_string_buffer_appendf(ucs_string_buffer_t *strb, + const char *fmt, ...) 
+{ + ucs_status_t status; + size_t max_print; + va_list ap; + int ret; + + /* set minimal initial size */ + if ((strb->capacity - strb->length) <= 1) { + status = ucs_string_buffer_grow(strb, + strb->capacity + UCS_STRING_BUFFER_GROW); + if (status != UCS_OK) { + return status; + } + } + + /* try to write to existing buffer */ + va_start(ap, fmt); + max_print = strb->capacity - strb->length - 1; + ret = vsnprintf(strb->buffer + strb->length, max_print, fmt, ap); + va_end(ap); + + /* if failed, grow the buffer accommodate for the expected extra length */ + if (ret >= max_print) { + status = ucs_string_buffer_grow(strb, strb->length + ret + 1); + if (status != UCS_OK) { + return status; + } + + va_start(ap, fmt); + max_print = strb->capacity - strb->length; + ret = vsnprintf(strb->buffer + strb->length, max_print, fmt, ap); + va_end(ap); + + /* since we've grown the buffer, it should be sufficient now */ + ucs_assertv(ret < max_print, "ret=%d max_print=%zu", ret, max_print); + } + + /* string length grows by the amount of characters written by vsnprintf */ + strb->length += ret; + + ucs_assert(strb->length < strb->capacity); + ucs_assert(strb->buffer[strb->length] == '\0'); /* \0 is written by vsnprintf */ + + return UCS_OK; +} + +void ucs_string_buffer_rtrim(ucs_string_buffer_t *strb, const char *charset) +{ + char *ptr; + + ptr = &strb->buffer[strb->length]; + while (strb->length > 0) { + --ptr; + if (((charset == NULL) && !isspace(*ptr)) || + ((charset != NULL) && (strchr(charset, *ptr) == NULL))) { + /* if the last character should NOT be removed - stop */ + break; + } + + --strb->length; + } + + /* mark the new end of string */ + *(ptr + 1) = '\0'; +} + +const char *ucs_string_buffer_cstr(const ucs_string_buffer_t *strb) +{ + if (strb->length == 0) { + return ""; + } + + ucs_assert(strb->buffer != NULL); + ucs_assert(strb->capacity > 0); + return strb->buffer; +} diff --git a/src/ucs/datastruct/string_buffer.h b/src/ucs/datastruct/string_buffer.h new file mode 
100644 index 00000000000..8acec7f14d5 --- /dev/null +++ b/src/ucs/datastruct/string_buffer.h @@ -0,0 +1,86 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCS_STRING_BUFFER_H_ +#define UCS_STRING_BUFFER_H_ + +#include +#include +#include + + +BEGIN_C_DECLS + +/** + * String buffer - a dynamic NULL-terminated character buffer which can grow + * on demand. + */ +typedef struct ucs_string_buffer { + char *buffer; /* Buffer pointer */ + size_t length; /* Actual string length */ + size_t capacity; /* Allocated memory size */ +} ucs_string_buffer_t; + + +/** + * Initialize a string buffer + * + * @param [out] strb String buffer to initialize. + */ +void ucs_string_buffer_init(ucs_string_buffer_t *strb); + + +/** + * Cleanup a string buffer and release any memory associated with it. + * + * @param [out] strb String buffer to clean up. + */ +void ucs_string_buffer_cleanup(ucs_string_buffer_t *strb); + + +/** + * Append a formatted string to the string buffer. + * + * @param [inout] strb String buffer to append to. + * @param [in] fmt Format string. + * + * @return UCS_OK on success or UCS_ERR_NO_MEOMRY if could not allocate memory + * to grow the string. + */ +ucs_status_t ucs_string_buffer_appendf(ucs_string_buffer_t *strb, + const char *fmt, ...) + UCS_F_PRINTF(2, 3); + + +/** + * Remove specific characters from the end of the string. + * + * @param [inout] strb String buffer remote characters from. + * @param [in] charset C-string with the set of characters to remove. + * If NULL, this function removes whitespace characters, + * as defined by isspace (3). + * + * This function removes the largest contiguous suffix from the input string + * 'strb', which consists entirely of characters in 'charset'. + */ +void ucs_string_buffer_rtrim(ucs_string_buffer_t *strb, const char *charset); + + +/** + * Return a temporary pointer to a C-style string which represents the string + * buffer. 
The returned string is valid only as long as no other operation is + * done on the string buffer (including append). + * + * @param [in] strb String buffer to convert to a C-style string + * + * @return C-style string representing the data in the buffer. + */ +const char *ucs_string_buffer_cstr(const ucs_string_buffer_t *strb); + + +END_C_DECLS + +#endif diff --git a/src/ucs/datastruct/string_set.c b/src/ucs/datastruct/string_set.c new file mode 100644 index 00000000000..f8153aceb49 --- /dev/null +++ b/src/ucs/datastruct/string_set.c @@ -0,0 +1,148 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "string_set.h" + +#include +#include +#include + + +#define UCS_STRING_SET_ALLOC_NAME "string_set" + + +void ucs_string_set_init(ucs_string_set_t *sset) +{ + kh_init_inplace(ucs_string_set, sset); +} + +void ucs_string_set_cleanup(ucs_string_set_t *sset) +{ + char *str; + + kh_foreach_key(sset, str, { + ucs_free(str); + }); + kh_destroy_inplace(ucs_string_set, sset); +} + +/* Adds string by pointer, and releases the string if add fails or the string + * already exists in the set + */ +static ucs_status_t ucs_string_set_add_ptr(ucs_string_set_t *sset, char *str) +{ + int ret; + + kh_put(ucs_string_set, sset, str, &ret); + + switch (ret) { + case -1: + ucs_free(str); + return UCS_ERR_NO_MEMORY; + case 0: + /* key already present */ + ucs_free(str); + return UCS_OK; + case 1: + case 2: + /* key inserted */ + return UCS_OK; + default: + ucs_error("unexpected return value from kh_put(ucs_string_set): %d", ret); + return UCS_ERR_INVALID_PARAM; + } +} + +ucs_status_t ucs_string_set_add(ucs_string_set_t *sset, const char *str) +{ + char *str_copy; + + str_copy = ucs_strdup(str, UCS_STRING_SET_ALLOC_NAME); + if (str_copy == NULL) { + return UCS_ERR_NO_MEMORY; + } + + return ucs_string_set_add_ptr(sset, str_copy); +} + +ucs_status_t 
ucs_string_set_addf(ucs_string_set_t *sset, const char *fmt, ...) +{ + int length; + va_list ap; + char *str; + + va_start(ap, fmt); + length = vsnprintf(NULL, 0, fmt, ap); + va_end(ap); + + str = ucs_malloc(length + 1, UCS_STRING_SET_ALLOC_NAME); + if (str == NULL) { + return UCS_ERR_NO_MEMORY; + } + + va_start(ap, fmt); + vsnprintf(str, length + 1, fmt, ap); + va_end(ap); + + return ucs_string_set_add_ptr(sset, str); +} + +int ucs_string_set_contains(const ucs_string_set_t *sset, const char *str) +{ + return kh_get(ucs_string_set, sset, (char*)str) != kh_end(sset); +} + +static int ucs_string_set_compare_func(const void *a, const void *b) +{ + return strcmp(*(const char**)a, *(const char**)b); +} + +ucs_status_t ucs_string_set_print_sorted(const ucs_string_set_t *sset, + ucs_string_buffer_t *strb, + const char *sep) +{ + const char **sorted_strings; + ucs_status_t status; + size_t idx, count; + char *str; + + /* allocate a temporary array to hold the sorted strings */ + count = kh_size(sset); + sorted_strings = ucs_calloc(count, sizeof(*sorted_strings), "string_set"); + if (sorted_strings == NULL) { + status = UCS_ERR_NO_MEMORY; + goto out; + } + + /* collect and sort the strings */ + idx = 0; + kh_foreach_key(sset, str, { + sorted_strings[idx++] = str; + }) + ucs_assert(idx == count); + qsort(sorted_strings, count, sizeof(*sorted_strings), + ucs_string_set_compare_func); + + /* append the sorted strings to the string buffer */ + for (idx = 0; idx < count; ++idx) { + status = ucs_string_buffer_appendf(strb, "%s%s", (idx > 0) ? sep : "", + sorted_strings[idx]); + if (status != UCS_OK) { + goto out_free_array; + } + } + + status = UCS_OK; + +out_free_array: + ucs_free(sorted_strings); +out: + return status; +} diff --git a/src/ucs/datastruct/string_set.h b/src/ucs/datastruct/string_set.h new file mode 100644 index 00000000000..7acab233d56 --- /dev/null +++ b/src/ucs/datastruct/string_set.h @@ -0,0 +1,98 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2019. 
ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCS_STRING_SET_H_ +#define UCS_STRING_SET_H_ + +#include +#include +#include +#include +#include + +BEGIN_C_DECLS + + +/* + * Define ucs_string_set_t as a khash/set type + */ +KHASH_INIT(ucs_string_set, char*, char, 0, kh_str_hash_func, kh_str_hash_equal) +typedef khash_t(ucs_string_set) ucs_string_set_t; + + +/** + * Initialize a string set + * + * @param [out] sset String set to initialize. + */ +void ucs_string_set_init(ucs_string_set_t *sset); + + +/** + * Cleanup a string set and release any memory associated with it. + * + * @param [out] sset String set to clean up. + */ +void ucs_string_set_cleanup(ucs_string_set_t *sset); + + +/** + * Add a copy of a string to the string set + * + * @param [inout] sset String set to add to. + * @param [in] str String to add. The passed string can be released + * immediately after this call, since the contents of the + * string are copied to an internal buffer. + * + * @param UCS_OK if successful, or UCS_ERR_NO_MEMORY if could not allocate + * enough memory to add the string. + */ +ucs_status_t ucs_string_set_add(ucs_string_set_t *sset, const char *str); + + +/** + * Add a formatted string to the string set + * + * @param [inout] sset String set to add to. + * @param [in] fmt Format string to add. + * + * @param UCS_OK if successful, or UCS_ERR_NO_MEMORY if could not allocate + * enough memory to add the string. + */ +ucs_status_t ucs_string_set_addf(ucs_string_set_t *sset, const char *fmt, ...) + UCS_F_PRINTF(2, 3); + + +/** + * Check whether a string set contains a given string + * + * @param [in] sset String set to check. + * @param [in] str String to check if contained in the set. + * + * @return Nonzero if the string is contained in the set, 0 otherwise. 
+ */ +int ucs_string_set_contains(const ucs_string_set_t *sset, const char *str); + + +/** + * Print set contents to a string buffer in a lexicographical order + * + * @param [in] sset String set whose contents to print. + * @param [inout] strb Append the strings in the set to this string buffer. + * @param [in] sep Separator string to insert between every two printed + * strings, for example: "," + * + * @param UCS_OK if successful, or UCS_ERR_NO_MEMORY if could not allocate + * enough memory to sort the set or to grow the string buffer. + */ +ucs_status_t ucs_string_set_print_sorted(const ucs_string_set_t *sset, + ucs_string_buffer_t *strb, + const char *sep); + + +END_C_DECLS + +#endif diff --git a/src/ucs/debug/assert.c b/src/ucs/debug/assert.c index 8239550ded2..e22536cdfa4 100644 --- a/src/ucs/debug/assert.c +++ b/src/ucs/debug/assert.c @@ -4,12 +4,17 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "assert.h" #include #include #include #include +#include #include #include #include @@ -19,17 +24,13 @@ void ucs_fatal_error_message(const char *file, unsigned line, const char *function, char *message_buf) { char *message_line, *save_ptr = NULL; - const char *short_file; ucs_log_flush(); - short_file = strrchr(file, '/'); - short_file = (short_file == NULL) ? file : short_file + 1; - message_line = (message_buf == NULL) ? 
NULL : strtok_r(message_buf, "\n", &save_ptr); while (message_line != NULL) { - ucs_log_fatal_error("%13s:%-4u %s", short_file, line, message_line); + ucs_log_fatal_error("%13s:%-4u %s", ucs_basename(file), line, message_line); message_line = strtok_r(NULL, "\n", &save_ptr); } diff --git a/src/ucs/debug/assert.h b/src/ucs/debug/assert.h index b52fe53551c..ea45fafa1a1 100644 --- a/src/ucs/debug/assert.h +++ b/src/ucs/debug/assert.h @@ -51,7 +51,9 @@ BEGIN_C_DECLS "Fatal: " _fmt, ## __VA_ARGS__) -#if ENABLE_ASSERT +#if defined (ENABLE_ASSERT) || defined(__COVERITY__) || defined(__clang_analyzer__) + +#define UCS_ENABLE_ASSERT 1 /** * Generate a program bug report if assertions are enabled @@ -65,6 +67,8 @@ BEGIN_C_DECLS #else +#define UCS_ENABLE_ASSERT 0 + #define ucs_bug(...) #define ucs_assert(...) #define ucs_assertv(...) diff --git a/src/ucs/debug/debug.c b/src/ucs/debug/debug.c index 651225e5a06..73a0cd485cb 100644 --- a/src/ucs/debug/debug.c +++ b/src/ucs/debug/debug.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -28,11 +29,24 @@ KHASH_MAP_INIT_INT64(ucs_debug_symbol, char*); +KHASH_MAP_INIT_INT(ucs_signal_orig_action, struct sigaction*); #define UCS_GDB_MAX_ARGS 32 #define BACKTRACE_MAX 64 #define UCS_DEBUG_UNKNOWN_SYM "???" +#ifdef HAVE_DETAILED_BACKTRACE +# define UCS_DEBUG_BACKTRACE_LINE_FMT "%2d 0x%016lx %s() %s:%u\n" +# define UCS_DEBUG_BACKTRACE_LINE_ARG(_n, _line) \ + _n, (_line)->address, \ + (_line)->function ? (_line)->function : "??", \ + (_line)->file ? 
(_line)->file : "??", \ + (_line)->lineno +#else +# define UCS_DEBUG_BACKTRACE_LINE_FMT "%2d %s\n" +# define UCS_DEBUG_BACKTRACE_LINE_ARG(_n, _line) _n, (_line)->symbol +#endif + struct dl_address_search { unsigned long address; const char *filename; @@ -41,6 +55,34 @@ struct dl_address_search { #ifdef HAVE_DETAILED_BACKTRACE +#if HAVE_DECL_BFD_GET_SECTION_FLAGS +# define ucs_debug_bfd_section_flags(_abfd, _section) \ + bfd_get_section_flags(_abfd, _section) +#elif HAVE_DECL_BFD_SECTION_FLAGS +# define ucs_debug_bfd_section_flags(_abfd, _section) \ + bfd_section_flags(_section) +#else +# error "Unsupported BFD API" +#endif + +#if HAVE_DECL_BFD_GET_SECTION_VMA +# define ucs_debug_bfd_section_vma(_abfd, _section) \ + bfd_get_section_vma(_abfd, _section) +#elif HAVE_DECL_BFD_SECTION_VMA +# define ucs_debug_bfd_section_vma(_abfd, _section) \ + bfd_section_vma(_section) +#else +# error "Unsupported BFD API" +#endif + +#if HAVE_1_ARG_BFD_SECTION_SIZE +# define ucs_debug_bfd_section_size(_abfd, _section) \ + bfd_section_size(_section) +#else +# define ucs_debug_bfd_section_size(_abfd, _section) \ + bfd_section_size(_abfd, _section); +#endif + struct backtrace_line { unsigned long address; char *file; @@ -54,7 +96,6 @@ struct backtrace_file { asymbol **syms; }; -typedef struct backtrace *backtrace_h; struct backtrace { struct backtrace_line lines[BACKTRACE_MAX]; int size; @@ -70,6 +111,21 @@ struct backtrace_search { int max_lines; }; +#else /* HAVE_DETAILED_BACKTRACE */ + +struct backtrace_line { + void *address; + char *symbol; +}; + +struct backtrace { + char **symbols; + void *addresses[BACKTRACE_MAX]; + int size; + int position; + struct backtrace_line line; +}; + #endif /* HAVE_DETAILED_BACKTRACE */ #define UCS_SYS_SIGNAME(signame) [SIG ## signame] = #signame @@ -90,7 +146,9 @@ const char *ucs_signal_names[] = { UCS_SYS_SIGNAME(PIPE), UCS_SYS_SIGNAME(ALRM), UCS_SYS_SIGNAME(TERM), +#ifdef SIGSTKFLT UCS_SYS_SIGNAME(STKFLT), +#endif UCS_SYS_SIGNAME(CHLD), 
UCS_SYS_SIGNAME(CONT), UCS_SYS_SIGNAME(STOP), @@ -104,16 +162,34 @@ const char *ucs_signal_names[] = { UCS_SYS_SIGNAME(PROF), UCS_SYS_SIGNAME(WINCH), UCS_SYS_SIGNAME(IO), +#ifdef SIGPWR UCS_SYS_SIGNAME(PWR), +#endif UCS_SYS_SIGNAME(SYS), +#if defined __linux__ [SIGSYS + 1] = NULL +#elif defined __FreeBSD__ + [SIGRTMIN] = NULL +#else +#error "Port me" +#endif }; +#if HAVE_SIGACTION_SA_RESTORER static void *ucs_debug_signal_restorer = &ucs_debug_signal_restorer; +#endif static stack_t ucs_debug_signal_stack = {NULL, 0, 0}; -khash_t(ucs_debug_symbol) ucs_debug_symbols_cache; +static khash_t(ucs_debug_symbol) ucs_debug_symbols_cache; +static khash_t(ucs_signal_orig_action) ucs_signal_orig_action_map; +static ucs_recursive_spinlock_t ucs_kh_lock; + +static int ucs_debug_initialized = 0; + +#ifdef HAVE_CPLUS_DEMANGLE +extern char *cplus_demangle(const char *, int); +#endif static int ucs_debug_backtrace_is_excluded(void *address, const char *symbol); @@ -131,6 +207,8 @@ static char *ucs_debug_strdup(const char *str) return newstr; } +#ifdef HAVE_DETAILED_BACKTRACE + static int dl_match_address(struct dl_phdr_info *info, size_t size, void *data) { struct dl_address_search *dl = data; @@ -167,8 +245,6 @@ static int dl_lookup_address(struct dl_address_search *dl) return 1; } -#ifdef HAVE_DETAILED_BACKTRACE - /* * The dl member in file should be initialized */ @@ -223,7 +299,6 @@ static char *ucs_debug_demangle(const char *name) { char *demangled = NULL; #ifdef HAVE_CPLUS_DEMANGLE - extern char *cplus_demangle(const char *, int); demangled = cplus_demangle(name, 0); #endif return demangled ? 
demangled : strdup(name); @@ -240,17 +315,17 @@ static void find_address_in_section(bfd *abfd, asection *section, void *data) int found; if ((search->count > 0) || (search->max_lines == 0) || - ((bfd_get_section_flags(abfd, section) & SEC_ALLOC) == 0)) { + ((ucs_debug_bfd_section_flags(abfd, section) & SEC_ALLOC) == 0)) { return; } address = search->file->dl.address - search->file->dl.base; - vma = bfd_get_section_vma(abfd, section); + vma = ucs_debug_bfd_section_vma(abfd, section); if (address < vma) { return; } - size = bfd_section_size(abfd, section); + size = ucs_debug_bfd_section_size(abfd, section); if (address >= vma + size) { return; } @@ -294,26 +369,40 @@ static int get_line_info(struct backtrace_file *file, int backoff, /** * Create a backtrace from the calling location. - */ -static void ucs_debug_backtrace_create(struct backtrace *bckt) + * + * @param bckt Backtrace object. + * @param strip How many frames to strip. +*/ +ucs_status_t ucs_debug_backtrace_create(backtrace_h *bckt, int strip) { + size_t size = sizeof(**bckt); struct backtrace_file file; void *addresses[BACKTRACE_MAX]; int i, num_addresses; + ucs_status_t status; + + *bckt = NULL; + status = ucs_mmap_alloc(&size, (void**)bckt, 0 + UCS_MEMTRACK_NAME("debug backtrace object")); + if (status != UCS_OK) { + return status; + } num_addresses = backtrace(addresses, BACKTRACE_MAX); - bckt->size = 0; - bckt->position = 0; + (*bckt)->size = 0; + (*bckt)->position = strip; for (i = 0; i < num_addresses; ++i) { file.dl.address = (unsigned long)addresses[i]; if (dl_lookup_address(&file.dl) && load_file(&file)) { - bckt->size += get_line_info(&file, 1, bckt->lines + bckt->size, - BACKTRACE_MAX - bckt->size); + (*bckt)->size += get_line_info(&file, 1, + (*bckt)->lines + (*bckt)->size, + BACKTRACE_MAX - (*bckt)->size); unload_file(&file); } } + return UCS_OK; } /** @@ -321,7 +410,7 @@ static void ucs_debug_backtrace_create(struct backtrace *bckt) * * @param bckt Backtrace object. 
*/ -static void ucs_debug_backtrace_destroy(backtrace_h bckt) +void ucs_debug_backtrace_destroy(backtrace_h bckt) { int i; @@ -330,6 +419,7 @@ static void ucs_debug_backtrace_destroy(backtrace_h bckt) free(bckt->lines[i].file); } bckt->size = 0; + ucs_mmap_free(bckt, sizeof(*bckt)); } static ucs_status_t @@ -398,63 +488,24 @@ ucs_status_t ucs_debug_lookup_address(void *address, ucs_debug_address_info_t *i * Walk to the next backtrace line information. * * @param bckt Backtrace object. - * @param address Filled with backtrace address. - * @param file Filled with a pointer to the source file name. - * @param function Filled with a pointer to function name. - * @param lineno Filled with source line number. + * @param line Filled with backtrace frame info. * - * NOTE: the file and function memory remains valid as long as the backtrace - * object is not destroyed. + * NOTE: the line remains valid as long as the backtrace object is not destroyed. */ -int backtrace_next(backtrace_h bckt, unsigned long *address, char const ** file, - char const ** function, unsigned *lineno) +int ucs_debug_backtrace_next(backtrace_h bckt, backtrace_line_h *line) { - struct backtrace_line *line; - - if (bckt->position >= bckt->size) - return 0; - - line = &bckt->lines[bckt->position++]; - *address = line->address; - *file = line->file; - *function = line->function; - *lineno = line->lineno; - return 1; -} - -/* - * Filter specific functions from the head of the backtrace. 
- */ -void ucs_debug_print_backtrace(FILE *stream, int strip) -{ - const char *file, *function; - struct backtrace bckt; - unsigned long address; - unsigned line; - int exclude; - int i, n; - - ucs_debug_backtrace_create(&bckt); + backtrace_line_h ln; - fprintf(stream, "==== backtrace (tid:%7d) ====\n", ucs_get_tid()); - exclude = 1; - i = 0; - n = 0; - while (backtrace_next(&bckt, &address, &file, &function, &line)) { - if (i >= strip) { - exclude = exclude && ucs_debug_backtrace_is_excluded((void*)address, - function); - if (!exclude) { - fprintf(stream, "%2d 0x%016lx %s() %s:%u\n", n, address, - function ? function : "??", file ? file : "??", line); - ++n; - } + do { + if (bckt->position >= bckt->size) { + return 0; } - ++i; - } - fprintf(stream, "=================================\n"); - ucs_debug_backtrace_destroy(&bckt); + ln = &bckt->lines[bckt->position++]; + } while (ucs_debug_backtrace_is_excluded((void*)ln->address, ln->function)); + + *line = ln; + return 1; } static void ucs_debug_print_source_file(const char *file, unsigned line, @@ -490,37 +541,38 @@ static void ucs_debug_print_source_file(const char *file, unsigned line, static void ucs_debug_show_innermost_source_file(FILE *stream) { - const char *file, *function; - struct backtrace bckt; - unsigned long address; - unsigned line; + backtrace_h bckt; + backtrace_line_h bckt_line; + ucs_status_t status; - ucs_debug_backtrace_create(&bckt); - while (backtrace_next(&bckt, &address, &file, &function, &line)) { - if (!ucs_debug_backtrace_is_excluded((void*)address, function)) { - ucs_debug_print_source_file(file, line, function, stream); - break; - } + status = ucs_debug_backtrace_create(&bckt, 0); + if (status != UCS_OK) { + return; + } + + if (ucs_debug_backtrace_next(bckt, &bckt_line)) { + ucs_debug_print_source_file(bckt_line->file, bckt_line->lineno, + bckt_line->function, stream); } - ucs_debug_backtrace_destroy(&bckt); + ucs_debug_backtrace_destroy(bckt); } #else /* HAVE_DETAILED_BACKTRACE */ 
ucs_status_t ucs_debug_lookup_address(void *address, ucs_debug_address_info_t *info) { - Dl_info dlinfo; + Dl_info dl_info; int ret; - ret = dladdr(address, &dlinfo); + ret = dladdr(address, &dl_info); if (!ret) { return UCS_ERR_NO_ELEM; } - ucs_strncpy_safe(info->file.path, dlinfo.dli_fname, sizeof(info->file.path)); - info->file.base = (uintptr_t)dlinfo.dli_fbase; + ucs_strncpy_safe(info->file.path, dl_info.dli_fname, sizeof(info->file.path)); + info->file.base = (uintptr_t)dl_info.dli_fbase; ucs_strncpy_safe(info->function, - (dlinfo.dli_sname != NULL) ? dlinfo.dli_sname : UCS_DEBUG_UNKNOWN_SYM, + (dl_info.dli_sname != NULL) ? dl_info.dli_sname : UCS_DEBUG_UNKNOWN_SYM, sizeof(info->function)); ucs_strncpy_safe(info->source_file, UCS_DEBUG_UNKNOWN_SYM, sizeof(info->source_file)); info->line_number = 0; @@ -528,26 +580,62 @@ ucs_status_t ucs_debug_lookup_address(void *address, ucs_debug_address_info_t *i return UCS_OK; } -void ucs_debug_print_backtrace(FILE *stream, int strip) +/** + * Create a backtrace from the calling location. + */ +ucs_status_t ucs_debug_backtrace_create(backtrace_h *bckt, int strip) { - char **symbols; - void *addresses[BACKTRACE_MAX]; - int count, i, n; + size_t size = sizeof(**bckt); + ucs_status_t status; - fprintf(stream, "==== backtrace ====\n"); + *bckt = NULL; + status = ucs_mmap_alloc(&size, (void**)bckt, 0 + UCS_MEMTRACK_NAME("debug backtrace object")); + if (status != UCS_OK) { + return status; + } - count = backtrace(addresses, BACKTRACE_MAX); - symbols = backtrace_symbols(addresses, count); - n = 0; - for (i = strip; i < count; ++i) { - if (!ucs_debug_backtrace_is_excluded(addresses[i], symbols[i])) { - fprintf(stream, " %2d %s\n", n, symbols[i]); - ++n; + (*bckt)->size = backtrace((*bckt)->addresses, BACKTRACE_MAX); + (*bckt)->symbols = backtrace_symbols((*bckt)->addresses, (*bckt)->size); + (*bckt)->position = strip; + + return UCS_OK; +} + +/** + * Destroy a backtrace and free all memory. + * + * @param bckt Backtrace object. 
+ */ +void ucs_debug_backtrace_destroy(backtrace_h bckt) +{ + free(bckt->symbols); + ucs_mmap_free(bckt, sizeof(*bckt)); +} + +/** + * Walk to the next backtrace line information. + * + * @param bckt Backtrace object. + * @param line Filled with backtrace frame info. + * + * NOTE: the line remains valid as long as the backtrace object is not destroyed. + */ +int ucs_debug_backtrace_next(backtrace_h bckt, backtrace_line_h *line) +{ + while (bckt->position < bckt->size) { + bckt->line.address = bckt->addresses[bckt->position]; + bckt->line.symbol = bckt->symbols[bckt->position]; + bckt->position++; + + if (!ucs_debug_backtrace_is_excluded(bckt->line.address, + bckt->line.symbol)) { + *line = &bckt->line; + return 1; } } - free(symbols); - fprintf(stream, "===================\n"); + return 0; } static void ucs_debug_show_innermost_source_file(FILE *stream) @@ -556,6 +644,36 @@ static void ucs_debug_show_innermost_source_file(FILE *stream) #endif /* HAVE_DETAILED_BACKTRACE */ +/* + * Filter specific functions from the head of the backtrace. + */ +void ucs_debug_print_backtrace(FILE *stream, int strip) +{ + backtrace_h bckt; + backtrace_line_h bckt_line; + int i; + + ucs_debug_backtrace_create(&bckt, strip); + fprintf(stream, "==== backtrace (tid:%7d) ====\n", ucs_get_tid()); + for (i = 0; ucs_debug_backtrace_next(bckt, &bckt_line); ++i) { + fprintf(stream, UCS_DEBUG_BACKTRACE_LINE_FMT, + UCS_DEBUG_BACKTRACE_LINE_ARG(i, bckt_line)); + } + fprintf(stream, "=================================\n"); + + ucs_debug_backtrace_destroy(bckt); +} + +/* + * Filter specific functions from the head of the backtrace. 
+ */ +void ucs_debug_print_backtrace_line(char *buffer, size_t maxlen, + int frame_num, + backtrace_line_h line) +{ + snprintf(buffer, maxlen, UCS_DEBUG_BACKTRACE_LINE_FMT, + UCS_DEBUG_BACKTRACE_LINE_ARG(frame_num, line)); +} const char *ucs_debug_get_symbol_name(void *address) { @@ -613,7 +731,7 @@ static void ucs_debugger_attach() char* argv[6 + UCS_GDB_MAX_ARGS]; pid_t pid, debug_pid; int fd, ret, narg; - char *self_exe; + char UCS_V_UNUSED *self_exe; /* Fork a process which will execute gdb and attach to the current process. * We must avoid trigerring calls to malloc/free, since the heap may be corrupted. @@ -640,6 +758,11 @@ static void ucs_debugger_attach() argv[narg] = strtok(NULL, " \t"); } + /* Make coverity know that argv[0] will not be affected by TMPDIR */ + if (narg == 0) { + return; + } + if (!RUNNING_ON_VALGRIND) { snprintf(pid_str, sizeof(pid_str), "%d", debug_pid); argv[narg++] = "-p"; @@ -649,7 +772,7 @@ static void ucs_debugger_attach() /* Generate a file name for gdb commands */ memset(gdb_commands_file, 0, sizeof(gdb_commands_file)); snprintf(gdb_commands_file, sizeof(gdb_commands_file) - 1, - "/tmp/.gdbcommands.uid-%d", geteuid()); + "%s/.gdbcommands.uid-%d", ucs_get_tmpdir(), geteuid()); /* Write gdb commands and add the file to argv is successful */ fd = open(gdb_commands_file, O_WRONLY|O_TRUNC|O_CREAT, 0600); @@ -679,6 +802,7 @@ static void ucs_debugger_attach() argv[narg++] = NULL; /* Execute GDB */ + /* coverity[tainted_string] */ ret = execvp(argv[0], argv); if (ret < 0) { ucs_log_fatal_error("Failed to execute %s: %m", argv[0]); @@ -702,48 +826,24 @@ static void ucs_debug_stop_handler(int signo) ucs_debug_freeze(); } -static void ucs_debug_stop_other_threads() +static ucs_status_t ucs_debug_enum_threads_cb(pid_t tid, void *ctx) { - static const char *task_dir = "/proc/self/task"; - struct dirent *entry; - DIR *dir; int ret; - int tid; - - dir = opendir(task_dir); - if (dir == NULL) { - ucs_log_fatal_error("Unable to open %s: %m", 
task_dir); - return; - } - - signal(SIGUSR1, ucs_debug_stop_handler); - - for (;;) { - errno = 0; - entry = readdir(dir); - if (entry == NULL) { - if (errno != 0) { - ucs_log_fatal_error("Unable to read from %s: %m", task_dir); - } - break; - } - - if (!strncmp(entry->d_name, ".", 1)) { - continue; - } - - tid = atoi(entry->d_name); - if ((tid == 0) || (tid == ucs_get_tid())) { - continue; - } + if ((tid != 0) && (tid != ucs_get_tid())) { ret = ucs_tgkill(getpid(), tid, SIGUSR1); if (ret < 0) { - break; + return UCS_ERR_NO_MESSAGE; } } - closedir(dir); + return UCS_OK; +} + +static void ucs_debug_stop_other_threads() +{ + signal(SIGUSR1, ucs_debug_stop_handler); + ucs_sys_enum_threads(ucs_debug_enum_threads_cb, NULL); } static void ucs_debug_send_mail(const char *message) @@ -830,8 +930,12 @@ static const char *ucs_signal_cause_common(int si_code) case SI_TIMER : return "POSIX timer expired"; case SI_MESGQ : return "POSIX message queue state changed"; case SI_ASYNCIO : return "AIO completed"; +#ifdef SI_SIGIO case SI_SIGIO : return "queued SIGIO"; +#endif +#ifdef SI_TKILL case SI_TKILL : return "tkill(2) or tgkill(2)"; +#endif default : return ""; } } @@ -978,19 +1082,19 @@ void ucs_handle_error(const char *message) static int ucs_debug_is_error_signal(int signum) { - int i; + khiter_t hash_it; + int result; if (!ucs_global_opts.handle_errors) { return 0; } - for (i = 0; i < ucs_global_opts.error_signals.count; ++i) { - if (signum == ucs_global_opts.error_signals.signals[i]) { - return 1; - } - } - - return 0; + /* If this signal is error, but was disabled. 
*/ + ucs_recursive_spin_lock(&ucs_kh_lock); + hash_it = kh_get(ucs_signal_orig_action, &ucs_signal_orig_action_map, signum); + result = (hash_it != kh_end(&ucs_signal_orig_action_map)); + ucs_recursive_spin_unlock(&ucs_kh_lock); + return result; } static void* ucs_debug_get_orig_func(const char *symbol, void *replacement) @@ -1004,16 +1108,25 @@ static void* ucs_debug_get_orig_func(const char *symbol, void *replacement) return func_ptr; } +#if !HAVE_SIGHANDLER_T +#if HAVE___SIGHANDLER_T +typedef __sighandler_t *sighandler_t; +#else +#error "Port me" +#endif +#endif sighandler_t signal(int signum, sighandler_t handler) { - static sighandler_t (*orig)(int, sighandler_t) = NULL; + typedef sighandler_t (*sighandler_func_t)(int, sighandler_t); + + static sighandler_func_t orig = NULL; - if (ucs_debug_is_error_signal(signum)) { + if (ucs_debug_initialized && ucs_debug_is_error_signal(signum)) { return SIG_DFL; } if (orig == NULL) { - orig = ucs_debug_get_orig_func("signal", signal); + orig = (sighandler_func_t)ucs_debug_get_orig_func("signal", signal); } return orig(signum, handler); @@ -1022,10 +1135,12 @@ sighandler_t signal(int signum, sighandler_t handler) static int orig_sigaction(int signum, const struct sigaction *act, struct sigaction *oact) { - static int (*orig)(int, const struct sigaction*, struct sigaction*) = NULL; + typedef int (*sigaction_func_t)(int, const struct sigaction*, struct sigaction*); + + static sigaction_func_t orig = NULL; if (orig == NULL) { - orig = ucs_debug_get_orig_func("sigaction", sigaction); + orig = (sigaction_func_t)ucs_debug_get_orig_func("sigaction", sigaction); } return orig(signum, act, oact); @@ -1033,7 +1148,7 @@ static int orig_sigaction(int signum, const struct sigaction *act, int sigaction(int signum, const struct sigaction *act, struct sigaction *oact) { - if (ucs_debug_is_error_signal(signum)) { + if (ucs_debug_initialized && ucs_debug_is_error_signal(signum)) { return orig_sigaction(signum, NULL, oact); /* Return old, do 
not set new */ } @@ -1043,7 +1158,7 @@ int sigaction(int signum, const struct sigaction *act, struct sigaction *oact) static void ucs_debug_signal_handler(int signo) { ucs_log_flush(); - ucs_global_opts.log_level = UCS_LOG_LEVEL_TRACE_DATA; + ucs_global_opts.log_component.log_level = UCS_LOG_LEVEL_TRACE_DATA; ucs_profile_dump(); } @@ -1076,21 +1191,44 @@ static void ucs_debug_set_signal_alt_stack() ucs_debug_signal_stack.ss_size); } +static inline void ucs_debug_save_original_sighandler(int signum, + const struct sigaction* orig_handler) +{ + struct sigaction *oact_copy; + khiter_t hash_it; + int hash_extra_status; + + ucs_recursive_spin_lock(&ucs_kh_lock); + hash_it = kh_get(ucs_signal_orig_action, &ucs_signal_orig_action_map, signum); + if (hash_it != kh_end(&ucs_signal_orig_action_map)) { + goto out; + } + + oact_copy = ucs_malloc(sizeof(*orig_handler), "orig_sighandler"); + if (oact_copy == NULL) { + goto out; + } + + *oact_copy = *orig_handler; + hash_it = kh_put(ucs_signal_orig_action, + &ucs_signal_orig_action_map, + signum, &hash_extra_status); + kh_value(&ucs_signal_orig_action_map, hash_it) = oact_copy; + +out: + ucs_recursive_spin_unlock(&ucs_kh_lock); +} + static void ucs_set_signal_handler(void (*handler)(int, siginfo_t*, void *)) { struct sigaction sigact, old_action; int i; int ret; - if (handler == NULL) { - sigact.sa_handler = SIG_DFL; - sigact.sa_flags = 0; - } else { - sigact.sa_sigaction = handler; - sigact.sa_flags = SA_SIGINFO; - if (ucs_debug_signal_stack.ss_sp != NULL) { - sigact.sa_flags |= SA_ONSTACK; - } + sigact.sa_sigaction = handler; + sigact.sa_flags = SA_SIGINFO; + if (ucs_debug_signal_stack.ss_sp != NULL) { + sigact.sa_flags |= SA_ONSTACK; } sigemptyset(&sigact.sa_mask); @@ -1101,14 +1239,22 @@ static void ucs_set_signal_handler(void (*handler)(int, siginfo_t*, void *)) ucs_warn("failed to set signal handler for sig %d : %m", ucs_global_opts.error_signals.signals[i]); } +#if HAVE_SIGACTION_SA_RESTORER ucs_debug_signal_restorer = 
old_action.sa_restorer; +#endif + ucs_debug_save_original_sighandler(ucs_global_opts.error_signals.signals[i], &old_action); } } static int ucs_debug_backtrace_is_excluded(void *address, const char *symbol) { - return !strcmp(symbol, "ucs_handle_error") || - !strcmp(symbol, "ucs_fatal_error") || + return +#if HAVE_SIGACTION_SA_RESTORER + address == ucs_debug_signal_restorer || +#endif + !strcmp(symbol, "ucs_handle_error") || + !strcmp(symbol, "ucs_fatal_error_format") || + !strcmp(symbol, "ucs_fatal_error_message") || !strcmp(symbol, "ucs_error_freeze") || !strcmp(symbol, "ucs_error_signal_handler") || !strcmp(symbol, "ucs_debug_handle_error_signal") || @@ -1119,75 +1265,137 @@ static int ucs_debug_backtrace_is_excluded(void *address, const char *symbol) !strcmp(symbol, "ucs_log_dispatch") || !strcmp(symbol, "__ucs_log") || !strcmp(symbol, "ucs_debug_send_mail") || - (strstr(symbol, "_L_unlock_") == symbol) || - (address == ucs_debug_signal_restorer); + (strstr(symbol, "_L_unlock_") == symbol); } -static struct dl_address_search *ucs_debug_get_lib_info() +static ucs_status_t ucs_debug_get_lib_info(Dl_info *dl_info) { - static struct dl_address_search dl = {0, NULL, 0}; + int ret; - if (dl.address == 0) { - dl.address = (unsigned long)&ucs_debug_get_lib_info; - if (!dl_lookup_address(&dl)) { - dl.filename = NULL; - dl.base = 0; - } + (void)dlerror(); + ret = dladdr(ucs_debug_get_lib_info, dl_info); + if (ret == 0) { + return UCS_ERR_NO_MEMORY; } - /* If we failed to look up the address, return NULL */ - return (dl.filename == NULL || dl.base == 0) ? 
NULL : &dl; + return UCS_OK; } const char *ucs_debug_get_lib_path() { - static char ucs_lib_path[256] = {0}; - struct dl_address_search *dl; + ucs_status_t status; + Dl_info dl_info; - if (!strlen(ucs_lib_path)) { - dl = ucs_debug_get_lib_info(); - if (dl != NULL) { - ucs_expand_path(dl->filename, ucs_lib_path, sizeof(ucs_lib_path)); - } + status = ucs_debug_get_lib_info(&dl_info); + if (status != UCS_OK) { + return ""; } - return ucs_lib_path; + return dl_info.dli_fname; } unsigned long ucs_debug_get_lib_base_addr() { - struct dl_address_search *dl = ucs_debug_get_lib_info(); - return (dl == NULL) ? 0 : dl->base; + ucs_status_t status; + Dl_info dl_info; + + status = ucs_debug_get_lib_info(&dl_info); + if (status != UCS_OK) { + return 0; + } + + return (uintptr_t)dl_info.dli_fbase; } void ucs_debug_init() { + ucs_recursive_spinlock_init(&ucs_kh_lock, 0); + + kh_init_inplace(ucs_signal_orig_action, &ucs_signal_orig_action_map); kh_init_inplace(ucs_debug_symbol, &ucs_debug_symbols_cache); + if (ucs_global_opts.handle_errors) { ucs_debug_set_signal_alt_stack(); ucs_set_signal_handler(ucs_error_signal_handler); } if (ucs_global_opts.debug_signo > 0) { - signal(ucs_global_opts.debug_signo, ucs_debug_signal_handler); + struct sigaction sigact, old_action; + memset(&sigact, 0, sizeof(sigact)); + memset(&old_action, 0, sizeof(old_action)); + sigact.sa_handler = ucs_debug_signal_handler; + orig_sigaction(ucs_global_opts.debug_signo, &sigact, &old_action); + ucs_debug_save_original_sighandler(ucs_global_opts.debug_signo, &old_action); } #ifdef HAVE_DETAILED_BACKTRACE bfd_init(); #endif + + ucs_debug_initialized = 1; } void ucs_debug_cleanup(int on_error) { char *sym; + int signum; + struct sigaction *hndl; + ucs_status_t status; + + ucs_debug_initialized = 0; + + kh_foreach_key(&ucs_signal_orig_action_map, signum, + ucs_debug_disable_signal(signum)); - if (ucs_global_opts.handle_errors) { - ucs_set_signal_handler(NULL); - } - if (ucs_global_opts.debug_signo > 0) { - 
signal(ucs_global_opts.debug_signo, SIG_DFL); - } if (!on_error) { kh_foreach_value(&ucs_debug_symbols_cache, sym, ucs_free(sym)); + kh_foreach_value(&ucs_signal_orig_action_map, hndl, ucs_free(hndl)); kh_destroy_inplace(ucs_debug_symbol, &ucs_debug_symbols_cache); + kh_destroy_inplace(ucs_signal_orig_action, &ucs_signal_orig_action_map); + } + + status = ucs_recursive_spinlock_destroy(&ucs_kh_lock); + if (status != UCS_OK) { + ucs_warn("ucs_recursive_spinlock_destroy() failed (%d)", status); } } + +static inline void ucs_debug_disable_signal_nolock(int signum) +{ + khiter_t hash_it; + struct sigaction *original_action, ucs_action; + int ret; + + hash_it = kh_get(ucs_signal_orig_action, &ucs_signal_orig_action_map, + signum); + if (hash_it == kh_end(&ucs_signal_orig_action_map)) { + ucs_warn("ucs_debug_disable_signal: signal %d was not set in ucs", + signum); + return; + } + + original_action = kh_val(&ucs_signal_orig_action_map, hash_it); + ret = orig_sigaction(signum, original_action, &ucs_action); + if (ret < 0) { + ucs_warn("failed to set signal handler for sig %d : %m", signum); + } + + kh_del(ucs_signal_orig_action, &ucs_signal_orig_action_map, hash_it); + ucs_free(original_action); +} + +void ucs_debug_disable_signal(int signum) +{ + ucs_recursive_spin_lock(&ucs_kh_lock); + ucs_debug_disable_signal_nolock(signum); + ucs_recursive_spin_unlock(&ucs_kh_lock); +} + +void ucs_debug_disable_signals() +{ + int signum; + + ucs_recursive_spin_lock(&ucs_kh_lock); + kh_foreach_key(&ucs_signal_orig_action_map, signum, + ucs_debug_disable_signal_nolock(signum)); + ucs_recursive_spin_unlock(&ucs_kh_lock); +} diff --git a/src/ucs/debug/debug.h b/src/ucs/debug/debug.h index 9902c61e97d..66b90dc0a79 100644 --- a/src/ucs/debug/debug.h +++ b/src/ucs/debug/debug.h @@ -9,6 +9,7 @@ #include #include +#include #include @@ -26,6 +27,9 @@ typedef struct ucs_debug_address_info { } ucs_debug_address_info_t; +typedef struct backtrace *backtrace_h; +typedef struct backtrace_line 
*backtrace_line_h; + extern const char *ucs_state_detail_level_names[]; extern const char *ucs_signal_names[]; @@ -41,7 +45,18 @@ void ucs_debug_init(); */ void ucs_debug_cleanup(int on_error); +/** + * Disable signal handling in UCS for signal. + * Previous signal handler is set. + */ +void ucs_debug_disable_signal(int signum); +/** + * Disable signal handling in UCS for all signals + * that was set in ucs_global_opts.error_signals. + * Previous signal handlers are set. + */ +void ucs_debug_disable_signals(); /** * Get information about an address in the code of the current program. * @param address Address to look up. @@ -66,6 +81,46 @@ const char *ucs_debug_get_lib_path(); unsigned long ucs_debug_get_lib_base_addr(); +/** + * Create a backtrace from the calling location. + * + * @param bckt Backtrace object. + * @param strip How many frames to strip. +*/ +ucs_status_t ucs_debug_backtrace_create(backtrace_h *bckt, int strip); + + +/** + * Destroy a backtrace and free all memory. + * + * @param bckt Backtrace object. + */ +void ucs_debug_backtrace_destroy(backtrace_h bckt); + + +/** + * Walk to the next backtrace line information. + * + * @param bckt Backtrace object. + * @param line Filled with backtrace frame info. + * + * NOTE: the line remains valid as long as the backtrace object is not destroyed. + */ +int ucs_debug_backtrace_next(backtrace_h bckt, backtrace_line_h *line); + + +/** + * Print backtrace line to string buffer. + * + * @param buffer Target buffer to print to. + * @param maxlen Size of target buffer. + * @param frame_num Frame number + * @param line Backtrace line to print + */ +void ucs_debug_print_backtrace_line(char *buffer, size_t maxlen, + int frame_num, + backtrace_line_h line); + /** * Print backtrace to an output stream. * diff --git a/src/ucs/debug/log.c b/src/ucs/debug/log.c index 39f2de3bb9c..5ff78c1093d 100644 --- a/src/ucs/debug/log.c +++ b/src/ucs/debug/log.c @@ -4,22 +4,45 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "log.h" #include #include #include +#include #include #include #include #define UCS_MAX_LOG_HANDLERS 32 +#define UCS_LOG_TIME_FMT "[%lu.%06lu]" +#define UCS_LOG_FILE_FMT "%16s:%-4u" +#define UCS_LOG_METADATA_FMT "%-4s %-5s" +#define UCS_LOG_PROC_DATA_FMT "[%s:%-5d:%d]" +#define UCS_LOG_SHORT_FMT UCS_LOG_TIME_FMT" "UCS_LOG_FILE_FMT" " \ + UCS_LOG_METADATA_FMT" ""%s\n" +#define UCS_LOG_FMT UCS_LOG_TIME_FMT" "UCS_LOG_PROC_DATA_FMT" " \ + UCS_LOG_FILE_FMT" "UCS_LOG_METADATA_FMT" ""%s\n" + +#define UCS_LOG_TIME_ARG(_tv) (_tv)->tv_sec, (_tv)->tv_usec +#define UCS_LOG_SHORT_ARG(_short_file, _line, _level, _comp_conf, _tv, _message) \ + UCS_LOG_TIME_ARG(_tv), _short_file, _line, (_comp_conf)->name, \ + ucs_log_level_names[_level], _message +#define UCS_LOG_ARG(_short_file, _line, _level, _comp_conf, _tv, _message) \ + UCS_LOG_TIME_ARG(_tv), ucs_log_hostname, ucs_log_pid, \ + ucs_log_get_thread_num(),_short_file, _line, (_comp_conf)->name, \ + ucs_log_level_names[_level], _message const char *ucs_log_level_names[] = { [UCS_LOG_LEVEL_FATAL] = "FATAL", [UCS_LOG_LEVEL_ERROR] = "ERROR", [UCS_LOG_LEVEL_WARN] = "WARN", + [UCS_LOG_LEVEL_DIAG] = "DIAG", [UCS_LOG_LEVEL_INFO] = "INFO", [UCS_LOG_LEVEL_DEBUG] = "DEBUG", [UCS_LOG_LEVEL_TRACE] = "TRACE", @@ -32,22 +55,24 @@ const char *ucs_log_level_names[] = { [UCS_LOG_LEVEL_PRINT] = "PRINT" }; -static unsigned ucs_log_handlers_count = 0; +static unsigned ucs_log_handlers_count = 0; +static int ucs_log_initialized = 0; +static char ucs_log_hostname[HOST_NAME_MAX] = {0}; +static int ucs_log_pid = 0; +static FILE *ucs_log_file = NULL; +static char *ucs_log_file_base_name = NULL; +static int ucs_log_file_close = 0; +static int ucs_log_file_last_idx = 0; +static unsigned threads_count = 0; +static pthread_spinlock_t threads_lock = 0; +static pthread_t threads[128] = {0}; static ucs_log_func_t ucs_log_handlers[UCS_MAX_LOG_HANDLERS]; -static int ucs_log_initialized = 0; -static 
char ucs_log_hostname[256] = {0}; -static int ucs_log_pid = 0; -static FILE *ucs_log_file = NULL; -static int ucs_log_file_close = 0; -static unsigned threads_count = 0; -static pthread_spinlock_t threads_lock = 0; -static pthread_t threads[128] = {0}; static int ucs_log_get_thread_num(void) { pthread_t self = pthread_self(); - unsigned i; + int i; for (i = 0; i < threads_count; ++i) { if (threads[i] == self) { @@ -63,7 +88,7 @@ static int ucs_log_get_thread_num(void) } } - if (threads_count >= sizeof(threads) / sizeof(threads[0])) { + if (threads_count >= ucs_static_array_size(threads)) { i = -1; goto unlock_and_return_i; } @@ -88,48 +113,138 @@ void ucs_log_flush() size_t ucs_log_get_buffer_size() { return ucs_config_memunits_get(ucs_global_opts.log_buffer_size, - 256, 2048); + 256, UCS_ALLOCA_MAX_SIZE); +} + +static void ucs_log_get_file_name(char *log_file_name, size_t max, int idx) +{ + ucs_assert(idx <= ucs_global_opts.log_file_rotate); + + if (idx == 0) { + ucs_strncpy_zero(log_file_name, ucs_log_file_base_name, max); + return; + } + + ucs_snprintf_zero(log_file_name, max, "%s.%d", + ucs_log_file_base_name, idx); +} + +static void ucs_log_file_rotate() +{ + char old_log_file_name[PATH_MAX]; + char new_log_file_name[PATH_MAX]; + int idx, ret; + + if (ucs_log_file_last_idx == ucs_global_opts.log_file_rotate) { + /* remove the last file and log rotation from the + * `log_file_rotate - 1` file */ + ucs_log_get_file_name(old_log_file_name, + sizeof(old_log_file_name), + ucs_log_file_last_idx); + unlink(old_log_file_name); + } else { + ucs_log_file_last_idx++; + } + + ucs_assert(ucs_log_file_last_idx <= ucs_global_opts.log_file_rotate); + + for (idx = ucs_log_file_last_idx - 1; idx >= 0; --idx) { + ucs_log_get_file_name(old_log_file_name, + sizeof(old_log_file_name), idx); + ucs_log_get_file_name(new_log_file_name, + sizeof(new_log_file_name), idx + 1); + + if (access(old_log_file_name, W_OK) != 0) { + ucs_fatal("unable to write to %s", old_log_file_name); + } + + 
/* coverity[toctou] */ + ret = rename(old_log_file_name, new_log_file_name); + if (ret) { + ucs_fatal("failed to rename %s to %s: %m", + old_log_file_name, new_log_file_name); + } + + + if (access(old_log_file_name, F_OK) != -1) { + ucs_fatal("%s must not exist on the filesystem", old_log_file_name); + } + + if (access(new_log_file_name, W_OK) != 0) { + ucs_fatal("unable to write to %s", new_log_file_name); + } + } +} + +static void ucs_log_handle_file_max_size(int log_entry_len) +{ + const char *next_token; + + /* check if it is necessary to find a new storage for logs */ + if ((log_entry_len + ftell(ucs_log_file)) < ucs_global_opts.log_file_size) { + return; + } + + fclose(ucs_log_file); + + if (ucs_global_opts.log_file_rotate != 0) { + ucs_log_file_rotate(); + } else { + unlink(ucs_log_file_base_name); + } + + ucs_open_output_stream(ucs_log_file_base_name, UCS_LOG_LEVEL_FATAL, + &ucs_log_file, &ucs_log_file_close, + &next_token, NULL); } static void ucs_log_print(size_t buffer_size, const char *short_file, int line, - ucs_log_level_t level, const struct timeval *tv, - const char *message) + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const struct timeval *tv, const char *message) { - char *valg_buf; + char *log_buf; + int log_entry_len; if (RUNNING_ON_VALGRIND) { - valg_buf = ucs_alloca(buffer_size + 1); - snprintf(valg_buf, buffer_size, - "[%lu.%06lu] %16s:%-4u %-4s %-5s %s\n", tv->tv_sec, tv->tv_usec, - short_file, line, "UCX", ucs_log_level_names[level], - message); - VALGRIND_PRINTF("%s", valg_buf); + log_buf = ucs_alloca(buffer_size + 1); + snprintf(log_buf, buffer_size, UCS_LOG_SHORT_FMT, + UCS_LOG_SHORT_ARG(short_file, line, level, + comp_conf, tv, message)); + VALGRIND_PRINTF("%s", log_buf); } else if (ucs_log_initialized) { - fprintf(ucs_log_file, - "[%lu.%06lu] [%s:%-5d:%d] %16s:%-4u %-4s %-5s %s\n", - tv->tv_sec, tv->tv_usec, ucs_log_hostname, ucs_log_pid, - ucs_log_get_thread_num(), short_file, line, "UCX", - 
ucs_log_level_names[level], message); + if (ucs_log_file_close) { /* non-stdout/stderr */ + /* get log entry size */ + log_entry_len = snprintf(NULL, 0, UCS_LOG_FMT, + UCS_LOG_ARG(short_file, line, level, + comp_conf, tv, message)); + ucs_log_handle_file_max_size(log_entry_len); + } + + fprintf(ucs_log_file, UCS_LOG_FMT, + UCS_LOG_ARG(short_file, line, level, + comp_conf, tv, message)); } else { - fprintf(stdout, - "[%lu.%06lu] %16s:%-4u %-4s %-5s %s\n", - tv->tv_sec, tv->tv_usec, short_file, line, - "UCX", ucs_log_level_names[level], message); + fprintf(stdout, UCS_LOG_SHORT_FMT, + UCS_LOG_SHORT_ARG(short_file, line, level, + comp_conf, tv, message)); } } ucs_log_func_rc_t ucs_log_default_handler(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *format, va_list ap) + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *format, va_list ap) { size_t buffer_size = ucs_log_get_buffer_size(); - char *log_line, *saveptr; - const char *short_file; + char *saveptr = ""; + char *log_line; struct timeval tv; char *buf; - if (!ucs_log_is_enabled(level) && (level != UCS_LOG_LEVEL_PRINT)) { - return UCS_LOG_FUNC_RC_CONTINUE; + if (!ucs_log_component_is_enabled(level, comp_conf) && (level != UCS_LOG_LEVEL_PRINT)) { + return UCS_LOG_FUNC_RC_CONTINUE; } buf = ucs_alloca(buffer_size + 1); @@ -139,13 +254,12 @@ ucs_log_default_handler(const char *file, unsigned line, const char *function, if (level <= ucs_global_opts.log_level_trigger) { ucs_fatal_error_message(file, line, function, buf); } else { - short_file = strrchr(file, '/'); - short_file = (short_file == NULL) ? 
file : short_file + 1; gettimeofday(&tv, NULL); log_line = strtok_r(buf, "\n", &saveptr); while (log_line != NULL) { - ucs_log_print(buffer_size, short_file, line, level, &tv, log_line); + ucs_log_print(buffer_size, ucs_basename(file), line, level, comp_conf, + &tv, log_line); log_line = strtok_r(NULL, "\n", &saveptr); } } @@ -178,19 +292,21 @@ unsigned ucs_log_num_handlers() } void ucs_log_dispatch(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *format, ...) + ucs_log_level_t level, ucs_log_component_config_t *comp_conf, + const char *format, ...) { ucs_log_func_rc_t rc; - unsigned index; + unsigned idx; va_list ap; /* Call handlers in reverse order */ rc = UCS_LOG_FUNC_RC_CONTINUE; - index = ucs_log_handlers_count; - while ((index > 0) && (rc == UCS_LOG_FUNC_RC_CONTINUE)) { - --index; + idx = ucs_log_handlers_count; + while ((idx > 0) && (rc == UCS_LOG_FUNC_RC_CONTINUE)) { + --idx; va_start(ap, format); - rc = ucs_log_handlers[index](file, line, function, level, format, ap); + rc = ucs_log_handlers[idx](file, line, function, + level, comp_conf, format, ap); va_end(ap); } } @@ -289,41 +405,15 @@ const char *ucs_log_bitmap_to_str(unsigned n, uint8_t *bitmap, size_t length) return buf; } - -const char * ucs_log_dump_hex(const void* data, size_t length, char *buf, - size_t max) -{ - static const char hexchars[] = "0123456789abcdef"; - char *p, *endp; - uint8_t value; - size_t i; - - p = buf; - endp = buf + max - 2; - - i = 0; - while ((p < endp) && (i < length)) { - if (((i % 4) == 0) && (i > 0)) { - *(p++) = ':'; - } - value = *(uint8_t*)(data + i); - p[0] = hexchars[value / 16]; - p[1] = hexchars[value % 16]; - p += 2; - ++i; - } - *p = 0; - return buf; -} - void ucs_log_early_init() { - ucs_log_initialized = 0; - ucs_log_hostname[0] = 0; - ucs_log_pid = getpid(); - ucs_log_file = NULL; - ucs_log_file_close = 0; - threads_count = 0; + ucs_log_initialized = 0; + ucs_log_hostname[0] = 0; + ucs_log_pid = getpid(); + 
ucs_log_file = NULL; + ucs_log_file_last_idx = 0; + ucs_log_file_close = 0; + threads_count = 0; pthread_spin_init(&threads_lock, 0); } @@ -337,25 +427,70 @@ void ucs_log_init() ucs_log_initialized = 1; /* Set this to 1 immediately to avoid infinite recursion */ + if (ucs_global_opts.log_file_size < ucs_log_get_buffer_size()) { + ucs_fatal("the maximal log file size (%zu) has to be >= %zu", + ucs_global_opts.log_file_size, + ucs_log_get_buffer_size()); + } + + if (ucs_global_opts.log_file_rotate > INT_MAX) { + ucs_fatal("the log file rotate (%u) has to be <= %d", + ucs_global_opts.log_file_rotate, INT_MAX); + } + + strcpy(ucs_log_hostname, ucs_get_host_name()); - ucs_log_file = stdout; - ucs_log_file_close = 0; + ucs_log_file = stdout; + ucs_log_file_base_name = NULL; + ucs_log_file_close = 0; + ucs_log_file_last_idx = 0; ucs_log_push_handler(ucs_log_default_handler); if (strlen(ucs_global_opts.log_file) != 0) { - ucs_open_output_stream(ucs_global_opts.log_file, UCS_LOG_LEVEL_FATAL, - &ucs_log_file, &ucs_log_file_close, &next_token); + ucs_open_output_stream(ucs_global_opts.log_file, UCS_LOG_LEVEL_FATAL, + &ucs_log_file, &ucs_log_file_close, + &next_token, &ucs_log_file_base_name); } } void ucs_log_cleanup() { + ucs_assert(ucs_log_initialized); + ucs_log_flush(); if (ucs_log_file_close) { fclose(ucs_log_file); } + pthread_spin_destroy(&threads_lock); + + ucs_free(ucs_log_file_base_name); + ucs_log_file_base_name = NULL; ucs_log_file = NULL; + ucs_log_file_last_idx = 0; ucs_log_initialized = 0; ucs_log_handlers_count = 0; } + +void ucs_log_print_backtrace(ucs_log_level_t level) +{ + backtrace_h bckt; + backtrace_line_h bckt_line; + int i; + char buf[1024]; + ucs_status_t status; + + status = ucs_debug_backtrace_create(&bckt, 1); + if (status != UCS_OK) { + return; + } + + ucs_log(level, "==== backtrace (tid:%7d) ====\n", ucs_get_tid()); + for (i = 0; ucs_debug_backtrace_next(bckt, &bckt_line); ++i) { + ucs_debug_print_backtrace_line(buf, sizeof(buf), i, bckt_line); 
+ ucs_log(level, "%s", buf); + } + ucs_log(level, "=================================\n"); + + ucs_debug_backtrace_destroy(bckt); +} diff --git a/src/ucs/debug/log.h b/src/ucs/debug/log.h index 001f68f9c98..1e26bcd0f9d 100644 --- a/src/ucs/debug/log.h +++ b/src/ucs/debug/log.h @@ -1,10 +1,9 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ - #ifndef UCS_LOG_H_ #define UCS_LOG_H_ @@ -12,151 +11,6 @@ # include "config.h" /* Defines UCS_MAX_LOG_LEVEL */ #endif -#include -#include -#include -#include - - -BEGIN_C_DECLS - -/** @file log.h */ - -#define ucs_log_is_enabled(_level) \ - ucs_unlikely(((_level) <= UCS_MAX_LOG_LEVEL) && ((_level) <= (ucs_global_opts.log_level))) - - -#define ucs_log(_level, _fmt, ...) \ - do { \ - if (ucs_log_is_enabled(_level)) { \ - ucs_log_dispatch(__FILE__, __LINE__, __FUNCTION__, (_level), \ - _fmt, ## __VA_ARGS__); \ - } \ - } while (0) - - -#define ucs_error(_fmt, ...) ucs_log(UCS_LOG_LEVEL_ERROR, _fmt, ## __VA_ARGS__) -#define ucs_warn(_fmt, ...) ucs_log(UCS_LOG_LEVEL_WARN, _fmt, ## __VA_ARGS__) -#define ucs_info(_fmt, ...) ucs_log(UCS_LOG_LEVEL_INFO, _fmt, ## __VA_ARGS__) -#define ucs_debug(_fmt, ...) ucs_log(UCS_LOG_LEVEL_DEBUG, _fmt, ## __VA_ARGS__) -#define ucs_trace(_fmt, ...) ucs_log(UCS_LOG_LEVEL_TRACE, _fmt, ## __VA_ARGS__) -#define ucs_trace_req(_fmt, ...) ucs_log(UCS_LOG_LEVEL_TRACE_REQ, _fmt, ## __VA_ARGS__) -#define ucs_trace_data(_fmt, ...) ucs_log(UCS_LOG_LEVEL_TRACE_DATA, _fmt, ## __VA_ARGS__) -#define ucs_trace_async(_fmt, ...) ucs_log(UCS_LOG_LEVEL_TRACE_ASYNC, _fmt, ## __VA_ARGS__) -#define ucs_trace_func(_fmt, ...) ucs_log(UCS_LOG_LEVEL_TRACE_FUNC, "%s(" _fmt ")", __FUNCTION__, ## __VA_ARGS__) -#define ucs_trace_poll(_fmt, ...) ucs_log(UCS_LOG_LEVEL_TRACE_POLL, _fmt, ## __VA_ARGS__) - - -/** - * Print a message regardless of current log level. 
Output can be - * enabled/disabled via environment variable/configuration settings. - * - * During debugging it can be useful to add a few prints to the code - * without changing a current log level. Also it is useful to be able - * to see messages only from specific processes. For example, one may - * want to see prints only from rank 0 when debugging MPI. - * - * The function is intended for debugging only. It should not be used - * in the real code. - */ - -#define ucs_print(_fmt, ...) \ - do { \ - if (ucs_global_opts.log_print_enable) { \ - ucs_log_dispatch(__FILE__, __LINE__, __FUNCTION__, \ - UCS_LOG_LEVEL_PRINT, _fmt, ## __VA_ARGS__); \ - } \ - } while(0) - - -typedef enum { - UCS_LOG_FUNC_RC_STOP, - UCS_LOG_FUNC_RC_CONTINUE -} ucs_log_func_rc_t; - - -/** - * Function type for handling log messages. - * - * @param file Source file name. - * @param line Source line number. - * @param function Function name. - * @param message Log message - format string - * @param ap Log message format parameters. - * - * @return UCS_LOG_FUNC_RC_CONTINUE - continue to next log handler - * UCS_LOG_FUNC_RC_STOP - don't continue - */ -typedef ucs_log_func_rc_t (*ucs_log_func_t)(const char *file, unsigned line, - const char *function, ucs_log_level_t level, - const char *message, va_list ap); - - -extern const char *ucs_log_level_names[]; -extern const char *ucs_log_category_names[]; - - -/** - * Dispatch a logging message. - * - * @param [in] file Source file name. - * @param [in] line Source line number. - * @param [in] function Function name which generated the log. - * @param [in] level Log level of the message, - * @param [in] message Log format - */ -void ucs_log_dispatch(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *format, ...) - UCS_F_PRINTF(5, 6); - - -/** - * Flush logging output. 
- */ -void ucs_log_flush(); - - -/** - * @return Configured log buffer size - */ -size_t ucs_log_get_buffer_size(); - - -/** - * Default log handler, which prints the message to the output configured in - * UCS global options. See @ref ucs_log_func_t. - */ -ucs_log_func_rc_t -ucs_log_default_handler(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *format, va_list ap); - - -/** - * Show a fatal error - */ -void ucs_log_fatal_error(const char *format, ...); - - -/** - * Initialize/cleanup logging subsystem. - */ -void ucs_log_early_init(); -void ucs_log_init(); -void ucs_log_cleanup(); - - -const char *ucs_log_bitmap_to_str(unsigned n, uint8_t *bitmap, size_t length); - -const char *ucs_log_dump_hex(const void* data, size_t length, char *buf, - size_t max); - -/** - * Add/remove logging handlers - */ -void ucs_log_push_handler(ucs_log_func_t handler); -void ucs_log_pop_handler(); -unsigned ucs_log_num_handlers(); - -END_C_DECLS +#include /* Contains actual logger implementation */ #endif diff --git a/src/ucs/debug/log_def.h b/src/ucs/debug/log_def.h new file mode 100644 index 00000000000..8aa5795c2ef --- /dev/null +++ b/src/ucs/debug/log_def.h @@ -0,0 +1,181 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifndef UCS_LOG_DEF_H_ +#define UCS_LOG_DEF_H_ + +#ifndef UCS_MAX_LOG_LEVEL +# define UCS_MAX_LOG_LEVEL UCS_LOG_LEVEL_TRACE_LAST +#endif + +#include +#include +#include +#include + + +BEGIN_C_DECLS + +/** @file log_def.h */ + +#define ucs_log_component_is_enabled(_level, _comp_log_config) \ + ucs_unlikely(((_level) <= UCS_MAX_LOG_LEVEL) && \ + ((_level) <= (((ucs_log_component_config_t*)(_comp_log_config))->log_level))) + +#define ucs_log_is_enabled(_level) \ + ucs_log_component_is_enabled(_level, &ucs_global_opts.log_component) + +#define ucs_log_component(_level, _comp_log_config, _fmt, ...) 
\ + do { \ + if (ucs_log_component_is_enabled(_level, _comp_log_config)) { \ + ucs_log_dispatch(__FILE__, __LINE__, __func__, \ + (ucs_log_level_t)(_level), _comp_log_config, _fmt, ## __VA_ARGS__); \ + } \ + } while (0) + +#define ucs_log(_level, _fmt, ...) \ + do { \ + ucs_log_component(_level, &ucs_global_opts.log_component, _fmt, ## __VA_ARGS__); \ + } while (0) + +#define ucs_error(_fmt, ...) ucs_log(UCS_LOG_LEVEL_ERROR, _fmt, ## __VA_ARGS__) +#define ucs_warn(_fmt, ...) ucs_log(UCS_LOG_LEVEL_WARN, _fmt, ## __VA_ARGS__) +#define ucs_diag(_fmt, ...) ucs_log(UCS_LOG_LEVEL_DIAG, _fmt, ## __VA_ARGS__) +#define ucs_info(_fmt, ...) ucs_log(UCS_LOG_LEVEL_INFO, _fmt, ## __VA_ARGS__) +#define ucs_debug(_fmt, ...) ucs_log(UCS_LOG_LEVEL_DEBUG, _fmt, ## __VA_ARGS__) +#define ucs_trace(_fmt, ...) ucs_log(UCS_LOG_LEVEL_TRACE, _fmt, ## __VA_ARGS__) +#define ucs_trace_req(_fmt, ...) ucs_log(UCS_LOG_LEVEL_TRACE_REQ, _fmt, ## __VA_ARGS__) +#define ucs_trace_data(_fmt, ...) ucs_log(UCS_LOG_LEVEL_TRACE_DATA, _fmt, ## __VA_ARGS__) +#define ucs_trace_async(_fmt, ...) ucs_log(UCS_LOG_LEVEL_TRACE_ASYNC, _fmt, ## __VA_ARGS__) +#define ucs_trace_func(_fmt, ...) ucs_log(UCS_LOG_LEVEL_TRACE_FUNC, "%s(" _fmt ")", __FUNCTION__, ## __VA_ARGS__) +#define ucs_trace_poll(_fmt, ...) ucs_log(UCS_LOG_LEVEL_TRACE_POLL, _fmt, ## __VA_ARGS__) + + +/** + * Print a message regardless of current log level. Output can be + * enabled/disabled via environment variable/configuration settings. + * + * During debugging it can be useful to add a few prints to the code + * without changing a current log level. Also it is useful to be able + * to see messages only from specific processes. For example, one may + * want to see prints only from rank 0 when debugging MPI. + * + * The function is intended for debugging only. It should not be used + * in the real code. + */ + +#define ucs_print(_fmt, ...) 
\ + do { \ + if (ucs_global_opts.log_print_enable) { \ + ucs_log_dispatch(__FILE__, __LINE__, __FUNCTION__, \ + UCS_LOG_LEVEL_PRINT, &ucs_global_opts.log_component, _fmt, ## __VA_ARGS__); \ + } \ + } while(0) + + +typedef enum { + UCS_LOG_FUNC_RC_STOP, + UCS_LOG_FUNC_RC_CONTINUE +} ucs_log_func_rc_t; + +/** + * Function type for handling log messages. + * + * @param file Source file name. + * @param line Source line number. + * @param function Function name. + * @param level Log level. + * @param comp_conf Component specific log config. + * @param message Log message - format string. + * @param ap Log message format parameters. + * + * @return UCS_LOG_FUNC_RC_CONTINUE - continue to next log handler. + * UCS_LOG_FUNC_RC_STOP - don't continue. + */ +typedef ucs_log_func_rc_t (*ucs_log_func_t)(const char *file, unsigned line, + const char *function, ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap); + + +extern const char *ucs_log_level_names[]; +extern const char *ucs_log_category_names[]; + + +/** + * Dispatch a logging message. + * + * @param [in] file Source file name. + * @param [in] line Source line number. + * @param [in] function Function name which generated the log. + * @param [in] level Log level of the message. + * @param [in] comp_conf Component log config. + * @param [in] message Log format. + */ +void ucs_log_dispatch(const char *file, unsigned line, const char *function, + ucs_log_level_t level, ucs_log_component_config_t *comp_conf, + const char *format, ...) + UCS_F_PRINTF(6, 7); + + +/** + * Flush logging output. + */ +void ucs_log_flush(); + + +/** + * @return Configured log buffer size + */ +size_t ucs_log_get_buffer_size(); + + +/** + * Default log handler, which prints the message to the output configured in + * UCS global options. See @ref ucs_log_func_t. 
+ */ +ucs_log_func_rc_t +ucs_log_default_handler(const char *file, unsigned line, const char *function, + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *format, va_list ap); + + +/** + * Show a fatal error + */ +void ucs_log_fatal_error(const char *format, ...); + + +/** + * Initialize/cleanup logging subsystem. + */ +void ucs_log_early_init(); +void ucs_log_init(); +void ucs_component_log_init(); +void ucs_log_cleanup(); + + +const char *ucs_log_bitmap_to_str(unsigned n, uint8_t *bitmap, size_t length); + +/** + * Add/remove logging handlers + */ +void ucs_log_push_handler(ucs_log_func_t handler); +void ucs_log_pop_handler(); +unsigned ucs_log_num_handlers(); + + +/** + * Log backtrace. + * + * @param level Log level. + */ +void ucs_log_print_backtrace(ucs_log_level_t level); + +END_C_DECLS + +#endif diff --git a/src/ucs/debug/memtrack.c b/src/ucs/debug/memtrack.c index a1656c36532..a6eeefb3246 100644 --- a/src/ucs/debug/memtrack.c +++ b/src/ucs/debug/memtrack.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "memtrack.h" #include @@ -11,11 +15,10 @@ #include #include #include -#include #include -#if ENABLE_MEMTRACK +#ifdef ENABLE_MEMTRACK #define UCS_MEMTRACK_FORMAT_STRING ("%22s: size: %9lu / %9lu\tcount: %9u / %9u\n") @@ -34,18 +37,17 @@ typedef struct ucs_memtrack_context { ucs_memtrack_entry_t total; khash_t(ucs_memtrack_ptr_hash) ptrs; khash_t(ucs_memtrack_entry_hash) entries; - UCS_STATS_NODE_DECLARE(stats); + UCS_STATS_NODE_DECLARE(stats) } ucs_memtrack_context_t; /* Global context for tracking allocated memory */ static ucs_memtrack_context_t ucs_memtrack_context = { .enabled = 0, - .lock = PTHREAD_MUTEX_INITIALIZER, - .total = {0} + .lock = PTHREAD_MUTEX_INITIALIZER }; -#if ENABLE_STATS +#ifdef ENABLE_STATS static ucs_stats_class_t ucs_memtrack_stats_class = { .name = "memtrack", .num_counters = UCS_MEMTRACK_STAT_LAST, @@ -197,11 +199,19 @@ void *ucs_realloc(void *ptr, size_t size, const char *name) return ptr; } -void *ucs_memalign(size_t boundary, size_t size, const char *name) +int ucs_posix_memalign(void **ptr, size_t boundary, size_t size, const char *name) { - void *ptr = memalign(boundary, size); - ucs_memtrack_allocated(ptr, size, name); - return ptr; + int ret; + +#if HAVE_POSIX_MEMALIGN + ret = posix_memalign(ptr, boundary, size); +#else +#error "Port me" +#endif + if (ret == 0) { + ucs_memtrack_allocated(*ptr, size, name); + } + return ret; } void ucs_free(void *ptr) @@ -310,7 +320,7 @@ static void ucs_memtrack_generate_report() status = ucs_open_output_stream(ucs_global_opts.memtrack_dest, UCS_LOG_LEVEL_ERROR, &output_stream, - &need_close, &next_token); + &need_close, &next_token, NULL); if (status != UCS_OK) { return; } @@ -357,8 +367,6 @@ void ucs_memtrack_cleanup() return; } - pthread_mutex_lock(&ucs_memtrack_context.lock); - ucs_memtrack_generate_report(); /* disable before releasing the stats node */ @@ -373,8 +381,6 @@ void ucs_memtrack_cleanup() /* destroy hash tables */ 
kh_destroy_inplace(ucs_memtrack_entry_hash, &ucs_memtrack_context.entries); kh_destroy_inplace(ucs_memtrack_ptr_hash, &ucs_memtrack_context.ptrs); - - pthread_mutex_unlock(&ucs_memtrack_context.lock); } int ucs_memtrack_is_enabled() diff --git a/src/ucs/debug/memtrack.h b/src/ucs/debug/memtrack.h index 72e1a0968dd..8e58e0f563a 100644 --- a/src/ucs/debug/memtrack.h +++ b/src/ucs/debug/memtrack.h @@ -1,7 +1,7 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2013. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2014. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -14,7 +14,6 @@ #include #include -#include #include @@ -42,7 +41,7 @@ typedef struct ucs_memtrack_entry { -#if ENABLE_MEMTRACK +#ifdef ENABLE_MEMTRACK #define UCS_MEMTRACK_ARG , const char* alloc_name #define UCS_MEMTRACK_VAL , alloc_name @@ -51,13 +50,13 @@ typedef struct ucs_memtrack_entry { /** - * Start trakcing memory (or increment reference count). + * Start tracking memory (or increment reference count). */ void ucs_memtrack_init(); /** - * Stop trakcing memory (or decrement reference count). + * Stop tracking memory (or decrement reference count). 
*/ void ucs_memtrack_cleanup(); @@ -104,7 +103,8 @@ void ucs_memtrack_releasing(void *ptr); void *ucs_malloc(size_t size, const char *name); void *ucs_calloc(size_t nmemb, size_t size, const char *name); void *ucs_realloc(void *ptr, size_t size, const char *name); -void *ucs_memalign(size_t boundary, size_t size, const char *name); +int ucs_posix_memalign(void **ptr, size_t boundary, size_t size, + const char *name); void ucs_free(void *ptr); void *ucs_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset, const char *name); @@ -131,7 +131,9 @@ char *ucs_strndup(const char *src, size_t n, const char *name); #define ucs_malloc(_s, ...) malloc(_s) #define ucs_calloc(_n, _s, ...) calloc(_n, _s) #define ucs_realloc(_p, _s, ...) realloc(_p, _s) -#define ucs_memalign(_b, _s, ...) memalign(_b, _s) +#if HAVE_POSIX_MEMALIGN +#define ucs_posix_memalign(_pp, _b, _s, ...) posix_memalign(_pp, _b, _s) +#endif #define ucs_free(_p) free(_p) #define ucs_mmap(_a, _l, _p, _fl, _fd, _o, ...) mmap(_a, _l, _p, _fl, _fd, _o) #define ucs_munmap(_a, _l) munmap(_a, _l) diff --git a/src/ucs/memory/memory_type.c b/src/ucs/memory/memory_type.c new file mode 100644 index 00000000000..94a1eb1a8d3 --- /dev/null +++ b/src/ucs/memory/memory_type.c @@ -0,0 +1,33 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "memory_type.h" + +#include + + +const char *ucs_memory_type_names[] = { + [UCS_MEMORY_TYPE_HOST] = "host", + [UCS_MEMORY_TYPE_CUDA] = "cuda" , + [UCS_MEMORY_TYPE_CUDA_MANAGED] = "cuda-managed", + [UCS_MEMORY_TYPE_ROCM] = "rocm", + [UCS_MEMORY_TYPE_ROCM_MANAGED] = "rocm-managed", + [UCS_MEMORY_TYPE_LAST] = "unknown" +}; + +const char *ucs_memory_type_descs[] = { + [UCS_MEMORY_TYPE_HOST] = "System memory", + [UCS_MEMORY_TYPE_CUDA] = "NVIDIA GPU memory" , + [UCS_MEMORY_TYPE_CUDA_MANAGED] = "NVIDIA GPU managed/unified memory", + [UCS_MEMORY_TYPE_ROCM] = "AMD/ROCm GPU memory", + [UCS_MEMORY_TYPE_ROCM_MANAGED] = "AMD/ROCm GPU managed memory", + [UCS_MEMORY_TYPE_LAST] = "unknown" +}; + diff --git a/src/ucs/memory/memory_type.h b/src/ucs/memory/memory_type.h new file mode 100644 index 00000000000..f8eb7b02019 --- /dev/null +++ b/src/ucs/memory/memory_type.h @@ -0,0 +1,50 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + + +#ifndef UCS_MEMORY_TYPE_H_ +#define UCS_MEMORY_TYPE_H_ + +#include + +BEGIN_C_DECLS + + +/* Memory types accessible from CPU */ +#define UCS_MEMORY_TYPES_CPU_ACCESSIBLE \ + (UCS_BIT(UCS_MEMORY_TYPE_HOST) | \ + UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED) | \ + UCS_BIT(UCS_MEMORY_TYPE_ROCM_MANAGED)) + + +/* + * @ingroup UCS_RESOURCE + * Memory types + */ +typedef enum ucs_memory_type { + UCS_MEMORY_TYPE_HOST, /**< Default system memory */ + UCS_MEMORY_TYPE_CUDA, /**< NVIDIA CUDA memory */ + UCS_MEMORY_TYPE_CUDA_MANAGED, /**< NVIDIA CUDA managed (or unified) memory*/ + UCS_MEMORY_TYPE_ROCM, /**< AMD ROCM memory */ + UCS_MEMORY_TYPE_ROCM_MANAGED, /**< AMD ROCM managed system memory */ + UCS_MEMORY_TYPE_LAST +} ucs_memory_type_t; + + +/** + * Array of string names for each memory type + */ +extern const char *ucs_memory_type_names[]; + +/** + * Array of string descriptions for each memory type + */ +extern const char *ucs_memory_type_descs[]; + + +END_C_DECLS + +#endif diff --git a/src/ucs/memory/memtype_cache.c b/src/ucs/memory/memtype_cache.c index de783811fa4..a0e105760cb 100644 --- a/src/ucs/memory/memtype_cache.c +++ b/src/ucs/memory/memtype_cache.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "memtype_cache.h" #include @@ -18,10 +22,20 @@ #include +typedef enum { + UCS_MEMTYPE_CACHE_ACTION_SET_MEMTYPE, + UCS_MEMTYPE_CACHE_ACTION_REMOVE +} ucs_memtype_cache_action_t; + static ucs_pgt_dir_t *ucs_memtype_cache_pgt_dir_alloc(const ucs_pgtable_t *pgtable) { - return ucs_memalign(UCS_PGT_ENTRY_MIN_ALIGN, sizeof(ucs_pgt_dir_t), - "memtype_cache_pgdir"); + void *ptr; + int ret; + + ret = ucs_posix_memalign(&ptr, + ucs_max(sizeof(void *), UCS_PGT_ENTRY_MIN_ALIGN), + sizeof(ucs_pgt_dir_t), "memtype_cache_pgdir"); + return (ret == 0) ? 
ptr : NULL; } static void ucs_memtype_cache_pgt_dir_release(const ucs_pgtable_t *pgtable, @@ -30,135 +44,228 @@ static void ucs_memtype_cache_pgt_dir_release(const ucs_pgtable_t *pgtable, ucs_free(dir); } -static UCS_F_ALWAYS_INLINE void -ucs_memtype_cache_insert(ucs_memtype_cache_t *memtype_cache, void *address, - size_t size, ucm_mem_type_t mem_type) +/* + * - Lock must be held in write mode + * - start, end must be aligned to page size + */ +static void ucs_memtype_cache_insert(ucs_memtype_cache_t *memtype_cache, + ucs_pgt_addr_t start, ucs_pgt_addr_t end, + ucs_memory_type_t mem_type) { ucs_memtype_cache_region_t *region; - ucs_pgt_addr_t start, end; ucs_status_t status; - - ucs_trace("memtype_cache:insert address:%p length:%zu mem_type:%d", - address, size, mem_type); - - pthread_rwlock_wrlock(&memtype_cache->lock); - - /* Align to page size */ - start = ucs_align_down_pow2((uintptr_t)address, UCS_PGT_ADDR_ALIGN); - end = ucs_align_up_pow2 ((uintptr_t)address + size, UCS_PGT_ADDR_ALIGN); - region = NULL; + int ret; /* Allocate structure for new region */ - region = ucs_memalign(UCS_PGT_ENTRY_MIN_ALIGN, sizeof(ucs_memtype_cache_region_t), - "memtype_cache_region"); - if (region == NULL) { + ret = ucs_posix_memalign((void **)®ion, + ucs_max(sizeof(void *), UCS_PGT_ENTRY_MIN_ALIGN), + sizeof(ucs_memtype_cache_region_t), + "memtype_cache_region"); + if (ret != 0) { ucs_warn("failed to allocate memtype_cache region"); - goto out_unlock; + return; } + ucs_assert((start % UCS_PGT_ADDR_ALIGN) == 0); + ucs_assert((end % UCS_PGT_ADDR_ALIGN) == 0); + region->super.start = start; region->super.end = end; region->mem_type = mem_type; + status = UCS_PROFILE_CALL(ucs_pgtable_insert, &memtype_cache->pgtable, ®ion->super); if (status != UCS_OK) { ucs_error("failed to insert region " UCS_PGT_REGION_FMT ": %s", UCS_PGT_REGION_ARG(®ion->super), ucs_status_string(status)); ucs_free(region); - goto out_unlock; + return; } -out_unlock: - pthread_rwlock_unlock(&memtype_cache->lock); 
+ ucs_trace("memtype_cache: insert " UCS_PGT_REGION_FMT " mem_type %s", + UCS_PGT_REGION_ARG(®ion->super), + ucs_memory_type_names[mem_type]); } -static UCS_F_ALWAYS_INLINE void -ucs_memtype_cache_delete(ucs_memtype_cache_t *memtype_cache, void *address, - size_t size, ucm_mem_type_t mem_type) +static void ucs_memtype_cache_region_collect_callback(const ucs_pgtable_t *pgtable, + ucs_pgt_region_t *pgt_region, + void *arg) { - ucs_pgt_addr_t start = (uintptr_t)address; - ucs_pgt_region_t *pgt_region; - ucs_memtype_cache_region_t *region; + ucs_memtype_cache_region_t *region = ucs_derived_of(pgt_region, + ucs_memtype_cache_region_t); + ucs_list_link_t *list = arg; + ucs_list_add_tail(list, ®ion->list); +} + +UCS_PROFILE_FUNC_VOID(ucs_memtype_cache_update_internal, + (memtype_cache, address, size, mem_type, action), + ucs_memtype_cache_t *memtype_cache, const void *address, + size_t size, ucs_memory_type_t mem_type, + ucs_memtype_cache_action_t action) +{ + ucs_memtype_cache_region_t *region, *tmp; + UCS_LIST_HEAD(region_list); + ucs_pgt_addr_t start, end, search_start, search_end; ucs_status_t status; - ucs_trace("memtype_cache:delete address:%p length:%zu mem_type:%d", - address, size, mem_type); + if (!size) { + return; + } - pthread_rwlock_rdlock(&memtype_cache->lock); + start = ucs_align_down_pow2((uintptr_t)address, UCS_PGT_ADDR_ALIGN); + end = ucs_align_up_pow2 ((uintptr_t)address + size, UCS_PGT_ADDR_ALIGN); + + ucs_trace("%s: [0x%lx..0x%lx] mem_type %s", + ((action == UCS_MEMTYPE_CACHE_ACTION_SET_MEMTYPE) ? 
+ "update" : "remove"), + start, end, ucs_memory_type_names[mem_type]); + + if (action == UCS_MEMTYPE_CACHE_ACTION_SET_MEMTYPE) { + /* try to find regions that are contiguous and instersected + * with current one */ + search_start = start - 1; + search_end = end; + } else { + /* try to find regions that are instersected with current one */ + search_start = start; + search_end = end - 1; + } - pgt_region = UCS_PROFILE_CALL(ucs_pgtable_lookup, &memtype_cache->pgtable, start); - assert(pgt_region != NULL); + pthread_rwlock_wrlock(&memtype_cache->lock); - region = ucs_derived_of(pgt_region, ucs_memtype_cache_region_t); + /* find and remove all regions which intersect with new one */ + ucs_pgtable_search_range(&memtype_cache->pgtable, search_start, search_end, + ucs_memtype_cache_region_collect_callback, + ®ion_list); + ucs_list_for_each_safe(region, tmp, ®ion_list, list) { + if (action == UCS_MEMTYPE_CACHE_ACTION_SET_MEMTYPE) { + if (region->mem_type == mem_type) { + /* merge current region with overlapping or adjacent regions + * of same memory type */ + start = ucs_min(start, region->super.start); + end = ucs_max(end, region->super.end); + } else if ((region->super.end < start) || + (region->super.start >= end)) { + /* ignore regions which are not really overlapping and can't + * be merged because of different memory types */ + ucs_list_del(®ion->list); + continue; + } + } + + status = ucs_pgtable_remove(&memtype_cache->pgtable, ®ion->super); + if (status != UCS_OK) { + ucs_error("failed to remove " UCS_PGT_REGION_FMT + " from memtype_cache: %s", + UCS_PGT_REGION_ARG(®ion->super), + ucs_status_string(status)); + goto out_unlock; + } + + ucs_trace("memtype_cache: removed " UCS_PGT_REGION_FMT " %s", + UCS_PGT_REGION_ARG(®ion->super), + ucs_memory_type_names[region->mem_type]); + } - status = ucs_pgtable_remove(&memtype_cache->pgtable, ®ion->super); - if (status != UCS_OK) { - ucs_warn("failed to remove address:%p from memtype_cache", address); + if (action == 
UCS_MEMTYPE_CACHE_ACTION_SET_MEMTYPE) { + ucs_memtype_cache_insert(memtype_cache, start, end, mem_type); + } + + /* slice old regions by the new region, to preserve the previous memory type + * of the non-overlapping parts + */ + ucs_list_for_each_safe(region, tmp, ®ion_list, list) { + if (start > region->super.start) { + /* create previous region */ + ucs_memtype_cache_insert(memtype_cache, region->super.start, start, + region->mem_type); + } + if (end < region->super.end) { + /* create next region */ + ucs_memtype_cache_insert(memtype_cache, end, region->super.end, + region->mem_type); + } + + ucs_free(region); } - ucs_free(region); + +out_unlock: pthread_rwlock_unlock(&memtype_cache->lock); } +void ucs_memtype_cache_update(ucs_memtype_cache_t *memtype_cache, + const void *address, size_t size, + ucs_memory_type_t mem_type) +{ + ucs_memtype_cache_update_internal(memtype_cache, address, size, mem_type, + UCS_MEMTYPE_CACHE_ACTION_SET_MEMTYPE); +} + +void ucs_memtype_cache_remove(ucs_memtype_cache_t *memtype_cache, + const void *address, size_t size) +{ + ucs_memtype_cache_update_internal(memtype_cache, address, size, + UCS_MEMORY_TYPE_LAST, + UCS_MEMTYPE_CACHE_ACTION_REMOVE); +} + static void ucs_memtype_cache_event_callback(ucm_event_type_t event_type, ucm_event_t *event, void *arg) { ucs_memtype_cache_t *memtype_cache = arg; + ucs_memtype_cache_action_t action; if (event_type & UCM_EVENT_MEM_TYPE_ALLOC) { - ucs_memtype_cache_insert(memtype_cache, event->mem_type.address, - event->mem_type.size, event->mem_type.mem_type); + action = UCS_MEMTYPE_CACHE_ACTION_SET_MEMTYPE; } else if (event_type & UCM_EVENT_MEM_TYPE_FREE) { - ucs_memtype_cache_delete(memtype_cache, event->mem_type.address, - event->mem_type.size, event->mem_type.mem_type); + action = UCS_MEMTYPE_CACHE_ACTION_REMOVE; + } else { + return; } -} -static void ucs_memtype_cache_region_collect_callback(const ucs_pgtable_t *pgtable, - ucs_pgt_region_t *pgt_region, - void *arg) -{ - ucs_memtype_cache_region_t 
*region = ucs_derived_of(pgt_region, - ucs_memtype_cache_region_t); - ucs_list_link_t *list = arg; - ucs_list_add_tail(list, ®ion->list); + ucs_memtype_cache_update_internal(memtype_cache, event->mem_type.address, + event->mem_type.size, + event->mem_type.mem_type, action); } static void ucs_memtype_cache_purge(ucs_memtype_cache_t *memtype_cache) { ucs_memtype_cache_region_t *region, *tmp; - ucs_list_link_t region_list; + UCS_LIST_HEAD(region_list); ucs_trace_func("memtype_cache purge"); - ucs_list_head_init(®ion_list); - ucs_pgtable_purge(&memtype_cache->pgtable, ucs_memtype_cache_region_collect_callback, - ®ion_list); + ucs_pgtable_purge(&memtype_cache->pgtable, + ucs_memtype_cache_region_collect_callback, ®ion_list); ucs_list_for_each_safe(region, tmp, ®ion_list, list) { - ucs_warn("destroying inuse address:%p ", (void *)region->super.start); ucs_free(region); } } UCS_PROFILE_FUNC(ucs_status_t, ucs_memtype_cache_lookup, - (memtype_cache, address, length, ucm_mem_type), - ucs_memtype_cache_t *memtype_cache, void *address, - size_t length, ucm_mem_type_t *ucm_mem_type) + (memtype_cache, address, size, mem_type_p), + ucs_memtype_cache_t *memtype_cache, const void *address, + size_t size, ucs_memory_type_t *mem_type_p) { - ucs_pgt_addr_t start = (uintptr_t)address; - ucs_pgt_region_t *pgt_region; + const ucs_pgt_addr_t start = (uintptr_t)address; ucs_memtype_cache_region_t *region; + ucs_pgt_region_t *pgt_region; ucs_status_t status; pthread_rwlock_rdlock(&memtype_cache->lock); - pgt_region = UCS_PROFILE_CALL(ucs_pgtable_lookup, &memtype_cache->pgtable, start); - if (pgt_region && pgt_region->end >= (start + length)) { - region = ucs_derived_of(pgt_region, ucs_memtype_cache_region_t); - *ucm_mem_type = region->mem_type; - status = UCS_OK; + pgt_region = UCS_PROFILE_CALL(ucs_pgtable_lookup, &memtype_cache->pgtable, + start); + if (pgt_region == NULL) { + status = UCS_ERR_NO_ELEM; goto out_unlock; } - status = UCS_ERR_NO_ELEM; + + region = ucs_derived_of(pgt_region, 
ucs_memtype_cache_region_t); + *mem_type_p = ((pgt_region->end >= (start + size)) ? + region->mem_type : UCS_MEMORY_TYPE_LAST); + status = UCS_OK; + out_unlock: pthread_rwlock_unlock(&memtype_cache->lock); return status; @@ -182,9 +289,14 @@ static UCS_CLASS_INIT_FUNC(ucs_memtype_cache_t) goto err_destroy_rwlock; } - status = ucm_set_event_handler((UCM_EVENT_MEM_TYPE_ALLOC | UCM_EVENT_MEM_TYPE_FREE), - 1000, ucs_memtype_cache_event_callback, self); - if (status != UCS_OK) { + status = ucm_set_event_handler(UCM_EVENT_MEM_TYPE_ALLOC | + UCM_EVENT_MEM_TYPE_FREE | + UCM_EVENT_FLAG_EXISTING_ALLOC, + 1000, ucs_memtype_cache_event_callback, + self); + if ((status != UCS_OK) && (status != UCS_ERR_UNSUPPORTED)) { + ucs_error("failed to set UCM memtype event handler: %s", + ucs_status_string(status)); goto err_cleanup_pgtable; } diff --git a/src/ucs/memory/memtype_cache.h b/src/ucs/memory/memtype_cache.h index 209be050057..708f6e144b9 100644 --- a/src/ucs/memory/memtype_cache.h +++ b/src/ucs/memory/memtype_cache.h @@ -7,10 +7,16 @@ #ifndef UCS_MEMTYPE_CACHE_H_ #define UCS_MEMTYPE_CACHE_H_ +#include "memory_type.h" + #include #include #include -#include +#include +#include + + +BEGIN_C_DECLS typedef struct ucs_memtype_cache ucs_memtype_cache_t; typedef struct ucs_memtype_cache_region ucs_memtype_cache_region_t; @@ -19,7 +25,7 @@ typedef struct ucs_memtype_cache_region ucs_memtype_cache_region_t; struct ucs_memtype_cache_region { ucs_pgt_region_t super; /**< Base class - page table region */ ucs_list_link_t list; /**< List element */ - ucm_mem_type_t mem_type; /**< Memory type the address belongs to */ + ucs_memory_type_t mem_type; /**< Memory type the address belongs to */ }; @@ -33,6 +39,8 @@ struct ucs_memtype_cache { * Create a memtype cache. * * @param [out] memtype_cache_p Filled with a pointer to the memtype cache. + * + * @return Error code. 
*/ ucs_status_t ucs_memtype_cache_create(ucs_memtype_cache_t **memtype_cache_p); @@ -45,17 +53,50 @@ ucs_status_t ucs_memtype_cache_create(ucs_memtype_cache_t **memtype_cache_p); void ucs_memtype_cache_destroy(ucs_memtype_cache_t *memtype_cache); -/** Find if address range is in memtype cache. +/** + * Find if address range is in memtype cache. * - * @param [in] memtype_cache Memtype cache to search - * @param [in] address Address to lookup - * @param [in] length Length of the memory - * @param [out] ucm_mem_type Memory type of the address + * @param [in] memtype_cache Memtype cache to search. + * @param [in] address Address to lookup. + * @param [in] size Length of the memory. + * @param [out] mem_type_p Set to the memory type of the address range. + * UCS_MEMORY_TYPE_LAST is a special value which + * means the memory type is an unknown non-host + * memory, and should be detected in another way. * * @return Error code. */ -ucs_status_t ucs_memtype_cache_lookup(ucs_memtype_cache_t *memtype_cache, void *address, - size_t length, ucm_mem_type_t *ucm_mem_type); +ucs_status_t +ucs_memtype_cache_lookup(ucs_memtype_cache_t *memtype_cache, const void *address, + size_t size, ucs_memory_type_t *mem_type_p); + + +/** + * Update the memory type of an address range. + * Can be used after @ucs_memtype_cache_lookup returns UCM_MEM_TYPE_LAST, to + * set the memory type after it was detected. + * + * @param [in] memtype_cache Memtype cache to update. + * @param [in] address Start address to update. + * @param [in] size Size of the memory to update. + * @param [out] mem_type Set the memory type of the address range to this + * value. + */ +void ucs_memtype_cache_update(ucs_memtype_cache_t *memtype_cache, + const void *address, size_t size, + ucs_memory_type_t mem_type); + + +/** + * Remove the address range from a memtype cache. + * + * @param [in] memtype_cache Memtype cache to remove. + * @param [in] address Start address to remove. 
+ * @param [in] size Size of the memory to remove. + */ +void ucs_memtype_cache_remove(ucs_memtype_cache_t *memtype_cache, + const void *address, size_t size); +END_C_DECLS #endif diff --git a/src/ucs/memory/numa.c b/src/ucs/memory/numa.c index b7d7b5f7f98..b2af49f7398 100644 --- a/src/ucs/memory/numa.c +++ b/src/ucs/memory/numa.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "numa.h" #include diff --git a/src/ucs/memory/rcache.c b/src/ucs/memory/rcache.c index 0594be3b278..6ae0c8b01ad 100644 --- a/src/ucs/memory/rcache.c +++ b/src/ucs/memory/rcache.c @@ -4,6 +4,9 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif #include #include @@ -14,6 +17,7 @@ #include #include #include +#include #include #include "rcache.h" @@ -38,6 +42,27 @@ #define ucs_rcache_region_pfn(_region) \ ((_region)->priv) +#define ucs_rcache_region_pfn_ptr(_region) \ + ((_region)->pfn) + + +enum { + /* Need to page table lock while destroying */ + UCS_RCACHE_REGION_PUT_FLAG_TAKE_PGLOCK = UCS_BIT(0), + /* Instead of actually destroying the region, add it to garbage collection + * list. This is used when region put is done in the context of memory + * event callback. 
*/ + UCS_RCACHE_REGION_PUT_FLAG_ADD_TO_GC = UCS_BIT(1), +#if UCS_ENABLE_ASSERT + /* Region is expected to reach a reference count of 0 and be destroyed */ + UCS_RCACHE_REGION_PUT_FLAG_MUST_DESTROY = UCS_BIT(2), + /* Region is expected to be present in the page table */ + UCS_RCACHE_REGION_PUT_FLAG_IN_PGTABLE = UCS_BIT(3) +#else + UCS_RCACHE_REGION_PUT_FLAG_MUST_DESTROY = 0, + UCS_RCACHE_REGION_PUT_FLAG_IN_PGTABLE = 0 +#endif +}; typedef struct ucs_rcache_inv_entry { @@ -47,7 +72,13 @@ typedef struct ucs_rcache_inv_entry { } ucs_rcache_inv_entry_t; -#if ENABLE_STATS +typedef struct { + ucs_rcache_t *rcache; + ucs_rcache_region_t *region; +} ucs_rcache_region_validate_pfn_t; + + +#ifdef ENABLE_STATS static ucs_stats_class_t ucs_rcache_stats_class = { .name = "rcache", .num_counters = UCS_RCACHE_STAT_LAST, @@ -87,7 +118,7 @@ static void __ucs_rcache_region_log(const char *file, int line, const char *func strcpy(region_desc, ""); } - ucs_log_dispatch(file, line, function, level, + ucs_log_dispatch(file, line, function, level, &ucs_global_opts.log_component, "%s: %s region " UCS_PGT_REGION_FMT " %c%c "UCS_RCACHE_PROT_FMT" ref %u %s", rcache->name, message, UCS_PGT_REGION_ARG(®ion->super), @@ -100,14 +131,24 @@ static void __ucs_rcache_region_log(const char *file, int line, const char *func static ucs_pgt_dir_t *ucs_rcache_pgt_dir_alloc(const ucs_pgtable_t *pgtable) { - return ucs_memalign(UCS_PGT_ENTRY_MIN_ALIGN, sizeof(ucs_pgt_dir_t), - "rcache_pgdir"); + ucs_rcache_t *rcache = ucs_container_of(pgtable, ucs_rcache_t, pgtable); + ucs_pgt_dir_t *dir; + + ucs_spin_lock(&rcache->lock); + dir = ucs_mpool_get(&rcache->mp); + ucs_spin_unlock(&rcache->lock); + + return dir; } static void ucs_rcache_pgt_dir_release(const ucs_pgtable_t *pgtable, ucs_pgt_dir_t *dir) { - ucs_free(dir); + ucs_rcache_t *rcache = ucs_container_of(pgtable, ucs_rcache_t, pgtable); + + ucs_spin_lock(&rcache->lock); + ucs_mpool_put(dir); + ucs_spin_unlock(&rcache->lock); } static ucs_status_t 
ucs_rcache_mp_chunk_alloc(ucs_mpool_t *mp, size_t *size_p, @@ -126,8 +167,8 @@ static ucs_status_t ucs_rcache_mp_chunk_alloc(ucs_mpool_t *mp, size_t *size_p, /* Store the size in the first bytes of the chunk */ *(size_t*)ptr = size; - *chunk_p = ptr + sizeof(size_t); - *size_p = size - sizeof(size_t); + *chunk_p = UCS_PTR_BYTE_OFFSET(ptr, sizeof(size_t)); + *size_p = size - sizeof(size_t); return UCS_OK; } @@ -137,7 +178,7 @@ static void ucs_rcache_mp_chunk_release(ucs_mpool_t *mp, void *chunk) void *ptr; int ret; - ptr = chunk - sizeof(size_t); + ptr = UCS_PTR_BYTE_OFFSET(chunk, -sizeof(size_t)); size = *(size_t*)ptr; ret = ucm_orig_munmap(ptr, size); if (ret) { @@ -152,23 +193,76 @@ static ucs_mpool_ops_t ucs_rcache_mp_ops = { .obj_cleanup = NULL }; +static unsigned ucs_rcache_region_page_count(ucs_rcache_region_t *region) +{ + size_t page_size = ucs_get_page_size(); + + return (ucs_align_up(region->super.end, page_size) - + ucs_align_down(region->super.start, page_size)) / + ucs_get_page_size(); +} + +static void ucs_rcache_validate_pfn(ucs_rcache_t *rcache, + ucs_rcache_region_t *region, + unsigned page_num, + unsigned long region_pfn, + unsigned long actual_pfn) +{ + if (region_pfn != actual_pfn) { + ucs_rcache_region_error(rcache, region, "pfn check failed"); + ucs_fatal("%s: page at virtual address 0x%lx moved from pfn 0x%lx to pfn 0x%lx", + rcache->name, + region->super.start + (page_num * ucs_get_page_size()), + region_pfn, actual_pfn); + } +} + +static void ucs_rcache_region_validate_pfn_cb(unsigned page_num, + unsigned long pfn, + void *ctx) +{ + ucs_rcache_region_validate_pfn_t *data = (ucs_rcache_region_validate_pfn_t*)ctx; + + ucs_rcache_validate_pfn(data->rcache, data->region, page_num, + ucs_rcache_region_pfn_ptr(data->region)[page_num], + pfn); +} + /* Lock must be held for read */ static void ucs_rcache_region_validate_pfn(ucs_rcache_t *rcache, ucs_rcache_region_t *region) { unsigned long region_pfn, actual_pfn; + unsigned page_count; + 
ucs_rcache_region_validate_pfn_t ctx; + ucs_status_t status; - if (!ucs_unlikely(ucs_global_opts.rcache_check_pfn)) { + if ((rcache->params.flags & UCS_RCACHE_FLAG_NO_PFN_CHECK) || + (ucs_global_opts.rcache_check_pfn == 0)) { return; } - region_pfn = ucs_rcache_region_pfn(region); - actual_pfn = ucs_sys_get_pfn(region->super.start); - if (region_pfn != actual_pfn) { - ucs_rcache_region_error(rcache, region, "pfn check failed"); - ucs_fatal("%s: page at virtual address 0x%lx moved from pfn 0x%lx to pfn 0x%lx", - rcache->name, region->super.start, region_pfn, actual_pfn); - } else { + if (ucs_global_opts.rcache_check_pfn == 1) { + /* in case if only 1 page to check - save PFN value in-place + in priv section */ + region_pfn = ucs_rcache_region_pfn(region); + status = ucs_sys_get_pfn(region->super.start, 1, &actual_pfn); + if (status != UCS_OK) { + goto out; + } + ucs_rcache_validate_pfn(rcache, region, 0, region_pfn, actual_pfn); + goto out; + } + + page_count = ucs_min(ucs_global_opts.rcache_check_pfn, + ucs_rcache_region_page_count(region)); + ctx.rcache = rcache; + ctx.region = region; + status = ucs_sys_enum_pfn(region->super.start, page_count, + ucs_rcache_region_validate_pfn_cb, &ctx); + +out: + if (status == UCS_OK) { ucs_rcache_region_trace(rcache, region, "pfn ok"); } } @@ -207,40 +301,58 @@ static void ucs_mem_region_destroy_internal(ucs_rcache_t *rcache, } } + if (!(rcache->params.flags & UCS_RCACHE_FLAG_NO_PFN_CHECK) && + (ucs_global_opts.rcache_check_pfn > 1)) { + ucs_free(ucs_rcache_region_pfn_ptr(region)); + } + ucs_free(region); } static inline void ucs_rcache_region_put_internal(ucs_rcache_t *rcache, ucs_rcache_region_t *region, - int lock, - int must_be_destroyed) + unsigned flags) { - ucs_rcache_region_trace(rcache, region, lock ? 
"put" : "put_nolock"); + ucs_rcache_region_trace(rcache, region, "flags 0x%x", flags); ucs_assert(region->refcount > 0); - if (ucs_unlikely(ucs_atomic_fadd32(®ion->refcount, -1) == 1)) { - if (lock) { - pthread_rwlock_wrlock(&rcache->lock); - } - ucs_mem_region_destroy_internal(rcache, region); - if (lock) { - pthread_rwlock_unlock(&rcache->lock); - } - } else { - ucs_assert(!must_be_destroyed); + if (ucs_likely(ucs_atomic_fsub32(®ion->refcount, 1) != 1)) { + ucs_assert(!(flags & UCS_RCACHE_REGION_PUT_FLAG_MUST_DESTROY)); + return; + } + + if (flags & UCS_RCACHE_REGION_PUT_FLAG_ADD_TO_GC) { + /* Put the region on garbage collection list */ + ucs_spin_lock(&rcache->lock); + ucs_rcache_region_trace(rcache, region, "put on GC list", flags); + ucs_list_add_tail(&rcache->gc_list, ®ion->list); + ucs_spin_unlock(&rcache->lock); + return; + } + + /* Destroy region and de-register memory */ + if (flags & UCS_RCACHE_REGION_PUT_FLAG_TAKE_PGLOCK) { + pthread_rwlock_wrlock(&rcache->pgt_lock); + } + + ucs_mem_region_destroy_internal(rcache, region); + + if (flags & UCS_RCACHE_REGION_PUT_FLAG_TAKE_PGLOCK) { + pthread_rwlock_unlock(&rcache->pgt_lock); } } /* Lock must be held in write mode */ static void ucs_rcache_region_invalidate(ucs_rcache_t *rcache, ucs_rcache_region_t *region, - int must_be_in_pgt, - int must_be_destroyed) + unsigned flags) { ucs_status_t status; ucs_rcache_region_trace(rcache, region, "invalidate"); + ucs_assert(!(flags & UCS_RCACHE_REGION_PUT_FLAG_TAKE_PGLOCK)); + /* Remove the memory region from page table, if it's there */ if (region->flags & UCS_RCACHE_REGION_FLAG_PGTABLE) { status = ucs_pgtable_remove(&rcache->pgtable, ®ion->super); @@ -250,15 +362,15 @@ static void ucs_rcache_region_invalidate(ucs_rcache_t *rcache, } region->flags &= ~UCS_RCACHE_REGION_FLAG_PGTABLE; } else { - ucs_assert(!must_be_in_pgt); + ucs_assert(!(flags & UCS_RCACHE_REGION_PUT_FLAG_IN_PGTABLE)); } - ucs_rcache_region_put_internal(rcache, region, 0, must_be_destroyed); + 
ucs_rcache_region_put_internal(rcache, region, flags); } /* Lock must be held in write mode */ static void ucs_rcache_invalidate_range(ucs_rcache_t *rcache, ucs_pgt_addr_t start, - ucs_pgt_addr_t end) + ucs_pgt_addr_t end, unsigned flags) { ucs_rcache_region_t *region, *tmp; ucs_list_link_t region_list; @@ -268,19 +380,20 @@ static void ucs_rcache_invalidate_range(ucs_rcache_t *rcache, ucs_pgt_addr_t sta ucs_rcache_find_regions(rcache, start, end - 1, ®ion_list); ucs_list_for_each_safe(region, tmp, ®ion_list, list) { /* all regions on the list are in the page table */ - ucs_rcache_region_invalidate(rcache, region, 1, 0); + ucs_rcache_region_invalidate(rcache, region, + flags | UCS_RCACHE_REGION_PUT_FLAG_IN_PGTABLE); UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_UNMAP_INVALIDATES, 1); } } /* Lock must be held in write mode */ -static void ucs_rcache_check_inv_queue(ucs_rcache_t *rcache) +static void ucs_rcache_check_inv_queue(ucs_rcache_t *rcache, unsigned flags) { ucs_rcache_inv_entry_t *entry; ucs_trace_func("rcache=%s", rcache->name); - pthread_spin_lock(&rcache->inv_lock); + ucs_spin_lock(&rcache->lock); while (!ucs_queue_is_empty(&rcache->inv_q)) { entry = ucs_queue_pull_elem_non_empty(&rcache->inv_q, ucs_rcache_inv_entry_t, queue); @@ -289,15 +402,40 @@ static void ucs_rcache_check_inv_queue(ucs_rcache_t *rcache) * operations, which could trigger vm_unmapped event which also takes * this lock. 
*/ - pthread_spin_unlock(&rcache->inv_lock); + ucs_spin_unlock(&rcache->lock); - ucs_rcache_invalidate_range(rcache, entry->start, entry->end); + ucs_rcache_invalidate_range(rcache, entry->start, entry->end, flags); - pthread_spin_lock(&rcache->inv_lock); + ucs_spin_lock(&rcache->lock); ucs_mpool_put(entry); /* Must be done with the lock held */ } - pthread_spin_unlock(&rcache->inv_lock); + ucs_spin_unlock(&rcache->lock); +} + +/* Lock must be held in write mode */ +static void ucs_rcache_check_gc_list(ucs_rcache_t *rcache) +{ + ucs_rcache_region_t *region; + + ucs_trace_func("rcache=%s", rcache->name); + + ucs_spin_lock(&rcache->lock); + while (!ucs_list_is_empty(&rcache->gc_list)) { + region = ucs_list_extract_head(&rcache->gc_list, ucs_rcache_region_t, + list); + + /* We need to drop the lock since the following code may trigger memory + * operations, which could trigger vm_unmapped event which also takes + * this lock. + */ + ucs_spin_unlock(&rcache->lock); + + ucs_mem_region_destroy_internal(rcache, region); + + ucs_spin_lock(&rcache->lock); + } + ucs_spin_unlock(&rcache->lock); } static void ucs_rcache_unmapped_callback(ucm_event_type_t event_type, @@ -323,10 +461,24 @@ static void ucs_rcache_unmapped_callback(ucm_event_type_t event_type, ucs_trace_func("%s: event vm_unmapped 0x%lx..0x%lx", rcache->name, start, end); - pthread_spin_lock(&rcache->inv_lock); - entry = ucs_mpool_get(&rcache->inv_mp); + /* + * Try to lock the page table and invalidate the region immediately. + * This way we avoid queuing endless events on the invalidation queue when + * no rcache operations are performed to clean it. 
+ */ + if (!pthread_rwlock_trywrlock(&rcache->pgt_lock)) { + ucs_rcache_invalidate_range(rcache, start, end, + UCS_RCACHE_REGION_PUT_FLAG_ADD_TO_GC); + UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_UNMAPS, 1); + ucs_rcache_check_inv_queue(rcache, UCS_RCACHE_REGION_PUT_FLAG_ADD_TO_GC); + pthread_rwlock_unlock(&rcache->pgt_lock); + return; + } + + /* Could not lock - add region to invalidation queue */ + ucs_spin_lock(&rcache->lock); + entry = ucs_mpool_get(&rcache->mp); if (entry != NULL) { - /* Add region to invalidation list */ entry->start = start; entry->end = end; ucs_queue_push(&rcache->inv_q, &entry->queue); @@ -335,7 +487,7 @@ static void ucs_rcache_unmapped_callback(ucm_event_type_t event_type, ucs_error("Failed to allocate invalidation entry for 0x%lx..0x%lx, " "data corruption may occur", start, end); } - pthread_spin_unlock(&rcache->inv_lock); + ucs_spin_unlock(&rcache->lock); } /* Clear all regions @@ -354,7 +506,7 @@ static void ucs_rcache_purge(ucs_rcache_t *rcache) ucs_list_for_each_safe(region, tmp, ®ion_list, list) { if (region->flags & UCS_RCACHE_REGION_FLAG_PGTABLE) { region->flags &= ~UCS_RCACHE_REGION_FLAG_PGTABLE; - ucs_atomic_add32(®ion->refcount, -1); + ucs_atomic_add32(®ion->refcount, (uint32_t)-1); } if (region->refcount > 0) { ucs_rcache_region_warn(rcache, region, "destroying inuse"); @@ -382,7 +534,8 @@ ucs_rcache_check_overlap(ucs_rcache_t *rcache, ucs_pgt_addr_t *start, ucs_trace_func("rcache=%s, *start=0x%lx, *end=0x%lx", rcache->name, *start, *end); - ucs_rcache_check_inv_queue(rcache); + ucs_rcache_check_inv_queue(rcache, 0); + ucs_rcache_check_gc_list(rcache); ucs_rcache_find_regions(rcache, *start, *end - 1, ®ion_list); @@ -425,7 +578,8 @@ ucs_rcache_check_overlap(ucs_rcache_t *rcache, ucs_pgt_addr_t *start, * region. However mem_reg still may be able to deal with it. 
* Do the safest thing: invalidate cached region */ - ucs_rcache_region_invalidate(rcache, region, 1, 0); + ucs_rcache_region_invalidate(rcache, region, + UCS_RCACHE_REGION_PUT_FLAG_IN_PGTABLE); continue; } else if (ucs_test_all_flags(mem_prot, region->prot)) { *prot |= region->prot; @@ -439,7 +593,8 @@ ucs_rcache_check_overlap(ucs_rcache_t *rcache, ucs_pgt_addr_t *start, ucs_rcache_region_trace(rcache, region, "do not merge mem "UCS_RCACHE_PROT_FMT" with", UCS_RCACHE_PROT_ARG(mem_prot)); - ucs_rcache_region_invalidate(rcache, region, 1, 0); + ucs_rcache_region_invalidate(rcache, region, + UCS_RCACHE_REGION_PUT_FLAG_IN_PGTABLE); continue; } } @@ -450,11 +605,44 @@ ucs_rcache_check_overlap(ucs_rcache_t *rcache, ucs_pgt_addr_t *start, *start = ucs_min(*start, region->super.start); *end = ucs_max(*end, region->super.end); *merged = 1; - ucs_rcache_region_invalidate(rcache, region, 1, 0); + ucs_rcache_region_invalidate(rcache, region, + UCS_RCACHE_REGION_PUT_FLAG_IN_PGTABLE); } return UCS_OK; } +static ucs_status_t ucs_rcache_fill_pfn(ucs_rcache_region_t *region) +{ + unsigned page_count; + ucs_status_t status; + + if (ucs_global_opts.rcache_check_pfn == 0) { + ucs_rcache_region_pfn(region) = 0; + return UCS_OK; + } + + if (ucs_global_opts.rcache_check_pfn == 1) { + return ucs_sys_get_pfn(region->super.start, 1, &ucs_rcache_region_pfn(region)); + } + + page_count = ucs_min(ucs_rcache_region_page_count(region), + ucs_global_opts.rcache_check_pfn); + ucs_rcache_region_pfn_ptr(region) = + ucs_malloc(sizeof(*ucs_rcache_region_pfn_ptr(region)) * page_count, + "pfn list"); + if (ucs_rcache_region_pfn_ptr(region) == NULL) { + return UCS_ERR_NO_MEMORY; + } + + status = ucs_sys_get_pfn(region->super.start, page_count, + ucs_rcache_region_pfn_ptr(region)); + if (status != UCS_OK) { + ucs_free(ucs_rcache_region_pfn_ptr(region)); + } + + return status; +} + static ucs_status_t ucs_rcache_create_region(ucs_rcache_t *rcache, void *address, size_t length, int prot, void *arg, 
ucs_rcache_region_t **region_p) @@ -462,12 +650,12 @@ ucs_rcache_create_region(ucs_rcache_t *rcache, void *address, size_t length, ucs_rcache_region_t *region; ucs_pgt_addr_t start, end; ucs_status_t status; - int merged; + int error, merged; ucs_trace_func("rcache=%s, address=%p, length=%zu", rcache->name, address, length); - pthread_rwlock_wrlock(&rcache->lock); + pthread_rwlock_wrlock(&rcache->pgt_lock); retry: /* Align to page size */ @@ -497,9 +685,12 @@ ucs_rcache_create_region(ucs_rcache_t *rcache, void *address, size_t length, } /* Allocate structure for new region */ - region = ucs_memalign(UCS_PGT_ENTRY_MIN_ALIGN, rcache->params.region_struct_size, - "rcache_region"); - if (region == NULL) { + error = ucs_posix_memalign((void **)®ion, + ucs_max(sizeof(void *), UCS_PGT_ENTRY_MIN_ALIGN), + rcache->params.region_struct_size, + "rcache_region"); + if (error != 0) { + ucs_error("failed to allocate rcache region descriptor: %m"); status = UCS_ERR_NO_MEMORY; goto out_unlock; } @@ -538,7 +729,9 @@ ucs_rcache_create_region(ucs_rcache_t *rcache, void *address, size_t length, */ ucs_debug("failed to register merged region " UCS_PGT_REGION_FMT ": %s, retrying", UCS_PGT_REGION_ARG(®ion->super), ucs_status_string(status)); - ucs_rcache_region_invalidate(rcache, region, 1, 1); + ucs_rcache_region_invalidate(rcache, region, + UCS_RCACHE_REGION_PUT_FLAG_IN_PGTABLE | + UCS_RCACHE_REGION_PUT_FLAG_MUST_DESTROY); goto retry; } else { ucs_debug("failed to register region " UCS_PGT_REGION_FMT ": %s", @@ -550,10 +743,13 @@ ucs_rcache_create_region(ucs_rcache_t *rcache, void *address, size_t length, region->flags |= UCS_RCACHE_REGION_FLAG_REGISTERED; region->refcount = 2; /* Page-table + user */ - if (ucs_global_opts.rcache_check_pfn) { - ucs_rcache_region_pfn(region) = ucs_sys_get_pfn(region->super.start); - } else { - ucs_rcache_region_pfn(region) = 0; + if (!(rcache->params.flags & UCS_RCACHE_FLAG_NO_PFN_CHECK)) { + status = ucs_rcache_fill_pfn(region); + if (status != UCS_OK) 
{ + ucs_error("failed to allocate pfn list"); + ucs_free(region); + goto out_unlock; + } } UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_MISSES, 1); @@ -563,7 +759,7 @@ ucs_rcache_create_region(ucs_rcache_t *rcache, void *address, size_t length, out_set_region: *region_p = region; out_unlock: - pthread_rwlock_unlock(&rcache->lock); + pthread_rwlock_unlock(&rcache->pgt_lock); return status; } @@ -583,7 +779,7 @@ ucs_status_t ucs_rcache_get(ucs_rcache_t *rcache, void *address, size_t length, ucs_trace_func("rcache=%s, address=%p, length=%zu", rcache->name, address, length); - pthread_rwlock_rdlock(&rcache->lock); + pthread_rwlock_rdlock(&rcache->pgt_lock); UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_GETS, 1); if (ucs_queue_is_empty(&rcache->inv_q)) { pgt_region = UCS_PROFILE_CALL(ucs_pgtable_lookup, &rcache->pgtable, @@ -597,12 +793,12 @@ ucs_status_t ucs_rcache_get(ucs_rcache_t *rcache, void *address, size_t length, ucs_rcache_region_validate_pfn(rcache, region); *region_p = region; UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_HITS_FAST, 1); - pthread_rwlock_unlock(&rcache->lock); + pthread_rwlock_unlock(&rcache->pgt_lock); return UCS_OK; } } } - pthread_rwlock_unlock(&rcache->lock); + pthread_rwlock_unlock(&rcache->pgt_lock); /* Fall back to slow version (with rw lock) in following cases: * - invalidation list not empty @@ -615,14 +811,16 @@ ucs_status_t ucs_rcache_get(ucs_rcache_t *rcache, void *address, size_t length, void ucs_rcache_region_put(ucs_rcache_t *rcache, ucs_rcache_region_t *region) { - ucs_rcache_region_put_internal(rcache, region, 1, 0); + ucs_rcache_region_put_internal(rcache, region, + UCS_RCACHE_REGION_PUT_FLAG_TAKE_PGLOCK); UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_PUTS, 1); } static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params, const char *name, ucs_stats_node_t *stats_parent) { - ucs_status_t status; + ucs_status_t status, spinlock_status; + size_t mp_obj_size, mp_align; int ret; if 
(params->region_struct_size < sizeof(ucs_rcache_region_t)) { @@ -655,17 +853,15 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params, goto err_destroy_stats; } - ret = pthread_rwlock_init(&self->lock, NULL); + ret = pthread_rwlock_init(&self->pgt_lock, NULL); if (ret) { ucs_error("pthread_rwlock_init() failed: %m"); status = UCS_ERR_INVALID_PARAM; goto err_free_name; } - ret = pthread_spin_init(&self->inv_lock, 0); - if (ret) { - ucs_error("pthread_spin_init() failed: %m"); - status = UCS_ERR_INVALID_PARAM; + status = ucs_spinlock_init(&self->lock, 0); + if (status != UCS_OK) { goto err_destroy_rwlock; } @@ -675,13 +871,16 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params, goto err_destroy_inv_q_lock; } - status = ucs_mpool_init(&self->inv_mp, 0, sizeof(ucs_rcache_inv_entry_t), 0, - 1, 1024, -1, &ucs_rcache_mp_ops, "rcache_inv_mp"); + mp_obj_size = ucs_max(sizeof(ucs_pgt_dir_t), sizeof(ucs_rcache_inv_entry_t)); + mp_align = ucs_max(sizeof(void *), UCS_PGT_ENTRY_MIN_ALIGN); + status = ucs_mpool_init(&self->mp, 0, mp_obj_size, 0, mp_align, 1024, + UINT_MAX, &ucs_rcache_mp_ops, "rcache_mp"); if (status != UCS_OK) { goto err_cleanup_pgtable; } ucs_queue_head_init(&self->inv_q); + ucs_list_head_init(&self->gc_list); status = ucm_set_event_handler(params->ucm_events, params->ucm_event_priority, ucs_rcache_unmapped_callback, self); @@ -692,13 +891,16 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params, return UCS_OK; err_destroy_mp: - ucs_mpool_cleanup(&self->inv_mp, 1); + ucs_mpool_cleanup(&self->mp, 1); err_cleanup_pgtable: ucs_pgtable_cleanup(&self->pgtable); err_destroy_inv_q_lock: - pthread_spin_destroy(&self->inv_lock); + spinlock_status = ucs_spinlock_destroy(&self->lock); + if (spinlock_status != UCS_OK) { + ucs_warn("ucs_recursive_spinlock_destroy() failed (%d)", spinlock_status); + } err_destroy_rwlock: - pthread_rwlock_destroy(&self->lock); + pthread_rwlock_destroy(&self->pgt_lock); 
err_free_name: free(self->name); err_destroy_stats: @@ -709,15 +911,21 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params, static UCS_CLASS_CLEANUP_FUNC(ucs_rcache_t) { + ucs_status_t status; + ucm_unset_event_handler(self->params.ucm_events, ucs_rcache_unmapped_callback, self); - ucs_rcache_check_inv_queue(self); + ucs_rcache_check_inv_queue(self, 0); + ucs_rcache_check_gc_list(self); ucs_rcache_purge(self); - ucs_mpool_cleanup(&self->inv_mp, 1); + ucs_mpool_cleanup(&self->mp, 1); ucs_pgtable_cleanup(&self->pgtable); - pthread_spin_destroy(&self->inv_lock); - pthread_rwlock_destroy(&self->lock); + status = ucs_spinlock_destroy(&self->lock); + if (status != UCS_OK) { + ucs_warn("ucs_recursive_spinlock_destroy() failed (%d)", status); + } + pthread_rwlock_destroy(&self->pgt_lock); UCS_STATS_NODE_FREE(self->stats); free(self->name); } diff --git a/src/ucs/memory/rcache.h b/src/ucs/memory/rcache.h index d9db909b227..0f0fb96d666 100644 --- a/src/ucs/memory/rcache.h +++ b/src/ucs/memory/rcache.h @@ -46,6 +46,13 @@ enum { UCS_RCACHE_MEM_REG_HIDE_ERRORS = UCS_BIT(0) /**< Hide errors on memory registration */ }; +/* + * Rcache flags. + */ +enum { + UCS_RCACHE_FLAG_NO_PFN_CHECK = UCS_BIT(0), /**< PFN check not supported for this rcache */ +}; + /* * Registration cache operations. */ @@ -113,6 +120,7 @@ struct ucs_rcache_params { const ucs_rcache_ops_t *ops; /**< Memory operations functions */ void *context; /**< User-defined context that will be passed to mem_reg/mem_dereg */ + int flags; /**< Flags */ }; @@ -124,7 +132,13 @@ struct ucs_rcache_region { ucs_status_t status; /**< Current status code */ uint8_t prot; /**< Protection bits */ uint16_t flags; /**< Status flags. Protected by page table lock. */ - uint64_t priv; /**< Used internally */ + union { + uint64_t priv; /**< Used internally */ + unsigned long *pfn; /**< Pointer to PFN array. 
In case if requested + evaluation more than 1 page - PFN array is + allocated, if 1 page requested - used + in-place priv value. */ + }; }; diff --git a/src/ucs/memory/rcache_int.h b/src/ucs/memory/rcache_int.h index 2d8d5329425..a2270182b72 100644 --- a/src/ucs/memory/rcache_int.h +++ b/src/ucs/memory/rcache_int.h @@ -7,6 +7,9 @@ #ifndef UCS_REG_CACHE_INT_H_ #define UCS_REG_CACHE_INT_H_ +#include + + /* Names of rcache stats counters */ enum { UCS_RCACHE_GETS, /* number of get operations */ @@ -25,23 +28,32 @@ enum { struct ucs_rcache { - ucs_rcache_params_t params; /**< rcache parameters (immutable) */ - pthread_rwlock_t lock; /**< Protects the page table and all regions - whose refcount is 0 */ - ucs_pgtable_t pgtable; /**< page table to hold the regions */ - - pthread_spinlock_t inv_lock; /**< Lock for inv_q and inv_mp. This is a - separate lock because we may want to put - regions on inv_q while the page table - lock is held by the calling context */ - ucs_queue_head_t inv_q; /**< Regions which were invalidated during - memory events */ - ucs_mpool_t inv_mp; /**< Memory pool to allocate entries for inv_q, - since we cannot use regulat malloc(). - The backing storage is original mmap() - which does not generate memory events */ - char *name; - UCS_STATS_NODE_DECLARE(stats); + ucs_rcache_params_t params; /**< rcache parameters (immutable) */ + + pthread_rwlock_t pgt_lock; /**< Protects the page table and all + regions whose refcount is 0 */ + ucs_pgtable_t pgtable; /**< page table to hold the regions */ + + + ucs_spinlock_t lock; /**< Protects 'mp', 'inv_q' and 'gc_list'. + This is a separate lock because we + may want to invalidate regions + while the page table lock is held by + the calling context. + @note: This lock should always be + taken **after** 'pgt_lock'. */ + ucs_mpool_t mp; /**< Memory pool to allocate entries for + inv_q and page table entries, since + we cannot use regular malloc(). 
+ The backing storage is original mmap() + which does not generate memory events */ + ucs_queue_head_t inv_q; /**< Regions which were invalidated during + memory events */ + ucs_list_link_t gc_list; /**< list for regions to destroy, regions + could not be destroyed from memhook */ + + char *name; /**< Name of the cache, for debug purpose */ + UCS_STATS_NODE_DECLARE(stats) }; #endif diff --git a/src/ucs/profile/profile.c b/src/ucs/profile/profile.c index 698915a4772..08585bcf966 100644 --- a/src/ucs/profile/profile.c +++ b/src/ucs/profile/profile.c @@ -1,12 +1,17 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. - * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "profile.h" +#include +#include #include #include #include @@ -14,28 +19,55 @@ #include +typedef struct ucs_profile_global_location { + ucs_profile_location_t super; /*< Location info */ + volatile int *loc_id_p; /*< Back-pointer to location index */ +} ucs_profile_global_location_t; + + /** * Profiling global context */ typedef struct ucs_profile_global_context { + ucs_profile_global_location_t *locations; /**< Array of all locations */ + unsigned num_locations; /**< Number of valid locations */ + unsigned max_locations; /**< Size of locations array */ + pthread_mutex_t mutex; /**< Protects updating the locations array */ + pthread_key_t tls_key; /**< TLS key for per-thread context */ + ucs_list_link_t thread_list; /**< List of all thread contexts */ +} ucs_profile_global_context_t; - ucs_profile_location_t *locations; /**< Array of all locations */ - unsigned num_locations; /**< Number of valid locations */ - unsigned max_locations; /**< Size of locations array */ - pthread_mutex_t mutex; /**< Protects updating the locations array */ + +/* Profiling per-thread context */ +typedef struct ucs_profile_thread_context { + pthread_t pthread_id; /**< POSIX 
thread id */ + int tid; /**< System thread id */ + ucs_time_t start_time; /**< Thread context init time */ + ucs_time_t end_time; /**< Thread end time */ + ucs_list_link_t list; /**< Entry in thread list */ + int is_completed; /**< Set to 1 when thread exits */ struct { - ucs_profile_record_t *start, *end; /**< Circular log buffer */ - ucs_profile_record_t *current; /**< Current log pointer */ - int wraparound; /**< Whether log was rotated */ + ucs_profile_record_t *start; /**< Circular log buffer start */ + ucs_profile_record_t *end; /**< Circular log buffer end */ + ucs_profile_record_t *current; /**< Current log pointer */ + int wraparound; /**< Whether log was rotated */ } log; struct { - int stack_top; /**< Index of stack top */ - ucs_time_t stack[UCS_PROFILE_STACK_MAX]; /**< Timestamps for each nested scope */ + unsigned num_locations; /**< Number of valid locations */ + ucs_profile_thread_location_t *locations; /**< Statistics per location */ + int stack_top; /**< Index of stack top */ + ucs_time_t stack[UCS_PROFILE_STACK_MAX]; /**< Timestamps for each nested scope */ } accum; +} ucs_profile_thread_context_t; -} ucs_profile_global_context_t; + +#define ucs_profile_for_each_location(_var) \ + for ((_var) = ucs_profile_global_ctx.locations; \ + (_var) < (ucs_profile_global_ctx.locations + \ + ucs_profile_global_ctx.num_locations); \ + ++(_var)) const char *ucs_profile_mode_names[] = { @@ -44,46 +76,160 @@ const char *ucs_profile_mode_names[] = { [UCS_PROFILE_MODE_LAST] = NULL }; -ucs_profile_global_context_t ucs_profile_ctx = { - .locations = NULL, - .log.start = NULL, - .log.end = NULL, - .log.current = NULL, - .log.wraparound = 0, - .accum.stack_top = -1, - .num_locations = 0, - .max_locations = 0, - .mutex = PTHREAD_MUTEX_INITIALIZER +static ucs_profile_global_context_t ucs_profile_global_ctx = { + .locations = NULL, + .num_locations = 0, + .max_locations = 0, + .mutex = PTHREAD_MUTEX_INITIALIZER, + .thread_list = 
UCS_LIST_INITIALIZER(&ucs_profile_global_ctx.thread_list, + &ucs_profile_global_ctx.thread_list), }; -static void ucs_profile_file_write_data(int fd, void *data, size_t size) +static ucs_status_t ucs_profile_file_write_data(int fd, void *data, size_t size) +{ + ssize_t written; + + if (size > 0) { + written = write(fd, data, size); + if (written < 0) { + ucs_error("failed to write %zu bytes to profiling file: %m", size); + return UCS_ERR_IO_ERROR; + } else if (size != written) { + ucs_error("wrote only %zd of %zu bytes to profiling file: %m", + written, size); + return UCS_ERR_IO_ERROR; + } + } + + return UCS_OK; +} + +static ucs_status_t +ucs_profile_file_write_records(int fd, ucs_profile_record_t *begin, + ucs_profile_record_t *end) +{ + return ucs_profile_file_write_data(fd, begin, UCS_PTR_BYTE_DIFF(begin, end)); +} + +/* Global lock must be held */ +static ucs_status_t +ucs_profile_file_write_thread(int fd, ucs_profile_thread_context_t *ctx, + ucs_time_t default_end_time) { - ssize_t written = write(fd, data, size); - if (written < 0) { - ucs_warn("failed to write %zu bytes to profiling file: %m", size); - } else if (size != written) { - ucs_warn("wrote only %zd of %zu bytes to profiling file: %m", - written, size); + ucs_profile_thread_location_t empty_location = { .total_time = 0, .count = 0 }; + ucs_profile_thread_header_t thread_hdr; + unsigned i, num_locations; + ucs_status_t status; + + /* + * NOTE: There is no protection against a race with a thread which is still + * producing profiling data (e.g updating the context structure without a + * lock). + * To avoid excess locking on fast-path, we assume that when we dump the + * profiling data (at program exit), the profiled threads are not calling + * ucs_profile_record() anymore. 
+ */ + + ucs_debug("profiling context %p: write to file", ctx); + + /* write thread header */ + thread_hdr.tid = ctx->tid; + thread_hdr.start_time = ctx->start_time; + if (ctx->is_completed) { + thread_hdr.end_time = ctx->end_time; + } else { + thread_hdr.end_time = default_end_time; + } + + if (ucs_global_opts.profile_mode & UCS_BIT(UCS_PROFILE_MODE_LOG)) { + thread_hdr.num_records = ctx->log.wraparound ? + (ctx->log.end - ctx->log.start) : + (ctx->log.current - ctx->log.start); + } else { + thread_hdr.num_records = 0; + } + + status = ucs_profile_file_write_data(fd, &thread_hdr, sizeof(thread_hdr)); + if (status != UCS_OK) { + return status; + } + + /* If accumulate mode is not enabled, there are no location entries */ + if (ucs_global_opts.profile_mode & UCS_BIT(UCS_PROFILE_MODE_ACCUM)) { + num_locations = ctx->accum.num_locations; + } else { + num_locations = 0; + } + + /* write profiling information for every location + * note: the thread location array may be smaller (or even empty) than the + * global list, but it cannot be larger. 
If it's smaller, we pad with empty + * entries + */ + ucs_assert_always(num_locations <= ucs_profile_global_ctx.num_locations); + ucs_profile_file_write_data(fd, ctx->accum.locations, + num_locations * sizeof(*ctx->accum.locations)); + for (i = num_locations; i < ucs_profile_global_ctx.num_locations; ++i) { + status = ucs_profile_file_write_data(fd, &empty_location, + sizeof(empty_location)); + if (status != UCS_OK) { + return status; + } + } + + /* write profiling records */ + if (ucs_global_opts.profile_mode & UCS_BIT(UCS_PROFILE_MODE_LOG)) { + if (ctx->log.wraparound) { + status = ucs_profile_file_write_records(fd, ctx->log.current, + ctx->log.end); + if (status != UCS_OK) { + return status; + } + } + + status = ucs_profile_file_write_records(fd, ctx->log.start, + ctx->log.current); + if (status != UCS_OK) { + return status; + } } + + return UCS_OK; } -static void ucs_profile_file_write_records(int fd, ucs_profile_record_t *begin, - ucs_profile_record_t *end) +static ucs_status_t ucs_profile_write_locations(int fd) { - ucs_profile_file_write_data(fd, begin, (void*)end - (void*)begin); + ucs_profile_global_location_t *loc; + ucs_status_t status; + + ucs_profile_for_each_location(loc) { + status = ucs_profile_file_write_data(fd, &loc->super, sizeof(loc->super)); + if (status != UCS_OK) { + return status; + } + } + + return UCS_OK; } static void ucs_profile_write() { + ucs_profile_thread_context_t *ctx; ucs_profile_header_t header; char fullpath[1024] = {0}; char filename[1024] = {0}; + ucs_time_t write_time; + ucs_status_t status; int fd; if (!ucs_global_opts.profile_mode) { return; } + pthread_mutex_lock(&ucs_profile_global_ctx.mutex); + + write_time = ucs_get_time(); + ucs_fill_filename_template(ucs_global_opts.profile_file, filename, sizeof(filename)); ucs_expand_path(filename, fullpath, sizeof(fullpath) - 1); @@ -91,36 +237,124 @@ static void ucs_profile_write() fd = open(fullpath, O_WRONLY|O_CREAT|O_TRUNC, 0600); if (fd < 0) { ucs_error("failed to write 
profiling data to '%s': %m", fullpath); - return; + goto out_unlock; } /* write header */ memset(&header, 0, sizeof(header)); ucs_read_file(header.cmdline, sizeof(header.cmdline), 1, "/proc/self/cmdline"); strncpy(header.hostname, ucs_get_host_name(), sizeof(header.hostname) - 1); - header.pid = getpid(); - header.mode = ucs_global_opts.profile_mode; - header.num_locations = ucs_profile_ctx.num_locations; - header.num_records = ucs_profile_ctx.log.wraparound ? - (ucs_profile_ctx.log.end - ucs_profile_ctx.log.start) : - (ucs_profile_ctx.log.current - ucs_profile_ctx.log.start); + header.version = UCS_PROFILE_FILE_VERSION; + strncpy(header.ucs_path, ucs_debug_get_lib_path(), sizeof(header.ucs_path) - 1); + header.pid = getpid(); + header.mode = ucs_global_opts.profile_mode; + header.num_locations = ucs_profile_global_ctx.num_locations; + header.num_threads = ucs_list_length(&ucs_profile_global_ctx.thread_list); header.one_second = ucs_time_from_sec(1.0); ucs_profile_file_write_data(fd, &header, sizeof(header)); /* write locations */ - ucs_profile_file_write_data(fd, ucs_profile_ctx.locations, - sizeof(*ucs_profile_ctx.locations) * - ucs_profile_ctx.num_locations); + status = ucs_profile_write_locations(fd); + if (status != UCS_OK) { + goto out_close_fd; + } - /* write records */ - if (ucs_profile_ctx.log.wraparound > 0) { - ucs_profile_file_write_records(fd, ucs_profile_ctx.log.current, - ucs_profile_ctx.log.end); + /* write threads */ + ucs_list_for_each(ctx, &ucs_profile_global_ctx.thread_list, list) { + status = ucs_profile_file_write_thread(fd, ctx, write_time); + if (status != UCS_OK) { + goto out_close_fd; + } } - ucs_profile_file_write_records(fd, ucs_profile_ctx.log.start, - ucs_profile_ctx.log.current); +out_close_fd: close(fd); +out_unlock: + pthread_mutex_unlock(&ucs_profile_global_ctx.mutex); +} + +static UCS_F_NOINLINE +ucs_profile_thread_context_t* ucs_profile_thread_init() +{ + ucs_profile_thread_context_t *ctx; + size_t num_records; + + 
ucs_assert(ucs_global_opts.profile_mode); + + ctx = ucs_malloc(sizeof(*ctx), "profile_thread_context"); + if (ctx == NULL) { + ucs_error("failed to allocate profiling thread context"); + return NULL; + } + + ctx->tid = ucs_get_tid(); + ctx->start_time = ucs_get_time(); + ctx->end_time = 0; + ctx->pthread_id = pthread_self(); + + ucs_debug("profiling context %p: start on thread 0x%lx tid %d mode %d", + ctx, (unsigned long)pthread_self(), ucs_get_tid(), + ucs_global_opts.profile_mode); + + /* Initialize log mode */ + if (ucs_global_opts.profile_mode & UCS_BIT(UCS_PROFILE_MODE_LOG)) { + num_records = ucs_global_opts.profile_log_size / + sizeof(ucs_profile_record_t); + ctx->log.start = ucs_calloc(num_records, sizeof(ucs_profile_record_t), + "profile_log"); + if (ctx->log.start == NULL) { + ucs_fatal("failed to allocate profiling log"); + } + + ctx->log.end = ctx->log.start + num_records; + ctx->log.current = ctx->log.start; + ctx->log.wraparound = 0; + } + + /* Initialize accumulate mode */ + if (ucs_global_opts.profile_mode & UCS_BIT(UCS_PROFILE_MODE_ACCUM)) { + ctx->accum.num_locations = 0; + ctx->accum.locations = NULL; + ctx->accum.stack_top = -1; + } + + pthread_setspecific(ucs_profile_global_ctx.tls_key, ctx); + + pthread_mutex_lock(&ucs_profile_global_ctx.mutex); + ucs_list_add_tail(&ucs_profile_global_ctx.thread_list, &ctx->list); + pthread_mutex_unlock(&ucs_profile_global_ctx.mutex); + + return ctx; +} + +static void ucs_profile_thread_cleanup(ucs_profile_thread_context_t *ctx) +{ + ucs_debug("profiling context %p: cleanup", ctx); + + if (ucs_global_opts.profile_mode & UCS_BIT(UCS_PROFILE_MODE_LOG)) { + ucs_free(ctx->log.start); + } + + if (ucs_global_opts.profile_mode & UCS_BIT(UCS_PROFILE_MODE_ACCUM)) { + ucs_free(ctx->accum.locations); + } + + ucs_list_del(&ctx->list); + ucs_free(ctx); +} + +static void ucs_profile_thread_finalize(ucs_profile_thread_context_t *ctx) +{ + ucs_debug("profiling context %p: completed", ctx); + + ctx->end_time = ucs_get_time(); + 
ctx->is_completed = 1; +} + +static void ucs_profile_thread_key_destr(void *data) +{ + ucs_profile_thread_context_t *ctx = data; + ucs_profile_thread_finalize(ctx); } /* @@ -128,113 +362,150 @@ static void ucs_profile_write() * code, before the first record of each such location is made. * SHOULD NOT be used directly - use UCS_PROFILE macros instead. * - * @param [in] type Location type. - * @param [in] file Source file name. - * @param [in] line Source line number. - * @param [in] function Calling function name. - * @param [in] name Location name. - * @param [out] loc_id_p Filled with location ID: - * 0 - profiling is disabled - * >0 - location index + 1 + * @param [in] type Location type. + * @param [in] file Source file name. + * @param [in] line Source line number. + * @param [in] function Calling function name. + * @param [in] name Location name. + * @param [out] loc_id_p Filled with location ID: + * 0 - profiling is disabled + * >0 - location index + 1 */ -static void ucs_profile_get_location(ucs_profile_type_t type, const char *name, - const char *file, int line, - const char *function, volatile int *loc_id_p) +static UCS_F_NOINLINE +int ucs_profile_get_location(ucs_profile_type_t type, const char *name, + const char *file, int line, const char *function, + volatile int *loc_id_p) { - ucs_profile_global_context_t *ctx = &ucs_profile_ctx; - ucs_profile_location_t *loc; - int location; - int i; + ucs_profile_global_location_t *loc, *new_locations; + int loc_id; - pthread_mutex_lock(&ucs_profile_ctx.mutex); + pthread_mutex_lock(&ucs_profile_global_ctx.mutex); - if (*loc_id_p == 0) { + /* Check, with lock held, that the location is not already initialized */ + if (*loc_id_p >= 0) { + loc_id = *loc_id_p; goto out_unlock; } /* Check if profiling is disabled */ if (!ucs_global_opts.profile_mode) { - *loc_id_p = 0; + *loc_id_p = loc_id = 0; goto out_unlock; } /* Location ID must be uninitialized */ ucs_assert(*loc_id_p == -1); - for (i = 0; i < ctx->num_locations; 
++i) { - loc = &ctx->locations[i]; - - if ((type == loc->type) && - (line == loc->line) && - !strcmp(loc->name, name) && - !strcmp(loc->file, basename(file)) && - !strcmp(loc->function, function)) { - - *loc_id_p = i + 1; - goto out_unlock; + ucs_profile_for_each_location(loc) { + if ((type == loc->super.type) && (line == loc->super.line) && + !strcmp(loc->super.name, name) && + !strcmp(loc->super.file, basename(file)) && + !strcmp(loc->super.function, function)) { + goto out_found; } } - location = ucs_profile_ctx.num_locations++; + ++ucs_profile_global_ctx.num_locations; /* Reallocate array if needed */ - if (ucs_profile_ctx.num_locations > ucs_profile_ctx.max_locations) { - ucs_profile_ctx.max_locations = ucs_profile_ctx.num_locations * 2; - ucs_profile_ctx.locations = ucs_realloc(ucs_profile_ctx.locations, - sizeof(*ucs_profile_ctx.locations) * - ucs_profile_ctx.max_locations, - "profile_locations"); - if (ucs_profile_ctx.locations == NULL) { + if (ucs_profile_global_ctx.num_locations > ucs_profile_global_ctx.max_locations) { + ucs_profile_global_ctx.max_locations = + 2 * ucs_profile_global_ctx.num_locations; + new_locations = ucs_realloc(ucs_profile_global_ctx.locations, + sizeof(*ucs_profile_global_ctx.locations) * + ucs_profile_global_ctx.max_locations, + "profile_locations"); + if (new_locations == NULL) { ucs_warn("failed to expand locations array"); - *loc_id_p = 0; + *loc_id_p = loc_id = 0; goto out_unlock; } + + ucs_profile_global_ctx.locations = new_locations; } /* Initialize new location */ - loc = &ucs_profile_ctx.locations[location]; - ucs_strncpy_zero(loc->file, basename(file), sizeof(loc->file)); - ucs_strncpy_zero(loc->function, function, sizeof(loc->function)); - ucs_strncpy_zero(loc->name, name, sizeof(loc->name)); - loc->line = line; - loc->type = type; - loc->total_time = 0; - loc->count = 0; + loc = &ucs_profile_global_ctx.locations[ucs_profile_global_ctx.num_locations - 1]; + ucs_strncpy_zero(loc->super.file, basename(file), 
sizeof(loc->super.file)); + ucs_strncpy_zero(loc->super.function, function, sizeof(loc->super.function)); + ucs_strncpy_zero(loc->super.name, name, sizeof(loc->super.name)); + loc->super.line = line; + loc->super.type = type; loc->loc_id_p = loc_id_p; +out_found: + *loc_id_p = loc_id = (loc - ucs_profile_global_ctx.locations) + 1; ucs_memory_cpu_store_fence(); - *loc_id_p = location + 1; - out_unlock: - pthread_mutex_unlock(&ucs_profile_ctx.mutex); + pthread_mutex_unlock(&ucs_profile_global_ctx.mutex); + return loc_id; +} + +static void ucs_profile_thread_expand_locations(int loc_id) +{ + ucs_profile_thread_context_t *ctx; + unsigned i, new_num_locations; + + ctx = pthread_getspecific(ucs_profile_global_ctx.tls_key); + ucs_assert(ctx != NULL); + + new_num_locations = ucs_max(loc_id, ctx->accum.num_locations); + ctx->accum.locations = ucs_realloc(ctx->accum.locations, + sizeof(*ctx->accum.locations) * + new_num_locations, + "profile_thread_locations"); + if (ctx->accum.locations == NULL) { + ucs_fatal("failed to allocate profiling per-thread locations"); + } + + for (i = ctx->accum.num_locations; i < new_num_locations; ++i) { + ctx->accum.locations[i].count = 0; + ctx->accum.locations[i].total_time = 0; + } + + ctx->accum.num_locations = new_num_locations; } void ucs_profile_record(ucs_profile_type_t type, const char *name, uint32_t param32, uint64_t param64, const char *file, int line, const char *function, volatile int *loc_id_p) { - extern ucs_profile_global_context_t ucs_profile_ctx; - ucs_profile_global_context_t *ctx = &ucs_profile_ctx; - ucs_profile_record_t *rec; - ucs_profile_location_t *loc; + ucs_profile_thread_location_t *loc; + ucs_profile_thread_context_t *ctx; + ucs_profile_record_t *rec; ucs_time_t current_time; int loc_id; /* If the location id is -1 or 0, need to re-read it with lock held */ - if (ucs_unlikely((loc_id = *loc_id_p) <= 0)) { - ucs_profile_get_location(type, name, file, line, function, loc_id_p); - if ((loc_id = *loc_id_p) == 0) { + 
loc_id = *loc_id_p; + if (ucs_unlikely(loc_id <= 0)) { + loc_id = ucs_profile_get_location(type, name, file, line, function, + loc_id_p); + if (loc_id == 0) { return; } } ucs_memory_cpu_load_fence(); + ucs_assert(*loc_id_p != 0); ucs_assert(ucs_global_opts.profile_mode != 0); + /* Get thread-specific profiling context */ + ctx = pthread_getspecific(ucs_profile_global_ctx.tls_key); + if (ucs_unlikely(ctx == NULL)) { + ctx = ucs_profile_thread_init(); + } + current_time = ucs_get_time(); if (ucs_global_opts.profile_mode & UCS_BIT(UCS_PROFILE_MODE_ACCUM)) { - loc = &ctx->locations[loc_id - 1]; + if (ucs_unlikely(loc_id > ctx->accum.num_locations)) { + /* expand the locations array of the current thread */ + ucs_profile_thread_expand_locations(loc_id); + } + ucs_assert(loc_id - 1 < ctx->accum.num_locations); + + loc = &ctx->accum.locations[loc_id - 1]; switch (type) { case UCS_PROFILE_TYPE_SCOPE_BEGIN: ctx->accum.stack[++ctx->accum.stack_top] = current_time; @@ -262,84 +533,81 @@ void ucs_profile_record(ucs_profile_type_t type, const char *name, } } - -void ucs_profile_global_init() +static void ucs_profile_check_active_threads() { - size_t num_records; + size_t num_active_threads; - if (!ucs_global_opts.profile_mode) { - goto off; - } + pthread_mutex_lock(&ucs_profile_global_ctx.mutex); + num_active_threads = ucs_list_length(&ucs_profile_global_ctx.thread_list); + pthread_mutex_unlock(&ucs_profile_global_ctx.mutex); - if (!strlen(ucs_global_opts.profile_file)) { - ucs_warn("profiling file not specified, profiling is disabled"); - goto disable; + if (num_active_threads > 0) { + ucs_warn("%zd profiled threads are still running", num_active_threads); } +} - if (ucs_global_opts.profile_mode & UCS_BIT(UCS_PROFILE_MODE_LOG)) { - num_records = ucs_global_opts.profile_log_size / sizeof(ucs_profile_record_t); - ucs_profile_ctx.log.start = ucs_calloc(num_records, - sizeof(ucs_profile_record_t), - "profile_log"); - if (ucs_profile_ctx.log.start == NULL) { - ucs_warn("failed to 
allocate profiling log"); - goto disable; - } +void ucs_profile_reset_locations() +{ + ucs_profile_global_location_t *loc; - ucs_profile_ctx.log.end = ucs_profile_ctx.log.start + num_records; - ucs_profile_ctx.log.current = ucs_profile_ctx.log.start; - } + pthread_mutex_lock(&ucs_profile_global_ctx.mutex); - if (ucs_global_opts.profile_mode & UCS_BIT(UCS_PROFILE_MODE_ACCUM)) { - ucs_profile_ctx.accum.stack_top = -1; + ucs_profile_for_each_location(loc) { + *loc->loc_id_p = -1; } - ucs_info("profiling is enabled"); - return; + ucs_profile_global_ctx.num_locations = 0; + ucs_profile_global_ctx.max_locations = 0; + ucs_free(ucs_profile_global_ctx.locations); + ucs_profile_global_ctx.locations = NULL; -disable: - ucs_global_opts.profile_mode = 0; -off: - ucs_trace("profiling is disabled"); + pthread_mutex_unlock(&ucs_profile_global_ctx.mutex); } -static void ucs_profile_reset_locations() +static void ucs_profile_cleanup_completed_threads() { - pthread_mutex_lock(&ucs_profile_ctx.mutex); - ucs_profile_ctx.num_locations = 0; - ucs_profile_ctx.max_locations = 0; - ucs_free(ucs_profile_ctx.locations); - ucs_profile_ctx.locations = NULL; - pthread_mutex_unlock(&ucs_profile_ctx.mutex); -} + ucs_profile_thread_context_t *ctx, *tmp; -void ucs_profile_global_cleanup() -{ - ucs_profile_write(); - ucs_free(ucs_profile_ctx.log.start); - ucs_profile_ctx.log.start = NULL; - ucs_profile_ctx.log.end = NULL; - ucs_profile_ctx.log.current = NULL; - ucs_profile_ctx.log.wraparound = 0; - ucs_profile_reset_locations(); + pthread_mutex_lock(&ucs_profile_global_ctx.mutex); + ucs_list_for_each_safe(ctx, tmp, &ucs_profile_global_ctx.thread_list, + list) { + if (ctx->is_completed) { + ucs_profile_thread_cleanup(ctx); + } + } + pthread_mutex_unlock(&ucs_profile_global_ctx.mutex); } void ucs_profile_dump() { - ucs_profile_location_t *loc; + ucs_profile_thread_context_t *ctx; + + /* finalize profiling on current thread */ + ctx = pthread_getspecific(ucs_profile_global_ctx.tls_key); + if (ctx) { + 
ucs_profile_thread_finalize(ctx); + pthread_setspecific(ucs_profile_global_ctx.tls_key, NULL); + } + /* write and cleanup all completed threads (including the current thread) */ ucs_profile_write(); + ucs_profile_cleanup_completed_threads(); +} - for (loc = ucs_profile_ctx.locations; - loc < ucs_profile_ctx.locations + ucs_profile_ctx.num_locations; - ++loc) - { - loc->count = 0; - loc->total_time = 0; +void ucs_profile_global_init() +{ + if (ucs_global_opts.profile_mode && !strlen(ucs_global_opts.profile_file)) { + // TODO make sure profiling file is writeable + ucs_warn("profiling file not specified"); } - if (ucs_global_opts.profile_mode & UCS_BIT(UCS_PROFILE_MODE_LOG)) { - ucs_profile_ctx.log.wraparound = 0; - ucs_profile_ctx.log.current = ucs_profile_ctx.log.start; - } + pthread_key_create(&ucs_profile_global_ctx.tls_key, + ucs_profile_thread_key_destr); +} + +void ucs_profile_global_cleanup() +{ + ucs_profile_dump(); + ucs_profile_check_active_threads(); + pthread_key_delete(ucs_profile_global_ctx.tls_key); } diff --git a/src/ucs/profile/profile.h b/src/ucs/profile/profile.h index e8e7fe857f5..1c631ac88e9 100644 --- a/src/ucs/profile/profile.h +++ b/src/ucs/profile/profile.h @@ -11,7 +11,7 @@ # include "config.h" #endif -#if HAVE_PROFILING +#ifdef HAVE_PROFILING # include "profile_on.h" #else # include "profile_off.h" diff --git a/src/ucs/profile/profile_defs.h b/src/ucs/profile/profile_defs.h index 6052751adaa..e315839f7a2 100644 --- a/src/ucs/profile/profile_defs.h +++ b/src/ucs/profile/profile_defs.h @@ -10,13 +10,14 @@ #include #include #include - +#include BEGIN_C_DECLS /** @file profile_defs.h */ -#define UCS_PROFILE_STACK_MAX 64 +#define UCS_PROFILE_STACK_MAX 64 +#define UCS_PROFILE_FILE_VERSION 2u /** @@ -43,31 +44,36 @@ typedef enum { } ucs_profile_type_t; +/* + * Profile file structure: + * + * < ucs_profile_header_t > + * < ucs_profile_location_t > * ucs_profile_header_t::num_locaitons + * [ + * < ucs_profile_thread_header_t > + * < 
ucs_profile_thread_location_t > * ucs_profile_header_t::num_locaitons + * < ucs_profile_record_t > * ucs_profile_thread_header_t::num_records + * + * ] * ucs_profile_thread_header_t::num_threads + */ + + /** * Profile output file header */ typedef struct ucs_profile_header { + uint32_t version; /**< File format version */ + char ucs_path[1024];/**< UCX library path*/ char cmdline[1024]; /**< Command line */ - char hostname[40]; /**< Host name */ + char hostname[64]; /**< Host name */ uint32_t pid; /**< Process ID */ - uint32_t mode; /**< Profiling mode */ + uint32_t mode; /**< Bitmask of profiling modes */ uint32_t num_locations; /**< Number of locations in the file */ - uint64_t num_records; /**< Number of records in the file */ + uint32_t num_threads; /**< Number of threads in the file */ uint64_t one_second; /**< How much time is one second on the sampled machine */ } UCS_S_PACKED ucs_profile_header_t; -/** - * Profile output file sample record - */ -typedef struct ucs_profile_record { - uint64_t timestamp; /**< Record timestamp */ - uint64_t param64; /**< Custom 64-bit parameter */ - uint32_t param32; /**< Custom 32-bit parameter */ - uint32_t location; /**< Location identifier */ -} UCS_S_PACKED ucs_profile_record_t; - - /** * Profile location record */ @@ -75,12 +81,40 @@ typedef struct ucs_profile_location { char file[64]; /**< Source file name */ char function[64]; /**< Function name */ char name[32]; /**< User-provided name */ - volatile int *loc_id_p; /**< Back-pointer for location ID */ int line; /**< Source line number */ uint8_t type; /**< From ucs_profile_type_t */ +} UCS_S_PACKED ucs_profile_location_t; + + +/** + * Profile output file thread header + */ +typedef struct ucs_profile_thread_header { + uint32_t tid; /**< System thread id */ + uint64_t start_time; /**< Time of thread start */ + uint64_t end_time; /**< Time of thread exit */ + uint64_t num_records; /**< Number of records for the thread */ +} UCS_S_PACKED ucs_profile_thread_header_t; + + 
+/** + * Profile thread location with samples + */ +typedef struct ucs_profile_thread_location { uint64_t total_time; /**< Total interval from previous location */ size_t count; /**< Number of times we've hit this location */ -} UCS_S_PACKED ucs_profile_location_t; +} UCS_S_PACKED ucs_profile_thread_location_t; + + +/** + * Profile output file sample record + */ +typedef struct ucs_profile_record { + uint64_t timestamp; /**< Record timestamp */ + uint64_t param64; /**< Custom 64-bit parameter */ + uint32_t param32; /**< Custom 32-bit parameter */ + uint32_t location; /**< Location identifier */ +} UCS_S_PACKED ucs_profile_record_t; extern const char *ucs_profile_mode_names[]; diff --git a/src/ucs/profile/profile_on.h b/src/ucs/profile/profile_on.h index 6ff865f7fab..d17a14a66fa 100644 --- a/src/ucs/profile/profile_on.h +++ b/src/ucs/profile/profile_on.h @@ -273,6 +273,14 @@ void ucs_profile_record(ucs_profile_type_t type, const char *name, uint32_t param32, uint64_t param64, const char *file, int line, const char *function, volatile int *loc_id_p); + +/** + * Reset the internal array of profiling locations. + * Used for testing purposes only. 
+ */ +void ucs_profile_reset_locations(); + + END_C_DECLS #endif diff --git a/src/ucs/stats/client_server.c b/src/ucs/stats/client_server.c index 0c392dbe67f..44db28ca687 100644 --- a/src/ucs/stats/client_server.c +++ b/src/ucs/stats/client_server.c @@ -153,7 +153,8 @@ ucs_stats_sock_send_frags(int sockfd, uint64_t timestamp, void *buffer, size_t s { struct iovec iov[2]; ucs_stats_packet_hdr_t hdr; - size_t frag_size, offset, nsent; + size_t frag_size, offset; + ssize_t nsent; size_t max_frag = UCS_STATS_MSG_FRAG_SIZE - sizeof(hdr); offset = 0; @@ -173,7 +174,7 @@ ucs_stats_sock_send_frags(int sockfd, uint64_t timestamp, void *buffer, size_t s iov[0].iov_base = &hdr; iov[0].iov_len = sizeof(hdr); - iov[1].iov_base = buffer + offset; + iov[1].iov_base = UCS_PTR_BYTE_OFFSET(buffer, offset); iov[1].iov_len = hdr.frag_size; nsent = writev(sockfd, iov, 2); @@ -258,7 +259,7 @@ static stats_entity_t *ucs_stats_server_entity_alloc(struct sockaddr_in *addr) entity->in_addr = *addr; entity->timestamp = 0; - entity->buffer_size = -1; + entity->buffer_size = SIZE_MAX; entity->inprogress_buffer = NULL; entity->completed_buffer = NULL; entity->refcount = 1; @@ -299,7 +300,7 @@ ucs_stats_server_entity_get(ucs_stats_server_h server, struct sockaddr_in *addr) static void ucs_stats_server_entity_put(stats_entity_t * entity) { - if (__sync_fetch_and_add(&entity->refcount, -1) == 1) { + if (__sync_fetch_and_sub(&entity->refcount, 1) == 1) { ucs_stats_server_entity_free(entity); } } @@ -310,12 +311,14 @@ static void ucs_stats_server_entity_put(stats_entity_t * entity) static frag_hole_t * find_frag_hole(stats_entity_t *entity, size_t frag_size, size_t frag_offset) { - void *frag_start = entity->inprogress_buffer + frag_offset; - void *frag_end = entity->inprogress_buffer + frag_offset + frag_size; + void *frag_start = UCS_PTR_BYTE_OFFSET(entity->inprogress_buffer, frag_offset); + void *frag_end = UCS_PTR_BYTE_OFFSET(entity->inprogress_buffer, + frag_offset + frag_size); frag_hole_t 
*hole; ucs_list_for_each(hole, &entity->holes, list) { - if ((frag_start >= (void*)hole) && (frag_end <= (void*)hole + hole->size)) { + if ((frag_start >= (void*)hole) && + (frag_end <= UCS_PTR_BYTE_OFFSET(hole, hole->size))) { return hole; } } @@ -339,7 +342,7 @@ ucs_stats_server_entity_update(ucs_stats_server_h server, stats_entity_t *entity if (timestamp < entity->timestamp) { ucs_debug("Dropping - old timestamp"); - return 0; + return UCS_OK; } else if (timestamp > entity->timestamp) { ucs_debug("New timestamp, resetting buffer with size %zu", total_size); entity->timestamp = timestamp; @@ -358,13 +361,14 @@ ucs_stats_server_entity_update(ucs_stats_server_h server, stats_entity_t *entity return UCS_ERR_MESSAGE_TRUNCATED; } - frag_start = entity->inprogress_buffer + frag_offset; - frag_end = entity->inprogress_buffer + frag_offset + frag_size; - hole_end = (void*)hole + hole->size; + frag_start = UCS_PTR_BYTE_OFFSET(entity->inprogress_buffer, frag_offset); + frag_end = UCS_PTR_BYTE_OFFSET(entity->inprogress_buffer, + frag_offset + frag_size); + hole_end = UCS_PTR_BYTE_OFFSET(hole, hole->size); ucs_debug("inserting into a hole of %zu..%zu", - (void*)hole - entity->inprogress_buffer, - hole_end - entity->inprogress_buffer); + UCS_PTR_BYTE_DIFF(entity->inprogress_buffer, hole), + UCS_PTR_BYTE_DIFF(entity->inprogress_buffer, hole_end)); /* If the fragment does not reach the end of the hole, create a new hole * in this space. @@ -373,17 +377,18 @@ ucs_stats_server_entity_update(ucs_stats_server_h server, stats_entity_t *entity /* Make sure we don't create a hole which is too small for a free-list * pointer to fit in. An exception is the last fragment. 
*/ - assert((hole_end - frag_end >= sizeof(*new_hole)) || - (hole_end == entity->inprogress_buffer + entity->buffer_size)); - new_hole = frag_end; - new_hole->size = hole_end - frag_end; + assert((UCS_PTR_BYTE_DIFF(frag_end, hole_end) >= sizeof(*new_hole)) || + (hole_end == UCS_PTR_BYTE_OFFSET(entity->inprogress_buffer, + entity->buffer_size))); + new_hole = frag_end; + new_hole->size = UCS_PTR_BYTE_DIFF(frag_end, hole_end); ucs_list_insert_after(&hole->list, &new_hole->list); } /* If we have room before the fragment, resize the hole. Otherwise, delete it */ if (frag_start > (void*)hole) { - assert(frag_start - (void*)hole >= sizeof(*hole)); - hole->size = frag_start - (void*)hole; + assert(UCS_PTR_BYTE_DIFF(hole, frag_start) >= sizeof(*hole)); + hole->size = UCS_PTR_BYTE_DIFF(hole, frag_start); } else { ucs_list_del(&hole->list); } diff --git a/src/ucs/stats/libstats.c b/src/ucs/stats/libstats.c index c469d1567a0..fb65cece10d 100644 --- a/src/ucs/stats/libstats.c +++ b/src/ucs/stats/libstats.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "libstats.h" #include diff --git a/src/ucs/stats/serialization.c b/src/ucs/stats/serialization.c index a4eebb855fd..a79b6781986 100644 --- a/src/ucs/stats/serialization.c +++ b/src/ucs/stats/serialization.c @@ -1,10 +1,13 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "libstats.h" #include @@ -69,8 +72,8 @@ SGLIB_DEFINE_HASHED_CONTAINER_FUNCTIONS(ucs_stats_clsid_t, UCS_STATS_CLS_HASH_SI #define FREAD(_buf, _size, _stream) \ { \ - size_t nread = fread(_buf, 1, _size, _stream); \ - assert(nread == _size); \ + size_t _nread = fread(_buf, 1, _size, _stream); \ + assert(_nread == _size); \ } #define FWRITE(_buf, _size, _stream) \ @@ -178,7 +181,7 @@ static void ucs_stats_write_counters(ucs_stats_counter_t *counters, const unsigned counters_per_byte = 8 / UCS_STATS_BITS_PER_COUNTER; ucs_stats_counter_t value; uint8_t *counter_desc, v; - void *counter_data, *pos; + char *counter_data, *pos; size_t counter_desc_size; unsigned i; @@ -263,7 +266,7 @@ ucs_stats_serialize_binary(FILE *stream, ucs_stats_node_t *root, ucs_stats_class_t *cls; ucs_stats_clsid_t *elem; ucs_stats_data_header_t hdr; - unsigned index, counter; + unsigned idx, counter; sglib_hashed_ucs_stats_clsid_t_init(cls_hash); @@ -276,7 +279,7 @@ ucs_stats_serialize_binary(FILE *stream, ucs_stats_node_t *root, FWRITE_ONE(&hdr, stream); /* Write stats node classes */ - index = 0; + idx = 0; for (elem = sglib_hashed_ucs_stats_clsid_t_it_init(&it, cls_hash); elem != NULL; elem = sglib_hashed_ucs_stats_clsid_t_it_next(&it)) { @@ -286,10 +289,10 @@ ucs_stats_serialize_binary(FILE *stream, ucs_stats_node_t *root, for (counter = 0; counter < cls->num_counters; ++counter) { ucs_stats_write_str(cls->counter_names[counter], stream); } - elem->clsid = index++; + elem->clsid = idx++; } - assert(index == hdr.num_classes); + assert(idx == hdr.num_classes); /* Write stats nodes */ ucs_stats_serialize_binary_recurs(stream, root, sel, cls_hash); @@ -448,7 +451,7 @@ ucs_stats_deserialize_recurs(FILE *stream, ucs_stats_class_t **classes, return UCS_ERR_NO_MEMORY; } - node = ptr + headroom; + node = UCS_PTR_BYTE_OFFSET(ptr, headroom); node->cls = cls; FREAD(node->name, namelen, stream); @@ -571,7 +574,6 @@ static void 
ucs_stats_free_recurs(ucs_stats_node_t *node) } ucs_list_for_each_safe(child, tmp, &node->children[UCS_STATS_INACTIVE_CHILDREN], list) { ucs_stats_free_recurs(child); - free(child->cls); free(child); } } @@ -584,6 +586,5 @@ void ucs_stats_free(ucs_stats_node_t *root) ucs_stats_free_recurs(&s->node); ucs_stats_free_classes(s->classes, s->num_classes); free(s); - s = NULL; } diff --git a/src/ucs/stats/stats.c b/src/ucs/stats/stats.c index a1bff0db43c..802036a7326 100644 --- a/src/ucs/stats/stats.c +++ b/src/ucs/stats/stats.c @@ -1,6 +1,5 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2013. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -17,9 +16,12 @@ #include #include #include +#include #include +#ifdef HAVE_LINUX_FUTEX_H #include +#endif const char *ucs_stats_formats_names[] = { [UCS_STATS_FULL] = "full", @@ -28,7 +30,7 @@ const char *ucs_stats_formats_names[] = { [UCS_STATS_LAST] = NULL }; -#if ENABLE_STATS +#ifdef ENABLE_STATS enum { UCS_STATS_FLAG_ON_EXIT = UCS_BIT(0), @@ -46,6 +48,8 @@ enum { UCS_ROOT_STATS_LAST }; +KHASH_MAP_INIT_STR(ucs_stats_cls, ucs_stats_class_t*) + typedef struct { volatile unsigned flags; @@ -64,7 +68,12 @@ typedef struct { double interval; }; + khash_t(ucs_stats_cls) cls; + pthread_mutex_t lock; +#ifndef HAVE_LINUX_FUTEX_H + pthread_cond_t cv; +#endif pthread_t thread; } ucs_stats_context_t; @@ -73,7 +82,10 @@ static ucs_stats_context_t ucs_stats_context = { .root_node = {}, .root_filter_node = {}, .lock = PTHREAD_MUTEX_INITIALIZER, - .thread = 0xfffffffful +#ifndef HAVE_LINUX_FUTEX_H + .cv = PTHREAD_COND_INITIALIZER, +#endif + .thread = (pthread_t)-1 }; static ucs_stats_class_t ucs_stats_root_node_class = { @@ -85,12 +97,14 @@ static ucs_stats_class_t ucs_stats_root_node_class = { }; +#ifdef HAVE_LINUX_FUTEX_H static inline int ucs_sys_futex(volatile void *addr1, int op, int val1, struct timespec *timeout, void *uaddr2, int val3) { return 
syscall(SYS_futex, addr1, op, val1, timeout, uaddr2, val3); } +#endif static void ucs_stats_clean_node(ucs_stats_node_t *node) { ucs_stats_filter_node_t * temp_filter_node; @@ -113,34 +127,75 @@ static void ucs_stats_clean_node(ucs_stats_node_t *node) { ucs_list_del(&node->type_list); } -static ucs_stats_class_t* ucs_stats_node_clone_cls(ucs_stats_class_t *orig_cls) +static void ucs_stats_free_class(ucs_stats_class_t *cls) { - size_t i; - size_t size = sizeof(ucs_stats_class_t) + strlen(orig_cls->name) + 1; - for (i = 0; i < orig_cls->num_counters; i++) { - size += sizeof(char*) + strlen(orig_cls->counter_names[i]) + 1; + unsigned i; + + for (i = 0; i < cls->num_counters; i++) { + ucs_free((void*)cls->counter_names[i]); } - ucs_stats_class_t *clone_cls = ucs_malloc(size, "stats_cls"); - if (!clone_cls) { - return NULL; + ucs_free((void*)cls->name); + ucs_free(cls); +} + +static ucs_stats_class_t *ucs_stats_dup_class(ucs_stats_class_t *cls) +{ + ucs_stats_class_t *class_dup; + + class_dup = ucs_calloc(1, sizeof(*cls) + sizeof(*cls->counter_names) * cls->num_counters, + "ucs_stats_class_dup"); + if (!class_dup) { + ucs_error("failed to allocate statistics class"); + goto err; + } + + class_dup->name = ucs_strdup(cls->name, "ucs_stats_class_t name"); + if (!class_dup->name) { + ucs_error("failed to allocate statistics class name"); + goto err_free; + } + + for (class_dup->num_counters = 0; + class_dup->num_counters < cls->num_counters; + class_dup->num_counters++) { + class_dup->counter_names[class_dup->num_counters] = + ucs_strdup(cls->counter_names[class_dup->num_counters], + "ucs_stats_class_t counter"); + if (!class_dup->counter_names[class_dup->num_counters]) { + ucs_error("failed to allocate statistics counter name"); + goto err_free; + } } - char *write_iterator = (char*)clone_cls + sizeof(ucs_stats_class_t) + - (orig_cls->num_counters * sizeof(char*)); - clone_cls->name = write_iterator; - strcpy(write_iterator, orig_cls->name); - write_iterator += 
strlen(orig_cls->name) + 1; + return class_dup; - for (i = 0; i < orig_cls->num_counters; i++) { - clone_cls->counter_names[i] = write_iterator; - strcpy(write_iterator, orig_cls->counter_names[i]); - write_iterator += strlen(orig_cls->counter_names[i]) + 1; +err_free: + ucs_stats_free_class(class_dup); +err: + return NULL; +} + +static ucs_stats_class_t *ucs_stats_get_class(ucs_stats_class_t *cls) +{ + ucs_stats_class_t *class_dup; + khiter_t iter; + int r; + + iter = kh_get(ucs_stats_cls, &ucs_stats_context.cls, cls->name); + if (iter != kh_end(&ucs_stats_context.cls)) { + return kh_val(&ucs_stats_context.cls, iter); + } + + class_dup = ucs_stats_dup_class(cls); + if (class_dup == NULL) { + return NULL; } - clone_cls->num_counters = orig_cls->num_counters; - ucs_assert(write_iterator - size == (char*)clone_cls); - return clone_cls; + iter = kh_put(ucs_stats_cls, &ucs_stats_context.cls, class_dup->name, &r); + ucs_assert_always(r != 0); /* initialize a previously empty hash entry */ + kh_val(&ucs_stats_context.cls, iter) = class_dup; + return class_dup; } static void ucs_stats_node_remove(ucs_stats_node_t *node, int make_inactive) @@ -156,8 +211,14 @@ static void ucs_stats_node_remove(ucs_stats_node_t *node, int make_inactive) ucs_list_del(&node->list); if (make_inactive) { - ucs_list_add_tail(&node->parent->children[UCS_STATS_INACTIVE_CHILDREN], &node->list); - node->cls = ucs_stats_node_clone_cls(node->cls); + node->cls = ucs_stats_get_class(node->cls); + if (node->cls) { + ucs_list_add_tail(&node->parent->children[UCS_STATS_INACTIVE_CHILDREN], &node->list); + } else { + /* failed to allocate class duplicate - remove node */ + ucs_stats_clean_node(node); + make_inactive = 0; + } } else { ucs_stats_clean_node(node); } @@ -167,10 +228,8 @@ static void ucs_stats_node_remove(ucs_stats_node_t *node, int make_inactive) if (!make_inactive) { if (!node->filter_node->type_list_len) { ucs_free(node->filter_node); - node->filter_node = NULL; } ucs_free(node); - node = 
NULL; } } @@ -313,9 +372,9 @@ static void ucs_stats_add_to_filter(ucs_stats_node_t *node, } } -static int ucs_stats_node_add(ucs_stats_node_t *node, - ucs_stats_node_t *parent, - ucs_stats_filter_node_t *filter_node) +static ucs_status_t ucs_stats_node_add(ucs_stats_node_t *node, + ucs_stats_node_t *parent, + ucs_stats_filter_node_t *filter_node) { ucs_assert(node != &ucs_stats_context.root_node); if (parent == NULL) { @@ -450,6 +509,12 @@ static void* ucs_stats_thread_func(void *arg) ptime = NULL; } + /* + * TODO: Switch to use the condvar on all systems, eliminating + * futexes. For now it is kept conditionally to not commit the + * change, runtime-untested on FreeBSD, to working Linux codebase. + */ +#ifdef HAVE_LINUX_FUTEX_H flags = ucs_stats_context.flags; while (flags & UCS_STATS_FLAG_ON_TIMER) { /* Wait for timeout/wakeup */ @@ -457,6 +522,18 @@ static void* ucs_stats_thread_func(void *arg) ucs_stats_dump(); flags = ucs_stats_context.flags; } +#else + pthread_mutex_lock(&ucs_stats_context.lock); + flags = ucs_stats_context.flags; + while (flags & UCS_STATS_FLAG_ON_TIMER) { + /* Wait for timeout/wakeup */ + pthread_cond_timedwait(&ucs_stats_context.cv, &ucs_stats_context.lock, + ptime); + __ucs_stats_dump(0); + flags = ucs_stats_context.flags; + } + pthread_mutex_unlock(&ucs_stats_context.lock); +#endif return NULL; } @@ -469,23 +546,29 @@ static void ucs_stats_open_dest() const char *next_token; int need_close; + copy_str = NULL; if (!strncmp(ucs_global_opts.stats_dest, "udp:", 4)) { - copy_str = strdupa(&ucs_global_opts.stats_dest[4]); + copy_str = ucs_strdup(&ucs_global_opts.stats_dest[4], + "statistics dest"); + if (copy_str == NULL) { + return; + } + saveptr = NULL; hostname = strtok_r(copy_str, ":", &saveptr); port_str = strtok_r(NULL, ":", &saveptr); if (hostname == NULL) { ucs_error("Invalid statistics destination format (%s)", ucs_global_opts.stats_dest); - return; + goto out_free; } status = ucs_stats_client_init(hostname, port_str ? 
atoi(port_str) : UCS_STATS_DEFAULT_UDP_PORT, &ucs_stats_context.client); if (status != UCS_OK) { - return; + goto out_free; } ucs_stats_context.flags |= UCS_STATS_FLAG_SOCKET; @@ -493,9 +576,9 @@ static void ucs_stats_open_dest() status = ucs_open_output_stream(ucs_global_opts.stats_dest, UCS_LOG_LEVEL_ERROR, &ucs_stats_context.stream, - &need_close, &next_token); + &need_close, &next_token, NULL); if (status != UCS_OK) { - return; + goto out_free; } /* File flags */ @@ -509,6 +592,9 @@ static void ucs_stats_open_dest() ucs_stats_context.flags |= UCS_STATS_FLAG_STREAM_BINARY; } } + +out_free: + ucs_free(copy_str); } static void ucs_stats_close_dest() @@ -568,11 +654,23 @@ static void ucs_stats_unset_trigger() { void *result; +#ifdef HAVE_LINUX_FUTEX_H if (ucs_stats_context.flags & UCS_STATS_FLAG_ON_TIMER) { ucs_stats_context.flags &= ~UCS_STATS_FLAG_ON_TIMER; ucs_sys_futex(&ucs_stats_context.flags, FUTEX_WAKE, 1, NULL, NULL, 0); pthread_join(ucs_stats_context.thread, &result); } +#else + pthread_mutex_lock(&ucs_stats_context.lock); + if (ucs_stats_context.flags & UCS_STATS_FLAG_ON_TIMER) { + ucs_stats_context.flags &= ~UCS_STATS_FLAG_ON_TIMER; + pthread_cond_broadcast(&ucs_stats_context.cv); + pthread_mutex_unlock(&ucs_stats_context.lock); + pthread_join(ucs_stats_context.thread, &result); + } else { + pthread_mutex_unlock(&ucs_stats_context.lock); + } +#endif if (ucs_stats_context.flags & UCS_STATS_FLAG_ON_EXIT) { ucs_debug("dumping stats"); @@ -597,7 +695,6 @@ static void ucs_stats_clean_node_recurs(ucs_stats_node_t *node) ucs_list_for_each_safe(child, tmp, &node->children[UCS_STATS_INACTIVE_CHILDREN], list) { ucs_stats_clean_node_recurs(child); - ucs_free(child->cls); ucs_stats_node_remove(child, 0); } } @@ -615,6 +712,7 @@ void ucs_stats_init() UCS_STATS_START_TIME(ucs_stats_context.start_time); ucs_stats_node_init_root("%s:%d", ucs_get_host_name(), getpid()); ucs_stats_set_trigger(); + kh_init_inplace(ucs_stats_cls, &ucs_stats_context.cls); 
ucs_debug("statistics enabled, flags: %c%c%c%c%c%c%c", (ucs_stats_context.flags & UCS_STATS_FLAG_ON_TIMER) ? 't' : '-', @@ -628,6 +726,8 @@ void ucs_stats_init() void ucs_stats_cleanup() { + ucs_stats_class_t *cls; + if (!ucs_stats_is_active()) { return; } @@ -636,6 +736,12 @@ void ucs_stats_cleanup() ucs_stats_clean_node_recurs(&ucs_stats_context.root_node); ucs_stats_close_dest(); ucs_assert(ucs_stats_context.flags == 0); + + kh_foreach_value(&ucs_stats_context.cls, cls, { + ucs_stats_free_class(cls); + }); + + kh_destroy_inplace(ucs_stats_cls, &ucs_stats_context.cls); } void ucs_stats_dump() diff --git a/src/ucs/stats/stats.h b/src/ucs/stats/stats.h index a95d12a6ada..6b58d5235c7 100644 --- a/src/ucs/stats/stats.h +++ b/src/ucs/stats/stats.h @@ -13,6 +13,7 @@ #endif #include +#include BEGIN_C_DECLS @@ -23,7 +24,7 @@ void ucs_stats_cleanup(); void ucs_stats_dump(); int ucs_stats_is_active(); #include "stats_fwd.h" -#if ENABLE_STATS +#ifdef ENABLE_STATS #include "libstats.h" @@ -44,7 +45,7 @@ void ucs_stats_node_free(ucs_stats_node_t *node); #define UCS_STATS_RVAL(_rval) _rval #define UCS_STATS_NODE_DECLARE(_node) \ - ucs_stats_node_t* _node + ucs_stats_node_t* _node; #define UCS_STATS_NODE_ALLOC(_p_node, _class, _parent, ...) \ ucs_stats_node_alloc(_p_node, _class, _parent, ## __VA_ARGS__ , "") @@ -98,7 +99,7 @@ void ucs_stats_node_free(ucs_stats_node_t *node); #define UCS_STATS_ARG(_arg) #define UCS_STATS_RVAL(_rval) NULL #define UCS_STATS_NODE_DECLARE(_node) -#define UCS_STATS_NODE_ALLOC(_p_node, _class, _parent, ...) UCS_OK +#define UCS_STATS_NODE_ALLOC(_p_node, _class, _parent, ...) 
ucs_empty_function_return_success() #define UCS_STATS_NODE_FREE(_node) #define UCS_STATS_UPDATE_COUNTER(_node, _index, _delta) #define UCS_STATS_SET_COUNTER(_node, _index, _value) diff --git a/src/ucs/stats/stats_parser.c b/src/ucs/stats/stats_parser.c index 59b267e8812..b2fb5718298 100644 --- a/src/ucs/stats/stats_parser.c +++ b/src/ucs/stats/stats_parser.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "stats.h" /* diff --git a/src/ucs/stats/stats_reader.c b/src/ucs/stats/stats_reader.c index f82d18f4933..432b04b1141 100644 --- a/src/ucs/stats/stats_reader.c +++ b/src/ucs/stats/stats_reader.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "libstats.h" #include diff --git a/src/ucs/sys/checker.h b/src/ucs/sys/checker.h index 8fe0fdef60d..4a1570144f2 100644 --- a/src/ucs/sys/checker.h +++ b/src/ucs/sys/checker.h @@ -43,7 +43,7 @@ /* * BullsEye Code Coverage tool */ -#if _BullseyeCoverage +#ifdef _BullseyeCoverage #define BULLSEYE_ON 1 #define BULLSEYE_EXCLUDE_START #pragma BullseyeCoverage off #define BULLSEYE_EXCLUDE_END #pragma BullseyeCoverage on diff --git a/src/ucs/sys/compiler.h b/src/ucs/sys/compiler.h index 00fc374da96..96cc7fc0200 100644 --- a/src/ucs/sys/compiler.h +++ b/src/ucs/sys/compiler.h @@ -17,7 +17,9 @@ #include #include #include +#ifdef HAVE_ALLOCA_H #include +#endif #ifndef ULLONG_MAX #define ULLONG_MAX (__LONG_LONG_MAX__ * 2ULL + 1) @@ -47,10 +49,10 @@ */ #define UCS_WORD_COPY(_dst_type, _dst, _src_type, _src, _size) \ { \ - unsigned i; \ + unsigned _i; \ UCS_STATIC_ASSERT(sizeof(_src_type) == sizeof(_dst_type)); \ - for (i = 0; i < (_size) / sizeof(_src_type); ++i) { \ - *((_dst_type*)(_dst) + i) = *((_src_type*)(_src) + i); \ + for (_i = 0; _i < (_size) / sizeof(_src_type); ++_i) { \ + *((_dst_type*)(_dst) + _i) = *((_src_type*)(_src) + _i); \ } \ } @@ -66,7 +68,7 @@ }) /** - * suppress unaligned pointer warning 
(actual on armclang5 platform) + * suppress unaligned pointer warning */ #define ucs_unaligned_ptr(_ptr) ({void *_p = (void*)(_ptr); _p;}) @@ -78,24 +80,10 @@ */ #define UCS_CACHELINE_PADDING(...) \ char UCS_PP_APPEND_UNIQUE_ID(pad)[UCS_SYS_CACHE_LINE_SIZE - \ - UCS_CACHELINE_PADDING_MISALIGN(__VA_ARGS__)]; + UCS_CACHELINE_PADDING_MISALIGN(__VA_ARGS__)] #define UCS_CACHELINE_PADDING_SIZEOF(_, _x) \ + sizeof(_x) #define UCS_CACHELINE_PADDING_MISALIGN(...) \ ((UCS_PP_FOREACH(UCS_CACHELINE_PADDING_SIZEOF, _, __VA_ARGS__)) % UCS_SYS_CACHE_LINE_SIZE) - -/* - * Define code which runs at global constructor phase - */ -#define UCS_STATIC_INIT \ - static void UCS_F_CTOR UCS_PP_APPEND_UNIQUE_ID(ucs_initializer)() - - -/* - * Define code which runs at global destructor phase - */ -#define UCS_STATIC_CLEANUP \ - static void UCS_F_DTOR UCS_PP_APPEND_UNIQUE_ID(ucs_initializer)() - #endif diff --git a/src/ucs/sys/compiler_def.h b/src/ucs/sys/compiler_def.h index e4d04fdcc7f..a989ba7902a 100644 --- a/src/ucs/sys/compiler_def.h +++ b/src/ucs/sys/compiler_def.h @@ -5,7 +5,6 @@ * See file LICENSE for terms. 
*/ - #ifndef UCS_COMPILER_DEF_H #define UCS_COMPILER_DEF_H @@ -49,6 +48,9 @@ /* Silence "defined but not used" error for static function */ #define UCS_F_MAYBE_UNUSED __attribute__((used)) +/* Non-null return */ +#define UCS_F_NON_NULL __attribute__((nonnull)) + /* Always inline the function */ #ifdef __GNUC__ #define UCS_F_ALWAYS_INLINE inline __attribute__ ((always_inline)) @@ -90,7 +92,16 @@ /* Helper macro for address arithmetic in bytes */ #define UCS_PTR_BYTE_OFFSET(_ptr, _offset) \ - ((void *)((uintptr_t)(_ptr) + (_offset))) + ((void *)((intptr_t)(_ptr) + (intptr_t)(_offset))) + +/* Helper macro to calculate an address with offset equal to size of _type */ +#define UCS_PTR_TYPE_OFFSET(_ptr, _type) \ + ((void *)((typeof(_type) *)(_ptr) + 1)) + +/* Helper macro to calculate ptr difference (_end - _start) */ +#define UCS_PTR_BYTE_DIFF(_start, _end) \ + ((ptrdiff_t)((uintptr_t)(_end) - (uintptr_t)(_start))) + /** * Size of statically-declared array @@ -101,6 +112,12 @@ ( sizeof(_array) / sizeof((_array)[0]) ); \ }) +/** + * @return count of elements in const-size array + */ +#define ucs_array_size(_array) \ + (sizeof(_array) / sizeof((_array)[0])) + /** * @return Offset of _member in _type. _type is a structure type. */ @@ -132,11 +149,23 @@ }) /** - * @return Size of _member in _type. _type is a structure type. + * @param _type Structure type. + * @param _field Field of structure. + * + * @return Size of _field in _type. */ #define ucs_field_sizeof(_type, _field) \ sizeof(((_type*)0)->_field) +/** + * @param _type Structure type. + * @param _field Field of structure. + * + * @return Type of _field in _type. 
+ */ +#define ucs_field_type(_type, _field) \ + typeof(((_type*)0)->_field) + /** * Prevent compiler from reordering instructions */ @@ -154,4 +183,16 @@ /* Check if an expression is a compile-time constant */ #define ucs_is_constant(expr) __builtin_constant_p(expr) +/* + * Define code which runs at global constructor phase + */ +#define UCS_STATIC_INIT \ + static void UCS_F_CTOR UCS_PP_APPEND_UNIQUE_ID(ucs_initializer_ctor)() + +/* + * Define code which runs at global destructor phase + */ +#define UCS_STATIC_CLEANUP \ + static void UCS_F_DTOR UCS_PP_APPEND_UNIQUE_ID(ucs_initializer_dtor)() + #endif /* UCS_COMPILER_DEF_H */ diff --git a/src/ucs/sys/event_set.c b/src/ucs/sys/event_set.c new file mode 100644 index 00000000000..06de682f978 --- /dev/null +++ b/src/ucs/sys/event_set.c @@ -0,0 +1,234 @@ +/** + * Copyright (C) Hiroyuki Sato. 2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "event_set.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +enum { + UCS_SYS_EVENT_SET_EXTERNAL_EVENT_FD = UCS_BIT(0), +}; + +struct ucs_sys_event_set { + int event_fd; + unsigned flags; +}; + +const unsigned ucs_sys_event_set_max_wait_events = + UCS_ALLOCA_MAX_SIZE / sizeof(struct epoll_event); + + +static inline int ucs_event_set_map_to_raw_events(int events) +{ + int raw_events = 0; + + if (events & UCS_EVENT_SET_EVREAD) { + raw_events |= EPOLLIN; + } + if (events & UCS_EVENT_SET_EVWRITE) { + raw_events |= EPOLLOUT; + } + if (events & UCS_EVENT_SET_EVERR) { + raw_events |= EPOLLERR; + } + if (events & UCS_EVENT_SET_EDGE_TRIGGERED) { + raw_events |= EPOLLET; + } + return raw_events; +} + +static inline int ucs_event_set_map_to_events(int raw_events) +{ + int events = 0; + + if (raw_events & EPOLLIN) { + events |= UCS_EVENT_SET_EVREAD; + } + if (raw_events & EPOLLOUT) { + events |= UCS_EVENT_SET_EVWRITE; + } + if (raw_events & EPOLLERR) { + events 
|= UCS_EVENT_SET_EVERR; + } + if (raw_events & EPOLLET) { + events |= UCS_EVENT_SET_EDGE_TRIGGERED; + } + return events; +} + +static ucs_sys_event_set_t *ucs_event_set_alloc(int event_fd, unsigned flags) +{ + ucs_sys_event_set_t *event_set; + + event_set = ucs_malloc(sizeof(ucs_sys_event_set_t), "ucs_sys_event_set"); + if (event_set == NULL) { + ucs_error("unable to allocate memory ucs_sys_event_set_t object"); + return NULL; + } + + event_set->flags = flags; + event_set->event_fd = event_fd; + return event_set; +} + +ucs_status_t ucs_event_set_create_from_fd(ucs_sys_event_set_t **event_set_p, + int event_fd) +{ + *event_set_p = ucs_event_set_alloc(event_fd, + UCS_SYS_EVENT_SET_EXTERNAL_EVENT_FD); + if (*event_set_p == NULL) { + return UCS_ERR_NO_MEMORY; + } + + return UCS_OK; +} + +ucs_status_t ucs_event_set_create(ucs_sys_event_set_t **event_set_p) +{ + ucs_status_t status; + int event_fd; + + /* Create epoll set the thread will wait on */ + event_fd = epoll_create(1); + if (event_fd < 0) { + ucs_error("epoll_create() failed: %m"); + return UCS_ERR_IO_ERROR; + } + + *event_set_p = ucs_event_set_alloc(event_fd, 0); + if (*event_set_p == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err_close_event_fd; + } + + return UCS_OK; + +err_close_event_fd: + close(event_fd); + return status; +} + +ucs_status_t ucs_event_set_add(ucs_sys_event_set_t *event_set, int fd, + ucs_event_set_type_t events, void *callback_data) +{ + struct epoll_event raw_event; + int ret; + + memset(&raw_event, 0, sizeof(raw_event)); + raw_event.events = ucs_event_set_map_to_raw_events(events); + raw_event.data.ptr = callback_data; + + ret = epoll_ctl(event_set->event_fd, EPOLL_CTL_ADD, fd, &raw_event); + if (ret < 0) { + ucs_error("epoll_ctl(event_fd=%d, ADD, fd=%d) failed: %m", + event_set->event_fd, fd); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +ucs_status_t ucs_event_set_mod(ucs_sys_event_set_t *event_set, int fd, + ucs_event_set_type_t events, void *callback_data) +{ + struct 
epoll_event raw_event; + int ret; + + memset(&raw_event, 0, sizeof(raw_event)); + raw_event.events = ucs_event_set_map_to_raw_events(events); + raw_event.data.ptr = callback_data; + + ret = epoll_ctl(event_set->event_fd, EPOLL_CTL_MOD, fd, &raw_event); + if (ret < 0) { + ucs_error("epoll_ctl(event_fd=%d, MOD, fd=%d) failed: %m", + event_set->event_fd, fd); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +ucs_status_t ucs_event_set_del(ucs_sys_event_set_t *event_set, int fd) +{ + int ret; + + ret = epoll_ctl(event_set->event_fd, EPOLL_CTL_DEL, fd, NULL); + if (ret < 0) { + ucs_error("epoll_ctl(event_fd=%d, DEL, fd=%d) failed: %m", + event_set->event_fd, fd); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +ucs_status_t ucs_event_set_wait(ucs_sys_event_set_t *event_set, + unsigned *num_events, int timeout_ms, + ucs_event_set_handler_t event_set_handler, + void *arg) +{ + struct epoll_event *events; + int nready, i, io_events; + + ucs_assert(event_set_handler != NULL); + ucs_assert(num_events != NULL); + ucs_assert(*num_events <= ucs_sys_event_set_max_wait_events); + + events = ucs_alloca(sizeof(*events) * *num_events); + + nready = epoll_wait(event_set->event_fd, events, *num_events, timeout_ms); + if (ucs_unlikely(nready < 0)) { + *num_events = 0; + if (errno == EINTR) { + return UCS_INPROGRESS; + } + ucs_error("epoll_wait() failed: %m"); + return UCS_ERR_IO_ERROR; + } + + ucs_assert(nready <= *num_events); + ucs_trace_poll("epoll_wait(event_fd=%d, num_events=%u, timeout=%d) " + "returned %u", + event_set->event_fd, *num_events, timeout_ms, nready); + + for (i = 0; i < nready; i++) { + io_events = ucs_event_set_map_to_events(events[i].events); + event_set_handler(events[i].data.ptr, io_events, arg); + } + + *num_events = nready; + return UCS_OK; +} + +void ucs_event_set_cleanup(ucs_sys_event_set_t *event_set) +{ + if (!(event_set->flags & UCS_SYS_EVENT_SET_EXTERNAL_EVENT_FD)) { + close(event_set->event_fd); + } + ucs_free(event_set); +} + +ucs_status_t 
ucs_event_set_fd_get(ucs_sys_event_set_t *event_set, + int *event_fd_p) +{ + ucs_assert(event_set != NULL); + *event_fd_p = event_set->event_fd; + return UCS_OK; +} diff --git a/src/ucs/sys/event_set.h b/src/ucs/sys/event_set.h new file mode 100644 index 00000000000..b333d208d29 --- /dev/null +++ b/src/ucs/sys/event_set.h @@ -0,0 +1,141 @@ +/** + * Copyright (C) Hiroyuki Sato. 2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCS_EVENT_SET_H +#define UCS_EVENT_SET_H + +#include + +/** + * ucs_sys_event_set_t structure used in ucs_event_set_XXX functions. + * + */ +typedef struct ucs_sys_event_set ucs_sys_event_set_t; + + +/** + * ucs_event_set_handler call this handler for notifying event + * + * @param [in] callback_data User data which set in ucs_event_set_add(). + * @param [in] event Detection event. Sets of ucs_event_set_type_t. + * @param [in] arg User data which set in ucs_event_set_wait(). + * + */ +typedef void (*ucs_event_set_handler_t)(void *callback_data, int event, + void *arg); + +/** + * ucs_event_set_type_t member is a bit set composed using the following + * available event types + */ +typedef enum { + UCS_EVENT_SET_EVREAD = UCS_BIT(0), + UCS_EVENT_SET_EVWRITE = UCS_BIT(1), + UCS_EVENT_SET_EVERR = UCS_BIT(2), + UCS_EVENT_SET_EDGE_TRIGGERED = UCS_BIT(3) +} ucs_event_set_type_t; + +/* The maximum possible number of events based on system constraints */ +extern const unsigned ucs_sys_event_set_max_wait_events; + +/** + * Allocate ucs_sys_event_set_t structure and assign provided file + * descriptor to wait for events on. + * + * @param [out] event_set_p Event set pointer to initialize. + * @param [in] event_fd File descriptor to wait for events on. + * + * @return UCS_OK on success or an error code on failure. + */ +ucs_status_t ucs_event_set_create_from_fd(ucs_sys_event_set_t **event_set_p, + int event_fd); + +/** + * Allocate ucs_sys_event_set_t structure. + * + * @param [out] event_set_p Event set pointer to initialize. 
+ * + * @return UCS_OK on success or an error code on failure. + */ +ucs_status_t ucs_event_set_create(ucs_sys_event_set_t **event_set_p); + +/** + * Register the target event. + * + * @param [in] event_set_p Event set pointer to initialize. + * @param [in] fd Register the target file descriptor fd. + * @param [in] events Operation events. + * @param [in] callback_data ucs_event_set_handler_t accepts this data. + * + * @return UCS_OK on success or an error code on failure. + */ +ucs_status_t ucs_event_set_add(ucs_sys_event_set_t *event_set, int fd, + ucs_event_set_type_t events, + void *callback_data); + +/** + * Modify the target event. + * + * @param [in] event_set Event set created by ucs_event_set_create. + * @param [in] fd Register the target file descriptor fd. + * @param [in] events Operation events. + * @param [in] callback_data ucs_event_set_handler_t accepts this data. + * + * @return UCS_OK on success or an error code on failure. + */ +ucs_status_t ucs_event_set_mod(ucs_sys_event_set_t *event_set, int fd, + ucs_event_set_type_t events, + void *callback_data); + +/** + * Remove the target event. + * + * @param [in] event_set Event set created by ucs_event_set_create. + * @param [in] fd Register the target file descriptor fd. + * + * @return UCS_OK on success or an error code on failure. + */ +ucs_status_t ucs_event_set_del(ucs_sys_event_set_t *event_set, int fd); + +/** + * Wait for an I/O events + * + * @param [in] event_set Event set created by ucs_event_set_create. + * @param [in/out] num_events Number of expected/read events. + * @param [in] timeout_ms Timeout period in ms. + * @param [in] event_set_handler Callback functions. + * @param [in] arg User data variables. + * + * @return return UCS_OK on success, UCS_INPROGRESS - call was interrupted by a + * signal handler, UCS_ERR_IO_ERROR - an error occurred during waiting + * for I/O events. 
+ */ +ucs_status_t ucs_event_set_wait(ucs_sys_event_set_t *event_set, + unsigned *num_events, int timeout_ms, + ucs_event_set_handler_t event_set_handler, + void *arg); + +/** + * Cleanup event set + * + * @param [in] event_set Event set created by ucs_event_set_create. + * + */ +void ucs_event_set_cleanup(ucs_sys_event_set_t *event_set); + +/** + * Get file descriptor for watching events. + * + * @param [in] event_set Event set created by ucs_event_set_create. + * @param [out] event_fd_p File descriptor that is used by Event set to wait + * for events on. + * + * @return UCS_OK on success or an error code on failure. + */ +ucs_status_t ucs_event_set_fd_get(ucs_sys_event_set_t *event_set, + int *event_fd_p); + +#endif diff --git a/src/ucs/sys/init.c b/src/ucs/sys/init.c index 9ab9326ba6b..1e0739b91c2 100644 --- a/src/ucs/sys/init.c +++ b/src/ucs/sys/init.c @@ -1,10 +1,14 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include #include @@ -15,6 +19,7 @@ #include #include #include +#include /* run-time CPU detection */ @@ -79,14 +84,16 @@ static void UCS_F_CTOR ucs_init() ucs_check_cpu_flags(); ucs_log_early_init(); /* Must be called before all others */ ucs_global_opts_init(); + ucs_cpu_init(); ucs_log_init(); -#if ENABLE_STATS +#ifdef ENABLE_STATS ucs_stats_init(); #endif ucs_memtrack_init(); ucs_debug_init(); ucs_profile_global_init(); ucs_async_global_init(); + ucs_topo_init(); ucs_debug("%s loaded at 0x%lx", ucs_debug_get_lib_path(), ucs_debug_get_lib_base_addr()); ucs_debug("cmd line: %s", ucs_get_process_cmdline()); @@ -94,11 +101,12 @@ static void UCS_F_CTOR ucs_init() static void UCS_F_DTOR ucs_cleanup(void) { + ucs_topo_cleanup(); ucs_async_global_cleanup(); ucs_profile_global_cleanup(); ucs_debug_cleanup(0); ucs_memtrack_cleanup(); -#if ENABLE_STATS +#ifdef ENABLE_STATS ucs_stats_cleanup(); #endif ucs_log_cleanup(); diff --git a/src/ucs/sys/iovec.c b/src/ucs/sys/iovec.c new file mode 100644 index 00000000000..211550df2d7 --- /dev/null +++ b/src/ucs/sys/iovec.c @@ -0,0 +1,110 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include + +#include +#include +/* Need this to get IOV_MAX on some platforms. 
*/ +#ifndef __need_IOV_MAX +#define __need_IOV_MAX +#endif +#include + + +size_t ucs_iov_copy(const struct iovec *iov, size_t iov_cnt, + size_t iov_offset, void *buf, size_t max_copy, + ucs_iov_copy_direction_t dir) +{ + size_t copied = 0; + char *iov_buf; + size_t i, len; + + for (i = 0; (i < iov_cnt) && max_copy; i++) { + len = iov[i].iov_len; + + if (iov_offset > len) { + iov_offset -= len; + continue; + } + + iov_buf = UCS_PTR_BYTE_OFFSET(iov[i].iov_base, iov_offset); + len -= iov_offset; + + len = ucs_min(len, max_copy); + if (dir == UCS_IOV_COPY_FROM_BUF) { + memcpy(iov_buf, UCS_PTR_BYTE_OFFSET(buf, copied), len); + } else if (dir == UCS_IOV_COPY_TO_BUF) { + memcpy(UCS_PTR_BYTE_OFFSET(buf, copied), iov_buf, len); + } + + iov_offset = 0; + max_copy -= len; + copied += len; + } + + return copied; +} + +void ucs_iov_advance(struct iovec *iov, size_t iov_cnt, + size_t *cur_iov_idx, size_t consumed) +{ + size_t i; + + ucs_assert(*cur_iov_idx <= iov_cnt); + + for (i = *cur_iov_idx; i < iov_cnt; i++) { + if (consumed < iov[i].iov_len) { + iov[i].iov_len -= consumed; + iov[i].iov_base = UCS_PTR_BYTE_OFFSET(iov[i].iov_base, + consumed); + *cur_iov_idx = i; + return; + } + + consumed -= iov[i].iov_len; + iov[i].iov_base = UCS_PTR_BYTE_OFFSET(iov[i].iov_base, + iov[i].iov_len); + iov[i].iov_len = 0; + } + + ucs_assert(!consumed && (i == iov_cnt)); +} + +size_t ucs_iov_get_max() +{ + static int max_iov = -1; + +#ifdef _SC_IOV_MAX + if (max_iov != -1) { + return max_iov; + } + + max_iov = sysconf(_SC_IOV_MAX); + if (max_iov != -1) { + return max_iov; + } + /* if unable to get value from sysconf(), + * use a predefined value */ +#endif + +#if defined(IOV_MAX) + max_iov = IOV_MAX; +#elif defined(UIO_MAXIOV) + max_iov = UIO_MAXIOV; +#else + /* The value is used as a fallback when system value is not available. 
+ * The latest kernels define it as 1024 */ + max_iov = 1024; +#endif + + return max_iov; +} diff --git a/src/ucs/sys/iovec.h b/src/ucs/sys/iovec.h new file mode 100644 index 00000000000..db2c8eadecc --- /dev/null +++ b/src/ucs/sys/iovec.h @@ -0,0 +1,73 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCS_IOVEC_H +#define UCS_IOVEC_H + +#include + +#include +#include + +BEGIN_C_DECLS + +/* A direction for copying a data to/from an array of iovec elements */ +typedef enum ucs_iov_copy_direction { + UCS_IOV_COPY_TO_BUF, + UCS_IOV_COPY_FROM_BUF +} ucs_iov_copy_direction_t; + + +/* An iterator that should be used by IOV convertor in order to save + * information about the current offset in the destination IOV array */ +typedef struct ucs_iov_iter { + size_t iov_index; /* The current index in iov array */ + size_t buffer_offset; /* The current offset in the buffer of the + * current iov element */ +} ucs_iov_iter_t; + + +/** + * Copy a data from iovec [buffer] to buffer [iovec]. + * + * @param [in] iov A pointer to an array of iovec elements. + * @param [in] iov_cnt A number of elements in a iov array. + * @param [in] iov_offset An offset in a iov array. + * @param [in] buf A buffer that should be used for copying a data. + * @param [in] max_copye A maximum amount of data that should be copied. + * @param [in] dir Direction that specifies destination and source. + * + * @return The amount, in bytes, of the data that was copied. + */ +size_t ucs_iov_copy(const struct iovec *iov, size_t iov_cnt, + size_t iov_offset, void *buf, size_t max_copy, + ucs_iov_copy_direction_t dir); + +/** + * Update an array of iovec elements to consider an already consumed data. + * + * @param [in] iov A pointer to an array of iovec elements. + * @param [in] iov_cnt A number of elements in a iov array. 
+ * @param [in/out] cur_iov_idx A pointer to an index in a iov array from + * which the operation should be started. + * @param [in] consumed An amount of data consumed that should be + * considered in a current iov array. + */ +void ucs_iov_advance(struct iovec *iov, size_t iov_cnt, + size_t *cur_iov_idx, size_t consumed); + +/** + * Returns the maximum possible value for the number of IOVs. + * It maybe either value from the system configuration or IOV_MAX + * value or UIO_MAXIOV value or 1024 if nothing is defined. + * + * @return The maximum number of IOVs. + */ +size_t ucs_iov_get_max(); + +END_C_DECLS + +#endif diff --git a/src/ucs/sys/iovec.inl b/src/ucs/sys/iovec.inl new file mode 100644 index 00000000000..dc01d9558d0 --- /dev/null +++ b/src/ucs/sys/iovec.inl @@ -0,0 +1,205 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCS_IOVEC_INL_ +#define UCS_IOVEC_INL_ + +#include +#include +#include + + +/** + * Fill the destination array of IOVs by data provided in the source + * array of IOVs. + * The function avoids copying IOVs with zero length. + * + * @param [out] _dst_iov Pointer to the resulted array of IOVs. + * @param [in/out] _dst_iov_cnt_p Pointer to the varibale that holds the number + * of the elements in the array of IOVs (input: + * initial, out: result). + * @param [in] _dst_iov_set_buffer_f Function that sets the buffer to the IOV element + * from the destination array. + * @param [in] _dst_iov_set_length_f Function that sets the length to the IOV element + * from the destination array. + * @param [in] _src_iov Pointer to the source array of IOVs. + * @param [in] _src_iov_cnt Number of the elements in the source array of IOVs. + * @param [in] _src_iov_get_buffer_f Function that gets the buffer of the IOV element + * from the destination array. + * @param [in] _src_iov_get_length_f Function that gets the length of the IOV element + * from the destination array. 
+ * @param [in] _max_length Maximal total length of the data that can be + * placed in the resulted array of IOVs. + * @param [in] _dst_iov_iter_p Pointer to the IOV iterator for the destination + * array of IOVs. + * + * @return The total length of the resulted array of IOVs. + */ +#define ucs_iov_converter(_dst_iov, _dst_iov_cnt_p, \ + _dst_iov_set_buffer_f, _dst_iov_set_length_f, \ + _src_iov, _src_iov_cnt, \ + _src_iov_get_buffer_f, _src_iov_get_length_f, \ + _max_length, _dst_iov_iter_p) \ + ({ \ + size_t __remain_length = _max_length; \ + size_t __dst_iov_index = 0; \ + size_t __src_iov_index = (_dst_iov_iter_p)->iov_index; \ + size_t __dst_iov_length, __src_iov_length; \ + void *__dst_iov_buffer; \ + \ + while ((__src_iov_index < (_src_iov_cnt)) && (__remain_length != 0) && \ + (__dst_iov_index < *(_dst_iov_cnt_p))) { \ + ucs_assert(_src_iov_get_length_f(&(_src_iov)[__src_iov_index]) >= \ + (_dst_iov_iter_p)->buffer_offset); \ + __src_iov_length = _src_iov_get_length_f(&(_src_iov)[__src_iov_index]) - \ + (_dst_iov_iter_p)->buffer_offset; \ + if (__src_iov_length == 0) { \ + /* Avoid zero length elements in resulted IOV */ \ + ++__src_iov_index; \ + continue; \ + } \ + \ + __dst_iov_length = ucs_min(__src_iov_length, __remain_length); \ + \ + _dst_iov_set_length_f(&(_dst_iov)[__dst_iov_index], __dst_iov_length); \ + __dst_iov_buffer = UCS_PTR_BYTE_OFFSET(_src_iov_get_buffer_f( \ + &(_src_iov)[__src_iov_index]), \ + (_dst_iov_iter_p)->buffer_offset); \ + _dst_iov_set_buffer_f(&(_dst_iov)[__dst_iov_index], __dst_iov_buffer); \ + \ + if (__src_iov_length > __remain_length) { \ + (_dst_iov_iter_p)->buffer_offset += __remain_length; \ + } else { \ + ucs_assert(((_dst_iov_iter_p)->buffer_offset == 0) || \ + (__src_iov_index == (_dst_iov_iter_p)->iov_index)); \ + (_dst_iov_iter_p)->buffer_offset = 0; \ + ++__src_iov_index; \ + } \ + \ + ucs_assert(__remain_length >= __dst_iov_length); \ + __remain_length -= __dst_iov_length; \ + ++__dst_iov_index; \ + \ + } \ + \ + 
ucs_assert(__dst_iov_index <= *(_dst_iov_cnt_p)); \ + (_dst_iov_iter_p)->iov_index = __src_iov_index; \ + *(_dst_iov_cnt_p) = __dst_iov_index; \ + ((_max_length) - __remain_length); \ + }) + +/** + * Calculates the total length of the IOV array buffers. + * + * @param [in] iov Pointer to the array of IOVs. + * @param [in] iov_cnt Number of the elements in the array of IOVs. + * + * @return The total length of the array of IOVs. + */ +#define ucs_iov_total_length(_iov, _iov_cnt, _iov_get_length_f) \ + ({ \ + size_t __total_length = 0; \ + size_t __iov_it; \ + \ + for (__iov_it = 0; __iov_it < (_iov_cnt); ++__iov_it) { \ + __total_length += _iov_get_length_f(&(_iov)[__iov_it]); \ + } \ + \ + __total_length; \ + }) + +/** + * Calculates the flat offset in the IOV array, which is the total data size + * before the position of the iterator. + * + * @param [in] iov Pointer to the array of IOVs. + * @param [in] iov_cnt Number of the elements in the array of IOVs. + * @param [in] iov_iter Pointer to the IOV iterator. + * + * @return The flat offset in the IOV array. + */ +#define ucs_iov_iter_flat_offset(_iov, _iov_cnt, _iov_iter, _iov_get_length_f) \ + ({ \ + size_t __offset = 0; \ + size_t __iov_it; \ + \ + for (__iov_it = 0; __iov_it < (_iov_iter)->iov_index; ++__iov_it) { \ + __offset += _iov_get_length_f(&(_iov)[__iov_it]); \ + } \ + \ + if ((_iov_iter)->iov_index < (_iov_cnt)) { \ + __offset += (_iov_iter)->buffer_offset; \ + } \ + \ + __offset; \ + }) + + +/** + * Initializes the IOV iterator by the initial values. + * + * @param [in] iov_iter Pointer to the IOV iterator. + */ +static UCS_F_ALWAYS_INLINE +void ucs_iov_iter_init(ucs_iov_iter_t *iov_iter) +{ + iov_iter->iov_index = 0; + iov_iter->buffer_offset = 0; +} + +/** + * Sets the length of the particular IOVEC data buffer. + * + * @param [in] iov Pointer to the IOVEC element. + * @param [in] length Length that needs to be set. 
+ */ +static UCS_F_ALWAYS_INLINE +void ucs_iovec_set_length(struct iovec *iov, size_t length) +{ + iov->iov_len = length; +} + +/** + * Sets the data buffer of the particular IOVEC element. + * + * @param [in] iov Pointer to the IOVEC element. + * @param [in] buffer Buffer that needs to be set. + */ +static UCS_F_ALWAYS_INLINE +void ucs_iovec_set_buffer(struct iovec *iov, void *buffer) +{ + iov->iov_base = buffer; +} + +/** + * Returns the length of the particular IOVEC data buffer. + * + * @param [in] iov Pointer to the IOVEC element. + * + * @return The length of the IOVEC data buffer. + */ +static UCS_F_ALWAYS_INLINE +size_t ucs_iovec_get_length(const struct iovec *iov) +{ + return iov->iov_len; +} + +/** + * Calculates the total length of the IOVEC array buffers. + * + * @param [in] iov Pointer to the array of IOVEC elements. + * @param [in] iov_cnt Number of elements in the IOVEC array. + * + * @return The amount, in bytes, of the data that is stored in the IOVEC + * array buffers. + */ +static UCS_F_ALWAYS_INLINE +size_t ucs_iovec_total_length(const struct iovec *iov, size_t iov_cnt) +{ + return ucs_iov_total_length(iov, iov_cnt, ucs_iovec_get_length); +} + +#endif diff --git a/src/ucs/sys/math.c b/src/ucs/sys/math.c index deb9c5a752e..2efaf3f24bc 100644 --- a/src/ucs/sys/math.c +++ b/src/ucs/sys/math.c @@ -4,53 +4,11 @@ * See file LICENSE for terms. 
*/ -#include "math.h" +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif -static uint32_t crc32_tab[] = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, - 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, - 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, - 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, - 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, - 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, - 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, - 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, - 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, - 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, - 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, - 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, - 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, - 0x196c3671, 0x6e6b06e7, 0xfed41b76, 
0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, - 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, - 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, - 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, - 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, - 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, - 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, - 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, - 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d -}; +#include "math.h" static uint64_t ucs_large_primes[] = { 14476643271716824181ull, 12086978239110065677ull, @@ -64,18 +22,6 @@ static uint64_t ucs_large_primes[] = { 9929050207ull, 9929050217ull, 9929050249ull, 9929050253ull }; -uint32_t ucs_calc_crc32(uint32_t crc, const void *buf, size_t size) -{ - const uint8_t *p = buf; - size_t i; - - crc = crc ^ ~0U; - for (i = 0; i < size; ++i) { - crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); - } - return crc ^ ~0U; -} - uint64_t ucs_get_prime(unsigned index) { static const unsigned num_primes = sizeof(ucs_large_primes) / sizeof(ucs_large_primes[0]); diff --git a/src/ucs/sys/math.h b/src/ucs/sys/math.h index f854f52efa7..c006f186ae4 100644 --- a/src/ucs/sys/math.h +++ 
b/src/ucs/sys/math.h @@ -14,6 +14,7 @@ #include #include #include +#include BEGIN_C_DECLS @@ -26,16 +27,16 @@ BEGIN_C_DECLS #define ucs_min(_a, _b) \ ({ \ - typeof(_a) a = (_a); \ - typeof(_b) b = (_b); \ - a < b ? a : b; \ + typeof(_a) _min_a = (_a); \ + typeof(_b) _min_b = (_b); \ + (_min_a < _min_b) ? _min_a : _min_b; \ }) #define ucs_max(_a, _b) \ ({ \ - typeof(_a) a = (_a); \ - typeof(_b) b = (_b); \ - a > b ? a : b; \ + typeof(_a) _max_a = (_a); \ + typeof(_b) _max_b = (_b); \ + (_max_a > _max_b) ? _max_a : _max_b; \ }) #define ucs_is_pow2_or_zero(_n) \ @@ -73,6 +74,14 @@ BEGIN_C_DECLS pow2; \ }) +#define ucs_rounddown_pow2(_n) (ucs_roundup_pow2(_n + 1) / 2) + +#define ucs_signum(_n) \ + (((_n) > (typeof(_n))0) - ((_n) < (typeof(_n))0)) + +#define ucs_roundup_pow2_or0(_n) \ + ( ((_n) == 0) ? 0 : ucs_roundup_pow2(_n) ) + /* Return values: 0 - aligned, non-0 - unaligned */ #define ucs_check_if_align_pow2(_n, _p) ((_n) & ((_p) - 1)) @@ -126,7 +135,7 @@ static inline double ucs_log2(double x) * @param __a First number * @param __op Operator (e.g >=) * @param __b Second number - * @param _signed_type Signed type of __a/__b (e.g int_32_t) + * @param _signed_type Signed type of __a/__b (e.g int32_t) * * @return value of the expression "__a __op __b". */ @@ -138,20 +147,11 @@ static inline double ucs_log2(double x) #define UCS_CIRCULAR_COMPARE32(__a, __op, __b) UCS_CIRCULAR_COMPARE(__a, __op, __b, int32_t) #define UCS_CIRCULAR_COMPARE64(__a, __op, __b) UCS_CIRCULAR_COMPARE(__a, __op, __b, int64_t) -/* on some arch ffs64(0) returns 0, on other -1, let's unify this */ -#define ucs_ffs64_safe(_val) ((_val) ? ucs_ffs64(_val) : 64) - #define ucs_for_each_bit(_index, _map) \ for ((_index) = ucs_ffs64_safe(_map); (_index) < 64; \ (_index) = ucs_ffs64_safe((uint64_t)(_map) & (-2ull << (uint64_t)(_index)))) -/** - * Calculate CRC32 of a buffer. 
- */ -uint32_t ucs_calc_crc32(uint32_t crc, const void *buf, size_t size); - - /* * Generate a large prime number */ diff --git a/src/ucs/sys/module.c b/src/ucs/sys/module.c index 87452e901da..ee6b0dd5946 100644 --- a/src/ucs/sys/module.c +++ b/src/ucs/sys/module.c @@ -4,7 +4,13 @@ * See file LICENSE for terms. */ -#define _GNU_SOURCE /* for dladdr(3) */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#ifndef _GNU_SOURCE +# define _GNU_SOURCE /* for dladdr(3) */ +#endif #include "module.h" @@ -12,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -36,7 +43,7 @@ static struct { unsigned srchpath_cnt; char *srch_path[UCS_MODULE_SRCH_PATH_MAX]; } ucs_module_loader_state = { - .init = UCS_INIT_ONCE_INIITIALIZER, + .init = UCS_INIT_ONCE_INITIALIZER, .module_ext = ".so", /* default extension */ .srchpath_cnt = 0, .srch_path = { NULL, NULL} @@ -116,12 +123,6 @@ static void ucs_module_loader_init_paths() } } -static const char *ucs_module_short_path(const char *path) -{ - const char *p = strrchr(path, '/'); - return (p == NULL) ? 
path : p + 1; -} - /* Perform shallow search for a symbol */ static void *ucs_module_dlsym_shallow(const char *module_path, void *dl, const char *symbol) @@ -133,7 +134,6 @@ static void *ucs_module_dlsym_shallow(const char *module_path, void *dl, addr = dlsym(dl, symbol); if (addr == NULL) { - ucs_module_trace("could not find symbol '%s' in %s", symbol, module_path); return NULL; } @@ -156,8 +156,8 @@ static void *ucs_module_dlsym_shallow(const char *module_path, void *dl, */ if (lm_entry->l_addr != (uintptr_t)dl_info.dli_fbase) { ucs_module_debug("ignoring '%s' (%p) from %s (%p), expected in %s (%lx)", - symbol, addr, ucs_module_short_path(dl_info.dli_fname), - dl_info.dli_fbase, ucs_module_short_path(module_path), + symbol, addr, ucs_basename(dl_info.dli_fname), + dl_info.dli_fbase, ucs_basename(module_path), lm_entry->l_addr); return NULL; } @@ -167,17 +167,22 @@ static void *ucs_module_dlsym_shallow(const char *module_path, void *dl, static void ucs_module_init(const char *module_path, void *dl) { + typedef ucs_status_t (*init_func_t)(); + const char *module_init_name = UCS_PP_MAKE_STRING(UCS_MODULE_CONSTRUCTOR_NAME); char *fullpath, buffer[PATH_MAX]; - ucs_status_t (*init_func)(); + init_func_t init_func; ucs_status_t status; fullpath = realpath(module_path, buffer); ucs_module_trace("loaded %s [%p]", fullpath, dl); - init_func = ucs_module_dlsym_shallow(module_path, dl, module_init_name); + init_func = (init_func_t)ucs_module_dlsym_shallow(module_path, dl, + module_init_name); if (init_func == NULL) { + ucs_module_trace("not calling constructor '%s' in %s", module_init_name, + module_path); return; } diff --git a/src/ucs/sys/module.h b/src/ucs/sys/module.h index 0f4615b9368..a9e3af946b3 100644 --- a/src/ucs/sys/module.h +++ b/src/ucs/sys/module.h @@ -30,7 +30,7 @@ typedef enum { */ #define UCS_MODULE_FRAMEWORK_DECLARE(_name) \ static ucs_init_once_t ucs_framework_init_once_##_name = \ - UCS_INIT_ONCE_INIITIALIZER + UCS_INIT_ONCE_INITIALIZER /** diff --git 
a/src/ucs/sys/sock.c b/src/ucs/sys/sock.c index 00ed65ba55b..15f45abaafa 100644 --- a/src/ucs/sys/sock.c +++ b/src/ucs/sys/sock.c @@ -1,24 +1,66 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include +#include #include #include #include +#include +#include +#include +#include +#include +#include #include #include #include +#define UCS_NETIF_BOND_AD_NUM_PORTS_FMT "/sys/class/net/%s/bonding/ad_num_ports" +#define UCS_SOCKET_MAX_CONN_PATH "/proc/sys/net/core/somaxconn" + + +typedef ssize_t (*ucs_socket_io_func_t)(int fd, void *data, + size_t size, int flags); + +typedef ssize_t (*ucs_socket_iov_func_t)(int fd, const struct msghdr *msg, + int flags); + + +void ucs_close_fd(int *fd_p) +{ + if (*fd_p == -1) { + return; + } + + if (close(*fd_p) < 0) { + ucs_warn("failed to close fd %d: %m", *fd_p); + return; + } + + *fd_p = -1; +} + +int ucs_netif_flags_is_active(unsigned int flags) +{ + return (flags & IFF_UP) && (flags & IFF_RUNNING) && !(flags & IFF_LOOPBACK); +} + ucs_status_t ucs_netif_ioctl(const char *if_name, unsigned long request, struct ifreq *if_req) { ucs_status_t status; - int fd, ret; + int fd = -1, ret; ucs_strncpy_zero(if_req->ifr_name, if_name, sizeof(if_req->ifr_name)); @@ -37,7 +79,7 @@ ucs_status_t ucs_netif_ioctl(const char *if_name, unsigned long request, status = UCS_OK; out_close_fd: - close(fd); + ucs_close_fd(&fd); out: return status; } @@ -57,8 +99,24 @@ int ucs_netif_is_active(const char *if_name) return 0; } - return (ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING) && - !(ifr.ifr_flags & IFF_LOOPBACK); + return ucs_netif_flags_is_active(ifr.ifr_flags); +} + +unsigned ucs_netif_bond_ad_num_ports(const char *bond_name) +{ + ucs_status_t status; + long ad_num_ports; + + status = ucs_read_file_number(&ad_num_ports, 1, + UCS_NETIF_BOND_AD_NUM_PORTS_FMT, 
bond_name); + if ((status != UCS_OK) || (ad_num_ports <= 0) || + (ad_num_ports > UINT_MAX)) { + ucs_diag("failed to read from " UCS_NETIF_BOND_AD_NUM_PORTS_FMT ": %m, " + "assuming 802.3ad bonding is disabled", bond_name); + return 1; + } + + return (unsigned)ad_num_ports; } ucs_status_t ucs_socket_create(int domain, int type, int *fd_p) @@ -86,109 +144,501 @@ ucs_status_t ucs_socket_setopt(int fd, int level, int optname, return UCS_OK; } +ucs_status_t ucs_socket_getopt(int fd, int level, int optname, + void *optval, socklen_t optlen) +{ + socklen_t len = optlen; + int ret; + + ret = getsockopt(fd, level, optname, optval, &len); + if (ret < 0) { + ucs_error("failed to get %d option for %d level on fd %d: %m", + optname, level, fd); + return UCS_ERR_IO_ERROR; + } + + if (len != optlen) { + ucs_error("returned length of option (%d) is not the same as provided (%d)", + len, optlen); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +const char *ucs_socket_getname_str(int fd, char *str, size_t max_size) +{ + struct sockaddr_storage sock_addr = {0}; /* Suppress Clang false-positive */ + socklen_t addr_size; + int ret; + + addr_size = sizeof(sock_addr); + ret = getsockname(fd, (struct sockaddr*)&sock_addr, + &addr_size); + if (ret < 0) { + ucs_debug("getsockname(fd=%d) failed: %m", fd); + ucs_strncpy_safe(str, "-", max_size); + return str; + } + + return ucs_sockaddr_str((const struct sockaddr*)&sock_addr, + str, max_size); +} + +static ucs_status_t ucs_socket_check_errno(int io_errno) +{ + if ((io_errno == EAGAIN) || (io_errno == EWOULDBLOCK) || (io_errno == EINTR)) { + /* IO operation or connection establishment procedure was interrupted + * or would block and need to try again */ + return UCS_ERR_NO_PROGRESS; + } + + if (io_errno == ECONNRESET) { + /* Connection reset by peer */ + return UCS_ERR_CONNECTION_RESET; + } else if (io_errno == ECONNREFUSED) { + /* A remote host refused to allow the network connection */ + return UCS_ERR_REJECTED; + } else if (io_errno 
== ETIMEDOUT) { + /* Connection establishment procedure timed out */ + return UCS_ERR_TIMED_OUT; + } + + return UCS_ERR_IO_ERROR; +} + ucs_status_t ucs_socket_connect(int fd, const struct sockaddr *dest_addr) { - char str[UCS_SOCKADDR_STRING_LEN]; + char dest_str[UCS_SOCKADDR_STRING_LEN]; + char src_str[UCS_SOCKADDR_STRING_LEN]; ucs_status_t status; - size_t addr_size; + size_t dest_addr_size; + int UCS_V_UNUSED conn_errno; int ret; - status = ucs_sockaddr_sizeof(dest_addr, &addr_size); + status = ucs_sockaddr_sizeof(dest_addr, &dest_addr_size); if (status != UCS_OK) { return status; } do { - ret = connect(fd, dest_addr, addr_size); + ret = connect(fd, dest_addr, dest_addr_size); if (ret < 0) { + /* Save errno to separate variable to not override it + * when calling getsockname() below */ + conn_errno = errno; + if (errno == EINPROGRESS) { status = UCS_INPROGRESS; - goto out; + break; } if (errno == EISCONN) { status = UCS_ERR_ALREADY_EXISTS; - goto out; + break; } if (errno != EINTR) { ucs_error("connect(fd=%d, dest_addr=%s) failed: %m", fd, - ucs_sockaddr_str(dest_addr, str, UCS_SOCKADDR_STRING_LEN)); + ucs_sockaddr_str(dest_addr, dest_str, + UCS_SOCKADDR_STRING_LEN)); return UCS_ERR_UNREACHABLE; } + } else { + conn_errno = 0; } } while ((ret < 0) && (errno == EINTR)); -out: - ucs_debug("connect(fd=%d, dest_addr=%s): %m", fd, - ucs_sockaddr_str(dest_addr, str, UCS_SOCKADDR_STRING_LEN)); + ucs_debug("connect(fd=%d, src_addr=%s dest_addr=%s): %s", fd, + ucs_socket_getname_str(fd, src_str, UCS_SOCKADDR_STRING_LEN), + ucs_sockaddr_str(dest_addr, dest_str, UCS_SOCKADDR_STRING_LEN), + strerror(conn_errno)); + return status; } -ucs_status_t ucs_socket_connect_nb_get_status(int fd) +ucs_status_t ucs_socket_accept(int fd, struct sockaddr *addr, socklen_t *length_ptr, + int *accept_fd) { - socklen_t conn_status_sz; - int ret, conn_status; + ucs_status_t status; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; - conn_status_sz = sizeof(conn_status); + *accept_fd = accept(fd, 
addr, length_ptr); + if (*accept_fd < 0) { + status = ucs_socket_check_errno(errno); + if (status == UCS_ERR_NO_PROGRESS) { + return status; + } + + ucs_error("accept() failed (client addr %s): %m", + ucs_sockaddr_str(addr, ip_port_str, UCS_SOCKADDR_STRING_LEN)); + return status; + } - ret = getsockopt(fd, SOL_SOCKET, SO_ERROR, - &conn_status, &conn_status_sz); + return UCS_OK; +} + +ucs_status_t ucs_socket_getpeername(int fd, struct sockaddr_storage *peer_addr, + socklen_t *peer_addr_len) +{ + int ret; + + *peer_addr_len = sizeof(*peer_addr); + ret = getpeername(fd, (struct sockaddr*)peer_addr, + peer_addr_len); if (ret < 0) { - ucs_error("getsockopt(fd=%d) failed to get SOL_SOCKET(SO_ERROR): %m", fd); - return UCS_ERR_IO_ERROR; + if ((errno != ENOTCONN) && (errno != ECONNRESET)) { + ucs_error("getpeername(fd=%d) failed: %m", fd); + return UCS_ERR_IO_ERROR; + } + + return UCS_ERR_NOT_CONNECTED; } - if ((conn_status == EINPROGRESS) || (conn_status == EWOULDBLOCK)) { - return UCS_INPROGRESS; + return UCS_OK; +} + +int ucs_socket_is_connected(int fd) +{ + struct sockaddr_storage peer_addr = {0}; /* Suppress Clang false-positive */ + char peer_str[UCS_SOCKADDR_STRING_LEN]; + char local_str[UCS_SOCKADDR_STRING_LEN]; + socklen_t peer_addr_len; + ucs_status_t status; + + status = ucs_socket_getpeername(fd, &peer_addr, &peer_addr_len); + if (status != UCS_OK) { + return 0; + } + + ucs_debug("[%s]<->[%s] is a connected pair", + ucs_socket_getname_str(fd, local_str, UCS_SOCKADDR_STRING_LEN), + ucs_sockaddr_str((const struct sockaddr*)&peer_addr, peer_str, + UCS_SOCKADDR_STRING_LEN)); + + return 1; +} + +ucs_status_t ucs_socket_set_buffer_size(int fd, size_t sockopt_sndbuf, + size_t sockopt_rcvbuf) +{ + ucs_status_t status; + + if (sockopt_sndbuf != UCS_MEMUNITS_AUTO) { + status = ucs_socket_setopt(fd, SOL_SOCKET, SO_SNDBUF, + (const void*)&sockopt_sndbuf, sizeof(int)); + if (status != UCS_OK) { + return status; + } } - if (conn_status != 0) { - 
ucs_error("SOL_SOCKET(SO_ERROR) status on fd %d: %s", fd, strerror(conn_status)); - return UCS_ERR_UNREACHABLE; + if (sockopt_rcvbuf != UCS_MEMUNITS_AUTO) { + status = ucs_socket_setopt(fd, SOL_SOCKET, SO_RCVBUF, + (const void*)&sockopt_rcvbuf, sizeof(int)); + if (status != UCS_OK) { + return status; + } } return UCS_OK; } -ucs_status_t ucs_sockaddr_sizeof(const struct sockaddr *addr, size_t *size_p) +ucs_status_t ucs_socket_server_init(const struct sockaddr *saddr, socklen_t socklen, + int backlog, int *listen_fd) { - ucs_status_t status = UCS_OK; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + ucs_status_t status; + int ret, fd = -1; + uint16_t port; + /* Create the server socket for accepting incoming connections */ + status = ucs_socket_create(saddr->sa_family, SOCK_STREAM, &fd); + if (status != UCS_OK) { + goto err; + } + + /* Set the fd to non-blocking mode (so that accept() won't be blocking) */ + status = ucs_sys_fcntl_modfl(fd, O_NONBLOCK, 0); + if (status != UCS_OK) { + goto err_close_socket; + } + + status = ucs_sockaddr_get_port(saddr, &port); + if (status != UCS_OK) { + goto err_close_socket; + } + + do { + ret = bind(fd, saddr, socklen); + } while (!port && (ret < 0) && (errno == EADDRINUSE)); + + if (ret < 0) { + ucs_error("bind(fd=%d addr=%s) failed: %m", + fd, ucs_sockaddr_str((struct sockaddr *)saddr, + ip_port_str, sizeof(ip_port_str))); + status = (errno == EADDRINUSE) ? 
UCS_ERR_BUSY : UCS_ERR_IO_ERROR; + goto err_close_socket; + } + + if (listen(fd, backlog) < 0) { + ucs_error("listen(fd=%d addr=%s backlog=%d) failed: %m", + fd, ucs_sockaddr_str(saddr, ip_port_str, sizeof(ip_port_str)), + backlog); + status = UCS_ERR_IO_ERROR; + goto err_close_socket; + } + + *listen_fd = fd; + return UCS_OK; + +err_close_socket: + ucs_close_fd(&fd); +err: + return status; +} + +int ucs_socket_max_conn() +{ + static long somaxconn_val = 0; + + if (somaxconn_val || + (ucs_read_file_number(&somaxconn_val, 1, + UCS_SOCKET_MAX_CONN_PATH) == UCS_OK)) { + ucs_assert(somaxconn_val <= INT_MAX); + return somaxconn_val; + } else { + ucs_warn("unable to read somaxconn value from %s file", + UCS_SOCKET_MAX_CONN_PATH); + somaxconn_val = SOMAXCONN; + return somaxconn_val; + } +} + +static ucs_status_t +ucs_socket_handle_io_error(int fd, const char *name, ssize_t io_retval, int io_errno, + ucs_socket_io_err_cb_t err_cb, void *err_cb_arg) +{ + ucs_status_t status; + + if (io_retval == 0) { + /* 0 can be returned only by recv() system call as an error if + * the connection was dropped by peer */ + ucs_assert(!strcmp(name, "recv")); + ucs_trace("fd %d is closed", fd); + return UCS_ERR_NOT_CONNECTED; /* Connection closed by peer */ + } + + status = ucs_socket_check_errno(io_errno); + if (status == UCS_ERR_NO_PROGRESS) { + return UCS_ERR_NO_PROGRESS; + } + + if (err_cb != NULL) { + status = err_cb(err_cb_arg, status); + if (status == UCS_OK) { + /* UCS_ERR_CANCELED has to be returned if no other actions + * are required in order to prevent an endless loop in + * blocking IO operations (they continue a loop if UCS_OK + * or UCS_ERR_NO_PROGRESS is returned) */ + return UCS_ERR_CANCELED; + } else if (status == UCS_ERR_NO_PROGRESS) { + /* No error will be printed, a caller should continue + * calling function later in order to send/recv data */ + return UCS_ERR_NO_PROGRESS; + } + } + + ucs_error("%s(fd=%d) failed: %s", name, fd, strerror(io_errno)); + + return status; +} 
+ +/** + * Handle the IO operation. + * + * @param [in] fd The socket fd. + * @param [in] data The pointer to user's data or pointer to the array of + * iov elements. + * @param [in] count The length of user's data or the number of elements in + * the array of iov. + * @param [out] length_p Pointer to the result length of user's data that was + * sent/received. + * @param [in] is_iov Flag that specifies type of the operation (1 if vector + * operation). + * @param [in] io_retval The result of the IO operation. + * @param [in] io_errno IO operation errno. + * @param [in] err_cb Error callback. + * @param [in] err_cb_arg User's argument for the error callback. + * + * @return if the IO operation was successful - UCS_OK, otherwise - error status. + */ +static inline ucs_status_t +ucs_socket_handle_io(int fd, const void *data, size_t count, + size_t *length_p, int is_iov, int io_retval, + int io_errno, const char *name, + ucs_socket_io_err_cb_t err_cb, void *err_cb_arg) +{ + /* The IO operation is considered as successful if: */ + if (ucs_likely(io_retval > 0)) { + /* - the return value > 0 */ + *length_p = io_retval; + return UCS_OK; + } + + if ((io_retval == 0) && + ((count == 0) || + (is_iov && (ucs_iovec_total_length((const struct iovec*)data, + count) == 0)))) { + /* - the return value == 0 and the user's data length == 0 + * (the number of the iov array buffers == 0 or the total + * length of the iov array buffers == 0) */ + *length_p = 0; + return UCS_OK; + } + + *length_p = 0; + return ucs_socket_handle_io_error(fd, name, io_retval, io_errno, + err_cb, err_cb_arg); +} + +static inline ucs_status_t +ucs_socket_do_io_nb(int fd, void *data, size_t *length_p, + ucs_socket_io_func_t io_func, const char *name, + ucs_socket_io_err_cb_t err_cb, void *err_cb_arg) +{ + ssize_t ret = io_func(fd, data, *length_p, MSG_NOSIGNAL); + return ucs_socket_handle_io(fd, data, *length_p, length_p, 0, + ret, errno, name, err_cb, err_cb_arg); +} + +static inline ucs_status_t 
+ucs_socket_do_io_b(int fd, void *data, size_t length, + ucs_socket_io_func_t io_func, const char *name, + ucs_socket_io_err_cb_t err_cb, void *err_cb_arg) +{ + size_t done_cnt = 0, cur_cnt = length; + ucs_status_t status; + + do { + status = ucs_socket_do_io_nb(fd, data, &cur_cnt, io_func, + name, err_cb, err_cb_arg); + done_cnt += cur_cnt; + ucs_assert(done_cnt <= length); + cur_cnt = length - done_cnt; + } while ((done_cnt < length) && + ((status == UCS_OK) || (status == UCS_ERR_NO_PROGRESS))); + + return status; +} + +static inline ucs_status_t +ucs_socket_do_iov_nb(int fd, struct iovec *iov, size_t iov_cnt, size_t *length_p, + ucs_socket_iov_func_t iov_func, const char *name, + ucs_socket_io_err_cb_t err_cb, void *err_cb_arg) +{ + struct msghdr msg = { + .msg_iov = iov, + .msg_iovlen = iov_cnt + }; + ssize_t ret; + + ret = iov_func(fd, &msg, MSG_NOSIGNAL); + return ucs_socket_handle_io(fd, iov, iov_cnt, length_p, 1, + ret, errno, name, err_cb, err_cb_arg); +} + +ucs_status_t ucs_socket_send_nb(int fd, const void *data, size_t *length_p, + ucs_socket_io_err_cb_t err_cb, + void *err_cb_arg) +{ + return ucs_socket_do_io_nb(fd, (void*)data, length_p, + (ucs_socket_io_func_t)send, + "send", err_cb, err_cb_arg); +} + +/* recv is declared as 'always_inline' on some platforms, it leads to + * compilation warning. 
wrap it into static function */ +static ssize_t ucs_socket_recv_io(int fd, void *data, size_t size, int flags) +{ + return recv(fd, data, size, flags); +} + +ucs_status_t ucs_socket_recv_nb(int fd, void *data, size_t *length_p, + ucs_socket_io_err_cb_t err_cb, + void *err_cb_arg) +{ + return ucs_socket_do_io_nb(fd, data, length_p, ucs_socket_recv_io, + "recv", err_cb, err_cb_arg); +} + +ucs_status_t ucs_socket_send(int fd, const void *data, size_t length, + ucs_socket_io_err_cb_t err_cb, + void *err_cb_arg) +{ + return ucs_socket_do_io_b(fd, (void*)data, length, + (ucs_socket_io_func_t)send, + "send", err_cb, err_cb_arg); +} + +ucs_status_t ucs_socket_recv(int fd, void *data, size_t length, + ucs_socket_io_err_cb_t err_cb, + void *err_cb_arg) +{ + return ucs_socket_do_io_b(fd, data, length, ucs_socket_recv_io, + "recv", err_cb, err_cb_arg); +} + +ucs_status_t +ucs_socket_sendv_nb(int fd, struct iovec *iov, size_t iov_cnt, size_t *length_p, + ucs_socket_io_err_cb_t err_cb, void *err_cb_arg) +{ + return ucs_socket_do_iov_nb(fd, iov, iov_cnt, length_p, sendmsg, + "sendv", err_cb, err_cb_arg); +} + +ucs_status_t ucs_sockaddr_sizeof(const struct sockaddr *addr, size_t *size_p) +{ switch (addr->sa_family) { case AF_INET: *size_p = sizeof(struct sockaddr_in); - break; + return UCS_OK; case AF_INET6: *size_p = sizeof(struct sockaddr_in6); - break; + return UCS_OK; default: ucs_error("unknown address family: %d", addr->sa_family); - status = UCS_ERR_INVALID_PARAM; - break; + return UCS_ERR_INVALID_PARAM; } - - return status; } -ucs_status_t ucs_sockaddr_get_port(const struct sockaddr *addr, unsigned *port_p) +ucs_status_t ucs_sockaddr_get_port(const struct sockaddr *addr, uint16_t *port_p) { - ucs_status_t status = UCS_OK; - switch (addr->sa_family) { case AF_INET: *port_p = ntohs(UCS_SOCKET_INET_PORT(addr)); - break; + return UCS_OK; case AF_INET6: *port_p = ntohs(UCS_SOCKET_INET6_PORT(addr)); - break; + return UCS_OK; default: ucs_error("unknown address family: %d", 
addr->sa_family); - status = UCS_ERR_INVALID_PARAM; - break; + return UCS_ERR_INVALID_PARAM; } +} - return status; +ucs_status_t ucs_sockaddr_set_port(struct sockaddr *addr, uint16_t port) +{ + switch (addr->sa_family) { + case AF_INET: + UCS_SOCKET_INET_PORT(addr) = htons(port); + return UCS_OK; + case AF_INET6: + UCS_SOCKET_INET6_PORT(addr) = htons(port); + return UCS_OK; + default: + ucs_error("unknown address family: %d", addr->sa_family); + return UCS_ERR_INVALID_PARAM; + } } const void *ucs_sockaddr_get_inet_addr(const struct sockaddr *addr) @@ -204,13 +654,19 @@ const void *ucs_sockaddr_get_inet_addr(const struct sockaddr *addr) } } +static unsigned ucs_sockaddr_is_known_af(const struct sockaddr *sa) +{ + return ((sa->sa_family == AF_INET) || + (sa->sa_family == AF_INET6)); +} + const char* ucs_sockaddr_str(const struct sockaddr *sock_addr, char *str, size_t max_size) { - unsigned port; + uint16_t port; size_t str_len; - if ((sock_addr->sa_family != AF_INET) && (sock_addr->sa_family != AF_INET6)) { + if (!ucs_sockaddr_is_known_af(sock_addr)) { ucs_strncpy_zero(str, "", max_size); return str; } @@ -232,3 +688,163 @@ const char* ucs_sockaddr_str(const struct sockaddr *sock_addr, return str; } + +int ucs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2, + ucs_status_t *status_p) +{ + int result = 1; + uint16_t port1 = 0, port2 = 0; + ucs_status_t status = UCS_OK; + + if (!ucs_sockaddr_is_known_af(sa1) || + !ucs_sockaddr_is_known_af(sa2)) { + ucs_error("unknown address family: %d", + !ucs_sockaddr_is_known_af(sa1) ? 
+ sa1->sa_family : sa2->sa_family); + status = UCS_ERR_INVALID_PARAM; + goto out; + } + + if (sa1->sa_family != sa2->sa_family) { + result = (int)sa1->sa_family - (int)sa2->sa_family; + goto out; + } + + switch (sa1->sa_family) { + case AF_INET: + result = memcmp(&UCS_SOCKET_INET_ADDR(sa1), + &UCS_SOCKET_INET_ADDR(sa2), + sizeof(UCS_SOCKET_INET_ADDR(sa1))); + port1 = ntohs(UCS_SOCKET_INET_PORT(sa1)); + port2 = ntohs(UCS_SOCKET_INET_PORT(sa2)); + break; + case AF_INET6: + result = memcmp(&UCS_SOCKET_INET6_ADDR(sa1), + &UCS_SOCKET_INET6_ADDR(sa2), + sizeof(UCS_SOCKET_INET6_ADDR(sa1))); + port1 = ntohs(UCS_SOCKET_INET6_PORT(sa1)); + port2 = ntohs(UCS_SOCKET_INET6_PORT(sa2)); + break; + } + + if (!result && (port1 != port2)) { + result = (int)port1 - (int)port2; + } + +out: + if (status_p) { + *status_p = status; + } + return result; +} + +int ucs_sockaddr_ip_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2) +{ + if (!ucs_sockaddr_is_known_af(sa1) || !ucs_sockaddr_is_known_af(sa2)) { + ucs_error("unknown address family: %d", + !ucs_sockaddr_is_known_af(sa1) ? sa1->sa_family : sa2->sa_family); + return -1; + } + + return memcmp(ucs_sockaddr_get_inet_addr(sa1), + ucs_sockaddr_get_inet_addr(sa2), + (sa1->sa_family == AF_INET) ? 
+ UCS_IPV4_ADDR_LEN : UCS_IPV6_ADDR_LEN); +} + +int ucs_sockaddr_is_inaddr_any(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return UCS_SOCKET_INET_ADDR(addr).s_addr == INADDR_ANY; + case AF_INET6: + return !memcmp(&(UCS_SOCKET_INET6_ADDR(addr)), &in6addr_any, + sizeof(UCS_SOCKET_INET6_ADDR(addr))); + default: + ucs_debug("invalid address family: %d", addr->sa_family); + return 0; + } +} + +ucs_status_t ucs_sockaddr_copy(struct sockaddr *dst_addr, + const struct sockaddr *src_addr) +{ + ucs_status_t status; + size_t size; + + status = ucs_sockaddr_sizeof(src_addr, &size); + if (status != UCS_OK) { + return status; + } + + memcpy(dst_addr, src_addr, size); + return UCS_OK; +} + +ucs_status_t ucs_sockaddr_get_ifname(int fd, char *ifname_str, size_t max_strlen) +{ + ucs_status_t status = UCS_ERR_NO_DEVICE; + struct ifaddrs *ifa; + struct ifaddrs* ifaddrs; + struct sockaddr *sa; + struct sockaddr *my_addr; + socklen_t sockaddr_len; + char str_local_addr[UCS_SOCKADDR_STRING_LEN]; + + sockaddr_len = sizeof(struct sockaddr_storage); + my_addr = ucs_alloca(sockaddr_len); + + if (getsockname(fd, my_addr, &sockaddr_len)) { + ucs_warn("getsockname error: %m"); + return UCS_ERR_IO_ERROR; + } + + /* port number is not important, so we assign zero because sockaddr + * structures returned by getifaddrs have ports assigned to zero */ + if (UCS_OK != ucs_sockaddr_set_port(my_addr, 0)) { + ucs_warn("sockcm doesn't support unknown address family"); + return UCS_ERR_INVALID_PARAM; + } + + ucs_debug("check ifname for socket on %s", + ucs_sockaddr_str(my_addr, str_local_addr, UCS_SOCKADDR_STRING_LEN)); + + if (getifaddrs(&ifaddrs)) { + ucs_warn("getifaddrs error: %m"); + return UCS_ERR_IO_ERROR; + } + + for (ifa = ifaddrs; ifa != NULL; ifa = ifa->ifa_next) { + sa = (struct sockaddr*) ifa->ifa_addr; + + if (sa == NULL) { + ucs_debug("NULL ifaddr encountered with ifa_name: %s", ifa->ifa_name); + continue; + } + + if (((sa->sa_family == AF_INET) ||(sa->sa_family 
== AF_INET6)) && + (!ucs_sockaddr_cmp(sa, my_addr, NULL))) { + ucs_debug("matching ip found iface on %s", ifa->ifa_name); + ucs_strncpy_safe(ifname_str, ifa->ifa_name, max_strlen); + status = UCS_OK; + break; + } + } + + freeifaddrs(ifaddrs); + + return status; +} + +const char *ucs_sockaddr_address_family_str(sa_family_t af) +{ + switch (af) { + case AF_INET: + return "IPv4"; + case AF_INET6: + return "IPv6"; + default: + return "not IPv4 or IPv6"; + } +} diff --git a/src/ucs/sys/sock.h b/src/ucs/sys/sock.h index 779ade9a4de..d07a2f92402 100644 --- a/src/ucs/sys/sock.h +++ b/src/ucs/sys/sock.h @@ -1,5 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * See file LICENSE for terms. */ @@ -14,18 +15,60 @@ #include #include #include +#include BEGIN_C_DECLS +#define UCS_IPV4_ADDR_LEN sizeof(struct in_addr) +#define UCS_IPV6_ADDR_LEN sizeof(struct in6_addr) + /* A string to hold the IP address and port from a sockaddr */ -#define UCS_SOCKADDR_STRING_LEN 60 +#define UCS_SOCKADDR_STRING_LEN 60 + +#define UCS_SOCKET_INET_ADDR(_addr) (((struct sockaddr_in*)(_addr))->sin_addr) +#define UCS_SOCKET_INET_PORT(_addr) (((struct sockaddr_in*)(_addr))->sin_port) + +#define UCS_SOCKET_INET6_ADDR(_addr) (((struct sockaddr_in6*)(_addr))->sin6_addr) +#define UCS_SOCKET_INET6_PORT(_addr) (((struct sockaddr_in6*)(_addr))->sin6_port) + + +/** + * Error callback to handle errno and status of a given socket IO operation. + * + * @param [in] arg User's argument for the error callback. + * @param [in] io_status Status set for a given IO operation. 
+ * + * @return UCS_OK if error handling was done in the callback and no other + * actions are required from a caller (UCS_ERR_CANCELED will be + * returned as the result of the IO operation), UCS_ERR_NO_PROGRESS + * if error handling was done in the callback and the IO operation + * should be continued (UCS_ERR_NO_PROGRESS will be retuned as the + * result of the IO operation), otherwise - the default error handling + * should be done and the returned status will be the result of + * the IO operation. + */ +typedef ucs_status_t (*ucs_socket_io_err_cb_t)(void *arg, + ucs_status_t io_status); + -#define UCS_SOCKET_INET_ADDR(_addr) (((struct sockaddr_in*)(_addr))->sin_addr) -#define UCS_SOCKET_INET_PORT(_addr) (((struct sockaddr_in*)(_addr))->sin_port) +/** + * Close the given file descriptor. + * + * @param [in] fd_p pointer to the file descriptor to close. + */ +void ucs_close_fd(int *fd_p); -#define UCS_SOCKET_INET6_ADDR(_addr) (((struct sockaddr_in6*)(_addr))->sin6_addr) -#define UCS_SOCKET_INET6_PORT(_addr) (((struct sockaddr_in6*)(_addr))->sin6_port) + +/** + * Check if the given (interface) flags represent an active interface. + * + * @param [in] flags Interface flags (Can be obtained using getifaddrs + * or from SIOCGIFFLAGS ioctl). + * + * @return 1 if true, otherwise 0 + */ +int ucs_netif_flags_is_active(unsigned int flags); /** @@ -52,6 +95,17 @@ ucs_status_t ucs_netif_ioctl(const char *if_name, unsigned long request, int ucs_netif_is_active(const char *if_name); +/** + * Get number of active 802.3ad ports for a bond device. If the device is not + * a bond device, or 802.3ad is not enabled, return 1. + * + * @param [in] if_name Name of network interface to check. + * + * @return Number of active 802.3ad ports on @a if_name. + */ +unsigned ucs_netif_bond_ad_num_ports(const char *if_name); + + /** * Create a socket. 
* @@ -82,7 +136,23 @@ ucs_status_t ucs_socket_setopt(int fd, int level, int optname, /** - * Connects the socket referred to by the file descriptor `fd` + * Get options of a socket. + * + * @param [in] fd Socket fd. + * @param [in] level The level at which the option is defined. + * @param [in] optname The socket option for which the value is fetched. + * @param [in] optval A pointer to the buffer in which the value for the + * requested option is stored. + * @param [in] optlen The size, in bytes, of optval. + * + * @return UCS_OK on success or UCS_ERR_IO_ERROR on failure + */ +ucs_status_t ucs_socket_getopt(int fd, int level, int optname, + void *optval, socklen_t optlen); + + +/** + * Connect the socket referred to by the file descriptor `fd` * to the address specified by `dest_addr`. * * @param [in] fd Socket fd. @@ -95,19 +165,214 @@ ucs_status_t ucs_socket_connect(int fd, const struct sockaddr *dest_addr); /** - * Reports information about non-blocking connection status for - * the socket referred to by the file descriptor `fd`. + * Accept a connection request on the given socket fd. + * + * @param [in] fd Socket fd. + * @param [out] addr Client socket address that initiated the connection + * @param [out] length_ptr Client address socket's length + * @param [out] accept_fd Upon success, a non-negative file descriptor + * of the accepted socket. Otherwise, -1. + * + * @return UCS_OK on success or UCS_ERR_NO_PROGRESS to indicate that no progress + * was made or UCS_ERR_IO_ERROR on failure. + */ +ucs_status_t ucs_socket_accept(int fd, struct sockaddr *addr, socklen_t *length_ptr, + int *accept_fd); + + +/** + * Get the address of the peer's socket that the given fd is connected to + * + * @param [in] fd Socket fd. + * @param [out] peer_addr Address of the remote peer. + * @param [out] peer_addr_len Length of the remote peer's address. 
+ * + * @return UCS_OK on success or UCS_ERR_IO_ERROR on failure + */ +ucs_status_t ucs_socket_getpeername(int fd, struct sockaddr_storage *peer_addr, + socklen_t *peer_addr_len); + + +/** + * Check whether the socket referred to by the file descriptor `fd` + * is connected to a peer or not. * * @param [in] fd Socket fd. * - * @return UCS_OK on success or UCS_ERR_UNREACHABLE on failure or - * UCS_INPROGRESS if operation is still in progress. + * @return 1 - connected, 0 - not connected. + */ +int ucs_socket_is_connected(int fd); + + +/** + * Set options on a socket for its send and receive buffers. + * Set the options only if the given buffers sizes are not set to UCS_MEMUNITS_AUTO. + * + * @param [in] fd Socket fd. + * @param [in] sockopt_sndbuf Send buffer in which the value for the + * option is specified. + * @param [in] sockopt_rcvbuf Receive buffer in which the value for the + * option is specified. + * + * @return UCS_OK on success or UCS_ERR_IO_ERROR on failure. */ -ucs_status_t ucs_socket_connect_nb_get_status(int fd); +ucs_status_t ucs_socket_set_buffer_size(int fd, size_t sockopt_sndbuf, + size_t sockopt_rcvbuf); /** - * Returns size of a given sockaddr structure. + * Initialize a TCP server. + * Open a socket, bind a sockadrr to that socket and start listening on it for + * incoming connection requests. + * + * @param [in] saddr Sockaddr for the server to listen on. + * If the port number inside is set to zero - + * use a random port. + * @param [in] socklen Size of saddr. + * @param [in] backlog Length of the queue for pending connections - + * for the listen() call. + * @param [out] listen_fd The fd that belongs to the server. + * + * @return UCS_OK on success or an error code on failure. + */ +ucs_status_t ucs_socket_server_init(const struct sockaddr *saddr, socklen_t socklen, + int backlog, int *listen_fd); + + +/** + * Returns the maximum possible value for the number of sockets that + * are ready to be accepted. 
It maybe either value from the system path + * or SOMAXCONN value. + * + * @return The queue length for completely established sockets + * waiting to be accepted. + */ +int ucs_socket_max_conn(); + + +/** + * Non-blocking send operation sends data on the connected (or bound + * connectionless) socket referred to by the file descriptor `fd`. + * + * @param [in] fd Socket fd. + * @param [in] data A pointer to a buffer containing the data to + * be transmitted. + * @param [in/out] length_p The length, in bytes, of the data in buffer + * pointed to by the `data` parameter. The amount of + * data transmitted is written to this argument. + * @param [in] err_cb Error callback. + * @param [in] err_cb_arg User's argument for the error callback. + * + * @return UCS_OK on success, UCS_ERR_CANCELED if some error happened, but it + * was handled in a user's err_cb and no other actions are required, + * UCS_ERR_NO_PROGRESS if system call was interrupted or would block, + * UCS_ERR_NOT_CONNECTED if the connection was destroyed, + * UCS_ERR_IO_ERROR on failure, or any other errors returned from a + * user's error callback. + */ +ucs_status_t ucs_socket_send_nb(int fd, const void *data, size_t *length_p, + ucs_socket_io_err_cb_t err_cb, + void *err_cb_arg); + + +/** + * Non-blocking receive operation receives data from the connected (or bound + * connectionless) socket referred to by the file descriptor `fd`. + * + * @param [in] fd Socket fd. + * @param [in] data A pointer to a buffer to receive the incoming + * data. + * @param [in/out] length_p The length, in bytes, of the data in buffer + * pointed to by the `data` parameter. The amount of + * data received is written to this argument. + * @param [in] err_cb Error callback. + * @param [in] err_cb_arg User's argument for the error callback. 
+ * + * @return UCS_OK on success, UCS_ERR_CANCELED if some error happened, but it + * was handled in user's err_cb and no other actions are required, + * UCS_ERR_NO_PROGRESS if system call was interrupted or would block, + * UCS_ERR_NOT_CONNECTED if the connection was destroyed, + * UCS_ERR_IO_ERROR on failure, or any other errors returned from a + * user's error callback. + */ +ucs_status_t ucs_socket_recv_nb(int fd, void *data, size_t *length_p, + ucs_socket_io_err_cb_t err_cb, + void *err_cb_arg); + + +/** + * Blocking send operation sends data on the connected (or bound connectionless) + * socket referred to by the file descriptor `fd`. + * + * @param [in] fd Socket fd. + * @param [in] data A pointer to a buffer containing the data to + * be transmitted. + * @param [in/out] length The length, in bytes, of the data in buffer + * pointed to by the `data` parameter. + * @param [in] err_cb Error callback. + * @param [in] err_cb_arg User's argument for the error callback. + * + * @return UCS_OK on success, UCS_ERR_CANCELED if some error happened, but it + * was handled in user's err_cb and no other actions are required, + * UCS_ERR_NOT_CONNECTED if the connection was destroyed, + * UCS_ERR_IO_ERROR on failure, or any other errors returned from a + * user's error callback. + */ +ucs_status_t ucs_socket_send(int fd, const void *data, size_t length, + ucs_socket_io_err_cb_t err_cb, + void *err_cb_arg); + + +/** + * Non-blocking send operation sends I/O vector on the connected (or bound + * connectionless) socket referred to by the file descriptor `fd`. + * + * @param [in] fd Socket fd. + * @param [in] iov A pointer to an array of iovec buffers. + * @param [in] iov_cnt The number of buffers pointed to by + * the iov parameter. + * @param [out] length_p The amount of data transmitted is written to + * this argument. + * @param [in] err_cb Error callback. + * @param [in] err_cb_arg User's argument for the error callback. 
+ * + * @return UCS_OK on success, UCS_ERR_CANCELED if some error happened, but it + * was handled in user's err_cb and no other actions are required, + * UCS_ERR_NO_PROGRESS if system call was interrupted or would block, + * UCS_ERR_NOT_CONNECTED if the connection was destroyed, + * UCS_ERR_IO_ERROR on failure, or any other errors returned from a + * user's error callback. + */ +ucs_status_t ucs_socket_sendv_nb(int fd, struct iovec *iov, size_t iov_cnt, + size_t *length_p, ucs_socket_io_err_cb_t err_cb, + void *err_cb_arg); + + +/** + * Blocking receive operation receives data from the connected (or bound + * connectionless) socket referred to by the file descriptor `fd`. + * + * @param [in] fd Socket fd. + * @param [in] data A pointer to a buffer to receive the incoming + * data. + * @param [in/out] length The length, in bytes, of the data in buffer + * pointed to by the `data` paramete. + * @param [in] err_cb Error callback. + * @param [in] err_cb_arg User's argument for the error callback. + * + * @return UCS_OK on success, UCS_ERR_CANCELED if some error happened, but it + * was handled in user's err_cb and no other actions are required, + * UCS_ERR_NOT_CONNECTED if the connection was destroyed, + * UCS_ERR_IO_ERROR on failure, or any other errors returned from a + * user's error callback. + */ +ucs_status_t ucs_socket_recv(int fd, void *data, size_t length, + ucs_socket_io_err_cb_t err_cb, + void *err_cb_arg); + + +/** + * Return size of a given sockaddr structure. * * @param [in] addr Pointer to sockaddr structure. * @param [out] size_p Pointer to variable where size of @@ -119,7 +384,7 @@ ucs_status_t ucs_sockaddr_sizeof(const struct sockaddr *addr, size_t *size_p); /** - * Returns port of a given sockaddr structure. + * Return port of a given sockaddr structure. * * @param [in] addr Pointer to sockaddr structure. 
* @param [out] port_p Pointer to variable where port (host notation) @@ -127,11 +392,22 @@ ucs_status_t ucs_sockaddr_sizeof(const struct sockaddr *addr, size_t *size_p); * * @return UCS_OK on success or UCS_ERR_INVALID_PARAM on failure. */ -ucs_status_t ucs_sockaddr_get_port(const struct sockaddr *addr, unsigned *port_p); +ucs_status_t ucs_sockaddr_get_port(const struct sockaddr *addr, uint16_t *port_p); + + +/** + * Set port to a given sockaddr structure. + * + * @param [in] addr Pointer to sockaddr structure. + * @param [in] port Port (host notation) that will be written + * + * @return UCS_OK on success or UCS_ERR_INVALID_PARAM on failure. + */ +ucs_status_t ucs_sockaddr_set_port(struct sockaddr *addr, uint16_t port); /** - * Returns IP addr of a given sockaddr structure. + * Return IP addr of a given sockaddr structure. * * @param [in] addr Pointer to sockaddr structure. * @@ -155,6 +431,98 @@ const char* ucs_sockaddr_str(const struct sockaddr *sock_addr, char *str, size_t max_size); +/** + * Extract the IP address from a given socket fd and return it as a string. + * + * @param [in] fd Socket fd. + * @param [out] str A string filled with the IP address. + * @param [in] max_size Size of a string (considering '\0'-terminated symbol) + * + * @return ip_str if the sock_addr has a valid IP address or 'Invalid address' + * otherwise. + */ +const char *ucs_socket_getname_str(int fd, char *str, size_t max_size); + + +/** + * Return a value indicating the relationships between passed sockaddr structures. + * + * @param [in] sa1 Pointer to sockaddr structure #1. + * @param [in] sa2 Pointer to sockaddr structure #2. + * @param [in/out] status_p Pointer (can be NULL) to a status: UCS_OK on success + * or UCS_ERR_INVALID_PARAM on failure. 
+ * + * @return Returns an integral value indicating the relationship between the + * socket addresses: + * > 0 - the first socket address is greater than the second + * socket address; + * < 0 - the first socket address is lower than the second + * socket address; + * = 0 - the socket addresses are equal. + * Note: it returns a positive integer value in case of error occured + * during comparison. + */ +int ucs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2, + ucs_status_t *status_p); + + +/** + * Check if the IP addresses of the given sockaddrs are the same. + * + * @param [in] sa1 Pointer to sockaddr structure #1. + * @param [in] sa2 Pointer to sockaddr structure #2. + * + * @return Return 0 if the IP addresses are the same and a non-zero value + * otherwise. + */ +int ucs_sockaddr_ip_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2); + + +/** + * Indicate if given IP addr is INADDR_ANY (IPV4) or in6addr_any (IPV6) + * + * @param [in] addr Pointer to sockaddr structure. + * + * @return 1 if input is INADDR_ANY or in6addr_any + * 0 if not + */ +int ucs_sockaddr_is_inaddr_any(struct sockaddr *addr); + + +/** + * Copy the src_addr sockaddr to dst_addr sockaddr. The length to copy is + * the size of the src_addr sockaddr. + * + * @param [in] dst_addr Pointer to destination sockaddr (to copy to). + * @param [in] src_addr Pointer to source sockaddr (to copy from). + * + * @return UCS_OK on success or UCS_ERR_INVALID_PARAM on failure. + */ +ucs_status_t ucs_sockaddr_copy(struct sockaddr *dst_addr, + const struct sockaddr *src_addr); + + +/** + * Copy into ifname_name the interface associated the IP on which the socket + * file descriptor fd is bound on. IPv4 and IPv6 addresses are handled. + * + * @param [in] fd Socket fd. + * @param [out] if_str A string filled with the interface name. + * @param [in] max_strlen Maximum length of the if_str. 
+ */ +ucs_status_t ucs_sockaddr_get_ifname(int fd, char *ifname_str, size_t max_strlen); + + +/** + * Convert the given address family to a string containing its value. + * + * @param [in] af Address family to convert. + * + * Only IPv4 and IPv6 conversions are supported. + */ +const char *ucs_sockaddr_address_family_str(sa_family_t af); + END_C_DECLS #endif diff --git a/src/ucs/sys/string.c b/src/ucs/sys/string.c index 9f15c623cf8..333b40d1413 100644 --- a/src/ucs/sys/string.c +++ b/src/ucs/sys/string.c @@ -11,6 +11,9 @@ #include "string.h" #include "math.h" #include "sys.h" +#include +#include +#include #include #include @@ -18,6 +21,9 @@ #include +const char *ucs_memunits_suffixes[] = {"", "K", "M", "G", "T", "P", "E", NULL}; + + void ucs_fill_filename_template(const char *tmpl, char *buf, size_t max) { char *p, *end; @@ -59,7 +65,7 @@ void ucs_fill_filename_template(const char *tmpl, char *buf, size_t max) break; case 't': t = time(NULL); - strftime(p, end - p, "%Y-%m-%d-%H:%M:%S", localtime(&t)); + strftime(p, end - p, "%Y-%m-%d-%H-%M-%S", localtime(&t)); pf = pp + 2; p += strlen(p); break; @@ -109,22 +115,91 @@ uint64_t ucs_string_to_id(const char* str) return id; } -void ucs_memunits_to_str(size_t value, char *buf, size_t max) +size_t ucs_string_quantity_prefix_value(char prefix) { - static const char * suffixes[] = {"", "K", "M", "G", "T", NULL}; + switch (prefix) { + case 'B': + return 1; + case 'K': + return UCS_KBYTE; + case 'M': + return UCS_MBYTE; + case 'G': + return UCS_GBYTE; + case 'T': + return UCS_TBYTE; + default: + return 0; + } +} +char *ucs_memunits_to_str(size_t value, char *buf, size_t max) +{ const char **suffix; - if (value == SIZE_MAX) { - strncpy(buf, "(inf)", max); + if (value == UCS_MEMUNITS_INF) { + ucs_strncpy_safe(buf, UCS_NUMERIC_INF_STR, max); + } else if (value == UCS_MEMUNITS_AUTO) { + ucs_strncpy_safe(buf, UCS_VALUE_AUTO_STR, max); } else { - suffix = &suffixes[0]; + suffix = &ucs_memunits_suffixes[0]; while ((value >= 1024) 
&& ((value % 1024) == 0) && *(suffix + 1)) { value /= 1024; ++suffix; } - snprintf(buf, max, "%zu%s", value, *suffix); + ucs_snprintf_safe(buf, max, "%zu%s", value, *suffix); } + return buf; +} + +ucs_status_t ucs_str_to_memunits(const char *buf, void *dest) +{ + char units[3]; + int num_fields; + size_t value; + size_t bytes; + + /* Special value: infinity */ + if (!strcasecmp(buf, UCS_NUMERIC_INF_STR)) { + *(size_t*)dest = UCS_MEMUNITS_INF; + return UCS_OK; + } + + /* Special value: auto */ + if (!strcasecmp(buf, UCS_VALUE_AUTO_STR)) { + *(size_t*)dest = UCS_MEMUNITS_AUTO; + return UCS_OK; + } + + memset(units, 0, sizeof(units)); + num_fields = sscanf(buf, "%ld%c%c", &value, &units[0], &units[1]); + if (num_fields == 1) { + bytes = 1; + } else if ((num_fields == 2) || (num_fields == 3)) { + bytes = ucs_string_quantity_prefix_value(toupper(units[0])); + if (!bytes || ((num_fields == 3) && tolower(units[1]) != 'b')) { + return UCS_ERR_INVALID_PARAM; + } + } else { + return UCS_ERR_INVALID_PARAM; + } + + *(size_t*)dest = value * bytes; + return UCS_OK; +} + +void ucs_snprintf_safe(char *buf, size_t size, const char *fmt, ...) 
+{ + va_list ap; + + if (size == 0) { + return; + } + + va_start(ap, fmt); + vsnprintf(buf, size - 1, fmt, ap); + buf[size - 1] = '\0'; + va_end(ap); } char* ucs_strncpy_safe(char *dst, const char *src, size_t len) @@ -164,3 +239,88 @@ char *ucs_strtrim(char *str) return start; } + +const char * ucs_str_dump_hex(const void* data, size_t length, char *buf, + size_t max, size_t per_line) +{ + static const char hexchars[] = "0123456789abcdef"; + char *p, *endp; + uint8_t value; + size_t i; + + p = buf; + endp = buf + max - 2; + i = 0; + while ((p < endp) && (i < length)) { + if (i > 0) { + if ((i % per_line) == 0) { + *(p++) = '\n'; + } else if ((i % 4) == 0) { + *(p++) = ':'; + } + + if (p == endp) { + break; + } + } + + value = *(const uint8_t*)(UCS_PTR_BYTE_OFFSET(data, i)); + p[0] = hexchars[value / 16]; + p[1] = hexchars[value % 16]; + p += 2; + ++i; + } + *p = 0; + return buf; +} + +const char* ucs_flags_str(char *buf, size_t max, + uint64_t flags, const char **str_table) +{ + size_t i, len = 0; + + for (i = 0; *str_table; ++str_table, ++i) { + if (flags & UCS_BIT(i)) { /* not using ucs_for_each_bit to silence coverity */ + snprintf(buf + len, max - len, "%s,", *str_table); + len = strlen(buf); + } + } + + if (len > 0) { + buf[len - 1] = '\0'; /* remove last ',' */ + } else { + buf[0] = '\0'; + } + + return buf; +} + +ssize_t ucs_path_calc_distance(const char *path1, const char *path2) +{ + unsigned distance = 0; + int same = 1; + char resolved_path1[PATH_MAX], resolved_path2[PATH_MAX]; + size_t comp_len, i; + size_t rp_len1, rp_len2; + + if ((NULL == realpath(path1, resolved_path1)) || + (NULL == realpath(path2, resolved_path2))) { + return UCS_ERR_INVALID_PARAM; + } + + rp_len1 = strlen(resolved_path1); + rp_len2 = strlen(resolved_path2); + comp_len = ucs_min(rp_len1, rp_len2); + + for (i = 0; i < comp_len; i++) { + if (resolved_path1[i] != resolved_path2[i]) { + same = 0; + } + + if ((resolved_path1[i] == '/') && !same) { + distance++; + } + } + + return 
distance; +} diff --git a/src/ucs/sys/string.h b/src/ucs/sys/string.h index 5e7718d6eb3..b011a5246bd 100644 --- a/src/ucs/sys/string.h +++ b/src/ucs/sys/string.h @@ -8,15 +8,34 @@ #define UCS_STRING_H_ #include "compiler_def.h" +#include +#include #include #include +#include #include BEGIN_C_DECLS /** @file string.h */ +/* value which specifies "infinity" for a numeric variable */ +#define UCS_NUMERIC_INF_STR "inf" + +/* value which specifies "auto" for a variable */ +#define UCS_VALUE_AUTO_STR "auto" + +/* the numeric value of "infinity" */ +#define UCS_MEMUNITS_INF ((size_t)-1) +#define UCS_ULUNITS_INF ((unsigned long)-1) + +/* value which specifies "auto" for a numeric variable */ +#define UCS_MEMUNITS_AUTO ((size_t)-2) +#define UCS_ULUNITS_AUTO ((unsigned long)-2) +#define UCS_HEXUNITS_AUTO ((uint16_t)-2) + + /** * Expand a partial path to full path. * @@ -71,8 +90,41 @@ uint64_t ucs_string_to_id(const char *str); * @param value Value to convert. * @param buf Buffer to place the string. * @param max Maximal length of the buffer. + * + * @return Pointer to 'buf', which holds the resulting string. + */ +char *ucs_memunits_to_str(size_t value, char *buf, size_t max); + + +/** + * Convert a string holding memory units to a numeric value. + * + * @param buf String to convert + * @param dest Numeric value of the string + * + * @return UCS_OK if successful, or error code otherwise. + */ +ucs_status_t ucs_str_to_memunits(const char *buf, void *dest); + + +/** + * Return the numeric value of the memunits prefix. + * For example: + * 'M' -> 1048576 */ -void ucs_memunits_to_str(size_t value, char *buf, size_t max); +size_t ucs_string_quantity_prefix_value(char prefix); + + +/** + * Format a string to a buffer of given size, and guarantee that the last char + * in the buffer is '\0'. + * + * @param buf Buffer to format the string to. + * @param size Buffer size. + * @param fmt Format string. + */ +void ucs_snprintf_safe(char *buf, size_t size, const char *fmt, ...) 
+ UCS_F_PRINTF(3, 4); /** @@ -98,6 +150,71 @@ char* ucs_strncpy_safe(char *dst, const char *src, size_t len); char *ucs_strtrim(char *str); +/** + * Get pointer to file name in path, same as basename but do not + * modify source string. + * + * @param path Path to parse. + * + * @return file name + */ +static UCS_F_ALWAYS_INLINE const char* ucs_basename(const char *path) +{ + const char *name = strrchr(path, '/'); + + return (name == NULL) ? path : name + 1; +} + + +/** + * Dump binary array into string in hex format. Destination string is + * always ended by '\0'. + * + * @param data Source array to dump. + * @param length Length of source array in bytes. + * @param buf Destination string. + * @param max Max length of destination string including terminating + * '\0' byte. + * @param per_line Number of bytes in source array to print per line + * or SIZE_MAX for single line. + * + * @return address of destination buffer + */ +const char *ucs_str_dump_hex(const void* data, size_t length, char *buf, + size_t max, size_t per_line); + + +/** + * Convert the given flags to a string that represents them. + * + * @param str String to hold the flags string values. + * @param max Size of the string. + * @param flags Flags to be converted. + * @param str_table Conversion table - from flag value to a string. + * + * @return String that holds the representation of the given flags. + */ +const char* ucs_flags_str(char *str, size_t max, + uint64_t flags, const char **str_table); + + +/** + * Get estimated number of segments different in the two paths. Segments are + * separated by `/`. 
+ * + * @param path1 String pointing to first path + * @param path2 String pointing to second path + * + * @return if either of the paths are invalid, UINT_MAX; if paths are the same 0 + * is returned; otherwise in between + */ +ssize_t ucs_path_calc_distance(const char *path1, const char *path2); + + +/** Quantifier suffixes for memory units ("K", "M", "G", etc) */ +extern const char *ucs_memunits_suffixes[]; + + END_C_DECLS #endif diff --git a/src/ucs/sys/stubs.c b/src/ucs/sys/stubs.c new file mode 100644 index 00000000000..ca577409d4a --- /dev/null +++ b/src/ucs/sys/stubs.c @@ -0,0 +1,83 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include + + +void ucs_empty_function() +{ +} + +unsigned ucs_empty_function_return_zero() +{ + return 0; +} + +int64_t ucs_empty_function_return_zero_int64() +{ + return 0; +} + +unsigned ucs_empty_function_return_one() +{ + return 1; +} + +ucs_status_t ucs_empty_function_return_success() +{ + return UCS_OK; +} + +ucs_status_t ucs_empty_function_return_unsupported() +{ + return UCS_ERR_UNSUPPORTED; +} + +ucs_status_t ucs_empty_function_return_inprogress() +{ + return UCS_INPROGRESS; +} + +ucs_status_t ucs_empty_function_return_no_resource() +{ + return UCS_ERR_NO_RESOURCE; +} + +ucs_status_t ucs_empty_function_return_invalid_param() +{ + return UCS_ERR_INVALID_PARAM; +} + +ucs_status_ptr_t ucs_empty_function_return_ptr_no_resource() +{ + return UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE); +} + +ucs_status_t ucs_empty_function_return_ep_timeout() +{ + return UCS_ERR_ENDPOINT_TIMEOUT; +} + +ssize_t ucs_empty_function_return_bc_ep_timeout() +{ + return UCS_ERR_ENDPOINT_TIMEOUT; +} + +ucs_status_t ucs_empty_function_return_busy() +{ + return UCS_ERR_BUSY; +} + +int ucs_empty_function_do_assert() +{ + ucs_assert_always(0); + return 0; +} diff --git a/src/ucs/sys/stubs.h b/src/ucs/sys/stubs.h new file mode 
100644 index 00000000000..7310bdce354 --- /dev/null +++ b/src/ucs/sys/stubs.h @@ -0,0 +1,40 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifndef UCS_STUBS_H +#define UCS_STUBS_H + +#include + +#include +#include +#include + +BEGIN_C_DECLS + +/** @file stubs.h */ + +/** + * Empty function which can be casted to a no-operation callback in various situations. + */ +void ucs_empty_function(); +unsigned ucs_empty_function_return_zero(); +unsigned ucs_empty_function_return_one(); +int64_t ucs_empty_function_return_zero_int64(); +ucs_status_t ucs_empty_function_return_success(); +ucs_status_t ucs_empty_function_return_unsupported(); +ucs_status_t ucs_empty_function_return_inprogress(); +ucs_status_t ucs_empty_function_return_no_resource(); +ucs_status_t ucs_empty_function_return_invalid_param(); +ucs_status_ptr_t ucs_empty_function_return_ptr_no_resource(); +ucs_status_t ucs_empty_function_return_ep_timeout(); +ssize_t ucs_empty_function_return_bc_ep_timeout(); +ucs_status_t ucs_empty_function_return_busy(); +int ucs_empty_function_do_assert(); + +END_C_DECLS + +#endif diff --git a/src/ucs/sys/sys.c b/src/ucs/sys/sys.c index 20e41b754a5..62e555b409d 100644 --- a/src/ucs/sys/sys.c +++ b/src/ucs/sys/sys.c @@ -1,6 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2012. ALL RIGHTS RESERVED. -* Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. +* Copyright (c) UT-Battelle, LLC. 2014-2019. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016-2017. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
@@ -10,14 +10,18 @@ # include "config.h" #endif -#include "sys.h" -#include "checker.h" -#include "string.h" -#include "math.h" +#include +#include +#include +#include #include #include +#include #include + +#include +#include #include #include #include @@ -26,6 +30,9 @@ #include #include #include +#ifdef HAVE_SYS_THR_H +#include +#endif #if HAVE_SYS_CAPABILITY_H # include @@ -33,12 +40,50 @@ /* Default huge page size is 2 MBytes */ #define UCS_DEFAULT_MEM_FREE 640000 -#define UCS_PROCESS_MAPS_FILE "/proc/self/maps" +#define UCS_PROCESS_SMAPS_FILE "/proc/self/smaps" +#define UCS_PROCESS_NS_DIR "/proc/self/ns" +#define UCS_PROCESS_BOOTID_FILE "/proc/sys/kernel/random/boot_id" +#define UCS_PROCESS_BOOTID_FMT "%x-%4hx-%4hx-%4hx-%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx" +#define UCS_PROCESS_NS_FIRST 0xF0000000U +#define UCS_PROCESS_NS_NET_DFLT 0xF0000080U + + +struct { + const char *name; + ucs_sys_ns_t dflt; +} static ucs_sys_namespace_info[] = { + [UCS_SYS_NS_TYPE_IPC] = {.name = "ipc", .dflt = UCS_PROCESS_NS_FIRST - 1}, + [UCS_SYS_NS_TYPE_MNT] = {.name = "mnt", .dflt = UCS_PROCESS_NS_FIRST - 0}, + [UCS_SYS_NS_TYPE_NET] = {.name = "net", .dflt = UCS_PROCESS_NS_NET_DFLT}, + [UCS_SYS_NS_TYPE_PID] = {.name = "pid", .dflt = UCS_PROCESS_NS_FIRST - 4}, + [UCS_SYS_NS_TYPE_USER] = {.name = "user", .dflt = UCS_PROCESS_NS_FIRST - 3}, + [UCS_SYS_NS_TYPE_UTS] = {.name = "uts", .dflt = UCS_PROCESS_NS_FIRST - 2} +}; + +typedef struct { + void *ctx; + ucs_sys_enum_threads_cb_t cb; +} ucs_sys_enum_threads_t; + +static const char *ucs_pagemap_file = "/proc/self/pagemap"; + + +const char *ucs_get_tmpdir() +{ + char *env_tmpdir; + + env_tmpdir = getenv("TMPDIR"); + if (env_tmpdir) { + return env_tmpdir; + } else { + return "/tmp/"; + } +} const char *ucs_get_host_name() { - static char hostname[256] = {0}; + static char hostname[HOST_NAME_MAX] = {0}; if (*hostname == 0) { gethostname(hostname, sizeof(hostname)); @@ -102,7 +147,7 @@ uint32_t ucs_file_checksum(const char *filename) do { nread = 
read(fd, buffer, sizeof(buffer)); if (nread > 0) { - crc = ucs_calc_crc32(crc, buffer, nread); + crc = ucs_crc32(crc, buffer, nread); } } while (nread == sizeof(buffer)); close(fd); @@ -210,7 +255,7 @@ static long ucs_sysconf(int name) int ucs_get_first_cpu() { int first_cpu, total_cpus, ret; - cpu_set_t mask; + ucs_sys_cpuset_t mask; ret = ucs_sysconf(_SC_NPROCESSORS_CONF); if (ret < 0) { @@ -220,7 +265,7 @@ int ucs_get_first_cpu() total_cpus = ret; CPU_ZERO(&mask); - ret = sched_getaffinity(0, sizeof(mask), &mask); + ret = ucs_sys_getaffinity(&mask); if (ret < 0) { ucs_error("failed to get process affinity: %m"); return ret; @@ -252,7 +297,7 @@ uint64_t ucs_generate_uuid(uint64_t seed) ucs_status_t ucs_open_output_stream(const char *config_str, ucs_log_level_t err_log_level, FILE **p_fstream, int *p_need_close, - const char **p_next_token) + const char **p_next_token, char **p_filename) { FILE *output_stream; char filename[256]; @@ -260,7 +305,10 @@ ucs_open_output_stream(const char *config_str, ucs_log_level_t err_log_level, const char *p; size_t len; - *p_next_token = config_str; + *p_next_token = config_str; + if (p_filename != NULL) { + *p_filename = NULL; + } len = strcspn(config_str, ":"); if (!strncmp(config_str, "stdout", len)) { @@ -290,6 +338,16 @@ ucs_open_output_stream(const char *config_str, ucs_log_level_t err_log_level, return UCS_ERR_IO_ERROR; } + if (p_filename != NULL) { + *p_filename = ucs_strdup(filename, "filename"); + if (*p_filename == NULL) { + ucs_log(err_log_level, "failed to allocate filename for '%s'", + filename); + fclose(output_stream); + return UCS_ERR_NO_MEMORY; + } + } + *p_fstream = output_stream; *p_need_close = 1; *p_next_token = p + len; @@ -375,19 +433,22 @@ ucs_status_t ucs_read_file_number(long *value, int silent, return UCS_OK; } -size_t ucs_get_max_iov() +ssize_t ucs_read_file_str(char *buffer, size_t max, int silent, + const char *filename_fmt, ...) 
{ - static long max_iov = 0; - - if (max_iov == 0) { - max_iov = ucs_sysconf(_SC_IOV_MAX); - if (max_iov < 0) { - max_iov = 1; - ucs_debug("_SC_IOV_MAX is undefined, setting default value to %ld", - max_iov); - } + size_t max_read = ucs_max(max, 1) - 1; + ssize_t read_bytes; + va_list ap; + + va_start(ap, filename_fmt); + read_bytes = ucs_read_file_vararg(buffer, max_read, silent, filename_fmt, ap); + va_end(ap); + + if ((read_bytes >= 0) && (max > 0)) { + buffer[read_bytes] = '\0'; } - return max_iov; + + return read_bytes; } size_t ucs_get_page_size() @@ -405,6 +466,64 @@ size_t ucs_get_page_size() return page_size; } +void ucs_get_mem_page_size(void *address, size_t size, size_t *min_page_size_p, + size_t *max_page_size_p) +{ + int found = 0; + unsigned long start, end; + unsigned long page_size_kb; + size_t page_size; + char buf[1024]; + FILE *file; + int n; + + file = fopen(UCS_PROCESS_SMAPS_FILE, "r"); + if (!file) { + goto out; + } + + while (fgets(buf, sizeof(buf), file) != NULL) { + n = sscanf(buf, "%lx-%lx", &start, &end); + if (n != 2) { + continue; + } + + if (start > (uintptr_t)address + size) { + /* the scanned range is after memory range of interest - stop */ + break; + } + if (end <= (uintptr_t)address) { + /* the scanned range is still before the memory range of interest */ + continue; + } + + while (fgets(buf, sizeof(buf), file) != NULL) { + n = sscanf(buf, "KernelPageSize: %lu kB", &page_size_kb); + if (n < 1) { + continue; + } + + page_size = page_size_kb * UCS_KBYTE; + if (found) { + *min_page_size_p = ucs_min(*min_page_size_p, page_size); + *max_page_size_p = ucs_max(*max_page_size_p, page_size); + } else { + found = 1; + *min_page_size_p = page_size; + *max_page_size_p = page_size; + } + break; + } + } + + fclose(file); + +out: + if (!found) { + *min_page_size_p = *max_page_size_p = ucs_get_page_size(); + } +} + static ssize_t ucs_get_meminfo_entry(const char* pattern) { char buf[256]; @@ -431,7 +550,7 @@ static ssize_t 
ucs_get_meminfo_entry(const char* pattern) size_t ucs_get_memfree_size() { - size_t mem_free; + ssize_t mem_free; mem_free = ucs_get_meminfo_entry("MemFree"); if (mem_free == -1) { @@ -543,7 +662,6 @@ static void ucs_sysv_shmget_error_check_ENOSPC(size_t alloc_size, ", total shared memory pages in the system (%lu) would exceed the" " limit in /proc/sys/kernel/shmall (=%lu)", new_shm_tot, ipc_info->shmall); - p += strlen(p); } } @@ -637,23 +755,26 @@ ucs_status_t ucs_sysv_alloc(size_t *size, size_t max_size, void **address_p, int flags, const char *alloc_name, int *shmid) { char error_string[256]; +#ifdef SHM_HUGETLB ssize_t huge_page_size; +#endif size_t alloc_size; int sys_errno; void *ptr; int ret; +#ifdef SHM_HUGETLB if (flags & SHM_HUGETLB) { huge_page_size = ucs_get_huge_page_size(); if (huge_page_size <= 0) { ucs_debug("huge pages are not supported on the system"); return UCS_ERR_NO_MEMORY; /* Huge pages not supported */ } - } - if (flags & SHM_HUGETLB) { alloc_size = ucs_align_up(*size, huge_page_size); - } else { + } else +#endif + { alloc_size = ucs_align_up(*size, ucs_get_page_size()); } @@ -670,7 +791,10 @@ ucs_status_t ucs_sysv_alloc(size_t *size, size_t max_size, void **address_p, switch (sys_errno) { case ENOMEM: case EPERM: - if (!(flags & SHM_HUGETLB)) { +#ifdef SHM_HUGETLB + if (!(flags & SHM_HUGETLB)) +#endif + { ucs_error("%s", error_string); } return UCS_ERR_NO_MEMORY; @@ -753,8 +877,11 @@ ucs_status_t ucs_mmap_alloc(size_t *size, void **address_p, ucs_status_t ucs_mmap_free(void *address, size_t length) { int ret; + size_t alloc_length; - ret = ucs_munmap(address, length); + alloc_length = ucs_align_up_pow2(length, ucs_get_page_size()); + + ret = ucs_munmap(address, alloc_length); if (ret != 0) { ucs_warn("munmap(address=%p, length=%zu) failed: %m", address, length); return UCS_ERR_INVALID_PARAM; @@ -769,7 +896,8 @@ typedef struct { int found; } ucs_get_mem_prot_ctx_t; -static int ucs_get_mem_prot_cb(void *arg, void *addr, size_t length, int 
prot) +static int ucs_get_mem_prot_cb(void *arg, void *addr, size_t length, int prot, + const char *path) { ucs_get_mem_prot_ctx_t *ctx = arg; unsigned long seg_start = (uintptr_t)addr; @@ -829,44 +957,88 @@ const char* ucs_get_process_cmdline() return cmdline; } -unsigned long ucs_sys_get_pfn(uintptr_t address) +static ucs_status_t +ucs_sys_enum_pfn_internal(int pagemap_fd, unsigned start_page, uint64_t *data, + uintptr_t address, unsigned page_count, + ucs_sys_enum_pfn_cb_t cb, void *ctx) { - static const char *pagemap_file = "/proc/self/pagemap"; - static int initialized = 0; - static int pagemap_fd; - uint64_t data; off_t offset; ssize_t ret; + size_t len; + unsigned i; + + offset = ((address / ucs_get_page_size()) + start_page) * sizeof(*data); + len = page_count * sizeof(*data); + ret = pread(pagemap_fd, data, len, offset); + if (ret < 0) { + ucs_warn("pread(file=%s offset=%zu) failed: %m", ucs_pagemap_file, offset); + return UCS_ERR_IO_ERROR; + } + + for (i = 0; i < ret / sizeof(*data); i++) { + if (!(data[i] & UCS_BIT(63))) { + ucs_trace("address 0x%lx not present", + address + (ucm_get_page_size() * (i + start_page))); + return UCS_ERR_IO_ERROR; + } + + cb(i + start_page, data[i] & UCS_MASK(55), ctx); + } + + return UCS_OK; +} + +ucs_status_t ucs_sys_enum_pfn(uintptr_t address, unsigned page_count, + ucs_sys_enum_pfn_cb_t cb, void *ctx) +{ + /* by default use 1K buffer on stack */ + const int UCS_SYS_ENUM_PFN_ELEM_CNT = ucs_min(128, UCS_ALLOCA_MAX_SIZE / + sizeof(uint64_t)); + static int initialized = 0; + ucs_status_t status = UCS_OK; + static int pagemap_fd; + uint64_t *data; + unsigned page_num; if (!initialized) { - pagemap_fd = open(pagemap_file, O_RDONLY); + pagemap_fd = open(ucs_pagemap_file, O_RDONLY); if (pagemap_fd < 0) { - ucs_warn("failed to open %s: %m", pagemap_file); + ucs_warn("failed to open %s: %m", ucs_pagemap_file); } initialized = 1; } if (pagemap_fd < 0) { - return 0; /* could not open file */ + return UCS_ERR_IO_ERROR; /* could not 
open file */ } - offset = (address / ucs_get_page_size()) * sizeof(data); - data = 0; - ret = pread(pagemap_fd, &data, sizeof(data), offset); - if (ret < 0) { - ucs_warn("pread(file=%s offset=%zu) failed: %m", pagemap_file, offset); - return 0; - } + data = ucs_alloca(ucs_min(UCS_SYS_ENUM_PFN_ELEM_CNT, page_count) * + sizeof(*data)); - if (!(data & UCS_BIT(63))) { - ucs_trace("address 0x%lx not present", address); - return 0; + for (page_num = 0; (page_num < page_count) && (status == UCS_OK); + page_num += UCS_SYS_ENUM_PFN_ELEM_CNT) { + status = ucs_sys_enum_pfn_internal(pagemap_fd, page_num, data, address, + ucs_min(UCS_SYS_ENUM_PFN_ELEM_CNT, + page_count - page_num), + cb, ctx); } - return data & UCS_MASK(55); + return status; } -ucs_status_t ucs_sys_fcntl_modfl(int fd, int add, int remove) +static void ucs_sys_get_pfn_cb(unsigned page_number, unsigned long pfn, + void *ctx) +{ + ((unsigned long*)ctx)[page_number] = pfn; +} + +ucs_status_t ucs_sys_get_pfn(uintptr_t address, unsigned page_count, + unsigned long *data) +{ + return ucs_sys_enum_pfn(address, page_count, ucs_sys_get_pfn_cb, data); +} + +ucs_status_t ucs_sys_fcntl_modfl(int fd, int add, int rem) { int oldfl, ret; @@ -876,7 +1048,7 @@ ucs_status_t ucs_sys_fcntl_modfl(int fd, int add, int remove) return UCS_ERR_IO_ERROR; } - ret = fcntl(fd, F_SETFL, (oldfl | add) & ~remove); + ret = fcntl(fd, F_SETFL, (oldfl | add) & ~rem); if (ret < 0) { ucs_error("fcntl(fd=%d, F_SETFL) returned %d: %m", fd, ret); return UCS_ERR_IO_ERROR; @@ -887,12 +1059,27 @@ ucs_status_t ucs_sys_fcntl_modfl(int fd, int add, int remove) pid_t ucs_get_tid(void) { +#ifdef SYS_gettid return syscall(SYS_gettid); +#elif defined(HAVE_SYS_THR_H) + long id; + + thr_self(&id); + return (id); +#else +#error "Port me" +#endif } int ucs_tgkill(int tgid, int tid, int sig) { +#ifdef SYS_tgkill return syscall(SYS_tgkill, tgid, tid, sig); +#elif defined(HAVE_SYS_THR_H) + return (thr_kill2(tgid, tid, sig)); +#else +#error "Port me" +#endif } double 
ucs_get_cpuinfo_clock_freq(const char *header, double scale) @@ -981,56 +1168,203 @@ void ucs_sys_free(void *ptr, size_t length) } } -void ucs_empty_function() +char* ucs_make_affinity_str(const ucs_sys_cpuset_t *cpuset, char *str, size_t len) { -} + int i = 0, prev = -1; + char *p = str; -unsigned ucs_empty_function_return_zero() -{ - return 0; + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, cpuset)) { + if (prev < 0) { + prev = i; + } + } else { + if (prev >= 0) { + if (prev == i - 1) { + p += snprintf(p, str + len - p, "%d,", prev); + } else { + p += snprintf(p, str + len - p, "%d-%d,", prev, i - 1); + } + } + if (p > str + len) { + p = str + len - 4; + while (*p != ',') { + p--; + } + sprintf(p, "..."); + return str; + } + prev = -1; + } + } + + *(--p) = 0; + return str; } -int64_t ucs_empty_function_return_zero_int64() +int ucs_sys_setaffinity(ucs_sys_cpuset_t *cpuset) { - return 0; + int ret; + +#if defined(HAVE_SCHED_SETAFFINITY) + ret = sched_setaffinity(0, sizeof(*cpuset), cpuset); +#elif defined(HAVE_CPUSET_SETAFFINITY) + ret = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, getpid(), + sizeof(*cpuset), cpuset); +#else +#error "Port me" +#endif + return ret; } -ucs_status_t ucs_empty_function_return_success() +int ucs_sys_getaffinity(ucs_sys_cpuset_t *cpuset) { - return UCS_OK; + int ret; + +#if defined(HAVE_SCHED_GETAFFINITY) + ret = sched_getaffinity(0, sizeof(*cpuset), cpuset); +#elif defined(HAVE_CPUSET_GETAFFINITY) + ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, getpid(), + sizeof(*cpuset), cpuset); +#else +#error "Port me" +#endif + return ret; } -ucs_status_t ucs_empty_function_return_unsupported() +void ucs_sys_cpuset_copy(ucs_cpu_set_t *dst, const ucs_sys_cpuset_t *src) { - return UCS_ERR_UNSUPPORTED; + int c; + + UCS_CPU_ZERO(dst); + for (c = 0; c < UCS_CPU_SETSIZE; ++c) { + if (CPU_ISSET(c, src)) { + UCS_CPU_SET(c, dst); + } + } } -ucs_status_t ucs_empty_function_return_inprogress() +ucs_sys_ns_t 
ucs_sys_get_ns(ucs_sys_namespace_type_t ns) { - return UCS_INPROGRESS; + char filename[MAXPATHLEN]; + int res; + struct stat st; + + if (ns >= UCS_SYS_NS_TYPE_LAST) { + return 0; + } + + snprintf(filename, sizeof(filename), "%s/%s", UCS_PROCESS_NS_DIR, + ucs_sys_namespace_info[ns].name); + + res = stat(filename, &st); + if (res == 0) { + return (ucs_sys_ns_t)st.st_ino; + } + + return ucs_sys_namespace_info[ns].dflt; } -ucs_status_t ucs_empty_function_return_no_resource() +int ucs_sys_ns_is_default(ucs_sys_namespace_type_t ns) { - return UCS_ERR_NO_RESOURCE; + return ucs_sys_get_ns(ns) == ucs_sys_namespace_info[ns].dflt; } -ucs_status_ptr_t ucs_empty_function_return_ptr_no_resource() +ucs_status_t ucs_sys_get_boot_id(uint64_t *high, uint64_t *low) { - return UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE); + static struct { + uint64_t high; + uint64_t low; + } boot_id = {0, 0}; + + static ucs_init_once_t init_once = UCS_INIT_ONCE_INITIALIZER; + static ucs_status_t status = UCS_ERR_IO_ERROR; + char bootid_str[256]; + ssize_t size; + uint32_t v1; + uint16_t v2; + uint16_t v3; + uint16_t v4; + uint8_t v5[6]; + int res; + int i; + + UCS_INIT_ONCE(&init_once) { + size = ucs_read_file_str(bootid_str, sizeof(bootid_str), 1, + "%s", UCS_PROCESS_BOOTID_FILE); + if (size <= 0) { + continue; /* jump out of INIT_ONCE section */ + } + + res = sscanf(bootid_str, UCS_PROCESS_BOOTID_FMT, + &v1, &v2, &v3, &v4, + &v5[0], &v5[1], &v5[2], + &v5[3], &v5[4], &v5[5]); + if (res == 10) { /* 10 values should be scanned */ + status = UCS_OK; + boot_id.low = ((uint64_t)v1) | ((uint64_t)v2 << 32) | + ((uint64_t)v3 << 48); + boot_id.high = v4; + for (i = 0; i < ucs_array_size(v5); i++) { + boot_id.high |= (uint64_t)v5[i] << (16 + (i * 8)); + } + } + } + + if (status == UCS_OK) { + *high = boot_id.high; + *low = boot_id.low; + } + + return status; } -ucs_status_t ucs_empty_function_return_ep_timeout() +ucs_status_t ucs_sys_readdir(const char *path, ucs_sys_readdir_cb_t cb, void *ctx) { - return 
UCS_ERR_ENDPOINT_TIMEOUT; + ucs_status_t res = 0; + DIR *dir; + struct dirent *entry; + struct dirent *entry_out; + size_t entry_len; + + dir = opendir(path); + if (dir == NULL) { + return UCS_ERR_NO_ELEM; /* failed to open directory */ + } + + entry_len = ucs_offsetof(struct dirent, d_name) + + fpathconf(dirfd(dir), _PC_NAME_MAX) + 1; + entry = (struct dirent*)malloc(entry_len); + if (entry == NULL) { + res = UCS_ERR_NO_MEMORY; + goto failed_no_mem; + } + + while (!readdir_r(dir, entry, &entry_out) && (entry_out != NULL)) { + res = cb(entry, ctx); + if (res != UCS_OK) { + break; + } + } + + free(entry); +failed_no_mem: + closedir(dir); + return res; } -ssize_t ucs_empty_function_return_bc_ep_timeout() +static ucs_status_t ucs_sys_enum_threads_cb(struct dirent *entry, void *_ctx) { - return UCS_ERR_ENDPOINT_TIMEOUT; + ucs_sys_enum_threads_t *ctx = (ucs_sys_enum_threads_t*)_ctx; + + return strncmp(entry->d_name, ".", 1) ? + ctx->cb((pid_t)atoi(entry->d_name), ctx->ctx) : 0; } -ucs_status_t ucs_empty_function_return_busy() +ucs_status_t ucs_sys_enum_threads(ucs_sys_enum_threads_cb_t cb, void *ctx) { - return UCS_ERR_BUSY; + static const char *task_dir = "/proc/self/task"; + ucs_sys_enum_threads_t param = {.ctx = ctx, .cb = cb}; + + return ucs_sys_readdir(task_dir, &ucs_sys_enum_threads_cb, ¶m); } diff --git a/src/ucs/sys/sys.h b/src/ucs/sys/sys.h index 13202b5ba30..0ff07c46ac9 100644 --- a/src/ucs/sys/sys.h +++ b/src/ucs/sys/sys.h @@ -1,6 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. -* Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. +* Copyright (c) UT-Battelle, LLC. 2014-2019. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
@@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -24,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -48,11 +48,65 @@ #include #include #include +#include + + +#include +#if defined(__linux__) || defined(HAVE_CPU_SET_T) +#include +typedef cpu_set_t ucs_sys_cpuset_t; +#elif defined(__FreeBSD__) || defined(HAVE_CPUSET_T) +#include +typedef cpuset_t ucs_sys_cpuset_t; +#else +#error "Port me" +#endif + BEGIN_C_DECLS /** @file sys.h */ + +typedef ino_t ucs_sys_ns_t; + + +/* namespace type used in @ref ucs_sys_get_ns and @ref ucs_sys_ns_is_default */ +typedef enum { + UCS_SYS_NS_TYPE_IPC, + UCS_SYS_NS_TYPE_MNT, + UCS_SYS_NS_TYPE_NET, + UCS_SYS_NS_TYPE_PID, + UCS_SYS_NS_TYPE_USER, + UCS_SYS_NS_TYPE_UTS, + UCS_SYS_NS_TYPE_LAST +} ucs_sys_namespace_type_t; + + +/** + * Callback function type used in ucs_sys_readdir. + */ +typedef ucs_status_t (*ucs_sys_readdir_cb_t)(struct dirent *entry, void *ctx); + + +/** + * Callback function type used in ucs_sys_enum_threads. + */ +typedef ucs_status_t (*ucs_sys_enum_threads_cb_t)(pid_t pid, void *ctx); + + +/** + * Callback function type used in ucs_sys_enum_pfn. + */ +typedef void (*ucs_sys_enum_pfn_cb_t)(unsigned page_number, unsigned long pfn, + void *ctx); + + +/** + * @return TMPDIR environment variable if set. Otherwise, return "/tmp". + */ +const char *ucs_get_tmpdir(); + /** * @return Host name. */ @@ -115,14 +169,26 @@ uint64_t ucs_generate_uuid(uint64_t seed); * - stdout * - stderr * - * *p_fstream is filled with the stream handle, *p_need_close is set to whether - * fclose() should be called to release resources, *p_next_token to the remainder - * of config_str. + * @param [in] config_str The file name or name of the output stream + * (stdout/stderr). + * @param [in] err_log_level Logging level that should be used for printing + * errors. + * @param [out] p_fstream Pointer that is filled with the stream handle. + * User is responsible to close tha stream handle then. 
+ * @param [out] p_need_close Pointer to the variable that is set to whether + * fclose() should be called to release resources (1) + * or not (0). + * @param [out] p_next_token Pointer that is set to remainder of @config_str. + * @oaram [out] p_filename Pointer to the variable that is filled with the + * resulted name of the log file (if it is not NULL). + * Caller is responsible to release memory then. + * + * @return UCS_OK if successful, or error code otherwise. */ ucs_status_t ucs_open_output_stream(const char *config_str, ucs_log_level_t err_log_level, FILE **p_fstream, int *p_need_close, - const char **p_next_token); + const char **p_next_token, char **p_filename); /** @@ -154,9 +220,17 @@ ucs_status_t ucs_read_file_number(long *value, int silent, /** - * @return Regular _SC_IOV_MAX on the system. + * Read file contents into a string closed by null terminator. + * + * @param buffer Buffer to fill with file contents. + * @param max Maximal buffer size. + * @param filename_fmt File name printf-like format string. + * + * @return Number of bytes read, or -1 in case of error. */ -size_t ucs_get_max_iov(); +ssize_t ucs_read_file_str(char *buffer, size_t max, int silent, + const char *filename_fmt, ...) + UCS_F_PRINTF(4, 5); /** @@ -165,6 +239,18 @@ size_t ucs_get_max_iov(); size_t ucs_get_page_size(); +/** + * Get page size of a memory region. + * + * @param [in] address Memory region start address, + * @param [in] size Memory region size. + * @param [out] min_page_size_p Set to the minimal page size in the memory region. + * @param [out] max_page_size_p Set to the maximal page size in the memory region. + */ +void ucs_get_mem_page_size(void *address, size_t size, size_t *min_page_size_p, + size_t *max_page_size_p); + + /** * @return Huge page size on the system, or -1 if unsupported. */ @@ -223,7 +309,7 @@ ucs_status_t ucs_mmap_alloc(size_t *size, void **address_p, * Release memory allocated via mmap API. 
* * @param address Address of memory to release as returned from @ref ucs_mmap_alloc. - * @param length Length of memory to release as returned from @ref ucs_mmap_alloc. + * @param length Length of memory to release passed to @ref ucs_mmap_alloc. */ ucs_status_t ucs_mmap_free(void *address, size_t length); @@ -245,10 +331,28 @@ int ucs_get_mem_prot(unsigned long start, unsigned long end); * If the page map file is non-readable (for example, due to permissions), or * the page is not present, this function returns 0. * - * @param address Virtual address to get the PFN for - * @return PFN number, or 0 if failed. + * @param address Virtual address to get the. PFN for. + * @param page_count Number of pages to process + * @param data Result buffer. + * @return UCS_OK if all pages are processed, else error code. + */ +ucs_status_t ucs_sys_get_pfn(uintptr_t address, unsigned page_count, + unsigned long *data); + + +/** + * Enums the physical page frame numbers of a given virtual address range. + * If the page map file is non-readable (for example, due to permissions), or + * the page is not present, this function returns error. + * + * @param address Virtual address to get the PFN for. + * @param page_number Number of pages to process. + * @param cb Callback function which is called for every page. + * @param ctx Context argument passed to @a cb call. + * @return error code if failed to enumerate or UCS_OK. 
*/ -unsigned long ucs_sys_get_pfn(uintptr_t address); +ucs_status_t ucs_sys_enum_pfn(uintptr_t address, unsigned page_count, + ucs_sys_enum_pfn_cb_t cb, void *ctx); /** @@ -337,21 +441,107 @@ void *ucs_sys_realloc(void *old_ptr, size_t old_length, size_t new_length); */ void ucs_sys_free(void *ptr, size_t length); +/** + * Fill human readable cpu set representation + * + * @param [in] cpuset Set of CPUs + * @param [in] str String to fill + * @param [in] len String length + * + * @return Filled string + */ +char *ucs_make_affinity_str(const ucs_sys_cpuset_t *cpuset, char *str, size_t len); + +/** + * Sets affinity for the current process. + * + * @param [in] cpuset Pointer to the cpuset to assign + * + * @return -1 on error with errno set, 0 on success + */ +int ucs_sys_setaffinity(ucs_sys_cpuset_t *cpuset); + +/** + * Queries affinity for the current process. + * + * @param [out] cpuset Pointer to the cpuset to return result + * + * @return -1 on error with errno set, 0 on success + */ +int ucs_sys_getaffinity(ucs_sys_cpuset_t *cpuset); + +/** + * Copies ucs_sys_cpuset_t to ucs_cpu_set_t. + * + * @param [in] src Source + * @param [out] dst Destination + */ +void ucs_sys_cpuset_copy(ucs_cpu_set_t *dst, const ucs_sys_cpuset_t *src); + +/** + * Get namespace id for resource. + * + * @param [in] name Namespace to get value + * + * @return namespace value or 0 if namespaces are not supported + */ +ucs_sys_ns_t ucs_sys_get_ns(ucs_sys_namespace_type_t name); + + +/** + * Check if namespace is namespace of host system. + * + * @param [in] name Namespace to evaluate + * + * @return 1 in case if namespace is root, 0 - in other cases + */ +int ucs_sys_ns_is_default(ucs_sys_namespace_type_t name); + + +/** + * Get 128-bit boot ID value. + * + * @param [out] high Pointer to high 64 bit of 128 boot ID + * @param [out] low Pointer to low 64 bit of 128 boot ID + * + * @return UCS_OK or error in case of failure. 
+ */ +ucs_status_t ucs_sys_get_boot_id(uint64_t *high, uint64_t *low); + /** - * Empty function which can be casted to a no-operation callback in various situations. + * Read directory + * + * @param [in] path Path to directory to read + * @param [in] cb Callback function, see NOTES + * @param [in] ctx Context pointer passed to callback + * + * @return UCS_OK if directory is found and successfully iterated thought all + * entries, error code in all other cases, see NOTES. + * + * @note ucs_sys_readdir function reads directory pointed by @a path argument + * and calls @a cb function for every entry in directory, including + * '.' and '..'. In case if @a cb function returns value different from + * UCS_OK then function breaks immediately and this value is returned + * from ucs_sys_readdir. */ -void ucs_empty_function(); -unsigned ucs_empty_function_return_zero(); -int64_t ucs_empty_function_return_zero_int64(); -ucs_status_t ucs_empty_function_return_success(); -ucs_status_t ucs_empty_function_return_unsupported(); -ucs_status_t ucs_empty_function_return_inprogress(); -ucs_status_t ucs_empty_function_return_no_resource(); -ucs_status_ptr_t ucs_empty_function_return_ptr_no_resource(); -ucs_status_t ucs_empty_function_return_ep_timeout(); -ssize_t ucs_empty_function_return_bc_ep_timeout(); -ucs_status_t ucs_empty_function_return_busy(); +ucs_status_t ucs_sys_readdir(const char *path, ucs_sys_readdir_cb_t cb, void *ctx); + +/** + * Enumerate process threads + * + * @param [in] cb Callback function, see NOTES + * @param [in] ctx Context pointer passed to callback + * + * @return UCS_OK if directory is found and successfully iterated thought all + * entries, error code in all other cases, see NOTES. + * + * @note ucs_sys_enum_threads function enumerates current process threads + * and calls @a cb function for every thread. 
In case if @a cb function + * returns value different from UCS_OK then function breaks + * immediately and this value is returned from ucs_sys_enum_threads. + */ +ucs_status_t ucs_sys_enum_threads(ucs_sys_enum_threads_cb_t cb, void *ctx); END_C_DECLS diff --git a/src/ucs/sys/topo.c b/src/ucs/sys/topo.c new file mode 100644 index 00000000000..0a6a15e628e --- /dev/null +++ b/src/ucs/sys/topo.c @@ -0,0 +1,151 @@ +/** +* Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define UCS_TOPO_MAX_SYS_DEVICES 1024 +#define UCS_TOPO_HOP_OVERHEAD 1E-7 + +typedef int64_t ucs_bus_id_bit_rep_t; + +typedef struct ucs_topo_sys_dev_to_bus_arr { + ucs_sys_bus_id_t bus_arr[UCS_TOPO_MAX_SYS_DEVICES]; + unsigned count; +} ucs_topo_sys_dev_to_bus_arr_t; + +KHASH_MAP_INIT_INT64(bus_to_sys_dev, ucs_sys_device_t); + +typedef struct ucs_topo_global_ctx { + khash_t(bus_to_sys_dev) bus_to_sys_dev_hash; + ucs_spinlock_t lock; + ucs_topo_sys_dev_to_bus_arr_t sys_dev_to_bus_lookup; +} ucs_topo_global_ctx_t; + +static ucs_topo_global_ctx_t ucs_topo_ctx; + +static ucs_bus_id_bit_rep_t ucs_topo_get_bus_id_bit_repr(const ucs_sys_bus_id_t *bus_id) +{ + return (((uint64_t)bus_id->domain << 24) | + ((uint64_t)bus_id->bus << 16) | + ((uint64_t)bus_id->slot << 8) | + (bus_id->function)); +} + +void ucs_topo_init() +{ + ucs_spinlock_init(&ucs_topo_ctx.lock, 0); + kh_init_inplace(bus_to_sys_dev, &ucs_topo_ctx.bus_to_sys_dev_hash); + ucs_topo_ctx.sys_dev_to_bus_lookup.count = 0; +} + +void ucs_topo_cleanup() +{ + ucs_status_t status; + + kh_destroy_inplace(bus_to_sys_dev, &ucs_topo_ctx.bus_to_sys_dev_hash); + + status = ucs_spinlock_destroy(&ucs_topo_ctx.lock); + if (status != UCS_OK) { + ucs_warn("ucs_recursive_spinlock_destroy() failed: %s", + ucs_status_string(status)); + } +} + +ucs_status_t 
ucs_topo_find_device_by_bus_id(const ucs_sys_bus_id_t *bus_id, + ucs_sys_device_t *sys_dev) +{ + khiter_t hash_it; + ucs_kh_put_t kh_put_status; + ucs_bus_id_bit_rep_t bus_id_bit_rep; + + bus_id_bit_rep = ucs_topo_get_bus_id_bit_repr(bus_id); + + ucs_spin_lock(&ucs_topo_ctx.lock); + hash_it = kh_put(bus_to_sys_dev /*name*/, + &ucs_topo_ctx.bus_to_sys_dev_hash /*pointer to hashmap*/, + bus_id_bit_rep /*key*/, + &kh_put_status); + + if (kh_put_status == UCS_KH_PUT_KEY_PRESENT) { + *sys_dev = kh_value(&ucs_topo_ctx.bus_to_sys_dev_hash, hash_it); + ucs_debug("bus id %ld exists. sys_dev = %u", bus_id_bit_rep, *sys_dev); + } else if ((kh_put_status == UCS_KH_PUT_BUCKET_EMPTY) || + (kh_put_status == UCS_KH_PUT_BUCKET_CLEAR)) { + *sys_dev = ucs_topo_ctx.sys_dev_to_bus_lookup.count; + ucs_assert(*sys_dev < UCS_TOPO_MAX_SYS_DEVICES); + kh_value(&ucs_topo_ctx.bus_to_sys_dev_hash, hash_it) = *sys_dev; + ucs_debug("bus id %ld doesn't exist. sys_dev = %u", bus_id_bit_rep, + *sys_dev); + + ucs_topo_ctx.sys_dev_to_bus_lookup.bus_arr[*sys_dev] = *bus_id; + ucs_topo_ctx.sys_dev_to_bus_lookup.count++; + } + + ucs_spin_unlock(&ucs_topo_ctx.lock); + return UCS_OK; +} + +static void ucs_topo_get_path_with_bus(unsigned bus, char *path) +{ + static const char sysfs_pci_prefix[] = "/sys/class/pci_bus"; + + sprintf(path, "%s/0000:%02x", sysfs_pci_prefix, bus); +} + +ucs_status_t ucs_topo_get_distance(ucs_sys_device_t device1, + ucs_sys_device_t device2, + ucs_sys_dev_distance_t *distance) +{ + char path1[PATH_MAX], path2[PATH_MAX]; + unsigned bus1, bus2; + ssize_t path_distance; + + if ((device1 == UCS_SYS_DEVICE_ID_UNKNOWN) || + (device2 == UCS_SYS_DEVICE_ID_UNKNOWN) || + (ucs_topo_ctx.sys_dev_to_bus_lookup.count < 2) ) { + return UCS_ERR_IO_ERROR; + } + + if (device1 == device2) { + distance->latency = 0; + return UCS_OK; + } + + ucs_assert(device1 < UCS_TOPO_MAX_SYS_DEVICES); + ucs_assert(device2 < UCS_TOPO_MAX_SYS_DEVICES); + + bus1 = 
ucs_topo_ctx.sys_dev_to_bus_lookup.bus_arr[device1].bus; + bus2 = ucs_topo_ctx.sys_dev_to_bus_lookup.bus_arr[device2].bus; + + ucs_topo_get_path_with_bus(bus1, path1); + ucs_topo_get_path_with_bus(bus2, path2); + + path_distance = ucs_path_calc_distance(path1, path2); + if (path_distance < 0) { + return (ucs_status_t)path_distance; + } + + distance->latency = UCS_TOPO_HOP_OVERHEAD * path_distance; + + return UCS_OK; +} + + +void ucs_topo_print_info(FILE *stream) +{ +} diff --git a/src/ucs/sys/topo.h b/src/ucs/sys/topo.h new file mode 100644 index 00000000000..36589f1d58d --- /dev/null +++ b/src/ucs/sys/topo.h @@ -0,0 +1,98 @@ +/** +* Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifndef UCS_TOPO_H +#define UCS_TOPO_H + +#include +#include +#include +#include + +BEGIN_C_DECLS + +#define UCS_SYS_DEVICE_ID_UNKNOWN UINT_MAX /* Indicate that the ucs_sys_device_t + for the device has no real bus_id + E.g. virtual devices like CMA/knem + */ + + +/** @file topo.h */ + +typedef struct ucs_sys_bus_id { + uint16_t domain; /* range: 0 to ffff */ + uint8_t bus; /* range: 0 to ff */ + uint8_t slot; /* range: 0 to 1f */ + uint8_t function; /* range: 0 to 7 */ +} ucs_sys_bus_id_t; + + +/** + * @ingroup UCS_RESOURCE + * System Device Index + * Obtained from a translation of the device bus id into an unsigned int + * Refer ucs_topo_find_device_by_bus_id() + */ +typedef unsigned ucs_sys_device_t; + + +/* + * Capture the estimated latency, bandwidth between two system devices + * referred by ucs_sys_device_t handle + */ +typedef struct ucs_sys_dev_distance { + double latency; /**< in seconds */ + double bandwidth; /**< in bytes/second */ +} ucs_sys_dev_distance_t; + + +/** + * Find system device by pci bus id + * + * @param [in] bus_id pointer to bus id of the device of interest + * @param [out] sys_dev system device index associated with the bus_id + * + * @return UCS_OK or error in case device cannot be found + */ +ucs_status_t 
ucs_topo_find_device_by_bus_id(const ucs_sys_bus_id_t *bus_id, + ucs_sys_device_t *sys_dev); + + +/** + * Find the distance between two system devices (in terms of latency, + * bandwidth, hops, etc) + * + * @param [in] device1 system device index of the first device + * @param [in] device2 system device index of the second device + * @param [out] distance result populated with distance details between the two + * devices + * + * @return UCS_OK or error in case distance cannot be determined + */ +ucs_status_t ucs_topo_get_distance(ucs_sys_device_t device1, + ucs_sys_device_t device2, + ucs_sys_dev_distance_t *distance); + + +/** + * Print a map indicating the topology information between system + * devices discovered + */ +void ucs_topo_print_info(FILE *stream); + +/** + * Initialize UCS topology subsystem. + */ +void ucs_topo_init(); + +/** + * Cleanup UCS topology subsystem. + */ +void ucs_topo_cleanup(); + +END_C_DECLS + +#endif diff --git a/src/ucs/time/time.c b/src/ucs/time/time.c index a3de64e3853..4ca8f90482e 100644 --- a/src/ucs/time/time.c +++ b/src/ucs/time/time.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include diff --git a/src/ucs/time/time.h b/src/ucs/time/time.h index 58ccbb0810a..2161c8459f5 100644 --- a/src/ucs/time/time.h +++ b/src/ucs/time/time.h @@ -94,41 +94,41 @@ static inline ucs_time_t ucs_time_from_usec(double usec) /** * Convert UCS time units to seconds. */ -static inline double ucs_time_to_sec(ucs_time_t time) +static inline double ucs_time_to_sec(ucs_time_t t) { - return time / ucs_time_sec_value(); + return t / ucs_time_sec_value(); } /** * Convert UCS time units to milliseconds. */ -static inline double ucs_time_to_msec(ucs_time_t time) +static inline double ucs_time_to_msec(ucs_time_t t) { - return ucs_time_to_sec(time) * UCS_MSEC_PER_SEC; + return ucs_time_to_sec(t) * UCS_MSEC_PER_SEC; } /** * Convert UCS time units to microseconds. 
*/ -static inline double ucs_time_to_usec(ucs_time_t time) +static inline double ucs_time_to_usec(ucs_time_t t) { - return ucs_time_to_sec(time) * UCS_USEC_PER_SEC; + return ucs_time_to_sec(t) * UCS_USEC_PER_SEC; } /** * Convert UCS time units to nanoseconds. */ -static inline double ucs_time_to_nsec(ucs_time_t time) +static inline double ucs_time_to_nsec(ucs_time_t t) { - return ucs_time_to_sec(time) * UCS_NSEC_PER_SEC; + return ucs_time_to_sec(t) * UCS_NSEC_PER_SEC; } /** * Convert UCS time interval (small) to nanoseconds. */ -static inline double ucs_time_interval_to_nsec(ucs_time_t time) +static inline double ucs_time_interval_to_nsec(ucs_time_t t) { - return ucs_time_to_sec(time * UCS_NSEC_PER_SEC); + return ucs_time_to_sec(t * UCS_NSEC_PER_SEC); } /* Convert seconds to POSIX timeval */ diff --git a/src/ucs/time/timer_wheel.c b/src/ucs/time/timer_wheel.c index b1014a979cc..270de1181ef 100644 --- a/src/ucs/time/timer_wheel.c +++ b/src/ucs/time/timer_wheel.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include @@ -24,6 +28,10 @@ ucs_status_t ucs_twheel_init(ucs_twheel_t *twheel, ucs_time_t resolution, twheel->now = current_time; twheel->wheel = ucs_malloc(sizeof(*twheel->wheel) * twheel->num_slots, "twheel"); + twheel->count = 0; + if (twheel->wheel == NULL) { + return UCS_ERR_NO_MEMORY; + } for (i = 0; i < twheel->num_slots; i++) { ucs_list_head_init(&twheel->wheel[i]); @@ -69,6 +77,7 @@ void __ucs_wtimer_add(ucs_twheel_t *t, ucs_wtimer_t *timer, ucs_time_t delta) ucs_assert(slot != t->current); ucs_list_add_tail(&t->wheel[slot], &timer->list); + t->count++; } void __ucs_twheel_sweep(ucs_twheel_t *t, ucs_time_t current_time) @@ -90,6 +99,7 @@ void __ucs_twheel_sweep(ucs_twheel_t *t, ucs_time_t current_time) timer = ucs_list_extract_head(&t->wheel[t->current], ucs_wtimer_t, list); timer->is_active = 0; timer->cb(timer); + t->count--; } } } diff --git a/src/ucs/time/timer_wheel.h b/src/ucs/time/timer_wheel.h index 8c46d4e46c6..a85389fb7c0 100644 --- a/src/ucs/time/timer_wheel.h +++ b/src/ucs/time/timer_wheel.h @@ -40,6 +40,7 @@ struct ucs_timer_wheel { ucs_list_link_t *wheel; unsigned res_order; unsigned num_slots; + unsigned count; }; @@ -95,6 +96,14 @@ static inline ucs_time_t ucs_twheel_get_time(ucs_twheel_t *t) return t->now; } +/** + * Get current time + */ +static UCS_F_ALWAYS_INLINE int ucs_twheel_is_empty(ucs_twheel_t *t) +{ + return !t->count; +} + /** * Add a one shot timer. * @@ -123,11 +132,12 @@ static inline ucs_status_t ucs_wtimer_add(ucs_twheel_t *t, ucs_wtimer_t *timer, * * @param timer timer to remove. 
*/ -static inline void ucs_wtimer_remove(ucs_wtimer_t *timer) +static inline void ucs_wtimer_remove(ucs_twheel_t *t, ucs_wtimer_t *timer) { if (ucs_likely(timer->is_active)) { ucs_list_del(&timer->list); timer->is_active = 0; + t->count--; } } diff --git a/src/ucs/time/timerq.c b/src/ucs/time/timerq.c index 311761e0fc8..a72528bbc4a 100644 --- a/src/ucs/time/timerq.c +++ b/src/ucs/time/timerq.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "timerq.h" #include @@ -16,7 +20,7 @@ ucs_status_t ucs_timerq_init(ucs_timer_queue_t *timerq) { ucs_trace_func("timerq=%p", timerq); - pthread_spin_init(&timerq->lock, 0); + ucs_recursive_spinlock_init(&timerq->lock, 0); timerq->timers = NULL; timerq->num_timers = 0; /* coverity[missing_lock] */ @@ -26,12 +30,19 @@ ucs_status_t ucs_timerq_init(ucs_timer_queue_t *timerq) void ucs_timerq_cleanup(ucs_timer_queue_t *timerq) { + ucs_status_t status; + ucs_trace_func("timerq=%p", timerq); if (timerq->num_timers > 0) { ucs_warn("timer queue with %d timers being destroyed", timerq->num_timers); } ucs_free(timerq->timers); + + status = ucs_recursive_spinlock_destroy(&timerq->lock); + if (status != UCS_OK) { + ucs_warn("ucs_recursive_spinlock_destroy() failed (%d)", status); + } } ucs_status_t ucs_timerq_add(ucs_timer_queue_t *timerq, int timer_id, @@ -43,7 +54,7 @@ ucs_status_t ucs_timerq_add(ucs_timer_queue_t *timerq, int timer_id, ucs_trace_func("timerq=%p interval=%.2fus timer_id=%d", timerq, ucs_time_to_usec(interval), timer_id); - pthread_spin_lock(&timerq->lock); + ucs_recursive_spin_lock(&timerq->lock); /* Make sure ID is unique */ for (ptr = timerq->timers; ptr < timerq->timers + timerq->num_timers; ++ptr) { @@ -74,7 +85,7 @@ ucs_status_t ucs_timerq_add(ucs_timer_queue_t *timerq, int timer_id, status = UCS_OK; out_unlock: - pthread_spin_unlock(&timerq->lock); + ucs_recursive_spin_unlock(&timerq->lock); return status; } @@ -87,7 +98,7 @@ ucs_status_t 
ucs_timerq_remove(ucs_timer_queue_t *timerq, int timer_id) status = UCS_ERR_NO_ELEM; - pthread_spin_lock(&timerq->lock); + ucs_recursive_spin_lock(&timerq->lock); timerq->min_interval = UCS_TIME_INFINITY; ptr = timerq->timers; while (ptr < timerq->timers + timerq->num_timers) { @@ -109,6 +120,6 @@ ucs_status_t ucs_timerq_remove(ucs_timer_queue_t *timerq, int timer_id) ucs_assert(timerq->min_interval != UCS_TIME_INFINITY); } - pthread_spin_unlock(&timerq->lock); + ucs_recursive_spin_unlock(&timerq->lock); return status; } diff --git a/src/ucs/time/timerq.h b/src/ucs/time/timerq.h index a79f2110161..986895cd35e 100644 --- a/src/ucs/time/timerq.h +++ b/src/ucs/time/timerq.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include typedef struct ucs_timer { @@ -22,7 +22,7 @@ typedef struct ucs_timer { typedef struct ucs_timer_queue { - pthread_spinlock_t lock; + ucs_recursive_spinlock_t lock; ucs_time_t min_interval; /* Expiration of next timer */ ucs_timer_t *timers; /* Array of timers */ unsigned num_timers; /* Number of timers */ @@ -102,7 +102,7 @@ static inline int ucs_timerq_is_empty(ucs_timer_queue_t *timerq) { #define ucs_timerq_for_each_expired(_timer, _timerq, _current_time, _code) \ { \ ucs_time_t __current_time = _current_time; \ - pthread_spin_lock(&(_timerq)->lock); /* Grab lock */ \ + ucs_recursive_spin_lock(&(_timerq)->lock); /* Grab lock */ \ for (_timer = (_timerq)->timers; \ _timer != (_timerq)->timers + (_timerq)->num_timers; \ ++_timer) \ @@ -113,7 +113,7 @@ static inline int ucs_timerq_is_empty(ucs_timer_queue_t *timerq) { _code; \ } \ } \ - pthread_spin_unlock(&(_timerq)->lock); /* Release lock */ \ + ucs_recursive_spin_unlock(&(_timerq)->lock); /* Release lock */ \ } #endif diff --git a/src/ucs/type/class.c b/src/ucs/type/class.c index 284f04b12b9..46ba925e312 100644 --- a/src/ucs/type/class.c +++ b/src/ucs/type/class.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "class.h" #include @@ -33,7 +37,7 @@ void ucs_class_call_cleanup_chain(ucs_class_t *cls, void *obj, int limit) ucs_class_t *c; int depth, skip; - ucs_assert((limit == -1) || (limit >= 1)); + ucs_assert(((limit == -1) || (limit >= 1)) && (cls != NULL)); /* Count how many classes are there */ for (depth = 0, c = cls; c != NULL; ++depth, c = c->superclass); @@ -41,7 +45,9 @@ void ucs_class_call_cleanup_chain(ucs_class_t *cls, void *obj, int limit) /* Skip some destructors, because we may have a limit here */ skip = (limit < 0) ? 0 : ucs_max(depth - limit, 0); c = cls; - while (skip-- > 0) { + + /* check for NULL pointer to suppress clang warning */ + while ((skip-- > 0) && (c != NULL)) { c = c->superclass; } @@ -61,3 +67,8 @@ void ucs_class_free(void *obj) { ucs_free(obj); } + +void ucs_class_check_new_func_result(ucs_status_t status, void *obj) +{ + ucs_assert((status == UCS_OK) || (obj == NULL)); +} diff --git a/src/ucs/type/class.h b/src/ucs/type/class.h index a13a0752c27..c76103b9d4d 100644 --- a/src/ucs/type/class.h +++ b/src/ucs/type/class.h @@ -34,6 +34,8 @@ struct ucs_class { /* * Helper: Define names of class-related identifiers. */ +#define UCS_CLASS_DECL_NAME(_type) \ + _UCS_CLASS_DECL_NAME(_type) #define _UCS_CLASS_DECL_NAME(_type) \ UCS_PP_TOKENPASTE(_type, _class) #define _UCS_CLASS_INIT_NAME(_type) \ @@ -91,19 +93,18 @@ struct ucs_class { */ #define UCS_CLASS_INIT(_type, _obj, ...) 
\ ({ \ - extern ucs_class_t _UCS_CLASS_DECL_NAME(_type); \ - ucs_class_t *cls = &_UCS_CLASS_DECL_NAME(_type); \ - int init_count = 1; \ - ucs_status_t status; \ + ucs_class_t *_cls = &_UCS_CLASS_DECL_NAME(_type); \ + int _init_counter = 1; \ + ucs_status_t __status; \ \ - status = _UCS_CLASS_INIT_NAME(_type)((_type*)(_obj), cls, &init_count, \ - ## __VA_ARGS__); \ - if ((status != UCS_OK) && (status != UCS_INPROGRESS)) { \ + __status = _UCS_CLASS_INIT_NAME(_type)((_type*)(_obj), _cls, \ + &_init_counter, ## __VA_ARGS__); \ + if (__status != UCS_OK) { \ ucs_class_call_cleanup_chain(&_UCS_CLASS_DECL_NAME(_type), \ - (_obj), init_count); \ + (_obj), _init_counter); \ } \ \ - (status); \ + (__status); \ }) @@ -125,7 +126,6 @@ struct ucs_class { */ #define UCS_CLASS_CLEANUP(_type, _obj) \ { \ - extern ucs_class_t _UCS_CLASS_DECL_NAME(_type); \ UCS_CLASS_CLEANUP_CALL(&_UCS_CLASS_DECL_NAME(_type), _obj); \ } @@ -143,24 +143,23 @@ struct ucs_class { _UCS_CLASS_NEW (_type, _obj, ## __VA_ARGS__) #define _UCS_CLASS_NEW(_type, _obj, ...) \ ({ \ - extern ucs_class_t _UCS_CLASS_DECL_NAME(_type); \ ucs_class_t *cls = &_UCS_CLASS_DECL_NAME(_type); \ - ucs_status_t status; \ + ucs_status_t _status; \ void *obj; \ \ obj = ucs_class_malloc(cls); \ if (obj != NULL) { \ - status = UCS_CLASS_INIT(_type, obj, ## __VA_ARGS__); \ - if (status == UCS_OK) { \ + _status = UCS_CLASS_INIT(_type, obj, ## __VA_ARGS__); \ + if (_status == UCS_OK) { \ *(_obj) = (typeof(*(_obj)))obj; /* Success - assign pointer */ \ } else { \ ucs_class_free(obj); /* Initialization failure */ \ } \ } else { \ - status = UCS_ERR_NO_MEMORY; /* Allocation failure */ \ + _status = UCS_ERR_NO_MEMORY; /* Allocation failure */ \ } \ \ - (status); \ + (_status); \ }) @@ -187,10 +186,10 @@ struct ucs_class { #define UCS_CLASS_CALL_SUPER_INIT(_superclass, ...) 
\ { \ { \ - ucs_status_t status = _UCS_CLASS_INIT_NAME(_superclass)\ + ucs_status_t _status = _UCS_CLASS_INIT_NAME(_superclass)\ (&self->super, _myclass->superclass, _init_count, ## __VA_ARGS__); \ - if (status != UCS_OK) { \ - return status; \ + if (_status != UCS_OK) { \ + return _status; \ } \ if (_myclass->superclass != &_UCS_CLASS_DECL_NAME(void)) { \ ++(*_init_count); \ @@ -221,9 +220,15 @@ struct ucs_class { _argtype **obj_p) #define UCS_CLASS_DEFINE_NAMED_NEW_FUNC(_name, _type, _argtype, ...) \ UCS_CLASS_DECLARE_NAMED_NEW_FUNC(_name, _argtype, ## __VA_ARGS__) { \ - return UCS_CLASS_NEW(_type, obj_p \ - UCS_PP_FOREACH(_UCS_CLASS_INIT_ARG_PASS, _, \ - UCS_PP_SEQ(UCS_PP_NUM_ARGS(__VA_ARGS__)))); \ + ucs_status_t status; \ + \ + *obj_p = NULL; \ + \ + status = UCS_CLASS_NEW(_type, obj_p \ + UCS_PP_FOREACH(_UCS_CLASS_INIT_ARG_PASS, _, \ + UCS_PP_SEQ(UCS_PP_NUM_ARGS(__VA_ARGS__)))); \ + ucs_class_check_new_func_result(status, *obj_p); \ + return status; \ } #define UCS_CLASS_DECLARE_NEW_FUNC(_type, _argtype, ...) \ UCS_CLASS_DECLARE_NAMED_NEW_FUNC(UCS_CLASS_NEW_FUNC_NAME(_type), _argtype, ## __VA_ARGS__) @@ -293,10 +298,14 @@ void ucs_class_call_cleanup_chain(ucs_class_t *cls, void *obj, int limit); /* - * Helpers: Allocate/release objects. + * Helpers: */ +/* Allocate objects */ void *ucs_class_malloc(ucs_class_t *cls); +/* Release objects */ void ucs_class_free(void *obj); +/* Check new function result */ +void ucs_class_check_new_func_result(ucs_status_t status, void *obj); /** diff --git a/src/ucs/type/init_once.c b/src/ucs/type/init_once.c new file mode 100644 index 00000000000..cfb05c9cb8c --- /dev/null +++ b/src/ucs/type/init_once.c @@ -0,0 +1,20 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include + + +unsigned ucs_init_once_mutex_unlock(pthread_mutex_t *lock) +{ + int ret = pthread_mutex_unlock(lock); + ucs_assert_always(ret == 0); + return 0; +} diff --git a/src/ucs/type/init_once.h b/src/ucs/type/init_once.h index d310de674c9..4b7e967ccbc 100644 --- a/src/ucs/type/init_once.h +++ b/src/ucs/type/init_once.h @@ -7,6 +7,7 @@ #ifndef UCS_TYPE_INIT_ONCE_H_ #define UCS_TYPE_INIT_ONCE_H_ + #include @@ -20,10 +21,15 @@ typedef struct ucs_init_once { /* Static initializer for @ref ucs_init_once_t */ -#define UCS_INIT_ONCE_INIITIALIZER \ +#define UCS_INIT_ONCE_INITIALIZER \ { PTHREAD_MUTEX_INITIALIZER, 0 } +/* Wrapper to unlock a mutex that always returns 0 to avoid endless loop + * and make static analyzers happy - they report "double unlock" warning */ +unsigned ucs_init_once_mutex_unlock(pthread_mutex_t *lock); + + /* * Start a code block to perform an arbitrary initialization step only once * during the lifetime of the provided synchronization object. @@ -35,17 +41,17 @@ typedef struct ucs_init_once { * ... code ... * } * - * @note It's safe to use a "break" statement in order to exit the code block, - * but "return" and "continue" statements may lead to unexpected behavior. + * @note It's safe to use a "continue" statement in order to exit the code block, + * but "return" and "break" statements may lead to unexpected behavior. * * How does it work? First, lock the mutex. Then check if already initialized, - * if yes unlock then mutex and exit the loop (pthread_mutex_unlock is expected + * if yes unlock the mutex and exit the loop (pthread_mutex_unlock is expected * to return 0). Otherwise, perform the "body" of the for loop, and then set * "initialized" to 1. On the next condition check, unlock the mutex and exit. 
*/ #define UCS_INIT_ONCE(_once) \ for (pthread_mutex_lock(&(_once)->lock); \ - !(_once)->initialized || pthread_mutex_unlock(&(_once)->lock); \ + !(_once)->initialized || ucs_init_once_mutex_unlock(&(_once)->lock); \ (_once)->initialized = 1) #endif diff --git a/src/ucs/type/spinlock.c b/src/ucs/type/spinlock.c deleted file mode 100644 index da17059a295..00000000000 --- a/src/ucs/type/spinlock.c +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2018. ALL RIGHTS RESERVED. - * - * See file LICENSE for terms. - */ - -#include "spinlock.h" - -#include -#include - - -ucs_status_t ucs_spinlock_init(ucs_spinlock_t *lock) -{ - int ret; - - ret = pthread_spin_init(&lock->lock, 0); - if (ret != 0) { - return UCS_ERR_IO_ERROR; - } - - lock->count = 0; - lock->owner = 0xfffffffful; - return UCS_OK; -} - -void ucs_spinlock_destroy(ucs_spinlock_t *lock) -{ - int ret; - - if (lock->count != 0) { - ucs_warn("destroying spinlock %p with use count %d (owner: 0x%lx)", - lock, lock->count, lock->owner); - } - - ret = pthread_spin_destroy(&lock->lock); - if (ret != 0) { - ucs_warn("failed to destroy spinlock %p: %s", lock, strerror(ret)); - } -} diff --git a/src/ucs/type/spinlock.h b/src/ucs/type/spinlock.h index f07f73ba309..5481e22f376 100644 --- a/src/ucs/type/spinlock.h +++ b/src/ucs/type/spinlock.h @@ -1,5 +1,6 @@ /* * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -9,54 +10,134 @@ #include #include +#include BEGIN_C_DECLS /** @file spinlock.h */ + +/* Spinlock creation modifiers */ +enum { + UCS_SPINLOCK_FLAG_SHARED = UCS_BIT(0) /**< Make spinlock sharable in memory */ +}; + /** - * Reentrant spinlock. + * Simple spinlock. */ typedef struct ucs_spinlock { pthread_spinlock_t lock; - int count; - pthread_t owner; } ucs_spinlock_t; +/** + * Reentrant spinlock. 
+ */ +typedef struct ucs_recursive_spinlock { + ucs_spinlock_t super; + int count; + pthread_t owner; +} ucs_recursive_spinlock_t; + +#define UCS_SPINLOCK_OWNER_NULL ((pthread_t)-1) + + +static ucs_status_t ucs_spinlock_init(ucs_spinlock_t *lock, int flags) +{ + int ret, lock_flags; -ucs_status_t ucs_spinlock_init(ucs_spinlock_t *lock); + if (flags & UCS_SPINLOCK_FLAG_SHARED) { + lock_flags = PTHREAD_PROCESS_SHARED; + } else { + lock_flags = PTHREAD_PROCESS_PRIVATE; + } -void ucs_spinlock_destroy(ucs_spinlock_t *lock); + ret = pthread_spin_init(&lock->lock, lock_flags); + if (ret != 0) { + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +static inline ucs_status_t +ucs_recursive_spinlock_init(ucs_recursive_spinlock_t* lock, int flags) +{ + lock->count = 0; + lock->owner = UCS_SPINLOCK_OWNER_NULL; + + return ucs_spinlock_init(&lock->super, flags); +} + +static inline ucs_status_t ucs_spinlock_destroy(ucs_spinlock_t *lock) +{ + int ret; + + ret = pthread_spin_destroy(&lock->lock); + if (ret != 0) { + if (errno == EBUSY) { + return UCS_ERR_BUSY; + } else { + return UCS_ERR_INVALID_PARAM; + } + } + + return UCS_OK; +} -static inline int ucs_spin_is_owner(ucs_spinlock_t *lock, pthread_t self) +static inline ucs_status_t +ucs_recursive_spinlock_destroy(ucs_recursive_spinlock_t *lock) +{ + if (lock->count != 0) { + return UCS_ERR_BUSY; + } + + return ucs_spinlock_destroy(&lock->super); +} + +static inline int +ucs_recursive_spin_is_owner(ucs_recursive_spinlock_t *lock, pthread_t self) { return lock->owner == self; } static inline void ucs_spin_lock(ucs_spinlock_t *lock) +{ + pthread_spin_lock(&lock->lock); +} + +static inline void ucs_recursive_spin_lock(ucs_recursive_spinlock_t *lock) { pthread_t self = pthread_self(); - if (ucs_spin_is_owner(lock, self)) { + if (ucs_recursive_spin_is_owner(lock, self)) { ++lock->count; return; } - pthread_spin_lock(&lock->lock); + ucs_spin_lock(&lock->super); lock->owner = self; ++lock->count; } -static inline int 
ucs_spin_trylock(ucs_spinlock_t *lock) +static inline int ucs_spin_try_lock(ucs_spinlock_t *lock) +{ + if (pthread_spin_trylock(&lock->lock) != 0) { + return 0; + } + + return 1; +} + +static inline int ucs_recursive_spin_trylock(ucs_recursive_spinlock_t *lock) { pthread_t self = pthread_self(); - if (ucs_spin_is_owner(lock, self)) { + if (ucs_recursive_spin_is_owner(lock, self)) { ++lock->count; return 1; } - if (pthread_spin_trylock(&lock->lock) != 0) { + if (ucs_spin_try_lock(&lock->super) == 0) { return 0; } @@ -66,11 +147,16 @@ static inline int ucs_spin_trylock(ucs_spinlock_t *lock) } static inline void ucs_spin_unlock(ucs_spinlock_t *lock) +{ + pthread_spin_unlock(&lock->lock); +} + +static inline void ucs_recursive_spin_unlock(ucs_recursive_spinlock_t *lock) { --lock->count; if (lock->count == 0) { - lock->owner = 0xfffffffful; - pthread_spin_unlock(&lock->lock); + lock->owner = UCS_SPINLOCK_OWNER_NULL; + ucs_spin_unlock(&lock->super); } } diff --git a/src/ucs/type/status.c b/src/ucs/type/status.c index 09867872e69..86762eae7db 100644 --- a/src/ucs/type/status.c +++ b/src/ucs/type/status.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "status.h" #include @@ -64,6 +68,10 @@ const char *ucs_status_string(ucs_status_t status) return "Unsupported operation"; case UCS_ERR_REJECTED: return "Operation rejected by remote peer"; + case UCS_ERR_NOT_CONNECTED: + return "Endpoint is not connected"; + case UCS_ERR_CONNECTION_RESET: + return "Connection reset by remote peer"; case UCS_ERR_ENDPOINT_TIMEOUT: return "Endpoint timeout"; default: diff --git a/src/ucs/type/status.h b/src/ucs/type/status.h index fc12e63d8de..d6409ffbb11 100644 --- a/src/ucs/type/status.h +++ b/src/ucs/type/status.h @@ -73,6 +73,8 @@ typedef enum { UCS_ERR_EXCEEDS_LIMIT = -21, UCS_ERR_UNSUPPORTED = -22, UCS_ERR_REJECTED = -23, + UCS_ERR_NOT_CONNECTED = -24, + UCS_ERR_CONNECTION_RESET = -25, UCS_ERR_FIRST_LINK_FAILURE = -40, UCS_ERR_LAST_LINK_FAILURE = -59, @@ -81,7 +83,7 @@ typedef enum { UCS_ERR_ENDPOINT_TIMEOUT = -80, UCS_ERR_LAST = -100 -} UCS_S_PACKED ucs_status_t ; +} UCS_S_PACKED ucs_status_t; #define UCS_IS_LINK_ERROR(_code) \ @@ -103,11 +105,12 @@ typedef enum { */ typedef void *ucs_status_ptr_t; -#define UCS_PTR_STATUS(_ptr) ((ucs_status_t)(intptr_t)(_ptr)) -#define UCS_PTR_IS_ERR(_ptr) (((uintptr_t)(_ptr)) >= ((uintptr_t)UCS_ERR_LAST)) -#define UCS_PTR_IS_PTR(_ptr) (((uintptr_t)(_ptr) - 1) < ((uintptr_t)UCS_ERR_LAST - 1)) -#define UCS_STATUS_PTR(_status) ((void*)(intptr_t)(_status)) -#define UCS_STATUS_IS_ERR(_status) (_status < 0) +#define UCS_PTR_IS_ERR(_ptr) (((uintptr_t)(_ptr)) >= ((uintptr_t)UCS_ERR_LAST)) +#define UCS_PTR_IS_PTR(_ptr) (((uintptr_t)(_ptr) - 1) < ((uintptr_t)UCS_ERR_LAST - 1)) +#define UCS_PTR_RAW_STATUS(_ptr) ((ucs_status_t)(intptr_t)(_ptr)) +#define UCS_PTR_STATUS(_ptr) (UCS_PTR_IS_PTR(_ptr) ? 
UCS_INPROGRESS : UCS_PTR_RAW_STATUS(_ptr)) +#define UCS_STATUS_PTR(_status) ((void*)(intptr_t)(_status)) +#define UCS_STATUS_IS_ERR(_status) ((_status) < 0) /** diff --git a/src/uct/Makefile.am b/src/uct/Makefile.am index f2118df50ae..6114aca3bf1 100644 --- a/src/uct/Makefile.am +++ b/src/uct/Makefile.am @@ -3,6 +3,7 @@ # Copyright (c) UT-Battelle, LLC. 2014-2017. ALL RIGHTS RESERVED. # Copyright (c) The University of Tennesse and the University of Tennessee # Research Foundation. 2016. ALL RIGHTS RESERVED. +# Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. # See file LICENSE for terms. # @@ -23,23 +24,38 @@ nobase_dist_libuct_la_HEADERS = \ noinst_HEADERS = \ base/uct_md.h \ + base/uct_component.h \ base/uct_iface.h \ base/uct_log.h \ base/uct_worker.h \ + base/uct_cm.h \ + base/uct_iov.inl \ sm/base/sm_ep.h \ sm/base/sm_iface.h \ sm/mm/base/mm_iface.h \ sm/mm/base/mm_ep.h \ - sm/mm/base/mm_def.h \ sm/mm/base/mm_md.h \ + sm/scopy/base/scopy_iface.h \ + sm/scopy/base/scopy_ep.h \ sm/self/self.h \ - tcp/tcp.h + tcp/tcp_base.h \ + tcp/tcp.h \ + tcp/tcp_sockcm.h \ + tcp/tcp_listener.h \ + tcp/tcp_sockcm_ep.h \ + tcp/sockcm/sockcm_def.h \ + tcp/sockcm/sockcm_iface.h \ + tcp/sockcm/sockcm_ep.h \ + tcp/sockcm/sockcm_md.h + libuct_la_SOURCES = \ base/uct_md.c \ base/uct_mem.c \ + base/uct_component.c \ base/uct_iface.c \ base/uct_worker.c \ + base/uct_cm.c \ sm/base/sm_ep.c \ sm/base/sm_iface.c \ sm/mm/base/mm_iface.c \ @@ -47,8 +63,18 @@ libuct_la_SOURCES = \ sm/mm/base/mm_md.c \ sm/mm/posix/mm_posix.c \ sm/mm/sysv/mm_sysv.c \ + sm/scopy/base/scopy_iface.c \ + sm/scopy/base/scopy_ep.c \ sm/self/self.c \ tcp/tcp_ep.c \ tcp/tcp_iface.c \ tcp/tcp_md.c \ - tcp/tcp_net.c + tcp/tcp_net.c \ + tcp/tcp_cm.c \ + tcp/tcp_base.c \ + tcp/tcp_sockcm.c \ + tcp/tcp_listener.c \ + tcp/tcp_sockcm_ep.c \ + tcp/sockcm/sockcm_iface.c \ + tcp/sockcm/sockcm_ep.c \ + tcp/sockcm/sockcm_md.c diff --git a/src/uct/api/tl.h b/src/uct/api/tl.h index 15364b7b259..e6f7667e21a 100644 --- 
a/src/uct/api/tl.h +++ b/src/uct/api/tl.h @@ -23,188 +23,345 @@ BEGIN_C_DECLS /** @file tl.h */ -/** - * Transport interface operations. - * Every operation exposed in the API should appear in the table below, to allow - * creating interface/endpoint with custom operations. - */ -typedef struct uct_iface_ops { +/* endpoint - put */ + +typedef ucs_status_t (*uct_ep_put_short_func_t)(uct_ep_h ep, + const void *buffer, + unsigned length, + uint64_t remote_addr, + uct_rkey_t rkey); + +typedef ssize_t (*uct_ep_put_bcopy_func_t)(uct_ep_h ep, + uct_pack_callback_t pack_cb, + void *arg, + uint64_t remote_addr, + uct_rkey_t rkey); + +typedef ucs_status_t (*uct_ep_put_zcopy_func_t)(uct_ep_h ep, + const uct_iov_t *iov, + size_t iovcnt, + uint64_t remote_addr, + uct_rkey_t rkey, + uct_completion_t *comp); + +/* endpoint - get */ + +typedef ucs_status_t (*uct_ep_get_short_func_t)(uct_ep_h ep, + void *buffer, + unsigned length, + uint64_t remote_addr, + uct_rkey_t rkey); + +typedef ucs_status_t (*uct_ep_get_bcopy_func_t)(uct_ep_h ep, + uct_unpack_callback_t unpack_cb, + void *arg, + size_t length, + uint64_t remote_addr, + uct_rkey_t rkey, + uct_completion_t *comp); + +typedef ucs_status_t (*uct_ep_get_zcopy_func_t)(uct_ep_h ep, + const uct_iov_t *iov, + size_t iovcnt, + uint64_t remote_addr, + uct_rkey_t rkey, + uct_completion_t *comp); + +/* endpoint - active message */ + +typedef ucs_status_t (*uct_ep_am_short_func_t)(uct_ep_h ep, + uint8_t id, + uint64_t header, + const void *payload, + unsigned length); + +typedef ssize_t (*uct_ep_am_bcopy_func_t)(uct_ep_h ep, + uint8_t id, + uct_pack_callback_t pack_cb, + void *arg, + unsigned flags); + +typedef ucs_status_t (*uct_ep_am_zcopy_func_t)(uct_ep_h ep, + uint8_t id, + const void *header, + unsigned header_length, + const uct_iov_t *iov, + size_t iovcnt, + unsigned flags, + uct_completion_t *comp); + +/* endpoint - atomics */ + +typedef ucs_status_t (*uct_ep_atomic_cswap64_func_t)(uct_ep_h ep, + uint64_t compare, + uint64_t swap, 
+ uint64_t remote_addr, + uct_rkey_t rkey, + uint64_t *result, + uct_completion_t *comp); + +typedef ucs_status_t (*uct_ep_atomic_cswap32_func_t)(uct_ep_h ep, + uint32_t compare, + uint32_t swap, + uint64_t remote_addr, + uct_rkey_t rkey, + uint32_t *result, + uct_completion_t *comp); + +typedef ucs_status_t (*uct_ep_atomic32_post_func_t)(uct_ep_h ep, + unsigned opcode, + uint32_t value, + uint64_t remote_addr, + uct_rkey_t rkey); + +typedef ucs_status_t (*uct_ep_atomic64_post_func_t)(uct_ep_h ep, + unsigned opcode, + uint64_t value, + uint64_t remote_addr, + uct_rkey_t rkey); + +typedef ucs_status_t (*uct_ep_atomic32_fetch_func_t)(uct_ep_h ep, + unsigned opcode, + uint32_t value, + uint32_t *result, + uint64_t remote_addr, + uct_rkey_t rkey, + uct_completion_t *comp); + +typedef ucs_status_t (*uct_ep_atomic64_fetch_func_t)(uct_ep_h ep, + unsigned opcode, + uint64_t value, + uint64_t *result, + uint64_t remote_addr, + uct_rkey_t rkey, + uct_completion_t *comp); + +/* endpoint - tagged operations */ + +typedef ucs_status_t (*uct_ep_tag_eager_short_func_t)(uct_ep_h ep, + uct_tag_t tag, + const void *data, + size_t length); + +typedef ssize_t (*uct_ep_tag_eager_bcopy_func_t)(uct_ep_h ep, + uct_tag_t tag, + uint64_t imm, + uct_pack_callback_t pack_cb, + void *arg, + unsigned flags); + +typedef ucs_status_t (*uct_ep_tag_eager_zcopy_func_t)(uct_ep_h ep, + uct_tag_t tag, + uint64_t imm, + const uct_iov_t *iov, + size_t iovcnt, + unsigned flags, + uct_completion_t *comp); + +typedef ucs_status_ptr_t (*uct_ep_tag_rndv_zcopy_func_t)(uct_ep_h ep, + uct_tag_t tag, + const void *header, + unsigned header_length, + const uct_iov_t *iov, + size_t iovcnt, + unsigned flags, + uct_completion_t *comp); - /* endpoint - put */ +typedef ucs_status_t (*uct_ep_tag_rndv_cancel_func_t)(uct_ep_h ep, void *op); - ucs_status_t (*ep_put_short)(uct_ep_h ep, const void *buffer, unsigned length, - uint64_t remote_addr, uct_rkey_t rkey); +typedef ucs_status_t 
(*uct_ep_tag_rndv_request_func_t)(uct_ep_h ep, + uct_tag_t tag, + const void* header, + unsigned header_length, + unsigned flags); - ssize_t (*ep_put_bcopy)(uct_ep_h ep, uct_pack_callback_t pack_cb, - void *arg, uint64_t remote_addr, uct_rkey_t rkey); +/* interface - tagged operations */ - ucs_status_t (*ep_put_zcopy)(uct_ep_h ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp); +typedef ucs_status_t (*uct_iface_tag_recv_zcopy_func_t)(uct_iface_h iface, + uct_tag_t tag, + uct_tag_t tag_mask, + const uct_iov_t *iov, + size_t iovcnt, + uct_tag_context_t *ctx); - /* endpoint - get */ +typedef ucs_status_t (*uct_iface_tag_recv_cancel_func_t)(uct_iface_h iface, + uct_tag_context_t *ctx, + int force); - ucs_status_t (*ep_get_short)(uct_ep_h ep, void *buffer, unsigned length, - uint64_t remote_addr, uct_rkey_t rkey); +/* endpoint - pending queue */ - ucs_status_t (*ep_get_bcopy)(uct_ep_h ep, uct_unpack_callback_t unpack_cb, - void *arg, size_t length, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp); +typedef ucs_status_t (*uct_ep_pending_add_func_t)(uct_ep_h ep, + uct_pending_req_t *n, + unsigned flags); - ucs_status_t (*ep_get_zcopy)(uct_ep_h ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp); +typedef void (*uct_ep_pending_purge_func_t)(uct_ep_h ep, + uct_pending_purge_callback_t cb, + void *arg); - /* endpoint - active message */ +/* endpoint - synchronization */ - ucs_status_t (*ep_am_short)(uct_ep_h ep, uint8_t id, uint64_t header, - const void *payload, unsigned length); +typedef ucs_status_t (*uct_ep_flush_func_t)(uct_ep_h ep, + unsigned flags, + uct_completion_t *comp); - ssize_t (*ep_am_bcopy)(uct_ep_h ep, uint8_t id, - uct_pack_callback_t pack_cb, void *arg, - unsigned flags); +typedef ucs_status_t (*uct_ep_fence_func_t)(uct_ep_h ep, unsigned flags); - ucs_status_t (*ep_am_zcopy)(uct_ep_h ep, uint8_t id, const void *header, - 
unsigned header_length, const uct_iov_t *iov, - size_t iovcnt, unsigned flags, - uct_completion_t *comp); +typedef ucs_status_t (*uct_ep_check_func_t)(uct_ep_h ep, + unsigned flags, + uct_completion_t *comp); - /* endpoint - atomics */ +/* endpoint - connection establishment */ - ucs_status_t (*ep_atomic_cswap64)(uct_ep_h ep, uint64_t compare, uint64_t swap, - uint64_t remote_addr, uct_rkey_t rkey, - uint64_t *result, uct_completion_t *comp); +typedef ucs_status_t (*uct_ep_create_func_t)(const uct_ep_params_t *params, + uct_ep_h *ep_p); - ucs_status_t (*ep_atomic_cswap32)(uct_ep_h ep, uint32_t compare, uint32_t swap, - uint64_t remote_addr, uct_rkey_t rkey, - uint32_t *result, uct_completion_t *comp); +typedef ucs_status_t (*uct_ep_disconnect_func_t)(uct_ep_h ep, unsigned flags); - ucs_status_t (*ep_atomic32_post)(uct_ep_h ep, unsigned opcode, uint32_t value, - uint64_t remote_addr, uct_rkey_t rkey); +typedef ucs_status_t (*uct_cm_ep_conn_notify_func_t)(uct_ep_h ep); - ucs_status_t (*ep_atomic64_post)(uct_ep_h ep, unsigned opcode, uint64_t value, - uint64_t remote_addr, uct_rkey_t rkey); +typedef void (*uct_ep_destroy_func_t)(uct_ep_h ep); - ucs_status_t (*ep_atomic32_fetch)(uct_ep_h ep, unsigned opcode, uint32_t value, - uint32_t *result, uint64_t remote_addr, - uct_rkey_t rkey, uct_completion_t *comp); +typedef ucs_status_t (*uct_ep_get_address_func_t)(uct_ep_h ep, + uct_ep_addr_t *addr); - ucs_status_t (*ep_atomic64_fetch)(uct_ep_h ep, unsigned opcode, uint64_t value, - uint64_t *result, uint64_t remote_addr, - uct_rkey_t rkey, uct_completion_t *comp); - - /* endpoint - tagged operations */ +typedef ucs_status_t (*uct_ep_connect_to_ep_func_t)(uct_ep_h ep, + const uct_device_addr_t *dev_addr, + const uct_ep_addr_t *ep_addr); - ucs_status_t (*ep_tag_eager_short)(uct_ep_h ep, uct_tag_t tag, - const void *data, size_t length); +typedef ucs_status_t (*uct_iface_accept_func_t)(uct_iface_h iface, + uct_conn_request_h conn_request); - ssize_t 
(*ep_tag_eager_bcopy)(uct_ep_h ep, uct_tag_t tag, uint64_t imm, - uct_pack_callback_t pack_cb, void *arg, - unsigned flags); +typedef ucs_status_t (*uct_iface_reject_func_t)(uct_iface_h iface, + uct_conn_request_h conn_request); - ucs_status_t (*ep_tag_eager_zcopy)(uct_ep_h ep, uct_tag_t tag, uint64_t imm, - const uct_iov_t *iov, size_t iovcnt, - unsigned flags, uct_completion_t *comp); +/* interface - synchronization */ - ucs_status_ptr_t (*ep_tag_rndv_zcopy)(uct_ep_h ep, uct_tag_t tag, - const void *header, - unsigned header_length, - const uct_iov_t *iov, - size_t iovcnt, unsigned flags, - uct_completion_t *comp); +typedef ucs_status_t (*uct_iface_flush_func_t)(uct_iface_h iface, + unsigned flags, + uct_completion_t *comp); - ucs_status_t (*ep_tag_rndv_cancel)(uct_ep_h ep, void *op); +typedef ucs_status_t (*uct_iface_fence_func_t)(uct_iface_h iface, unsigned flags); - ucs_status_t (*ep_tag_rndv_request)(uct_ep_h ep, uct_tag_t tag, - const void* header, - unsigned header_length, unsigned flags); +/* interface - progress control */ - /* interface - tagged operations */ +typedef void (*uct_iface_progress_enable_func_t)(uct_iface_h iface, + unsigned flags); - ucs_status_t (*iface_tag_recv_zcopy)(uct_iface_h iface, uct_tag_t tag, - uct_tag_t tag_mask, - const uct_iov_t *iov, - size_t iovcnt, - uct_tag_context_t *ctx); +typedef void (*uct_iface_progress_disable_func_t)(uct_iface_h iface, + unsigned flags); - ucs_status_t (*iface_tag_recv_cancel)(uct_iface_h iface, - uct_tag_context_t *ctx, - int force); +typedef unsigned (*uct_iface_progress_func_t)(uct_iface_h iface); - /* endpoint - pending queue */ +/* interface - events */ - ucs_status_t (*ep_pending_add)(uct_ep_h ep, uct_pending_req_t *n, - unsigned flags); +typedef ucs_status_t (*uct_iface_event_fd_get_func_t)(uct_iface_h iface, + int *fd_p); - void (*ep_pending_purge)(uct_ep_h ep, uct_pending_purge_callback_t cb, - void *arg); +typedef ucs_status_t (*uct_iface_event_arm_func_t)(uct_iface_h iface, + unsigned 
events); - /* endpoint - synchronization */ +/* interface - management */ - ucs_status_t (*ep_flush)(uct_ep_h ep, unsigned flags, - uct_completion_t *comp); +typedef void (*uct_iface_close_func_t)(uct_iface_h iface); - ucs_status_t (*ep_fence)(uct_ep_h ep, unsigned flags); +typedef ucs_status_t (*uct_iface_query_func_t)(uct_iface_h iface, + uct_iface_attr_t *iface_attr); - ucs_status_t (*ep_check)(uct_ep_h ep, unsigned flags, uct_completion_t *comp); +/* interface - connection establishment */ - /* endpoint - connection establishment */ +typedef ucs_status_t (*uct_iface_get_device_address_func_t)(uct_iface_h iface, + uct_device_addr_t *addr); - ucs_status_t (*ep_create)(const uct_ep_params_t *params, uct_ep_h *ep_p); +typedef ucs_status_t (*uct_iface_get_address_func_t)(uct_iface_h iface, + uct_iface_addr_t *addr); - void (*ep_destroy)(uct_ep_h ep); +typedef int (*uct_iface_is_reachable_func_t)(const uct_iface_h iface, + const uct_device_addr_t *dev_addr, + const uct_iface_addr_t *iface_addr); - ucs_status_t (*ep_get_address)(uct_ep_h ep, uct_ep_addr_t *addr); - ucs_status_t (*ep_connect_to_ep)(uct_ep_h ep, - const uct_device_addr_t *dev_addr, - const uct_ep_addr_t *ep_addr); +/** + * Transport interface operations. + * Every operation exposed in the API must appear in the table below, to allow + * creating interface/endpoint with custom operations. 
+ */ +typedef struct uct_iface_ops { - ucs_status_t (*iface_accept)(uct_iface_h iface, - uct_conn_request_h conn_request); + /* endpoint - put */ + uct_ep_put_short_func_t ep_put_short; + uct_ep_put_bcopy_func_t ep_put_bcopy; + uct_ep_put_zcopy_func_t ep_put_zcopy; - ucs_status_t (*iface_reject)(uct_iface_h iface, - uct_conn_request_h conn_request); + /* endpoint - get */ + uct_ep_get_short_func_t ep_get_short; + uct_ep_get_bcopy_func_t ep_get_bcopy; + uct_ep_get_zcopy_func_t ep_get_zcopy; - /* interface - synchronization */ + /* endpoint - active message */ + uct_ep_am_short_func_t ep_am_short; + uct_ep_am_bcopy_func_t ep_am_bcopy; + uct_ep_am_zcopy_func_t ep_am_zcopy; - ucs_status_t (*iface_flush)(uct_iface_h iface, unsigned flags, - uct_completion_t *comp); + /* endpoint - atomics */ + uct_ep_atomic_cswap64_func_t ep_atomic_cswap64; + uct_ep_atomic_cswap32_func_t ep_atomic_cswap32; + uct_ep_atomic32_post_func_t ep_atomic32_post; + uct_ep_atomic64_post_func_t ep_atomic64_post; + uct_ep_atomic32_fetch_func_t ep_atomic32_fetch; + uct_ep_atomic64_fetch_func_t ep_atomic64_fetch; - ucs_status_t (*iface_fence)(uct_iface_h iface, unsigned flags); + /* endpoint - tagged operations */ + uct_ep_tag_eager_short_func_t ep_tag_eager_short; + uct_ep_tag_eager_bcopy_func_t ep_tag_eager_bcopy; + uct_ep_tag_eager_zcopy_func_t ep_tag_eager_zcopy; + uct_ep_tag_rndv_zcopy_func_t ep_tag_rndv_zcopy; + uct_ep_tag_rndv_cancel_func_t ep_tag_rndv_cancel; + uct_ep_tag_rndv_request_func_t ep_tag_rndv_request; - /* interface - progress control */ + /* interface - tagged operations */ + uct_iface_tag_recv_zcopy_func_t iface_tag_recv_zcopy; + uct_iface_tag_recv_cancel_func_t iface_tag_recv_cancel; - void (*iface_progress_enable)(uct_iface_h iface, unsigned flags); + /* endpoint - pending queue */ + uct_ep_pending_add_func_t ep_pending_add; + uct_ep_pending_purge_func_t ep_pending_purge; - void (*iface_progress_disable)(uct_iface_h iface, unsigned flags); + /* endpoint - synchronization */ + 
uct_ep_flush_func_t ep_flush; + uct_ep_fence_func_t ep_fence; + uct_ep_check_func_t ep_check; - unsigned (*iface_progress)(uct_iface_h iface); + /* endpoint - connection establishment */ + uct_ep_create_func_t ep_create; + uct_ep_disconnect_func_t ep_disconnect; + uct_cm_ep_conn_notify_func_t cm_ep_conn_notify; + uct_ep_destroy_func_t ep_destroy; + uct_ep_get_address_func_t ep_get_address; + uct_ep_connect_to_ep_func_t ep_connect_to_ep; + uct_iface_accept_func_t iface_accept; + uct_iface_reject_func_t iface_reject; - /* interface - events */ + /* interface - synchronization */ + uct_iface_flush_func_t iface_flush; + uct_iface_fence_func_t iface_fence; - ucs_status_t (*iface_event_fd_get)(uct_iface_h iface, int *fd_p); + /* interface - progress control */ + uct_iface_progress_enable_func_t iface_progress_enable; + uct_iface_progress_disable_func_t iface_progress_disable; + uct_iface_progress_func_t iface_progress; - ucs_status_t (*iface_event_arm)(uct_iface_h iface, unsigned events); + /* interface - events */ + uct_iface_event_fd_get_func_t iface_event_fd_get; + uct_iface_event_arm_func_t iface_event_arm; /* interface - management */ - - void (*iface_close)(uct_iface_h iface); - - ucs_status_t (*iface_query)(uct_iface_h iface, - uct_iface_attr_t *iface_attr); + uct_iface_close_func_t iface_close; + uct_iface_query_func_t iface_query; /* interface - connection establishment */ - - ucs_status_t (*iface_get_device_address)(uct_iface_h iface, - uct_device_addr_t *addr); - - ucs_status_t (*iface_get_address)(uct_iface_h iface, uct_iface_addr_t *addr); - - int (*iface_is_reachable)(const uct_iface_h iface, - const uct_device_addr_t *dev_addr, - const uct_iface_addr_t *iface_addr); + uct_iface_get_device_address_func_t iface_get_device_address; + uct_iface_get_address_func_t iface_get_address; + uct_iface_is_reachable_func_t iface_is_reachable; } uct_iface_ops_t; @@ -234,6 +391,14 @@ typedef struct uct_ep { } uct_ep_t; +/** + * Listener for incoming connections + */ 
+typedef struct uct_listener { + uct_cm_h cm; +} uct_listener_t; + + typedef struct uct_recv_desc uct_recv_desc_t; typedef void (*uct_desc_release_callback_t)(uct_recv_desc_t *self, void * desc); diff --git a/src/uct/api/uct.h b/src/uct/api/uct.h index 3dbeafe42d2..46462f53232 100644 --- a/src/uct/api/uct.h +++ b/src/uct/api/uct.h @@ -1,6 +1,7 @@ /** * @file uct.h - * @date 2014-2019 + * @date 2014-2020 + * @copyright NVIDIA Corporation. All rights reserved. * @copyright Mellanox Technologies Ltd. All rights reserved. * @copyright Oak Ridge National Laboratory. All rights received. * @copyright Advanced Micro Devices, Inc. All rights received. @@ -15,11 +16,14 @@ #include #include #include +#include +#include #include #include #include #include #include +#include #include #include @@ -106,6 +110,101 @@ BEGIN_C_DECLS * @} */ +/** + * @defgroup UCT_CLIENT_SERVER UCT client-server operations + * @ingroup UCT_API + * @{ + * Defines client-server operations. + * The client-server API allows the connection establishment between an active + * side - a client, and its peer - the passive side - a server. + * The connection can be established through a UCT transport that supports + * listening and connecting via IP address and port (listening can also be on INADDR_ANY). + * + * The following is a general overview of the operations on the server side: + * + * Connecting: + * @ref uct_cm_open + * Open a connection manager. + * @ref uct_listener_create + * Create a listener on the CM and start listening on a given IP,port / INADDR_ANY. + * @ref uct_cm_listener_conn_request_callback_t + * This callback is invoked by the UCT transport to handle an incoming connection + * request from a client. + * Accept or reject the client's connection request. + * @ref uct_ep_create + * Connect to the client by creating an endpoint if the request is accepted. + * The server creates a new endpoint for every connection request that it accepts. 
+ * @ref uct_cm_ep_priv_data_pack_callback_t + * This callback is invoked by the UCT transport to fill auxiliary data in + * the connection acknowledgement or reject notification back to the client. + * Send the client a connection acknowledgement or reject notification. + * Wait for an acknowledgment from the client, indicating that it is connected. + * @ref uct_cm_ep_server_conn_notify_callback_t + * This callback is invoked by the UCT transport to handle the connection + * notification from the client. + * + * Disconnecting: + * @ref uct_ep_disconnect + * Disconnect the server's endpoint from the client. + * Can be called when initiating a disconnect or when receiving a disconnect + * notification from the remote side. + * @ref uct_ep_disconnect_cb_t + * This callback is invoked by the UCT transport when the client side calls + * uct_ep_disconnect as well. + * @ref uct_ep_destroy + * Destroy the endpoint connected to the remote peer. + * If this function is called before the endpoint was disconnected, the + * @ref uct_ep_disconnect_cb_t will not be invoked. + * + * Destroying the server's resources: + * @ref uct_listener_destroy + * Destroy the listener object. + * @ref uct_cm_close + * Close the connection manager. + * + * The following is a general overview of the operations on the client side: + * + * Connecting: + * @ref uct_cm_open + * Open a connection manager. + * @ref uct_ep_create + * Create an endpoint for establishing a connection to the server. + * @ref uct_cm_ep_priv_data_pack_callback_t + * This callback is invoked by the UCT transport to fill the user's private data + * in the connection request to be sent to the server. This connection request + * should be created by the transport. + * Send the connection request to the server. + * Wait for an acknowledgment from the server, indicating that it is connected. 
+ * @ref uct_cm_ep_client_connect_callback_t + * This callback is invoked by the UCT transport to handle a connection response + * from the server. + * After invoking this callback, the UCT transport will finalize the client's + * connection to the server. + * @ref uct_cm_client_ep_conn_notify + * After the client's connection establishment is completed, the client + * should call this function in which it sends a notification message to + * the server stating that it (the client) is connected. + * The notification message that is sent depends on the transport's + * implementation. + * + * Disconnecting: + * @ref uct_ep_disconnect + * Disconnect the client's endpoint from the server. + * Can be called when initiating a disconnect or when receiving a disconnect + * notification from the remote side. + * @ref uct_ep_disconnect_cb_t + * This callback is invoked by the UCT transport when the server side calls + * uct_ep_disconnect as well. + * @ref uct_ep_destroy + * Destroy the endpoint connected to the remote peer. + * + * Destroying the client's resources: + * @ref uct_cm_close + * Close the connection manager. + * + * @} + */ + /** * @ingroup UCT_RESOURCE * @brief Memory domain resource descriptor. @@ -117,6 +216,81 @@ typedef struct uct_md_resource_desc { } uct_md_resource_desc_t; +/** + * @ingroup UCT_RESOURCE + * @brief UCT component attributes field mask + * + * The enumeration allows specifying which fields in @ref uct_component_attr_t + * are present. It is used for backward compatibility support. 
+ */ +enum uct_component_attr_field { + UCT_COMPONENT_ATTR_FIELD_NAME = UCS_BIT(0), /**< Component name */ + UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT = UCS_BIT(1), /**< MD resource count */ + UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES = UCS_BIT(2), /**< MD resources array */ + UCT_COMPONENT_ATTR_FIELD_FLAGS = UCS_BIT(3) /**< Capability flags */ +}; + + +/** + * @ingroup UCT_RESOURCE + * @brief UCT component attributes + * + * This structure defines the attributes for UCT component. It is used for + * @ref uct_component_query + */ +typedef struct uct_component_attr { + /** + * Mask of valid fields in this structure, using bits from + * @ref uct_component_attr_field. + * Fields not specified in this mask will be ignored. + * Provides ABI compatibility with respect to adding new fields. + */ + uint64_t field_mask; + + /** Component name */ + char name[UCT_COMPONENT_NAME_MAX]; + + /** Number of memory-domain resources */ + unsigned md_resource_count; + + /** + * Array of memory domain resources. When used, it should be initialized + * prior to calling @ref uct_component_query with a pointer to an array, + * which is large enough to hold all memory domain resource entries. After + * the call, this array will be filled with information about existing + * memory domain resources. + * In order to allocate this array, you can call @ref uct_component_query + * twice: The first time would only obtain the amount of entries required, + * by specifying @ref UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT in + * field_mask. Then the array could be allocated with the returned number of + * entries, and passed to a second call to @ref uct_component_query, this + * time setting field_mask to @ref UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES. + */ + uct_md_resource_desc_t *md_resources; + + /** + * Flags as defined by UCT_COMPONENT_FLAG_xx. + */ + uint64_t flags; +} uct_component_attr_t; + + +/** + * @ingroup UCT_RESOURCE + * @brief Capability flags of @ref uct_component_h. 
+ * + * The enumeration defines bit mask of @ref uct_component_h capabilities in + * @ref uct_component_attr_t::flags which is set by @ref uct_component_query. + */ +enum { + /** + * If set, the component supports @ref uct_cm_h functionality. + * See @ref uct_cm_open for details. + */ + UCT_COMPONENT_FLAG_CM = UCS_BIT(0) +}; + + /** * @ingroup UCT_RESOURCE * @brief List of UCX device types. @@ -144,7 +318,10 @@ typedef enum { typedef struct uct_tl_resource_desc { char tl_name[UCT_TL_NAME_MAX]; /**< Transport name */ char dev_name[UCT_DEVICE_NAME_MAX]; /**< Hardware device name */ - uct_device_type_t dev_type; /**< Device type. To which UCT group it belongs to */ + uct_device_type_t dev_type; /**< The device represented by this resource + (e.g. UCT_DEVICE_TYPE_NET for a network interface) */ + ucs_sys_device_t sys_device; /**< The identifier associated with the device + bus_id as captured in ucs_sys_bus_id_t struct */ } uct_tl_resource_desc_t; #define UCT_TL_RESOURCE_DESC_FMT "%s/%s" @@ -235,14 +412,6 @@ typedef enum uct_atomic_op { and it may also be invoked when uct_worker_progress() is called. 
*/ - /* Event notification */ -#define UCT_IFACE_FLAG_EVENT_SEND_COMP UCS_BIT(46) /**< Event notification of send completion is - supported */ -#define UCT_IFACE_FLAG_EVENT_RECV UCS_BIT(47) /**< Event notification of tag and active message - receive is supported */ -#define UCT_IFACE_FLAG_EVENT_RECV_SIG UCS_BIT(48) /**< Event notification of signaled tag and active - message is supported */ - /* Tag matching operations */ #define UCT_IFACE_FLAG_TAG_EAGER_SHORT UCS_BIT(50) /**< Hardware tag matching short eager support */ #define UCT_IFACE_FLAG_TAG_EAGER_BCOPY UCS_BIT(51) /**< Hardware tag matching bcopy eager support */ @@ -253,6 +422,33 @@ typedef enum uct_atomic_op { */ +/** + * @defgroup UCT_RESOURCE_IFACE_EVENT_CAP UCT interface for asynchronous event capabilities + * @ingroup UCT_RESOURCE + * + * @brief List of capabilities supported by UCT iface event API + * + * The definition list presents a full list of operations and capabilities + * supported by UCT iface event. + * @{ + */ + /* Event types */ +#define UCT_IFACE_FLAG_EVENT_SEND_COMP UCS_BIT(0) /**< Event notification of send completion is + supported */ +#define UCT_IFACE_FLAG_EVENT_RECV UCS_BIT(1) /**< Event notification of tag and active message + receive is supported */ +#define UCT_IFACE_FLAG_EVENT_RECV_SIG UCS_BIT(2) /**< Event notification of signaled tag and active + message is supported */ + /* Event notification mechanisms */ +#define UCT_IFACE_FLAG_EVENT_FD UCS_BIT(3) /**< Event notification through File Descriptor + is supported */ +#define UCT_IFACE_FLAG_EVENT_ASYNC_CB UCS_BIT(4) /**< Event notification through asynchronous + callback invocation is supported */ +/** + * @} + */ + + /** * @ingroup UCT_CONTEXT * @brief Memory allocation methods. @@ -305,7 +501,13 @@ enum uct_flush_flags { canceled in which case the user will need to handle their completions through - the relevant callbacks. */ + the relevant callbacks. 
+ After @ref uct_ep_flush + with this flag is completed, + the endpoint will be set to + error state, and it becomes + unusable for send operations + and should be destroyed. */ }; @@ -426,7 +628,13 @@ enum uct_iface_params_field { UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_ARG = UCS_BIT(11), /** Enables @ref uct_iface_params_t::rndv_cb */ - UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_CB = UCS_BIT(12) + UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_CB = UCS_BIT(12), + + /** Enables @ref uct_iface_params_t::async_event_arg */ + UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_ARG = UCS_BIT(13), + + /** Enables @ref uct_iface_params_t::async_event_cb */ + UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_CB = UCS_BIT(14) }; /** @@ -466,20 +674,6 @@ enum { sockaddr */ }; -/* - * @ingroup UCT_MD - * @brief Memory types - */ -typedef enum { - UCT_MD_MEM_TYPE_HOST = 0, /**< Default system memory */ - UCT_MD_MEM_TYPE_CUDA, /**< NVIDIA CUDA memory */ - UCT_MD_MEM_TYPE_CUDA_MANAGED, /**< NVIDIA CUDA managed (or unified) memory*/ - UCT_MD_MEM_TYPE_ROCM, /**< AMD ROCM memory */ - UCT_MD_MEM_TYPE_ROCM_MANAGED, /**< AMD ROCM managed system memory */ - UCT_MD_MEM_TYPE_LAST -} uct_memory_type_t; - - /** * @ingroup UCT_MD * @brief Memory allocation/registration flags. @@ -530,6 +724,51 @@ typedef enum { } uct_mem_advice_t; +/** + * @ingroup UCT_CLIENT_SERVER + * @brief UCT connection manager attributes field mask. + * + * The enumeration allows specifying which fields in @ref uct_cm_attr_t are + * present, for backward compatibility support. + */ +enum uct_cm_attr_field { + /** Enables @ref uct_cm_attr::max_conn_priv */ + UCT_CM_ATTR_FIELD_MAX_CONN_PRIV = UCS_BIT(0) +}; + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief UCT listener attributes field mask. + * + * The enumeration allows specifying which fields in @ref uct_listener_attr_t are + * present, for backward compatibility support. 
+ */ +enum uct_listener_attr_field { + /** Enables @ref uct_listener_attr::sockaddr */ + UCT_LISTENER_ATTR_FIELD_SOCKADDR = UCS_BIT(0) +}; + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief UCT listener created by @ref uct_listener_create parameters field mask. + * + * The enumeration allows specifying which fields in @ref uct_listener_params_t + * are present, for backward compatibility support. + */ +enum uct_listener_params_field { + /** Enables @ref uct_listener_params::backlog */ + UCT_LISTENER_PARAM_FIELD_BACKLOG = UCS_BIT(0), + + /** Enables @ref uct_listener_params::conn_request_cb */ + UCT_LISTENER_PARAM_FIELD_CONN_REQUEST_CB = UCS_BIT(1), + + /** Enables @ref uct_listener_params::user_data */ + UCT_LISTENER_PARAM_FIELD_USER_DATA = UCS_BIT(2) +}; + + /** * @ingroup UCT_RESOURCE * @brief UCT endpoint created by @ref uct_ep_create parameters field mask. @@ -539,40 +778,58 @@ typedef enum { */ enum uct_ep_params_field { /** Enables @ref uct_ep_params::iface */ - UCT_EP_PARAM_FIELD_IFACE = UCS_BIT(0), + UCT_EP_PARAM_FIELD_IFACE = UCS_BIT(0), /** Enables @ref uct_ep_params::user_data */ - UCT_EP_PARAM_FIELD_USER_DATA = UCS_BIT(1), + UCT_EP_PARAM_FIELD_USER_DATA = UCS_BIT(1), /** Enables @ref uct_ep_params::dev_addr */ - UCT_EP_PARAM_FIELD_DEV_ADDR = UCS_BIT(2), + UCT_EP_PARAM_FIELD_DEV_ADDR = UCS_BIT(2), /** Enables @ref uct_ep_params::iface_addr */ - UCT_EP_PARAM_FIELD_IFACE_ADDR = UCS_BIT(3), + UCT_EP_PARAM_FIELD_IFACE_ADDR = UCS_BIT(3), /** Enables @ref uct_ep_params::sockaddr */ - UCT_EP_PARAM_FIELD_SOCKADDR = UCS_BIT(4), + UCT_EP_PARAM_FIELD_SOCKADDR = UCS_BIT(4), /** Enables @ref uct_ep_params::sockaddr_cb_flags */ - UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS = UCS_BIT(5), + UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS = UCS_BIT(5), /** Enables @ref uct_ep_params::sockaddr_pack_cb */ - UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB = UCS_BIT(6) + UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB = UCS_BIT(6), + + /** Enables @ref uct_ep_params::cm */ + UCT_EP_PARAM_FIELD_CM = UCS_BIT(7), + + 
/** Enables @ref uct_ep_params::conn_request */ + UCT_EP_PARAM_FIELD_CONN_REQUEST = UCS_BIT(8), + + /** Enables @ref uct_ep_params::sockaddr_cb_client */ + UCT_EP_PARAM_FIELD_SOCKADDR_CONNECT_CB_CLIENT = UCS_BIT(9), + + /** Enables @ref uct_ep_params::sockaddr_cb_server */ + UCT_EP_PARAM_FIELD_SOCKADDR_NOTIFY_CB_SERVER = UCS_BIT(10), + + /** Enables @ref uct_ep_params::disconnect_cb */ + UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB = UCS_BIT(11), + + /** Enables @ref uct_ep_params::path_index */ + UCT_EP_PARAM_FIELD_PATH_INDEX = UCS_BIT(12) }; /* * @ingroup UCT_RESOURCE - * @brief Linear growth specification: f(x) = overhead + growth * x + * @brief Process Per Node (PPN) bandwidth specification: f(ppn) = dedicated + shared / ppn * - * This structure specifies a linear function which is used as basis for time + * This structure specifies a function which is used as basis for bandwidth * estimation of various UCT operations. This information can be used to select * the best performing combination of UCT operations. */ -typedef struct uct_linear_growth { - double overhead; /**< Constant overhead factor */ - double growth; /**< Growth rate factor */ -} uct_linear_growth_t; +typedef struct uct_ppn_bandwidth { + double dedicated; /**< Dedicated bandwidth, bytes/second */ + double shared; /**< Shared bandwidth, bytes/second */ +} uct_ppn_bandwidth_t; /** @@ -673,6 +930,7 @@ struct uct_iface_attr { } atomic32, atomic64; /**< Attributes for atomic operations */ uint64_t flags; /**< Flags from @ref UCT_RESOURCE_IFACE_CAP */ + uint64_t event_flags;/**< Flags from @ref UCT_RESOURCE_IFACE_EVENT_CAP */ } cap; /**< Interface capabilities */ size_t device_addr_len;/**< Size of device address */ @@ -681,15 +939,28 @@ struct uct_iface_attr { size_t max_conn_priv; /**< Max size of the iface's private data. used for connection establishment with sockaddr */ + struct sockaddr_storage listen_sockaddr; /**< Sockaddr on which this iface + is listening. 
*/ /* * The following fields define expected performance of the communication * interface, this would usually be a combination of device and system * characteristics and determined at run time. */ double overhead; /**< Message overhead, seconds */ - double bandwidth; /**< Maximal bandwidth, bytes/second */ - uct_linear_growth_t latency; /**< Latency model */ + uct_ppn_bandwidth_t bandwidth; /**< Bandwidth model */ + ucs_linear_func_t latency; /**< Latency as function of number of + active endpoints */ uint8_t priority; /**< Priority of device */ + size_t max_num_eps; /**< Maximum number of endpoints */ + unsigned dev_num_paths;/**< How many network paths can be + utilized on the device used by + this interface for optimal + performance. Endpoints that connect + to the same remote address but use + different paths can potentially + achieve higher total bandwidth + compared to using only a single + endpoint. */ }; @@ -761,6 +1032,13 @@ struct uct_iface_params { void *rndv_arg; /** Callback for tag matching unexpected rndv messages */ uct_tag_unexp_rndv_cb_t rndv_cb; + + void *async_event_arg; + /** Callback for asynchronous event handling. The callback will be + * invoked from UCT transport when there are new events to be + * read by user if the iface has @ref UCT_IFACE_FLAG_EVENT_ASYNC_CB + * capability */ + uct_async_event_cb_t async_event_cb; }; @@ -777,7 +1055,8 @@ struct uct_ep_params { uint64_t field_mask; /** - * Interface to create the endpoint on. This is a mandatory field. + * Interface to create the endpoint on. + * Either @a iface or @a cm field must be initialized but not both. */ uct_iface_h iface; @@ -818,13 +1097,121 @@ struct uct_ep_params { /** * Callback that will be used for filling the user's private data to be - * delivered to the server by @ref uct_sockaddr_conn_request_callback_t. + * delivered to the remote peer by the callback on the server or client side. * This field is only valid if @ref uct_ep_params_t::sockaddr is set. 
* @note It is never guaranteed that the callaback will be called. If, for * example, the endpoint goes into error state before issuing the connection * request, the callback will not be invoked. */ - uct_sockaddr_priv_pack_callback_t sockaddr_pack_cb; + uct_cm_ep_priv_data_pack_callback_t sockaddr_pack_cb; + + /** + * The connection manager object as created by @ref uct_cm_open. + * Either @a cm or @a iface field must be initialized but not both. + */ + uct_cm_h cm; + + /** + * Connection request that was passed to + * @ref uct_cm_listener_conn_request_args_t::conn_request. + * @note After a call to @ref uct_ep_create, @a params.conn_request is + * consumed and should not be used anymore, even if the call returns + * with an error. + */ + uct_conn_request_h conn_request; + + /** + * Callback that will be invoked when the endpoint on the client side + * is being connected to the server by a connection manager @ref uct_cm_h . + */ + uct_cm_ep_client_connect_callback_t sockaddr_cb_client; + + /** + * Callback that will be invoked when the endpoint on the server side + * is being connected to a client by a connection manager @ref uct_cm_h . + */ + uct_cm_ep_server_conn_notify_callback_t sockaddr_cb_server; + + /** + * Callback that will be invoked when the endpoint is disconnected. + */ + uct_ep_disconnect_cb_t disconnect_cb; + + /** + * Index of the path which the endpoint should use, must be in the range + * 0..(@ref uct_iface_attr_t.dev_num_paths - 1). + */ + unsigned path_index; +}; + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Connection manager attributes, capabilities and limitations. + */ +struct uct_cm_attr { + /** + * Mask of valid fields in this structure, using bits from + * @ref uct_cm_attr_field. Fields not specified by this mask + * will be ignored. + */ + uint64_t field_mask; + + /** + * Max size of the connection manager's private data used for connection + * establishment with sockaddr. 
+ */ + size_t max_conn_priv; +}; + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief UCT listener attributes, capabilities and limitations. + */ +struct uct_listener_attr { + /** + * Mask of valid fields in this structure, using bits from + * @ref uct_listener_attr_field. Fields not specified by this mask + * will be ignored. + */ + uint64_t field_mask; + + /** + * Sockaddr on which this listener is listening. + */ + struct sockaddr_storage sockaddr; +}; + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Parameters for creating a listener object @ref uct_listener_h by + * @ref uct_listener_create + */ +struct uct_listener_params { + /** + * Mask of valid fields in this structure, using bits from + * @ref uct_listener_params_field. Fields not specified by this mask + * will be ignored. + */ + uint64_t field_mask; + + /** + * Backlog of incoming connection requests. + * If not specified, SOMAXCONN, as defined in , will be used. + */ + int backlog; + + /** + * Callback function for handling incoming connection requests. + */ + uct_cm_listener_conn_request_callback_t conn_request_cb; + + /** + * User data associated with the listener. + */ + void *user_data; }; @@ -841,20 +1228,91 @@ struct uct_md_attr { size_t max_alloc; /**< Maximal allocation size */ size_t max_reg; /**< Maximal registration size */ uint64_t flags; /**< UCT_MD_FLAG_xx */ - uint64_t reg_mem_types; /** UCS_BIT(uct_memory_type_t) */ - uct_memory_type_t mem_type; /**< Supported(owned) memory type */ + uint64_t reg_mem_types; /**< Bitmap of memory types that Memory Domain can be registered with */ + uint64_t detect_mem_types; /**< Bitmap of memory types that Memory Domain can detect if address belongs to it */ + ucs_memory_type_t access_mem_type; /**< Memory type that Memory Domain can access */ } cap; - uct_linear_growth_t reg_cost; /**< Memory registration cost estimation + ucs_linear_func_t reg_cost; /**< Memory registration cost estimation (time,seconds) as a linear function of the buffer size. 
*/ - char component_name[UCT_MD_COMPONENT_NAME_MAX]; /**< MD component name */ + char component_name[UCT_COMPONENT_NAME_MAX]; /**< Component name */ size_t rkey_packed_size; /**< Size of buffer needed for packed rkey */ - cpu_set_t local_cpus; /**< Mask of CPUs near the resource */ + ucs_cpu_set_t local_cpus; /**< Mask of CPUs near the resource */ +}; + + +/** + * @ingroup UCT_MD + * @brief UCT MD memory attributes field mask + * + * The enumeration allows specifying which fields in @ref uct_md_mem_attr_t + * are present. + */ +enum uct_md_mem_attr_field { + UCT_MD_MEM_ATTR_FIELD_MEM_TYPE = UCS_BIT(0), /**< Indicate if memory type + is populated. E.g. CPU/GPU */ + UCT_MD_MEM_ATTR_FIELD_SYS_DEV = UCS_BIT(1) /**< Indicate if details of + system device backing + the pointer are populated. + E.g. NUMA/GPU */ }; +/** + * @ingroup UCT_MD + * @brief Memory domain attributes. + * + * This structure defines the attributes of a memory pointer which may + * include the memory type of the pointer, and the system device that backs + * the pointer depending on the bit fields populated in field_mask. + */ +typedef struct uct_md_mem_attr { + /** + * Mask of valid fields in this structure, using bits from + * @ref uct_md_mem_attr_t. Note that the field mask is + * populated upon return from uct_md_mem_query and not set by user. + * Subsequent use of members of the structure are valid after ensuring that + * relevant bits in the field_mask are set. + */ + uint64_t field_mask; + + /** + * The type of memory. E.g. CPU/GPU memory or some other valid type + */ + ucs_memory_type_t mem_type; + + /** + * Index of the system device on which the buffer resides. eg: NUMA/GPU + */ + ucs_sys_device_t sys_dev; +} uct_md_mem_attr_t; + + +/** + * @ingroup UCT_MD + * @brief Query attributes of a given pointer + * + * Return attributes such as memory type, and system device for the + * given pointer of specific length. + * + * @param [in] md Memory domain to run the query on. 
This function + * returns an error if the md does not recognize the + * pointer. + * @param [in] address The address of the pointer. Must be non-NULL + * else UCS_ERR_INVALID_PARAM error is returned. + * @param [in] length Length of the memory region to examine. + * Must be nonzero else UCS_ERR_INVALID_PARAM error + * is returned. + * @param [out] mem_attr If successful, filled with ptr attributes. + * + * @return Error code. + */ +ucs_status_t uct_md_mem_query(uct_md_h md, const void *address, const size_t length, + uct_md_mem_attr_t *mem_attr); + + /** * @ingroup UCT_MD * @brief Describes a memory allocated by UCT. @@ -867,7 +1325,7 @@ typedef struct uct_allocated_memory { void *address; /**< Address of allocated memory */ size_t length; /**< Real size of allocated memory */ uct_alloc_method_t method; /**< Method used to allocate the memory */ - uct_memory_type_t mem_type; /**< type of allocated memory */ + ucs_memory_type_t mem_type; /**< type of allocated memory */ uct_md_h md; /**< if method==MD: MD used to allocate the memory */ uct_mem_h memh; /**< if method==MD: MD memory handle */ } uct_allocated_memory_t; @@ -979,29 +1437,46 @@ extern const char *uct_alloc_method_names[]; /** * @ingroup UCT_RESOURCE - * @brief Query for memory resources. + * @brief Query for list of components. * - * Obtain the list of memory domain resources available on the current system. + * Obtain the list of transport components available on the current system. * - * @param [out] resources_p Filled with a pointer to an array of resource - * descriptors. - * @param [out] num_resources_p Filled with the number of resources in the array. + * @param [out] components_p Filled with a pointer to an array of component + * handles. + * @param [out] num_components_p Filled with the number of elements in the array. * - * @return Error code. + * @return UCS_OK if successful, or UCS_ERR_NO_MEMORY if failed to allocate the + * array of component handles. 
*/ -ucs_status_t uct_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p); +ucs_status_t uct_query_components(uct_component_h **components_p, + unsigned *num_components_p); /** * @ingroup UCT_RESOURCE - * @brief Release the list of resources returned from @ref uct_query_md_resources. + * @brief Release the list of components returned from @ref uct_query_components. * - * This routine releases the memory associated with the list of resources - * allocated by @ref uct_query_md_resources. + * This routine releases the memory associated with the list of components + * allocated by @ref uct_query_components. * - * @param [in] resources Array of resource descriptors to release. + * @param [in] components Array of component handles to release. + */ +void uct_release_component_list(uct_component_h *components); + + +/** + * @ingroup UCT_RESOURCE + * @brief Get component attributes + * + * Query various attributes of a component. + * + * @param [in] component Component handle to query attributes for. The + * handle can be obtained from @ref uct_query_components. + * @param [inout] component_attr Filled with component attributes. + * + * @return UCS_OK if successful, or nonzero error code in case of failure. */ -void uct_release_md_resource_list(uct_md_resource_desc_t *resources); +ucs_status_t uct_component_query(uct_component_h component, + uct_component_attr_t *component_attr); /** @@ -1012,8 +1487,10 @@ void uct_release_md_resource_list(uct_md_resource_desc_t *resources); * are performed in the context of a specific memory domain. Therefore it * must be created before communication resources. * + * @param [in] component Component on which to open the memory domain, + * as returned from @ref uct_query_components. * @param [in] md_name Memory domain name, as returned from @ref - * uct_query_md_resources. + * uct_component_query. * @param [in] config MD configuration options. 
Should be obtained * from uct_md_config_read() function, or point to * MD-specific structure which extends uct_md_config_t. @@ -1021,8 +1498,8 @@ void uct_release_md_resource_list(uct_md_resource_desc_t *resources); * * @return Error code. */ -ucs_status_t uct_md_open(const char *md_name, const uct_md_config_t *config, - uct_md_h *md_p); +ucs_status_t uct_md_open(uct_component_h component, const char *md_name, + const uct_md_config_t *config, uct_md_h *md_p); /** * @ingroup UCT_RESOURCE @@ -1178,8 +1655,8 @@ ucs_status_t uct_md_iface_config_read(uct_md_h md, const char *tl_name, /** * @ingroup UCT_RESOURCE - * @brief Release configuration memory returned from uct_md_iface_config_read() or - * from uct_md_config_read(). + * @brief Release configuration memory returned from uct_md_iface_config_read(), + * uct_md_config_read(), or from uct_cm_config_read(). * * @param [in] config Configuration to release. */ @@ -1188,7 +1665,9 @@ void uct_config_release(void *config); /** * @ingroup UCT_CONTEXT - * @brief Get value by name from interface/MD configuration. + * @brief Get value by name from interface configuration (@ref uct_iface_config_t), + * memory domain configuration (@ref uct_md_config_t) + * or connection manager configuration (@ref uct_cm_config_t). * * @param [in] config Configuration to get from. * @param [in] name Configuration variable name. @@ -1205,7 +1684,9 @@ ucs_status_t uct_config_get(void *config, const char *name, char *value, /** * @ingroup UCT_CONTEXT - * @brief Modify interface/MD configuration. + * @brief Modify interface configuration (@ref uct_iface_config_t), + * memory domain configuration (@ref uct_md_config_t) + * or connection manager configuration (@ref uct_cm_config_t). * * @param [in] config Configuration to modify. * @param [in] name Configuration variable name. 
@@ -1430,7 +1911,7 @@ ucs_status_t uct_iface_set_am_tracer(uct_iface_h iface, uct_am_tracer_t tracer, /** - * @ingroup UCT_RESOURCE + * @ingroup UCT_CLIENT_SERVER * @brief Accept connection request. * * @param [in] iface Transport interface which generated connection @@ -1445,7 +1926,7 @@ ucs_status_t uct_iface_accept(uct_iface_h iface, /** - * @ingroup UCT_RESOURCE + * @ingroup UCT_CLIENT_SERVER * @brief Reject connection request. Will invoke an error handler @ref * uct_error_handler_t on the remote transport interface, if set. * @@ -1475,13 +1956,14 @@ ucs_status_t uct_iface_reject(uct_iface_h iface, * @ref uct_ep_params_t::iface_addr are set, this will establish an endpoint * that is connected to a remote interface. This requires that * @ref uct_ep_params_t::iface has the @ref UCT_IFACE_FLAG_CONNECT_TO_IFACE - * capability flag. It may be obtained by @ref uct_iface_query . + * capability flag. It may be obtained by @ref uct_iface_query. * -# Connect to a remote socket address: If @ref uct_ep_params_t::sockaddr is - * set, this will create an endpoint that is conected to a remote socket. - * This requires that @ref uct_ep_params_t::iface has the - * @ref UCT_IFACE_FLAG_CONNECT_TO_SOCKADDR capability flag. It may be - * obtained by @ref uct_iface_query .* - * @param [in] params User defined @ref uct_ep_params_t configurations for the + * set, this will create an endpoint that is connected to a remote socket. + * This requires that either @ref uct_ep_params::cm, or + * @ref uct_ep_params::iface will be set. In the latter case, the interface + * has to support @ref UCT_IFACE_FLAG_CONNECT_TO_SOCKADDR flag, which can be + * checked by calling @ref uct_iface_query. + * @param [in] params User defined @ref uct_ep_params_t configuration for the * @a ep_p. * @param [out] ep_p Filled with handle to the new endpoint. 
* @@ -1496,6 +1978,41 @@ ucs_status_t uct_iface_reject(uct_iface_h iface, ucs_status_t uct_ep_create(const uct_ep_params_t *params, uct_ep_h *ep_p); +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Initiate a disconnection of an endpoint connected to a + * sockaddr by a connection manager @ref uct_cm_h. + * + * This non-blocking routine will send a disconnect notification on the endpoint, + * so that @ref uct_ep_disconnect_cb_t will be called on the remote peer. + * The remote side should also call this routine when handling the initiator's + * disconnect. + * After a call to this function, the given endpoint may not be used for + * communications anymore. + * The @ref uct_ep_flush / @ref uct_iface_flush routines will guarantee that the + * disconnect notification is delivered to the remote peer. + * @ref uct_ep_destroy should be called on this endpoint after invoking this + * routine and @ref uct_ep_params::disconnect_cb was called. + * + * @param [in] ep Endpoint to disconnect. + * @param [in] flags Reserved for future use. + * + * @return UCS_OK Operation has completed successfully. + * UCS_ERR_BUSY The @a ep is not connected yet (either + * @ref uct_cm_ep_client_connect_callback_t or + * @ref uct_cm_ep_server_conn_notify_callback_t + * was not invoked). + * UCS_INPROGRESS The disconnect request has been initiated, but + * the remote peer has not yet responded to this + * request, and consequently the registered + * callback @ref uct_ep_disconnect_cb_t has not + * been invoked to handle the request. + * UCS_ERR_NOT_CONNECTED The @a ep is disconnected locally and remotely. + * Other error codes as defined by @ref ucs_status_t . + */ +ucs_status_t uct_ep_disconnect(uct_ep_h ep, unsigned flags); + + /** * @ingroup UCT_RESOURCE * @brief Destroy an endpoint. 
@@ -1573,12 +2090,12 @@ ucs_status_t uct_md_mem_free(uct_md_h md, uct_mem_h memh); /** * @ingroup UCT_MD - * @brief Give advice about the use of memory + * @brief Give advice about the use of memory * * This routine advises the UCT about how to handle memory range beginning at * address and size of length bytes. This call does not influence the semantics - * of the application, but may influence its performance. The advice may be - * ignored. + * of the application, but may influence its performance. The advice may be + * ignored. * * @param [in] md Memory domain memory was allocated or registered on. * @param [in] memh Memory handle, as returned from @ref uct_md_mem_alloc @@ -1621,16 +2138,21 @@ ucs_status_t uct_md_mem_dereg(uct_md_h md, uct_mem_h memh); /** * @ingroup UCT_MD - * @brief Check if memory type is owned by MD + * @brief Detect memory type * - * Check memory type. - * @return Nonzero if memory is owned, 0 if not owned * - * @param [in] md Memory domain to detect if memory belongs to. - * @param [in] addr Memory address to detect. - * @param [in] length Size of memory + * @param [in] md Memory domain to detect memory type + * @param [in] addr Memory address to detect. + * @param [in] length Size of memory + * @param [out] mem_type_p Filled with memory type of the address range if + function succeeds + * @return UCS_OK If memory type is successfully detected + * UCS_ERR_INVALID_ADDR If failed to detect memory type */ -int uct_md_is_mem_type_owned(uct_md_h md, void *addr, size_t length); +ucs_status_t uct_md_detect_memory_type(uct_md_h md, const void *addr, + size_t length, + ucs_memory_type_t *mem_type_p); + /** * @ingroup UCT_MD @@ -1685,9 +2207,9 @@ ucs_status_t uct_mem_free(const uct_allocated_memory_t *mem); /** * @ingroup UCT_MD - * @brief Read the configuration of the MD component. + * @brief Read the configuration for a memory domain. * - * @param [in] name Name of the MD or the MD component. 
+ * @param [in] component Read the configuration of this component. * @param [in] env_prefix If non-NULL, search for environment variables * starting with this UCT__. Otherwise, search * for environment variables starting with just UCT_. @@ -1697,8 +2219,8 @@ ucs_status_t uct_mem_free(const uct_allocated_memory_t *mem); * * @return Error code. */ -ucs_status_t uct_md_config_read(const char *name, const char *env_prefix, - const char *filename, +ucs_status_t uct_md_config_read(uct_component_h component, + const char *env_prefix, const char *filename, uct_md_config_t **config_p); @@ -1746,12 +2268,22 @@ ucs_status_t uct_md_mkey_pack(uct_md_h md, uct_mem_h memh, void *rkey_buffer); * * @brief Unpack a remote key. * + * @param [in] component Component on which to unpack the remote key. * @param [in] rkey_buffer Packed remote key buffer. * @param [out] rkey_ob Filled with the unpacked remote key and its type. * + * @note The remote key must be unpacked with the same component that was used + * to pack it. For example, if a remote device address on the remote + * memory domain which was used to pack the key is reachable by a + * transport on a local component, then that component is eligible to + * unpack the key. + * If the remote key buffer cannot be unpacked with the given component, + * UCS_ERR_INVALID_PARAM will be returned. + * * @return Error code. */ -ucs_status_t uct_rkey_unpack(const void *rkey_buffer, uct_rkey_bundle_t *rkey_ob); +ucs_status_t uct_rkey_unpack(uct_component_h component, const void *rkey_buffer, + uct_rkey_bundle_t *rkey_ob); /** @@ -1763,6 +2295,8 @@ ucs_status_t uct_rkey_unpack(const void *rkey_buffer, uct_rkey_bundle_t *rkey_ob * described by the rkey bundle. The MD must support * @ref UCT_MD_FLAG_RKEY_PTR flag. * + * @param [in] component Component on which to obtain the pointer to the + * remote key. * @param [in] rkey_ob A remote key bundle as returned by * the @ref uct_rkey_unpack function. 
* @param [in] remote_addr A remote address within the memory area described @@ -1770,11 +2304,15 @@ ucs_status_t uct_rkey_unpack(const void *rkey_buffer, uct_rkey_bundle_t *rkey_ob * @param [out] addr_p A pointer that can be used for direct access to * the remote memory. * + * @note The component used to obtain a local pointer to the remote memory must + * be the same component that was used to pack the remote key. See notes + * section for @ref uct_rkey_unpack. + * * @return Error code if the remote memory cannot be accessed directly or * the remote address is not valid. */ -ucs_status_t uct_rkey_ptr(uct_rkey_bundle_t *rkey_ob, uint64_t remote_addr, - void **addr_p); +ucs_status_t uct_rkey_ptr(uct_component_h component, uct_rkey_bundle_t *rkey_ob, + uint64_t remote_addr, void **addr_p); /** @@ -1782,9 +2320,11 @@ ucs_status_t uct_rkey_ptr(uct_rkey_bundle_t *rkey_ob, uint64_t remote_addr, * * @brief Release a remote key. * + * @param [in] component Component which was used to unpack the remote key. * @param [in] rkey_ob Remote key to release. */ -ucs_status_t uct_rkey_release(const uct_rkey_bundle_t *rkey_ob); +ucs_status_t uct_rkey_release(uct_component_h component, + const uct_rkey_bundle_t *rkey_ob); /** @@ -1799,7 +2339,7 @@ ucs_status_t uct_rkey_release(const uct_rkey_bundle_t *rkey_ob); * * @param [in] worker Handle to worker. * - * @return Non-zero if any communication was progressed, zero otherwise. + * @return Nonzero if any communication was progressed, zero otherwise. */ UCT_INLINE_API unsigned uct_worker_progress(uct_worker_h worker) { @@ -1910,14 +2450,14 @@ UCT_INLINE_API ssize_t uct_ep_put_bcopy(uct_ep_h ep, uct_pack_callback_t pack_cb * * @param [in] ep Destination endpoint handle. * @param [in] iov Points to an array of @ref ::uct_iov_t structures. - * The @a iov pointer must be valid address of an array + * The @a iov pointer must be a valid address of an array * of @ref ::uct_iov_t structures. 
A particular structure - * pointer must be valid address. NULL terminated pointer - * is not required. + * pointer must be a valid address. A NULL terminated + * array is not required. * @param [in] iovcnt Size of the @a iov data @ref ::uct_iov_t structures * array. If @a iovcnt is zero, the data is considered empty. * @a iovcnt is limited by @ref uct_iface_attr_cap_put_max_iov - * "uct_iface_attr::cap::put::max_iov" + * "uct_iface_attr::cap::put::max_iov". * @param [in] remote_addr Remote address to place the @a iov data. * @param [in] rkey Remote key descriptor provided by @ref ::uct_rkey_unpack * @param [in] comp Completion handle as defined by @ref ::uct_completion_t. @@ -1973,14 +2513,14 @@ UCT_INLINE_API ucs_status_t uct_ep_get_bcopy(uct_ep_h ep, uct_unpack_callback_t * * @param [in] ep Destination endpoint handle. * @param [in] iov Points to an array of @ref ::uct_iov_t structures. - * The @a iov pointer must be valid address of an array + * The @a iov pointer must be a valid address of an array * of @ref ::uct_iov_t structures. A particular structure - * pointer must be valid address. NULL terminated pointer - * is not required. + * pointer must be a valid address. A NULL terminated + * array is not required. * @param [in] iovcnt Size of the @a iov data @ref ::uct_iov_t structures * array. If @a iovcnt is zero, the data is considered empty. * @a iovcnt is limited by @ref uct_iface_attr_cap_get_max_iov - * "uct_iface_attr::cap::get::max_iov" + * "uct_iface_attr::cap::get::max_iov". * @param [in] remote_addr Remote address of the data placed to the @a iov. * @param [in] rkey Remote key descriptor provided by @ref ::uct_rkey_unpack * @param [in] comp Completion handle as defined by @ref ::uct_completion_t. @@ -2032,26 +2572,32 @@ UCT_INLINE_API ssize_t uct_ep_am_bcopy(uct_ep_h ep, uint8_t id, * iov[1], and so on. * * - * @param [in] ep Destination endpoint handle. - * @param [in] id Active message id. Must be in range 0..UCT_AM_ID_MAX-1. 
- * @param [in] header Active message header. - * @param [in] header_length Active message header length in bytes. - * @param [in] iov Points to an array of @ref ::uct_iov_t structures. - * The @a iov pointer must be valid address of an array - * of @ref ::uct_iov_t structures. A particular structure - * pointer must be valid address. NULL terminated pointer - * is not required. - * @param [in] iovcnt Size of the @a iov data @ref ::uct_iov_t structures - * array. If @a iovcnt is zero, the data is considered empty. - * @a iovcnt is limited by @ref uct_iface_attr_cap_am_max_iov - * "uct_iface_attr::cap::am::max_iov" - * @param [in] flags Active message flags, see @ref uct_msg_flags. - * @param [in] comp Completion handle as defined by @ref ::uct_completion_t. - * - * @return UCS_INPROGRESS Some communication operations are still in progress. - * If non-NULL @a comp is provided, it will be updated - * upon completion of these operations. - * + * @param [in] ep Destination endpoint handle. + * @param [in] id Active message id. Must be in range 0..UCT_AM_ID_MAX-1. + * @param [in] header Active message header. + * @param [in] header_length Active message header length in bytes. + * @param [in] iov Points to an array of @ref ::uct_iov_t structures. + * The @a iov pointer must be a valid address of an array + * of @ref ::uct_iov_t structures. A particular structure + * pointer must be a valid address. A NULL terminated + * array is not required. + * @param [in] iovcnt Size of the @a iov data @ref ::uct_iov_t structures + * array. If @a iovcnt is zero, the data is considered empty. + * @a iovcnt is limited by @ref uct_iface_attr_cap_am_max_iov + * "uct_iface_attr::cap::am::max_iov". + * @param [in] flags Active message flags, see @ref uct_msg_flags. + * @param [in] comp Completion handle as defined by @ref ::uct_completion_t. + * + * @return UCS_OK Operation completed successfully. + * @return UCS_INPROGRESS Some communication operations are still in progress. 
+ * If non-NULL @a comp is provided, it will be updated + * upon completion of these operations. + * @return UCS_ERR_NO_RESOURCE Could not start the operation due to lack of send + * resources. + * + * @note If the operation returns @a UCS_INPROGRESS, the memory buffers + * pointed to by @a iov array must not be modified until the operation + * is completed by @a comp. @a header can be released or changed. */ UCT_INLINE_API ucs_status_t uct_ep_am_zcopy(uct_ep_h ep, uint8_t id, const void *header, @@ -2144,16 +2690,16 @@ UCT_INLINE_API ucs_status_t uct_ep_atomic64_fetch(uct_ep_h ep, uct_atomic_op_t o * @ingroup UCT_RESOURCE * @brief Add a pending request to an endpoint. * - * Add a pending request to the endpoint pending queue. The request will be + * Add a pending request to the endpoint pending queue. The request will be * dispatched when the endpoint could potentially have additional send resources. * * @param [in] ep Endpoint to add the pending request to. * @param [in] req Pending request, which would be dispatched when more * resources become available. The user is expected to initialize * the "func" field. - * After passed to the function, the request is owned by UCT, + * After being passed to the function, the request is owned by UCT, * until the callback is called and returns UCS_OK. - * @param [in] flags Reserved for future use. + * @param [in] flags Flags that control pending request processing (see @ref uct_cb_flags) * * @return UCS_OK - request added to pending queue * UCS_ERR_BUSY - request was not added to pending queue, because send @@ -2250,7 +2796,7 @@ UCT_INLINE_API ucs_status_t uct_ep_fence(uct_ep_h ep, unsigned flags) * The data is provided as buffer and its length,and must not be larger than the * corresponding @a max_short value in @ref uct_iface_attr. * The immediate value delivered to the receiver is implicitly equal to 0. 
- * If it's required to pass non-zero imm value, @ref uct_ep_tag_eager_bcopy + * If it's required to pass nonzero imm value, @ref uct_ep_tag_eager_bcopy * should be used. * * @param [in] ep Destination endpoint handle. @@ -2259,8 +2805,8 @@ UCT_INLINE_API ucs_status_t uct_ep_fence(uct_ep_h ep, unsigned flags) * @param [in] length Data length. * * @return UCS_OK - operation completed successfully. - * @return UCS_ERR_NO_RESOURCE - could not start the operation now due to lack - * of send resources. + * @return UCS_ERR_NO_RESOURCE - could not start the operation due to lack of + * send resources. */ UCT_INLINE_API ucs_status_t uct_ep_tag_eager_short(uct_ep_h ep, uct_tag_t tag, const void *data, size_t length) @@ -2321,8 +2867,8 @@ UCT_INLINE_API ssize_t uct_ep_tag_eager_bcopy(uct_ep_h ep, uct_tag_t tag, * @param [in] imm Immediate value which will be available to the * receiver. * @param [in] iov Points to an array of @ref uct_iov_t structures. - * A particular structure pointer must be valid address. - * NULL terminated pointer is not required. + * A particular structure pointer must be a valid address. + * A NULL terminated array is not required. * @param [in] iovcnt Size of the @a iov array. If @a iovcnt is zero, the * data is considered empty. Note that @a iovcnt is * limited by the corresponding @a max_iov value in @@ -2333,8 +2879,8 @@ UCT_INLINE_API ssize_t uct_ep_tag_eager_bcopy(uct_ep_h ep, uct_tag_t tag, * can be reused or invalidated. * * @return UCS_OK - operation completed successfully. - * @return UCS_ERR_NO_RESOURCE - could not start the operation now due to lack - * of send resources. + * @return UCS_ERR_NO_RESOURCE - could not start the operation due to lack of + * send resources. * @return UCS_INPROGRESS - operation started, and @a comp will be used to * notify when it's completed. */ @@ -2371,7 +2917,7 @@ UCT_INLINE_API ucs_status_t uct_ep_tag_eager_zcopy(uct_ep_h ep, uct_tag_t tag, * value in @ref uct_iface_attr. 
* @param [in] iov Points to an array of @ref uct_iov_t structures. * A particular structure pointer must be valid - * address. NULL terminated pointer is not required. + * address. A NULL terminated array is not required. * @param [in] iovcnt Size of the @a iov array. If @a iovcnt is zero, * the data is considered empty. Note that @a iovcnt * is limited by the corresponding @a max_iov value @@ -2439,7 +2985,7 @@ UCT_INLINE_API ucs_status_t uct_ep_tag_rndv_cancel(uct_ep_h ep, void *op) * @param [in] flags Tag message flags, see @ref uct_msg_flags. * * @return UCS_OK - operation completed successfully. - * @return UCS_ERR_NO_RESOURCE - could not start the operation now due to lack of + * @return UCS_ERR_NO_RESOURCE - could not start the operation due to lack of * send resources. */ UCT_INLINE_API ucs_status_t uct_ep_tag_rndv_request(uct_ep_h ep, uct_tag_t tag, @@ -2466,14 +3012,14 @@ UCT_INLINE_API ucs_status_t uct_ep_tag_rndv_request(uct_ep_h ep, uct_tag_t tag, * @param [in] tag_mask Mask which specifies what bits of the tag to * compare. * @param [in] iov Points to an array of @ref ::uct_iov_t structures. - * The @a iov pointer must be valid address of an array + * The @a iov pointer must be a valid address of an array * of @ref ::uct_iov_t structures. A particular structure - * pointer must be valid address. NULL terminated pointer - * is not required. + * pointer must be a valid address. A NULL terminated + * array is not required. * @param [in] iovcnt Size of the @a iov data @ref ::uct_iov_t structures * array. If @a iovcnt is zero, the data is considered empty. * @a iovcnt is limited by @ref uct_iface_attr_cap_tag_recv_iov - * "uct_iface_attr::cap::tag::max_iov" + * "uct_iface_attr::cap::tag::max_iov". * @param [inout] ctx Context associated with this particular tag, "priv" field * in this structure is used to track the state internally. 
* @@ -2582,6 +3128,158 @@ UCT_INLINE_API unsigned uct_iface_progress(uct_iface_h iface) } +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Open a connection manager. + * + * Open a connection manager. All client server connection + * establishment operations are performed in the context of a specific + * connection manager. + * @note This is an alternative API for + * @ref uct_iface_open_mode::UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER and + * @ref uct_iface_open_mode::UCT_IFACE_OPEN_MODE_SOCKADDR_CLIENT . + * + * @param [in] component Component on which to open the connection manager, + * as returned from @ref uct_query_components. + * @param [in] worker Worker on which to open the connection manager. + * @param [in] config CM configuration options. Either obtained + * from @ref uct_cm_config_read() function, or pointer + * to CM-specific structure that extends + * @ref uct_cm_config_t. + * @param [out] cm_p Filled with a handle to the connection manager. + * + * @return Error code. + */ +ucs_status_t uct_cm_open(uct_component_h component, uct_worker_h worker, + const uct_cm_config_t *config, uct_cm_h *cm_p); + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Close a connection manager. + * + * @param [in] cm Connection manager to close. + */ +void uct_cm_close(uct_cm_h cm); + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Get connection manager attributes. + * + * This routine queries the @ref uct_cm_h "cm" for its attributes + * @ref uct_cm_attr_t. + * + * @param [in] cm Connection manager to query. + * @param [out] cm_attr Filled with connection manager attributes. + */ +ucs_status_t uct_cm_query(uct_cm_h cm, uct_cm_attr_t *cm_attr); + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Read the configuration for a connection manager. + * + * @param [in] component Read the configuration of the connection manager + * on this component. + * @param [in] env_prefix If non-NULL, search for environment variables + * starting with this UCT__. 
Otherwise, search + * for environment variables starting with just UCT_. + * @param [in] filename If non-NULL, read configuration from this file. If + * the file does not exist, or exists but cannot be + * opened or read, it will be ignored. + * @param [out] config_p Filled with a pointer to the configuration. + * + * @return Error code. + */ +ucs_status_t uct_cm_config_read(uct_component_h component, + const char *env_prefix, const char *filename, + uct_cm_config_t **config_p); + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Notify the server about client-side connection establishment. + * + * This routine should be called on the client side after the client completed + * establishing its connection to the server. The routine will send a + * notification message to the server indicating that the client is connected. + * + * @param [in] ep The connected endpoint on the client side. + * + * @return Error code. + */ +ucs_status_t uct_cm_client_ep_conn_notify(uct_ep_h ep); + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Create a new transport listener object. + * + * This routine creates a new listener on the given CM which will start + * listening on a given sockaddr. + * + * @param [in] cm Connection manager on which to open the listener. + * This cm should not be closed as long as there are + * open listeners on it. + * @param [in] saddr The socket address to listen on. + * @param [in] socklen The saddr length. + * @param [in] params User defined @ref uct_listener_params_t + * configurations for the @a listener_p. + * @param [out] listener_p Filled with handle to the new listener. + * + * @return Error code. + */ +ucs_status_t uct_listener_create(uct_cm_h cm, const struct sockaddr *saddr, + socklen_t socklen, + const uct_listener_params_t *params, + uct_listener_h *listener_p); + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Destroy a transport listener. + * + * @param [in] listener Listener to destroy. 
+ */ +void uct_listener_destroy(uct_listener_h listener); + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Reject a connection request. + * + * This routine can be invoked on the server side. It rejects a connection request + * from the client. + * + * @param [in] listener Listener which will reject the connection request. + * @param [in] conn_request Connection establishment request passed as parameter + * of @ref uct_cm_listener_conn_request_callback_t in + * @ref uct_cm_listener_conn_request_args_t::conn_request. + * + * + * @return Error code as defined by @ref ucs_status_t + */ +ucs_status_t uct_listener_reject(uct_listener_h listener, + uct_conn_request_h conn_request); + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Get attributes specific to a particular listener. + * + * This routine queries the @ref uct_listener_h "listener" for its attributes + * @ref uct_listener_attr_t. + * + * @param [in] listener Listener object to query. + * @param [out] listener_attr Filled with attributes of the listener. + * + * @return Error code as defined by @ref ucs_status_t + */ +ucs_status_t uct_listener_query(uct_listener_h listener, + uct_listener_attr_t *listener_attr); + + /** * @example uct_hello_world.c * UCT hello world client / server example utility. diff --git a/src/uct/api/uct_def.h b/src/uct/api/uct_def.h index e10a733c14b..41d79620bd6 100644 --- a/src/uct/api/uct_def.h +++ b/src/uct/api/uct_def.h @@ -15,8 +15,8 @@ #include +#define UCT_COMPONENT_NAME_MAX 16 #define UCT_TL_NAME_MAX 10 -#define UCT_MD_COMPONENT_NAME_MAX 16 #define UCT_MD_NAME_MAX 16 #define UCT_DEVICE_NAME_MAX 32 #define UCT_PENDING_REQ_PRIV_LEN 40 @@ -45,10 +45,10 @@ enum uct_am_trace_type { * @ingroup UCT_RESOURCE * @brief Flags for active message and tag-matching offload callbacks (callback's parameters). 
* - * If this flag is enabled, then data is part of a descriptor which includes - * the user-defined rx_headroom, and the callback may return UCS_INPROGRESS - * and hold on to that descriptor. Otherwise, the data can't be used outside - * the callback. If needed, the data must be copied-out. + * If UCT_CB_PARAM_FLAG_DESC flag is enabled, then data is part of a descriptor + * which includes the user-defined rx_headroom, and the callback may return + * UCS_INPROGRESS and hold on to that descriptor. Otherwise, the data can't be + * used outside the callback. If needed, the data must be copied-out. * @verbatim descriptor data @@ -58,40 +58,55 @@ enum uct_am_trace_type { +-------------+-------------------------+ @endverbatim * + * UCT_CB_PARAM_FLAG_FIRST and UCT_CB_PARAM_FLAG_MORE flags are relevant for + * @ref uct_tag_unexp_eager_cb_t callback only. The former value indicates that + * the data is the first fragment of the message. The latter value means that + * more fragments of the message yet to be delivered. 
*/ enum uct_cb_param_flags { - UCT_CB_PARAM_FLAG_DESC = UCS_BIT(0) + UCT_CB_PARAM_FLAG_DESC = UCS_BIT(0), + UCT_CB_PARAM_FLAG_FIRST = UCS_BIT(1), + UCT_CB_PARAM_FLAG_MORE = UCS_BIT(2) }; /** * @addtogroup UCT_RESOURCE * @{ */ -typedef struct uct_iface *uct_iface_h; -typedef struct uct_iface_config uct_iface_config_t; -typedef struct uct_md_config uct_md_config_t; -typedef struct uct_ep *uct_ep_h; -typedef void * uct_mem_h; -typedef uintptr_t uct_rkey_t; -typedef struct uct_md *uct_md_h; /**< @brief Memory domain handler */ -typedef struct uct_md_ops uct_md_ops_t; -typedef void *uct_rkey_ctx_h; -typedef struct uct_iface_attr uct_iface_attr_t; -typedef struct uct_iface_params uct_iface_params_t; -typedef struct uct_md_attr uct_md_attr_t; -typedef struct uct_completion uct_completion_t; -typedef struct uct_pending_req uct_pending_req_t; -typedef struct uct_worker *uct_worker_h; -typedef struct uct_md uct_md_t; -typedef enum uct_am_trace_type uct_am_trace_type_t; -typedef struct uct_device_addr uct_device_addr_t; -typedef struct uct_iface_addr uct_iface_addr_t; -typedef struct uct_ep_addr uct_ep_addr_t; -typedef struct uct_ep_params uct_ep_params_t; -typedef struct uct_tag_context uct_tag_context_t; -typedef uint64_t uct_tag_t; /* tag type - 64 bit */ -typedef int uct_worker_cb_id_t; -typedef void* uct_conn_request_h; +typedef struct uct_component *uct_component_h; +typedef struct uct_iface *uct_iface_h; +typedef struct uct_iface_config uct_iface_config_t; +typedef struct uct_md_config uct_md_config_t; +typedef struct uct_cm_config uct_cm_config_t; +typedef struct uct_ep *uct_ep_h; +typedef void * uct_mem_h; +typedef uintptr_t uct_rkey_t; +typedef struct uct_md *uct_md_h; /**< @brief Memory domain handler */ +typedef struct uct_md_ops uct_md_ops_t; +typedef void *uct_rkey_ctx_h; +typedef struct uct_iface_attr uct_iface_attr_t; +typedef struct uct_iface_params uct_iface_params_t; +typedef struct uct_md_attr uct_md_attr_t; +typedef struct uct_completion uct_completion_t; 
+typedef struct uct_pending_req uct_pending_req_t; +typedef struct uct_worker *uct_worker_h; +typedef struct uct_md uct_md_t; +typedef enum uct_am_trace_type uct_am_trace_type_t; +typedef struct uct_device_addr uct_device_addr_t; +typedef struct uct_iface_addr uct_iface_addr_t; +typedef struct uct_ep_addr uct_ep_addr_t; +typedef struct uct_ep_params uct_ep_params_t; +typedef struct uct_cm_attr uct_cm_attr_t; +typedef struct uct_cm uct_cm_t; +typedef uct_cm_t *uct_cm_h; +typedef struct uct_listener_attr uct_listener_attr_t; +typedef struct uct_listener *uct_listener_h; +typedef struct uct_listener_params uct_listener_params_t; +typedef struct uct_tag_context uct_tag_context_t; +typedef uint64_t uct_tag_t; /* tag type - 64 bit */ +typedef int uct_worker_cb_id_t; +typedef void* uct_conn_request_h; + /** * @} */ @@ -135,6 +150,273 @@ typedef struct uct_iov { } uct_iov_t; +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Client-Server private data pack callback arguments field mask. + * + * The enumeration allows specifying which fields in + * @ref uct_cm_ep_priv_data_pack_args are present, for backward compatibility support. + */ +enum uct_cm_ep_priv_data_pack_args_field { + /** Enables @ref uct_cm_ep_priv_data_pack_args::dev_name + * Indicates that dev_name field in uct_cm_ep_priv_data_pack_args_t is valid. + */ + UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME = UCS_BIT(0) +}; + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Arguments to the client-server private data pack callback. + * + * Used with the client-server API on a connection manager. + */ +typedef struct uct_cm_ep_priv_data_pack_args { + /** + * Mask of valid fields in this structure, using bits from + * @ref uct_cm_ep_priv_data_pack_args_field. + * Fields not specified by this mask should not be accessed by the callback. + */ + uint64_t field_mask; + + /** + * Device name. This routine may fill the user's private data according to + * the given device name. 
The device name that is passed to this routine, + * corresponds to @ref uct_tl_resource_desc_t::dev_name as returned from + * @ref uct_md_query_tl_resources. + */ + char dev_name[UCT_DEVICE_NAME_MAX]; +} uct_cm_ep_priv_data_pack_args_t; + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Remote data attributes field mask. + * + * The enumeration allows specifying which fields in @ref uct_cm_remote_data are + * present, for backward compatibility support. + */ +enum uct_cm_remote_data_field { + /** Enables @ref uct_cm_remote_data::dev_addr */ + UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR = UCS_BIT(0), + + /** Enables @ref uct_cm_remote_data::dev_addr_length */ + UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR_LENGTH = UCS_BIT(1), + + /** Enables @ref uct_cm_remote_data::conn_priv_data */ + UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA = UCS_BIT(2), + + /** Enables @ref uct_cm_remote_data::conn_priv_data_length */ + UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA_LENGTH = UCS_BIT(3) +}; + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Data received from the remote peer. + * + * The remote peer's device address, the data received from it and their lengths. + * Used with the client-server API on a connection manager. + */ +typedef struct uct_cm_remote_data { + /** + * Mask of valid fields in this structure, using bits from + * @ref uct_cm_remote_data_field. Fields not specified by this mask + * will be ignored. + */ + uint64_t field_mask; + + /** + * Device address of the remote peer. + */ + const uct_device_addr_t *dev_addr; + + /** + * Length of the remote device address. + */ + size_t dev_addr_length; + + /** + * Pointer to the received data. This is the private data that was passed to + * @ref uct_ep_params_t::sockaddr_pack_cb. + */ + const void *conn_priv_data; + + /** + * Length of the received data from the peer. + */ + size_t conn_priv_data_length; +} uct_cm_remote_data_t; + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Listener's connection request callback arguments field mask. 
+ * + * The enumeration allows specifying which fields in + * @ref uct_cm_listener_conn_request_args are present, for backward compatibility + * support. + */ +enum uct_cm_listener_conn_request_args_field { + /** Enables @ref uct_cm_listener_conn_request_args::dev_name + * Indicates that dev_name field in uct_cm_listener_conn_request_args_t is + * valid. + */ + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_DEV_NAME = UCS_BIT(0), + + /** Enables @ref uct_cm_listener_conn_request_args::conn_request + * Indicates that conn_request field in uct_cm_listener_conn_request_args_t + * is valid. + */ + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CONN_REQUEST = UCS_BIT(1), + + /** Enables @ref uct_cm_listener_conn_request_args::remote_data + * Indicates that remote_data field in uct_cm_listener_conn_request_args_t + * is valid. + */ + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_REMOTE_DATA = UCS_BIT(2), + + /** Enables @ref uct_cm_listener_conn_request_args::client_address + * Indicates that client_address field in uct_cm_listener_conn_request_args_t + * is valid. + */ + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CLIENT_ADDR = UCS_BIT(3) +}; + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Arguments to the listener's connection request callback. + * + * The local device name, connection request handle and the data the client sent. + * Used with the client-server API on a connection manager. + */ +typedef struct uct_cm_listener_conn_request_args { + /** + * Mask of valid fields in this structure, using bits from + * @ref uct_cm_listener_conn_request_args_field. + * Fields not specified by this mask should not be acceessed by the callback. + */ + uint64_t field_mask; + + /** + * Local device name which handles the incoming connection request. + */ + char dev_name[UCT_DEVICE_NAME_MAX]; + + /** + * Connection request handle. Can be passed to this callback from the + * transport and will be used by it to accept or reject the connection + * request from the client. 
+ */ + uct_conn_request_h conn_request; + + /** + * Remote data from the client. + */ + const uct_cm_remote_data_t *remote_data; + + /** + * Client's address. + */ + ucs_sock_addr_t client_address; +} uct_cm_listener_conn_request_args_t; + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Field mask flags for client-side connection established callback. + * + * The enumeration allows specifying which fields in + * @ref uct_cm_ep_client_connect_args are present, for backward compatibility + * support. + */ +enum uct_cm_ep_client_connect_args_field { + /** Enables @ref uct_cm_ep_client_connect_args::remote_data */ + UCT_CM_EP_CLIENT_CONNECT_ARGS_FIELD_REMOTE_DATA = UCS_BIT(0), + + /** Enables @ref uct_cm_ep_client_connect_args::status */ + UCT_CM_EP_CLIENT_CONNECT_ARGS_FIELD_STATUS = UCS_BIT(1) +}; + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Arguments to the client's connect callback. + * + * Used with the client-server API on a connection manager. + */ +typedef struct uct_cm_ep_client_connect_args { + /** + * Mask of valid fields in this structure, using bits from + * @ref uct_cm_ep_client_connect_args_field. + * Fields not specified by this mask should not be accessed by the callback. + */ + uint64_t field_mask; + + /** + * Remote data from the server. + */ + const uct_cm_remote_data_t *remote_data; + + /** + * Indicates the connection establishment response from the remote server: + * UCS_OK - the remote server accepted the connection request. + * UCS_ERR_REJECTED - the remote server rejected the connection request. + * UCS_ERR_CONNECTION_RESET - the server's connection was reset during + * the connection establishment to the client. + * Otherwise - indicates an internal connection establishment + * error on the local (client) side. + */ + ucs_status_t status; +} uct_cm_ep_client_connect_args_t; + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Field mask flags for server-side connection established notification + * callback. 
+ * + * The enumeration allows specifying which fields in + * @ref uct_cm_ep_server_conn_notify_args are present, for backward compatibility + * support. + */ +enum uct_cm_ep_server_conn_notify_args_field { + /** Enables @ref uct_cm_ep_server_conn_notify_args::status + * Indicates that status field in uct_cm_ep_server_conn_notify_args_t is valid. + */ + UCT_CM_EP_SERVER_CONN_NOTIFY_ARGS_FIELD_STATUS = UCS_BIT(0) +}; + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Arguments to the server's notify callback. + * + * Used with the client-server API on a connection manager. + */ +typedef struct uct_cm_ep_server_conn_notify_args { + /** + * Mask of valid fields in this structure, using bits from + * @ref uct_cm_ep_server_conn_notify_args_field. + * Fields not specified by this mask should not be accessed by the callback. + */ + uint64_t field_mask; + + /** + * Indicates the client's @ref ucs_status_t status: + * UCS_OK - the client completed its connection + * establishment and called + * @ref uct_cm_client_ep_conn_notify + * UCS_ERR_CONNECTION_RESET - the client's connection was reset during + * the connection establishment to the server. + * Otherwise - indicates an internal connection establishment + * error on the local (server) side. + */ + ucs_status_t status; +} uct_cm_ep_server_conn_notify_args_t; + + /** * @ingroup UCT_AM * @brief Callback to process incoming active message @@ -269,9 +551,8 @@ typedef void (*uct_unpack_callback_t)(void *arg, const void *data, size_t length /** - * @ingroup UCT_RESOURCE - * @brief Callback to process an incoming connection request message on the server - * side. + * @ingroup UCT_CLIENT_SERVER + * @brief Callback to process an incoming connection request on the server side. * * This callback routine will be invoked on the server side upon receiving an * incoming connection request. 
It should be set by the server side while @@ -287,6 +568,8 @@ typedef void (*uct_unpack_callback_t)(void *arg, const void *data, size_t length * should accept or reject the request by calling * @ref uct_iface_accept or @ref uct_iface_reject * routines respectively. + * conn_request should not be used outside the + * scope of this callback. * @param [in] conn_priv_data Points to the received data. * This is the private data that was passed to the * @ref uct_ep_params_t::sockaddr_pack_cb on the @@ -302,35 +585,131 @@ typedef void /** - * @ingroup UCT_RESOURCE - * @brief Callback to fill the user's private data on the client side. + * @ingroup UCT_CLIENT_SERVER + * @brief Callback to process an incoming connection request on the server side + * listener in a connection manager. + * + * This callback routine will be invoked on the server side upon receiving an + * incoming connection request. It should be set by the server side while + * initializing a listener in a connection manager. + * This callback has to be thread safe. + * Other than communication progress routines, it is allowed to call other UCT + * communication routines from this callback. + * + * @param [in] listener Transport listener. + * @param [in] arg User argument for this callback as defined in + * @ref uct_listener_params_t::user_data + * @param [in] conn_req_args Listener's arguments to handle the connection + * request from the client. + */ +typedef void +(*uct_cm_listener_conn_request_callback_t)(uct_listener_h listener, void *arg, + const uct_cm_listener_conn_request_args_t + *conn_req_args); + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Callback to process an incoming connection establishment acknowledgment + * on the server side listener, from the client, which indicates that the + * client side is connected. + * The callback also notifies the server side of a local error on a + * not-yet-connected endpoint. 
+ * + * This callback routine will be invoked on the server side upon receiving an + * incoming connection establishment acknowledgment from the client, which is sent + * from it once the client is connected to the server. Used to connect the server + * side to the client or handle an error from it - depending on the status field. + * This callback will also be invoked in the event of an internal local error + * with a failed @ref uct_cm_ep_server_conn_notify_args::status if the endpoint + * was not connected yet. + * This callback has to be thread safe. + * Other than communication progress routines, it is permissible to call other UCT + * communication routines from this callback. + * + * @param [in] ep Transport endpoint. + * @param [in] arg User argument for this callback as defined in + * @ref uct_ep_params_t::user_data + * @param [in] connect_args Server's connect callback arguments. + */ +typedef void (*uct_cm_ep_server_conn_notify_callback_t) + (uct_ep_h ep, void *arg, + const uct_cm_ep_server_conn_notify_args_t *connect_args); + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Callback to process an incoming connection response on the client side + * from the server or handle a local error on a not-yet-connected endpoint. + * + * This callback routine will be invoked on the client side upon receiving an + * incoming connection response from the server. Used to connect the client side + * to the server or handle an error from it - depending on the status field. + * This callback will also be invoked in the event of an internal local error + * with a failed @ref uct_cm_ep_client_connect_args::status if the endpoint was + * not connected yet. + * This callback has to be thread safe. + * Other than communication progress routines, it is permissible to call other UCT + * communication routines from this callback. * - * This callback routine will be invoked on the client side before sending the - * transport's connection request to the server. 
- * The callback routine must be set by the client when creating an endpoint. + * @param [in] ep Transport endpoint. + * @param [in] arg User argument for this callback as defined in + * @ref uct_ep_params_t::user_data. + * @param [in] connect_args Client's connect callback arguments + */ +typedef void (*uct_cm_ep_client_connect_callback_t)(uct_ep_h ep, void *arg, + const uct_cm_ep_client_connect_args_t + *connect_args); + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Callback to handle the disconnection of the remote peer. + * + * This callback routine will be invoked on the client and server sides upon + * a disconnect of the remote peer. It will disconnect the given endpoint from + * the remote peer. + * This callback won't be invoked if the endpoint was not connected to the remote + * peer yet. + * This callback has to be thread safe. + * Other than communication progress routines, it is permissible to call other UCT + * communication routines from this callback. + * + * @param [in] ep Transport endpoint to disconnect. + * @param [in] arg User argument for this callback as defined in + * @ref uct_ep_params_t::user_data. + */ +typedef void (*uct_ep_disconnect_cb_t)(uct_ep_h ep, void *arg); + + +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Callback to fill the user's private data in a client-server flow. + * + * This callback routine will be invoked on the client side, before sending the + * transport's connection request to the server, or on the server side before + * sending a connection response to the client. + * The callback routine must be set when creating an endpoint. * The user's private data should be placed inside the priv_data buffer to be - * sent to the server side. + * sent to the remote side. * The maximal allowed length of the private data is indicated by the field - * max_conn_priv inside @ref uct_iface_attr. + * max_conn_priv inside @ref uct_iface_attr or inside @ref uct_cm_attr when using a + * connection manager. 
* Communication progress routines should not be called from this callback. * It is allowed to call other UCT communication routines from this callback. * - * @param [in] arg User defined argument for this callback. - * @param [in] dev_name Device name. This routine may fill the user's private - * data according to the given device name. - * The device name that is passed to this routine, - * corresponds to the dev_name field inside - * @ref uct_tl_resource_desc_t as returned from - * @ref uct_md_query_tl_resources. - * @param [out] priv_data User's private data to be passed to the server side. + * @param [in] arg User defined argument for this callback. + * @param [in] pack_args Handle for the the private data packing. + * @param [out] priv_data User's private data to be passed to the remote side. * * @return Negative value indicates an error according to @ref ucs_status_t. - * On success, non-negative value indicates actual number of + * On success, a non-negative value indicates actual number of * bytes written to the @a priv_data buffer. */ -typedef ssize_t (*uct_sockaddr_priv_pack_callback_t)(void *arg, - const char *dev_name, - void *priv_data); +typedef ssize_t +(*uct_cm_ep_priv_data_pack_callback_t)(void *arg, + const uct_cm_ep_priv_data_pack_args_t + *pack_args, void *priv_data); /** @@ -345,26 +724,44 @@ typedef ssize_t (*uct_sockaddr_priv_pack_callback_t)(void *arg, * * @note It is allowed to call other communication routines from the callback. * - * @param [in] arg User-defined argument - * @param [in] data Points to the received unexpected data. - * @param [in] length Length of data. - * @param [in] desc Points to the received descriptor, at the beginning of - * the user-defined rx_headroom. - * @param [in] stag Tag from sender. - * @param [in] imm Immediate data from sender. 
- * - * @warning If the user became the owner of the @a desc (by returning - * @ref UCS_INPROGRESS) the descriptor must be released later by - * @ref uct_iface_release_desc by the user. - * - * @retval UCS_OK - descriptor was consumed, and can be released - * by the caller. - * @retval UCS_INPROGRESS - descriptor is owned by the callee, and would be - * released later. + * @param [in] arg User-defined argument + * @param [in] data Points to the received unexpected data. + * @param [in] length Length of data. + * @param [in] flags Mask with @ref uct_cb_param_flags flags. If it + * contains @ref UCT_CB_PARAM_FLAG_DESC value, this means + * @a data is part of a descriptor which must be released + * later using @ref uct_iface_release_desc by the user if + * the callback returns @ref UCS_INPROGRESS. + * @param [in] stag Tag from sender. + * @param [in] imm Immediate data from sender. + * + * @param [inout] context Storage for a per-message user-defined context. In + * this context, the message is defined by the sender + * side as a single call to uct_ep_tag_eager_short/bcopy/zcopy. + * On the transport level the message can be fragmented + * and delivered to the target over multiple fragments. + * The fragments will preserve the original order of the + * message. Each fragment will result in invocation of + * the above callback. The user can use + * UCT_CB_PARAM_FLAG_FIRST to identify the first fragment, + * allocate the context object and use the context as a + * token that is set by the user and passed to subsequent + * callbacks of the same message. The user is responsible + * for allocation and release of the context. + * + * @note No need to allocate the context in the case of a single fragment message + * (i.e. @a flags contains @ref UCT_CB_PARAM_FLAG_FIRST, but does not + * contain @ref UCT_CB_PARAM_FLAG_MORE). + * + * @retval UCS_OK - data descriptor was consumed, and can be released + * by the caller. 
+ * @retval UCS_INPROGRESS - data descriptor is owned by the callee, and will be + * released later. */ typedef ucs_status_t (*uct_tag_unexp_eager_cb_t)(void *arg, void *data, size_t length, unsigned flags, - uct_tag_t stag, uint64_t imm); + uct_tag_t stag, uint64_t imm, + void **context); /** @@ -405,4 +802,15 @@ typedef ucs_status_t (*uct_tag_unexp_rndv_cb_t)(void *arg, unsigned flags, const void *rkey_buf); +/** + * @ingroup UCT_RESOURCE + * @brief Callback to process asynchronous events. + * + * @param [in] arg User argument to be passed to the callback. + * @param [in] flags Flags to be passed to the callback (reserved for + * future use). + */ +typedef void (*uct_async_event_cb_t)(void *arg, unsigned flags); + + #endif diff --git a/src/uct/base/uct_cm.c b/src/uct/base/uct_cm.c new file mode 100644 index 00000000000..708b2308cd9 --- /dev/null +++ b/src/uct/base/uct_cm.c @@ -0,0 +1,267 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "uct_cm.h" + +#include +#include +#include + + +ucs_config_field_t uct_cm_config_table[] = { + {NULL} +}; + +ucs_status_t uct_cm_open(uct_component_h component, uct_worker_h worker, + const uct_cm_config_t *config, uct_cm_h *cm_p) +{ + return component->cm_open(component, worker, config, cm_p); +} + +void uct_cm_close(uct_cm_h cm) +{ + cm->ops->close(cm); +} + +ucs_status_t uct_cm_query(uct_cm_h cm, uct_cm_attr_t *cm_attr) +{ + return cm->ops->cm_query(cm, cm_attr); +} + +ucs_status_t uct_cm_config_read(uct_component_h component, + const char *env_prefix, const char *filename, + uct_cm_config_t **config_p) +{ + uct_config_bundle_t *bundle = NULL; + ucs_status_t status; + + status = uct_config_read(&bundle, component->cm_config.table, + component->cm_config.size, env_prefix, + component->cm_config.prefix); + if (status != UCS_OK) { + ucs_error("failed to read CM configuration"); + return status; + } + + *config_p = (uct_cm_config_t*) bundle->data; + /* coverity[leaked_storage] */ + return UCS_OK; +} + +ucs_status_t uct_cm_ep_pack_cb(uct_cm_base_ep_t *cep, void *arg, + const uct_cm_ep_priv_data_pack_args_t *pack_args, + void *priv_data, size_t priv_data_max, + size_t *priv_data_ret) +{ + ucs_status_t status = UCS_OK; + ssize_t ret; + + ret = cep->priv_pack_cb(arg, pack_args, priv_data); + if (ret < 0) { + ucs_assert(ret > UCS_ERR_LAST); + status = (ucs_status_t)ret; + ucs_error("private data pack function failed with error: %s", + ucs_status_string(status)); + goto out; + } else if (ret > priv_data_max) { + status = UCS_ERR_EXCEEDS_LIMIT; + ucs_error("private data pack function returned %zd (max: %zu)", + ret, priv_data_max); + goto out; + } + + *priv_data_ret = ret; +out: + return status; +} + +void uct_cm_ep_disconnect_cb(uct_cm_base_ep_t *cep) +{ + cep->disconnect_cb(&cep->super.super, cep->user_data); +} + +void uct_cm_ep_client_connect_cb(uct_cm_base_ep_t *cep, + uct_cm_remote_data_t 
*remote_data, + ucs_status_t status) +{ + uct_cm_ep_client_connect_args_t connect_args; + + connect_args.field_mask = UCT_CM_EP_CLIENT_CONNECT_ARGS_FIELD_REMOTE_DATA | + UCT_CM_EP_CLIENT_CONNECT_ARGS_FIELD_STATUS; + connect_args.remote_data = remote_data; + connect_args.status = status; + + cep->client.connect_cb(&cep->super.super, cep->user_data, &connect_args); +} + +void uct_cm_ep_server_conn_notify_cb(uct_cm_base_ep_t *cep, ucs_status_t status) +{ + uct_cm_ep_server_conn_notify_args_t notify_args; + + notify_args.field_mask = UCT_CM_EP_SERVER_CONN_NOTIFY_ARGS_FIELD_STATUS; + notify_args.status = status; + + cep->server.notify_cb(&cep->super.super, cep->user_data, ¬ify_args); +} + +static ucs_status_t uct_cm_check_ep_params(const uct_ep_params_t *params) +{ + if (!(params->field_mask & UCT_EP_PARAM_FIELD_CM)) { + ucs_error("UCT_EP_PARAM_FIELD_CM is not set. field_mask 0x%lx", + params->field_mask); + return UCS_ERR_INVALID_PARAM; + } + + if (!(params->field_mask & UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS) || + !(params->sockaddr_cb_flags & UCT_CB_FLAG_ASYNC)) { + ucs_error("UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS and UCT_CB_FLAG_ASYNC " + "should be set. 
field_mask 0x%lx, sockaddr_cb_flags 0x%x", + params->field_mask, params->sockaddr_cb_flags); + return UCS_ERR_UNSUPPORTED; + } + + return UCS_OK; +} + +ucs_status_t uct_cm_set_common_data(uct_cm_base_ep_t *ep, + const uct_ep_params_t *params) +{ + ucs_status_t status; + + status = uct_cm_check_ep_params(params); + if (status != UCS_OK) { + return status; + } + + status = UCT_CM_SET_CB(params, UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB, + ep->priv_pack_cb, params->sockaddr_pack_cb, + uct_cm_ep_priv_data_pack_callback_t, + ucs_empty_function_return_invalid_param); + if (status != UCS_OK) { + return status; + } + + status = UCT_CM_SET_CB(params, UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB, + ep->disconnect_cb, params->disconnect_cb, + uct_ep_disconnect_cb_t, ucs_empty_function); + if (status != UCS_OK) { + return status; + } + + ep->user_data = (params->field_mask & UCT_EP_PARAM_FIELD_USER_DATA) ? + params->user_data : NULL; + + return UCS_OK; +} + +UCS_CLASS_INIT_FUNC(uct_cm_base_ep_t, const uct_ep_params_t *params) +{ + ucs_status_t status; + + UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, ¶ms->cm->iface); + + status = uct_cm_set_common_data(self, params); + if (status != UCS_OK) { + return status; + } + + return UCS_OK; +} + +UCS_CLASS_CLEANUP_FUNC(uct_cm_base_ep_t){} + +UCS_CLASS_DEFINE(uct_cm_base_ep_t, uct_base_ep_t); +UCS_CLASS_DEFINE_NEW_FUNC(uct_cm_base_ep_t, uct_base_ep_t, const uct_ep_params_t *); +UCS_CLASS_DEFINE_DELETE_FUNC(uct_cm_base_ep_t, uct_base_ep_t); + + +UCS_CLASS_INIT_FUNC(uct_listener_t, uct_cm_h cm) +{ + self->cm = cm; + return UCS_OK; +} + +UCS_CLASS_CLEANUP_FUNC(uct_listener_t){} + +UCS_CLASS_DEFINE(uct_listener_t, void); +UCS_CLASS_DEFINE_NEW_FUNC(uct_listener_t, void, uct_cm_h); +UCS_CLASS_DEFINE_DELETE_FUNC(uct_listener_t, void); + +ucs_status_t uct_listener_create(uct_cm_h cm, const struct sockaddr *saddr, + socklen_t socklen, const uct_listener_params_t *params, + uct_listener_h *listener_p) +{ + if (!(params->field_mask & 
UCT_LISTENER_PARAM_FIELD_CONN_REQUEST_CB)) { + return UCS_ERR_INVALID_PARAM; + } + + return cm->ops->listener_create(cm, saddr, socklen, params, listener_p); +} + +void uct_listener_destroy(uct_listener_h listener) +{ + listener->cm->ops->listener_destroy(listener); +} + +ucs_status_t uct_listener_query(uct_listener_h listener, + uct_listener_attr_t *listener_attr) +{ + return listener->cm->ops->listener_query(listener, listener_attr); +} + +ucs_status_t uct_listener_reject(uct_listener_h listener, + uct_conn_request_h conn_request) +{ + return listener->cm->ops->listener_reject(listener, conn_request); +} + + +#ifdef ENABLE_STATS +static ucs_stats_class_t uct_cm_stats_class = { + .name = "rdmacm_cm", + .num_counters = 0 +}; +#endif + +UCS_CLASS_INIT_FUNC(uct_cm_t, uct_cm_ops_t* ops, uct_iface_ops_t* iface_ops, + uct_worker_h worker, uct_component_h component) +{ + self->ops = ops; + self->component = component; + self->iface.super.ops = *iface_ops; + self->iface.worker = ucs_derived_of(worker, uct_priv_worker_t); + + self->iface.md = NULL; + self->iface.am->arg = NULL; + self->iface.am->flags = 0; + self->iface.am->cb = (uct_am_callback_t)ucs_empty_function_return_unsupported; + self->iface.am_tracer = NULL; + self->iface.am_tracer_arg = NULL; + self->iface.err_handler = NULL; + self->iface.err_handler_arg = NULL; + self->iface.err_handler_flags = 0; + self->iface.prog.id = UCS_CALLBACKQ_ID_NULL; + self->iface.prog.refcount = 0; + self->iface.progress_flags = 0; + + return UCS_STATS_NODE_ALLOC(&self->iface.stats, &uct_cm_stats_class, + ucs_stats_get_root(), "%s-%p", "iface", + self->iface); +} + +UCS_CLASS_CLEANUP_FUNC(uct_cm_t) +{ + UCS_STATS_NODE_FREE(self->iface.stats); +} + +UCS_CLASS_DEFINE(uct_cm_t, void); +UCS_CLASS_DEFINE_NEW_FUNC(uct_cm_t, void, uct_cm_ops_t*, uct_iface_ops_t*, + uct_worker_h, uct_component_h); +UCS_CLASS_DEFINE_DELETE_FUNC(uct_cm_t, void); diff --git a/src/uct/base/uct_cm.h b/src/uct/base/uct_cm.h new file mode 100644 index 
00000000000..9544597465d --- /dev/null +++ b/src/uct/base/uct_cm.h @@ -0,0 +1,139 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifndef UCT_CM_H_ +#define UCT_CM_H_ + +#include +#include +#include +#include + + +UCS_CLASS_DECLARE(uct_listener_t, uct_cm_h); + +#define UCT_CM_SET_CB(_params, _flag, _cep_cb, _params_cb, _function_type, _default_cb) \ + ({ \ + ucs_status_t _status; \ + \ + if (!((_params)->field_mask & (_flag))) { \ + (_cep_cb) = (_function_type) (_default_cb); \ + _status = UCS_OK; \ + } else if ((_params_cb) == NULL) { \ + ucs_error(UCS_PP_MAKE_STRING(_flag) " is set but the callback is NULL"); \ + _status = UCS_ERR_INVALID_PARAM; \ + } else { \ + (_cep_cb) = (_params_cb); \ + _status = UCS_OK; \ + } \ + \ + (_status); \ + }) + +/** + * "Base" structure which defines CM configuration options. + * Specific CMs extend this structure. + */ +struct uct_cm_config { + /* C standard prohibits empty structures */ + char __dummy; +}; + +/** + * Connection manager component operations + */ +typedef struct uct_cm_ops { + void (*close)(uct_cm_h cm); + ucs_status_t (*cm_query)(uct_cm_h cm, uct_cm_attr_t *cm_attr); + ucs_status_t (*listener_create)(uct_cm_h cm, const struct sockaddr *saddr, + socklen_t socklen, + const uct_listener_params_t *params, + uct_listener_h *listener_p); + ucs_status_t (*listener_reject)(uct_listener_h listener, + uct_conn_request_h conn_request); + ucs_status_t (*listener_query) (uct_listener_h listener, + uct_listener_attr_t *listener_attr); + void (*listener_destroy)(uct_listener_h listener); + ucs_status_t (*ep_create)(const uct_ep_params_t *params, uct_ep_h *ep_p); +} uct_cm_ops_t; + + +struct uct_cm { + uct_cm_ops_t *ops; + uct_component_h component; + uct_base_iface_t iface; +}; + + +/** + * Connection manager base endpoint + */ +typedef struct uct_cm_base_ep { + uct_base_ep_t super; + + /* User data associated with the endpoint */ + void *user_data; + + /* 
Callback to handle the disconnection of the remote peer */ + uct_ep_disconnect_cb_t disconnect_cb; + + /* Callback to fill the user's private data */ + uct_cm_ep_priv_data_pack_callback_t priv_pack_cb; + + union { + struct { + /* On the client side - callback to process an incoming + * connection response from the server */ + uct_cm_ep_client_connect_callback_t connect_cb; + } client; + struct { + /* On the server side - callback to process an incoming connection + * establishment notification from the client */ + uct_cm_ep_server_conn_notify_callback_t notify_cb; + } server; + }; +} uct_cm_base_ep_t; + + +UCS_CLASS_DECLARE(uct_cm_base_ep_t, const uct_ep_params_t *); +UCS_CLASS_DECLARE_NEW_FUNC(uct_cm_base_ep_t, uct_base_ep_t, const uct_ep_params_t *); +UCS_CLASS_DECLARE_DELETE_FUNC(uct_cm_base_ep_t, uct_base_ep_t); + + +extern ucs_config_field_t uct_cm_config_table[]; + +UCS_CLASS_DECLARE(uct_cm_t, uct_cm_ops_t*, uct_iface_ops_t*, uct_worker_h, + uct_component_h); + +ucs_status_t uct_cm_set_common_data(uct_cm_base_ep_t *ep, + const uct_ep_params_t *params); + +ucs_status_t uct_cm_ep_pack_cb(uct_cm_base_ep_t *cep, void *arg, + const uct_cm_ep_priv_data_pack_args_t *pack_args, + void *priv_data, size_t priv_data_max, + size_t *priv_data_ret); + +ucs_status_t uct_cm_ep_set_pack_cb(const uct_ep_params_t *params, + uct_cm_base_ep_t *cep); + +ucs_status_t uct_cm_ep_set_disconnect_cb(const uct_ep_params_t *params, + uct_cm_base_ep_t *cep); + +ucs_status_t uct_cm_ep_client_set_connect_cb(const uct_ep_params_t *params, + uct_cm_base_ep_t *cep); + +ucs_status_t uct_cm_ep_server_set_notify_cb(const uct_ep_params_t *params, + uct_cm_base_ep_t *cep); + +void uct_cm_ep_disconnect_cb(uct_cm_base_ep_t *cep); + +void uct_cm_ep_client_connect_cb(uct_cm_base_ep_t *cep, + uct_cm_remote_data_t *remote_data, + ucs_status_t status); + +void uct_cm_ep_server_conn_notify_cb(uct_cm_base_ep_t *cep, ucs_status_t status); + +#endif /* UCT_CM_H_ */ diff --git a/src/uct/base/uct_component.c 
b/src/uct/base/uct_component.c new file mode 100644 index 00000000000..549b18fbde0 --- /dev/null +++ b/src/uct/base/uct_component.c @@ -0,0 +1,140 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "uct_component.h" + +#include +#include +#include +#include +#include +#include + + +UCS_LIST_HEAD(uct_components_list); + +ucs_status_t uct_query_components(uct_component_h **components_p, + unsigned *num_components_p) +{ + UCS_MODULE_FRAMEWORK_DECLARE(uct); + uct_component_h *components; + uct_component_t *component; + size_t num_components; + + UCS_MODULE_FRAMEWORK_LOAD(uct, 0); + num_components = ucs_list_length(&uct_components_list); + components = ucs_malloc(num_components * sizeof(*components), + "uct_components"); + if (components == NULL) { + return UCS_ERR_NO_MEMORY; + } + + ucs_assert(num_components < UINT_MAX); + *num_components_p = num_components; + *components_p = components; + + ucs_list_for_each(component, &uct_components_list, list) { + *(components++) = component; + } + + return UCS_OK; +} + +void uct_release_component_list(uct_component_h *components) +{ + ucs_free(components); +} + +ucs_status_t uct_component_query(uct_component_h component, + uct_component_attr_t *component_attr) +{ + uct_md_resource_desc_t *resources = NULL; + unsigned num_resources = 0; + ucs_status_t status; + + if (component_attr->field_mask & (UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT| + UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES)) { + status = component->query_md_resources(component, &resources, + &num_resources); + if (status != UCS_OK) { + return status; + } + + ucs_assertv((num_resources == 0) || (resources != NULL), + "component=%s", component->name); + } + + if (component_attr->field_mask & UCT_COMPONENT_ATTR_FIELD_NAME) { + ucs_snprintf_zero(component_attr->name, sizeof(component_attr->name), + "%s", component->name); + } + + if 
(component_attr->field_mask & UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT) { + component_attr->md_resource_count = num_resources; + + } + + if ((resources != NULL) && + (component_attr->field_mask & UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES)) + { + memcpy(component_attr->md_resources, resources, + sizeof(uct_md_resource_desc_t) * num_resources); + } + + if (component_attr->field_mask & UCT_COMPONENT_ATTR_FIELD_FLAGS) { + component_attr->flags = component->flags; + } + + ucs_free(resources); + return UCS_OK; +} + +ucs_status_t uct_config_read(uct_config_bundle_t **bundle, + ucs_config_field_t *config_table, + size_t config_size, const char *env_prefix, + const char *cfg_prefix) +{ + char full_prefix[128] = UCS_DEFAULT_ENV_PREFIX; + uct_config_bundle_t *config_bundle; + ucs_status_t status; + + config_bundle = ucs_calloc(1, sizeof(*config_bundle) + config_size, "uct_config"); + if (config_bundle == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err; + } + + /* TODO use env_prefix */ + if ((env_prefix != NULL) && (strlen(env_prefix) > 0)) { + ucs_snprintf_zero(full_prefix, sizeof(full_prefix), "%s_%s", + env_prefix, UCS_DEFAULT_ENV_PREFIX); + } + + status = ucs_config_parser_fill_opts(config_bundle->data, config_table, + full_prefix, cfg_prefix, 0); + if (status != UCS_OK) { + goto err_free_bundle; + } + + config_bundle->table = config_table; + config_bundle->table_prefix = ucs_strdup(cfg_prefix, "uct_config"); + if (config_bundle->table_prefix == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err_free_bundle; + } + + *bundle = config_bundle; + return UCS_OK; + +err_free_bundle: + ucs_free(config_bundle); +err: + return status; +} diff --git a/src/uct/base/uct_component.h b/src/uct/base/uct_component.h new file mode 100644 index 00000000000..7393f7702f5 --- /dev/null +++ b/src/uct/base/uct_component.h @@ -0,0 +1,184 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + + +#ifndef UCT_COMPONENT_H_ +#define UCT_COMPONENT_H_ + +#include +#include +#include + + +/* Forward declaration */ +typedef struct uct_component uct_component_t; + + +/** + * Keeps information about allocated configuration structure, to be used when + * releasing the options. + */ +typedef struct uct_config_bundle { + ucs_config_field_t *table; + const char *table_prefix; + char data[]; +} uct_config_bundle_t; + + +/** + * Component method to query component memory domain resources. + * + * @param [in] component Query memory domain resources for this + * component. + * @param [out] resources_p Filled with a pointer to an array of + * memory domain resources, which should be + * released with ucs_free(). + * @param [out] num_resources_p Filled with the number of memory domain + * resource entries in the array. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_component_query_md_resources_func_t)( + uct_component_t *component, uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p); + + +/** + * Component method to open a memory domain. + * + * @param [in] component Open memory domain resources on this + * component. + * @param [in] md_name Name of the memory domain to open, as + * returned by + * @ref uct_component_query_resources_func_t + * @param [in] config Memory domain configuration. + * @param [out] md_p Handle to the opened memory domain. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_component_md_open_func_t)( + uct_component_t *component, const char *md_name, + const uct_md_config_t *config, uct_md_h *md_p); + + +/** + * Component method to open a client/server connection manager. + * + * @param [in] component Open a connection manager on this + * component. + * @param [in] worker Open the connection manager on this worker. + * @param [in] config Connection manager configuration. 
+ * @param [out] cm_p Filled with a handle to the connection manager. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_component_cm_open_func_t)( + uct_component_t *component, uct_worker_h worker, + const uct_cm_config_t *config, uct_cm_h *cm_p); + + +/** + * Component method to unpack a remote key buffer into a remote key object. + * + * @param [in] component Unpack the remote key buffer on this + * component. + * @param [in] rkey_buffer Remote key buffer to unpack. + * @param [in] config Memory domain configuration. + * @param [out] rkey_p Filled with a pointer to the unpacked + * remote key. + * @param [out] handle_p Filled with an additional handle which + * is used to release the remote key, but + * is not required for remote memory + * access operations. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_component_rkey_unpack_func_t)( + uct_component_t *component, const void *rkey_buffer, + uct_rkey_t *rkey_p, void **handle_p); + + +/** + * Component method to obtain a locally accessible pointer to a remote key. + * + * @param [in] component Get remote key memory pointer on this + * component. + * @param [in] rkey Obtain the pointer for this remote key. + * @param [in] handle Remote key handle, as returned from + * @ref uct_component_rkey_unpack_func_t. + * @param [in] remote_addr Remote address to obtain the pointer for. + * @param [out] local_addr_p Filled with the local access pointer. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_component_rkey_ptr_func_t)( + uct_component_t *component, uct_rkey_t rkey, void *handle, + uint64_t remote_addr, void **local_addr_p); + + +/** + * Component method to release an unpacked remote key. + * + * @param [in] component Release the remote key of this + * component. + * @param [in] rkey Release this remote key. 
+ * @param [in] handle Remote key handle, as returned from + * @ref uct_component_rkey_unpack_func_t. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_component_rkey_release_func_t)( + uct_component_t *component, uct_rkey_t rkey, void *handle); + + +/** + * Defines a UCT component + */ +struct uct_component { + const char name[UCT_COMPONENT_NAME_MAX]; /**< Component name */ + uct_component_query_md_resources_func_t query_md_resources; /**< Query memory domain resources method */ + uct_component_md_open_func_t md_open; /**< Memory domain open method */ + uct_component_cm_open_func_t cm_open; /**< Connection manager open method */ + uct_component_rkey_unpack_func_t rkey_unpack; /**< Remote key unpack method */ + uct_component_rkey_ptr_func_t rkey_ptr; /**< Remote key access pointer method */ + uct_component_rkey_release_func_t rkey_release; /**< Remote key release method */ + ucs_config_global_list_entry_t md_config; /**< MD configuration entry */ + ucs_config_global_list_entry_t cm_config; /**< CM configuration entry */ + ucs_list_link_t tl_list; /**< List of transports */ + ucs_list_link_t list; /**< Entry in global list of components */ + uint64_t flags; /**< Flags as defined by + UCT_COMPONENT_FLAG_xx */ +}; + + +/** + * Register a component for usage, so it will be returned from + * @ref uct_query_components. + * + * @param [in] _component Pointer to a global component structure to register. + */ +#define UCT_COMPONENT_REGISTER(_component) \ + extern ucs_list_link_t uct_components_list; \ + UCS_STATIC_INIT { \ + ucs_list_add_tail(&uct_components_list, &(_component)->list); \ + } \ + UCS_CONFIG_REGISTER_TABLE_ENTRY(&(_component)->md_config); \ + UCS_CONFIG_REGISTER_TABLE_ENTRY(&(_component)->cm_config); \ + + +/** + * Helper macro to initialize component's transport list head. 
+ */ +#define UCT_COMPONENT_TL_LIST_INITIALIZER(_component) \ + UCS_LIST_INITIALIZER(&(_component)->tl_list, &(_component)->tl_list) + + +ucs_status_t uct_config_read(uct_config_bundle_t **bundle, + ucs_config_field_t *config_table, + size_t config_size, const char *env_prefix, + const char *cfg_prefix); + +#endif diff --git a/src/uct/base/uct_iface.c b/src/uct/base/uct_iface.c index 4c5bad22f2a..48923084395 100644 --- a/src/uct/base/uct_iface.c +++ b/src/uct/base/uct_iface.c @@ -5,15 +5,21 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "uct_iface.h" -#include "uct_md.h" +#include "uct_cm.h" #include #include +#include #include +#include -#if ENABLE_STATS +#ifdef ENABLE_STATS static ucs_stats_class_t uct_ep_stats_class = { .name = "uct_ep", .num_counters = UCT_EP_STAT_LAST, @@ -54,8 +60,15 @@ static ucs_stats_class_t uct_iface_stats_class = { static ucs_status_t uct_iface_stub_am_handler(void *arg, void *data, size_t length, unsigned flags) { - uint8_t id = (uintptr_t)arg; + const size_t dump_len = 64; + uint8_t id = (uintptr_t)arg; + char dump_str[(dump_len * 4) + 1]; /* 1234:5678\n\0 */ + ucs_warn("got active message id %d, but no handler installed", id); + ucs_warn("payload %zu of %zu bytes:\n%s", ucs_min(length, dump_len), length, + ucs_str_dump_hex(data, ucs_min(length, dump_len), + dump_str, sizeof(dump_str), 16)); + ucs_log_print_backtrace(UCS_LOG_LEVEL_WARN); return UCS_OK; } @@ -327,35 +340,35 @@ ucs_status_t uct_set_ep_failed(ucs_class_t *cls, uct_ep_h tl_ep, * Failed ep will use that queue for purge. 
*/ uct_ep_pending_purge(tl_ep, uct_ep_failed_purge_cb, &f_iface->pend_q); - ops->ep_put_short = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_put_bcopy = (void*)ucs_empty_function_return_bc_ep_timeout; - ops->ep_put_zcopy = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_get_short = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_get_bcopy = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_get_zcopy = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_am_short = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_am_bcopy = (void*)ucs_empty_function_return_bc_ep_timeout; - ops->ep_am_zcopy = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_atomic_cswap64 = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_atomic_cswap32 = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_atomic64_post = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_atomic32_post = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_atomic64_fetch = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_atomic32_fetch = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_tag_eager_short = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_tag_eager_bcopy = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_tag_eager_zcopy = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_tag_rndv_zcopy = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_tag_rndv_cancel = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_tag_rndv_request= (void*)ucs_empty_function_return_ep_timeout; - ops->ep_pending_add = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_pending_purge = uct_ep_failed_purge; - ops->ep_flush = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_fence = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_check = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_connect_to_ep = (void*)ucs_empty_function_return_ep_timeout; - ops->ep_destroy = uct_ep_failed_destroy; - ops->ep_get_address = 
(void*)ucs_empty_function_return_ep_timeout; + ops->ep_put_short = (uct_ep_put_short_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_put_bcopy = (uct_ep_put_bcopy_func_t)ucs_empty_function_return_bc_ep_timeout; + ops->ep_put_zcopy = (uct_ep_put_zcopy_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_get_short = (uct_ep_get_short_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_get_bcopy = (uct_ep_get_bcopy_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_get_zcopy = (uct_ep_get_zcopy_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_am_short = (uct_ep_am_short_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_am_bcopy = (uct_ep_am_bcopy_func_t)ucs_empty_function_return_bc_ep_timeout; + ops->ep_am_zcopy = (uct_ep_am_zcopy_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_atomic_cswap64 = (uct_ep_atomic_cswap64_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_atomic_cswap32 = (uct_ep_atomic_cswap32_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_atomic64_post = (uct_ep_atomic64_post_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_atomic32_post = (uct_ep_atomic32_post_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_atomic64_fetch = (uct_ep_atomic64_fetch_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_atomic32_fetch = (uct_ep_atomic32_fetch_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_tag_eager_short = (uct_ep_tag_eager_short_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_tag_eager_bcopy = (uct_ep_tag_eager_bcopy_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_tag_eager_zcopy = (uct_ep_tag_eager_zcopy_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_tag_rndv_zcopy = (uct_ep_tag_rndv_zcopy_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_tag_rndv_cancel = (uct_ep_tag_rndv_cancel_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_tag_rndv_request = (uct_ep_tag_rndv_request_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_pending_add = 
(uct_ep_pending_add_func_t)ucs_empty_function_return_busy; + ops->ep_pending_purge = uct_ep_failed_purge; + ops->ep_flush = (uct_ep_flush_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_fence = (uct_ep_fence_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_check = (uct_ep_check_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_connect_to_ep = (uct_ep_connect_to_ep_func_t)ucs_empty_function_return_ep_timeout; + ops->ep_destroy = uct_ep_failed_destroy; + ops->ep_get_address = (uct_ep_get_address_func_t)ucs_empty_function_return_ep_timeout; ucs_class_call_cleanup_chain(cls, tl_ep, -1); @@ -363,14 +376,48 @@ ucs_status_t uct_set_ep_failed(ucs_class_t *cls, uct_ep_h tl_ep, if (iface->err_handler) { return iface->err_handler(iface->err_handler_arg, tl_ep, status); + } else if (status == UCS_ERR_CANCELED) { + ucs_debug("error %s was suppressed for ep %p", + ucs_status_string(UCS_ERR_CANCELED), tl_ep); + /* Suppress this since the cancellation is initiated by user. */ + status = UCS_OK; + } else { + ucs_debug("error %s was not handled for ep %p", + ucs_status_string(status), tl_ep); } - ucs_debug("error %s was not handled for ep %p", ucs_status_string(status), - tl_ep); - return status; } +void uct_base_iface_query(uct_base_iface_t *iface, uct_iface_attr_t *iface_attr) +{ + memset(iface_attr, 0, sizeof(*iface_attr)); + + iface_attr->max_num_eps = iface->config.max_num_eps; + iface_attr->dev_num_paths = 1; +} + +ucs_status_t uct_single_device_resource(uct_md_h md, const char *dev_name, + uct_device_type_t dev_type, + uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) +{ + uct_tl_device_resource_t *device; + + device = ucs_calloc(1, sizeof(*device), "device resource"); + if (NULL == device) { + ucs_error("failed to allocate device resource"); + return UCS_ERR_NO_MEMORY; + } + + ucs_snprintf_zero(device->name, sizeof(device->name), "%s", dev_name); + device->type = dev_type; + + *num_tl_devices_p = 1; + *tl_devices_p = device; + return 
UCS_OK; +} + UCS_CLASS_INIT_FUNC(uct_iface_t, uct_iface_ops_t *ops) { ucs_assert_always(ops->ep_flush != NULL); @@ -437,7 +484,7 @@ UCS_CLASS_INIT_FUNC(uct_base_iface_t, uct_iface_ops_t *ops, uct_md_h md, /* Copy allocation methods configuration. In the process, remove duplicates. */ UCS_STATIC_ASSERT(sizeof(alloc_methods_bitmap) * 8 >= UCT_ALLOC_METHOD_LAST); self->config.num_alloc_methods = 0; - alloc_methods_bitmap = 0; + alloc_methods_bitmap = 0; for (i = 0; i < config->alloc_methods.count; ++i) { method = config->alloc_methods.methods[i]; if (alloc_methods_bitmap & UCS_BIT(method)) { @@ -449,7 +496,8 @@ UCS_CLASS_INIT_FUNC(uct_base_iface_t, uct_iface_ops_t *ops, uct_md_h md, alloc_methods_bitmap |= UCS_BIT(method); } - self->config.failure_level = config->failure; + self->config.failure_level = (ucs_log_level_t)config->failure; + self->config.max_num_eps = config->max_num_eps; return UCS_STATS_NODE_ALLOC(&self->stats, &uct_iface_stats_class, stats_parent, "-%s-%p", iface_name, self); @@ -479,11 +527,18 @@ ucs_status_t uct_iface_reject(uct_iface_h iface, ucs_status_t uct_ep_create(const uct_ep_params_t *params, uct_ep_h *ep_p) { - if (!(params->field_mask & UCT_EP_PARAM_FIELD_IFACE)) { - return UCS_ERR_INVALID_PARAM; + if (params->field_mask & UCT_EP_PARAM_FIELD_IFACE) { + return params->iface->ops.ep_create(params, ep_p); + } else if (params->field_mask & UCT_EP_PARAM_FIELD_CM) { + return params->cm->ops->ep_create(params, ep_p); } - return params->iface->ops.ep_create(params, ep_p); + return UCS_ERR_INVALID_PARAM; +} + +ucs_status_t uct_ep_disconnect(uct_ep_h ep, unsigned flags) +{ + return ep->iface->ops.ep_disconnect(ep, flags); } void uct_ep_destroy(uct_ep_h ep) @@ -502,6 +557,11 @@ ucs_status_t uct_ep_connect_to_ep(uct_ep_h ep, const uct_device_addr_t *dev_addr return ep->iface->ops.ep_connect_to_ep(ep, dev_addr, ep_addr); } +ucs_status_t uct_cm_client_ep_conn_notify(uct_ep_h ep) +{ + return ep->iface->ops.cm_ep_conn_notify(ep); +} + 
UCS_CLASS_INIT_FUNC(uct_ep_t, uct_iface_t *iface) { self->iface = iface; @@ -514,7 +574,6 @@ UCS_CLASS_CLEANUP_FUNC(uct_ep_t) UCS_CLASS_DEFINE(uct_ep_t, void); - UCS_CLASS_INIT_FUNC(uct_base_ep_t, uct_base_iface_t *iface) { UCS_CLASS_CALL_SUPER_INIT(uct_ep_t, &iface->super); @@ -535,15 +594,15 @@ UCS_CONFIG_DEFINE_ARRAY(alloc_methods, sizeof(uct_alloc_method_t), UCS_CONFIG_TYPE_ENUM(uct_alloc_method_names)); ucs_config_field_t uct_iface_config_table[] = { - {"MAX_SHORT", "128", - "Maximal size of short sends. The transport is allowed to support any size up\n" - "to this limit, the actual size can be lower due to transport constraints.", - ucs_offsetof(uct_iface_config_t, max_short), UCS_CONFIG_TYPE_MEMUNITS}, + {"MAX_SHORT", "", + "The configuration parameter replaced by: " + "UCX__TX_MIN_INLINE for IB, UCX_MM_FIFO_SIZE for MM", + UCS_CONFIG_DEPRECATED_FIELD_OFFSET, UCS_CONFIG_TYPE_DEPRECATED}, - {"MAX_BCOPY", "8192", - "Maximal size of copy-out sends. The transport is allowed to support any size\n" - "up to this limit, the actual size can be lower due to transport constraints.", - ucs_offsetof(uct_iface_config_t, max_bcopy), UCS_CONFIG_TYPE_MEMUNITS}, + {"MAX_BCOPY", "", + "The configuration parameter replaced by: " + "UCX__SEG_SIZE where is one of: IB, MM, SELF, TCP", + UCS_CONFIG_DEPRECATED_FIELD_OFFSET, UCS_CONFIG_TYPE_DEPRECATED}, {"ALLOC", "huge,thp,md,mmap,heap", "Priority of methods to allocate intermediate buffers for communication", @@ -553,5 +612,9 @@ ucs_config_field_t uct_iface_config_table[] = { "Level of network failure reporting", ucs_offsetof(uct_iface_config_t, failure), UCS_CONFIG_TYPE_ENUM(ucs_log_level_names)}, + {"MAX_NUM_EPS", "inf", + "Maximum number of endpoints that the transport interface is able to create", + ucs_offsetof(uct_iface_config_t, max_num_eps), UCS_CONFIG_TYPE_ULUNITS}, + {NULL} }; diff --git a/src/uct/base/uct_iface.h b/src/uct/base/uct_iface.h index d02df20e0a1..976a4529e27 100644 --- a/src/uct/base/uct_iface.h +++ 
b/src/uct/base/uct_iface.h @@ -10,12 +10,14 @@ #include "uct_worker.h" #include +#include #include #include #include #include #include #include +#include #include #include @@ -120,6 +122,10 @@ enum { "UCT_EP_PARAM_FIELD_DEV_ADDR and UCT_EP_PARAM_FIELD_IFACE_ADDR are not defined") +#define UCT_EP_PARAMS_GET_PATH_INDEX(_params) \ + (((_params)->field_mask & UCT_EP_PARAM_FIELD_PATH_INDEX) ? \ + (_params)->path_index : 0) + /** * Check the condition and return status as a pointer if not true. */ @@ -214,9 +220,10 @@ typedef struct uct_base_iface { unsigned num_alloc_methods; uct_alloc_method_t alloc_methods[UCT_ALLOC_METHOD_LAST]; ucs_log_level_t failure_level; + size_t max_num_eps; } config; - UCS_STATS_NODE_DECLARE(stats); /* Statistics */ + UCS_STATS_NODE_DECLARE(stats) /* Statistics */ } uct_base_iface_t; UCS_CLASS_DECLARE(uct_base_iface_t, uct_iface_ops_t*, uct_md_h, uct_worker_h, @@ -238,47 +245,70 @@ typedef struct uct_failed_iface { */ typedef struct uct_base_ep { uct_ep_t super; - UCS_STATS_NODE_DECLARE(stats); + UCS_STATS_NODE_DECLARE(stats) } uct_base_ep_t; UCS_CLASS_DECLARE(uct_base_ep_t, uct_base_iface_t*); /** - * Transport component. + * Internal resource descriptor of a transport device + */ +typedef struct uct_tl_device_resource { + char name[UCT_DEVICE_NAME_MAX]; /**< Hardware device name */ + uct_device_type_t type; /**< The device represented by this resource + (e.g. UCT_DEVICE_TYPE_NET for a network interface) */ + ucs_sys_device_t sys_device; /**< The identifier associated with the device + bus_id as captured in ucs_sys_bus_id_t struct */ +} uct_tl_device_resource_t; + + +/** + * UCT transport definition. This structure should not be used directly; use + * @ref UCT_TL_DEFINE macro to define a transport. 
*/ -typedef struct uct_tl_component { - ucs_status_t (*query_resources)(uct_md_h md, - uct_tl_resource_desc_t **resources_p, - unsigned *num_resources_p); +typedef struct uct_tl { + char name[UCT_TL_NAME_MAX]; /**< Transport name */ + + ucs_status_t (*query_devices)(uct_md_h md, + uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p); ucs_status_t (*iface_open)(uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *config, uct_iface_h *iface_p); - char name[UCT_TL_NAME_MAX];/**< Transport name */ - const char *cfg_prefix; /**< Prefix for configuration environment vars */ - ucs_config_field_t *iface_config_table; /**< Defines transport configuration options */ - size_t iface_config_size; /**< Transport configuration structure size */ -} uct_tl_component_t; + ucs_config_global_list_entry_t config; /**< Transport configuration entry */ + ucs_list_link_t list; /**< Entry in component's transports list */ +} uct_tl_t; /** - * Define a transport component. 
+ * Define a transport + * + * @param _component Component to add the transport to + * @param _name Name of the transport (should be a token, not a string) + * @param _query_devices Function to query the list of available devices + * @param _iface_class Struct type defining the uct_iface class */ -#define UCT_TL_COMPONENT_DEFINE(_tlc, _query, _iface_struct, _name, \ - _cfg_prefix, _cfg_table, _cfg_struct) \ +#define UCT_TL_DEFINE(_component, _name, _query_devices, _iface_class, \ + _cfg_prefix, _cfg_table, _cfg_struct) \ \ - uct_tl_component_t _tlc = { \ - .query_resources = _query, \ - .iface_open = UCS_CLASS_NEW_FUNC_NAME(_iface_struct), \ - .name = _name, \ - .cfg_prefix = _cfg_prefix, \ - .iface_config_table = _cfg_table, \ - .iface_config_size = sizeof(_cfg_struct) \ + uct_tl_t uct_##_name##_tl = { \ + .name = #_name, \ + .query_devices = _query_devices, \ + .iface_open = UCS_CLASS_NEW_FUNC_NAME(_iface_class), \ + .config = { \ + .name = #_name" transport", \ + .prefix = _cfg_prefix, \ + .table = _cfg_table, \ + .size = sizeof(_cfg_struct), \ + } \ }; \ - UCS_CONFIG_REGISTER_TABLE(_cfg_table, _name" transport", _cfg_prefix, \ - _cfg_struct) + UCS_CONFIG_REGISTER_TABLE_ENTRY(&(uct_##_name##_tl).config); \ + UCS_STATIC_INIT { \ + ucs_list_add_tail(&(_component)->tl_list, &(uct_##_name##_tl).list); \ + } /** @@ -286,15 +316,13 @@ typedef struct uct_tl_component { * Specific transport extend this structure. */ struct uct_iface_config { - size_t max_short; - size_t max_bcopy; - struct { uct_alloc_method_t *methods; unsigned count; } alloc_methods; int failure; /* Level of failure reports */ + size_t max_num_eps; }; @@ -367,7 +395,7 @@ typedef struct uct_iface_mpool_config { * TL Memory pool object initialization callback. 
*/ typedef void (*uct_iface_mpool_init_obj_cb_t)(uct_iface_h iface, void *obj, - uct_mem_h memh); + uct_mem_h memh); /** @@ -395,8 +423,8 @@ uct_pending_req_priv_arb_elem(uct_pending_req_t *req) #define uct_pending_req_arb_group_push(_arbiter_group, _req) \ do { \ ucs_arbiter_elem_init(uct_pending_req_priv_arb_elem(_req)); \ - ucs_arbiter_group_push_elem(_arbiter_group, \ - uct_pending_req_priv_arb_elem(_req)); \ + ucs_arbiter_group_push_elem_always(_arbiter_group, \ + uct_pending_req_priv_arb_elem(_req)); \ } while (0) @@ -406,8 +434,8 @@ uct_pending_req_priv_arb_elem(uct_pending_req_t *req) #define uct_pending_req_arb_group_push_head(_arbiter, _arbiter_group, _req) \ do { \ ucs_arbiter_elem_init(uct_pending_req_priv_arb_elem(_req)); \ - ucs_arbiter_group_push_head_elem(_arbiter, _arbiter_group, \ - uct_pending_req_priv_arb_elem(_req)); \ + ucs_arbiter_group_push_head_elem_always(_arbiter_group, \ + uct_pending_req_priv_arb_elem(_req)); \ } while (0) @@ -557,6 +585,13 @@ void uct_iface_mpool_empty_warn(uct_base_iface_t *iface, ucs_mpool_t *mp); ucs_status_t uct_set_ep_failed(ucs_class_t* cls, uct_ep_h tl_ep, uct_iface_h tl_iface, ucs_status_t status); +void uct_base_iface_query(uct_base_iface_t *iface, uct_iface_attr_t *iface_attr); + +ucs_status_t uct_single_device_resource(uct_md_h md, const char *dev_name, + uct_device_type_t dev_type, + uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p); + ucs_status_t uct_base_iface_flush(uct_iface_h tl_iface, unsigned flags, uct_completion_t *comp); @@ -588,11 +623,15 @@ uct_iface_invoke_am(uct_base_iface_t *iface, uint8_t id, void *data, unsigned length, unsigned flags) { ucs_status_t status; - uct_am_handler_t *handler = &iface->am[id]; + uct_am_handler_t *handler; + + ucs_assertv(id < UCT_AM_ID_MAX, "invalid am id: %d (max: %lu)", + id, UCT_AM_ID_MAX - 1); - ucs_assert(id < UCT_AM_ID_MAX); UCS_STATS_UPDATE_COUNTER(iface->stats, UCT_IFACE_STAT_RX_AM, 1); UCS_STATS_UPDATE_COUNTER(iface->stats, 
UCT_IFACE_STAT_RX_AM_BYTES, length); + + handler = &iface->am[id]; status = handler->cb(handler->arg, data, length, flags); ucs_assert((status == UCS_OK) || ((status == UCS_INPROGRESS) && (flags & UCT_CB_PARAM_FLAG_DESC))); @@ -610,35 +649,33 @@ static UCS_F_ALWAYS_INLINE void uct_invoke_completion(uct_completion_t *comp, ucs_status_t status) { ucs_trace_func("comp=%p, count=%d, status=%d", comp, comp->count, status); + ucs_assertv(comp->count > 0, "comp=%p count=%d", comp, comp->count); if (--comp->count == 0) { comp->func(comp, status); } } -/** - * Calculates total length of particular iov data buffer. - * Currently has no support for stride. - * If stride supported it should be like: length + ((count - 1) * stride) - */ -static UCS_F_ALWAYS_INLINE -size_t uct_iov_get_length(const uct_iov_t *iov) -{ - return iov->count * iov->length; -} /** - * Calculates total length of the iov array buffers. + * Copy data to target am_short buffer */ static UCS_F_ALWAYS_INLINE -size_t uct_iov_total_length(const uct_iov_t *iov, size_t iovcnt) +void uct_am_short_fill_data(void *buffer, uint64_t header, const void *payload, + size_t length) { - size_t iov_it, total_length = 0; - - for (iov_it = 0; iov_it < iovcnt; ++iov_it) { - total_length += uct_iov_get_length(&iov[iov_it]); - } - - return total_length; + /** + * Helper structure to fill send buffer of short messages for + * non-accelerated transports + */ + struct uct_am_short_packet { + uint64_t header; + char payload[]; + } UCS_S_PACKED *packet = (struct uct_am_short_packet*)buffer; + + packet->header = header; + /* suppress false positive diagnostic from uct_mm_ep_am_common_send call */ + /* cppcheck-suppress ctunullpointer */ + memcpy(packet->payload, payload, length); } #endif diff --git a/src/uct/base/uct_iov.inl b/src/uct/base/uct_iov.inl new file mode 100644 index 00000000000..b6728111367 --- /dev/null +++ b/src/uct/base/uct_iov.inl @@ -0,0 +1,107 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. 
ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCT_IOV_INL_ +#define UCT_IOV_INL_ + +#include +#include +#include + +#include +#include + + +/** + * Calculates the total length of the particular UCT IOV data buffer. + * + * @param [in] iov Pointer to the UCT IOV element. + * + * @return The length of the UCT IOV data buffer. + * @note Currently has no support for the strides. If the strides are + * supported, it should be like: length + ((count - 1) * stride) + */ +static UCS_F_ALWAYS_INLINE +size_t uct_iov_get_length(const uct_iov_t *iov) +{ + return iov->count * iov->length; +} + +/** + * Returns the particular UCT IOV data buffer. + * + * @param [in] iov Pointer to the UCT IOV element. + * + * @return The UCT IOV data buffer. + */ +static UCS_F_ALWAYS_INLINE +void *uct_iov_get_buffer(const uct_iov_t *iov) +{ + return iov->buffer; +} + +/** + * Calculates the total length of the UCT IOV array buffers. + * + * @param [in] iov Pointer to the array of UCT IOVs. + * @param [in] iov_cnt Number of the elements in the array of UCT IOVs. + * + * @return The total length of the array of UCT IOVs. + */ +static UCS_F_ALWAYS_INLINE +size_t uct_iov_total_length(const uct_iov_t *iov, size_t iov_cnt) +{ + return ucs_iov_total_length(iov, iov_cnt, uct_iov_get_length); +} + +/** + * Calculates the flat offset in the UCT IOV array, which is the total data size + * before the position of the iterator. + * + * @param [in] iov Pointer to the array of UCT IOVs. + * @param [in] iov_cnt Number of the elements in the array of UCT IOVs. + * @param [in] iov_iter Pointer to the UCT IOV iterator. + * + * @return The flat offset in the UCT IOV array. + */ +static UCS_F_ALWAYS_INLINE +size_t uct_iov_iter_flat_offset(const uct_iov_t *iov, size_t iov_cnt, + const ucs_iov_iter_t *iov_iter) +{ + return ucs_iov_iter_flat_offset(iov, iov_cnt, iov_iter, + uct_iov_get_length); +} + +/** + * Fill IOVEC data structure by the data provided in the array of UCT IOVs. 
+ * The function avoids copying IOVs with zero length. + * + * @param [out] io_vec Pointer to the resulted array of IOVECs. + * @param [in/out] io_vec_cnt_p Pointer to the varibale that holds the number + * of the elements in the array of IOVECs (input: + * initial, out: result). + * @param [in] uct_iov Pointer to the array of UCT IOVs. + * @param [in] uct_iov_cnt Number of the elements in the array of UCT IOVs. + * @param [in] max_length Maximal total length of the data that can be + * placed in the resulted array of IOVECs. + * @param [in] uct_iov_iter_p Pointer to the UCT IOV iterator. + * + * @return The amount, in bytes, of the data that is stored in the source + * array of IOVs. + */ +static UCS_F_ALWAYS_INLINE +size_t uct_iov_to_iovec(struct iovec *io_vec, size_t *io_vec_cnt_p, + const uct_iov_t *uct_iov, size_t uct_iov_cnt, + size_t max_length, ucs_iov_iter_t *uct_iov_iter_p) +{ + return ucs_iov_converter(io_vec, io_vec_cnt_p, + ucs_iovec_set_buffer, ucs_iovec_set_length, + uct_iov, uct_iov_cnt, + uct_iov_get_buffer, uct_iov_get_length, + max_length, uct_iov_iter_p); +} + +#endif diff --git a/src/uct/base/uct_log.h b/src/uct/base/uct_log.h index dc2c840ce9f..5267e9b9ca0 100644 --- a/src/uct/base/uct_log.h +++ b/src/uct/base/uct_log.h @@ -17,7 +17,8 @@ * In debug mode, print packet description to the log. */ #define uct_log_data(_file, _line, _function, _info) \ - ucs_log_dispatch(_file, _line, _function, UCS_LOG_LEVEL_TRACE_DATA, "%s", buf); + ucs_log_dispatch(_file, _line, _function, UCS_LOG_LEVEL_TRACE_DATA,\ + &ucs_global_opts.log_component, "%s", buf); /** diff --git a/src/uct/base/uct_md.c b/src/uct/base/uct_md.c index 5c96f51fb0d..d6824362090 100644 --- a/src/uct/base/uct_md.c +++ b/src/uct/base/uct_md.c @@ -6,6 +6,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "uct_md.h" #include "uct_iface.h" @@ -16,10 +20,7 @@ #include #include #include -#include - -UCS_LIST_HEAD(uct_md_components_list); ucs_config_field_t uct_md_config_table[] = { @@ -30,115 +31,32 @@ ucs_config_field_t uct_md_config_rcache_table[] = { {"RCACHE_MEM_PRIO", "1000", "Registration cache memory event priority", ucs_offsetof(uct_md_rcache_config_t, event_prio), UCS_CONFIG_TYPE_UINT}, - {"RCACHE_OVERHEAD", "90ns", "Registration cache lookup overhead", + {"RCACHE_OVERHEAD", "180ns", "Registration cache lookup overhead", ucs_offsetof(uct_md_rcache_config_t, overhead), UCS_CONFIG_TYPE_TIME}, {"RCACHE_ADDR_ALIGN", UCS_PP_MAKE_STRING(UCS_SYS_CACHE_LINE_SIZE), "Registration cache address alignment, must be power of 2\n" - "between "UCS_PP_MAKE_STRING(UCS_PGT_ADDR_ALIGN)"and system page size", + "between "UCS_PP_MAKE_STRING(UCS_PGT_ADDR_ALIGN)"and system page size", ucs_offsetof(uct_md_rcache_config_t, alignment), UCS_CONFIG_TYPE_UINT}, {NULL} }; -/** - * Keeps information about allocated configuration structure, to be used when - * releasing the options. 
- */ -typedef struct uct_config_bundle { - ucs_config_field_t *table; - const char *table_prefix; - char data[]; -} uct_config_bundle_t; - - -ucs_status_t uct_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) +ucs_status_t uct_md_open(uct_component_h component, const char *md_name, + const uct_md_config_t *config, uct_md_h *md_p) { - UCS_MODULE_FRAMEWORK_DECLARE(uct); - uct_md_resource_desc_t *resources, *md_resources, *tmp; - unsigned i, num_resources, num_md_resources; - uct_md_component_t *mdc; - ucs_status_t status; - - UCS_MODULE_FRAMEWORK_LOAD(uct, 0); - - resources = NULL; - num_resources = 0; - - ucs_list_for_each(mdc, &uct_md_components_list, list) { - status = mdc->query_resources(&md_resources, &num_md_resources); - if (status != UCS_OK) { - ucs_debug("Failed to query %s* resources: %s", mdc->name, - ucs_status_string(status)); - continue; - } - - if (num_md_resources == 0) { - ucs_free(md_resources); - continue; - } - - tmp = ucs_realloc(resources, - sizeof(*resources) * (num_resources + num_md_resources), - "md_resources"); - if (tmp == NULL) { - ucs_free(md_resources); - status = UCS_ERR_NO_MEMORY; - goto err; - } - - for (i = 0; i < num_md_resources; ++i) { - ucs_assertv_always(!strncmp(mdc->name, md_resources[i].md_name, - strlen(mdc->name)), - "MD name must begin with MD component name." 
- "MD name: %s MD component name: %s ", - md_resources[i].md_name, mdc->name); - } - resources = tmp; - memcpy(resources + num_resources, md_resources, - sizeof(*md_resources) * num_md_resources); - num_resources += num_md_resources; - ucs_free(md_resources); - } - - *resources_p = resources; - *num_resources_p = num_resources; - return UCS_OK; - -err: - ucs_free(resources); - return status; -} - -void uct_release_md_resource_list(uct_md_resource_desc_t *resources) -{ - ucs_free(resources); -} - -ucs_status_t uct_md_open(const char *md_name, const uct_md_config_t *config, - uct_md_h *md_p) -{ - uct_md_component_t *mdc; ucs_status_t status; uct_md_h md; - ucs_list_for_each(mdc, &uct_md_components_list, list) { - if (!strncmp(md_name, mdc->name, strlen(mdc->name))) { - status = mdc->md_open(md_name, config, &md); - if (status != UCS_OK) { - return status; - } - - ucs_assert_always(md->component == mdc); - *md_p = md; - return UCS_OK; - } + status = component->md_open(component, md_name, config, &md); + if (status != UCS_OK) { + return status; } - ucs_error("MD '%s' does not exist", md_name); - return UCS_ERR_NO_DEVICE; + *md_p = md; + ucs_assert_always(md->component == component); + return UCS_OK; } void uct_md_close(uct_md_h md) @@ -150,48 +68,51 @@ ucs_status_t uct_md_query_tl_resources(uct_md_h md, uct_tl_resource_desc_t **resources_p, unsigned *num_resources_p) { - uct_tl_resource_desc_t *resources, *tl_resources, *tmp; - unsigned i, num_resources, num_tl_resources; - uct_md_component_t *mdc = md->component; - uct_md_registered_tl_t *tlr; - uct_tl_component_t *tlc; + uct_component_t *component = md->component; + uct_tl_resource_desc_t *resources, *tmp; + uct_tl_device_resource_t *tl_devices; + unsigned i, num_resources, num_tl_devices; ucs_status_t status; + uct_tl_t *tl; resources = NULL; num_resources = 0; - ucs_list_for_each(tlr, &mdc->tl_list, list) { - tlc = tlr->tl; - - status = tlc->query_resources(md, &tl_resources, &num_tl_resources); + 
ucs_list_for_each(tl, &component->tl_list, list) { + status = tl->query_devices(md, &tl_devices, &num_tl_devices); if (status != UCS_OK) { - ucs_debug("Failed to query %s resources: %s", tlc->name, + ucs_debug("failed to query %s resources: %s", tl->name, ucs_status_string(status)); continue; } - if (num_tl_resources == 0) { - ucs_free(tl_resources); + if (num_tl_devices == 0) { + ucs_free(tl_devices); continue; } tmp = ucs_realloc(resources, - sizeof(*resources) * (num_resources + num_tl_resources), + sizeof(*resources) * (num_resources + num_tl_devices), "md_resources"); if (tmp == NULL) { - ucs_free(tl_resources); + ucs_free(tl_devices); status = UCS_ERR_NO_MEMORY; goto err; } - for (i = 0; i < num_tl_resources; ++i) { - ucs_assert_always(!strcmp(tlc->name, tl_resources[i].tl_name)); + /* add tl devices to overall list of resources */ + for (i = 0; i < num_tl_devices; ++i) { + ucs_strncpy_zero(tmp[num_resources + i].tl_name, tl->name, + sizeof(tmp[num_resources + i].tl_name)); + ucs_strncpy_zero(tmp[num_resources + i].dev_name, tl_devices[i].name, + sizeof(tmp[num_resources + i].dev_name)); + tmp[num_resources + i].dev_type = tl_devices[i].type; + tmp[num_resources + i].sys_device = tl_devices[i].sys_device; } - resources = tmp; - memcpy(resources + num_resources, tl_resources, - sizeof(*tl_resources) * num_tl_resources); - num_resources += num_tl_resources; - ucs_free(tl_resources); + + resources = tmp; + num_resources += num_tl_devices; + ucs_free(tl_devices); } *resources_p = resources; @@ -208,9 +129,10 @@ void uct_release_tl_resource_list(uct_tl_resource_desc_t *resources) ucs_free(resources); } -ucs_status_t uct_single_md_resource(uct_md_component_t *mdc, - uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) +ucs_status_t +uct_md_query_single_md_resource(uct_component_t *component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) { uct_md_resource_desc_t *resource; @@ -219,69 +141,41 @@ ucs_status_t 
uct_single_md_resource(uct_md_component_t *mdc, return UCS_ERR_NO_MEMORY; } - ucs_snprintf_zero(resource->md_name, UCT_MD_NAME_MAX, "%s", mdc->name); + ucs_snprintf_zero(resource->md_name, UCT_MD_NAME_MAX, "%s", + component->name); *resources_p = resource; *num_resources_p = 1; return UCS_OK; } -ucs_status_t uct_md_stub_rkey_unpack(uct_md_component_t *mdc, - const void *rkey_buffer, uct_rkey_t *rkey_p, - void **handle_p) +ucs_status_t +uct_md_query_empty_md_resource(uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) { - *rkey_p = 0xdeadbeef; - *handle_p = NULL; + *resources_p = NULL; + *num_resources_p = 0; return UCS_OK; } -static ucs_status_t uct_config_read(uct_config_bundle_t **bundle, - ucs_config_field_t *config_table, - size_t config_size, const char *env_prefix, - const char *cfg_prefix) +ucs_status_t uct_md_stub_rkey_unpack(uct_component_t *component, + const void *rkey_buffer, uct_rkey_t *rkey_p, + void **handle_p) { - uct_config_bundle_t *config_bundle; - ucs_status_t status; - - config_bundle = ucs_calloc(1, sizeof(*config_bundle) + config_size, "uct_config"); - if (config_bundle == NULL) { - status = UCS_ERR_NO_MEMORY; - goto err; - } - - /* TODO use env_prefix */ - status = ucs_config_parser_fill_opts(config_bundle->data, config_table, - env_prefix, cfg_prefix, 0); - if (status != UCS_OK) { - goto err_free_bundle; - } - - config_bundle->table = config_table; - config_bundle->table_prefix = ucs_strdup(cfg_prefix, "uct_config"); - if (config_bundle->table_prefix == NULL) { - status = UCS_ERR_NO_MEMORY; - goto err_free_bundle; - } - - *bundle = config_bundle; + *rkey_p = 0xdeadbeef; + *handle_p = NULL; return UCS_OK; - -err_free_bundle: - ucs_free(config_bundle); -err: - return status; } -uct_tl_component_t *uct_find_tl_on_md(uct_md_component_t *mdc, - uint64_t md_flags, - const char *tl_name) +static uct_tl_t *uct_find_tl(uct_component_h component, uint64_t md_flags, + const char *tl_name) { - uct_md_registered_tl_t *tlr; + uct_tl_t *tl; 
- ucs_list_for_each(tlr, &mdc->tl_list, list) { - if (((tl_name != NULL) && !strcmp(tl_name, tlr->tl->name)) || + ucs_list_for_each(tl, &component->tl_list, list) { + if (((tl_name != NULL) && !strcmp(tl_name, tl->name)) || ((tl_name == NULL) && (md_flags & UCT_MD_FLAG_SOCKADDR))) { - return tlr->tl; + return tl; } } return NULL; @@ -292,9 +186,9 @@ ucs_status_t uct_md_iface_config_read(uct_md_h md, const char *tl_name, uct_iface_config_t **config_p) { uct_config_bundle_t *bundle = NULL; - uct_tl_component_t *tlc; uct_md_attr_t md_attr; ucs_status_t status; + uct_tl_t *tl; status = uct_md_query(md, &md_attr); if (status != UCS_OK) { @@ -302,8 +196,8 @@ ucs_status_t uct_md_iface_config_read(uct_md_h md, const char *tl_name, return status; } - tlc = uct_find_tl_on_md(md->component, md_attr.cap.flags, tl_name); - if (tlc == NULL) { + tl = uct_find_tl(md->component, md_attr.cap.flags, tl_name); + if (tl == NULL) { if (tl_name == NULL) { ucs_error("There is no sockaddr transport registered on the md"); } else { @@ -313,14 +207,15 @@ ucs_status_t uct_md_iface_config_read(uct_md_h md, const char *tl_name, return status; } - status = uct_config_read(&bundle, tlc->iface_config_table, - tlc->iface_config_size, env_prefix, tlc->cfg_prefix); + status = uct_config_read(&bundle, tl->config.table, tl->config.size, + env_prefix, tl->config.prefix); if (status != UCS_OK) { ucs_error("Failed to read iface config"); return status; } *config_p = (uct_iface_config_t*) bundle->data; + /* coverity[leaked_storage] */ return UCS_OK; } @@ -329,9 +224,9 @@ ucs_status_t uct_iface_open(uct_md_h md, uct_worker_h worker, const uct_iface_config_t *config, uct_iface_h *iface_p) { - uct_tl_component_t *tlc; uct_md_attr_t md_attr; ucs_status_t status; + uct_tl_t *tl; status = uct_md_query(md, &md_attr); if (status != UCS_OK) { @@ -343,60 +238,41 @@ ucs_status_t uct_iface_open(uct_md_h md, uct_worker_h worker, "UCT_IFACE_PARAM_FIELD_OPEN_MODE is not defined"); if (params->open_mode & 
UCT_IFACE_OPEN_MODE_DEVICE) { - tlc = uct_find_tl_on_md(md->component, md_attr.cap.flags, params->mode.device.tl_name); + tl = uct_find_tl(md->component, md_attr.cap.flags, + params->mode.device.tl_name); } else if ((params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_CLIENT) || (params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER)) { - tlc = uct_find_tl_on_md(md->component, md_attr.cap.flags, NULL); + tl = uct_find_tl(md->component, md_attr.cap.flags, NULL); } else { ucs_error("Invalid open mode %zu", params->open_mode); return status; } - if (tlc == NULL) { + if (tl == NULL) { /* Non-existing transport */ return UCS_ERR_NO_DEVICE; } - return tlc->iface_open(md, worker, params, config, iface_p); + return tl->iface_open(md, worker, params, config, iface_p); } -static uct_md_component_t *uct_find_mdc(const char *name) -{ - uct_md_component_t *mdc; - - ucs_list_for_each(mdc, &uct_md_components_list, list) { - if (!strncmp(name, mdc->name, strlen(mdc->name))) { - return mdc; - } - } - return NULL; -} - -ucs_status_t uct_md_config_read(const char *name, const char *env_prefix, - const char *filename, +ucs_status_t uct_md_config_read(uct_component_h component, + const char *env_prefix, const char *filename, uct_md_config_t **config_p) { uct_config_bundle_t *bundle = NULL; - uct_md_component_t *mdc; ucs_status_t status; - /* find the matching mdc. the search can be by md_name or by mdc_name. 
- * (depending on the caller) */ - mdc = uct_find_mdc(name); - if (mdc == NULL) { - ucs_error("MD component does not exist for '%s'", name); - status = UCS_ERR_INVALID_PARAM; /* Non-existing MDC */ - return status; - } - - status = uct_config_read(&bundle, mdc->md_config_table, - mdc->md_config_size, env_prefix, mdc->cfg_prefix); + status = uct_config_read(&bundle, component->md_config.table, + component->md_config.size, env_prefix, + component->md_config.prefix); if (status != UCS_OK) { ucs_error("Failed to read MD config"); return status; } *config_p = (uct_md_config_t*) bundle->data; + /* coverity[leaked_storage] */ return UCS_OK; } @@ -429,41 +305,39 @@ ucs_status_t uct_md_mkey_pack(uct_md_h md, uct_mem_h memh, void *rkey_buffer) return md->ops->mkey_pack(md, memh, rbuf); } -ucs_status_t uct_rkey_unpack(const void *rkey_buffer, uct_rkey_bundle_t *rkey_ob) +ucs_status_t uct_rkey_unpack(uct_component_h component, const void *rkey_buffer, + uct_rkey_bundle_t *rkey_ob) { - uct_md_component_t *mdc; - ucs_status_t status; - char mdc_name[UCT_MD_COMPONENT_NAME_MAX + 1]; - - ucs_list_for_each(mdc, &uct_md_components_list, list) { - if (!strncmp(rkey_buffer, mdc->name, UCT_MD_COMPONENT_NAME_MAX)) { - status = mdc->rkey_unpack(mdc, rkey_buffer + UCT_MD_COMPONENT_NAME_MAX, - &rkey_ob->rkey, &rkey_ob->handle); - if (status == UCS_OK) { - rkey_ob->type = mdc; - } - - return status; + char component_name[UCT_COMPONENT_NAME_MAX + 1]; + + if (ENABLE_DEBUG_DATA) { + if (ENABLE_PARAMS_CHECK && + strncmp(rkey_buffer, component->name, UCT_COMPONENT_NAME_MAX)) { + ucs_snprintf_zero(component_name, sizeof(component_name), "%s", + (const char*)rkey_buffer); + ucs_error("invalid component for rkey unpack; " + "expected: %s, actual: %s", component_name, component->name); + return UCS_ERR_INVALID_PARAM; } + + rkey_buffer = UCS_PTR_BYTE_OFFSET(rkey_buffer, UCT_COMPONENT_NAME_MAX); } - ucs_snprintf_zero(mdc_name, sizeof(mdc_name), "%s", (const char*)rkey_buffer); - ucs_debug("No matching 
MD component found for '%s'", mdc_name); - return UCS_ERR_UNSUPPORTED; + return component->rkey_unpack(component, rkey_buffer, &rkey_ob->rkey, + &rkey_ob->handle); } -ucs_status_t uct_rkey_ptr(uct_rkey_bundle_t *rkey_ob, uint64_t remote_addr, - void **local_addr_p) +ucs_status_t uct_rkey_ptr(uct_component_h component, uct_rkey_bundle_t *rkey_ob, + uint64_t remote_addr, void **local_addr_p) { - uct_md_component_t *mdc = rkey_ob->type; - return mdc->rkey_ptr(mdc, rkey_ob->rkey, rkey_ob->handle, remote_addr, - local_addr_p); + return component->rkey_ptr(component, rkey_ob->rkey, rkey_ob->handle, + remote_addr, local_addr_p); } -ucs_status_t uct_rkey_release(const uct_rkey_bundle_t *rkey_ob) +ucs_status_t uct_rkey_release(uct_component_h component, + const uct_rkey_bundle_t *rkey_ob) { - uct_md_component_t *mdc = rkey_ob->type; - return mdc->rkey_release(mdc, rkey_ob->rkey, rkey_ob->handle); + return component->rkey_release(component, rkey_ob->rkey, rkey_ob->handle); } ucs_status_t uct_md_query(uct_md_h md, uct_md_attr_t *md_attr) @@ -475,9 +349,13 @@ ucs_status_t uct_md_query(uct_md_h md, uct_md_attr_t *md_attr) return status; } - /* MD component name + data */ - memcpy(md_attr->component_name, md->component->name, UCT_MD_COMPONENT_NAME_MAX); - md_attr->rkey_packed_size += UCT_MD_COMPONENT_NAME_MAX; + /* Component name + data */ + memcpy(md_attr->component_name, md->component->name, UCT_COMPONENT_NAME_MAX); + +#if ENABLE_DEBUG_DATA + /* MD name is packed into rkey in DEBUG mode only */ + md_attr->rkey_packed_size += UCT_COMPONENT_NAME_MAX; +#endif return UCS_OK; } @@ -541,14 +419,20 @@ ucs_status_t uct_md_mem_dereg(uct_md_h md, uct_mem_h memh) return md->ops->mem_dereg(md, memh); } +ucs_status_t uct_md_mem_query(uct_md_h md, const void *addr, const size_t length, + uct_md_mem_attr_t *mem_attr_p) +{ + return md->ops->mem_query(md, addr, length, mem_attr_p); +} + int uct_md_is_sockaddr_accessible(uct_md_h md, const ucs_sock_addr_t *sockaddr, uct_sockaddr_accessibility_t 
mode) { return md->ops->is_sockaddr_accessible(md, sockaddr, mode); } -int uct_md_is_mem_type_owned(uct_md_h md, void *addr, size_t length) +ucs_status_t uct_md_detect_memory_type(uct_md_h md, const void *addr, size_t length, + ucs_memory_type_t *mem_type_p) { - return md->ops->is_mem_type_owned(md, addr, length); + return md->ops->detect_memory_type(md, addr, length, mem_type_p); } - diff --git a/src/uct/base/uct_md.h b/src/uct/base/uct_md.h index 0c7926ba250..804bf748422 100644 --- a/src/uct/base/uct_md.h +++ b/src/uct/base/uct_md.h @@ -7,37 +7,15 @@ #ifndef UCT_MD_H_ #define UCT_MD_H_ -#include "uct_iface.h" +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "uct_component.h" #include #include - - -typedef struct uct_md_component uct_md_component_t; -struct uct_md_component { - ucs_status_t (*query_resources)(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p); - - ucs_status_t (*md_open)(const char *md_name, const uct_md_config_t *config, - uct_md_h *md_p); - - ucs_status_t (*rkey_unpack)(uct_md_component_t *mdc, const void *rkey_buffer, - uct_rkey_t *rkey_p, void **handle_p); - - ucs_status_t (*rkey_ptr)(uct_md_component_t *mdc, uct_rkey_t rkey, void *handle, - uint64_t raddr, void **laddr_p); - - ucs_status_t (*rkey_release)(uct_md_component_t *mdc, uct_rkey_t rkey, - void *handle); - - const char name[UCT_MD_COMPONENT_NAME_MAX]; - void *priv; - const char *cfg_prefix; /**< Prefix for configuration environment vars */ - ucs_config_field_t *md_config_table; /**< Defines MD configuration options */ - size_t md_config_size; /**< MD configuration structure size */ - ucs_list_link_t tl_list; /* List of uct_md_registered_tl_t */ - ucs_list_link_t list; -}; +#include typedef struct uct_md_rcache_config { @@ -46,6 +24,7 @@ typedef struct uct_md_rcache_config { double overhead; /**< Lookup overhead estimation */ } uct_md_rcache_config_t; + extern ucs_config_field_t uct_md_config_rcache_table[]; /** @@ -58,95 +37,66 @@ struct uct_md_config { 
}; -/** - * MD->Transport - */ -typedef struct uct_md_registered_tl { - ucs_list_link_t list; - uct_tl_component_t *tl; -} uct_md_registered_tl_t; +typedef void (*uct_md_close_func_t)(uct_md_h md); +typedef ucs_status_t (*uct_md_query_func_t)(uct_md_h md, + uct_md_attr_t *md_attr); -/** - * Define a MD component. - * - * @param _mdc MD component structure to initialize. - * @param _name MD component name. - * @param _query Function to query MD resources. - * @param _open Function to open a MD. - * @param _priv Custom private data. - * @param _rkey_unpack Function to unpack a remote key buffer to handle. - * @param _rkey_release Function to release a remote key handle. - * @param _cfg_prefix Prefix for configuration environment vars. - * @param _cfg_table Defines the MDC's configuration values. - * @param _cfg_struct MDC configuration structure. - */ -#define UCT_MD_COMPONENT_DEFINE(_mdc, _name, _query, _open, _priv, \ - _rkey_unpack, _rkey_release, \ - _cfg_prefix, _cfg_table, _cfg_struct) \ - \ - uct_md_component_t _mdc = { \ - .query_resources = _query, \ - .md_open = _open, \ - .cfg_prefix = _cfg_prefix, \ - .md_config_table = _cfg_table, \ - .md_config_size = sizeof(_cfg_struct), \ - .priv = _priv, \ - .rkey_unpack = _rkey_unpack, \ - .rkey_ptr = ucs_empty_function_return_unsupported, \ - .rkey_release = _rkey_release, \ - .name = _name, \ - .tl_list = { &_mdc.tl_list, &_mdc.tl_list } \ - }; \ - UCS_STATIC_INIT { \ - ucs_list_add_tail(&uct_md_components_list, &_mdc.list); \ - } \ - UCS_CONFIG_REGISTER_TABLE(_cfg_table, _name" memory domain", _cfg_prefix, \ - _cfg_struct) +typedef ucs_status_t (*uct_md_mem_alloc_func_t)(uct_md_h md, + size_t *length_p, + void **address_p, + unsigned flags, + const char *alloc_name, + uct_mem_h *memh_p); +typedef ucs_status_t (*uct_md_mem_free_func_t)(uct_md_h md, uct_mem_h memh); -/** - * Add a transport component to a md component - * (same transport component can be added to multiple md components). 
- * - * @param _mdc Pointer to MD component to add the TL component to. - * @param _tlc Pointer to TL component. - */ -#define UCT_MD_REGISTER_TL(_mdc, _tlc) \ - UCS_STATIC_INIT { \ - static uct_md_registered_tl_t reg; \ - reg.tl = (_tlc); \ - ucs_list_add_tail(&(_mdc)->tl_list, ®.list); \ - } +typedef ucs_status_t (*uct_md_mem_advise_func_t)(uct_md_h md, + uct_mem_h memh, + void *addr, + size_t length, + unsigned advice); +typedef ucs_status_t (*uct_md_mem_reg_func_t)(uct_md_h md, void *address, + size_t length, + unsigned flags, + uct_mem_h *memh_p); -/** - * Memory domain operations - */ -struct uct_md_ops { - void (*close)(uct_md_h md); +typedef ucs_status_t (*uct_md_mem_dereg_func_t)(uct_md_h md, uct_mem_h memh); - ucs_status_t (*query)(uct_md_h md, uct_md_attr_t *md_attr); +typedef ucs_status_t (*uct_md_mem_query_func_t)(uct_md_h md, + const void *addr, + const size_t length, + uct_md_mem_attr_t *mem_attr_p); - ucs_status_t (*mem_alloc)(uct_md_h md, size_t *length_p, void **address_p, - unsigned flags, const char *alloc_name, - uct_mem_h *memh_p); +typedef ucs_status_t (*uct_md_mkey_pack_func_t)(uct_md_h md, uct_mem_h memh, + void *rkey_buffer); - ucs_status_t (*mem_free)(uct_md_h md, uct_mem_h memh); - ucs_status_t (*mem_advise)(uct_md_h md, uct_mem_h memh, void *addr, - size_t length, unsigned advice); +typedef int (*uct_md_is_sockaddr_accessible_func_t)(uct_md_h md, + const ucs_sock_addr_t *sockaddr, + uct_sockaddr_accessibility_t mode); - ucs_status_t (*mem_reg)(uct_md_h md, void *address, size_t length, - unsigned flags, uct_mem_h *memh_p); +typedef ucs_status_t (*uct_md_detect_memory_type_func_t)(uct_md_h md, + const void *addr, + size_t length, + ucs_memory_type_t *mem_type_p); - ucs_status_t (*mem_dereg)(uct_md_h md, uct_mem_h memh); - ucs_status_t (*mkey_pack)(uct_md_h md, uct_mem_h memh, void *rkey_buffer); - - int (*is_sockaddr_accessible)(uct_md_h md, const ucs_sock_addr_t *sockaddr, - uct_sockaddr_accessibility_t mode); - - int 
(*is_mem_type_owned)(uct_md_h md, void *addr, size_t length); +/** + * Memory domain operations + */ +struct uct_md_ops { + uct_md_close_func_t close; + uct_md_query_func_t query; + uct_md_mem_alloc_func_t mem_alloc; + uct_md_mem_free_func_t mem_free; + uct_md_mem_advise_func_t mem_advise; + uct_md_mem_reg_func_t mem_reg; + uct_md_mem_dereg_func_t mem_dereg; + uct_md_mem_query_func_t mem_query; + uct_md_mkey_pack_func_t mkey_pack; + uct_md_is_sockaddr_accessible_func_t is_sockaddr_accessible; + uct_md_detect_memory_type_func_t detect_memory_type; }; @@ -155,37 +105,52 @@ struct uct_md_ops { */ struct uct_md { uct_md_ops_t *ops; - uct_md_component_t *component; + uct_component_t *component; }; +#define UCT_MD_DEFAULT_CONFIG_INITIALIZER \ + { \ + .name = "Default memory domain", \ + .prefix = "", \ + .table = uct_md_config_table, \ + .size = sizeof(uct_md_config_t), \ + } + + static UCS_F_ALWAYS_INLINE void* uct_md_fill_md_name(uct_md_h md, void *buffer) { - memcpy(buffer, md->component->name, UCT_MD_COMPONENT_NAME_MAX); - return (char*)buffer + UCT_MD_COMPONENT_NAME_MAX; +#if ENABLE_DEBUG_DATA + memcpy(buffer, md->component->name, UCT_COMPONENT_NAME_MAX); + return (char*)buffer + UCT_COMPONENT_NAME_MAX; +#else + return buffer; +#endif } +/* + * Base implementation of query_md_resources(), which returns a single md + * resource whose name is identical to component name. + */ +ucs_status_t +uct_md_query_single_md_resource(uct_component_t *component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p); -ucs_status_t uct_single_md_resource(uct_md_component_t *mdc, - uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p); +ucs_status_t +uct_md_query_empty_md_resource(uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p); /** * @brief Dummy function * Dummy function to emulate unpacking a remote key buffer to handle. 
* */ -ucs_status_t uct_md_stub_rkey_unpack(uct_md_component_t *mdc, +ucs_status_t uct_md_stub_rkey_unpack(uct_component_t *component, const void *rkey_buffer, uct_rkey_t *rkey_p, void **handle_p); -uct_tl_component_t *uct_find_tl_on_md(uct_md_component_t *mdc, - uint64_t md_flags, - const char *tl_name); - - -extern ucs_list_link_t uct_md_components_list; extern ucs_config_field_t uct_md_config_table[]; #endif diff --git a/src/uct/base/uct_mem.c b/src/uct/base/uct_mem.c index cf8450402a7..743998bf825 100644 --- a/src/uct/base/uct_mem.c +++ b/src/uct/base/uct_mem.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "uct_iface.h" #include "uct_md.h" @@ -39,9 +43,11 @@ static inline int uct_mem_get_mmap_flags(unsigned uct_mmap_flags) { int mm_flags = 0; +#ifdef MAP_NONBLOCK if (uct_mmap_flags & UCT_MD_MEM_FLAG_NONBLOCK) { mm_flags |= MAP_NONBLOCK; } +#endif if (uct_mmap_flags & UCT_MD_MEM_FLAG_FIXED) { mm_flags |= MAP_FIXED; @@ -63,10 +69,12 @@ ucs_status_t uct_mem_alloc(void *addr, size_t min_length, unsigned flags, uct_mem_h memh; uct_md_h md; void *address; + int ret; +#ifdef SHM_HUGETLB int shmid; +#endif #ifdef MADV_HUGEPAGE ssize_t huge_page_size; - int ret; #endif if (min_length == 0) { @@ -128,9 +136,9 @@ ucs_status_t uct_mem_alloc(void *addr, size_t min_length, unsigned flags, } ucs_assert(memh != UCT_MEM_HANDLE_NULL); - mem->md = md; - mem->mem_type = md_attr.cap.mem_type; - mem->memh = memh; + mem->md = md; + mem->mem_type = md_attr.cap.access_mem_type; + mem->memh = memh; goto allocated; } @@ -157,8 +165,9 @@ ucs_status_t uct_mem_alloc(void *addr, size_t min_length, unsigned flags, break; } - address = ucs_memalign(huge_page_size, alloc_length UCS_MEMTRACK_VAL); - if (address == NULL) { + ret = ucs_posix_memalign(&address, huge_page_size, alloc_length + UCS_MEMTRACK_VAL); + if (ret != 0) { ucs_trace("failed to allocate %zu bytes using THP: %m", alloc_length); } else { ret = madvise(address, 
alloc_length, MADV_HUGEPAGE); @@ -182,9 +191,9 @@ ucs_status_t uct_mem_alloc(void *addr, size_t min_length, unsigned flags, } alloc_length = min_length; - address = ucs_memalign(UCS_SYS_CACHE_LINE_SIZE, alloc_length - UCS_MEMTRACK_VAL); - if (address != NULL) { + ret = ucs_posix_memalign(&address, UCS_SYS_CACHE_LINE_SIZE, + alloc_length UCS_MEMTRACK_VAL); + if (ret == 0) { goto allocated_without_md; } @@ -208,6 +217,7 @@ ucs_status_t uct_mem_alloc(void *addr, size_t min_length, unsigned flags, break; case UCT_ALLOC_METHOD_HUGE: +#ifdef SHM_HUGETLB /* Allocate huge pages */ alloc_length = min_length; address = (flags & UCT_MD_MEM_FLAG_FIXED) ? addr : NULL; @@ -216,6 +226,9 @@ ucs_status_t uct_mem_alloc(void *addr, size_t min_length, unsigned flags, if (status == UCS_OK) { goto allocated_without_md; } +#else + status = UCS_ERR_NO_MEMORY; +#endif ucs_trace("failed to allocate %zu bytes from hugetlb: %s", min_length, ucs_status_string(status)); @@ -232,7 +245,7 @@ ucs_status_t uct_mem_alloc(void *addr, size_t min_length, unsigned flags, allocated_without_md: mem->md = NULL; - mem->mem_type = UCT_MD_MEM_TYPE_HOST; + mem->mem_type = UCS_MEMORY_TYPE_HOST; mem->memh = UCT_MEM_HANDLE_NULL; allocated: ucs_trace("allocated %zu bytes at %p using %s", alloc_length, address, @@ -354,7 +367,7 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_iface_mp_chunk_alloc, (mp, size_p, chunk_p), hdr->method = mem.method; hdr->length = mem.length; hdr->memh = mem.memh; - *size_p = mem.length - sizeof(*hdr); + *size_p = mem.length - sizeof(*hdr); *chunk_p = hdr + 1; return UCS_OK; } @@ -366,7 +379,7 @@ UCS_PROFILE_FUNC_VOID(uct_iface_mp_chunk_release, (mp, chunk), uct_iface_mp_chunk_hdr_t *hdr; uct_allocated_memory_t mem; - hdr = chunk - sizeof(*hdr); + hdr = UCS_PTR_BYTE_OFFSET(chunk, -sizeof(*hdr)); mem.address = hdr; mem.method = hdr->method; @@ -384,7 +397,7 @@ static void uct_iface_mp_obj_init(ucs_mpool_t *mp, void *obj, void *chunk) uct_iface_mp_chunk_hdr_t *hdr; init_obj_cb = 
uct_iface_mp_priv(mp)->init_obj_cb; - hdr = chunk - sizeof(*hdr); + hdr = UCS_PTR_BYTE_OFFSET(chunk, -sizeof(*hdr)); if (init_obj_cb != NULL) { init_obj_cb(&iface->super, obj, hdr->memh); } diff --git a/src/uct/base/uct_worker.c b/src/uct/base/uct_worker.c index e9da0de50f3..5f7ec093e87 100644 --- a/src/uct/base/uct_worker.c +++ b/src/uct/base/uct_worker.c @@ -6,6 +6,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "uct_worker.h" #include @@ -72,7 +76,7 @@ void uct_worker_progress_remove(uct_priv_worker_t *worker, uct_worker_progress_t { UCS_ASYNC_BLOCK(worker->async); ucs_assert(prog->refcount > 0); - if (ucs_atomic_fadd32(&prog->refcount, -1) == 1) { + if (ucs_atomic_fsub32(&prog->refcount, 1) == 1) { ucs_callbackq_remove(&worker->super.progress_q, prog->id); prog->id = UCS_CALLBACKQ_ID_NULL; } @@ -90,6 +94,9 @@ void uct_worker_progress_remove_all(uct_priv_worker_t *worker, if (ucs_atomic_cswap32(&prog->refcount, ref, 0) == ref) { ucs_callbackq_remove(&worker->super.progress_q, prog->id); prog->id = UCS_CALLBACKQ_ID_NULL; + break; /* coverity thinks that `UCS_CALLBACKQ_ID_NULL` + * can be passed to `ucs_callbackq_remove()` + * make coverity happy - return from the loop */ } ref = prog->refcount; } diff --git a/src/uct/base/uct_worker.h b/src/uct/base/uct_worker.h index 78abfb1a15f..d189ae1e96f 100644 --- a/src/uct/base/uct_worker.h +++ b/src/uct/base/uct_worker.h @@ -40,7 +40,7 @@ typedef struct uct_worker_progress { ({ \ uct_worker_tl_data_t *data; \ _type *result; \ - ucs_status_t status; \ + ucs_status_t _status; \ \ ucs_list_for_each(data, &(_worker)->tl_data, list) { \ if ((data->key == (_key)) && _cmp_fn(ucs_derived_of(data, _type), \ @@ -52,19 +52,19 @@ typedef struct uct_worker_progress { } \ \ if (&data->list == &(_worker)->tl_data) { /* not found */ \ - data = ucs_malloc(sizeof(_type), UCS_PP_QUOTE(_type)); \ - if (data == NULL) { \ + result = ucs_malloc(sizeof(_type), UCS_PP_QUOTE(_type)); \ + if 
(result == NULL) { \ result = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); \ } else { \ + data = (uct_worker_tl_data_t*)result;\ data->key = (_key); \ data->refcount = 1; \ - status = _init_fn(ucs_derived_of(data, _type), ## __VA_ARGS__); \ - if (status != UCS_OK) { \ - ucs_free(data); \ - result = UCS_STATUS_PTR(status); \ + _status = _init_fn(ucs_derived_of(data, _type), ## __VA_ARGS__); \ + if (_status != UCS_OK) { \ + ucs_free(result); \ + result = UCS_STATUS_PTR(_status); \ } else { \ ucs_list_add_tail(&(_worker)->tl_data, &data->list); \ - result = ucs_derived_of(data, _type); \ } \ } \ } else { \ diff --git a/src/uct/cuda/Makefile.am b/src/uct/cuda/Makefile.am index e316001ca0b..0992bb4d180 100644 --- a/src/uct/cuda/Makefile.am +++ b/src/uct/cuda/Makefile.am @@ -26,6 +26,7 @@ noinst_HEADERS = \ cuda_ipc/cuda_ipc_cache.h libuct_cuda_la_SOURCES = \ + base/cuda_iface.c \ base/cuda_md.c \ cuda_copy/cuda_copy_md.c \ cuda_copy/cuda_copy_iface.c \ diff --git a/src/uct/cuda/base/cuda_iface.c b/src/uct/cuda/base/cuda_iface.c new file mode 100644 index 00000000000..3babb7ef423 --- /dev/null +++ b/src/uct/cuda/base/cuda_iface.c @@ -0,0 +1,21 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "cuda_iface.h" + + +ucs_status_t +uct_cuda_base_query_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) +{ + return uct_single_device_resource(md, UCT_CUDA_DEV_NAME, UCT_DEVICE_TYPE_ACC, + tl_devices_p, num_tl_devices_p); +} + diff --git a/src/uct/cuda/base/cuda_iface.h b/src/uct/cuda/base/cuda_iface.h index 4891182a44b..b612205954d 100644 --- a/src/uct/cuda/base/cuda_iface.h +++ b/src/uct/cuda/base/cuda_iface.h @@ -6,19 +6,24 @@ #ifndef UCT_CUDA_IFACE_H #define UCT_CUDA_IFACE_H +#include #include #include #include -#define UCT_CUDA_FUNC(_func) \ + +#define UCT_CUDA_DEV_NAME "cuda" + + +#define UCT_CUDA_FUNC(_func, _log_level) \ ({ \ ucs_status_t _status = UCS_OK; \ do { \ cudaError_t _result = (_func); \ if (cudaSuccess != _result) { \ - ucs_error("%s is failed. ret:%s", \ - UCS_PP_MAKE_STRING(_func), \ - cudaGetErrorString(_result)); \ + ucs_log((_log_level), "%s() failed: %s", \ + UCS_PP_MAKE_STRING(_func), \ + cudaGetErrorString(_result)); \ _status = UCS_ERR_IO_ERROR; \ } \ } while (0); \ @@ -26,7 +31,11 @@ }) -#define UCT_CUDADRV_FUNC(_func) \ +#define UCT_CUDA_FUNC_LOG_ERR(_func) \ + UCT_CUDA_FUNC(_func, UCS_LOG_LEVEL_ERROR) + + +#define UCT_CUDADRV_FUNC(_func, _log_level) \ ({ \ ucs_status_t _status = UCS_OK; \ do { \ @@ -36,12 +45,45 @@ _status = UCS_INPROGRESS; \ } else if (CUDA_SUCCESS != _result) { \ cuGetErrorString(_result, &cu_err_str); \ - ucs_error("%s is failed. 
ret:%s", \ - UCS_PP_MAKE_STRING(_func),cu_err_str);\ + ucs_log((_log_level), "%s() failed: %s", \ + UCS_PP_MAKE_STRING(_func), cu_err_str); \ _status = UCS_ERR_IO_ERROR; \ } \ } while (0); \ _status; \ }) + +#define UCT_CUDADRV_FUNC_LOG_ERR(_func) \ + UCT_CUDADRV_FUNC(_func, UCS_LOG_LEVEL_ERROR) + + +#define UCT_CUDADRV_CTX_ACTIVE(_state) \ + { \ + CUcontext cur_ctx; \ + CUdevice dev; \ + unsigned flags; \ + \ + _state = 0; \ + /* avoid active state check if no cuda activity */ \ + if ((CUDA_SUCCESS == cuCtxGetCurrent(&cur_ctx)) && \ + (NULL != cur_ctx)) { \ + UCT_CUDADRV_FUNC_LOG_ERR(cuCtxGetDevice(&dev)); \ + UCT_CUDADRV_FUNC_LOG_ERR(cuDevicePrimaryCtxGetState(dev, &flags, \ + &_state)); \ + } \ + } + + +typedef enum uct_cuda_base_gen { + UCT_CUDA_BASE_GEN_P100 = 6, + UCT_CUDA_BASE_GEN_V100 = 7, + UCT_CUDA_BASE_GEN_A100 = 8 +} uct_cuda_base_gen_t; + + +ucs_status_t +uct_cuda_base_query_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p); + #endif diff --git a/src/uct/cuda/base/cuda_md.c b/src/uct/cuda/base/cuda_md.c index a4c1749cf21..375c2f3111c 100644 --- a/src/uct/cuda/base/cuda_md.c +++ b/src/uct/cuda/base/cuda_md.c @@ -1,30 +1,75 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2018. ALL RIGHTS RESERVED. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "cuda_md.h" #include +#include +#include #include #include -int uct_cuda_is_mem_type_owned(uct_md_h md, void *addr, size_t length) +UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_base_detect_memory_type, + (md, addr, length, mem_type_p), + uct_md_h md, const void *addr, size_t length, + ucs_memory_type_t *mem_type_p) { - CUmemorytype memType = 0; - uint32_t isManaged = 0; + CUmemorytype memType = (CUmemorytype)0; + uint32_t isManaged = 0; + unsigned value = 1; void *attrdata[] = {(void *)&memType, (void *)&isManaged}; CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE, CU_POINTER_ATTRIBUTE_IS_MANAGED}; CUresult cu_err; + const char *cu_err_str; if (addr == NULL) { - return 0; + *mem_type_p = UCS_MEMORY_TYPE_HOST; + return UCS_OK; } cu_err = cuPointerGetAttributes(2, attributes, attrdata, (CUdeviceptr)addr); - return ((cu_err == CUDA_SUCCESS) && (!isManaged && (memType == CU_MEMORYTYPE_DEVICE))); + if ((cu_err == CUDA_SUCCESS) && (memType == CU_MEMORYTYPE_DEVICE)) { + if (isManaged) { + *mem_type_p = UCS_MEMORY_TYPE_CUDA_MANAGED; + } else { + *mem_type_p = UCS_MEMORY_TYPE_CUDA; + cu_err = cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + (CUdeviceptr)addr); + if (cu_err != CUDA_SUCCESS) { + cuGetErrorString(cu_err, &cu_err_str); + ucs_warn("cuPointerSetAttribute(%p) error: %s", (void*) addr, cu_err_str); + } + } + return UCS_OK; + } + + return UCS_ERR_INVALID_ADDR; +} + +ucs_status_t +uct_cuda_base_query_md_resources(uct_component_t *component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) +{ + cudaError_t cudaErr; + int num_gpus; + + cudaErr = cudaGetDeviceCount(&num_gpus); + if ((cudaErr != cudaSuccess) || (num_gpus == 0)) { + return uct_md_query_empty_md_resource(resources_p, num_resources_p); + } + + return uct_md_query_single_md_resource(component, resources_p, + num_resources_p); } UCS_MODULE_INIT() { diff --git a/src/uct/cuda/base/cuda_md.h 
b/src/uct/cuda/base/cuda_md.h index 050848a17a1..f68d7035a38 100644 --- a/src/uct/cuda/base/cuda_md.h +++ b/src/uct/cuda/base/cuda_md.h @@ -8,6 +8,13 @@ #include -int uct_cuda_is_mem_type_owned(uct_md_h md, void *addr, size_t length); +ucs_status_t uct_cuda_base_detect_memory_type(uct_md_h md, const void *addr, + size_t length, + ucs_memory_type_t *mem_type_p); + +ucs_status_t +uct_cuda_base_query_md_resources(uct_component_t *component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p); #endif diff --git a/src/uct/cuda/configure.m4 b/src/uct/cuda/configure.m4 index 816bf55199d..f3f942173f5 100644 --- a/src/uct/cuda/configure.m4 +++ b/src/uct/cuda/configure.m4 @@ -5,7 +5,7 @@ UCX_CHECK_CUDA -AS_IF([test "x$cuda_happy" = "xyes"], [uct_modules+=":cuda"]) +AS_IF([test "x$cuda_happy" = "xyes"], [uct_modules="${uct_modules}:cuda"]) uct_cuda_modules="" m4_include([src/uct/cuda/gdr_copy/configure.m4]) AC_DEFINE_UNQUOTED([uct_cuda_MODULES], ["${uct_cuda_modules}"], [CUDA loadable modules]) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_ep.c b/src/uct/cuda/cuda_copy/cuda_copy_ep.c index b7ade667f28..f6d2c4c9c31 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_ep.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_ep.c @@ -3,10 +3,16 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "cuda_copy_ep.h" #include "cuda_copy_iface.h" #include +#include +#include #include #include #include @@ -35,9 +41,19 @@ UCS_CLASS_DEFINE_DELETE_FUNC(uct_cuda_copy_ep_t, uct_ep_t); ucs_trace_data(_fmt " to %"PRIx64"(%+ld)", ## __VA_ARGS__, (_remote_addr), \ (_rkey)) +#define UCT_CUDA_COPY_CHECK_AND_CREATE_STREAM(_strm) \ + if ((_strm) == 0) { \ + ucs_status_t __status; \ + __status = UCT_CUDA_FUNC_LOG_ERR(cudaStreamCreateWithFlags(&(_strm), \ + cudaStreamNonBlocking)); \ + if (UCS_OK != __status) { \ + return UCS_ERR_IO_ERROR; \ + } \ + } + static UCS_F_ALWAYS_INLINE ucs_status_t uct_cuda_copy_post_cuda_async_copy(uct_ep_h tl_ep, void *dst, void *src, size_t length, - int direction, cudaStream_t stream, + enum cudaMemcpyKind direction, cudaStream_t stream, ucs_queue_head_t *outstanding_queue, uct_completion_t *comp) { @@ -55,12 +71,13 @@ uct_cuda_copy_post_cuda_async_copy(uct_ep_h tl_ep, void *dst, void *src, size_t return UCS_ERR_NO_MEMORY; } - status = UCT_CUDA_FUNC(cudaMemcpyAsync(dst, src, length, direction, stream)); + status = UCT_CUDA_FUNC_LOG_ERR(cudaMemcpyAsync(dst, src, length, direction, + stream)); if (UCS_OK != status) { return UCS_ERR_IO_ERROR; } - status = UCT_CUDA_FUNC(cudaEventRecord(cuda_event->event, stream)); + status = UCT_CUDA_FUNC_LOG_ERR(cudaEventRecord(cuda_event->event, stream)); if (UCS_OK != status) { return UCS_ERR_IO_ERROR; } @@ -72,25 +89,24 @@ uct_cuda_copy_post_cuda_async_copy(uct_ep_h tl_ep, void *dst, void *src, size_t return UCS_INPROGRESS; } -ucs_status_t uct_cuda_copy_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp) +UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_copy_ep_get_zcopy, + (tl_ep, iov, iovcnt, remote_addr, rkey, comp), + uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, + uint64_t remote_addr, uct_rkey_t rkey, + uct_completion_t *comp) { uct_cuda_copy_iface_t *iface = 
ucs_derived_of(tl_ep->iface, uct_cuda_copy_iface_t); ucs_status_t status; - if (iface->stream_d2h == 0) { - status = UCT_CUDA_FUNC(cudaStreamCreateWithFlags(&iface->stream_d2h, - cudaStreamNonBlocking)); - if (UCS_OK != status) { - return UCS_ERR_IO_ERROR; - } - } + UCT_CUDA_COPY_CHECK_AND_CREATE_STREAM(iface->stream_d2h); status = uct_cuda_copy_post_cuda_async_copy(tl_ep, iov[0].buffer, (void *)remote_addr, iov[0].length, cudaMemcpyDeviceToHost, iface->stream_d2h, &iface->outstanding_d2h_cuda_event_q, comp); + if (!UCS_STATUS_IS_ERR(status)) { + VALGRIND_MAKE_MEM_DEFINED(iov[0].buffer, iov[0].length); + } UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, ZCOPY, uct_iov_total_length(iov, iovcnt)); @@ -99,21 +115,17 @@ ucs_status_t uct_cuda_copy_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, si return status; } -ucs_status_t uct_cuda_copy_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp) +UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_copy_ep_put_zcopy, + (tl_ep, iov, iovcnt, remote_addr, rkey, comp), + uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, + uint64_t remote_addr, uct_rkey_t rkey, + uct_completion_t *comp) { uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cuda_copy_iface_t); ucs_status_t status; - if (iface->stream_h2d == 0) { - status = UCT_CUDA_FUNC(cudaStreamCreateWithFlags(&iface->stream_h2d, - cudaStreamNonBlocking)); - if (UCS_OK != status) { - return UCS_ERR_IO_ERROR; - } - } + UCT_CUDA_COPY_CHECK_AND_CREATE_STREAM(iface->stream_h2d); status = uct_cuda_copy_post_cuda_async_copy(tl_ep, (void *)remote_addr, iov[0].buffer, iov[0].length, cudaMemcpyHostToDevice, @@ -122,21 +134,25 @@ ucs_status_t uct_cuda_copy_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, si UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), PUT, ZCOPY, uct_iov_total_length(iov, iovcnt)); - uct_cuda_copy_trace_data(remote_addr, rkey, "GET_ZCOPY [length %zu]", + 
uct_cuda_copy_trace_data(remote_addr, rkey, "PUT_ZCOPY [length %zu]", uct_iov_total_length(iov, iovcnt)); return status; } - -ucs_status_t uct_cuda_copy_ep_put_short(uct_ep_h tl_ep, const void *buffer, - unsigned length, uint64_t remote_addr, - uct_rkey_t rkey) +UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_copy_ep_put_short, + (tl_ep, buffer, length, remote_addr, rkey), + uct_ep_h tl_ep, const void *buffer, unsigned length, + uint64_t remote_addr, uct_rkey_t rkey) { + uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cuda_copy_iface_t); ucs_status_t status; - status = UCT_CUDA_FUNC(cudaMemcpy((void *)remote_addr, buffer, - length, cudaMemcpyHostToDevice)); + UCT_CUDA_COPY_CHECK_AND_CREATE_STREAM(iface->stream_h2d); + + UCT_CUDA_FUNC_LOG_ERR(cudaMemcpyAsync((void*)remote_addr, buffer, length, + cudaMemcpyHostToDevice, iface->stream_h2d)); + status = UCT_CUDA_FUNC_LOG_ERR(cudaStreamSynchronize(iface->stream_h2d)); UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), PUT, SHORT, length); ucs_trace_data("PUT_SHORT size %d from %p to %p", @@ -144,14 +160,20 @@ ucs_status_t uct_cuda_copy_ep_put_short(uct_ep_h tl_ep, const void *buffer, return status; } -ucs_status_t uct_cuda_copy_ep_get_short(uct_ep_h tl_ep, void *buffer, - unsigned length, uint64_t remote_addr, - uct_rkey_t rkey) +UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_copy_ep_get_short, + (tl_ep, buffer, length, remote_addr, rkey), + uct_ep_h tl_ep, void *buffer, unsigned length, + uint64_t remote_addr, uct_rkey_t rkey) { + uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cuda_copy_iface_t); ucs_status_t status; - status = UCT_CUDA_FUNC(cudaMemcpy(buffer, (void *)remote_addr, - length, cudaMemcpyDeviceToHost)); + UCT_CUDA_COPY_CHECK_AND_CREATE_STREAM(iface->stream_d2h); + + UCT_CUDA_FUNC_LOG_ERR(cudaMemcpyAsync(buffer, (void*)remote_addr, length, + cudaMemcpyDeviceToHost, + iface->stream_d2h)); + status = UCT_CUDA_FUNC_LOG_ERR(cudaStreamSynchronize(iface->stream_d2h)); 
UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, SHORT, length); ucs_trace_data("GET_SHORT size %d from %p to %p", diff --git a/src/uct/cuda/cuda_copy/cuda_copy_iface.c b/src/uct/cuda/cuda_copy/cuda_copy_iface.c index 86e0dc10a2b..912252c0caf 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_iface.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_iface.c @@ -3,10 +3,15 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "cuda_copy_iface.h" #include "cuda_copy_md.h" #include "cuda_copy_ep.h" +#include #include #include #include @@ -22,7 +27,7 @@ static ucs_config_field_t uct_cuda_copy_iface_config_table[] = { "Max number of event completions to pick during cuda events polling", ucs_offsetof(uct_cuda_copy_iface_config_t, max_poll), UCS_CONFIG_TYPE_UINT}, - {"MAX_EVENTS", "1024", + {"MAX_EVENTS", "inf", "Max number of cuda events. -1 is infinite", ucs_offsetof(uct_cuda_copy_iface_config_t, max_cuda_events), UCS_CONFIG_TYPE_UINT}, @@ -53,10 +58,12 @@ static int uct_cuda_copy_iface_is_reachable(const uct_iface_h tl_iface, return (addr != NULL) && (iface->id == *addr); } -static ucs_status_t uct_cuda_copy_iface_query(uct_iface_h iface, +static ucs_status_t uct_cuda_copy_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { - memset(iface_attr, 0, sizeof(uct_iface_attr_t)); + uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_copy_iface_t); + + uct_base_iface_query(&iface->super, iface_attr); iface_attr->iface_addr_len = sizeof(uct_cuda_copy_iface_addr_t); iface_attr->device_addr_len = 0; @@ -93,9 +100,9 @@ static ucs_status_t uct_cuda_copy_iface_query(uct_iface_h iface, iface_attr->cap.am.max_hdr = 0; iface_attr->cap.am.max_iov = 1; - iface_attr->latency.overhead = 10e-6; /* 10 us */ - iface_attr->latency.growth = 0; - iface_attr->bandwidth = 6911 * 1024.0 * 1024.0; + iface_attr->latency = ucs_linear_func_make(10e-6, 0); + iface_attr->bandwidth.dedicated = 0; + iface_attr->bandwidth.shared = 6911.0 * 
UCS_MBYTE; iface_attr->overhead = 0; iface_attr->priority = 0; @@ -179,7 +186,7 @@ static uct_iface_ops_t uct_cuda_copy_iface_ops = { .iface_progress = uct_cuda_copy_iface_progress, .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_copy_iface_t), .iface_query = uct_cuda_copy_iface_query, - .iface_get_device_address = (void*)ucs_empty_function_return_success, + .iface_get_device_address = (uct_iface_get_device_address_func_t)ucs_empty_function_return_success, .iface_get_address = uct_cuda_copy_iface_get_address, .iface_is_reachable = uct_cuda_copy_iface_is_reachable, }; @@ -190,8 +197,8 @@ static void uct_cuda_copy_event_desc_init(ucs_mpool_t *mp, void *obj, void *chun ucs_status_t status; memset(base, 0 , sizeof(*base)); - status = UCT_CUDA_FUNC(cudaEventCreateWithFlags(&(base->event), - cudaEventDisableTiming)); + status = UCT_CUDA_FUNC_LOG_ERR(cudaEventCreateWithFlags(&base->event, + cudaEventDisableTiming)); if (UCS_OK != status) { ucs_error("cudaEventCreateWithFlags Failed"); } @@ -200,11 +207,12 @@ static void uct_cuda_copy_event_desc_init(ucs_mpool_t *mp, void *obj, void *chun static void uct_cuda_copy_event_desc_cleanup(ucs_mpool_t *mp, void *obj) { uct_cuda_copy_event_desc_t *base = (uct_cuda_copy_event_desc_t *) obj; - ucs_status_t status; + int active; - status = UCT_CUDA_FUNC(cudaEventDestroy(base->event)); - if (UCS_OK != status) { - ucs_error("cudaEventDestroy Failed"); + UCT_CUDADRV_CTX_ACTIVE(active); + + if (active) { + UCT_CUDA_FUNC_LOG_ERR(cudaEventDestroy(base->event)); } } @@ -225,7 +233,7 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, uct_md_h md, uct_worker_h work UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_cuda_copy_iface_ops, md, worker, params, tl_config UCS_STATS_ARG(params->stats_root) - UCS_STATS_ARG(UCT_CUDA_COPY_TL_NAME)); + UCS_STATS_ARG("cuda_copy")); if (strncmp(params->mode.device.dev_name, UCT_CUDA_DEV_NAME, strlen(UCT_CUDA_DEV_NAME)) != 0) { @@ -263,14 +271,20 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, 
uct_md_h md, uct_worker_h work static UCS_CLASS_CLEANUP_FUNC(uct_cuda_copy_iface_t) { + int active; + + UCT_CUDADRV_CTX_ACTIVE(active); + uct_base_iface_progress_disable(&self->super.super, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); - if (self->stream_h2d != 0) { - UCT_CUDA_FUNC(cudaStreamDestroy(self->stream_h2d)); - } + if (active) { + if (self->stream_h2d != 0) { + UCT_CUDA_FUNC_LOG_ERR(cudaStreamDestroy(self->stream_h2d)); + } - if (self->stream_d2h != 0) { - UCT_CUDA_FUNC(cudaStreamDestroy(self->stream_d2h)); + if (self->stream_d2h != 0) { + UCT_CUDA_FUNC_LOG_ERR(cudaStreamDestroy(self->stream_d2h)); + } } ucs_mpool_cleanup(&self->cuda_event_desc, 1); @@ -282,34 +296,6 @@ UCS_CLASS_DEFINE_NEW_FUNC(uct_cuda_copy_iface_t, uct_iface_t, uct_md_h, uct_work static UCS_CLASS_DEFINE_DELETE_FUNC(uct_cuda_copy_iface_t, uct_iface_t); -static ucs_status_t uct_cuda_copy_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) -{ - uct_tl_resource_desc_t *resource; - - resource = ucs_calloc(1, sizeof(uct_tl_resource_desc_t), "resource desc"); - if (NULL == resource) { - ucs_error("Failed to allocate memory"); - return UCS_ERR_NO_MEMORY; - } - - ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", - UCT_CUDA_COPY_TL_NAME); - ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", - UCT_CUDA_DEV_NAME); - resource->dev_type = UCT_DEVICE_TYPE_ACC; - - *num_resources_p = 1; - *resource_p = resource; - return UCS_OK; -} - -UCT_TL_COMPONENT_DEFINE(uct_cuda_copy_tl, - uct_cuda_copy_query_tl_resources, - uct_cuda_copy_iface_t, - UCT_CUDA_COPY_TL_NAME, - "CUDA_COPY_", - uct_cuda_copy_iface_config_table, - uct_cuda_copy_iface_config_t); -UCT_MD_REGISTER_TL(&uct_cuda_copy_md_component, &uct_cuda_copy_tl); +UCT_TL_DEFINE(&uct_cuda_copy_component, cuda_copy, uct_cuda_base_query_devices, + uct_cuda_copy_iface_t, "CUDA_COPY_", + uct_cuda_copy_iface_config_table, uct_cuda_copy_iface_config_t); diff --git 
a/src/uct/cuda/cuda_copy/cuda_copy_iface.h b/src/uct/cuda/cuda_copy/cuda_copy_iface.h index 7c8c5de8054..7ca6d512efb 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_iface.h +++ b/src/uct/cuda/cuda_copy/cuda_copy_iface.h @@ -10,10 +10,6 @@ #include -#define UCT_CUDA_COPY_TL_NAME "cuda_copy" -#define UCT_CUDA_DEV_NAME "cudacopy0" - - typedef uint64_t uct_cuda_copy_iface_addr_t; diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index af6e17b6d28..b64b414413d 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -1,8 +1,12 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2017-2018. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2017-2019. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "cuda_copy_md.h" #include @@ -11,6 +15,8 @@ #include #include #include +#include +#include #include #include @@ -24,14 +30,15 @@ static ucs_config_field_t uct_cuda_copy_md_config_table[] = { static ucs_status_t uct_cuda_copy_md_query(uct_md_h md, uct_md_attr_t *md_attr) { - md_attr->cap.flags = UCT_MD_FLAG_REG; - md_attr->cap.reg_mem_types = UCS_BIT(UCT_MD_MEM_TYPE_HOST); - md_attr->cap.mem_type = UCT_MD_MEM_TYPE_CUDA; - md_attr->cap.max_alloc = 0; - md_attr->cap.max_reg = ULONG_MAX; - md_attr->rkey_packed_size = 0; - md_attr->reg_cost.overhead = 0; - md_attr->reg_cost.growth = 0; + md_attr->cap.flags = UCT_MD_FLAG_REG; + md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); + md_attr->cap.access_mem_type = UCS_MEMORY_TYPE_CUDA; + md_attr->cap.detect_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA) | + UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = ULONG_MAX; + md_attr->rkey_packed_size = 0; + md_attr->reg_cost = ucs_linear_func_make(0, 0); memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); return UCS_OK; } @@ -42,8 +49,9 @@ static ucs_status_t 
uct_cuda_copy_mkey_pack(uct_md_h md, uct_mem_h memh, return UCS_OK; } -static ucs_status_t uct_cuda_copy_rkey_unpack(uct_md_component_t *mdc, - const void *rkey_buffer, uct_rkey_t *rkey_p, +static ucs_status_t uct_cuda_copy_rkey_unpack(uct_component_t *component, + const void *rkey_buffer, + uct_rkey_t *rkey_p, void **handle_p) { *rkey_p = 0xdeadbeef; @@ -51,62 +59,64 @@ static ucs_status_t uct_cuda_copy_rkey_unpack(uct_md_component_t *mdc, return UCS_OK; } -static ucs_status_t uct_cuda_copy_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, - void *handle) +static ucs_status_t uct_cuda_copy_rkey_release(uct_component_t *component, + uct_rkey_t rkey, void *handle) { return UCS_OK; } -static ucs_status_t uct_cuda_copy_mem_reg(uct_md_h md, void *address, size_t length, - unsigned flags, uct_mem_h *memh_p) +UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_copy_mem_reg, + (md, address, length, flags, memh_p), + uct_md_h md, void *address, size_t length, + unsigned flags, uct_mem_h *memh_p) { - cudaError_t cuerr = cudaSuccess; + ucs_log_level_t log_level; + CUmemorytype memType; + CUresult result; + ucs_status_t status; - if(address == NULL) { + if (address == NULL) { *memh_p = address; return UCS_OK; } - cuerr = cudaHostRegister(address, length, cudaHostRegisterPortable); - if (cuerr != cudaSuccess) { - return UCS_ERR_IO_ERROR; + result = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + (CUdeviceptr)(address)); + if ((result == CUDA_SUCCESS) && (memType == CU_MEMORYTYPE_HOST)) { + /* memory is allocated with cudaMallocHost which is already registered */ + *memh_p = NULL; + return UCS_OK; + } + + log_level = (flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ? 
UCS_LOG_LEVEL_DEBUG : + UCS_LOG_LEVEL_ERROR; + status = UCT_CUDA_FUNC(cudaHostRegister(address, length, + cudaHostRegisterPortable), + log_level); + if (status != UCS_OK) { + return status; } *memh_p = address; return UCS_OK; } -static ucs_status_t uct_cuda_copy_mem_dereg(uct_md_h md, uct_mem_h memh) +UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_copy_mem_dereg, + (md, memh), uct_md_h md, uct_mem_h memh) { void *address = (void *)memh; - cudaError_t cuerr; + ucs_status_t status; if (address == NULL) { return UCS_OK; } - cuerr = cudaHostUnregister(address); - if (cuerr != cudaSuccess) { - return UCS_ERR_IO_ERROR; - } - return UCS_OK; -} - -static ucs_status_t uct_cuda_copy_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) -{ - int num_gpus; - cudaError_t cudaErr; - - cudaErr = cudaGetDeviceCount(&num_gpus); - if ((cudaErr!= cudaSuccess) || (num_gpus == 0)) { - ucs_debug("Not found cuda devices"); - *resources_p = NULL; - *num_resources_p = 0; - return UCS_OK; + status = UCT_CUDA_FUNC_LOG_ERR(cudaHostUnregister(address)); + if (status != UCS_OK) { + return status; } - return uct_single_md_resource(&uct_cuda_copy_md_component, resources_p, num_resources_p); + return UCS_OK; } static void uct_cuda_copy_md_close(uct_md_h uct_md) { @@ -116,33 +126,49 @@ static void uct_cuda_copy_md_close(uct_md_h uct_md) { } static uct_md_ops_t md_ops = { - .close = uct_cuda_copy_md_close, - .query = uct_cuda_copy_md_query, - .mkey_pack = uct_cuda_copy_mkey_pack, - .mem_reg = uct_cuda_copy_mem_reg, - .mem_dereg = uct_cuda_copy_mem_dereg, - .is_mem_type_owned = uct_cuda_is_mem_type_owned, + .close = uct_cuda_copy_md_close, + .query = uct_cuda_copy_md_query, + .mkey_pack = uct_cuda_copy_mkey_pack, + .mem_reg = uct_cuda_copy_mem_reg, + .mem_dereg = uct_cuda_copy_mem_dereg, + .detect_memory_type = uct_cuda_base_detect_memory_type, }; -static ucs_status_t uct_cuda_copy_md_open(const char *md_name, const uct_md_config_t *md_config, - uct_md_h *md_p) +static 
ucs_status_t +uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, + const uct_md_config_t *config, uct_md_h *md_p) { uct_cuda_copy_md_t *md; md = ucs_malloc(sizeof(uct_cuda_copy_md_t), "uct_cuda_copy_md_t"); if (NULL == md) { - ucs_error("Failed to allocate memory for uct_cuda_copy_md_t"); + ucs_error("failed to allocate memory for uct_cuda_copy_md_t"); return UCS_ERR_NO_MEMORY; } - md->super.ops = &md_ops; - md->super.component = &uct_cuda_copy_md_component; - - *md_p = (uct_md_h) md; + md->super.ops = &md_ops; + md->super.component = &uct_cuda_copy_component; + *md_p = (uct_md_h)md; return UCS_OK; } -UCT_MD_COMPONENT_DEFINE(uct_cuda_copy_md_component, UCT_CUDA_COPY_MD_NAME, - uct_cuda_copy_query_md_resources, uct_cuda_copy_md_open, NULL, - uct_cuda_copy_rkey_unpack, uct_cuda_copy_rkey_release, "CUDA_COPY_", - uct_cuda_copy_md_config_table, uct_cuda_copy_md_config_t); +uct_component_t uct_cuda_copy_component = { + .query_md_resources = uct_cuda_base_query_md_resources, + .md_open = uct_cuda_copy_md_open, + .cm_open = ucs_empty_function_return_unsupported, + .rkey_unpack = uct_cuda_copy_rkey_unpack, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = uct_cuda_copy_rkey_release, + .name = "cuda_cpy", + .md_config = { + .name = "Cuda-copy memory domain", + .prefix = "CUDA_COPY_", + .table = uct_cuda_copy_md_config_table, + .size = sizeof(uct_cuda_copy_md_config_t), + }, + .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_cuda_copy_component), + .flags = 0 +}; +UCT_COMPONENT_REGISTER(&uct_cuda_copy_component); + diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.h b/src/uct/cuda/cuda_copy/cuda_copy_md.h index bd50206e13d..f73e625982a 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.h +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.h @@ -9,9 +9,8 @@ #include #include -#define UCT_CUDA_COPY_MD_NAME "cuda_cpy" -extern uct_md_component_t uct_cuda_copy_md_component; +extern uct_component_t 
uct_cuda_copy_component; /** * @brief cuda_copy MD descriptor diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c index becb90a8909..d6151d52edc 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "cuda_ipc_cache.h" #include #include @@ -13,8 +17,13 @@ static ucs_pgt_dir_t *uct_cuda_ipc_cache_pgt_dir_alloc(const ucs_pgtable_t *pgtable) { - return ucs_memalign(UCS_PGT_ENTRY_MIN_ALIGN, sizeof(ucs_pgt_dir_t), - "cuda_ipc_cache_pgdir"); + void *ptr; + int ret; + + ret = ucs_posix_memalign(&ptr, + ucs_max(sizeof(void *), UCS_PGT_ENTRY_MIN_ALIGN), + sizeof(ucs_pgt_dir_t), "cuda_ipc_cache_pgdir"); + return (ret == 0) ? ptr : NULL; } static void uct_cuda_ipc_cache_pgt_dir_release(const ucs_pgtable_t *pgtable, @@ -44,7 +53,8 @@ static void uct_cuda_ipc_cache_purge(uct_cuda_ipc_cache_t *cache) ucs_pgtable_purge(&cache->pgtable, uct_cuda_ipc_cache_region_collect_callback, ®ion_list); ucs_list_for_each_safe(region, tmp, ®ion_list, list) { - UCT_CUDADRV_FUNC(cuIpcCloseMemHandle((CUdeviceptr)region->mapped_addr)); + UCT_CUDADRV_FUNC_LOG_ERR( + cuIpcCloseMemHandle((CUdeviceptr)region->mapped_addr)); ucs_free(region); } ucs_trace("%s: cuda ipc cache purged", cache->name); @@ -53,6 +63,7 @@ static void uct_cuda_ipc_cache_purge(uct_cuda_ipc_cache_t *cache) static ucs_status_t uct_cuda_ipc_open_memhandle(CUipcMemHandle memh, CUdeviceptr *mapped_addr) { + const char *cu_err_str; CUresult cuerr; cuerr = cuIpcOpenMemHandle(mapped_addr, memh, @@ -61,6 +72,10 @@ static ucs_status_t uct_cuda_ipc_open_memhandle(CUipcMemHandle memh, if (cuerr == CUDA_ERROR_ALREADY_MAPPED) { return UCS_ERR_ALREADY_EXISTS; } + + cuGetErrorString(cuerr, &cu_err_str); + ucs_error("cuIpcOpenMemHandle() failed: %s", cu_err_str); + return UCS_ERR_INVALID_PARAM; } @@ -85,22 +100,61 @@ static void 
uct_cuda_ipc_cache_invalidate_regions(uct_cuda_ipc_cache_t *cache, ucs_error("failed to remove address:%p from cache (%s)", (void *)region->key.d_bptr, ucs_status_string(status)); } - UCT_CUDADRV_FUNC(cuIpcCloseMemHandle((CUdeviceptr)region->mapped_addr)); + UCT_CUDADRV_FUNC_LOG_ERR( + cuIpcCloseMemHandle((CUdeviceptr)region->mapped_addr)); ucs_free(region); } ucs_trace("%s: closed memhandles in the range [%p..%p]", cache->name, from, to); } -ucs_status_t uct_cuda_ipc_cache_map_memhandle(void *arg, uct_cuda_ipc_key_t *key, - void **mapped_addr) +ucs_status_t uct_cuda_ipc_unmap_memhandle(void *rem_cache, uintptr_t d_bptr, + void *mapped_addr, int cache_enabled) +{ + uct_cuda_ipc_cache_t *cache = (uct_cuda_ipc_cache_t *) rem_cache; + ucs_status_t status = UCS_OK; + ucs_pgt_region_t *pgt_region; + uct_cuda_ipc_cache_region_t *region; + + /* use write lock because cache maybe modified */ + pthread_rwlock_wrlock(&cache->lock); + pgt_region = UCS_PROFILE_CALL(ucs_pgtable_lookup, &cache->pgtable, d_bptr); + ucs_assert(pgt_region != NULL); + region = ucs_derived_of(pgt_region, uct_cuda_ipc_cache_region_t); + + ucs_assert(region->refcount >= 1); + region->refcount--; + + /* + * check refcount to see if an in-flight transfer is using the same mapping + */ + if (!region->refcount && !cache_enabled) { + status = ucs_pgtable_remove(&cache->pgtable, ®ion->super); + if (status != UCS_OK) { + ucs_error("failed to remove address:%p from cache (%s)", + (void *)region->key.d_bptr, ucs_status_string(status)); + } + ucs_assert(region->mapped_addr == mapped_addr); + status = UCT_CUDADRV_FUNC_LOG_ERR( + cuIpcCloseMemHandle((CUdeviceptr)region->mapped_addr)); + ucs_free(region); + } + + pthread_rwlock_unlock(&cache->lock); + return status; +} + +UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_map_memhandle, + (arg, key, mapped_addr), + void *arg, uct_cuda_ipc_key_t *key, void **mapped_addr) { uct_cuda_ipc_cache_t *cache = (uct_cuda_ipc_cache_t *)arg; ucs_status_t status; ucs_pgt_region_t 
*pgt_region; uct_cuda_ipc_cache_region_t *region; + int ret; - pthread_rwlock_rdlock(&cache->lock); + pthread_rwlock_wrlock(&cache->lock); pgt_region = UCS_PROFILE_CALL(ucs_pgtable_lookup, &cache->pgtable, key->d_bptr); if (ucs_likely(pgt_region != NULL)) { @@ -113,6 +167,8 @@ ucs_status_t uct_cuda_ipc_cache_map_memhandle(void *arg, uct_cuda_ipc_key_t *key key->b_len, UCS_PGT_REGION_ARG(®ion->super)); *mapped_addr = region->mapped_addr; + ucs_assert(region->refcount < UINT64_MAX); + region->refcount++; pthread_rwlock_unlock(&cache->lock); return UCS_OK; } else { @@ -129,8 +185,8 @@ ucs_status_t uct_cuda_ipc_cache_map_memhandle(void *arg, uct_cuda_ipc_key_t *key } /* close memhandle */ - UCT_CUDADRV_FUNC(cuIpcCloseMemHandle((CUdeviceptr) - region->mapped_addr)); + UCT_CUDADRV_FUNC_LOG_ERR( + cuIpcCloseMemHandle((CUdeviceptr)region->mapped_addr)); ucs_free(region); } } @@ -140,7 +196,8 @@ ucs_status_t uct_cuda_ipc_cache_map_memhandle(void *arg, uct_cuda_ipc_key_t *key if (ucs_likely(status == UCS_ERR_ALREADY_EXISTS)) { /* unmap all overlapping regions and retry*/ uct_cuda_ipc_cache_invalidate_regions(cache, (void *)key->d_bptr, - (void *)key->d_bptr + key->b_len); + UCS_PTR_BYTE_OFFSET(key->d_bptr, + key->b_len)); status = uct_cuda_ipc_open_memhandle(key->ph, (CUdeviceptr *)mapped_addr); if (ucs_unlikely(status != UCS_OK)) { if (ucs_likely(status == UCS_ERR_ALREADY_EXISTS)) { @@ -165,10 +222,11 @@ ucs_status_t uct_cuda_ipc_cache_map_memhandle(void *arg, uct_cuda_ipc_key_t *key } /*create new cache entry */ - region = ucs_memalign(UCS_PGT_ENTRY_MIN_ALIGN, - sizeof(uct_cuda_ipc_cache_region_t), - "uct_cuda_ipc_cache_region"); - if (region == NULL) { + ret = ucs_posix_memalign((void **)®ion, + ucs_max(sizeof(void *), UCS_PGT_ENTRY_MIN_ALIGN), + sizeof(uct_cuda_ipc_cache_region_t), + "uct_cuda_ipc_cache_region"); + if (ret != 0) { ucs_warn("failed to allocate uct_cuda_ipc_cache region"); status = UCS_ERR_NO_MEMORY; goto err; @@ -180,6 +238,7 @@ ucs_status_t 
uct_cuda_ipc_cache_map_memhandle(void *arg, uct_cuda_ipc_key_t *key UCS_PGT_ADDR_ALIGN); region->key = *key; region->mapped_addr = *mapped_addr; + region->refcount = 1; status = UCS_PROFILE_CALL(ucs_pgtable_insert, &cache->pgtable, ®ion->super); diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.h b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.h index fa5f8677711..588f5c97c11 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.h +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.h @@ -26,6 +26,7 @@ struct uct_cuda_ipc_cache_region { ucs_list_link_t list; /**< List element */ uct_cuda_ipc_key_t key; /**< Remote memory key */ void *mapped_addr; /**< Local mapped address */ + uint64_t refcount; /**< Track inflight ops before unmapping*/ }; @@ -43,6 +44,8 @@ ucs_status_t uct_cuda_ipc_create_cache(uct_cuda_ipc_cache_t **cache, void uct_cuda_ipc_destroy_cache(uct_cuda_ipc_cache_t *cache); -ucs_status_t uct_cuda_ipc_cache_map_memhandle(void *arg, uct_cuda_ipc_key_t *key, - void **mapped_addr); +ucs_status_t uct_cuda_ipc_map_memhandle(void *arg, uct_cuda_ipc_key_t *key, + void **mapped_addr); +ucs_status_t uct_cuda_ipc_unmap_memhandle(void *rem_cache, uintptr_t d_bptr, + void *mapped_addr, int cache_enabled); #endif diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c b/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c index 094b3a2e354..886ffbcb4ca 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c @@ -4,14 +4,20 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "cuda_ipc_ep.h" #include "cuda_ipc_iface.h" #include "cuda_ipc_md.h" #include +#include #include #include #include +#include #define UCT_CUDA_IPC_PUT 0 #define UCT_CUDA_IPC_GET 1 @@ -27,15 +33,15 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_ipc_ep_t, const uct_ep_params_t *params) UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super); self->remote_memh_cache = NULL; - if (iface->config.enable_cache) { - snprintf(target_name, sizeof(target_name), "dest:%d", - *(pid_t*)params->iface_addr); - status = uct_cuda_ipc_create_cache(&self->remote_memh_cache, target_name); - if (status != UCS_OK) { - ucs_error("could not create create cuda ipc cache: %s", - ucs_status_string(status)); - return status; - } + /* create a cache by default; disabling implies remove mapping immediately + * after use */ + snprintf(target_name, sizeof(target_name), "dest:%d", + *(pid_t*)params->iface_addr); + status = uct_cuda_ipc_create_cache(&self->remote_memh_cache, target_name); + if (status != UCS_OK) { + ucs_error("could not create create cuda ipc cache: %s", + ucs_status_string(status)); + return status; } return UCS_OK; @@ -69,7 +75,6 @@ uct_cuda_ipc_post_cuda_async_copy(uct_ep_h tl_ep, uint64_t remote_addr, ucs_queue_head_t *outstanding_queue; ucs_status_t status; CUdeviceptr dst, src; - CUdevice cu_device; CUstream stream; size_t offset; @@ -78,8 +83,6 @@ uct_cuda_ipc_post_cuda_async_copy(uct_ep_h tl_ep, uint64_t remote_addr, return UCS_OK; } - UCT_CUDA_IPC_GET_DEVICE(cu_device); - status = iface->map_memhandle((void *)ep->remote_memh_cache, key, &mapped_addr); if (status != UCS_OK) { return UCS_ERR_IO_ERROR; @@ -96,6 +99,8 @@ uct_cuda_ipc_post_cuda_async_copy(uct_ep_h tl_ep, uint64_t remote_addr, } } + key->dev_num %= iface->config.max_streams; /* round-robin */ + stream = iface->stream_d2d[key->dev_num]; outstanding_queue = &iface->outstanding_d2d_event_q; cuda_ipc_event = ucs_mpool_get(&iface->event_desc); @@ -110,13 +115,18 
@@ uct_cuda_ipc_post_cuda_async_copy(uct_ep_h tl_ep, uint64_t remote_addr, src = (CUdeviceptr) ((direction == UCT_CUDA_IPC_PUT) ? iov[0].buffer : mapped_rem_addr); - status = UCT_CUDADRV_FUNC(cuMemcpyDtoDAsync(dst, src, iov[0].length, stream)); + status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemcpyDtoDAsync(dst, src, iov[0].length, + stream)); if (UCS_OK != status) { ucs_mpool_put(cuda_ipc_event); return status; } - status = UCT_CUDADRV_FUNC(cuEventRecord(cuda_ipc_event->event, stream)); + iface->stream_refcount[key->dev_num]++; + cuda_ipc_event->stream_id = key->dev_num; + + status = UCT_CUDADRV_FUNC_LOG_ERR(cuEventRecord(cuda_ipc_event->event, + stream)); if (UCS_OK != status) { ucs_mpool_put(cuda_ipc_event); return status; @@ -125,14 +135,18 @@ uct_cuda_ipc_post_cuda_async_copy(uct_ep_h tl_ep, uint64_t remote_addr, ucs_queue_push(outstanding_queue, &cuda_ipc_event->queue); cuda_ipc_event->comp = comp; cuda_ipc_event->mapped_addr = mapped_addr; + cuda_ipc_event->cache = ep->remote_memh_cache; + cuda_ipc_event->d_bptr = (uintptr_t)key->d_bptr; ucs_trace("cuMemcpyDtoDAsync issued :%p dst:%p, src:%p len:%ld", cuda_ipc_event, (void *) dst, (void *) src, iov[0].length); return UCS_INPROGRESS; } -ucs_status_t uct_cuda_ipc_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp) +UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_ep_get_zcopy, + (tl_ep, iov, iovcnt, remote_addr, rkey, comp), + uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, + uint64_t remote_addr, uct_rkey_t rkey, + uct_completion_t *comp) { ucs_status_t status; @@ -149,9 +163,11 @@ ucs_status_t uct_cuda_ipc_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, siz return status; } -ucs_status_t uct_cuda_ipc_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp) +UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_ep_put_zcopy, + (tl_ep, iov, iovcnt, remote_addr, rkey, 
comp), + uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, + uint64_t remote_addr, uct_rkey_t rkey, + uct_completion_t *comp) { ucs_status_t status; diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c index 4821f646cbe..b56a8dab5cc 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c @@ -4,13 +4,19 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "cuda_ipc_iface.h" #include "cuda_ipc_md.h" #include "cuda_ipc_ep.h" +#include #include #include - +#include +#include static ucs_config_field_t uct_cuda_ipc_iface_config_table[] = { @@ -22,11 +28,19 @@ static ucs_config_field_t uct_cuda_ipc_iface_config_table[] = { "Max number of event completions to pick during cuda events polling", ucs_offsetof(uct_cuda_ipc_iface_config_t, max_poll), UCS_CONFIG_TYPE_UINT}, + {"MAX_STREAMS", "16", + "Max number of CUDA streams to make concurrent progress on", + ucs_offsetof(uct_cuda_ipc_iface_config_t, max_streams), UCS_CONFIG_TYPE_UINT}, + {"CACHE", "y", "Enable remote endpoint IPC memhandle mapping cache", ucs_offsetof(uct_cuda_ipc_iface_config_t, enable_cache), UCS_CONFIG_TYPE_BOOL}, + {"MAX_EVENTS", "inf", + "Max number of cuda events. 
-1 is infinite", + ucs_offsetof(uct_cuda_ipc_iface_config_t, max_cuda_ipc_events), UCS_CONFIG_TYPE_UINT}, + {NULL} }; @@ -67,37 +81,78 @@ static int uct_cuda_ipc_iface_is_reachable(const uct_iface_h tl_iface, *((const uint64_t *)dev_addr)) && ((getpid() != *(pid_t *)iface_addr))); } -static ucs_status_t uct_cuda_ipc_iface_query(uct_iface_h iface, +static double uct_cuda_ipc_iface_get_bw() +{ + CUdevice cu_device; + int major_version; + ucs_status_t status; + + status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&cu_device, 0)); + if (status != UCS_OK) { + return 0; + } + + status = UCT_CUDADRV_FUNC_LOG_ERR( + cuDeviceGetAttribute(&major_version, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + cu_device)); + if (status != UCS_OK) { + return 0; + } + + /* + * TODO: Detect nvswitch + */ + switch (major_version) { + case UCT_CUDA_BASE_GEN_P100: + return 80000.0 * UCS_MBYTE; + case UCT_CUDA_BASE_GEN_V100: + return 250000.0 * UCS_MBYTE; + case UCT_CUDA_BASE_GEN_A100: + return 300000.0 * UCS_MBYTE; + default: + return 6911.0 * UCS_MBYTE; + } +} + +static ucs_status_t uct_cuda_ipc_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { - memset(iface_attr, 0, sizeof(uct_iface_attr_t)); + uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_ipc_iface_t); + + uct_base_iface_query(&iface->super, iface_attr); + iface_attr->iface_addr_len = sizeof(pid_t); iface_attr->device_addr_len = sizeof(uint64_t); iface_attr->ep_addr_len = 0; iface_attr->max_conn_priv = 0; - iface_attr->cap.flags = UCT_IFACE_FLAG_CONNECT_TO_IFACE | - UCT_IFACE_FLAG_PENDING | - UCT_IFACE_FLAG_GET_ZCOPY | + iface_attr->cap.flags = UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE | + UCT_IFACE_FLAG_CONNECT_TO_IFACE | + UCT_IFACE_FLAG_PENDING | + UCT_IFACE_FLAG_GET_ZCOPY | UCT_IFACE_FLAG_PUT_ZCOPY; + iface_attr->cap.event_flags = UCT_IFACE_FLAG_EVENT_SEND_COMP | + UCT_IFACE_FLAG_EVENT_RECV | + UCT_IFACE_FLAG_EVENT_FD; iface_attr->cap.put.max_short = 0; iface_attr->cap.put.max_bcopy = 0; 
iface_attr->cap.put.min_zcopy = 0; - iface_attr->cap.put.max_zcopy = UCT_CUDA_IPC_MAX_ALLOC_SZ; + iface_attr->cap.put.max_zcopy = ULONG_MAX; iface_attr->cap.put.opt_zcopy_align = 1; iface_attr->cap.put.align_mtu = iface_attr->cap.put.opt_zcopy_align; iface_attr->cap.put.max_iov = 1; iface_attr->cap.get.max_bcopy = 0; iface_attr->cap.get.min_zcopy = 0; - iface_attr->cap.get.max_zcopy = UCT_CUDA_IPC_MAX_ALLOC_SZ; + iface_attr->cap.get.max_zcopy = ULONG_MAX; iface_attr->cap.get.opt_zcopy_align = 1; iface_attr->cap.get.align_mtu = iface_attr->cap.get.opt_zcopy_align; iface_attr->cap.get.max_iov = 1; - iface_attr->latency.overhead = 1e-9; - iface_attr->latency.growth = 0; - iface_attr->bandwidth = 6911 * 1024.0 * 1024.0; + iface_attr->latency = ucs_linear_func_make(1e-9, 0); + iface_attr->bandwidth.dedicated = 0; + iface_attr->bandwidth.shared = uct_cuda_ipc_iface_get_bw(); iface_attr->overhead = 0; iface_attr->priority = 0; @@ -123,17 +178,68 @@ uct_cuda_ipc_iface_flush(uct_iface_h tl_iface, unsigned flags, return UCS_INPROGRESS; } +static ucs_status_t uct_cuda_ipc_iface_event_fd_get(uct_iface_h tl_iface, int *fd_p) +{ + uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_ipc_iface_t); + + if (-1 == iface->eventfd) { + iface->eventfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); + if (iface->eventfd == -1) { + ucs_error("Failed to create event fd: %m"); + return UCS_ERR_IO_ERROR; + } + } + + *fd_p = iface->eventfd; + return UCS_OK; +} + +static void uct_cuda_ipc_common_cb(void *cuda_ipc_iface) +{ + uct_cuda_ipc_iface_t *iface = cuda_ipc_iface; + uint64_t dummy = 1; + int ret; + + /* No error handling yet */ + do { + ret = write(iface->eventfd, &dummy, sizeof(dummy)); + if (ret == sizeof(dummy)) { + return; + } else if (ret == -1) { + if (errno == EAGAIN) { + continue; + } else if (errno != EINTR) { + ucs_error("Signaling wakeup failed: %m"); + return; + } + } else { + ucs_assert(ret == 0); + } + } while (ret == 0); +} + +#if (__CUDACC_VER_MAJOR__ >= 100000) 
+static void CUDA_CB myHostFn(void *iface) +#else +static void CUDA_CB myHostCallback(CUstream hStream, CUresult status, + void *iface) +#endif +{ + uct_cuda_ipc_common_cb(iface); +} + static UCS_F_ALWAYS_INLINE unsigned uct_cuda_ipc_progress_event_q(uct_cuda_ipc_iface_t *iface, - ucs_queue_head_t *event_q, unsigned max_events) + ucs_queue_head_t *event_q) { unsigned count = 0; uct_cuda_ipc_event_desc_t *cuda_ipc_event; ucs_queue_iter_t iter; ucs_status_t status; + unsigned max_events = iface->config.max_poll; ucs_queue_for_each_safe(cuda_ipc_event, iter, event_q, queue) { - status = UCT_CUDADRV_FUNC(cuEventQuery(cuda_ipc_event->event)); + status = UCT_CUDADRV_FUNC_LOG_ERR(cuEventQuery(cuda_ipc_event->event)); if (UCS_INPROGRESS == status) { continue; } else if (UCS_OK != status) { @@ -145,12 +251,16 @@ uct_cuda_ipc_progress_event_q(uct_cuda_ipc_iface_t *iface, uct_invoke_completion(cuda_ipc_event->comp, UCS_OK); } - status = iface->unmap_memhandle(cuda_ipc_event->mapped_addr); + status = iface->unmap_memhandle(cuda_ipc_event->cache, + cuda_ipc_event->d_bptr, + cuda_ipc_event->mapped_addr, + iface->config.enable_cache); if (status != UCS_OK) { ucs_fatal("failed to unmap addr:%p", cuda_ipc_event->mapped_addr); } ucs_trace_poll("CUDA_IPC Event Done :%p", cuda_ipc_event); + iface->stream_refcount[cuda_ipc_event->stream_id]--; ucs_mpool_put(cuda_ipc_event); count++; @@ -165,10 +275,64 @@ uct_cuda_ipc_progress_event_q(uct_cuda_ipc_iface_t *iface, static unsigned uct_cuda_ipc_iface_progress(uct_iface_h tl_iface) { uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_ipc_iface_t); - unsigned max_events = iface->config.max_poll; - return uct_cuda_ipc_progress_event_q(iface, &iface->outstanding_d2d_event_q, - max_events); + return uct_cuda_ipc_progress_event_q(iface, &iface->outstanding_d2d_event_q); +} + +static ucs_status_t uct_cuda_ipc_iface_event_fd_arm(uct_iface_h tl_iface, + unsigned events) +{ + uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface, 
uct_cuda_ipc_iface_t); + int ret; + int i; + uint64_t dummy; + ucs_status_t status; + + if (uct_cuda_ipc_progress_event_q(iface, &iface->outstanding_d2d_event_q)) { + return UCS_ERR_BUSY; + } + + ucs_assert(iface->eventfd != -1); + + do { + ret = read(iface->eventfd, &dummy, sizeof(dummy)); + if (ret == sizeof(dummy)) { + status = UCS_ERR_BUSY; + return status; + } else if (ret == -1) { + if (errno == EAGAIN) { + break; + } else if (errno != EINTR) { + ucs_error("read from internal event fd failed: %m"); + status = UCS_ERR_IO_ERROR; + return status; + } else { + return UCS_ERR_BUSY; + } + } else { + ucs_assert(ret == 0); + } + } while (ret != 0); + + if (iface->streams_initialized) { + for (i = 0; i < iface->config.max_streams; i++) { + if (iface->stream_refcount[i]) { + status = +#if (__CUDACC_VER_MAJOR__ >= 100000) + UCT_CUDADRV_FUNC_LOG_ERR(cuLaunchHostFunc(iface->stream_d2d[i], + myHostFn, iface)); +#else + UCT_CUDADRV_FUNC_LOG_ERR(cuStreamAddCallback(iface->stream_d2d[i], + myHostCallback, + iface, 0)); +#endif + if (UCS_OK != status) { + return status; + } + } + } + } + return UCS_OK; } static uct_iface_ops_t uct_cuda_ipc_iface_ops = { @@ -185,6 +349,8 @@ static uct_iface_ops_t uct_cuda_ipc_iface_ops = { .iface_progress_enable = uct_base_iface_progress_enable, .iface_progress_disable = uct_base_iface_progress_disable, .iface_progress = uct_cuda_ipc_iface_progress, + .iface_event_fd_get = uct_cuda_ipc_iface_event_fd_get, + .iface_event_arm = uct_cuda_ipc_iface_event_fd_arm, .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_ipc_iface_t), .iface_query = uct_cuda_ipc_iface_query, .iface_get_device_address = uct_cuda_ipc_iface_get_device_address, @@ -197,14 +363,19 @@ static void uct_cuda_ipc_event_desc_init(ucs_mpool_t *mp, void *obj, void *chunk uct_cuda_ipc_event_desc_t *base = (uct_cuda_ipc_event_desc_t *) obj; memset(base, 0, sizeof(*base)); - UCT_CUDADRV_FUNC(cuEventCreate(&base->event, CU_EVENT_DISABLE_TIMING)); + 
UCT_CUDADRV_FUNC_LOG_ERR(cuEventCreate(&base->event, CU_EVENT_DISABLE_TIMING)); } static void uct_cuda_ipc_event_desc_cleanup(ucs_mpool_t *mp, void *obj) { uct_cuda_ipc_event_desc_t *base = (uct_cuda_ipc_event_desc_t *) obj; + int active; - UCT_CUDADRV_FUNC(cuEventDestroy(base->event)); + UCT_CUDADRV_CTX_ACTIVE(active); + + if (active) { + UCT_CUDADRV_FUNC_LOG_ERR(cuEventDestroy(base->event)); + } } ucs_status_t uct_cuda_ipc_iface_init_streams(uct_cuda_ipc_iface_t *iface) @@ -212,12 +383,14 @@ ucs_status_t uct_cuda_ipc_iface_init_streams(uct_cuda_ipc_iface_t *iface) ucs_status_t status; int i; - for (i = 0; i < iface->device_count; i++) { - status = UCT_CUDADRV_FUNC(cuStreamCreate(&iface->stream_d2d[i], - CU_STREAM_NON_BLOCKING)); + for (i = 0; i < iface->config.max_streams; i++) { + status = UCT_CUDADRV_FUNC_LOG_ERR(cuStreamCreate(&iface->stream_d2d[i], + CU_STREAM_NON_BLOCKING)); if (UCS_OK != status) { return status; } + + iface->stream_refcount[i] = 0; } iface->streams_initialized = 1; @@ -232,54 +405,31 @@ static ucs_mpool_ops_t uct_cuda_ipc_event_desc_mpool_ops = { .obj_cleanup = uct_cuda_ipc_event_desc_cleanup, }; -ucs_status_t uct_cuda_ipc_map_memhandle(void *arg, uct_cuda_ipc_key_t *key, - void **mapped_addr) -{ - return UCT_CUDADRV_FUNC(cuIpcOpenMemHandle((CUdeviceptr *)mapped_addr, - key->ph, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS)); -} - -ucs_status_t uct_cuda_ipc_unmap_memhandle(void *mapped_addr) -{ - return UCT_CUDADRV_FUNC(cuIpcCloseMemHandle((CUdeviceptr)mapped_addr)); -} - static UCS_CLASS_INIT_FUNC(uct_cuda_ipc_iface_t, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *tl_config) { uct_cuda_ipc_iface_config_t *config = NULL; ucs_status_t status; - int dev_count; config = ucs_derived_of(tl_config, uct_cuda_ipc_iface_config_t); UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_cuda_ipc_iface_ops, md, worker, params, tl_config UCS_STATS_ARG(params->stats_root) - UCS_STATS_ARG(UCT_CUDA_IPC_TL_NAME)); + 
UCS_STATS_ARG("cuda_ipc")); if (strncmp(params->mode.device.dev_name, - UCT_CUDA_IPC_DEV_NAME, strlen(UCT_CUDA_IPC_DEV_NAME)) != 0) { + UCT_CUDA_DEV_NAME, strlen(UCT_CUDA_DEV_NAME)) != 0) { ucs_error("No device was found: %s", params->mode.device.dev_name); return UCS_ERR_NO_DEVICE; } - status = UCT_CUDADRV_FUNC(cuDeviceGetCount(&dev_count)); - if (UCS_OK != status) { - return status; - } - ucs_assert(dev_count <= UCT_CUDA_IPC_MAX_PEERS); - - self->device_count = dev_count; - self->config.max_poll = config->max_poll; - self->config.enable_cache = config->enable_cache; - - if (self->config.enable_cache) { - self->map_memhandle = uct_cuda_ipc_cache_map_memhandle; - self->unmap_memhandle = ucs_empty_function_return_success; - } else { - self->map_memhandle = uct_cuda_ipc_map_memhandle; - self->unmap_memhandle = uct_cuda_ipc_unmap_memhandle; - } + self->config.max_poll = config->max_poll; + self->config.max_streams = config->max_streams; + self->config.enable_cache = config->enable_cache; + self->config.max_cuda_ipc_events = config->max_cuda_ipc_events; + + self->map_memhandle = uct_cuda_ipc_map_memhandle; + self->unmap_memhandle = uct_cuda_ipc_unmap_memhandle; status = ucs_mpool_init(&self->event_desc, 0, @@ -287,7 +437,7 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_ipc_iface_t, uct_md_h md, uct_worker_h worke 0, UCS_SYS_CACHE_LINE_SIZE, 128, - 1024, + self->config.max_cuda_ipc_events, &uct_cuda_ipc_event_desc_mpool_ops, "CUDA_IPC EVENT objects"); if (UCS_OK != status) { @@ -295,8 +445,10 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_ipc_iface_t, uct_md_h md, uct_worker_h worke return UCS_ERR_IO_ERROR; } + self->eventfd = -1; self->streams_initialized = 0; ucs_queue_head_init(&self->outstanding_d2d_event_q); + return UCS_OK; } @@ -304,13 +456,18 @@ static UCS_CLASS_CLEANUP_FUNC(uct_cuda_ipc_iface_t) { ucs_status_t status; int i; + int active; - if (self->streams_initialized) { - for (i = 0; i < self->device_count; i++) { - status = 
UCT_CUDADRV_FUNC(cuStreamDestroy(self->stream_d2d[i])); + UCT_CUDADRV_CTX_ACTIVE(active); + + if (self->streams_initialized && active) { + for (i = 0; i < self->config.max_streams; i++) { + status = UCT_CUDADRV_FUNC_LOG_ERR(cuStreamDestroy(self->stream_d2d[i])); if (UCS_OK != status) { continue; } + + ucs_assert(self->stream_refcount[i] == 0); } self->streams_initialized = 0; } @@ -318,6 +475,9 @@ static UCS_CLASS_CLEANUP_FUNC(uct_cuda_ipc_iface_t) uct_base_iface_progress_disable(&self->super.super, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); ucs_mpool_cleanup(&self->event_desc, 1); + if (self->eventfd != -1) { + close(self->eventfd); + } } UCS_CLASS_DEFINE(uct_cuda_ipc_iface_t, uct_base_iface_t); @@ -325,34 +485,6 @@ UCS_CLASS_DEFINE_NEW_FUNC(uct_cuda_ipc_iface_t, uct_iface_t, uct_md_h, uct_worke const uct_iface_params_t*, const uct_iface_config_t*); static UCS_CLASS_DEFINE_DELETE_FUNC(uct_cuda_ipc_iface_t, uct_iface_t); -static ucs_status_t uct_cuda_ipc_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) -{ - uct_tl_resource_desc_t *resource; - - resource = ucs_calloc(1, sizeof(uct_tl_resource_desc_t), "resource desc"); - if (NULL == resource) { - ucs_error("Failed to allocate memory"); - return UCS_ERR_NO_MEMORY; - } - - ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", - UCT_CUDA_IPC_TL_NAME); - ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", - UCT_CUDA_IPC_DEV_NAME); - resource->dev_type = UCT_DEVICE_TYPE_ACC; - - *num_resources_p = 1; - *resource_p = resource; - return UCS_OK; -} - -UCT_TL_COMPONENT_DEFINE(uct_cuda_ipc_tl, - uct_cuda_ipc_query_tl_resources, - uct_cuda_ipc_iface_t, - UCT_CUDA_IPC_TL_NAME, - "CUDA_IPC_", - uct_cuda_ipc_iface_config_table, - uct_cuda_ipc_iface_config_t); -UCT_MD_REGISTER_TL(&uct_cuda_ipc_md_component, &uct_cuda_ipc_tl); +UCT_TL_DEFINE(&uct_cuda_ipc_component.super, cuda_ipc, uct_cuda_base_query_devices, + uct_cuda_ipc_iface_t, "CUDA_IPC_", 
uct_cuda_ipc_iface_config_table, + uct_cuda_ipc_iface_config_t); diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h index a1e3060caf7..42e135a92e1 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h @@ -13,10 +13,9 @@ #include #include "cuda_ipc_md.h" +#include "cuda_ipc_ep.h" -#define UCT_CUDA_IPC_TL_NAME "cuda_ipc" -#define UCT_CUDA_IPC_DEV_NAME "cudaipc0" #define UCT_CUDA_IPC_MAX_PEERS 16 @@ -24,32 +23,43 @@ typedef struct uct_cuda_ipc_iface { uct_base_iface_t super; ucs_mpool_t event_desc; /* cuda event desc */ ucs_queue_head_t outstanding_d2d_event_q; /* stream for outstanding d2d */ - int device_count; + int eventfd; /* get event notifications */ int streams_initialized; /* indicates if stream created */ CUstream stream_d2d[UCT_CUDA_IPC_MAX_PEERS]; /* per-peer stream */ + unsigned long stream_refcount[UCT_CUDA_IPC_MAX_PEERS]; + /* per stream outstanding ops */ struct { unsigned max_poll; /* query attempts w.o success */ + unsigned max_streams; /* # concurrent streams for || progress*/ + unsigned max_cuda_ipc_events; /* max mpool entries */ int enable_cache; /* enable/disable ipc handle cache */ } config; ucs_status_t (*map_memhandle)(void *context, uct_cuda_ipc_key_t *key, void **map_addr); - ucs_status_t (*unmap_memhandle)(void *map_addr); + ucs_status_t (*unmap_memhandle)(void *rem_cache, uintptr_t d_bptr, + void *mapped_addr, int cache_enabled); } uct_cuda_ipc_iface_t; typedef struct uct_cuda_ipc_iface_config { uct_iface_config_t super; unsigned max_poll; + unsigned max_streams; int enable_cache; + unsigned max_cuda_ipc_events; } uct_cuda_ipc_iface_config_t; typedef struct uct_cuda_ipc_event_desc { CUevent event; void *mapped_addr; + unsigned stream_id; uct_completion_t *comp; ucs_queue_elem_t queue; + uct_cuda_ipc_ep_t *ep; + void *cache; + uintptr_t d_bptr; } uct_cuda_ipc_event_desc_t; diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_md.c 
b/src/uct/cuda/cuda_ipc/cuda_ipc_md.c index 23212640c45..4e06366f682 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_md.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_md.c @@ -1,9 +1,13 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2018. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2018-2019. ALL RIGHTS RESERVED. * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "cuda_ipc_md.h" #include @@ -12,10 +16,10 @@ #include #include #include +#include #include #include - static ucs_config_field_t uct_cuda_ipc_md_config_table[] = { {"", "", NULL, ucs_offsetof(uct_cuda_ipc_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)}, @@ -25,15 +29,15 @@ static ucs_config_field_t uct_cuda_ipc_md_config_table[] = { static ucs_status_t uct_cuda_ipc_md_query(uct_md_h md, uct_md_attr_t *md_attr) { - md_attr->cap.flags = UCT_MD_FLAG_REG | - UCT_MD_FLAG_NEED_RKEY; - md_attr->cap.reg_mem_types = UCS_BIT(UCT_MD_MEM_TYPE_CUDA); - md_attr->cap.mem_type = UCT_MD_MEM_TYPE_CUDA; - md_attr->cap.max_alloc = 0; - md_attr->cap.max_reg = UCT_CUDA_IPC_MAX_ALLOC_SZ; - md_attr->rkey_packed_size = sizeof(uct_cuda_ipc_key_t); - md_attr->reg_cost.overhead = 0; - md_attr->reg_cost.growth = 0; + md_attr->cap.flags = UCT_MD_FLAG_REG | + UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); + md_attr->cap.access_mem_type = UCS_MEMORY_TYPE_CUDA; + md_attr->cap.detect_mem_types = 0; + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = ULONG_MAX; + md_attr->rkey_packed_size = sizeof(uct_cuda_ipc_key_t); + md_attr->reg_cost = ucs_linear_func_make(0, 0); memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); return UCS_OK; } @@ -44,27 +48,127 @@ static ucs_status_t uct_cuda_ipc_mkey_pack(uct_md_h md, uct_mem_h memh, uct_cuda_ipc_key_t *packed = (uct_cuda_ipc_key_t *) rkey_buffer; uct_cuda_ipc_key_t *mem_hndl = (uct_cuda_ipc_key_t *) memh; - 
*packed = *mem_hndl; + *packed = *mem_hndl; + + return UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGetUuid(&packed->uuid, + mem_hndl->dev_num)); +} + +static inline int uct_cuda_ipc_uuid_equals(const CUuuid* a, const CUuuid* b) +{ + int64_t *a0 = (int64_t *) a->bytes; + int64_t *b0 = (int64_t *) b->bytes; + return (a0[0] == b0[0]) && (a0[1] == b0[1]) ? 1 : 0; +} + +static inline void uct_cuda_ipc_uuid_copy(CUuuid* dst, const CUuuid* src) +{ + int64_t *a = (int64_t *) src->bytes; + int64_t *b = (int64_t *) dst->bytes; + *b++ = *a++; + *b = *a; +} + +ucs_status_t uct_cuda_ipc_get_unique_index_for_uuid(int* idx, + uct_cuda_ipc_md_t* md, + uct_cuda_ipc_key_t *rkey) +{ + int i; + + for (i = 0; i < md->uuid_map_size; i++) { + if (uct_cuda_ipc_uuid_equals(&rkey->uuid, &md->uuid_map[i])) { + *idx = i; + return UCS_OK; /* found */ + } + } + + if (ucs_unlikely(md->uuid_map_size == md->uuid_map_capacity)) { + /* reallocate on demand */ + int num_devices; + int original_cache_size, new_cache_size; + int new_capacity = md->uuid_map_capacity * 2; + + UCT_CUDA_IPC_DEVICE_GET_COUNT(num_devices); + original_cache_size = md->uuid_map_capacity * num_devices; + new_cache_size = new_capacity * num_devices; + md->uuid_map_capacity = new_capacity; + md->uuid_map = ucs_realloc(md->uuid_map, + new_capacity * sizeof(CUuuid), + "uct_cuda_ipc_uuid_map"); + if (md->uuid_map == NULL) { + return UCS_ERR_NO_MEMORY; + } + + md->peer_accessible_cache = ucs_realloc(md->peer_accessible_cache, + new_cache_size, + "uct_cuda_ipc_peer_accessible_cache"); + if (md->peer_accessible_cache == NULL) { + return UCS_ERR_NO_MEMORY; + } + + memset(md->peer_accessible_cache + original_cache_size, 0xFF, + new_cache_size - original_cache_size); + } + + /* Add new mapping */ + uct_cuda_ipc_uuid_copy(&md->uuid_map[md->uuid_map_size], &rkey->uuid); + *idx = md->uuid_map_size++; return UCS_OK; } -static ucs_status_t uct_cuda_ipc_rkey_unpack(uct_md_component_t *mdc, - const void *rkey_buffer, uct_rkey_t *rkey_p, - void **handle_p) 
+static ucs_status_t uct_cuda_ipc_is_peer_accessible(uct_cuda_ipc_component_t *mdc, + uct_cuda_ipc_key_t *rkey) { - uct_cuda_ipc_key_t *packed = (uct_cuda_ipc_key_t *) rkey_buffer; - uct_cuda_ipc_key_t *key; + CUdevice this_device; ucs_status_t status; - CUdevice cu_device; - int peer_accessble; + int peer_idx; + int num_devices; + char* accessible; + CUdeviceptr d_mapped; - UCT_CUDA_IPC_GET_DEVICE(cu_device); + status = uct_cuda_ipc_get_unique_index_for_uuid(&peer_idx, mdc->md, rkey); + if (ucs_unlikely(status != UCS_OK)) { + return status; + } + + /* overwrite dev_num with a unique ID; this means that relative remote + * device number of multiple peers do not map on the same stream and reduces + * stream sequentialization */ + rkey->dev_num = peer_idx; + + UCT_CUDA_IPC_GET_DEVICE(this_device); + UCT_CUDA_IPC_DEVICE_GET_COUNT(num_devices); + + accessible = &mdc->md->peer_accessible_cache[peer_idx * num_devices + this_device]; + if (*accessible == (char)0xFF) { /* unchecked, add to cache */ + CUresult result = cuIpcOpenMemHandle(&d_mapped, + rkey->ph, + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + *accessible = ((result != CUDA_SUCCESS) && (result != CUDA_ERROR_ALREADY_MAPPED)) + ? 0 : 1; + if (result == CUDA_SUCCESS) { + result = cuIpcCloseMemHandle(d_mapped); + if (result != CUDA_SUCCESS) ucs_fatal("Unable to close memhandle"); + } + } + + return (*accessible == 1) ? 
UCS_OK : UCS_ERR_UNREACHABLE; +} + +UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_rkey_unpack, + (component, rkey_buffer, rkey_p, handle_p), + uct_component_t *component, const void *rkey_buffer, + uct_rkey_t *rkey_p, void **handle_p) +{ + uct_cuda_ipc_component_t *com = ucs_derived_of(component, uct_cuda_ipc_component_t); + uct_cuda_ipc_key_t *packed = (uct_cuda_ipc_key_t *) rkey_buffer; + uct_cuda_ipc_key_t *key; + ucs_status_t status; - status = UCT_CUDADRV_FUNC(cuDeviceCanAccessPeer(&peer_accessble, - cu_device, packed->dev_num)); - if ((status != UCS_OK) || (peer_accessble == 0)) { - return UCS_ERR_UNREACHABLE; + status = uct_cuda_ipc_is_peer_accessible(com, packed); + if (status != UCS_OK) { + return status; } key = ucs_malloc(sizeof(uct_cuda_ipc_key_t), "uct_cuda_ipc_key_t"); @@ -80,8 +184,8 @@ static ucs_status_t uct_cuda_ipc_rkey_unpack(uct_md_component_t *mdc, return UCS_OK; } -static ucs_status_t uct_cuda_ipc_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, - void *handle) +static ucs_status_t uct_cuda_ipc_rkey_release(uct_component_t *component, + uct_rkey_t rkey, void *handle) { ucs_assert(NULL == handle); ucs_free((void *)rkey); @@ -92,6 +196,7 @@ static ucs_status_t uct_cuda_ipc_mem_reg_internal(uct_md_h uct_md, void *addr, size_t length, unsigned flags, uct_cuda_ipc_key_t *key) { + ucs_log_level_t log_level; CUdevice cu_device; ucs_status_t status; @@ -99,19 +204,23 @@ uct_cuda_ipc_mem_reg_internal(uct_md_h uct_md, void *addr, size_t length, return UCS_OK; } - status = UCT_CUDADRV_FUNC(cuIpcGetMemHandle(&(key->ph), (CUdeviceptr) addr)); + log_level = (flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ? 
UCS_LOG_LEVEL_DEBUG : + UCS_LOG_LEVEL_ERROR; + status = UCT_CUDADRV_FUNC(cuIpcGetMemHandle(&key->ph, (CUdeviceptr)addr), + log_level); if (UCS_OK != status) { return status; } UCT_CUDA_IPC_GET_DEVICE(cu_device); - UCT_CUDADRV_FUNC(cuMemGetAddressRange(&(key->d_bptr), - &(key->b_len), - (CUdeviceptr) addr)); + UCT_CUDADRV_FUNC(cuMemGetAddressRange(&key->d_bptr, &key->b_len, + (CUdeviceptr)addr), + log_level); + key->dev_num = (int) cu_device; ucs_trace("registered memory:%p..%p length:%lu dev_num:%d", - addr, addr + length, length, (int) cu_device); + addr, UCS_PTR_BYTE_OFFSET(addr, length), length, (int) cu_device); return UCS_OK; } @@ -119,6 +228,7 @@ static ucs_status_t uct_cuda_ipc_mem_reg(uct_md_h md, void *address, size_t leng unsigned flags, uct_mem_h *memh_p) { uct_cuda_ipc_key_t *key; + ucs_status_t status; key = ucs_malloc(sizeof(uct_cuda_ipc_key_t), "uct_cuda_ipc_key_t"); if (NULL == key) { @@ -126,9 +236,10 @@ static ucs_status_t uct_cuda_ipc_mem_reg(uct_md_h md, void *address, size_t leng return UCS_ERR_NO_MEMORY; } - if (UCS_OK != uct_cuda_ipc_mem_reg_internal(md, address, length, 0, key)) { + status = uct_cuda_ipc_mem_reg_internal(md, address, length, 0, key); + if (status != UCS_OK) { ucs_free(key); - return UCS_ERR_IO_ERROR; + return status; } *memh_p = key; @@ -141,44 +252,92 @@ static ucs_status_t uct_cuda_ipc_mem_dereg(uct_md_h md, uct_mem_h memh) return UCS_OK; } -static ucs_status_t uct_cuda_ipc_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) + +static void uct_cuda_ipc_md_close(uct_md_h uct_md) { - int num_gpus; - cudaError_t cudaErr; - - cudaErr = cudaGetDeviceCount(&num_gpus); - if ((cudaErr!= cudaSuccess) || (num_gpus == 0)) { - ucs_debug("Not found cuda devices"); - *resources_p = NULL; - *num_resources_p = 0; - return UCS_OK; - } + uct_cuda_ipc_md_t *md = ucs_derived_of(uct_md, uct_cuda_ipc_md_t); - return uct_single_md_resource(&uct_cuda_ipc_md_component, resources_p, num_resources_p); + 
ucs_free(md->uuid_map); + ucs_free(md->peer_accessible_cache); + ucs_free(md); } -static ucs_status_t uct_cuda_ipc_md_open(const char *md_name, const uct_md_config_t *md_config, - uct_md_h *md_p) +static ucs_status_t +uct_cuda_ipc_md_open(uct_component_t *component, const char *md_name, + const uct_md_config_t *config, uct_md_h *md_p) { static uct_md_ops_t md_ops = { - .close = (void*)ucs_empty_function, - .query = uct_cuda_ipc_md_query, - .mkey_pack = uct_cuda_ipc_mkey_pack, - .mem_reg = uct_cuda_ipc_mem_reg, - .mem_dereg = uct_cuda_ipc_mem_dereg, - .is_mem_type_owned = uct_cuda_is_mem_type_owned, - }; - static uct_md_t md = { - .ops = &md_ops, - .component = &uct_cuda_ipc_md_component + .close = uct_cuda_ipc_md_close, + .query = uct_cuda_ipc_md_query, + .mkey_pack = uct_cuda_ipc_mkey_pack, + .mem_reg = uct_cuda_ipc_mem_reg, + .mem_dereg = uct_cuda_ipc_mem_dereg, + .detect_memory_type = ucs_empty_function_return_unsupported, }; - *md_p = &md; + int num_devices; + uct_cuda_ipc_md_t* md; + uct_cuda_ipc_component_t* com; + + UCS_STATIC_ASSERT(sizeof(md->peer_accessible_cache[0]) == sizeof(char)); + UCT_CUDA_IPC_DEVICE_GET_COUNT(num_devices); + + md = ucs_calloc(1, sizeof(uct_cuda_ipc_md_t), "uct_cuda_ipc_md"); + if (md == NULL) { + return UCS_ERR_NO_MEMORY; + } + + md->super.ops = &md_ops; + md->super.component = &uct_cuda_ipc_component.super; + + /* allocate uuid map and peer accessible cache */ + md->uuid_map_size = 0; + md->uuid_map_capacity = 16; + md->uuid_map = ucs_malloc(md->uuid_map_capacity * sizeof(CUuuid), + "uct_cuda_ipc_uuid_map"); + if (md->uuid_map == NULL) { + free(md); + return UCS_ERR_NO_MEMORY; + } + + /* Initially support caching accessibility of up to 16 other peers */ + md->peer_accessible_cache = ucs_malloc(num_devices * md->uuid_map_capacity, + "uct_cuda_ipc_peer_accessible_cache"); + if (md->peer_accessible_cache == NULL) { + free(md->uuid_map); + free(md); + return UCS_ERR_NO_MEMORY; + } + + /* 0xFF = !cached, 1 = accessible, 0 = !accessible 
*/ + memset(md->peer_accessible_cache, 0xFF, num_devices * md->uuid_map_capacity); + + com = ucs_derived_of(md->super.component, uct_cuda_ipc_component_t); + com->md = md; + *md_p = &md->super; return UCS_OK; } -UCT_MD_COMPONENT_DEFINE(uct_cuda_ipc_md_component, UCT_CUDA_IPC_MD_NAME, - uct_cuda_ipc_query_md_resources, uct_cuda_ipc_md_open, NULL, - uct_cuda_ipc_rkey_unpack, uct_cuda_ipc_rkey_release, "CUDA_IPC_", - uct_cuda_ipc_md_config_table, uct_cuda_ipc_md_config_t); +uct_cuda_ipc_component_t uct_cuda_ipc_component = { + .super = { + .query_md_resources = uct_cuda_base_query_md_resources, + .md_open = uct_cuda_ipc_md_open, + .cm_open = ucs_empty_function_return_unsupported, + .rkey_unpack = uct_cuda_ipc_rkey_unpack, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = uct_cuda_ipc_rkey_release, + .name = "cuda_ipc", + .md_config = { + .name = "Cuda-IPC memory domain", + .prefix = "CUDA_IPC_", + .table = uct_cuda_ipc_md_config_table, + .size = sizeof(uct_cuda_ipc_md_config_t), + }, + .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_cuda_ipc_component.super), + .flags = 0 + }, + .md = NULL, +}; +UCT_COMPONENT_REGISTER(&uct_cuda_ipc_component.super); + diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_md.h b/src/uct/cuda/cuda_ipc/cuda_ipc_md.h index b36fa4e36d9..5e0ef493867 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_md.h +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_md.h @@ -12,20 +12,26 @@ #include -#define UCT_CUDA_IPC_MD_NAME "cuda_ipc" -#define UCT_CUDA_IPC_MAX_ALLOC_SZ (1 << 30) - - -extern uct_md_component_t uct_cuda_ipc_md_component; - - /** * @brief cuda ipc MD descriptor */ typedef struct uct_cuda_ipc_md { struct uct_md super; /**< Domain info */ + CUuuid* uuid_map; + char* peer_accessible_cache; + int uuid_map_size; + int uuid_map_capacity; } uct_cuda_ipc_md_t; +/** + * @brief cuda ipc component extension + */ +typedef struct uct_cuda_ipc_component { + uct_component_t super; + uct_cuda_ipc_md_t* md; 
+} uct_cuda_ipc_component_t; + +extern uct_cuda_ipc_component_t uct_cuda_ipc_component; /** * @brief cuda ipc domain configuration. @@ -43,12 +49,23 @@ typedef struct uct_cuda_ipc_key { CUdeviceptr d_bptr; /* Allocation base address */ size_t b_len; /* Allocation size */ int dev_num; /* GPU Device number */ + CUuuid uuid; /* GPU Device UUID */ } uct_cuda_ipc_key_t; -#define UCT_CUDA_IPC_GET_DEVICE(_cu_device) \ +#define UCT_CUDA_IPC_GET_DEVICE(_cu_device) \ + do { \ + if (UCS_OK != \ + UCT_CUDADRV_FUNC_LOG_ERR(cuCtxGetDevice(&_cu_device))) { \ + return UCS_ERR_IO_ERROR; \ + } \ + } while(0); + + +#define UCT_CUDA_IPC_DEVICE_GET_COUNT(_num_device) \ do { \ - if (UCS_OK != UCT_CUDADRV_FUNC(cuCtxGetDevice(&_cu_device))) { \ + if (UCS_OK != \ + UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGetCount(&_num_device))) { \ return UCS_ERR_IO_ERROR; \ } \ } while(0); diff --git a/src/uct/cuda/gdr_copy/configure.m4 b/src/uct/cuda/gdr_copy/configure.m4 index 67e7d84b3c5..be2a17bbb62 100644 --- a/src/uct/cuda/gdr_copy/configure.m4 +++ b/src/uct/cuda/gdr_copy/configure.m4 @@ -6,5 +6,5 @@ UCX_CHECK_GDRCOPY -AS_IF([test "x$gdrcopy_happy" = "xyes"], [uct_cuda_modules+=":gdrcopy"]) +AS_IF([test "x$gdrcopy_happy" = "xyes"], [uct_cuda_modules="${uct_cuda_modules}:gdrcopy"]) AC_CONFIG_FILES([src/uct/cuda/gdr_copy/Makefile]) diff --git a/src/uct/cuda/gdr_copy/gdr_copy_ep.c b/src/uct/cuda/gdr_copy/gdr_copy_ep.c index 01079541568..1e0f965cf48 100644 --- a/src/uct/cuda/gdr_copy/gdr_copy_ep.c +++ b/src/uct/cuda/gdr_copy/gdr_copy_ep.c @@ -1,8 +1,13 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2017-2019. ALL RIGHTS RESERVED. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "gdr_copy_ep.h" #include "gdr_copy_md.h" #include "gdr_copy_iface.h" @@ -10,6 +15,7 @@ #include #include #include +#include #include @@ -32,9 +38,10 @@ UCS_CLASS_DEFINE(uct_gdr_copy_ep_t, uct_base_ep_t) UCS_CLASS_DEFINE_NEW_FUNC(uct_gdr_copy_ep_t, uct_ep_t, const uct_ep_params_t *); UCS_CLASS_DEFINE_DELETE_FUNC(uct_gdr_copy_ep_t, uct_ep_t); -ucs_status_t uct_gdr_copy_ep_put_short(uct_ep_h tl_ep, const void *buffer, - unsigned length, uint64_t remote_addr, - uct_rkey_t rkey) +UCS_PROFILE_FUNC(ucs_status_t, uct_gdr_copy_ep_put_short, + (tl_ep, buffer, length, remote_addr, rkey), + uct_ep_h tl_ep, const void *buffer, unsigned length, + uint64_t remote_addr, uct_rkey_t rkey) { uct_gdr_copy_key_t *gdr_copy_key = (uct_gdr_copy_key_t *) rkey; size_t bar_offset; @@ -42,11 +49,22 @@ ucs_status_t uct_gdr_copy_ep_put_short(uct_ep_h tl_ep, const void *buffer, if (ucs_likely(length)) { bar_offset = remote_addr - gdr_copy_key->vaddr; +#if HAVE_DECL_GDR_COPY_TO_MAPPING + ret = gdr_copy_to_mapping(gdr_copy_key->mh, + UCS_PTR_BYTE_OFFSET(gdr_copy_key->bar_ptr, + bar_offset), + buffer, length); + if (ret) { + ucs_error("gdr_copy_to_mapping failed. ret:%d", ret); + return UCS_ERR_IO_ERROR; + } +#else ret = gdr_copy_to_bar(gdr_copy_key->bar_ptr + bar_offset, buffer, length); if (ret) { ucs_error("gdr_copy_to_bar failed. 
ret:%d", ret); return UCS_ERR_IO_ERROR; } +#endif } UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), PUT, SHORT, length); @@ -55,9 +73,10 @@ ucs_status_t uct_gdr_copy_ep_put_short(uct_ep_h tl_ep, const void *buffer, return UCS_OK; } -ucs_status_t uct_gdr_copy_ep_get_short(uct_ep_h tl_ep, void *buffer, - unsigned length, uint64_t remote_addr, - uct_rkey_t rkey) +UCS_PROFILE_FUNC(ucs_status_t, uct_gdr_copy_ep_get_short, + (tl_ep, buffer, length, remote_addr, rkey), + uct_ep_h tl_ep, void *buffer, unsigned length, + uint64_t remote_addr, uct_rkey_t rkey) { uct_gdr_copy_key_t *gdr_copy_key = (uct_gdr_copy_key_t *) rkey; size_t bar_offset; @@ -65,11 +84,22 @@ ucs_status_t uct_gdr_copy_ep_get_short(uct_ep_h tl_ep, void *buffer, if (ucs_likely(length)) { bar_offset = remote_addr - gdr_copy_key->vaddr; +#if HAVE_DECL_GDR_COPY_TO_MAPPING + ret = gdr_copy_from_mapping(gdr_copy_key->mh, buffer, + UCS_PTR_BYTE_OFFSET(gdr_copy_key->bar_ptr, + bar_offset), + length); + if (ret) { + ucs_error("gdr_copy_from_mapping failed. ret:%d", ret); + return UCS_ERR_IO_ERROR; + } +#else ret = gdr_copy_from_bar(buffer, gdr_copy_key->bar_ptr + bar_offset, length); if (ret) { ucs_error("gdr_copy_from_bar failed. ret:%d", ret); return UCS_ERR_IO_ERROR; } +#endif } UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, SHORT, length); diff --git a/src/uct/cuda/gdr_copy/gdr_copy_iface.c b/src/uct/cuda/gdr_copy/gdr_copy_iface.c index 3edf7487236..c46aa28b295 100644 --- a/src/uct/cuda/gdr_copy/gdr_copy_iface.c +++ b/src/uct/cuda/gdr_copy/gdr_copy_iface.c @@ -3,10 +3,15 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "gdr_copy_iface.h" #include "gdr_copy_md.h" #include "gdr_copy_ep.h" +#include #include #include @@ -42,10 +47,12 @@ static int uct_gdr_copy_iface_is_reachable(const uct_iface_h tl_iface, return (addr != NULL) && (iface->id == *addr); } -static ucs_status_t uct_gdr_copy_iface_query(uct_iface_h iface, +static ucs_status_t uct_gdr_copy_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { - memset(iface_attr, 0, sizeof(uct_iface_attr_t)); + uct_gdr_copy_iface_t *iface = ucs_derived_of(tl_iface, uct_gdr_copy_iface_t); + + uct_base_iface_query(&iface->super, iface_attr); iface_attr->iface_addr_len = sizeof(uct_gdr_copy_iface_addr_t); iface_attr->device_addr_len = 0; @@ -79,9 +86,9 @@ static ucs_status_t uct_gdr_copy_iface_query(uct_iface_h iface, iface_attr->cap.am.max_hdr = 0; iface_attr->cap.am.max_iov = 1; - iface_attr->latency.overhead = 1e-6; /* 1 us */ - iface_attr->latency.growth = 0; - iface_attr->bandwidth = 6911 * 1024.0 * 1024.0; + iface_attr->latency = ucs_linear_func_make(1e-6, 0); + iface_attr->bandwidth.dedicated = 0; + iface_attr->bandwidth.shared = 6911.0 * UCS_MBYTE; iface_attr->overhead = 0; iface_attr->priority = 0; @@ -104,7 +111,7 @@ static uct_iface_ops_t uct_gdr_copy_iface_ops = { .iface_progress = ucs_empty_function_return_zero, .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_gdr_copy_iface_t), .iface_query = uct_gdr_copy_iface_query, - .iface_get_device_address = (void*)ucs_empty_function_return_success, + .iface_get_device_address = (uct_iface_get_device_address_func_t)ucs_empty_function_return_success, .iface_get_address = uct_gdr_copy_iface_get_address, .iface_is_reachable = uct_gdr_copy_iface_is_reachable, }; @@ -115,7 +122,7 @@ static UCS_CLASS_INIT_FUNC(uct_gdr_copy_iface_t, uct_md_h md, uct_worker_h worke { UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_gdr_copy_iface_ops, md, worker, params, tl_config UCS_STATS_ARG(params->stats_root) - 
UCS_STATS_ARG(UCT_GDR_COPY_TL_NAME)); + UCS_STATS_ARG("gdr_copy")); if (strncmp(params->mode.device.dev_name, UCT_CUDA_DEV_NAME, strlen(UCT_CUDA_DEV_NAME)) != 0) { @@ -138,35 +145,6 @@ UCS_CLASS_DEFINE_NEW_FUNC(uct_gdr_copy_iface_t, uct_iface_t, uct_md_h, uct_worke const uct_iface_params_t*, const uct_iface_config_t*); static UCS_CLASS_DEFINE_DELETE_FUNC(uct_gdr_copy_iface_t, uct_iface_t); - -static ucs_status_t uct_gdr_copy_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) -{ - uct_tl_resource_desc_t *resource; - - resource = ucs_calloc(1, sizeof(uct_tl_resource_desc_t), "resource desc"); - if (NULL == resource) { - ucs_error("Failed to allocate memory"); - return UCS_ERR_NO_MEMORY; - } - - ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", - UCT_GDR_COPY_TL_NAME); - ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", - UCT_CUDA_DEV_NAME); - resource->dev_type = UCT_DEVICE_TYPE_ACC; - - *num_resources_p = 1; - *resource_p = resource; - return UCS_OK; -} - -UCT_TL_COMPONENT_DEFINE(uct_gdr_copy_tl, - uct_gdr_copy_query_tl_resources, - uct_gdr_copy_iface_t, - UCT_GDR_COPY_TL_NAME, - "GDR_COPY_", - uct_gdr_copy_iface_config_table, - uct_gdr_copy_iface_config_t); -UCT_MD_REGISTER_TL(&uct_gdr_copy_md_component, &uct_gdr_copy_tl); +UCT_TL_DEFINE(&uct_gdr_copy_component, gdr_copy, uct_cuda_base_query_devices, + uct_gdr_copy_iface_t, "GDR_COPY_", + uct_gdr_copy_iface_config_table, uct_gdr_copy_iface_config_t); diff --git a/src/uct/cuda/gdr_copy/gdr_copy_iface.h b/src/uct/cuda/gdr_copy/gdr_copy_iface.h index 882ec9c994b..1d4875e8cbe 100644 --- a/src/uct/cuda/gdr_copy/gdr_copy_iface.h +++ b/src/uct/cuda/gdr_copy/gdr_copy_iface.h @@ -9,10 +9,6 @@ #include -#define UCT_GDR_COPY_TL_NAME "gdr_copy" -#define UCT_CUDA_DEV_NAME "gdrcopy0" - - typedef uint64_t uct_gdr_copy_iface_addr_t; diff --git a/src/uct/cuda/gdr_copy/gdr_copy_md.c b/src/uct/cuda/gdr_copy/gdr_copy_md.c index 
10aea7d28b9..398d030c027 100644 --- a/src/uct/cuda/gdr_copy/gdr_copy_md.c +++ b/src/uct/cuda/gdr_copy/gdr_copy_md.c @@ -1,8 +1,13 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2017-2018. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2017-2019. ALL RIGHTS RESERVED. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "gdr_copy_md.h" #include @@ -12,6 +17,7 @@ #include #include #include +#include #include #include @@ -29,25 +35,25 @@ static ucs_config_field_t uct_gdr_copy_md_config_table[] = { UCS_CONFIG_TYPE_TABLE(uct_md_config_rcache_table)}, {"MEM_REG_OVERHEAD", "16us", "Memory registration overhead", /* TODO take default from device */ - ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.overhead), UCS_CONFIG_TYPE_TIME}, + ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.m), UCS_CONFIG_TYPE_TIME}, {"MEM_REG_GROWTH", "0.06ns", "Memory registration growth rate", /* TODO take default from device */ - ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.growth), UCS_CONFIG_TYPE_TIME}, + ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.c), UCS_CONFIG_TYPE_TIME}, {NULL} }; static ucs_status_t uct_gdr_copy_md_query(uct_md_h md, uct_md_attr_t *md_attr) { - md_attr->cap.flags = UCT_MD_FLAG_REG | - UCT_MD_FLAG_NEED_RKEY; - md_attr->cap.reg_mem_types = UCS_BIT(UCT_MD_MEM_TYPE_CUDA); - md_attr->cap.mem_type = UCT_MD_MEM_TYPE_CUDA; - md_attr->cap.max_alloc = 0; - md_attr->cap.max_reg = ULONG_MAX; - md_attr->rkey_packed_size = sizeof(uct_gdr_copy_key_t); - md_attr->reg_cost.overhead = 0; - md_attr->reg_cost.growth = 0; + md_attr->cap.flags = UCT_MD_FLAG_REG | + UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); + md_attr->cap.access_mem_type = UCS_MEMORY_TYPE_CUDA; + md_attr->cap.detect_mem_types = 0; + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = ULONG_MAX; + md_attr->rkey_packed_size = 
sizeof(uct_gdr_copy_key_t); + md_attr->reg_cost = ucs_linear_func_make(0, 0); memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); return UCS_OK; } @@ -60,13 +66,14 @@ static ucs_status_t uct_gdr_copy_mkey_pack(uct_md_h md, uct_mem_h memh, packed->vaddr = mem_hndl->info.va; packed->bar_ptr = mem_hndl->bar_ptr; + packed->mh = mem_hndl->mh; return UCS_OK; } -static ucs_status_t uct_gdr_copy_rkey_unpack(uct_md_component_t *mdc, - const void *rkey_buffer, uct_rkey_t *rkey_p, - void **handle_p) +static ucs_status_t uct_gdr_copy_rkey_unpack(uct_component_t *component, + const void *rkey_buffer, + uct_rkey_t *rkey_p, void **handle_p) { uct_gdr_copy_key_t *packed = (uct_gdr_copy_key_t *)rkey_buffer; uct_gdr_copy_key_t *key; @@ -79,6 +86,7 @@ static ucs_status_t uct_gdr_copy_rkey_unpack(uct_md_component_t *mdc, key->vaddr = packed->vaddr; key->bar_ptr = packed->bar_ptr; + key->mh = packed->mh; *handle_p = NULL; *rkey_p = (uintptr_t)key; @@ -86,55 +94,51 @@ static ucs_status_t uct_gdr_copy_rkey_unpack(uct_md_component_t *mdc, return UCS_OK; } -static ucs_status_t uct_gdr_copy_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, - void *handle) +static ucs_status_t uct_gdr_copy_rkey_release(uct_component_t *component, + uct_rkey_t rkey, void *handle) { ucs_assert(NULL == handle); ucs_free((void *)rkey); return UCS_OK; } -static ucs_status_t -uct_gdr_copy_mem_reg_internal(uct_md_h uct_md, void *address, size_t length, - unsigned flags, uct_gdr_copy_mem_t *mem_hndl) +UCS_PROFILE_FUNC(ucs_status_t, uct_gdr_copy_mem_reg_internal, + (uct_md, address, length, flags, mem_hndl), + uct_md_h uct_md, void *address, size_t length, + unsigned flags, uct_gdr_copy_mem_t *mem_hndl) { uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t); - CUdeviceptr d_ptr = ((CUdeviceptr )(char *) address); - gdr_mh_t mh; - void *bar_ptr; - gdr_info_t info; + CUdeviceptr d_ptr = ((CUdeviceptr )(char *) address); int ret; if (!length) { - mem_hndl->mh = 0; + memset(mem_hndl, 0, 
sizeof(*mem_hndl)); return UCS_OK; } - ret = gdr_pin_buffer(md->gdrcpy_ctx, d_ptr, length, 0, 0, &mh); + ret = gdr_pin_buffer(md->gdrcpy_ctx, d_ptr, length, 0, 0, &mem_hndl->mh); if (ret) { ucs_error("gdr_pin_buffer failed. length :%lu ret:%d", length, ret); goto err; } - ret = gdr_map(md->gdrcpy_ctx, mh, &bar_ptr, length); + ret = gdr_map(md->gdrcpy_ctx, mem_hndl->mh, &mem_hndl->bar_ptr, length); if (ret) { ucs_error("gdr_map failed. length :%lu ret:%d", length, ret); goto unpin_buffer; } - ret = gdr_get_info(md->gdrcpy_ctx, mh, &info); + mem_hndl->reg_size = length; + + ret = gdr_get_info(md->gdrcpy_ctx, mem_hndl->mh, &mem_hndl->info); if (ret) { ucs_error("gdr_get_info failed. ret:%d", ret); goto unmap_buffer; } - mem_hndl->mh = mh; - mem_hndl->info = info; - mem_hndl->bar_ptr = bar_ptr; - mem_hndl->reg_size = length; - ucs_trace("registered memory:%p..%p length:%lu info.va:0x%"PRIx64" bar_ptr:%p", - address, address + length, length, info.va, bar_ptr); + address, UCS_PTR_BYTE_OFFSET(address, length), length, + mem_hndl->info.va, mem_hndl->bar_ptr); return UCS_OK; @@ -144,7 +148,7 @@ uct_gdr_copy_mem_reg_internal(uct_md_h uct_md, void *address, size_t length, ucs_warn("gdr_unmap failed. unpin_size:%lu ret:%d", mem_hndl->reg_size, ret); } unpin_buffer: - ret = gdr_unpin_buffer(md->gdrcpy_ctx, mh); + ret = gdr_unpin_buffer(md->gdrcpy_ctx, mem_hndl->mh); if (ret) { ucs_warn("gdr_unpin_buffer failed. 
ret;%d", ret); } @@ -152,9 +156,10 @@ uct_gdr_copy_mem_reg_internal(uct_md_h uct_md, void *address, size_t length, return UCS_ERR_IO_ERROR; } -static ucs_status_t uct_gdr_copy_mem_dereg_internal(uct_md_h uct_md, uct_gdr_copy_mem_t *mem_hndl) +UCS_PROFILE_FUNC(ucs_status_t, uct_gdr_copy_mem_dereg_internal, + (uct_md, mem_hndl), + uct_md_h uct_md, uct_gdr_copy_mem_t *mem_hndl) { - uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t); int ret; @@ -189,10 +194,12 @@ static ucs_status_t uct_gdr_copy_mem_reg(uct_md_h uct_md, void *address, size_t } start = ucs_align_down_pow2_ptr(address, GPU_PAGE_SIZE); - end = ucs_align_up_pow2_ptr(address + length, GPU_PAGE_SIZE); + end = ucs_align_up_pow2_ptr(UCS_PTR_BYTE_OFFSET(address, length), GPU_PAGE_SIZE); ucs_assert_always(start <= end); - status = uct_gdr_copy_mem_reg_internal(uct_md, start, end - start, 0, mem_hndl); + status = uct_gdr_copy_mem_reg_internal(uct_md, start, + UCS_PTR_BYTE_DIFF(start, end), + 0, mem_hndl); if (status != UCS_OK) { ucs_free(mem_hndl); return status; @@ -216,32 +223,22 @@ static ucs_status_t uct_gdr_copy_mem_dereg(uct_md_h uct_md, uct_mem_h memh) return status; } -static ucs_status_t uct_gdr_copy_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) +static ucs_status_t +uct_gdr_copy_query_md_resources(uct_component_t *component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) { - int num_gpus; gdr_t ctx; - cudaError_t cudaErr; - - cudaErr = cudaGetDeviceCount(&num_gpus); - if ((cudaErr != cudaSuccess) || (num_gpus == 0)) { - ucs_debug("not found cuda devices"); - *resources_p = NULL; - *num_resources_p = 0; - return UCS_OK; - } ctx = gdr_open(); if (ctx == NULL) { ucs_debug("could not open gdr copy. 
disabling gdr copy resource"); - *resources_p = NULL; - *num_resources_p = 0; - return UCS_OK; + return uct_md_query_empty_md_resource(resources_p, num_resources_p); } gdr_close(ctx); - return uct_single_md_resource(&uct_gdr_copy_md_component, resources_p, - num_resources_p); + return uct_cuda_base_query_md_resources(component, resources_p, + num_resources_p); } static void uct_gdr_copy_md_close(uct_md_h uct_md) @@ -262,12 +259,12 @@ static void uct_gdr_copy_md_close(uct_md_h uct_md) } static uct_md_ops_t md_ops = { - .close = uct_gdr_copy_md_close, - .query = uct_gdr_copy_md_query, - .mkey_pack = uct_gdr_copy_mkey_pack, - .mem_reg = uct_gdr_copy_mem_reg, - .mem_dereg = uct_gdr_copy_mem_dereg, - .is_mem_type_owned = uct_cuda_is_mem_type_owned, + .close = uct_gdr_copy_md_close, + .query = uct_gdr_copy_md_query, + .mkey_pack = uct_gdr_copy_mkey_pack, + .mem_reg = uct_gdr_copy_mem_reg, + .mem_dereg = uct_gdr_copy_mem_dereg, + .detect_memory_type = ucs_empty_function_return_unsupported, }; static inline uct_gdr_copy_rcache_region_t* @@ -307,12 +304,12 @@ static ucs_status_t uct_gdr_copy_mem_rcache_dereg(uct_md_h uct_md, uct_mem_h mem } static uct_md_ops_t md_rcache_ops = { - .close = uct_gdr_copy_md_close, - .query = uct_gdr_copy_md_query, - .mkey_pack = uct_gdr_copy_mkey_pack, - .mem_reg = uct_gdr_copy_mem_rcache_reg, - .mem_dereg = uct_gdr_copy_mem_rcache_dereg, - .is_mem_type_owned = uct_cuda_is_mem_type_owned, + .close = uct_gdr_copy_md_close, + .query = uct_gdr_copy_md_query, + .mkey_pack = uct_gdr_copy_mkey_pack, + .mem_reg = uct_gdr_copy_mem_rcache_reg, + .mem_dereg = uct_gdr_copy_mem_rcache_dereg, + .detect_memory_type = ucs_empty_function_return_unsupported, }; static ucs_status_t @@ -358,12 +355,12 @@ static ucs_rcache_ops_t uct_gdr_copy_rcache_ops = { .dump_region = uct_gdr_copy_rcache_dump_region_cb }; -static ucs_status_t uct_gdr_copy_md_open(const char *md_name, - const uct_md_config_t *uct_md_config, - uct_md_h *md_p) +static ucs_status_t 
+uct_gdr_copy_md_open(uct_component_t *component, const char *md_name, + const uct_md_config_t *config, uct_md_h *md_p) { - const uct_gdr_copy_md_config_t *md_config = ucs_derived_of(uct_md_config, - uct_gdr_copy_md_config_t); + const uct_gdr_copy_md_config_t *md_config = + ucs_derived_of(config, uct_gdr_copy_md_config_t); ucs_status_t status; uct_gdr_copy_md_t *md; ucs_rcache_params_t rcache_params; @@ -374,10 +371,10 @@ static ucs_status_t uct_gdr_copy_md_open(const char *md_name, return UCS_ERR_NO_MEMORY; } - md->super.ops = &md_ops; - md->super.component = &uct_gdr_copy_md_component; - md->rcache = NULL; - md->reg_cost = md_config->uc_reg_cost; + md->super.ops = &md_ops; + md->super.component = &uct_gdr_copy_component; + md->rcache = NULL; + md->reg_cost = md_config->uc_reg_cost; md->gdrcpy_ctx = gdr_open(); if (md->gdrcpy_ctx == NULL) { @@ -394,11 +391,11 @@ static ucs_status_t uct_gdr_copy_md_open(const char *md_name, rcache_params.ucm_event_priority = md_config->rcache.event_prio; rcache_params.context = md; rcache_params.ops = &uct_gdr_copy_rcache_ops; + rcache_params.flags = 0; status = ucs_rcache_create(&rcache_params, "gdr_copy", NULL, &md->rcache); if (status == UCS_OK) { - md->super.ops = &md_rcache_ops; - md->reg_cost.overhead = 0; - md->reg_cost.growth = 0; + md->super.ops = &md_rcache_ops; + md->reg_cost = ucs_linear_func_make(0, 0); } else { ucs_assert(md->rcache == NULL); if (md_config->enable_rcache == UCS_YES) { @@ -422,7 +419,23 @@ static ucs_status_t uct_gdr_copy_md_open(const char *md_name, goto out; } -UCT_MD_COMPONENT_DEFINE(uct_gdr_copy_md_component, UCT_GDR_COPY_MD_NAME, - uct_gdr_copy_query_md_resources, uct_gdr_copy_md_open, NULL, - uct_gdr_copy_rkey_unpack, uct_gdr_copy_rkey_release, "GDR_COPY_", - uct_gdr_copy_md_config_table, uct_gdr_copy_md_config_t); +uct_component_t uct_gdr_copy_component = { + .query_md_resources = uct_gdr_copy_query_md_resources, + .md_open = uct_gdr_copy_md_open, + .cm_open = 
ucs_empty_function_return_unsupported, + .rkey_unpack = uct_gdr_copy_rkey_unpack, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = uct_gdr_copy_rkey_release, + .name = "gdr_copy", + .md_config = { + .name = "GDR-copy memory domain", + .prefix = "GDR_COPY_", + .table = uct_gdr_copy_md_config_table, + .size = sizeof(uct_gdr_copy_md_config_t), + }, + .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_gdr_copy_component), + .flags = 0 +}; +UCT_COMPONENT_REGISTER(&uct_gdr_copy_component); + diff --git a/src/uct/cuda/gdr_copy/gdr_copy_md.h b/src/uct/cuda/gdr_copy/gdr_copy_md.h index 06e1927a8cb..275ded010b0 100644 --- a/src/uct/cuda/gdr_copy/gdr_copy_md.h +++ b/src/uct/cuda/gdr_copy/gdr_copy_md.h @@ -1,5 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2017. ALL RIGHTS RESERVED. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * See file LICENSE for terms. */ @@ -11,10 +12,8 @@ #include #include "gdrapi.h" -#define UCT_GDR_COPY_MD_NAME "gdr_copy" - -extern uct_md_component_t uct_gdr_copy_md_component; +extern uct_component_t uct_gdr_copy_component; /** @@ -24,7 +23,7 @@ typedef struct uct_gdr_copy_md { uct_md_t super; /**< Domain info */ gdr_t gdrcpy_ctx; /**< gdr copy context */ ucs_rcache_t *rcache; /**< Registration cache (can be NULL) */ - uct_linear_growth_t reg_cost; /**< Memory registration cost */ + ucs_linear_func_t reg_cost; /**< Memory registration cost */ } uct_gdr_copy_md_t; @@ -35,7 +34,7 @@ typedef struct uct_gdr_copy_md_config { uct_md_config_t super; int enable_rcache;/**< Enable registration cache */ uct_md_rcache_config_t rcache; /**< Registration cache config */ - uct_linear_growth_t uc_reg_cost; /**< Memory registration cost estimation + ucs_linear_func_t uc_reg_cost; /**< Memory registration cost estimation without using the cache */ } uct_gdr_copy_md_config_t; @@ -57,6 +56,7 @@ typedef struct uct_gdr_copy_mem { typedef struct uct_gdr_copy_key { uint64_t 
vaddr; /**< Mapped GPU address */ void *bar_ptr; /**< BAR address of GPU mapping */ + gdr_mh_t mh; /**< Memory handle of GPU memory */ } uct_gdr_copy_key_t; diff --git a/src/uct/ib/Makefile.am b/src/uct/ib/Makefile.am index 694f2435e8f..61f2cdd08e6 100644 --- a/src/uct/ib/Makefile.am +++ b/src/uct/ib/Makefile.am @@ -13,10 +13,7 @@ libuct_ib_la_CFLAGS = $(BASE_CFLAGS) libuct_ib_la_LIBADD = $(top_builddir)/src/ucs/libucs.la \ $(top_builddir)/src/uct/libuct.la libuct_ib_la_LDFLAGS = $(IBVERBS_LDFLAGS) $(NUMA_LIBS) -version-info $(SOVERSION) -libmlx5_ver = $(shell (rpm -qf $(IBVERBS_DIR)/include/infiniband/mlx5_hw.h &>/dev/null && rpm -qf /usr/include/infiniband/mlx5_hw.h) | head -1) -libuct_ib_ladir = $(includedir)/uct/ib - -nobase_dist_libuct_ib_la_HEADERS = +libmlx5_ver = $(shell (rpm -qf $(IBVERBS_DIR)/include/infiniband/mlx5_hw.h &>/dev/null && rpm -qf /usr/include/infiniband/mlx5_hw.h) | grep -v 'not owned' | head -1) noinst_HEADERS = \ base/ib_device.h \ @@ -31,15 +28,6 @@ libuct_ib_la_SOURCES = \ base/ib_log.c \ base/ib_md.c -devel_headers = \ - base/ib_alloc.h - -if INSTALL_DEVEL_HEADERS -nobase_dist_libuct_ib_la_HEADERS += $(devel_headers) -else -noinst_HEADERS += $(devel_headers) -endif - # TODO separate module for mlx5 if HAVE_MLX5_HW libuct_ib_la_CPPFLAGS += -DUCT_IB_LIBMLX5_VER=\"$(libmlx5_ver)\" @@ -48,18 +36,29 @@ noinst_HEADERS += \ mlx5/ib_mlx5_log.h \ mlx5/ib_mlx5.h \ mlx5/ib_mlx5.inl \ - mlx5/ib_mlx5_dv.h \ - mlx5/ib_mlx5_hw.h \ - mlx5/ib_mlx5_ifc.h + mlx5/dv/ib_mlx5_dv.h \ + mlx5/dv/ib_mlx5_ifc.h \ + mlx5/exp/ib_mlx5_hw.h libuct_ib_la_SOURCES += \ mlx5/ib_mlx5_log.c \ - mlx5/ib_mlx5.c \ - mlx5/ib_mlx5_dv.c \ - mlx5/ib_mlx5_hw.c + mlx5/ib_mlx5.c + +if HAVE_EXP +noinst_HEADERS += \ + mlx5/exp/ib_exp.h + +libuct_ib_la_SOURCES += \ + mlx5/exp/ib_mlx5_hw.c \ + mlx5/exp/ib_exp.c \ + mlx5/exp/ib_exp_md.c +endif # HAVE_EXP if HAVE_MLX5_DV libuct_ib_la_LDFLAGS += $(LIB_MLX5) +libuct_ib_la_SOURCES += \ + mlx5/dv/ib_mlx5_dv.c \ + mlx5/dv/ib_mlx5dv_md.c endif 
# HAVE_MLX5_DV endif # HAVE_MLX5_HW @@ -91,6 +90,11 @@ libuct_ib_la_SOURCES += \ rc/accel/rc_mlx5_common.c endif # HAVE_MLX5_HW +if HAVE_DEVX +libuct_ib_la_SOURCES += \ + rc/accel/rc_mlx5_devx.c +endif # HAVE_DEVX + endif # HAVE_TL_RC @@ -102,6 +106,12 @@ noinst_HEADERS += \ libuct_ib_la_SOURCES += \ dc/dc_mlx5_ep.c \ dc/dc_mlx5.c + +if HAVE_DEVX +libuct_ib_la_SOURCES += \ + dc/dc_mlx5_devx.c +endif # HAVE_DEVX + endif # HAVE_TL_DC diff --git a/src/uct/ib/base/ib_alloc.h b/src/uct/ib/base/ib_alloc.h deleted file mode 100644 index 0b5eb419bcc..00000000000 --- a/src/uct/ib/base/ib_alloc.h +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. - * - * See file LICENSE for terms. - */ - -#ifndef UCT_IB_ALLOC_H_ -#define UCT_IB_ALLOC_H_ - -#include - -BEGIN_C_DECLS - -typedef struct uct_ib_device_mem *uct_ib_device_mem_h; - -ucs_status_t uct_ib_md_alloc_device_mem(uct_md_h uct_md, size_t *length_p, - void **address_p, unsigned flags, - const char *alloc_name, - uct_ib_device_mem_h *dev_mem_p); - -void uct_ib_md_release_device_mem(uct_ib_device_mem_h dev_mem); - -END_C_DECLS - -#endif diff --git a/src/uct/ib/base/ib_device.c b/src/uct/ib/base/ib_device.c index af67993f96a..d6053a10d04 100644 --- a/src/uct/ib/base/ib_device.c +++ b/src/uct/ib/base/ib_device.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ib_device.h" #include "ib_md.h" @@ -13,18 +17,21 @@ #include #include #include +#include #include #include #include +#include -typedef struct { - union ibv_gid gid; - struct { - uint8_t major; - uint8_t minor; - } roce_version; -} uct_ib_device_gid_info_t; +/* This table is according to "Encoding for RNR NAK Timer Field" + * in IBTA specification */ +const double uct_ib_qp_rnr_time_ms[] = { + 655.36, 0.01, 0.02, 0.03, 0.04, 0.06, 0.08, 0.12, + 0.16, 0.24, 0.32, 0.48, 0.64, 0.96, 1.28, 1.92, + 2.56, 3.84, 5.12, 7.68, 10.24, 15.36, 20.48, 30.72, + 40.96, 61.44, 81.92, 122.88, 163.84, 245.76, 327.68, 491.52 +}; /* use both gid + lid data for key generarion (lid - ib based, gid - RoCE) */ @@ -46,7 +53,7 @@ KHASH_IMPL(uct_ib_ah, struct ibv_ah_attr, struct ibv_ah*, 1, uct_ib_kh_ah_hash_func, uct_ib_kh_ah_hash_equal) -#if ENABLE_STATS +#ifdef ENABLE_STATS static ucs_stats_class_t uct_ib_device_stats_class = { .name = "", .num_counters = UCT_IB_DEVICE_STAT_LAST, @@ -57,44 +64,55 @@ static ucs_stats_class_t uct_ib_device_stats_class = { #endif static uct_ib_device_spec_t uct_ib_builtin_device_specs[] = { - {0x02c9, 4099, "ConnectX-3", + {"ConnectX-3", {0x15b3, 4099}, UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX4_PRM, 10}, - {0x02c9, 4103, "ConnectX-3 Pro", + {"ConnectX-3 Pro", {0x15b3, 4103}, UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX4_PRM, 11}, - {0x02c9, 4113, "Connect-IB", + {"Connect-IB", {0x15b3, 4113}, UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM | UCT_IB_DEVICE_FLAG_DC_V1, 20}, - {0x02c9, 4115, "ConnectX-4", + {"ConnectX-4", {0x15b3, 4115}, UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM | UCT_IB_DEVICE_FLAG_DC_V1, 30}, - {0x02c9, 4116, "ConnectX-4", + {"ConnectX-4", {0x15b3, 4116}, UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM | UCT_IB_DEVICE_FLAG_DC_V1, 29}, - {0x02c9, 4117, "ConnectX-4 LX", + {"ConnectX-4 LX", {0x15b3, 4117}, 
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM | UCT_IB_DEVICE_FLAG_DC_V1, 28}, - {0x02c9, 4119, "ConnectX-5", + {"ConnectX-4 LX VF", {0x15b3, 4118}, + UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM | + UCT_IB_DEVICE_FLAG_DC_V1, 28}, + {"ConnectX-5", {0x15b3, 4119}, UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM | UCT_IB_DEVICE_FLAG_DC_V2, 38}, - {0x02c9, 4121, "ConnectX-5", + {"ConnectX-5", {0x15b3, 4121}, UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM | UCT_IB_DEVICE_FLAG_DC_V2, 40}, - {0x02c9, 4120, "ConnectX-5", + {"ConnectX-5", {0x15b3, 4120}, UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM | UCT_IB_DEVICE_FLAG_DC_V2, 39}, - {0x02c9, 41682, "ConnectX-5", + {"ConnectX-5", {0x15b3, 41682}, UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM | UCT_IB_DEVICE_FLAG_DC_V2, 37}, - {0x02c9, 4122, "ConnectX-5", + {"ConnectX-5", {0x15b3, 4122}, UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM | UCT_IB_DEVICE_FLAG_DC_V2, 36}, - {0x02c9, 4123, "ConnectX-6", + {"ConnectX-6", {0x15b3, 4123}, + UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM | + UCT_IB_DEVICE_FLAG_DC_V2, 50}, + {"ConnectX-6 VF", {0x15b3, 4124}, UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM | UCT_IB_DEVICE_FLAG_DC_V2, 50}, - {0, 0, "Generic HCA", 0, 0} + {"ConnectX-6 DX", {0x15b3, 4125}, + UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM | + UCT_IB_DEVICE_FLAG_DC_V2, 50}, + {"Generic HCA", {0, 0}, 0, 0}, + {NULL} }; -static void uct_ib_device_get_locailty(const char *dev_name, cpu_set_t *cpu_mask, +static void uct_ib_device_get_locality(const char *dev_name, + ucs_sys_cpuset_t *cpu_mask, int *numa_node) { char *p, buf[ucs_max(CPU_SETSIZE, 10)]; @@ -106,9 +124,8 @@ static void uct_ib_device_get_locailty(const char *dev_name, cpu_set_t *cpu_mask /* Read list of CPUs close to the device */ CPU_ZERO(cpu_mask); - nread = ucs_read_file(buf, sizeof(buf) - 1, 1, - "/sys/class/infiniband/%s/device/local_cpus", - 
dev_name); + nread = ucs_read_file(buf, sizeof(buf) - 1, 1, UCT_IB_DEVICE_SYSFS_FMT, + dev_name, "local_cpus"); if (nread >= 0) { buf[CPU_SETSIZE - 1] = '\0'; base = 0; @@ -142,15 +159,14 @@ static void uct_ib_device_get_locailty(const char *dev_name, cpu_set_t *cpu_mask *numa_node = (status == UCS_OK) ? n : -1; } -static void uct_ib_async_event_handler(int fd, void *arg) +static void uct_ib_async_event_handler(int fd, int events, void *arg) { uct_ib_device_t *dev = arg; - struct ibv_async_event event; - ucs_log_level_t level; - char event_info[200]; + struct ibv_async_event ibevent; + uct_ib_async_event_t event; int ret; - ret = ibv_get_async_event(dev->ibv_context, &event); + ret = ibv_get_async_event(dev->ibv_context, &ibevent); if (ret != 0) { if (errno != EAGAIN) { ucs_warn("ibv_get_async_event() failed: %m"); @@ -158,10 +174,65 @@ static void uct_ib_async_event_handler(int fd, void *arg) return; } + event.event_type = ibevent.event_type; switch (event.event_type) { + case IBV_EVENT_CQ_ERR: + event.cookie = ibevent.element.cq; + break; + case IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + case IBV_EVENT_QP_LAST_WQE_REACHED: + event.qp_num = ibevent.element.qp->qp_num; + break; + case IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + event.cookie = ibevent.element.srq; + break; + case IBV_EVENT_DEVICE_FATAL: + case IBV_EVENT_PORT_ERR: + case IBV_EVENT_PORT_ACTIVE: +#if HAVE_DECL_IBV_EVENT_GID_CHANGE + case IBV_EVENT_GID_CHANGE: +#endif + case IBV_EVENT_LID_CHANGE: + case IBV_EVENT_PKEY_CHANGE: + case IBV_EVENT_SM_CHANGE: + case IBV_EVENT_CLIENT_REREGISTER: + event.port_num = ibevent.element.port_num; + break; +#ifdef HAVE_STRUCT_IBV_ASYNC_EVENT_ELEMENT_DCT + case IBV_EXP_EVENT_DCT_KEY_VIOLATION: + case IBV_EXP_EVENT_DCT_ACCESS_ERR: + case IBV_EXP_EVENT_DCT_REQ_ERR: + if (ibevent.element.dct) { + 
event.dct_num = ibevent.element.dct->dct_num; + } else { + event.dct_num = 0; + } + break; +#endif + default: + break; + }; + + uct_ib_handle_async_event(dev, &event); + ibv_ack_async_event(&ibevent); +} + +void uct_ib_handle_async_event(uct_ib_device_t *dev, uct_ib_async_event_t *event) +{ + char event_info[200]; + ucs_log_level_t level; + + switch (event->event_type) { case IBV_EVENT_CQ_ERR: snprintf(event_info, sizeof(event_info), "%s on CQ %p", - ibv_event_type_str(event.event_type), event.element.cq); + ibv_event_type_str(event->event_type), event->cookie); level = UCS_LOG_LEVEL_ERROR; break; case IBV_EVENT_QP_FATAL: @@ -172,28 +243,28 @@ static void uct_ib_async_event_handler(int fd, void *arg) case IBV_EVENT_PATH_MIG: case IBV_EVENT_PATH_MIG_ERR: snprintf(event_info, sizeof(event_info), "%s on QPN 0x%x", - ibv_event_type_str(event.event_type), event.element.qp->qp_num); + ibv_event_type_str(event->event_type), event->qp_num); level = UCS_LOG_LEVEL_ERROR; break; case IBV_EVENT_QP_LAST_WQE_REACHED: snprintf(event_info, sizeof(event_info), "SRQ-attached QP 0x%x was flushed", - event.element.qp->qp_num); + event->qp_num); level = UCS_LOG_LEVEL_DEBUG; break; case IBV_EVENT_SRQ_ERR: level = UCS_LOG_LEVEL_ERROR; snprintf(event_info, sizeof(event_info), "%s on SRQ %p", - ibv_event_type_str(event.event_type), event.element.srq); + ibv_event_type_str(event->event_type), event->cookie); break; case IBV_EVENT_SRQ_LIMIT_REACHED: snprintf(event_info, sizeof(event_info), "%s on SRQ %p", - ibv_event_type_str(event.event_type), event.element.srq); + ibv_event_type_str(event->event_type), event->cookie); level = UCS_LOG_LEVEL_DEBUG; break; case IBV_EVENT_DEVICE_FATAL: case IBV_EVENT_PORT_ERR: snprintf(event_info, sizeof(event_info), "%s on port %d", - ibv_event_type_str(event.event_type), event.element.port_num); + ibv_event_type_str(event->event_type), event->port_num); level = UCS_LOG_LEVEL_ERROR; break; case IBV_EVENT_PORT_ACTIVE: @@ -205,19 +276,19 @@ static void 
uct_ib_async_event_handler(int fd, void *arg) case IBV_EVENT_SM_CHANGE: case IBV_EVENT_CLIENT_REREGISTER: snprintf(event_info, sizeof(event_info), "%s on port %d", - ibv_event_type_str(event.event_type), event.element.port_num); + ibv_event_type_str(event->event_type), event->port_num); level = UCS_LOG_LEVEL_WARN; break; -#if HAVE_STRUCT_IBV_ASYNC_EVENT_ELEMENT_DCT +#ifdef HAVE_STRUCT_IBV_ASYNC_EVENT_ELEMENT_DCT case IBV_EXP_EVENT_DCT_KEY_VIOLATION: snprintf(event_info, sizeof(event_info), "%s on DCTN 0x%x", - "DCT key violation", event.element.dct->dct_num); + "DCT key violation", event->dct_num); level = UCS_LOG_LEVEL_ERROR; break; case IBV_EXP_EVENT_DCT_ACCESS_ERR: - if (event.element.dct) { + if (event->dct_num) { snprintf(event_info, sizeof(event_info), "%s on DCTN 0x%x", - "DCT access error", event.element.dct->dct_num); + "DCT access error", event->dct_num); } else { snprintf(event_info, sizeof(event_info), "%s on DCTN UNKNOWN", "DCT access error"); @@ -226,31 +297,52 @@ static void uct_ib_async_event_handler(int fd, void *arg) break; case IBV_EXP_EVENT_DCT_REQ_ERR: snprintf(event_info, sizeof(event_info), "%s on DCTN 0x%x", - "DCT requester error", event.element.dct->dct_num); + "DCT requester error", event->dct_num); level = UCS_LOG_LEVEL_ERROR; break; #endif default: snprintf(event_info, sizeof(event_info), "%s (%d)", - ibv_event_type_str(event.event_type), event.event_type); + ibv_event_type_str(event->event_type), event->event_type); level = UCS_LOG_LEVEL_INFO; break; }; UCS_STATS_UPDATE_COUNTER(dev->stats, UCT_IB_DEVICE_STAT_ASYNC_EVENT, +1); ucs_log(level, "IB Async event on %s: %s", uct_ib_device_name(dev), event_info); - ibv_ack_async_event(&event); } -ucs_status_t uct_ib_device_init(uct_ib_device_t *dev, - struct ibv_device *ibv_device, int async_events - UCS_STATS_ARG(ucs_stats_node_t *stats_parent)) +static void uct_ib_device_get_ids(uct_ib_device_t *dev) +{ + long vendor_id, device_id; + + if ((ucs_read_file_number(&vendor_id, 1, 
UCT_IB_DEVICE_SYSFS_FMT, + uct_ib_device_name(dev), "vendor") == UCS_OK) && + (ucs_read_file_number(&device_id, 1, UCT_IB_DEVICE_SYSFS_FMT, + uct_ib_device_name(dev), "device") == UCS_OK)) { + dev->pci_id.vendor = vendor_id; + dev->pci_id.device = device_id; + ucs_debug("%s vendor_id: 0x%x device_id: %d", uct_ib_device_name(dev), + dev->pci_id.vendor, dev->pci_id.device); + } else { + dev->pci_id.vendor = 0; + dev->pci_id.device = 0; + ucs_warn("%s: could not read device/vendor id from sysfs, " + "performance may be affected", uct_ib_device_name(dev)); + } +} + +ucs_status_t uct_ib_device_query(uct_ib_device_t *dev, + struct ibv_device *ibv_device) { ucs_status_t status; uint8_t i; int ret; - dev->async_events = async_events; + status = uct_ib_query_device(dev->ibv_context, &dev->dev_attr); + if (status != UCS_OK) { + return status; + } /* Check device type*/ switch (ibv_device->node_type) { @@ -269,30 +361,39 @@ ucs_status_t uct_ib_device_init(uct_ib_device_t *dev, ucs_error("%s has %d ports, but only up to %d are supported", ibv_get_device_name(ibv_device), dev->num_ports, UCT_IB_DEV_MAX_PORTS); - status = UCS_ERR_UNSUPPORTED; - goto err_free_context; + return UCS_ERR_UNSUPPORTED; } - /* Get device locality */ - uct_ib_device_get_locailty(ibv_get_device_name(ibv_device), &dev->local_cpus, - &dev->numa_node); - /* Query all ports */ for (i = 0; i < dev->num_ports; ++i) { - IBV_EXP_PORT_ATTR_SET_COMP_MASK(&dev->port_attr[i]); - ret = ibv_exp_query_port(dev->ibv_context, i + dev->first_port, - &dev->port_attr[i]); + ret = ibv_query_port(dev->ibv_context, i + dev->first_port, + &dev->port_attr[i]); if (ret != 0) { ucs_error("ibv_query_port() returned %d: %m", ret); - status = UCS_ERR_IO_ERROR; - goto err_free_context; + return UCS_ERR_IO_ERROR; } } + uct_ib_device_get_ids(dev); + + return UCS_OK; +} + +ucs_status_t uct_ib_device_init(uct_ib_device_t *dev, + struct ibv_device *ibv_device, int async_events + UCS_STATS_ARG(ucs_stats_node_t *stats_parent)) +{ + 
ucs_status_t status; + + dev->async_events = async_events; + + uct_ib_device_get_locality(ibv_get_device_name(ibv_device), &dev->local_cpus, + &dev->numa_node); + status = UCS_STATS_NODE_ALLOC(&dev->stats, &uct_ib_device_stats_class, stats_parent, "device"); if (status != UCS_OK) { - goto err_free_context; + goto err; } status = ucs_sys_fcntl_modfl(dev->ibv_context->async_fd, O_NONBLOCK, 0); @@ -303,7 +404,8 @@ ucs_status_t uct_ib_device_init(uct_ib_device_t *dev, /* Register to IB async events */ if (dev->async_events) { status = ucs_async_set_event_handler(UCS_ASYNC_THREAD_LOCK_TYPE, - dev->ibv_context->async_fd, POLLIN, + dev->ibv_context->async_fd, + UCS_EVENT_SET_EVREAD, uct_ib_async_event_handler, dev, NULL); if (status != UCS_OK) { @@ -312,7 +414,7 @@ ucs_status_t uct_ib_device_init(uct_ib_device_t *dev, } kh_init_inplace(uct_ib_ah, &dev->ah_hash); - ucs_spinlock_init(&dev->ah_lock); + ucs_recursive_spinlock_init(&dev->ah_lock, 0); ucs_debug("initialized device '%s' (%s) with %d ports", uct_ib_device_name(dev), ibv_node_type_str(ibv_device->node_type), @@ -321,8 +423,7 @@ ucs_status_t uct_ib_device_init(uct_ib_device_t *dev, err_release_stats: UCS_STATS_NODE_FREE(dev->stats); -err_free_context: - ibv_close_device(dev->ibv_context); +err: return status; } @@ -335,23 +436,28 @@ void uct_ib_device_cleanup_ah_cached(uct_ib_device_t *dev) void uct_ib_device_cleanup(uct_ib_device_t *dev) { + ucs_status_t status; + ucs_debug("destroying ib device %s", uct_ib_device_name(dev)); kh_destroy_inplace(uct_ib_ah, &dev->ah_hash); - ucs_spinlock_destroy(&dev->ah_lock); + + status = ucs_recursive_spinlock_destroy(&dev->ah_lock); + if (status != UCS_OK) { + ucs_warn("ucs_recursive_spinlock_destroy() failed (%d)", status); + } if (dev->async_events) { ucs_async_remove_handler(dev->ibv_context->async_fd, 1); } UCS_STATS_NODE_FREE(dev->stats); - ibv_close_device(dev->ibv_context); } static inline int uct_ib_device_spec_match(uct_ib_device_t *dev, const uct_ib_device_spec_t 
*spec) { - return (spec->vendor_id == IBV_DEV_ATTR(dev, vendor_id)) && - (spec->part_id == IBV_DEV_ATTR(dev, vendor_part_id)); + return (spec->pci_id.vendor == dev->pci_id.vendor) && + (spec->pci_id.device == dev->pci_id.device); } const uct_ib_device_spec_t* uct_ib_device_spec(uct_ib_device_t *dev) @@ -369,7 +475,7 @@ const uct_ib_device_spec_t* uct_ib_device_spec(uct_ib_device_t *dev) /* search through built-in list of device specifications */ spec = uct_ib_builtin_device_specs; - while ((spec->vendor_id != 0) && !uct_ib_device_spec_match(dev, spec)) { + while ((spec->name != NULL) && !uct_ib_device_spec_match(dev, spec)) { ++spec; } return spec; /* if no match is found, return the last entry, which contains @@ -378,7 +484,7 @@ const uct_ib_device_spec_t* uct_ib_device_spec(uct_ib_device_t *dev) static size_t uct_ib_device_get_ib_gid_index(uct_ib_md_t *md) { - if (md->config.gid_index == UCS_CONFIG_ULUNITS_AUTO) { + if (md->config.gid_index == UCS_ULUNITS_AUTO) { return UCT_IB_MD_DEFAULT_GID_INDEX; } else { return md->config.gid_index; @@ -398,7 +504,6 @@ ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num, uint8_t required_dev_flags; ucs_status_t status; union ibv_gid gid; - int is_roce_v2; if (port_num < dev->first_port || port_num >= dev->first_port + dev->num_ports) { return UCS_ERR_NO_DEVICE; @@ -441,13 +546,11 @@ ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num, if (md->check_subnet_filter && uct_ib_device_is_port_ib(dev, port_num)) { status = uct_ib_device_query_gid(dev, port_num, - uct_ib_device_get_ib_gid_index(md), &gid, - &is_roce_v2); - if (status) { + uct_ib_device_get_ib_gid_index(md), &gid); + if (status != UCS_OK) { return status; } - ucs_assert(is_roce_v2 == 0); if (md->subnet_filter != gid.global.subnet_prefix) { ucs_trace("%s:%d subnet_prefix does not match", uct_ib_device_name(dev), port_num); @@ -458,6 +561,26 @@ ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num, 
return UCS_OK; } +const char *uct_ib_roce_version_str(uct_ib_roce_version_t roce_ver) +{ + switch (roce_ver) { + case UCT_IB_DEVICE_ROCE_V1: + return "RoCE v1"; + case UCT_IB_DEVICE_ROCE_V1_5: + return "RoCE v1.5"; + case UCT_IB_DEVICE_ROCE_V2: + return "RoCE v2"; + default: + return ""; + } +} + +const char *uct_ib_gid_str(const union ibv_gid *gid, char *str, size_t max_size) +{ + inet_ntop(AF_INET6, gid, str, max_size); + return str; +} + static int uct_ib_device_is_addr_ipv4_mcast(const struct in6_addr *raw, const uint32_t addr_last_bits) { @@ -473,7 +596,7 @@ static sa_family_t uct_ib_device_get_addr_family(union ibv_gid *gid, int gid_ind char p[128]; ucs_debug("testing addr_family on gid index %d: %s", - gid_index, inet_ntop(AF_INET6, gid, p, sizeof(p))); + gid_index, uct_ib_gid_str(gid, p, sizeof(p))); if (!((raw->s6_addr32[0] | raw->s6_addr32[1]) | addr_last_bits) || uct_ib_device_is_addr_ipv4_mcast(raw, addr_last_bits)) { @@ -483,9 +606,10 @@ static sa_family_t uct_ib_device_get_addr_family(union ibv_gid *gid, int gid_ind } } -static ucs_status_t -uct_ib_device_query_gid_info(uct_ib_device_t *dev, uint8_t port_num, - unsigned gid_index, uct_ib_device_gid_info_t *info) +ucs_status_t +uct_ib_device_query_gid_info(struct ibv_context *ctx, const char *dev_name, + uint8_t port_num, unsigned gid_index, + uct_ib_device_gid_info_t *info) { int ret; @@ -493,82 +617,135 @@ uct_ib_device_query_gid_info(uct_ib_device_t *dev, uint8_t port_num, struct ibv_exp_gid_attr attr; attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE | IBV_EXP_QUERY_GID_ATTR_GID; - ret = ibv_exp_query_gid_attr(dev->ibv_context, port_num, gid_index, &attr); + ret = ibv_exp_query_gid_attr(ctx, port_num, gid_index, &attr); if (ret == 0) { - info->gid = attr.gid; + info->gid = attr.gid; + info->gid_index = gid_index; + info->roce_info.addr_family = + uct_ib_device_get_addr_family(&info->gid, gid_index); switch (attr.type) { case IBV_EXP_IB_ROCE_V1_GID_TYPE: - info->roce_version.major = 1; - 
info->roce_version.minor = 0; + info->roce_info.ver = UCT_IB_DEVICE_ROCE_V1; return UCS_OK; case IBV_EXP_ROCE_V1_5_GID_TYPE: - info->roce_version.major = 1; - info->roce_version.minor = 5; + info->roce_info.ver = UCT_IB_DEVICE_ROCE_V1_5; return UCS_OK; case IBV_EXP_ROCE_V2_GID_TYPE: - info->roce_version.major = 2; - info->roce_version.minor = 0; + info->roce_info.ver = UCT_IB_DEVICE_ROCE_V2; return UCS_OK; default: ucs_error("Invalid GID[%d] type on %s:%d: %d", - gid_index, uct_ib_device_name(dev), port_num, attr.type); + gid_index, dev_name, port_num, attr.type); return UCS_ERR_IO_ERROR; } } #else - ret = ibv_query_gid(dev->ibv_context, port_num, gid_index, &info->gid); + char buf[16]; + + ret = ibv_query_gid(ctx, port_num, gid_index, &info->gid); if (ret == 0) { - info->roce_version.major = 1; - info->roce_version.minor = 0; + ret = ucs_read_file(buf, sizeof(buf) - 1, 1, + UCT_IB_DEVICE_SYSFS_GID_TYPE_FMT, + dev_name, port_num, gid_index); + if (ret > 0) { + if (!strncmp(buf, "IB/RoCE v1", 10)) { + info->roce_info.ver = UCT_IB_DEVICE_ROCE_V1; + } else if (!strncmp(buf, "RoCE v2", 7)) { + info->roce_info.ver = UCT_IB_DEVICE_ROCE_V2; + } else { + ucs_error("failed to parse gid type '%s' (dev=%s port=%d index=%d)", + buf, dev_name, port_num, gid_index); + return UCS_ERR_INVALID_PARAM; + } + } else { + info->roce_info.ver = UCT_IB_DEVICE_ROCE_V1; + } + + info->roce_info.addr_family = + uct_ib_device_get_addr_family(&info->gid, gid_index); + info->gid_index = gid_index; return UCS_OK; } #endif ucs_error("ibv_query_gid(dev=%s port=%d index=%d) failed: %m", - uct_ib_device_name(dev), port_num, gid_index); + dev_name, port_num, gid_index); return UCS_ERR_INVALID_PARAM; } -static ucs_status_t uct_ib_device_set_roce_gid_index(uct_ib_device_t *dev, - uint8_t port_num, - uint8_t *gid_index) +int uct_ib_device_test_roce_gid_index(uct_ib_device_t *dev, uint8_t port_num, + const union ibv_gid *gid, + uint8_t gid_index) { - static const uct_ib_roce_version_desc_t roce_prio[] = { 
- {2, 0, AF_INET}, - {2, 0, AF_INET6}, - {1, 0, AF_INET}, - {1, 0, AF_INET6} + struct ibv_ah_attr ah_attr; + struct ibv_ah *ah; + + ucs_assert(uct_ib_device_is_port_roce(dev, port_num)); + + memset(&ah_attr, 0, sizeof(ah_attr)); + ah_attr.port_num = port_num; + ah_attr.is_global = 1; + ah_attr.grh.dgid = *gid; + ah_attr.grh.sgid_index = gid_index; + ah_attr.grh.hop_limit = 255; + ah_attr.grh.flow_label = 1; + ah_attr.dlid = UCT_IB_ROCE_UDP_SRC_PORT_BASE; + + ah = ibv_create_ah(ucs_container_of(dev, uct_ib_md_t, dev)->pd, &ah_attr); + if (ah == NULL) { + return 0; /* gid entry is not operational */ + } + + ibv_destroy_ah(ah); + return 1; +} + +ucs_status_t uct_ib_device_select_gid(uct_ib_device_t *dev, uint8_t port_num, + uct_ib_device_gid_info_t *gid_info) +{ + static const uct_ib_roce_version_info_t roce_prio[] = { + {UCT_IB_DEVICE_ROCE_V2, AF_INET}, + {UCT_IB_DEVICE_ROCE_V2, AF_INET6}, + {UCT_IB_DEVICE_ROCE_V1, AF_INET}, + {UCT_IB_DEVICE_ROCE_V1, AF_INET6} }; int gid_tbl_len = uct_ib_device_port_attr(dev, port_num)->gid_tbl_len; ucs_status_t status = UCS_OK; int priorities_arr_len = ucs_static_array_size(roce_prio); - uct_ib_device_gid_info_t gid_info; + uct_ib_device_gid_info_t gid_info_tmp; int i, prio_idx; - /* search for matching GID table entries, accroding to the order defined + ucs_assert(uct_ib_device_is_port_roce(dev, port_num)); + + /* search for matching GID table entries, according to the order defined * in priorities array */ for (prio_idx = 0; prio_idx < priorities_arr_len; prio_idx++) { for (i = 0; i < gid_tbl_len; i++) { - status = uct_ib_device_query_gid_info(dev, port_num, i, &gid_info); + status = uct_ib_device_query_gid_info(dev->ibv_context, + uct_ib_device_name(dev), + port_num, i, &gid_info_tmp); if (status != UCS_OK) { goto out; } - if ((roce_prio[prio_idx].roce_major == gid_info.roce_version.major) && - (roce_prio[prio_idx].roce_minor == gid_info.roce_version.minor) && - (roce_prio[prio_idx].address_family == - 
uct_ib_device_get_addr_family(&gid_info.gid, i))) { - *gid_index = i; + if ((roce_prio[prio_idx].ver == gid_info_tmp.roce_info.ver) && + (roce_prio[prio_idx].addr_family == gid_info_tmp.roce_info.addr_family) && + uct_ib_device_test_roce_gid_index(dev, port_num, &gid_info_tmp.gid, i)) { + + gid_info->gid_index = i; + gid_info->roce_info = gid_info_tmp.roce_info; goto out_print; } } } - *gid_index = UCT_IB_MD_DEFAULT_GID_INDEX; + gid_info->gid_index = UCT_IB_MD_DEFAULT_GID_INDEX; + gid_info->roce_info.ver = UCT_IB_DEVICE_ROCE_V1; + gid_info->roce_info.addr_family = AF_INET; out_print: ucs_debug("%s:%d using gid_index %d", uct_ib_device_name(dev), port_num, - *gid_index); + gid_info->gid_index); out: return status; } @@ -587,29 +764,57 @@ int uct_ib_device_is_port_roce(uct_ib_device_t *dev, uint8_t port_num) return IBV_PORT_IS_LINK_LAYER_ETHERNET(uct_ib_device_port_attr(dev, port_num)); } -ucs_status_t uct_ib_device_select_gid_index(uct_ib_device_t *dev, - uint8_t port_num, - size_t md_config_index, - uint8_t *gid_index) +const char *uct_ib_device_name(uct_ib_device_t *dev) { - ucs_status_t status = UCS_OK; + return ibv_get_device_name(dev->ibv_context->device); +} - if (md_config_index == UCS_CONFIG_ULUNITS_AUTO) { - if (uct_ib_device_is_port_roce(dev, port_num)) { - status = uct_ib_device_set_roce_gid_index(dev, port_num, gid_index); - } else { - *gid_index = UCT_IB_MD_DEFAULT_GID_INDEX; +ucs_status_t uct_ib_device_bus(uct_ib_device_t *dev, int port_num, + ucs_sys_bus_id_t *bus_id) +{ + char ib_realpath[PATH_MAX]; + char *pcie_bus; + char *tmp; + int i, bus_len; + int num_inputs; + + if (NULL == realpath(dev->ibv_context->device->ibdev_path, ib_realpath)) { + return UCS_ERR_NO_RESOURCE; + } + + /* realpath name is of form /sys/devices/.../0000:05:00.0/infiniband/mlx5_0 + * and bus_id is constructed from 0000:05:00.0 */ + + /* Make sure there is /infiniband substring in ib_realpath*/ + tmp = strstr(ib_realpath, "/infiniband"); + if (NULL == tmp) { + return 
UCS_ERR_NO_RESOURCE; + } + + pcie_bus = dirname(ib_realpath); + pcie_bus = dirname(pcie_bus); + pcie_bus = basename(pcie_bus); + + bus_len = strlen(pcie_bus); + for (i = 0; i < bus_len; i++) { + if ((pcie_bus[i] == ':') || (pcie_bus[i] == '.')) { + pcie_bus[i] = ' '; } - } else { - *gid_index = md_config_index; } - return status; -} + num_inputs = sscanf(pcie_bus, "%hx %hhx %hhx %hhx", &bus_id->domain, + &bus_id->bus, + &bus_id->slot, + &bus_id->function); + if (num_inputs != 4) { + return UCS_ERR_NO_RESOURCE; + } -const char *uct_ib_device_name(uct_ib_device_t *dev) -{ - return ibv_get_device_name(dev->ibv_context->device); + ucs_debug("ib device = %s:%d, bus id = %hu:%hhu:%hhu.%hhu", + uct_ib_device_name(dev), port_num, bus_id->domain, bus_id->bus, + bus_id->slot, bus_id->function); + + return UCS_OK; } size_t uct_ib_mtu_value(enum ibv_mtu mtu) @@ -629,23 +834,46 @@ size_t uct_ib_mtu_value(enum ibv_mtu mtu) ucs_fatal("Invalid MTU value (%d)", mtu); } -uint8_t uct_ib_to_fabric_time(double time) +uint8_t uct_ib_to_qp_fabric_time(double t) { double to; - long t; - to = log(time / 4.096e-6) / log(2.0); + to = log(t / 4.096e-6) / log(2.0); if (to < 1) { return 1; /* Very small timeout */ - } else if (to > 30) { + } else if ((long)(to + 0.5) >= UCT_IB_FABRIC_TIME_MAX) { return 0; /* No timeout */ } else { - t = (long)(to + 0.5); - ucs_assert(t >= 1 && t < 31); - return t; + return (long)(to + 0.5); } } +uint8_t uct_ib_to_rnr_fabric_time(double t) +{ + double time_ms = t * UCS_MSEC_PER_SEC; + uint8_t idx, next_index; + double avg_ms; + + for (idx = 1; idx < UCT_IB_FABRIC_TIME_MAX; idx++) { + next_index = (idx + 1) % UCT_IB_FABRIC_TIME_MAX; + + if (time_ms <= uct_ib_qp_rnr_time_ms[next_index]) { + avg_ms = (uct_ib_qp_rnr_time_ms[idx] + + uct_ib_qp_rnr_time_ms[next_index]) * 0.5; + + if (time_ms < avg_ms) { + /* return previous index */ + return idx; + } else { + /* return current index */ + return next_index; + } + } + } + + return 0; /* this is a special value that 
means the maximum value */ +} + ucs_status_t uct_ib_modify_qp(struct ibv_qp *qp, enum ibv_qp_state state) { struct ibv_qp_attr qp_attr; @@ -661,27 +889,25 @@ ucs_status_t uct_ib_modify_qp(struct ibv_qp *qp, enum ibv_qp_state state) return UCS_OK; } -ucs_status_t uct_ib_device_query_tl_resources(uct_ib_device_t *dev, - const char *tl_name, unsigned flags, - uct_tl_resource_desc_t **resources_p, - unsigned *num_resources_p) +ucs_status_t uct_ib_device_query_ports(uct_ib_device_t *dev, unsigned flags, + uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) { - uct_tl_resource_desc_t *resources, *rsc; - unsigned num_resources; + uct_tl_device_resource_t *tl_devices; + unsigned num_tl_devices; ucs_status_t status; uint8_t port_num; /* Allocate resources array * We may allocate more memory than really required, but it's not so bad. */ - resources = ucs_calloc(dev->num_ports, sizeof(uct_tl_resource_desc_t), - "ib resource"); - if (resources == NULL) { + tl_devices = ucs_calloc(dev->num_ports, sizeof(*tl_devices), "ib device resource"); + if (tl_devices == NULL) { status = UCS_ERR_NO_MEMORY; goto err; } /* Second pass: fill port information */ - num_resources = 0; + num_tl_devices = 0; for (port_num = dev->first_port; port_num < dev->first_port + dev->num_ports; ++port_num) { @@ -694,30 +920,26 @@ ucs_status_t uct_ib_device_query_tl_resources(uct_ib_device_t *dev, continue; } - /* Get port information */ - rsc = &resources[num_resources]; - ucs_snprintf_zero(rsc->dev_name, sizeof(rsc->dev_name), "%s:%d", - uct_ib_device_name(dev), port_num); - ucs_snprintf_zero(rsc->tl_name, UCT_TL_NAME_MAX, "%s", tl_name); - rsc->dev_type = UCT_DEVICE_TYPE_NET; - - ucs_debug("found usable port for tl %s %s:%d", tl_name, - uct_ib_device_name(dev), port_num); - ++num_resources; + /* Save device information */ + ucs_snprintf_zero(tl_devices[num_tl_devices].name, + sizeof(tl_devices[num_tl_devices].name), + "%s:%d", uct_ib_device_name(dev), port_num); + 
tl_devices[num_tl_devices].type = UCT_DEVICE_TYPE_NET; + ++num_tl_devices; } - if (num_resources == 0) { + if (num_tl_devices == 0) { ucs_debug("no compatible IB ports found for flags 0x%x", flags); status = UCS_ERR_NO_DEVICE; goto err_free; } - *num_resources_p = num_resources; - *resources_p = resources; + *num_tl_devices_p = num_tl_devices; + *tl_devices_p = tl_devices; return UCS_OK; err_free: - ucs_free(resources); + ucs_free(tl_devices); err: return status; } @@ -762,7 +984,6 @@ ucs_status_t uct_ib_device_find_port(uct_ib_device_t *dev, ucs_status_t uct_ib_device_mtu(const char *dev_name, uct_md_h md, int *p_mtu) { - uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev; uint8_t port_num; ucs_status_t status; @@ -782,13 +1003,13 @@ int uct_ib_device_is_gid_raw_empty(uint8_t *gid_raw) } ucs_status_t uct_ib_device_query_gid(uct_ib_device_t *dev, uint8_t port_num, - unsigned gid_index, union ibv_gid *gid, - int *is_roce_v2) + unsigned gid_index, union ibv_gid *gid) { uct_ib_device_gid_info_t gid_info; ucs_status_t status; - status = uct_ib_device_query_gid_info(dev, port_num, gid_index, &gid_info); + status = uct_ib_device_query_gid_info(dev->ibv_context, uct_ib_device_name(dev), + port_num, gid_index, &gid_info); if (status != UCS_OK) { return status; } @@ -799,15 +1020,13 @@ ucs_status_t uct_ib_device_query_gid(uct_ib_device_t *dev, uint8_t port_num, return UCS_ERR_INVALID_ADDR; } - *gid = gid_info.gid; - *is_roce_v2 = uct_ib_device_is_port_roce(dev, port_num) && - (gid_info.roce_version.major >= 2); + *gid = gid_info.gid; return UCS_OK; } size_t uct_ib_device_odp_max_size(uct_ib_device_t *dev) { -#if HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS +#ifdef HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS const struct ibv_exp_device_attr *dev_attr = &dev->dev_attr; uint32_t required_ud_odp_caps = IBV_EXP_ODP_SUPPORT_SEND; uint32_t required_rc_odp_caps = IBV_EXP_ODP_SUPPORT_SEND | @@ -842,58 +1061,6 @@ size_t uct_ib_device_odp_max_size(uct_ib_device_t *dev) #endif /* 
HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS */ } -static ucs_status_t -uct_ib_device_parse_fw_ver_triplet(uct_ib_device_t *dev, unsigned *major, - unsigned *minor, unsigned *release) -{ - int ret; - - ret = sscanf(IBV_DEV_ATTR(dev, fw_ver), "%u.%u.%u", major, minor, release); - if (ret != 3) { - ucs_debug("failed to parse firmware version string '%s'", - IBV_DEV_ATTR(dev, fw_ver)); - return UCS_ERR_INVALID_PARAM; - } - - return UCS_OK; -} - -int uct_ib_device_odp_has_global_mr(uct_ib_device_t *dev) -{ - unsigned fw_major, fw_minor, fw_release; - ucs_status_t status; - - if (!uct_ib_device_odp_max_size(dev)) { - return 0; - } - -#if HAVE_DECL_IBV_EXP_ODP_SUPPORT_IMPLICIT - if (!(dev->dev_attr.odp_caps.general_odp_caps & IBV_EXP_ODP_SUPPORT_IMPLICIT)) { - return 0; - } -#endif - - if (uct_ib_device_spec(dev)->flags & UCT_IB_DEVICE_FLAG_MELLANOX) { - status = uct_ib_device_parse_fw_ver_triplet(dev, &fw_major, &fw_minor, - &fw_release); - if (status != UCS_OK) { - return 0; - } - - if ((fw_major < 12) || (fw_minor < 21)) { - return 0; - } else if (fw_minor == 21) { - return (fw_release >= 2031) && (fw_release <= 2099); - } else if (fw_minor == 22) { - return (fw_release >= 84); - } else { - return 1; - } - } - - return 1; -} - const char *uct_ib_wc_status_str(enum ibv_wc_status wc_status) { return ibv_wc_status_str(wc_status); @@ -904,29 +1071,14 @@ static ucs_status_t uct_ib_device_create_ah(uct_ib_device_t *dev, struct ibv_pd *pd, struct ibv_ah **ah_p) { - char buf[128]; - char *p, *endp; struct ibv_ah *ah; + char buf[128]; ah = ibv_create_ah(pd, ah_attr); if (ah == NULL) { - p = buf; - endp = buf + sizeof(buf); - snprintf(p, endp - p, "dlid=%d sl=%d port=%d src_path_bits=%d", - ah_attr->dlid, ah_attr->sl, - ah_attr->port_num, ah_attr->src_path_bits); - p += strlen(p); - - if (ah_attr->is_global) { - snprintf(p, endp - p, " dgid="); - p += strlen(p); - inet_ntop(AF_INET6, &ah_attr->grh.dgid, p, endp - p); - p += strlen(p); - snprintf(p, endp - p, " sgid_index=%d 
traffic_class=%d", - ah_attr->grh.sgid_index, ah_attr->grh.traffic_class); - } - - ucs_error("ibv_create_ah(%s) failed: %m", buf); + ucs_error("ibv_create_ah(%s) on %s failed: %m", + uct_ib_ah_attr_str(buf, sizeof(buf), ah_attr), + uct_ib_device_name(dev)); return UCS_ERR_INVALID_ADDR; } @@ -943,7 +1095,7 @@ ucs_status_t uct_ib_device_create_ah_cached(uct_ib_device_t *dev, khiter_t iter; int ret; - ucs_spin_lock(&dev->ah_lock); + ucs_recursive_spin_lock(&dev->ah_lock); /* looking for existing AH with same attributes */ iter = kh_get(uct_ib_ah, &dev->ah_hash, *ah_attr); @@ -971,7 +1123,7 @@ ucs_status_t uct_ib_device_create_ah_cached(uct_ib_device_t *dev, } unlock: - ucs_spin_unlock(&dev->ah_lock); + ucs_recursive_spin_unlock(&dev->ah_lock); return status; } @@ -1009,3 +1161,66 @@ int uct_ib_get_cqe_size(int cqe_size_min) return cqe_size; } + +static ucs_status_t +uct_ib_device_get_roce_ndev_name(uct_ib_device_t *dev, uint8_t port_num, + char *ndev_name, size_t max) +{ + ssize_t nread; + + ucs_assert_always(uct_ib_device_is_port_roce(dev, port_num)); + + /* get the network device name which corresponds to a RoCE port */ + nread = ucs_read_file_str(ndev_name, max, 1, + UCT_IB_DEVICE_SYSFS_GID_NDEV_FMT, + uct_ib_device_name(dev), port_num, 0); + if (nread < 0) { + ucs_diag("failed to read " UCT_IB_DEVICE_SYSFS_GID_NDEV_FMT": %m", + uct_ib_device_name(dev), port_num, 0); + return UCS_ERR_NO_DEVICE; + } + + ucs_strtrim(ndev_name); + return UCS_OK; +} + +unsigned uct_ib_device_get_roce_lag_level(uct_ib_device_t *dev, uint8_t port_num) +{ + char ndev_name[IFNAMSIZ]; + unsigned roce_lag_level; + ucs_status_t status; + + status = uct_ib_device_get_roce_ndev_name(dev, port_num, ndev_name, + sizeof(ndev_name)); + if (status != UCS_OK) { + return 1; + } + + roce_lag_level = ucs_netif_bond_ad_num_ports(ndev_name); + ucs_debug("RoCE LAG level on %s:%d (%s) is %u", uct_ib_device_name(dev), + port_num, ndev_name, roce_lag_level); + return roce_lag_level; +} + +const char* 
uct_ib_ah_attr_str(char *buf, size_t max, + const struct ibv_ah_attr *ah_attr) +{ + char *p = buf; + char *endp = buf + max; + + snprintf(p, endp - p, "dlid=%d sl=%d port=%d src_path_bits=%d", + ah_attr->dlid, ah_attr->sl, + ah_attr->port_num, ah_attr->src_path_bits); + p += strlen(p); + + if (ah_attr->is_global) { + snprintf(p, endp - p, " dgid="); + p += strlen(p); + uct_ib_gid_str(&ah_attr->grh.dgid, p, endp - p); + p += strlen(p); + snprintf(p, endp - p, " sgid_index=%d traffic_class=%d", + ah_attr->grh.sgid_index, ah_attr->grh.traffic_class); + } + + return buf; +} diff --git a/src/uct/ib/base/ib_device.h b/src/uct/ib/base/ib_device.h index 98107b4b6c2..b4b1b707723 100644 --- a/src/uct/ib/base/ib_device.h +++ b/src/uct/ib/base/ib_device.h @@ -10,40 +10,51 @@ #include "ib_verbs.h" #include +#include #include #include #include #include +#include +#include #include - - -#define UCT_IB_QPN_ORDER 24 /* How many bits can be an IB QP number */ -#define UCT_IB_LRH_LEN 8 /* IB Local routing header */ -#define UCT_IB_GRH_LEN 40 /* IB GLobal routing header */ -#define UCT_IB_BTH_LEN 12 /* IB base transport header */ -#define UCT_IB_ROCE_LEN 14 /* Ethernet header - - 6B for Destination MAC + - 6B for Source MAC + 2B Type (RoCE) */ -#define UCT_IB_DETH_LEN 8 /* IB datagram header */ -#define UCT_IB_RETH_LEN 16 /* IB RDMA header */ -#define UCT_IB_ATOMIC_ETH_LEN 28 /* IB atomic header */ -#define UCT_IB_AETH_LEN 4 /* IB ack */ -#define UCT_IB_PAYLOAD_ALIGN 4 /* IB payload padding */ -#define UCT_IB_ICRC_LEN 4 /* IB invariant crc footer */ -#define UCT_IB_VCRC_LEN 2 /* IB variant crc footer */ -#define UCT_IB_DELIM_LEN 2 /* IB wire delimiter */ -#define UCT_IB_FDR_PACKET_GAP 64 /* Minimal FDR packet gap */ -#define UCT_IB_MAX_MESSAGE_SIZE (2 << 30) /* Maximal IB message size */ -#define UCT_IB_PKEY_PARTITION_MASK 0x7fff /* IB partition number mask */ -#define UCT_IB_PKEY_MEMBERSHIP_MASK 0x8000 /* Full/send-only member */ -#define UCT_IB_DEV_MAX_PORTS 2 -#define 
UCT_IB_INVALID_RKEY 0xffffffffu -#define UCT_IB_KEY 0x1ee7a330 -#define UCT_IB_LINK_LOCAL_PREFIX be64toh(0xfe80000000000000ul) /* IBTA 4.1.1 12a */ -#define UCT_IB_SITE_LOCAL_PREFIX be64toh(0xfec0000000000000ul) /* IBTA 4.1.1 12b */ -#define UCT_IB_SITE_LOCAL_MASK be64toh(0xffffffffffff0000ul) /* IBTA 4.1.1 12b */ -#define UCT_IB_DEFAULT_ROCEV2_DSCP 106 /* Default DSCP for RoCE v2 */ +#include + + +#define UCT_IB_QPN_ORDER 24 /* How many bits can be an IB QP number */ +#define UCT_IB_LRH_LEN 8 /* IB Local routing header */ +#define UCT_IB_GRH_LEN 40 /* IB GLobal routing header */ +#define UCT_IB_BTH_LEN 12 /* IB base transport header */ +#define UCT_IB_ROCE_LEN 14 /* Ethernet header - + 6B for Destination MAC + + 6B for Source MAC + 2B Type (RoCE) */ +#define UCT_IB_DETH_LEN 8 /* IB datagram header */ +#define UCT_IB_RETH_LEN 16 /* IB RDMA header */ +#define UCT_IB_ATOMIC_ETH_LEN 28 /* IB atomic header */ +#define UCT_IB_AETH_LEN 4 /* IB ack */ +#define UCT_IB_PAYLOAD_ALIGN 4 /* IB payload padding */ +#define UCT_IB_ICRC_LEN 4 /* IB invariant crc footer */ +#define UCT_IB_VCRC_LEN 2 /* IB variant crc footer */ +#define UCT_IB_DELIM_LEN 2 /* IB wire delimiter */ +#define UCT_IB_FDR_PACKET_GAP 64 /* Minimal FDR packet gap */ +#define UCT_IB_MAX_MESSAGE_SIZE (2UL << 30) /* Maximal IB message size */ +#define UCT_IB_PKEY_PARTITION_MASK 0x7fff /* IB partition number mask */ +#define UCT_IB_PKEY_MEMBERSHIP_MASK 0x8000 /* Full/send-only member */ +#define UCT_IB_DEV_MAX_PORTS 2 +#define UCT_IB_FABRIC_TIME_MAX 32 +#define UCT_IB_INVALID_RKEY 0xffffffffu +#define UCT_IB_KEY 0x1ee7a330 +#define UCT_IB_LINK_LOCAL_PREFIX be64toh(0xfe80000000000000ul) /* IBTA 4.1.1 12a */ +#define UCT_IB_SITE_LOCAL_PREFIX be64toh(0xfec0000000000000ul) /* IBTA 4.1.1 12b */ +#define UCT_IB_SITE_LOCAL_MASK be64toh(0xffffffffffff0000ul) /* IBTA 4.1.1 12b */ +#define UCT_IB_DEFAULT_ROCEV2_DSCP 106 /* Default DSCP for RoCE v2 */ +#define UCT_IB_ROCE_UDP_SRC_PORT_BASE 0xC000 +#define 
UCT_IB_DEVICE_SYSFS_PFX "/sys/class/infiniband/%s" +#define UCT_IB_DEVICE_SYSFS_FMT UCT_IB_DEVICE_SYSFS_PFX "/device/%s" +#define UCT_IB_DEVICE_SYSFS_GID_ATTR_PFX UCT_IB_DEVICE_SYSFS_PFX "/ports/%d/gid_attrs" +#define UCT_IB_DEVICE_SYSFS_GID_TYPE_FMT UCT_IB_DEVICE_SYSFS_GID_ATTR_PFX "/types/%d" +#define UCT_IB_DEVICE_SYSFS_GID_NDEV_FMT UCT_IB_DEVICE_SYSFS_GID_ATTR_PFX "/ndevs/%d" enum { @@ -52,6 +63,14 @@ enum { }; +typedef enum uct_ib_roce_version { + UCT_IB_DEVICE_ROCE_V1, + UCT_IB_DEVICE_ROCE_V1_5, + UCT_IB_DEVICE_ROCE_V2, + UCT_IB_DEVICE_ROCE_ANY +} uct_ib_roce_version_t; + + enum { UCT_IB_DEVICE_FLAG_MLX4_PRM = UCS_BIT(1), /* Device supports mlx4 PRM */ UCT_IB_DEVICE_FLAG_MLX5_PRM = UCS_BIT(2), /* Device supports mlx5 PRM */ @@ -61,7 +80,8 @@ enum { UCT_IB_DEVICE_FLAG_DC_V2 = UCS_BIT(7), /* Device supports DC ver 2 */ UCT_IB_DEVICE_FLAG_AV = UCS_BIT(8), /* Device supports compact AV */ UCT_IB_DEVICE_FLAG_DC = UCT_IB_DEVICE_FLAG_DC_V1 | - UCT_IB_DEVICE_FLAG_DC_V2 /* Device supports DC */ + UCT_IB_DEVICE_FLAG_DC_V2, /* Device supports DC */ + UCT_IB_DEVICE_FLAG_ODP_IMPLICIT = UCS_BIT(9), }; @@ -69,13 +89,27 @@ enum { * Flags which specify which address fields are present */ enum { - UCT_IB_ADDRESS_FLAG_LID = UCS_BIT(0), - UCT_IB_ADDRESS_FLAG_IF_ID = UCS_BIT(1), - UCT_IB_ADDRESS_FLAG_SUBNET16 = UCS_BIT(2), - UCT_IB_ADDRESS_FLAG_SUBNET64 = UCS_BIT(3), - UCT_IB_ADDRESS_FLAG_GID = UCS_BIT(4), - UCT_IB_ADDRESS_FLAG_LINK_LAYER_IB = UCS_BIT(5), - UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH = UCS_BIT(6) + /* GID index, used for both ETH or IB link layer. */ + UCT_IB_ADDRESS_FLAG_GID_INDEX = UCS_BIT(0), + /* Defines path MTU size, used for both ETH or IB link layer. */ + UCT_IB_ADDRESS_FLAG_PATH_MTU = UCS_BIT(1), + /* PKEY value, used for both ETH or IB link layer. */ + UCT_IB_ADDRESS_FLAG_PKEY = UCS_BIT(2), + + /* If set - ETH link layer, else- IB link layer. */ + UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH = UCS_BIT(3), + + /* Used for ETH link layer. 
*/ + UCT_IB_ADDRESS_FLAG_ROCE_IPV6 = UCS_BIT(4), + /* Used for ETH link layer, following bits are used to pack RoCE version. */ + UCT_IB_ADDRESS_FLAG_ETH_LAST = UCS_BIT(5), + + /* Used for IB link layer. */ + UCT_IB_ADDRESS_FLAG_SUBNET16 = UCS_BIT(4), + /* Used for IB link layer. */ + UCT_IB_ADDRESS_FLAG_SUBNET64 = UCS_BIT(5), + /* Used for IB link layer. */ + UCT_IB_ADDRESS_FLAG_IF_ID = UCS_BIT(6) }; @@ -83,6 +117,10 @@ enum { * IB network address */ typedef struct uct_ib_address { + /* Using flags from UCT_IB_ADDRESS_FLAG_xx + * For ETH link layer, the 4 msb's are used to indicate the RoCE version - + * (by shifting the UCT_IB_DEVICE_ROCE_xx values when packing and unpacking + * the ib address) */ uint8_t flags; /* Following fields appear in this order (if specified by flags). * The full gid always appears last: @@ -96,13 +134,21 @@ typedef struct uct_ib_address { } UCS_S_PACKED uct_ib_address_t; +/** + * PCI identifier of a device + */ +typedef struct { + uint16_t vendor; + uint16_t device; +} uct_ib_pci_id_t; + + /** * IB device specification. 
*/ typedef struct uct_ib_device_spec { - uint16_t vendor_id; - uint16_t part_id; const char *name; + uct_ib_pci_id_t pci_id; unsigned flags; uint8_t priority; } uct_ib_device_spec_t; @@ -115,15 +161,16 @@ KHASH_TYPE(uct_ib_ah, struct ibv_ah_attr, struct ibv_ah*); */ typedef struct uct_ib_device { struct ibv_context *ibv_context; /* Verbs context */ - struct ibv_exp_device_attr dev_attr; /* Cached device attributes */ + uct_ib_device_attr dev_attr; /* Cached device attributes */ uint8_t first_port; /* Number of first port (usually 1) */ uint8_t num_ports; /* Amount of physical ports */ - cpu_set_t local_cpus; /* CPUs local to device */ + ucs_sys_cpuset_t local_cpus; /* CPUs local to device */ int numa_node; /* NUMA node of the device */ int async_events; /* Whether async events are handled */ int max_zcopy_log_sge; /* Maximum sges log for zcopy am */ - UCS_STATS_NODE_DECLARE(stats); - struct ibv_exp_port_attr port_attr[UCT_IB_DEV_MAX_PORTS]; /* Cached port attributes */ + UCS_STATS_NODE_DECLARE(stats) + struct ibv_port_attr port_attr[UCT_IB_DEV_MAX_PORTS]; /* Cached port attributes */ + uct_ib_pci_id_t pci_id; unsigned flags; uint8_t atomic_arg_sizes; uint8_t atomic_arg_sizes_be; @@ -131,20 +178,43 @@ typedef struct uct_ib_device { uint8_t ext_atomic_arg_sizes_be; uint8_t pci_fadd_arg_sizes; uint8_t pci_cswap_arg_sizes; + uint8_t atomic_align; /* AH hash */ khash_t(uct_ib_ah) ah_hash; - ucs_spinlock_t ah_lock; + ucs_recursive_spinlock_t ah_lock; } uct_ib_device_t; /** - * RoCE version priorities + * RoCE version */ -typedef struct uct_ib_roce_version_desc { - uint8_t roce_major; - uint8_t roce_minor; - sa_family_t address_family; -} uct_ib_roce_version_desc_t; +typedef struct uct_ib_roce_version_info { + /** RoCE version described by the UCT_IB_DEVICE_ROCE_xx values */ + uct_ib_roce_version_t ver; + /** Address family of the port */ + sa_family_t addr_family; +} uct_ib_roce_version_info_t; + + +typedef struct { + union ibv_gid gid; + uint8_t gid_index; /* IB/RoCE 
GID index to use */ + uct_ib_roce_version_info_t roce_info; /* For a RoCE port */ +} uct_ib_device_gid_info_t; + + +typedef struct { + enum ibv_event_type event_type; + union { + uint8_t port_num; + uint32_t qp_num; + uint32_t dct_num; + void *cookie; + }; +} uct_ib_async_event_t; + + +extern const double uct_ib_qp_rnr_time_ms[]; /** @@ -158,16 +228,16 @@ ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num, * Helper function to list IB transport resources. * * @param dev IB device. - * @param tl_name Transport name. * @param flags Transport requirements from IB device (see UCT_IB_RESOURCE_FLAG_xx) - * @param resources_p Filled with a pointer to an array of resources. - * @param num_resources_p Filled with the number of resources. + * @param devices_p Filled with a pointer to an array of devices. + * @param num_devices_p Filled with the number of devices. */ -ucs_status_t uct_ib_device_query_tl_resources(uct_ib_device_t *dev, - const char *tl_name, unsigned flags, - uct_tl_resource_desc_t **resources_p, - unsigned *num_resources_p); +ucs_status_t uct_ib_device_query_ports(uct_ib_device_t *dev, unsigned flags, + uct_tl_device_resource_t **devices_p, + unsigned *num_devices_p); +ucs_status_t uct_ib_device_query(uct_ib_device_t *dev, + struct ibv_device *ibv_device); ucs_status_t uct_ib_device_init(uct_ib_device_t *dev, struct ibv_device *ibv_device, int async_events @@ -183,17 +253,17 @@ const uct_ib_device_spec_t* uct_ib_device_spec(uct_ib_device_t *dev); /** - * Select the IB gid index to use. + * Select the best gid to use and set its information on the RoCE port - + * gid index, RoCE version and address family. * - * @param dev IB device. - * @param port_num Port number. - * @param md_config_index Gid index from the md configuration. - * @param ib_gid_index Filled with the selected gid index. + * @param [in] dev IB device. + * @param [in] port_num Port number. 
+ * @param [out] gid_info Filled with the selected gid index and the + * port's RoCE version and address family. */ -ucs_status_t uct_ib_device_select_gid_index(uct_ib_device_t *dev, - uint8_t port_num, - size_t md_config_index, - uint8_t *ib_gid_index); +ucs_status_t uct_ib_device_select_gid(uct_ib_device_t *dev, + uint8_t port_num, + uct_ib_device_gid_info_t *gid_info); /** @@ -203,19 +273,14 @@ const char *uct_ib_device_name(uct_ib_device_t *dev); /** - * @return true if device name begins with "hns". + * For the given IB device find the associated bus information + * + * @param [in] dev IB device. + * @param [in] port_num Port number. + * @param [out] bus_id Bus information. */ -static inline int uct_ib_device_is_hns(struct ibv_device *device) -{ -#if HAVE_HNS_ROCE -#define UCT_IB_DEVICE_HNS "hns" -#define UCT_IB_DEVICE_HNS_LEN 3 - return !strncmp(ibv_get_device_name(device), UCT_IB_DEVICE_HNS, UCT_IB_DEVICE_HNS_LEN); -#else - return 0; -#endif -} - +ucs_status_t uct_ib_device_bus(uct_ib_device_t *dev, int port_num, + ucs_sys_bus_id_t *bus_id); /** * @return whether the port is InfiniBand @@ -236,9 +301,15 @@ int uct_ib_device_is_gid_raw_empty(uint8_t *gid_raw); /** - * Convert time-in-seconds to IB fabric time value + * Convert time-in-seconds to IB fabric QP time value + */ +uint8_t uct_ib_to_qp_fabric_time(double time); + + +/** + * Convert time-in-seconds to IB fabric RNR time value */ -uint8_t uct_ib_to_fabric_time(double time); +uint8_t uct_ib_to_rnr_fabric_time(double time); /** @@ -265,8 +336,6 @@ ucs_status_t uct_ib_device_find_port(uct_ib_device_t *dev, size_t uct_ib_device_odp_max_size(uct_ib_device_t *dev); -int uct_ib_device_odp_has_global_mr(uct_ib_device_t *dev); - const char *uct_ib_wc_status_str(enum ibv_wc_status wc_status); ucs_status_t uct_ib_device_create_ah_cached(uct_ib_device_t *dev, @@ -276,18 +345,42 @@ ucs_status_t uct_ib_device_create_ah_cached(uct_ib_device_t *dev, void uct_ib_device_cleanup_ah_cached(uct_ib_device_t *dev); -static 
inline struct ibv_exp_port_attr* +unsigned uct_ib_device_get_roce_lag_level(uct_ib_device_t *dev, + uint8_t port_num); + + +static inline struct ibv_port_attr* uct_ib_device_port_attr(uct_ib_device_t *dev, uint8_t port_num) { return &dev->port_attr[port_num - dev->first_port]; } +static inline int uct_ib_device_has_pci_atomics(uct_ib_device_t *dev) +{ + return !!((dev->pci_fadd_arg_sizes | dev->pci_cswap_arg_sizes) & + (sizeof(uint32_t) | sizeof(uint64_t))); +} + +const char *uct_ib_roce_version_str(uct_ib_roce_version_t roce_ver); + +const char *uct_ib_gid_str(const union ibv_gid *gid, char *str, size_t max_size); + ucs_status_t uct_ib_device_query_gid(uct_ib_device_t *dev, uint8_t port_num, - unsigned gid_index, union ibv_gid *gid, - int *is_roce_v2); + unsigned gid_index, union ibv_gid *gid); + +ucs_status_t uct_ib_device_query_gid_info(struct ibv_context *ctx, const char *dev_name, + uint8_t port_num, unsigned gid_index, + uct_ib_device_gid_info_t *info); + +int uct_ib_device_test_roce_gid_index(uct_ib_device_t *dev, uint8_t port_num, + const union ibv_gid *gid, + uint8_t gid_index); int uct_ib_get_cqe_size(int cqe_size_min); +const char* uct_ib_ah_attr_str(char *buf, size_t max, + const struct ibv_ah_attr *ah_attr); + static inline ucs_status_t uct_ib_poll_cq(struct ibv_cq *cq, unsigned *count, struct ibv_wc *wcs) { int ret; @@ -304,4 +397,6 @@ static inline ucs_status_t uct_ib_poll_cq(struct ibv_cq *cq, unsigned *count, st return UCS_OK; } +void uct_ib_handle_async_event(uct_ib_device_t *dev, uct_ib_async_event_t *event); + #endif diff --git a/src/uct/ib/base/ib_iface.c b/src/uct/ib/base/ib_iface.c index c490f7663e1..4c6f6175d8b 100644 --- a/src/uct/ib/base/ib_iface.c +++ b/src/uct/ib/base/ib_iface.c @@ -1,10 +1,14 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. 
* * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ib_iface.h" #include "ib_log.h" @@ -16,6 +20,8 @@ #include #include #include +#include +#include #include #include #include @@ -57,6 +63,10 @@ ucs_config_field_t uct_ib_iface_config_table[] = { {"", "", NULL, ucs_offsetof(uct_ib_iface_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, + {"SEG_SIZE", "8192", + "Size of bounce buffers used for post_send and post_recv.", + ucs_offsetof(uct_ib_iface_config_t, seg_size), UCS_CONFIG_TYPE_MEMUNITS}, + {"TX_QUEUE_LEN", "256", "Length of send queue in the QP.", ucs_offsetof(uct_ib_iface_config_t, tx.queue_len), UCS_CONFIG_TYPE_UINT}, @@ -80,19 +90,15 @@ ucs_config_field_t uct_ib_iface_config_table[] = { "enough will be sent inline.", ucs_offsetof(uct_ib_iface_config_t, tx.min_inline), UCS_CONFIG_TYPE_MEMUNITS}, - {"TX_INLINE_RESP", "32", + {"TX_INLINE_RESP", "0", "Bytes to reserve in send WQE for inline response. Responses which are small\n" "enough, such as of atomic operations and small reads, will be received inline.", - ucs_offsetof(uct_ib_iface_config_t, tx.inl_resp), UCS_CONFIG_TYPE_MEMUNITS}, + ucs_offsetof(uct_ib_iface_config_t, inl[UCT_IB_DIR_TX]), UCS_CONFIG_TYPE_MEMUNITS}, {"TX_MIN_SGE", "3", "Number of SG entries to reserve in the send WQE.", ucs_offsetof(uct_ib_iface_config_t, tx.min_sge), UCS_CONFIG_TYPE_UINT}, - {"TX_CQ_MODERATION", "64", - "Maximum number of send WQEs which can be posted without requesting a completion.", - ucs_offsetof(uct_ib_iface_config_t, tx.cq_moderation), UCS_CONFIG_TYPE_UINT}, - #if HAVE_DECL_IBV_EXP_CQ_MODERATION {"TX_EVENT_MOD_COUNT", "0", "Number of send completions for which an event would be generated (0 - disabled).", @@ -132,7 +138,7 @@ ucs_config_field_t uct_ib_iface_config_table[] = { "Number of bytes to request for inline receive. If the maximal supported size\n" "is smaller, it will be used instead. 
If it is possible to support a larger\n" "size than requested with the same hardware resources, it will be used instead.", - ucs_offsetof(uct_ib_iface_config_t, rx.inl), UCS_CONFIG_TYPE_MEMUNITS}, + ucs_offsetof(uct_ib_iface_config_t, inl[UCT_IB_DIR_RX]), UCS_CONFIG_TYPE_MEMUNITS}, UCT_IFACE_MPOOL_CONFIG_FIELDS("RX_", -1, 0, "receive", ucs_offsetof(uct_ib_iface_config_t, rx.mp), ""), @@ -161,21 +167,45 @@ ucs_config_field_t uct_ib_iface_config_table[] = { "IB Hop limit / RoCEv2 Time to Live. Should be between 0 and 255.\n", ucs_offsetof(uct_ib_iface_config_t, hop_limit), UCS_CONFIG_TYPE_UINT}, - {"LID_PATH_BITS", "0-17", + {"NUM_PATHS", "auto", + "Number of connections that should be created between a pair of communicating\n" + "endpoints for optimal performance. The default value 'auto' behaves according\n" + "to the port link layer:\n" + " RoCE - "UCS_PP_MAKE_STRING(UCT_IB_DEV_MAX_PORTS) " for LAG port, otherwise - 1.\n" + " InfiniBand - As the number of path bits enabled by fabric's LMC value and selected\n" + " by "UCS_DEFAULT_ENV_PREFIX UCT_IB_CONFIG_PREFIX"LID_PATH_BITS configuration.", + ucs_offsetof(uct_ib_iface_config_t, num_paths), UCS_CONFIG_TYPE_ULUNITS}, + + {"ROCE_PATH_FACTOR", "1", + "Multiplier for RoCE LAG UDP source port calculation. The UDP source port\n" + "is typically used by switches and network adapters to select a different\n" + "path for the same pair of endpoints.", + ucs_offsetof(uct_ib_iface_config_t, roce_path_factor), UCS_CONFIG_TYPE_UINT}, + + {"LID_PATH_BITS", "0", "List of IB Path bits separated by comma (a,b,c) " "which will be the low portion of the LID, according to the LMC in the fabric.", ucs_offsetof(uct_ib_iface_config_t, lid_path_bits), UCS_CONFIG_TYPE_ARRAY(path_bits_spec)}, - {"PKEY", "0x7fff", - "Which pkey value to use. Should be between 0 and 0x7fff.", - ucs_offsetof(uct_ib_iface_config_t, pkey_value), UCS_CONFIG_TYPE_HEX}, + {"PKEY", "auto", + "Which pkey value to use. 
Should be between 0 and 0x7fff.\n" + "\"auto\" option selects a first valid pkey value with full membership.", + ucs_offsetof(uct_ib_iface_config_t, pkey), UCS_CONFIG_TYPE_HEX}, -#if HAVE_IBV_EXP_RES_DOMAIN +#ifdef HAVE_IBV_EXP_RES_DOMAIN {"RESOURCE_DOMAIN", "y", "Enable multiple resource domains (experimental).", ucs_offsetof(uct_ib_iface_config_t, enable_res_domain), UCS_CONFIG_TYPE_BOOL}, #endif + {"PATH_MTU", "default", + "Path MTU. \"default\" will select the best MTU for the device.", + ucs_offsetof(uct_ib_iface_config_t, path_mtu), + UCS_CONFIG_TYPE_ENUM(uct_ib_mtu_values)}, + + {"ENABLE_CUDA_AFFINITY", "y", + "Prefer IB devices closest to detected CUDA device\n", + ucs_offsetof(uct_ib_iface_config_t, enable_cuda_affinity), UCS_CONFIG_TYPE_BOOL}, {NULL} }; @@ -186,12 +216,17 @@ int uct_ib_iface_is_roce(uct_ib_iface_t *iface) iface->config.port_num); } +int uct_ib_iface_is_ib(uct_ib_iface_t *iface) +{ + return uct_ib_device_is_port_ib(uct_ib_iface_device(iface), + iface->config.port_num); +} + static void uct_ib_iface_recv_desc_init(uct_iface_h tl_iface, void *obj, uct_mem_h memh) { uct_ib_iface_recv_desc_t *desc = obj; - uct_ib_mem_t *ib_memh = memh; - desc->lkey = ib_memh->lkey; + desc->lkey = uct_ib_memh_get_lkey(memh); } ucs_status_t uct_ib_iface_recv_mpool_init(uct_ib_iface_t *iface, @@ -222,124 +257,293 @@ void uct_ib_iface_release_desc(uct_recv_desc_t *self, void *desc) uct_ib_iface_t *iface = ucs_container_of(self, uct_ib_iface_t, release_desc); void *ib_desc; - ib_desc = desc - iface->config.rx_headroom_offset; + ib_desc = UCS_PTR_BYTE_OFFSET(desc, -(ptrdiff_t)iface->config.rx_headroom_offset); ucs_mpool_put_inline(ib_desc); } -size_t uct_ib_address_size(uct_ib_iface_t *iface) +static inline uct_ib_roce_version_t +uct_ib_address_flags_get_roce_version(uint8_t flags) { - if (uct_ib_iface_is_roce(iface)) { - return sizeof(uct_ib_address_t) + - sizeof(union ibv_gid); /* raw gid */ - } else if ((iface->gid.global.subnet_prefix == 
UCT_IB_LINK_LOCAL_PREFIX) && - !iface->is_global_addr) { - return sizeof(uct_ib_address_t) + - sizeof(uint16_t); /* lid */ - } else if (((iface->gid.global.subnet_prefix & UCT_IB_SITE_LOCAL_MASK) == - UCT_IB_SITE_LOCAL_PREFIX) && - !iface->is_global_addr) { - return sizeof(uct_ib_address_t) + - sizeof(uint16_t) + /* lid */ - sizeof(uint64_t) + /* if_id */ - sizeof(uint16_t); /* subnet16 */ + ucs_assert(flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH); + return (uct_ib_roce_version_t)(flags >> ucs_ilog2(UCT_IB_ADDRESS_FLAG_ETH_LAST)); +} + +static inline sa_family_t +uct_ib_address_flags_get_roce_af(uint8_t flags) +{ + ucs_assert(flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH); + return (flags & UCT_IB_ADDRESS_FLAG_ROCE_IPV6) ? + AF_INET6 : AF_INET; +} + +size_t uct_ib_address_size(const uct_ib_address_pack_params_t *params) +{ + size_t size = sizeof(uct_ib_address_t); + + if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_ETH) { + /* Ethernet: address contains only raw GID */ + size += sizeof(union ibv_gid); } else { - return sizeof(uct_ib_address_t) + - sizeof(uint16_t) + /* lid */ - sizeof(uint64_t) + /* if_id */ - sizeof(uint64_t); /* subnet64 */ + /* InfiniBand: address always contains LID */ + size += sizeof(uint16_t); /* lid */ + + if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID) { + /* Add GUID */ + UCS_STATIC_ASSERT(sizeof(params->gid.global.interface_id) == sizeof(uint64_t)); + size += sizeof(uint64_t); + } + + if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX) { + if ((params->gid.global.subnet_prefix & UCT_IB_SITE_LOCAL_MASK) == + UCT_IB_SITE_LOCAL_PREFIX) { + /* 16-bit subnet prefix */ + size += sizeof(uint16_t); + } else if (params->gid.global.subnet_prefix != UCT_IB_LINK_LOCAL_PREFIX) { + /* 64-bit subnet prefix */ + size += sizeof(uint64_t); + } + /* Note: if subnet prefix is LINK_LOCAL, no need to pack it because + * it's a well-known value defined by IB specification. 
+ */ + } } + + if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU) { + size += sizeof(uint8_t); + } + + if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX) { + size += sizeof(uint8_t); + } + + if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) { + size += sizeof(uint16_t); + } + + return size; } -void uct_ib_address_pack(uct_ib_iface_t *iface, - const union ibv_gid *gid, uint16_t lid, +void uct_ib_address_pack(const uct_ib_address_pack_params_t *params, uct_ib_address_t *ib_addr) { void *ptr = ib_addr + 1; - if (uct_ib_iface_is_roce(iface)) { - /* RoCE, in this case we don't use the lid and set the GID flag */ + if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_ETH) { + /* RoCE, in this case we don't use the lid, we pack the gid, the RoCE + * version, address family and set the ETH flag */ ib_addr->flags = UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH | - UCT_IB_ADDRESS_FLAG_GID; + (params->roce_info.ver << + ucs_ilog2(UCT_IB_ADDRESS_FLAG_ETH_LAST)); + + if (params->roce_info.addr_family == AF_INET6) { + ib_addr->flags |= UCT_IB_ADDRESS_FLAG_ROCE_IPV6; + } + /* uint8_t raw[16]; */ - memcpy(ptr, gid->raw, sizeof(gid->raw) * sizeof(uint8_t)); + memcpy(ptr, params->gid.raw, sizeof(params->gid.raw)); + ptr = UCS_PTR_TYPE_OFFSET(ptr, params->gid.raw); } else { /* IB, LID */ - ib_addr->flags = UCT_IB_ADDRESS_FLAG_LINK_LAYER_IB | - UCT_IB_ADDRESS_FLAG_LID; - *(uint16_t*) ptr = lid; - ptr += sizeof(uint16_t); - - if ((gid->global.subnet_prefix != UCT_IB_LINK_LOCAL_PREFIX) || - iface->is_global_addr) { - ib_addr->flags |= UCT_IB_ADDRESS_FLAG_IF_ID; - *(uint64_t*) ptr = gid->global.interface_id; - ptr += sizeof(uint64_t); - - if (((gid->global.subnet_prefix & UCT_IB_SITE_LOCAL_MASK) == - UCT_IB_SITE_LOCAL_PREFIX) && - !iface->is_global_addr) { + ib_addr->flags = 0; + *(uint16_t*)ptr = params->lid; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint16_t); + + if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID) { + /* Pack GUID */ + ib_addr->flags |= UCT_IB_ADDRESS_FLAG_IF_ID; + *(uint64_t*) 
ptr = params->gid.global.interface_id; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint64_t); + } + + if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX) { + if ((params->gid.global.subnet_prefix & UCT_IB_SITE_LOCAL_MASK) == + UCT_IB_SITE_LOCAL_PREFIX) { /* Site-local */ ib_addr->flags |= UCT_IB_ADDRESS_FLAG_SUBNET16; - *(uint16_t*) ptr = gid->global.subnet_prefix >> 48; - } else { + *(uint16_t*)ptr = params->gid.global.subnet_prefix >> 48; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint16_t); + } else if (params->gid.global.subnet_prefix != UCT_IB_LINK_LOCAL_PREFIX) { /* Global */ ib_addr->flags |= UCT_IB_ADDRESS_FLAG_SUBNET64; - *(uint64_t*) ptr = gid->global.subnet_prefix; + *(uint64_t*)ptr = params->gid.global.subnet_prefix; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint64_t); } } } + + if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU) { + ucs_assert((int)params->path_mtu < UINT8_MAX); + ib_addr->flags |= UCT_IB_ADDRESS_FLAG_PATH_MTU; + *(uint8_t*)ptr = (uint8_t)params->path_mtu; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); + } + + if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX) { + ib_addr->flags |= UCT_IB_ADDRESS_FLAG_GID_INDEX; + *(uint8_t*)ptr = params->gid_index; + } + + if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) { + ucs_assert(params->pkey != UCT_IB_ADDRESS_DEFAULT_PKEY); + ib_addr->flags |= UCT_IB_ADDRESS_FLAG_PKEY; + *(uint16_t*)ptr = params->pkey; + } } -void uct_ib_address_unpack(const uct_ib_address_t *ib_addr, uint16_t *lid, - union ibv_gid *gid) +unsigned uct_ib_iface_address_pack_flags(uct_ib_iface_t *iface) { - const void *ptr = ib_addr + 1; + unsigned pack_flags = 0; - gid->global.subnet_prefix = UCT_IB_LINK_LOCAL_PREFIX; /* Default prefix */ - gid->global.interface_id = 0; - *lid = 0; + if (iface->pkey != UCT_IB_ADDRESS_DEFAULT_PKEY) { + pack_flags |= UCT_IB_ADDRESS_PACK_FLAG_PKEY; + } - if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_GID) { - memcpy(gid->raw, ptr, sizeof(gid->raw) * sizeof(uint8_t)); /* uint8_t raw[16]; */ + if 
(uct_ib_iface_is_roce(iface)) { + /* pack Ethernet address */ + pack_flags |= UCT_IB_ADDRESS_PACK_FLAG_ETH; + } else if (iface->config.force_global_addr) { + /* pack full IB address */ + pack_flags |= UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX | + UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID; + } else { + /* pack only subnet prefix for reachability test */ + pack_flags |= UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX; } - if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LID) { - *lid = *(uint16_t*)ptr; - ptr += sizeof(uint16_t); + return pack_flags; +} + +size_t uct_ib_iface_address_size(uct_ib_iface_t *iface) +{ + uct_ib_address_pack_params_t params; + + params.flags = uct_ib_iface_address_pack_flags(iface); + params.gid = iface->gid_info.gid; + params.roce_info = iface->gid_info.roce_info; + return uct_ib_address_size(¶ms); +} + +void uct_ib_iface_address_pack(uct_ib_iface_t *iface, uct_ib_address_t *ib_addr) +{ + uct_ib_address_pack_params_t params; + + params.flags = uct_ib_iface_address_pack_flags(iface); + params.gid = iface->gid_info.gid; + params.lid = uct_ib_iface_port_attr(iface)->lid; + params.roce_info = iface->gid_info.roce_info; + /* to suppress gcc 4.3.4 warning */ + params.path_mtu = UCT_IB_ADDRESS_INVALID_PATH_MTU; + params.gid_index = UCT_IB_ADDRESS_INVALID_GID_INDEX; + params.pkey = iface->pkey; + uct_ib_address_pack(¶ms, ib_addr); +} + +void uct_ib_address_unpack(const uct_ib_address_t *ib_addr, + uct_ib_address_pack_params_t *params_p) +{ + const void *ptr = ib_addr + 1; + /* silence cppcheck warning */ + uct_ib_address_pack_params_t params = {0}; + + params.gid_index = UCT_IB_ADDRESS_INVALID_GID_INDEX; + params.path_mtu = UCT_IB_ADDRESS_INVALID_PATH_MTU; + params.pkey = UCT_IB_ADDRESS_DEFAULT_PKEY; + + if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH) { + /* uint8_t raw[16]; */ + memcpy(params.gid.raw, ptr, sizeof(params.gid.raw)); + ptr = UCS_PTR_BYTE_OFFSET(ptr, sizeof(params.gid.raw)); + params.flags |= UCT_IB_ADDRESS_PACK_FLAG_ETH; + + 
params.roce_info.addr_family = + uct_ib_address_flags_get_roce_af(ib_addr->flags); + params.roce_info.ver = + uct_ib_address_flags_get_roce_version(ib_addr->flags); + } else { + /* Default prefix */ + params.gid.global.subnet_prefix = UCT_IB_LINK_LOCAL_PREFIX; + params.gid.global.interface_id = 0; + params.flags |= UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX | + UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID; + + /* If the link layer is not ETHERNET, then it is IB and a lid + * must be present */ + params.lid = *(const uint16_t*)ptr; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint16_t); + + if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_IF_ID) { + params.gid.global.interface_id = *(uint64_t*)ptr; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint64_t); + } + + if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET16) { + params.gid.global.subnet_prefix = UCT_IB_SITE_LOCAL_PREFIX | + ((uint64_t)*(uint16_t*)ptr << 48); + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint16_t); + ucs_assert(!(ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET64)); + } + + if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET64) { + params.gid.global.subnet_prefix = *(uint64_t*)ptr; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint64_t); + params.flags |= UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX; + } } - if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_IF_ID) { - gid->global.interface_id = *(uint64_t*)ptr; - ptr += sizeof(uint64_t); + if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_PATH_MTU) { + params.path_mtu = *(const uint8_t*)ptr; + ptr = UCS_PTR_TYPE_OFFSET(ptr, const uint8_t); + params.flags |= UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU; } - if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET16) { - gid->global.subnet_prefix = UCT_IB_SITE_LOCAL_PREFIX | - ((uint64_t) *(uint16_t*) ptr << 48); - ptr += sizeof(uint16_t); + if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_GID_INDEX) { + params.gid_index = *(const uint8_t*)ptr; + ptr = UCS_PTR_TYPE_OFFSET(ptr, const uint16_t); + params.flags |= UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX; } - if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET64) { - 
gid->global.subnet_prefix = *(uint64_t*) ptr; + if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_PKEY) { + params.pkey = *(const uint16_t*)ptr; } + /* PKEY is always in params */ + params.flags |= UCT_IB_ADDRESS_PACK_FLAG_PKEY; + + *params_p = params; } const char *uct_ib_address_str(const uct_ib_address_t *ib_addr, char *buf, size_t max) { - union ibv_gid gid; - uint16_t lid; + uct_ib_address_pack_params_t params; char *p, *endp; - uct_ib_address_unpack(ib_addr, &lid, &gid); + uct_ib_address_unpack(ib_addr, ¶ms); p = buf; endp = buf + max; - if (lid != 0) { - snprintf(p, endp - p, "lid %d ", lid); + if (params.lid != 0) { + snprintf(p, endp - p, "lid %d ", params.lid); p += strlen(p); } - inet_ntop(AF_INET6, &gid, p, endp - p); + + uct_ib_gid_str(¶ms.gid, p, endp - p); + p += strlen(p); + + if (params.flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX) { + ucs_assert(params.gid_index != UCT_IB_ADDRESS_INVALID_GID_INDEX); + snprintf(p, endp - p, "gid index %u ", params.gid_index); + p += strlen(p); + } + + if (params.flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU) { + ucs_assert(params.path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU); + snprintf(p, endp - p, "mtu %zu ", uct_ib_mtu_value(params.path_mtu)); + p += strlen(p); + } + + ucs_assert((params.flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) && + (params.flags != UCT_IB_ADDRESS_INVALID_PKEY)); + snprintf(p, endp - p, "pkey 0x%x ", params.pkey); return buf; } @@ -348,30 +552,85 @@ ucs_status_t uct_ib_iface_get_device_address(uct_iface_h tl_iface, uct_device_addr_t *dev_addr) { uct_ib_iface_t *iface = ucs_derived_of(tl_iface, uct_ib_iface_t); - uct_ib_address_pack(iface, &iface->gid, uct_ib_iface_port_attr(iface)->lid, - (void*)dev_addr); + + uct_ib_iface_address_pack(iface, (void*)dev_addr); + return UCS_OK; } -int uct_ib_iface_is_reachable(const uct_iface_h tl_iface, const uct_device_addr_t *dev_addr, +static int uct_ib_iface_roce_is_reachable(const uct_ib_device_gid_info_t *local_gid_info, + const uct_ib_address_t *remote_ib_addr) +{ + 
sa_family_t local_ib_addr_af = local_gid_info->roce_info.addr_family; + uct_ib_roce_version_t local_roce_ver = local_gid_info->roce_info.ver; + uint8_t remote_ib_addr_flags = remote_ib_addr->flags; + uct_ib_roce_version_t remote_roce_ver; + sa_family_t remote_ib_addr_af; + char local_gid_str[128], remote_gid_str[128]; + + if ((uct_ib_address_flags_get_roce_version(remote_ib_addr_flags)) == + UCT_IB_DEVICE_ROCE_ANY) { + return 1; + } + + /* check the address family */ + remote_ib_addr_af = uct_ib_address_flags_get_roce_af(remote_ib_addr_flags); + + if (local_ib_addr_af != remote_ib_addr_af) { + ucs_assert(local_ib_addr_af != 0); + ucs_debug("different addr_family detected. local %s remote %s", + ucs_sockaddr_address_family_str(local_ib_addr_af), + ucs_sockaddr_address_family_str(remote_ib_addr_af)); + return 0; + } + + /* check the RoCE version */ + ucs_assert(local_roce_ver != UCT_IB_DEVICE_ROCE_ANY); + + remote_roce_ver = uct_ib_address_flags_get_roce_version(remote_ib_addr_flags); + + if (local_roce_ver != remote_roce_ver) { + ucs_trace("different RoCE versions detected. 
local %s (gid=%s)" + "remote %s (gid=%s)", + uct_ib_roce_version_str(local_roce_ver), + uct_ib_gid_str(&local_gid_info->gid, local_gid_str, + sizeof(local_gid_str)), + uct_ib_roce_version_str(remote_roce_ver), + uct_ib_gid_str((union ibv_gid *)(remote_ib_addr + 1), remote_gid_str, + sizeof(remote_gid_str))); + return 0; + } + + return 1; +} + +int uct_ib_iface_is_reachable(const uct_iface_h tl_iface, + const uct_device_addr_t *dev_addr, const uct_iface_addr_t *iface_addr) { - uct_ib_iface_t *iface = ucs_derived_of(tl_iface, uct_ib_iface_t); - int is_local_eth = uct_ib_iface_is_roce(iface); + uct_ib_iface_t *iface = ucs_derived_of(tl_iface, uct_ib_iface_t); + int is_local_eth = uct_ib_iface_is_roce(iface); const uct_ib_address_t *ib_addr = (const void*)dev_addr; - union ibv_gid gid; - uint16_t lid; + uct_ib_address_pack_params_t params; + + uct_ib_address_unpack(ib_addr, ¶ms); - uct_ib_address_unpack(ib_addr, &lid, &gid); + if (/* at least one PKEY has to be with full membership */ + !((params.pkey | iface->pkey) & UCT_IB_PKEY_MEMBERSHIP_MASK) || + /* PKEY values have to be equal */ + ((params.pkey ^ iface->pkey) & UCT_IB_PKEY_PARTITION_MASK)) { + return 0; + } - if (!is_local_eth && (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_IB)) { + if (!is_local_eth && !(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH)) { /* same subnet prefix */ - return gid.global.subnet_prefix == iface->gid.global.subnet_prefix; + return params.gid.global.subnet_prefix == + iface->gid_info.gid.global.subnet_prefix; } else if (is_local_eth && (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH)) { - /* there shouldn't be a lid and the gid flag should be on */ - ucs_assert(ib_addr->flags & UCT_IB_ADDRESS_FLAG_GID); - ucs_assert(!(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LID)); - return 1; + /* there shouldn't be a lid and the UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH + * flag should be on. 
If reachable, the remote and local RoCE versions + * and address families have to be the same */ + return uct_ib_iface_roce_is_reachable(&iface->gid_info, ib_addr); } else { /* local and remote have different link layers and therefore are unreachable */ return 0; @@ -386,16 +645,95 @@ ucs_status_t uct_ib_iface_create_ah(uct_ib_iface_t *iface, uct_ib_iface_md(iface)->pd, ah_p); } +void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid, + const union ibv_gid *gid, + uint8_t gid_index, + unsigned path_index, + struct ibv_ah_attr *ah_attr) +{ + uint8_t path_bits; + char buf[128]; + + memset(ah_attr, 0, sizeof(*ah_attr)); + + ah_attr->sl = iface->config.sl; + ah_attr->port_num = iface->config.port_num; + ah_attr->grh.traffic_class = iface->config.traffic_class; + + if (uct_ib_iface_is_roce(iface)) { + ah_attr->dlid = UCT_IB_ROCE_UDP_SRC_PORT_BASE | + (iface->config.roce_path_factor * path_index); + /* Workaround rdma-core issue of calling rand() which affects global + * random state in glibc */ + ah_attr->grh.flow_label = 1; + } else { + /* TODO iface->path_bits should be removed and replaced by path_index */ + path_bits = iface->path_bits[path_index % + iface->path_bits_count]; + ah_attr->dlid = lid | path_bits; + ah_attr->src_path_bits = path_bits; + } + + if (iface->config.force_global_addr || + (iface->gid_info.gid.global.subnet_prefix != gid->global.subnet_prefix)) { + ucs_assert_always(gid->global.interface_id != 0); + ah_attr->is_global = 1; + ah_attr->grh.dgid = *gid; + ah_attr->grh.sgid_index = gid_index; + ah_attr->grh.hop_limit = iface->config.hop_limit; + } else { + ah_attr->is_global = 0; + } + + ucs_debug("iface %p: ah_attr %s", iface, + uct_ib_ah_attr_str(buf, sizeof(buf), ah_attr)); +} + +void uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface, + const uct_ib_address_t *ib_addr, + unsigned path_index, + struct ibv_ah_attr *ah_attr, + enum ibv_mtu *path_mtu) +{ + uct_ib_address_pack_params_t params; + + 
ucs_assert(!uct_ib_iface_is_roce(iface) == + !(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH)); + + uct_ib_address_unpack(ib_addr, ¶ms); + + if (params.flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU) { + ucs_assert(params.path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU); + *path_mtu = params.path_mtu; + } else { + *path_mtu = iface->config.path_mtu; + } + + if (params.flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX) { + ucs_assert(params.gid_index != UCT_IB_ADDRESS_INVALID_GID_INDEX); + } else { + params.gid_index = iface->gid_info.gid_index; + } + + uct_ib_iface_fill_ah_attr_from_gid_lid(iface, params.lid, ¶ms.gid, + params.gid_index, path_index, + ah_attr); +} + static ucs_status_t uct_ib_iface_init_pkey(uct_ib_iface_t *iface, const uct_ib_iface_config_t *config) { - uct_ib_device_t *dev = uct_ib_iface_device(iface); - uint16_t pkey_tbl_len = uct_ib_iface_port_attr(iface)->pkey_tbl_len; + uct_ib_device_t *dev = uct_ib_iface_device(iface); + uint16_t pkey_tbl_len = uct_ib_iface_port_attr(iface)->pkey_tbl_len; + int pkey_found = 0; + uint16_t lim_pkey = UCT_IB_ADDRESS_INVALID_PKEY; + uint16_t lim_pkey_index = UINT16_MAX; uint16_t pkey_index, port_pkey, pkey; - if (config->pkey_value > UCT_IB_PKEY_PARTITION_MASK) { - ucs_error("Requested pkey 0x%x is invalid, should be in the range 0..0x%x", - config->pkey_value, UCT_IB_PKEY_PARTITION_MASK); + if ((config->pkey != UCS_HEXUNITS_AUTO) && + (config->pkey > UCT_IB_PKEY_PARTITION_MASK)) { + ucs_error("requested pkey 0x%x is invalid, should be in the range 0..0x%x", + config->pkey, UCT_IB_PKEY_PARTITION_MASK); return UCS_ERR_INVALID_PARAM; } @@ -405,30 +743,59 @@ static ucs_status_t uct_ib_iface_init_pkey(uct_ib_iface_t *iface, if (ibv_query_pkey(dev->ibv_context, iface->config.port_num, pkey_index, &port_pkey)) { - ucs_error("ibv_query_pkey("UCT_IB_IFACE_FMT", index=%d) failed: %m", + ucs_debug("ibv_query_pkey("UCT_IB_IFACE_FMT", index=%d) failed: %m", UCT_IB_IFACE_ARG(iface), pkey_index); + continue; } pkey = ntohs(port_pkey); 
- if (!(pkey & UCT_IB_PKEY_MEMBERSHIP_MASK)) { - ucs_debug("skipping send-only pkey[%d]=0x%x", pkey_index, pkey); + /* if pkey = 0x0, just skip it w/o debug trace, because 0x0 + * means that there is no real pkey configured at this index */ + if (pkey == UCT_IB_ADDRESS_INVALID_PKEY) { continue; } - /* take only the lower 15 bits for the comparison */ - if ((pkey & UCT_IB_PKEY_PARTITION_MASK) == config->pkey_value) { + if ((config->pkey == UCS_HEXUNITS_AUTO) || + /* take only the lower 15 bits for the comparison */ + ((pkey & UCT_IB_PKEY_PARTITION_MASK) == config->pkey)) { + if (!(pkey & UCT_IB_PKEY_MEMBERSHIP_MASK) && + /* limited PKEY has not yet been found */ + (lim_pkey == UCT_IB_ADDRESS_INVALID_PKEY)) { + lim_pkey_index = pkey_index; + lim_pkey = pkey; + continue; + } + iface->pkey_index = pkey_index; - iface->pkey_value = pkey; - ucs_debug("using pkey[%d] 0x%x on "UCT_IB_IFACE_FMT, iface->pkey_index, - iface->pkey_value, UCT_IB_IFACE_ARG(iface)); - return UCS_OK; + iface->pkey = pkey; + pkey_found = 1; + break; } } - ucs_error("The requested pkey: 0x%x, cannot be used. 
" - "It wasn't found or the configured pkey doesn't have full membership.", - config->pkey_value); - return UCS_ERR_INVALID_PARAM; + if (!pkey_found) { + if (lim_pkey == UCT_IB_ADDRESS_INVALID_PKEY) { + /* PKEY neither with full nor with limited membership was found */ + if (config->pkey == UCS_HEXUNITS_AUTO) { + ucs_error("there is no valid pkey to use on " + UCT_IB_IFACE_FMT, UCT_IB_IFACE_ARG(iface)); + } else { + ucs_error("unable to find specified pkey 0x%x on "UCT_IB_IFACE_FMT, + config->pkey, UCT_IB_IFACE_ARG(iface)); + } + + return UCS_ERR_NO_ELEM; + } else { + ucs_assert(lim_pkey_index != UINT16_MAX); + iface->pkey_index = lim_pkey_index; + iface->pkey = lim_pkey; + } + } + + ucs_debug("using pkey[%d] 0x%x on "UCT_IB_IFACE_FMT, iface->pkey_index, + iface->pkey, UCT_IB_IFACE_ARG(iface)); + + return UCS_OK; } static ucs_status_t uct_ib_iface_init_lmc(uct_ib_iface_t *iface, @@ -485,7 +852,7 @@ static ucs_status_t uct_ib_iface_init_lmc(uct_ib_iface_t *iface, } } - ucs_assert(iface->path_bits_count <= num_path_bits); + ucs_assert(iface->path_bits_count < num_path_bits); iface->path_bits[iface->path_bits_count] = j; iface->path_bits_count++; } @@ -501,7 +868,7 @@ void uct_ib_iface_fill_attr(uct_ib_iface_t *iface, uct_ib_qp_attr_t *attr) attr->ibv.srq = attr->srq; attr->ibv.cap = attr->cap; - attr->ibv.qp_type = attr->qp_type; + attr->ibv.qp_type = (enum ibv_qp_type)attr->qp_type; attr->ibv.sq_sig_all = attr->sq_sig_all; #if HAVE_DECL_IBV_EXP_CREATE_QP @@ -516,15 +883,13 @@ void uct_ib_iface_fill_attr(uct_ib_iface_t *iface, uct_ib_qp_attr_t *attr) } #endif + attr->port = iface->config.port_num; + if (attr->qp_type == IBV_QPT_UD) { return; } -#if HAVE_IB_EXT_ATOMICS - attr->ibv.comp_mask |= IBV_EXP_QP_INIT_ATTR_ATOMICS_ARG; - attr->ibv.max_atomic_arg = UCT_IB_MAX_ATOMIC_SIZE; -#endif - + /* MOFED requires this to enable IB spec atomic */ #if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE if (uct_ib_iface_device(iface)->dev_attr.exp_atomic_cap == IBV_EXP_ATOMIC_HCA_REPLY_BE) 
{ @@ -532,11 +897,6 @@ void uct_ib_iface_fill_attr(uct_ib_iface_t *iface, uct_ib_qp_attr_t *attr) attr->ibv.exp_create_flags = IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY; } #endif - -#if HAVE_STRUCT_IBV_EXP_QP_INIT_ATTR_MAX_INL_RECV - attr->ibv.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV; - attr->ibv.max_inl_recv = attr->max_inl_recv; -#endif } ucs_status_t uct_ib_iface_create_qp(uct_ib_iface_t *iface, @@ -556,87 +916,99 @@ ucs_status_t uct_ib_iface_create_qp(uct_ib_iface_t *iface, qp = ibv_create_qp(uct_ib_iface_md(iface)->pd, &attr->ibv); #endif if (qp == NULL) { - ucs_error("iface=%p: failed to create %s QP TX wr:%d sge:%d inl:%d RX wr:%d sge:%d inl %d: %m", + ucs_error("iface=%p: failed to create %s QP " + "TX wr:%d sge:%d inl:%d resp:%d RX wr:%d sge:%d resp:%d: %m", iface, uct_ib_qp_type_str(attr->qp_type), - attr->cap.max_send_wr, attr->cap.max_send_sge, attr->cap.max_inline_data, - attr->cap.max_recv_wr, attr->cap.max_recv_sge, attr->max_inl_recv); + attr->cap.max_send_wr, attr->cap.max_send_sge, + attr->cap.max_inline_data, attr->max_inl_cqe[UCT_IB_DIR_TX], + attr->cap.max_recv_wr, attr->cap.max_recv_sge, + attr->max_inl_cqe[UCT_IB_DIR_RX]); return UCS_ERR_IO_ERROR; } attr->cap = attr->ibv.cap; *qp_p = qp; - ucs_debug("iface=%p: created %s QP 0x%x on %s:%d TX wr:%d sge:%d inl:%d RX wr:%d sge:%d inl %d", + ucs_debug("iface=%p: created %s QP 0x%x on %s:%d " + "TX wr:%d sge:%d inl:%d resp:%d RX wr:%d sge:%d resp:%d", iface, uct_ib_qp_type_str(attr->qp_type), qp->qp_num, uct_ib_device_name(dev), iface->config.port_num, - attr->cap.max_send_wr, attr->cap.max_send_sge, attr->cap.max_inline_data, - attr->cap.max_recv_wr, attr->cap.max_recv_sge, attr->max_inl_recv); + attr->cap.max_send_wr, attr->cap.max_send_sge, + attr->cap.max_inline_data, attr->max_inl_cqe[UCT_IB_DIR_TX], + attr->cap.max_recv_wr, attr->cap.max_recv_sge, + attr->max_inl_cqe[UCT_IB_DIR_RX]); return UCS_OK; } -ucs_status_t uct_ib_verbs_create_cq(struct ibv_context *context, int cqe, - struct 
ibv_comp_channel *channel, - int comp_vector, int ignore_overrun, - size_t *inl, struct ibv_cq **cq_p) +ucs_status_t uct_ib_verbs_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir, + const uct_ib_iface_init_attr_t *init_attr, + int preferred_cpu, size_t inl) { + uct_ib_device_t *dev = uct_ib_iface_device(iface); struct ibv_cq *cq; #if HAVE_DECL_IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN struct ibv_cq_init_attr_ex cq_attr = {}; - if (uct_ib_device_is_hns(context->device)) { - *inl = 0; - cq = ibv_create_cq(context, cqe, NULL, channel, comp_vector); + if (uct_ib_device_is_hns(dev->ibv_context)) { + iface->config.max_inl_cqe[dir] = 0; + cq = ibv_create_cq(dev->ibv_context, init_attr->cq_len[dir], NULL, + iface->comp_channel, preferred_cpu); } else { - cq_attr.cqe = cqe; - cq_attr.channel = channel; - cq_attr.comp_vector = comp_vector; - if (ignore_overrun) { + cq_attr.cqe = init_attr->cq_len[dir]; + cq_attr.channel = iface->comp_channel; + cq_attr.comp_vector = preferred_cpu; + if (init_attr->flags & UCT_IB_CQ_IGNORE_OVERRUN) { cq_attr.comp_mask = IBV_CQ_INIT_ATTR_MASK_FLAGS; - cq_attr.flags = IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN; + cq_attr.flags = IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN; } - cq = ibv_cq_ex_to_cq(ibv_create_cq_ex(context, &cq_attr)); + cq = ibv_cq_ex_to_cq(ibv_create_cq_ex(dev->ibv_context, &cq_attr)); } if (!cq && (errno == ENOSYS)) #endif { - *inl = 0; - cq = ibv_create_cq(context, cqe, NULL, channel, comp_vector); + iface->config.max_inl_cqe[dir] = 0; + cq = ibv_create_cq(dev->ibv_context, init_attr->cq_len[dir], NULL, + iface->comp_channel, preferred_cpu); } if (!cq) { - ucs_error("ibv_create_cq(cqe=%d) failed: %m", cqe); + ucs_error("ibv_create_cq(cqe=%d) failed: %m", init_attr->cq_len[dir]); return UCS_ERR_IO_ERROR; } - *cq_p = cq; + iface->cq[dir] = cq; + iface->config.max_inl_cqe[dir] = inl; return UCS_OK; } -static ucs_status_t uct_ib_iface_create_cq(uct_ib_iface_t *iface, int cq_length, - size_t *inl, int preferred_cpu, - int flags, struct ibv_cq **cq_p) 
+static ucs_status_t +uct_ib_iface_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir, + const uct_ib_iface_init_attr_t *init_attr, + const uct_ib_iface_config_t *config, + int preferred_cpu) { - uct_ib_device_t *dev = uct_ib_iface_device(iface); ucs_status_t status; + size_t inl = config->inl[dir]; #if HAVE_DECL_IBV_EXP_SETENV && !HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE + uct_ib_device_t *dev = uct_ib_iface_device(iface); static const char *cqe_size_env_var = "MLX5_CQE_SIZE"; + size_t cqe_size = 64; + int env_var_added = 0; const char *cqe_size_env_value; - size_t cqe_size = 64; size_t cqe_size_min; char cqe_size_buf[32]; - int env_var_added = 0; int ret; - cqe_size_min = (*inl > 32) ? 128 : 64; + cqe_size_min = (inl > 32) ? 128 : 64; cqe_size_env_value = getenv(cqe_size_env_var); if (cqe_size_env_value != NULL) { cqe_size = atol(cqe_size_env_value); if (cqe_size < cqe_size_min) { ucs_error("%s is set to %zu, but at least %zu is required (inl: %zu)", - cqe_size_env_var, cqe_size, cqe_size_min, *inl); + cqe_size_env_var, cqe_size, cqe_size_min, inl); return UCS_ERR_INVALID_PARAM; } } else { @@ -654,9 +1026,7 @@ static ucs_status_t uct_ib_iface_create_cq(uct_ib_iface_t *iface, int cq_length, env_var_added = 1; } #endif - status = iface->ops->create_cq(dev->ibv_context, cq_length, - iface->comp_channel, preferred_cpu, - flags & UCT_IB_CQ_IGNORE_OVERRUN, inl, cq_p); + status = iface->ops->create_cq(iface, dir, init_attr, preferred_cpu, inl); if (status != UCS_OK) { goto out_unsetenv; } @@ -665,7 +1035,7 @@ static ucs_status_t uct_ib_iface_create_cq(uct_ib_iface_t *iface, int cq_length, out_unsetenv: #if HAVE_DECL_IBV_EXP_SETENV && !HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE - *inl = cqe_size / 2; + iface->config.max_inl_cqe[dir] = cqe_size / 2; if (env_var_added) { /* if we created a new environment variable, remove it */ ret = ibv_exp_unsetenv(dev->ibv_context, cqe_size_env_var); @@ -719,22 +1089,120 @@ static ucs_status_t uct_ib_iface_set_moderation(struct 
ibv_cq *cq, return UCS_OK; } +static void uct_ib_iface_set_num_paths(uct_ib_iface_t *iface, + const uct_ib_iface_config_t *config) +{ + uct_ib_device_t *dev = uct_ib_iface_device(iface); + + if (config->num_paths == UCS_ULUNITS_AUTO) { + if (uct_ib_iface_is_roce(iface)) { + /* RoCE - number of paths is RoCE LAG level */ + iface->num_paths = + uct_ib_device_get_roce_lag_level(dev, iface->config.port_num); + } else { + /* IB - number of paths is LMC level */ + ucs_assert(iface->path_bits_count > 0); + iface->num_paths = iface->path_bits_count; + } + } else { + iface->num_paths = config->num_paths; + } +} + +int uct_ib_iface_is_roce_v2(uct_ib_iface_t *iface, uct_ib_device_t *dev) +{ + return uct_ib_iface_is_roce(iface) && + (iface->gid_info.roce_info.ver == UCT_IB_DEVICE_ROCE_V2); +} + +ucs_status_t uct_ib_iface_init_roce_gid_info(uct_ib_iface_t *iface, + size_t md_config_index) +{ + uct_ib_device_t *dev = uct_ib_iface_device(iface); + uint8_t port_num = iface->config.port_num; + + ucs_assert(uct_ib_iface_is_roce(iface)); + + if (md_config_index == UCS_ULUNITS_AUTO) { + return uct_ib_device_select_gid(dev, port_num, &iface->gid_info); + } + + return uct_ib_device_query_gid_info(dev->ibv_context, uct_ib_device_name(dev), + port_num, md_config_index, + &iface->gid_info); +} + +static ucs_status_t uct_ib_iface_init_gid_info(uct_ib_iface_t *iface, + size_t md_config_index) +{ + uct_ib_device_gid_info_t *gid_info = &iface->gid_info; + ucs_status_t status; + + /* Fill the gid index and the RoCE version */ + if (uct_ib_iface_is_roce(iface)) { + status = uct_ib_iface_init_roce_gid_info(iface, md_config_index); + if (status != UCS_OK) { + goto out; + } + } else { + gid_info->gid_index = (md_config_index == + UCS_ULUNITS_AUTO) ? 
+ UCT_IB_MD_DEFAULT_GID_INDEX : + md_config_index; + gid_info->roce_info.ver = UCT_IB_DEVICE_ROCE_ANY; + gid_info->roce_info.addr_family = 0; + } + + /* Fill the gid */ + status = uct_ib_device_query_gid(uct_ib_iface_device(iface), + iface->config.port_num, + gid_info->gid_index, &gid_info->gid); + if (status != UCS_OK) { + goto out; + } + +out: + return status; +} + +static void uct_ib_iface_set_path_mtu(uct_ib_iface_t *iface, + const uct_ib_iface_config_t *config) +{ + enum ibv_mtu port_mtu = uct_ib_iface_port_attr(iface)->active_mtu; + uct_ib_device_t *dev = uct_ib_iface_device(iface); + + /* MTU is set by user configuration */ + if (config->path_mtu != UCT_IB_MTU_DEFAULT) { + /* cast from uct_ib_mtu_t to ibv_mtu */ + iface->config.path_mtu = (enum ibv_mtu)(config->path_mtu + + (IBV_MTU_512 - UCT_IB_MTU_512)); + } else if ((port_mtu > IBV_MTU_2048) && + (IBV_DEV_ATTR(dev, vendor_id) == 0x02c9) && + ((IBV_DEV_ATTR(dev, vendor_part_id) == 4099) || + (IBV_DEV_ATTR(dev, vendor_part_id) == 4100) || + (IBV_DEV_ATTR(dev, vendor_part_id) == 4103) || + (IBV_DEV_ATTR(dev, vendor_part_id) == 4104))) { + /* On some devices optimal path_mtu is 2048 */ + iface->config.path_mtu = IBV_MTU_2048; + } else { + iface->config.path_mtu = port_mtu; + } +} + UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, const uct_ib_iface_config_t *config, const uct_ib_iface_init_attr_t *init_attr) { - uct_ib_md_t *ib_md = ucs_derived_of(md, uct_ib_md_t); + uct_ib_md_t *ib_md = ucs_derived_of(md, uct_ib_md_t); uct_ib_device_t *dev = &ib_md->dev; size_t rx_headroom = (params->field_mask & - UCT_IFACE_PARAM_FIELD_CPU_MASK) ? + UCT_IFACE_PARAM_FIELD_RX_HEADROOM) ? 
params->rx_headroom : 0; ucs_cpu_set_t cpu_mask; int preferred_cpu; ucs_status_t status; uint8_t port_num; - int is_roce_v2; - size_t inl; if (!(params->open_mode & UCT_IFACE_OPEN_MODE_DEVICE)) { return UCS_ERR_UNSUPPORTED; @@ -762,28 +1230,31 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, goto err; } - self->ops = ops; - - self->config.rx_payload_offset = sizeof(uct_ib_iface_recv_desc_t) + - ucs_max(sizeof(uct_recv_desc_t) + - rx_headroom, - init_attr->rx_priv_len + - init_attr->rx_hdr_len); - self->config.rx_hdr_offset = self->config.rx_payload_offset - - init_attr->rx_hdr_len; - self->config.rx_headroom_offset = self->config.rx_payload_offset - - rx_headroom; - self->config.seg_size = init_attr->seg_size; - self->config.tx_max_poll = config->tx.max_poll; - self->config.rx_max_poll = config->rx.max_poll; - self->config.rx_max_batch = ucs_min(config->rx.max_batch, - config->rx.queue_len / 4); - self->config.port_num = port_num; - self->config.sl = config->sl; - self->config.hop_limit = config->hop_limit; - self->release_desc.cb = uct_ib_iface_release_desc; - self->config.enable_res_domain = config->enable_res_domain; - self->config.qp_type = init_attr->qp_type; + self->ops = ops; + + self->config.rx_payload_offset = sizeof(uct_ib_iface_recv_desc_t) + + ucs_max(sizeof(uct_recv_desc_t) + + rx_headroom, + init_attr->rx_priv_len + + init_attr->rx_hdr_len); + self->config.rx_hdr_offset = self->config.rx_payload_offset - + init_attr->rx_hdr_len; + self->config.rx_headroom_offset = self->config.rx_payload_offset - + rx_headroom; + self->config.seg_size = init_attr->seg_size; + self->config.roce_path_factor = config->roce_path_factor; + self->config.tx_max_poll = config->tx.max_poll; + self->config.rx_max_poll = config->rx.max_poll; + self->config.rx_max_batch = ucs_min(config->rx.max_batch, + config->rx.queue_len / 4); + self->config.port_num = port_num; + self->config.sl = config->sl; + self->config.hop_limit = config->hop_limit; + 
self->release_desc.cb = uct_ib_iface_release_desc; + self->config.enable_res_domain = config->enable_res_domain; + self->config.enable_cuda_affinity = config->enable_cuda_affinity; + self->config.qp_type = init_attr->qp_type; + uct_ib_iface_set_path_mtu(self, config); if (ucs_derived_of(worker, uct_priv_worker_t)->thread_mode == UCS_THREAD_MODE_MULTI) { ucs_error("IB transports do not support multi-threaded worker"); @@ -795,22 +1266,14 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, goto err; } - status = uct_ib_device_select_gid_index(dev, self->config.port_num, - ib_md->config.gid_index, - &self->config.gid_index); + status = uct_ib_iface_init_gid_info(self, ib_md->config.gid_index); if (status != UCS_OK) { goto err; } - status = uct_ib_device_query_gid(dev, self->config.port_num, - self->config.gid_index, &self->gid, - &is_roce_v2); - if (status != UCS_OK) { - goto err; - } - - if (config->traffic_class == UCS_CONFIG_ULUNITS_AUTO) { - self->config.traffic_class = is_roce_v2 ? UCT_IB_DEFAULT_ROCEV2_DSCP : 0; + if (config->traffic_class == UCS_ULUNITS_AUTO) { + self->config.traffic_class = uct_ib_iface_is_roce_v2(self, dev) ? 
+ UCT_IB_DEFAULT_ROCEV2_DSCP : 0; } else { self->config.traffic_class = config->traffic_class; } @@ -820,10 +1283,7 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, goto err; } - status = self->ops->init_res_domain(self); - if (status != UCS_OK) { - goto err_free_path_bits; - } + uct_ib_iface_set_num_paths(self, config); self->comp_channel = ibv_create_comp_channel(dev->ibv_context); if (self->comp_channel == NULL) { @@ -837,15 +1297,11 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, goto err_destroy_comp_channel; } - inl = config->rx.inl; - status = uct_ib_iface_create_cq(self, init_attr->tx_cq_len, &inl, - preferred_cpu, init_attr->flags, - &self->cq[UCT_IB_DIR_TX]); + status = uct_ib_iface_create_cq(self, UCT_IB_DIR_TX, init_attr, + config, preferred_cpu); if (status != UCS_OK) { goto err_destroy_comp_channel; } - ucs_assert_always(inl <= UINT8_MAX); - self->config.max_inl_resp = inl; status = uct_ib_iface_set_moderation(self->cq[UCT_IB_DIR_TX], config->tx.cq_moderation_count, @@ -854,10 +1310,8 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, goto err_destroy_send_cq; } - inl = config->rx.inl; - status = uct_ib_iface_create_cq(self, init_attr->rx_cq_len, &inl, - preferred_cpu, init_attr->flags, - &self->cq[UCT_IB_DIR_RX]); + status = uct_ib_iface_create_cq(self, UCT_IB_DIR_RX, init_attr, + config, preferred_cpu); if (status != UCS_OK) { goto err_destroy_send_cq; } @@ -871,15 +1325,16 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, /* Address scope and size */ if (uct_ib_iface_is_roce(self) || config->is_global || + uct_ib_grh_required(uct_ib_iface_port_attr(self)) || /* check ADDR_TYPE for backward compatibility */ (config->addr_type == UCT_IB_ADDRESS_TYPE_SITE_LOCAL) || (config->addr_type == UCT_IB_ADDRESS_TYPE_GLOBAL)) { - self->is_global_addr = 1; + self->config.force_global_addr = 1; } else { - self->is_global_addr = 0; + 
self->config.force_global_addr = 0; } - self->addr_size = uct_ib_address_size(self); + self->addr_size = uct_ib_iface_address_size(self); ucs_debug("created uct_ib_iface_t headroom_ofs %d payload_ofs %d hdr_ofs %d data_sz %d", self->config.rx_headroom_offset, self->config.rx_payload_offset, @@ -894,8 +1349,6 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, err_destroy_comp_channel: ibv_destroy_comp_channel(self->comp_channel); err_cleanup: - self->ops->cleanup_res_domain(self); -err_free_path_bits: ucs_free(self->path_bits); err: return status; @@ -920,7 +1373,6 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ib_iface_t) ucs_warn("ibv_destroy_comp_channel(comp_channel) returned %d: %m", ret); } - self->ops->cleanup_res_domain(self); ucs_free(self->path_bits); } @@ -957,7 +1409,7 @@ static ucs_status_t uct_ib_iface_get_numa_latency(uct_ib_iface_t *iface, { uct_ib_device_t *dev = uct_ib_iface_device(iface); uct_ib_md_t *md = uct_ib_iface_md(iface); - cpu_set_t temp_cpu_mask, process_affinity; + ucs_sys_cpuset_t temp_cpu_mask, process_affinity; #if HAVE_NUMA int distance, min_cpu_distance; int cpu, num_cpus; @@ -969,7 +1421,7 @@ static ucs_status_t uct_ib_iface_get_numa_latency(uct_ib_iface_t *iface, return UCS_OK; } - ret = sched_getaffinity(0, sizeof(process_affinity), &process_affinity); + ret = ucs_sys_getaffinity(&process_affinity); if (ret) { ucs_error("sched_getaffinity() failed: %m"); return UCS_ERR_INVALID_PARAM; @@ -1008,6 +1460,49 @@ static ucs_status_t uct_ib_iface_get_numa_latency(uct_ib_iface_t *iface, return UCS_OK; } +static ucs_status_t uct_ib_iface_get_cuda_latency(uct_ib_iface_t *iface, + double *latency) +{ + ucs_sys_dev_distance_t dist = {0.0, 0.0}; + uct_ib_device_t *dev = uct_ib_iface_device(iface); + ucs_sys_device_t ib_sys_device; + ucs_sys_device_t cuda_sys_device; + ucs_sys_bus_id_t ib_bus_id; + ucs_sys_bus_id_t cuda_bus_id; + ucs_status_t status; + + status = ucm_get_mem_type_current_device_info(UCS_MEMORY_TYPE_CUDA, + 
&cuda_bus_id); + if (status != UCS_OK) { + *latency = 0.0; + return UCS_OK; + } + + status = ucs_topo_find_device_by_bus_id(&cuda_bus_id, &cuda_sys_device); + if (status != UCS_OK) { + return status; + } + + status = uct_ib_device_bus(dev, iface->config.port_num, &ib_bus_id); + if (status != UCS_OK) { + return status; + } + + status = ucs_topo_find_device_by_bus_id(&ib_bus_id, &ib_sys_device); + if (status != UCS_OK) { + return status; + } + + status = ucs_topo_get_distance(ib_sys_device, cuda_sys_device, &dist); + if (status != UCS_OK) { + return status; + } + + *latency = dist.latency; + + return UCS_OK; +} + ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len, uct_iface_attr_t *iface_attr) { @@ -1020,71 +1515,74 @@ ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len, [3] = 12, [4] = 16 }; - uint8_t active_width, active_speed, active_mtu; + uint8_t active_width, active_speed, active_mtu, width_idx; double encoding, signal_rate, wire_speed; size_t mtu, width, extra_pkt_len; ucs_status_t status; double numa_latency; + double cuda_latency; + + uct_base_iface_query(&iface->super, iface_attr); active_width = uct_ib_iface_port_attr(iface)->active_width; active_speed = uct_ib_iface_port_attr(iface)->active_speed; active_mtu = uct_ib_iface_port_attr(iface)->active_mtu; /* Get active width */ + width_idx = ucs_ilog2(active_width); if (!ucs_is_pow2(active_width) || - (active_width < 1) || (ucs_ilog2(active_width) > 4)) + (active_width < 1) || (width_idx > 4)) { ucs_error("Invalid active_width on %s:%d: %d", UCT_IB_IFACE_ARG(iface), active_width); return UCS_ERR_IO_ERROR; } - memset(iface_attr, 0, sizeof(*iface_attr)); - iface_attr->device_addr_len = iface->addr_size; + iface_attr->dev_num_paths = iface->num_paths; switch (active_speed) { case 1: /* SDR */ - iface_attr->latency.overhead = 5000e-9; - signal_rate = 2.5e9; - encoding = 8.0/10.0; + iface_attr->latency.c = 5000e-9; + signal_rate = 2.5e9; + encoding = 8.0/10.0; 
break; case 2: /* DDR */ - iface_attr->latency.overhead = 2500e-9; - signal_rate = 5.0e9; - encoding = 8.0/10.0; + iface_attr->latency.c = 2500e-9; + signal_rate = 5.0e9; + encoding = 8.0/10.0; break; case 4: - iface_attr->latency.overhead = 1300e-9; + iface_attr->latency.c = 1300e-9; if (uct_ib_iface_is_roce(iface)) { /* 10/40g Eth */ - signal_rate = 10.3125e9; - encoding = 64.0/66.0; + signal_rate = 10.3125e9; + encoding = 64.0/66.0; } else { /* QDR */ - signal_rate = 10.0e9; - encoding = 8.0/10.0; + signal_rate = 10.0e9; + encoding = 8.0/10.0; } break; case 8: /* FDR10 */ - iface_attr->latency.overhead = 700e-9; - signal_rate = 10.3125e9; - encoding = 64.0/66.0; + iface_attr->latency.c = 700e-9; + signal_rate = 10.3125e9; + encoding = 64.0/66.0; break; case 16: /* FDR */ - iface_attr->latency.overhead = 700e-9; - signal_rate = 14.0625e9; - encoding = 64.0/66.0; + iface_attr->latency.c = 700e-9; + signal_rate = 14.0625e9; + encoding = 64.0/66.0; break; case 32: /* EDR / 100g Eth */ - iface_attr->latency.overhead = 600e-9; - signal_rate = 25.78125e9; - encoding = 64.0/66.0; + iface_attr->latency.c = 600e-9; + signal_rate = 25.78125e9; + encoding = 64.0/66.0; break; case 64: /* 50g Eth */ - iface_attr->latency.overhead = 600e-9; - signal_rate = 25.78125e9 * 2; - encoding = 64.0/66.0; + iface_attr->latency.c = 600e-9; + signal_rate = 25.78125e9 * 2; + encoding = 64.0/66.0; break; default: ucs_error("Invalid active_speed on %s:%d: %d", @@ -1097,29 +1595,40 @@ ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len, return status; } - iface_attr->latency.overhead += numa_latency; - iface_attr->latency.growth = 0; + iface_attr->latency.c += numa_latency; + iface_attr->latency.m = 0; + + if (iface->config.enable_cuda_affinity != UCS_NO) { + status = uct_ib_iface_get_cuda_latency(iface, &cuda_latency); + if (status != UCS_OK) { + return status; + } + + iface_attr->latency.c += cuda_latency; + iface_attr->latency.m = 0; + } /* Wire speed calculation: 
Width * SignalRate * Encoding */ - width = ib_port_widths[ucs_ilog2(active_width)]; + width = ib_port_widths[width_idx]; wire_speed = (width * signal_rate * encoding) / 8.0; /* Calculate packet overhead */ - mtu = ucs_min(uct_ib_mtu_value(active_mtu), + mtu = ucs_min(uct_ib_mtu_value((enum ibv_mtu)active_mtu), iface->config.seg_size); extra_pkt_len = UCT_IB_BTH_LEN + xport_hdr_len + UCT_IB_ICRC_LEN + UCT_IB_VCRC_LEN + UCT_IB_DELIM_LEN; if (uct_ib_iface_is_roce(iface)) { extra_pkt_len += UCT_IB_GRH_LEN + UCT_IB_ROCE_LEN; - iface_attr->latency.overhead += 200e-9; + iface_attr->latency.c += 200e-9; } else { /* TODO check if UCT_IB_DELIM_LEN is present in RoCE as well */ extra_pkt_len += UCT_IB_LRH_LEN; } - iface_attr->bandwidth = ucs_min((wire_speed * mtu) / (mtu + extra_pkt_len), md->pci_bw); - iface_attr->priority = uct_ib_device_spec(dev)->priority; + iface_attr->bandwidth.shared = ucs_min((wire_speed * mtu) / (mtu + extra_pkt_len), md->pci_bw); + iface_attr->bandwidth.dedicated = 0; + iface_attr->priority = uct_ib_device_spec(dev)->priority; return UCS_OK; } @@ -1127,7 +1636,7 @@ ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len, ucs_status_t uct_ib_iface_event_fd_get(uct_iface_h tl_iface, int *fd_p) { uct_ib_iface_t *iface = ucs_derived_of(tl_iface, uct_ib_iface_t); - *fd_p = iface->comp_channel->fd; + *fd_p = iface->comp_channel->fd; return UCS_OK; } diff --git a/src/uct/ib/base/ib_iface.h b/src/uct/ib/base/ib_iface.h index a64139d7327..78f9dfb4186 100644 --- a/src/uct/ib/base/ib_iface.h +++ b/src/uct/ib/base/ib_iface.h @@ -11,15 +11,19 @@ #include #include +#include #include -#include -#include +#include #include +#include #define UCT_IB_MAX_IOV 8UL #define UCT_IB_IFACE_NULL_RES_DOMAIN_KEY 0u #define UCT_IB_MAX_ATOMIC_SIZE sizeof(uint64_t) - +#define UCT_IB_ADDRESS_INVALID_GID_INDEX UINT8_MAX +#define UCT_IB_ADDRESS_INVALID_PATH_MTU 0 +#define UCT_IB_ADDRESS_INVALID_PKEY 0 +#define UCT_IB_ADDRESS_DEFAULT_PKEY 0xffff /* Forward 
declarations */ typedef struct uct_ib_iface_config uct_ib_iface_config_t; @@ -51,24 +55,60 @@ typedef enum { enum { UCT_IB_QPT_UNKNOWN, -#if HAVE_DC_EXP +#ifdef HAVE_DC_EXP UCT_IB_QPT_DCI = IBV_EXP_QPT_DC_INI, #elif HAVE_DC_DV UCT_IB_QPT_DCI = IBV_QPT_DRIVER, #endif }; + +/** + * IB address packing flags + */ +enum { + UCT_IB_ADDRESS_PACK_FLAG_ETH = UCS_BIT(0), + UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID = UCS_BIT(1), + UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX = UCS_BIT(2), + UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU = UCS_BIT(3), + UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX = UCS_BIT(4), + UCT_IB_ADDRESS_PACK_FLAG_PKEY = UCS_BIT(5) +}; + + +typedef struct uct_ib_address_pack_params { + /* Packing flags, UCT_IB_ADDRESS_PACK_FLAG_xx. */ + uint64_t flags; + /* GID address to pack/unpack. */ + union ibv_gid gid; + /* LID address to pack/unpack. */ + uint16_t lid; + /* RoCE version to pack/unpack in case of an Ethernet link layer, + must be valid if @ref UCT_IB_ADDRESS_PACK_FLAG_ETH is set. */ + uct_ib_roce_version_info_t roce_info; + /* path MTU size as defined in enum ibv_mtu, + must be valid if @ref UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU is set. */ + enum ibv_mtu path_mtu; + /* GID index, + must be valid if @ref UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX is set. */ + uint8_t gid_index; + /* PKEY value, + must be valid if @ref UCT_IB_ADDRESS_PACK_FLAG_PKEY is set. 
*/ + uint16_t pkey; +} uct_ib_address_pack_params_t; + + struct uct_ib_iface_config { uct_iface_config_t super; + size_t seg_size; /* Maximal size of copy-out sends */ + struct { unsigned queue_len; /* Queue length */ unsigned max_batch; /* How many fragments can be batched to one post send */ unsigned max_poll; /* How many wcs can be picked when polling tx cq */ size_t min_inline; /* Inline space to reserve for sends */ - size_t inl_resp; /* Inline space to reserve for responses */ unsigned min_sge; /* How many SG entries to support */ - unsigned cq_moderation; /* How many TX messages are batched to one CQE */ uct_iface_mpool_config_t mp; /* Event moderation parameters */ @@ -80,7 +120,6 @@ struct uct_ib_iface_config { unsigned queue_len; /* Queue length */ unsigned max_batch; /* How many buffers can be batched to one post receive */ unsigned max_poll; /* How many wcs can be picked when polling rx cq */ - size_t inl; /* Inline space to reserve in CQ/QP */ uct_iface_mpool_config_t mp; /* Event moderation parameters */ @@ -88,10 +127,13 @@ struct uct_ib_iface_config { double cq_moderation_period; } rx; + /* Inline space to reserve in CQ */ + size_t inl[UCT_IB_DIR_NUM]; + /* Change the address type */ int addr_type; - /* Forice global routing */ + /* Force global routing */ int is_global; /* IB SL to use */ @@ -103,23 +145,54 @@ struct uct_ib_iface_config { /* IB hop limit / TTL */ unsigned hop_limit; + /* Number of paths to expose for the interface */ + unsigned long num_paths; + + /* Multiplier for RoCE LAG UDP source port calculation */ + unsigned roce_path_factor; + /* Ranges of path bits */ UCS_CONFIG_ARRAY_FIELD(ucs_range_spec_t, ranges) lid_path_bits; /* IB PKEY to use */ - unsigned pkey_value; + unsigned pkey; /* Multiple resource domains */ int enable_res_domain; + + /* Path MTU size */ + uct_ib_mtu_t path_mtu; + + /* Allow IB devices to be penalized based on distance from CUDA device */ + int enable_cuda_affinity; }; +enum { + UCT_IB_CQ_IGNORE_OVERRUN = 
UCS_BIT(0), + UCT_IB_TM_SUPPORTED = UCS_BIT(1) +}; + + +typedef struct uct_ib_iface_init_attr { + unsigned rx_priv_len; /* Length of transport private data to reserve */ + unsigned rx_hdr_len; /* Length of transport network header */ + unsigned cq_len[UCT_IB_DIR_NUM]; /* CQ length */ + size_t seg_size; /* Transport segment size */ + unsigned fc_req_size; /* Flow control request size */ + int qp_type; /* IB QP type */ + int flags; /* Various flags (see enum) */ +} uct_ib_iface_init_attr_t; + + typedef struct uct_ib_qp_attr { int qp_type; struct ibv_qp_cap cap; + int port; struct ibv_srq *srq; + uint32_t srq_num; unsigned sq_sig_all; - unsigned max_inl_recv; + unsigned max_inl_cqe[UCT_IB_DIR_NUM]; #if HAVE_DECL_IBV_EXP_CREATE_QP struct ibv_exp_qp_init_attr ibv; #elif HAVE_DECL_IBV_CREATE_QP_EX @@ -130,80 +203,82 @@ typedef struct uct_ib_qp_attr { } uct_ib_qp_attr_t; +typedef ucs_status_t (*uct_ib_iface_create_cq_func_t)(uct_ib_iface_t *iface, + uct_ib_dir_t dir, + const uct_ib_iface_init_attr_t *init_attr, + int preferred_cpu, + size_t inl); + +typedef ucs_status_t (*uct_ib_iface_arm_cq_func_t)(uct_ib_iface_t *iface, + uct_ib_dir_t dir, + int solicited_only); + +typedef void (*uct_ib_iface_event_cq_func_t)(uct_ib_iface_t *iface, + uct_ib_dir_t dir); + +typedef void (*uct_ib_iface_handle_failure_func_t)(uct_ib_iface_t *iface, void *arg, + ucs_status_t status); + +typedef ucs_status_t (*uct_ib_iface_set_ep_failed_func_t)(uct_ib_iface_t *iface, uct_ep_h ep, + ucs_status_t status); + + struct uct_ib_iface_ops { - uct_iface_ops_t super; - ucs_status_t (*create_cq)(struct ibv_context *context, int cqe, - struct ibv_comp_channel *channel, - int comp_vector, int ignore_overrun, - size_t *inl, struct ibv_cq **cq_p); - ucs_status_t (*arm_cq)(uct_ib_iface_t *iface, - uct_ib_dir_t dir, - int solicited_only); - void (*event_cq)(uct_ib_iface_t *iface, - uct_ib_dir_t dir); - void (*handle_failure)(uct_ib_iface_t *iface, void *arg, - ucs_status_t status); - ucs_status_t 
(*set_ep_failed)(uct_ib_iface_t *iface, uct_ep_h ep, - ucs_status_t status); - ucs_status_t (*create_qp)(uct_ib_iface_t *iface, uct_ib_qp_attr_t *attr, - struct ibv_qp **qp_p); - ucs_status_t (*init_res_domain)(uct_ib_iface_t *iface); - void (*cleanup_res_domain)(uct_ib_iface_t *iface); + uct_iface_ops_t super; + uct_ib_iface_create_cq_func_t create_cq; + uct_ib_iface_arm_cq_func_t arm_cq; + uct_ib_iface_event_cq_func_t event_cq; + uct_ib_iface_handle_failure_func_t handle_failure; + uct_ib_iface_set_ep_failed_func_t set_ep_failed; }; struct uct_ib_iface { - uct_base_iface_t super; + uct_base_iface_t super; - struct ibv_cq *cq[UCT_IB_DIR_NUM]; - struct ibv_comp_channel *comp_channel; - uct_recv_desc_t release_desc; + struct ibv_cq *cq[UCT_IB_DIR_NUM]; + struct ibv_comp_channel *comp_channel; + uct_recv_desc_t release_desc; - uint8_t *path_bits; - unsigned path_bits_count; - uint16_t pkey_index; - uint16_t pkey_value; - uint8_t is_global_addr; - uint8_t addr_size; - union ibv_gid gid; + uint8_t *path_bits; + unsigned path_bits_count; + unsigned num_paths; + uint16_t pkey_index; + uint16_t pkey; + uint8_t addr_size; + uct_ib_device_gid_info_t gid_info; struct { - unsigned rx_payload_offset; /* offset from desc to payload */ - unsigned rx_hdr_offset; /* offset from desc to network header */ - unsigned rx_headroom_offset; /* offset from desc to user headroom */ - unsigned rx_max_batch; - unsigned rx_max_poll; - unsigned tx_max_poll; - unsigned seg_size; - uint8_t max_inl_resp; - uint8_t port_num; - uint8_t sl; - uint8_t traffic_class; - uint8_t hop_limit; - uint8_t gid_index; /* IB GID index to use */ - uint8_t enable_res_domain; /* Disable multiple resource domains */ - uint8_t qp_type; - size_t max_iov; /* Maximum buffers in IOV array */ + unsigned rx_payload_offset; /* offset from desc to payload */ + unsigned rx_hdr_offset; /* offset from desc to network header */ + unsigned rx_headroom_offset; /* offset from desc to user headroom */ + unsigned rx_max_batch; + 
unsigned rx_max_poll; + unsigned tx_max_poll; + unsigned seg_size; + unsigned roce_path_factor; + uint8_t max_inl_cqe[UCT_IB_DIR_NUM]; + uint8_t port_num; + uint8_t sl; + uint8_t traffic_class; + uint8_t hop_limit; + uint8_t enable_res_domain; /* Disable multiple resource domains */ + uint8_t enable_cuda_affinity; + uint8_t qp_type; + uint8_t force_global_addr; + enum ibv_mtu path_mtu; } config; - uct_ib_iface_ops_t *ops; + uct_ib_iface_ops_t *ops; }; -enum { - UCT_IB_CQ_IGNORE_OVERRUN = UCS_BIT(0), -}; -typedef struct uct_ib_iface_init_attr { - unsigned rx_priv_len; /* Length of transport private data to reserve */ - unsigned rx_hdr_len; /* Length of transport network header */ - unsigned tx_cq_len; /* Send CQ length */ - unsigned rx_cq_len; /* Receive CQ length */ - size_t seg_size; /* Transport segment size */ - int tm_cap_bit; /* Required HW tag-matching capabilities */ - unsigned fc_req_size; /* Flow control request size */ - int qp_type; /* IB QP type */ - int flags; /* Various flags (see enum) */ -} uct_ib_iface_init_attr_t; +typedef struct uct_ib_fence_info { + uint16_t fence_beat; /* 16bit is enough because if it wraps around, + * it means the older ops are already completed + * because QP size is less than 64k */ +} uct_ib_fence_info_t; + UCS_CLASS_DECLARE(uct_ib_iface_t, uct_ib_iface_ops_t*, uct_md_h, uct_worker_h, const uct_iface_params_t*, const uct_ib_iface_config_t*, @@ -292,34 +367,68 @@ int uct_ib_iface_is_roce(uct_ib_iface_t *iface); /** + * @return Whether the port used by this interface is IB + */ +int uct_ib_iface_is_ib(uct_ib_iface_t *iface); + + +/** + * Get the expected size of IB packed address. + * + * @param [in] params Address parameters as defined in + * @ref uct_ib_address_pack_params_t. + * * @return IB address size of the given link scope. */ -size_t uct_ib_address_size(uct_ib_iface_t *iface); +size_t uct_ib_address_size(const uct_ib_address_pack_params_t *params); + + +/** + * @return IB address packing flags of the given iface. 
+ */ +unsigned uct_ib_iface_address_pack_flags(uct_ib_iface_t *iface); + + +/** + * @return IB address size of the given iface. + */ +size_t uct_ib_iface_address_size(uct_ib_iface_t *iface); /** * Pack IB address. * - * @param [in] dev IB device. TODO remove this. - * @param [in] gid GID address to pack. - * @param [in] lid LID address to pack. - * @param [out] ib_addr Filled with packed ib address. Size of the structure - * must be at least what @ref uct_ib_address_size() returns - * for the given scope. + * @param [in] params Address parameters as defined in + * @ref uct_ib_address_pack_params_t. + * @param [in/out] ib_addr Filled with packed ib address. Size of the structure + * must be at least what @ref uct_ib_address_size() + * returns for the given scope. */ -void uct_ib_address_pack(uct_ib_iface_t *iface, - const union ibv_gid *gid, uint16_t lid, +void uct_ib_address_pack(const uct_ib_address_pack_params_t *params, uct_ib_address_t *ib_addr); + +/** + * Pack the IB address of the given iface. + * + * @param [in] iface Iface whose IB address to pack. + * @param [in/out] ib_addr Filled with packed ib address. Size of the structure + * must be at least what @ref uct_ib_address_size() + * returns for the given scope. + */ +void uct_ib_iface_address_pack(uct_ib_iface_t *iface, uct_ib_address_t *ib_addr); + + /** * Unpack IB address. * * @param [in] ib_addr IB address to unpack. - * @param [out] lid Filled with address LID, or 0 if not present. + * @param [out] params_p Filled with address attributes as in + * @ref uct_ib_address_pack_params_t. 
*/ -void uct_ib_address_unpack(const uct_ib_address_t *ib_addr, uint16_t *lid, - union ibv_gid *gid); +void uct_ib_address_unpack(const uct_ib_address_t *ib_addr, + uct_ib_address_pack_params_t *params_p); /** @@ -340,6 +449,20 @@ int uct_ib_iface_is_reachable(const uct_iface_h tl_iface, const uct_device_addr_ ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len, uct_iface_attr_t *iface_attr); + +int uct_ib_iface_is_roce_v2(uct_ib_iface_t *iface, uct_ib_device_t *dev); + + +/** + * Select the IB gid index and RoCE version to use for a RoCE port. + * + * @param iface IB interface + * @param md_config_index Gid index from the md configuration. + */ +ucs_status_t uct_ib_iface_init_roce_gid_info(uct_ib_iface_t *iface, + size_t md_config_index); + + static inline uct_ib_md_t* uct_ib_iface_md(uct_ib_iface_t *iface) { return ucs_derived_of(iface->super.md, uct_ib_md_t); @@ -350,7 +473,7 @@ static inline uct_ib_device_t* uct_ib_iface_device(uct_ib_iface_t *iface) return &uct_ib_iface_md(iface)->dev; } -static inline struct ibv_exp_port_attr* uct_ib_iface_port_attr(uct_ib_iface_t *iface) +static inline struct ibv_port_attr* uct_ib_iface_port_attr(uct_ib_iface_t *iface) { return uct_ib_device_port_attr(uct_ib_iface_device(iface), iface->config.port_num); } @@ -379,6 +502,18 @@ ucs_status_t uct_ib_iface_create_ah(uct_ib_iface_t *iface, struct ibv_ah_attr *ah_attr, struct ibv_ah **ah_p); +void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid, + const union ibv_gid *gid, + uint8_t gid_index, + unsigned path_index, + struct ibv_ah_attr *ah_attr); + +void uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface, + const uct_ib_address_t *ib_addr, + unsigned path_index, + struct ibv_ah_attr *ah_attr, + enum ibv_mtu *path_mtu); + ucs_status_t uct_ib_iface_pre_arm(uct_ib_iface_t *iface); ucs_status_t uct_ib_iface_event_fd_get(uct_iface_h iface, int *fd_p); @@ -387,15 +522,9 @@ ucs_status_t uct_ib_iface_arm_cq(uct_ib_iface_t *iface, 
uct_ib_dir_t dir, int solicited_only); -static inline uint8_t uct_ib_iface_get_atomic_mr_id(uct_ib_iface_t *iface) -{ - return uct_ib_md_get_atomic_mr_id(ucs_derived_of(iface->super.md, uct_ib_md_t)); -} - -ucs_status_t uct_ib_verbs_create_cq(struct ibv_context *context, int cqe, - struct ibv_comp_channel *channel, - int comp_vector, int ignore_overrun, - size_t *inl, struct ibv_cq **cq_p); +ucs_status_t uct_ib_verbs_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir, + const uct_ib_iface_init_attr_t *init_attr, + int preferred_cpu, size_t inl); ucs_status_t uct_ib_iface_create_qp(uct_ib_iface_t *iface, uct_ib_qp_attr_t *attr, @@ -453,7 +582,7 @@ size_t uct_ib_verbs_sge_fill_iov(struct ibv_sge *sge, const uct_iov_t *iov, if (iov[sge_it].memh == UCT_MEM_HANDLE_NULL) { sge[sge_it].lkey = 0; } else { - sge[sge_it].lkey = ((uct_ib_mem_t *)(iov[iov_it].memh))->lkey; + sge[sge_it].lkey = uct_ib_memh_get_lkey(iov[iov_it].memh); } ++sge_it; } @@ -461,70 +590,16 @@ size_t uct_ib_verbs_sge_fill_iov(struct ibv_sge *sge, const uct_iov_t *iov, return sge_it; } - static UCS_F_ALWAYS_INLINE -size_t uct_ib_iface_get_max_iov(uct_ib_iface_t *iface) -{ - return iface->config.max_iov; -} - - -static UCS_F_ALWAYS_INLINE -void uct_ib_iface_set_max_iov(uct_ib_iface_t *iface, size_t max_iov) -{ - size_t min_iov_requested; - - ucs_assert((ssize_t)max_iov > 0); - - min_iov_requested = ucs_max(max_iov, 1UL); /* max_iov mustn't be 0 */ - iface->config.max_iov = ucs_min(UCT_IB_MAX_IOV, min_iov_requested); -} - - -static UCS_F_ALWAYS_INLINE -void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid, - const union ibv_gid *gid, - uint8_t path_bits, - struct ibv_ah_attr *ah_attr) -{ - memset(ah_attr, 0, sizeof(*ah_attr)); - - ah_attr->sl = iface->config.sl; - ah_attr->src_path_bits = path_bits; - ah_attr->dlid = lid | path_bits; - ah_attr->port_num = iface->config.port_num; - ah_attr->grh.traffic_class = iface->config.traffic_class; - - if (iface->is_global_addr || - 
(iface->gid.global.subnet_prefix != gid->global.subnet_prefix)) { - ucs_assert_always(gid->global.interface_id != 0); - ah_attr->is_global = 1; - ah_attr->grh.dgid = *gid; - ah_attr->grh.sgid_index = iface->config.gid_index; - ah_attr->grh.hop_limit = iface->config.hop_limit; - } else { - ah_attr->is_global = 0; - } -} - -static UCS_F_ALWAYS_INLINE -void uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface, - const uct_ib_address_t *ib_addr, - uint8_t path_bits, - struct ibv_ah_attr *ah_attr) +size_t uct_ib_iface_hdr_size(size_t max_inline, size_t min_size) { - union ibv_gid gid; - uint16_t lid; - - uct_ib_address_unpack(ib_addr, &lid, &gid); - - uct_ib_iface_fill_ah_attr_from_gid_lid(iface, lid, &gid, path_bits, ah_attr); + return (size_t)ucs_max((ssize_t)(max_inline - min_size), 0); } -static UCS_F_ALWAYS_INLINE -size_t uct_ib_iface_hdr_size(size_t max_inline, size_t min_size) +static UCS_F_ALWAYS_INLINE void +uct_ib_fence_info_init(uct_ib_fence_info_t* fence) { - return (size_t)ucs_max((ssize_t)(max_inline - min_size), 0); + fence->fence_beat = 0; } #endif diff --git a/src/uct/ib/base/ib_log.c b/src/uct/ib/base/ib_log.c index 5a4f74da4b2..d79c6ebb1b7 100644 --- a/src/uct/ib/base/ib_log.c +++ b/src/uct/ib/base/ib_log.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ib_log.h" #include @@ -26,11 +30,11 @@ const char *uct_ib_qp_type_str(int qp_type) } } -void uct_ib_log_dump_opcode(uct_ib_opcode_t *op, int signal, int fence, int se, +void uct_ib_log_dump_opcode(uct_ib_opcode_t *op, int sig, int fence, int se, char *buf, size_t max) { snprintf(buf, max, "%s %c%c%c", op->name, - signal ? 's' : '-', + sig ? 's' : '-', fence ? 'f' : '-', se ? 
'e' : '-'); } @@ -61,10 +65,11 @@ void uct_ib_log_dump_sg_list(uct_ib_iface_t *iface, uct_am_trace_type_t type, s += strlen(s); if (data_dump) { - len = ucs_min(sg_list[i].length, (void*)data + sizeof(data) - md); + len = ucs_min(sg_list[i].length, + UCS_PTR_BYTE_DIFF(md, data) + sizeof(data)); memcpy(md, (void*)sg_list[i].addr, len); - md += len; + md = UCS_PTR_BYTE_OFFSET(md, len); total_len += len; total_valid_len += sg_list[i].length; } @@ -138,7 +143,7 @@ static void uct_ib_dump_wr_opcode(struct ibv_qp *qp, uint64_t wr_id, char *s = buf; char *ends = buf + max; - snprintf(s, ends - s, "QP 0x%x wrid 0x%"PRIx64, qp->qp_num, wr_id); + snprintf(s, ends - s, "QP 0x%x wrid 0x%"PRIx64" ", qp->qp_num, wr_id); s += strlen(s); uct_ib_log_dump_opcode(op, @@ -177,7 +182,9 @@ static void uct_ib_dump_wr(struct ibv_qp *qp, uct_ib_opcode_t *op, uct_ib_log_dump_atomic_cswap(wr->wr.atomic.compare_add, wr->wr.atomic.swap, s, ends - s); } - s += strlen(s); + + /* do not forget `s += strlen(s);` here if you are + * processing more information for dumping below */ } } @@ -193,7 +200,7 @@ static void uct_ib_dump_send_wr(uct_ib_iface_t *iface, struct ibv_qp *qp, [IBV_WR_SEND_WITH_IMM] = { "SEND_IMM", 0 }, [IBV_WR_ATOMIC_CMP_AND_SWP] = { "CSWAP", UCT_IB_OPCODE_FLAG_HAS_ATOMIC }, [IBV_WR_ATOMIC_FETCH_AND_ADD] = { "FETCH_ADD", UCT_IB_OPCODE_FLAG_HAS_ATOMIC }, - }; + }; char *s = buf; char *ends = buf + max; @@ -236,7 +243,7 @@ void __uct_ib_log_recv_completion(const char *file, int line, const char *functi len = length; if (iface->config.qp_type == IBV_QPT_UD) { len -= UCT_IB_GRH_LEN; - data += UCT_IB_GRH_LEN; + data = UCS_PTR_BYTE_OFFSET(data, UCT_IB_GRH_LEN); } uct_ib_log_dump_recv_completion(iface, l_qp, r_qp, slid, data, len, packet_dump_cb, buf, sizeof(buf) - 1); diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c index e5a929a5197..de38db559f9 100644 --- a/src/uct/ib/base/ib_md.c +++ b/src/uct/ib/base/ib_md.c @@ -1,13 +1,16 @@ /** - * Copyright (C) Mellanox Technologies 
Ltd. 2001-2016. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (C) The University of Tennessee and The University * of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ib_md.h" -#include "ib_alloc.h" #include "ib_device.h" #include @@ -15,22 +18,18 @@ #include #include #include +#include #include #include +#ifdef HAVE_PTHREAD_NP_H +#include +#endif #include #include -#define UCT_IB_MD_PREFIX "ib" -#define UCT_IB_MEM_ACCESS_FLAGS (IBV_ACCESS_LOCAL_WRITE | \ - IBV_ACCESS_REMOTE_WRITE | \ - IBV_ACCESS_REMOTE_READ | \ - IBV_ACCESS_REMOTE_ATOMIC) #define UCT_IB_MD_RCACHE_DEFAULT_ALIGN 16 -/* define string to use it in debug messages */ -#define UCT_IB_MD_PCI_DATA_PATH_FMT "/sys/class/infiniband/%s/device/%s" - typedef struct uct_ib_md_pci_info { double bw; /* bandwidth */ uint16_t payload; /* payload used to data transfer */ @@ -46,6 +45,13 @@ static UCS_CONFIG_DEFINE_ARRAY(pci_bw, sizeof(ucs_config_bw_spec_t), UCS_CONFIG_TYPE_BW_SPEC); +static const char *uct_ib_devx_objs[] = { + [UCT_IB_DEVX_OBJ_RCQP] = "rcqp", + [UCT_IB_DEVX_OBJ_RCSRQ] = "rcsrq", + [UCT_IB_DEVX_OBJ_DCT] = "dct", + [UCT_IB_DEVX_OBJ_DCSRQ] = "dcsrq", + NULL +}; static ucs_config_field_t uct_ib_md_config_table[] = { {"", "", NULL, @@ -63,10 +69,10 @@ static ucs_config_field_t uct_ib_md_config_table[] = { UCS_CONFIG_TYPE_TABLE(uct_md_config_rcache_table)}, {"MEM_REG_OVERHEAD", "16us", "Memory registration overhead", /* TODO take default from device */ - ucs_offsetof(uct_ib_md_config_t, uc_reg_cost.overhead), UCS_CONFIG_TYPE_TIME}, + ucs_offsetof(uct_ib_md_config_t, uc_reg_cost.c), UCS_CONFIG_TYPE_TIME}, {"MEM_REG_GROWTH", "0.06ns", "Memory registration growth rate", /* TODO take default from device */ - ucs_offsetof(uct_ib_md_config_t, uc_reg_cost.growth), UCS_CONFIG_TYPE_TIME}, + ucs_offsetof(uct_ib_md_config_t, uc_reg_cost.m), 
UCS_CONFIG_TYPE_TIME}, {"FORK_INIT", "try", "Initialize a fork-safe IB library with ibv_fork_init().", @@ -105,24 +111,27 @@ static ucs_config_field_t uct_ib_md_config_table[] = { {"DEVICE_SPECS", "", "Array of custom device specification. Each element is a string of the following format:\n" - " :[:name[:[:]]]\n" + " :[:name[:[:]]]\n" "where:\n" - " - (mandatory) vendor id, integer or hexadecimal.\n" - " - (mandatory) vendor part id, integer or hexadecimal.\n" + " - (mandatory) pci vendor id, integer or hexadecimal.\n" + " - (mandatory) pci device id, integer or hexadecimal.\n" " - (optional) device name.\n" - " - (optional) empty, or any of: '4' - mlx4 device, '5' - mlx5 device.\n" - " - (optional) device priority, integer.\n", + " - (optional) empty, or a combination of:\n" + " '4' - mlx4 device\n" + " '5' - mlx5 device\n" + " 'd' - DC version 1 (Connect-IB, ConnectX-4)\n" + " 'D' - DC version 2 (ConnectX-5 and above)\n" + " 'a' - Compact address vector support\n" + " - (optional) device priority, integer.\n" + "\n" + "Example: The value '0x02c9:4115:ConnectX4:5d' would specify a device named ConnectX-4\n" + "to match vendor id 0x2c9, device id 4115, with DC version 1 support.", ucs_offsetof(uct_ib_md_config_t, custom_devices), UCS_CONFIG_TYPE_STRING_ARRAY}, {"PREFER_NEAREST_DEVICE", "y", "Prefer nearest device to cpu when selecting a device from NET_DEVICES list.\n", ucs_offsetof(uct_ib_md_config_t, ext.prefer_nearest_device), UCS_CONFIG_TYPE_BOOL}, - {"CONTIG_PAGES", "n", - "Enable allocation with contiguous pages. 
Warning: enabling this option may\n" - "cause stack smashing.\n", - ucs_offsetof(uct_ib_md_config_t, ext.enable_contig_pages), UCS_CONFIG_TYPE_BOOL}, - {"INDIRECT_ATOMIC", "y", "Use indirect atomic\n", ucs_offsetof(uct_ib_md_config_t, ext.enable_indirect_atomic), UCS_CONFIG_TYPE_BOOL}, @@ -141,7 +150,7 @@ static ucs_config_field_t uct_ib_md_config_table[] = { "Use GPU Direct RDMA for HCA to access GPU pages directly\n", ucs_offsetof(uct_ib_md_config_t, ext.enable_gpudirect_rdma), UCS_CONFIG_TYPE_TERNARY}, -#if HAVE_EXP_UMR +#ifdef HAVE_EXP_UMR {"MAX_INLINE_KLM_LIST", "inf", "When posting a UMR, KLM lists shorter or equal to this value will be posted as inline.\n" "The actual maximal length is also limited by device capabilities.", @@ -152,10 +161,38 @@ static ucs_config_field_t uct_ib_md_config_table[] = { "Maximum effective data transfer rate of PCI bus connected to HCA\n", ucs_offsetof(uct_ib_md_config_t, pci_bw), UCS_CONFIG_TYPE_ARRAY(pci_bw)}, + {"MLX5_DEVX", "try", + "DEVX support\n", + ucs_offsetof(uct_ib_md_config_t, devx), UCS_CONFIG_TYPE_TERNARY}, + + {"MLX5_DEVX_OBJECTS", "rcqp,rcsrq,dct,dcsrq", + "Objects to be created by DevX\n", + ucs_offsetof(uct_ib_md_config_t, devx_objs), + UCS_CONFIG_TYPE_BITMAP(uct_ib_devx_objs)}, + + {"REG_MT_THRESH", "4G", + "Minimal MR size to be register using multiple parallel threads.\n" + "Number of threads used will be determined by number of CPUs which " + "registering thread is bound to by hard affinity.", + ucs_offsetof(uct_ib_md_config_t, ext.min_mt_reg), UCS_CONFIG_TYPE_MEMUNITS}, + + {"REG_MT_CHUNK", "2G", + "Size of single chunk used in multithreaded registration.\n" + "Must be power of 2.", + ucs_offsetof(uct_ib_md_config_t, ext.mt_reg_chunk), UCS_CONFIG_TYPE_MEMUNITS}, + + {"REG_MT_BIND", "n", + "Enable setting CPU affinity of memory registration threads.", + ucs_offsetof(uct_ib_md_config_t, ext.mt_reg_bind), UCS_CONFIG_TYPE_BOOL}, + + {"PCI_RELAXED_ORDERING", "auto", + "Enable relaxed ordering for PCIe 
transactions to improve performance on some systems.", + ucs_offsetof(uct_ib_md_config_t, mr_relaxed_order), UCS_CONFIG_TYPE_ON_OFF_AUTO}, + {NULL} }; -#if ENABLE_STATS +#ifdef ENABLE_STATS static ucs_stats_class_t uct_ib_md_stats_class = { .name = "", .num_counters = UCT_IB_MD_STAT_LAST, @@ -199,17 +236,33 @@ static const uct_ib_md_pci_info_t uct_ib_md_pci_info[] = { }, }; -UCS_LIST_HEAD(uct_ib_md_open_list); +UCS_LIST_HEAD(uct_ib_md_ops_list); + +typedef struct uct_ib_verbs_mem { + uct_ib_mem_t super; + uct_ib_mr_t mrs[]; +} uct_ib_verbs_mem_t; + +typedef struct { + pthread_t thread; + void *addr; + size_t len; + size_t chunk; + uint64_t access; + struct ibv_pd *pd; + struct ibv_mr **mr; +} uct_ib_md_mem_reg_thread_t; static void uct_ib_check_gpudirect_driver(uct_ib_md_t *md, uct_md_attr_t *md_attr, - const char *file, int mem_type, - const char *name) + const char *file, + ucs_memory_type_t mem_type) { - if (!access(file, F_OK)) + if (!access(file, F_OK)) { md_attr->cap.reg_mem_types |= UCS_BIT(mem_type); + } ucs_debug("%s: %s GPUDirect RDMA is %s", - uct_ib_device_name(&md->dev), name, + uct_ib_device_name(&md->dev), ucs_memory_type_names[mem_type], md_attr->cap.reg_mem_types & UCS_BIT(mem_type) ? 
"enabled" : "disabled"); } @@ -224,20 +277,22 @@ static ucs_status_t uct_ib_md_query(uct_md_h uct_md, uct_md_attr_t *md_attr) UCT_MD_FLAG_NEED_MEMH | UCT_MD_FLAG_NEED_RKEY | UCT_MD_FLAG_ADVISE; - md_attr->cap.reg_mem_types = UCS_BIT(UCT_MD_MEM_TYPE_HOST); + md_attr->cap.reg_mem_types = UCS_MEMORY_TYPES_CPU_ACCESSIBLE; + md_attr->cap.access_mem_type = UCS_MEMORY_TYPE_HOST; + md_attr->cap.detect_mem_types = 0; if (md->config.enable_gpudirect_rdma != UCS_NO) { /* check if GDR driver is loaded */ uct_ib_check_gpudirect_driver(md, md_attr, "/sys/kernel/mm/memory_peers/nv_mem/version", - UCT_MD_MEM_TYPE_CUDA, "CUDA"); + UCS_MEMORY_TYPE_CUDA); /* check if ROCM KFD driver is loaded */ uct_ib_check_gpudirect_driver(md, md_attr, "/dev/kfd", - UCT_MD_MEM_TYPE_ROCM, "ROCM"); + UCS_MEMORY_TYPE_ROCM); - if (!(md_attr->cap.reg_mem_types & ~UCS_BIT(UCT_MD_MEM_TYPE_HOST)) && - md->config.enable_gpudirect_rdma == UCS_YES) { + if (!(md_attr->cap.reg_mem_types & ~UCS_BIT(UCS_MEMORY_TYPE_HOST)) && + (md->config.enable_gpudirect_rdma == UCS_YES)) { ucs_error("%s: Couldn't enable GPUDirect RDMA. 
Please make sure" " nv_peer_mem or amdgpu plugin installed correctly.", uct_ib_device_name(&md->dev)); @@ -245,191 +300,22 @@ static ucs_status_t uct_ib_md_query(uct_md_h uct_md, uct_md_attr_t *md_attr) } } - md_attr->cap.mem_type = UCT_MD_MEM_TYPE_HOST; md_attr->rkey_packed_size = UCT_IB_MD_PACKED_RKEY_SIZE; + md_attr->reg_cost = md->reg_cost; + ucs_sys_cpuset_copy(&md_attr->local_cpus, &md->dev.local_cpus); - if (md->config.enable_contig_pages && - IBV_EXP_HAVE_CONTIG_PAGES(&md->dev.dev_attr)) - { - md_attr->cap.flags |= UCT_MD_FLAG_ALLOC; - } - - md_attr->reg_cost = md->reg_cost; - md_attr->local_cpus = md->dev.local_cpus; return UCS_OK; } -static ucs_status_t uct_ib_md_umr_qp_create(uct_ib_md_t *md) -{ -#if HAVE_EXP_UMR - struct ibv_exp_qp_init_attr qp_init_attr; - struct ibv_qp_attr qp_attr; - uint8_t port_num; - int ret; - uct_ib_device_t *ibdev; - struct ibv_exp_port_attr *port_attr; - int is_roce_v2; - - ibdev = &md->dev; - - if (!(ibdev->dev_attr.exp_device_cap_flags & IBV_EXP_DEVICE_UMR) || - !md->config.enable_indirect_atomic) { - return UCS_ERR_UNSUPPORTED; - } - - /* TODO: fix port selection. 
It looks like active port should be used */ - port_num = ibdev->first_port; - port_attr = uct_ib_device_port_attr(ibdev, port_num); - - memset(&qp_init_attr, 0, sizeof(qp_init_attr)); - - md->umr_cq = ibv_create_cq(ibdev->ibv_context, 1, NULL, NULL, 0); - if (md->umr_cq == NULL) { - ucs_error("failed to create UMR CQ: %m"); - goto err; - } - - md->config.max_inline_klm_list = ucs_min(md->config.max_inline_klm_list, - ibdev->dev_attr.umr_caps.max_send_wqe_inline_klms); - - qp_init_attr.qp_type = IBV_QPT_RC; - qp_init_attr.send_cq = md->umr_cq; - qp_init_attr.recv_cq = md->umr_cq; - qp_init_attr.cap.max_inline_data = 0; - qp_init_attr.cap.max_recv_sge = 1; - qp_init_attr.cap.max_send_sge = 1; - qp_init_attr.srq = NULL; - qp_init_attr.cap.max_recv_wr = 16; - qp_init_attr.cap.max_send_wr = 16; - qp_init_attr.pd = md->pd; - qp_init_attr.comp_mask = IBV_EXP_QP_INIT_ATTR_PD|IBV_EXP_QP_INIT_ATTR_MAX_INL_KLMS; - qp_init_attr.max_inl_recv = 0; - qp_init_attr.max_inl_send_klms = md->config.max_inline_klm_list; - -#if HAVE_IBV_EXP_QP_CREATE_UMR - qp_init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS; - qp_init_attr.exp_create_flags = IBV_EXP_QP_CREATE_UMR; -#endif - - md->umr_qp = ibv_exp_create_qp(ibdev->ibv_context, &qp_init_attr); - if (md->umr_qp == NULL) { - ucs_error("failed to create UMR QP: %m"); - goto err_destroy_cq; - } - - memset(&qp_attr, 0, sizeof(qp_attr)); - - /* Modify QP to INIT state */ - qp_attr.qp_state = IBV_QPS_INIT; - qp_attr.pkey_index = 0; - qp_attr.port_num = port_num; - qp_attr.qp_access_flags = UCT_IB_MEM_ACCESS_FLAGS; - ret = ibv_modify_qp(md->umr_qp, &qp_attr, - IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS); - if (ret) { - ucs_error("Failed to modify UMR QP to INIT: %m"); - goto err_destroy_qp; - } - - /* Modify to RTR */ - qp_attr.qp_state = IBV_QPS_RTR; - qp_attr.dest_qp_num = md->umr_qp->qp_num; - - memset(&qp_attr.ah_attr, 0, sizeof(qp_attr.ah_attr)); - qp_attr.ah_attr.port_num = port_num; - qp_attr.ah_attr.dlid 
= port_attr->lid; - qp_attr.ah_attr.is_global = 1; - if (uct_ib_device_query_gid(ibdev, port_num, UCT_IB_MD_DEFAULT_GID_INDEX, - &qp_attr.ah_attr.grh.dgid, &is_roce_v2) != UCS_OK) { - goto err_destroy_qp; - } - qp_attr.rq_psn = 0; - qp_attr.path_mtu = IBV_MTU_512; - qp_attr.min_rnr_timer = 7; - qp_attr.max_dest_rd_atomic = 1; - ret = ibv_modify_qp(md->umr_qp, &qp_attr, - IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | - IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER); - if (ret) { - ucs_error("Failed to modify UMR QP to RTR: %m"); - goto err_destroy_qp; - } - - /* Modify to RTS */ - qp_attr.qp_state = IBV_QPS_RTS; - qp_attr.sq_psn = 0; - qp_attr.timeout = 7; - qp_attr.rnr_retry = 7; - qp_attr.retry_cnt = 7; - qp_attr.max_rd_atomic = 1; - ret = ibv_modify_qp(md->umr_qp, &qp_attr, - IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | - IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | - IBV_QP_MAX_QP_RD_ATOMIC); - if (ret) { - ucs_error("Failed to modify UMR QP to RTS: %m"); - goto err_destroy_qp; - } - - ucs_debug("initialized UMR QP 0x%x, max_inline_klm_list %u", - md->umr_qp->qp_num, md->config.max_inline_klm_list); - return UCS_OK; - -err_destroy_qp: - ibv_destroy_qp(md->umr_qp); -err_destroy_cq: - ibv_destroy_cq(md->umr_cq); -err: - return UCS_ERR_IO_ERROR; -#else - return UCS_ERR_UNSUPPORTED; -#endif -} - -static void uct_ib_md_umr_qp_destroy(uct_ib_md_t *md) -{ -#if HAVE_EXP_UMR - if (md->umr_qp != NULL) { - ibv_destroy_qp(md->umr_qp); - } - if (md->umr_cq != NULL) { - ibv_destroy_cq(md->umr_cq); - } -#endif -} - -uint8_t uct_ib_md_get_atomic_mr_id(uct_ib_md_t *md) -{ -#if HAVE_EXP_UMR - if ((md->umr_qp == NULL) || (md->umr_cq == NULL)) { - return 0; - } - /* Generate atomic UMR id. We want umrs for same virtual addresses to have - * different ids across processes. - * - * Usually parallel processes running on the same node as part of a single - * job will have consecutive PIDs. For example MPI ranks, slurm spawned tasks... 
- */ - return getpid() % 256; -#else - return 0; -#endif -} - static void uct_ib_md_print_mem_reg_err_msg(ucs_log_level_t level, void *address, - size_t length, uint64_t exp_access, - const char *exp_prefix, int line) + size_t length, uint64_t access_flags) { char msg[200] = {0}; struct rlimit limit_info; - if (!ucs_log_is_enabled(level)) { - return; - } - ucs_snprintf_zero(msg, sizeof(msg), - "ibv_%sreg_mr(address=%p, length=%zu, %saccess=0x%lx) failed: %m", - exp_prefix, address, length, exp_prefix, exp_access); + "%s(address=%p, length=%zu, access=0x%lx) failed: %m", + ibv_reg_mr_func_name, address, length, access_flags); /* Check the value of the max locked memory which is set on the system * (ulimit -l) */ @@ -440,220 +326,210 @@ static void uct_ib_md_print_mem_reg_err_msg(ucs_log_level_t level, void *address "(current: %llu kbytes)", limit_info.rlim_cur / UCS_KBYTE); } - ucs_log_dispatch(__FILE__, line, "??", level, "%s", msg); + ucs_log(level, "%s", msg); } -static ucs_status_t uct_ib_md_reg_mr(uct_ib_md_t *md, void *address, - size_t length, uint64_t exp_access, - int silent, struct ibv_mr **mr_p) +void *uct_ib_md_mem_handle_thread_func(void *arg) { - ucs_log_level_t level = silent ? 
UCS_LOG_LEVEL_DEBUG : UCS_LOG_LEVEL_ERROR; - struct ibv_mr *mr; - - if (exp_access) { -#if HAVE_DECL_IBV_EXP_REG_MR - struct ibv_exp_reg_mr_in in; - - memset(&in, 0, sizeof(in)); - in.pd = md->pd; - in.addr = address; - in.length = length; - in.exp_access = UCT_IB_MEM_ACCESS_FLAGS | exp_access; - - mr = UCS_PROFILE_CALL(ibv_exp_reg_mr, &in); - if (mr == NULL) { - uct_ib_md_print_mem_reg_err_msg(level, in.addr, in.length, - in.exp_access, "exp_", __LINE__); - return UCS_ERR_IO_ERROR; - } -#else - return UCS_ERR_UNSUPPORTED; -#endif - } else { - mr = UCS_PROFILE_CALL(ibv_reg_mr, md->pd, address, length, - UCT_IB_MEM_ACCESS_FLAGS); - if (mr == NULL) { - uct_ib_md_print_mem_reg_err_msg(level, address, length, - UCT_IB_MEM_ACCESS_FLAGS, "", - __LINE__); - return UCS_ERR_IO_ERROR; + uct_ib_md_mem_reg_thread_t *ctx = arg; + ucs_status_t status; + int mr_idx = 0; + size_t size = 0; + ucs_time_t UCS_V_UNUSED t0 = ucs_get_time(); + + while (ctx->len) { + size = ucs_min(ctx->len, ctx->chunk); + if (ctx->access != UCT_IB_MEM_DEREG) { + ctx->mr[mr_idx] = UCS_PROFILE_NAMED_CALL(ibv_reg_mr_func_name, + ibv_reg_mr, ctx->pd, + ctx->addr, size, + ctx->access); + if (ctx->mr[mr_idx] == NULL) { + return UCS_STATUS_PTR(UCS_ERR_IO_ERROR); + } + } else { + status = uct_ib_dereg_mr(ctx->mr[mr_idx]); + if (status != UCS_OK) { + return UCS_STATUS_PTR(status); + } } + ctx->addr = UCS_PTR_BYTE_OFFSET(ctx->addr, size); + ctx->len -= size; + mr_idx++; } - *mr_p = mr; - return UCS_OK; + ucs_trace("%s %p..%p took %f usec\n", + (ctx->access == UCT_IB_MEM_DEREG) ? 
"dereg_mr" : "reg_mr", + ctx->mr[0]->addr, + UCS_PTR_BYTE_OFFSET(ctx->mr[mr_idx-1]->addr, size), + ucs_time_to_usec(ucs_get_time() - t0)); + + return UCS_STATUS_PTR(UCS_OK); } -#if HAVE_EXP_UMR -static ucs_status_t uct_ib_verbs_md_post_umr(uct_ib_md_t *md, struct ibv_mr *mr, - void *base_addr, - struct ibv_mr **indirect_mr_p) +ucs_status_t +uct_ib_md_handle_mr_list_multithreaded(uct_ib_md_t *md, void *address, + size_t length, uint64_t access_flags, + size_t chunk, struct ibv_mr **mrs) { - struct ibv_exp_mem_region *mem_reg = NULL; - struct ibv_exp_send_wr wr, *bad_wr; - struct ibv_exp_create_mr_in mrin; + int thread_num_mrs, thread_num, thread_idx, mr_idx = 0, cpu_id = 0; + int mr_num = ucs_div_round_up(length, chunk); ucs_status_t status; - struct ibv_mr *umr; - struct ibv_wc wc; - int i, list_size; - size_t reg_length; + void *thread_status; + ucs_sys_cpuset_t parent_set, thread_set; + uct_ib_md_mem_reg_thread_t *ctxs, *cur_ctx; + pthread_attr_t attr; + char UCS_V_UNUSED affinity_str[64]; int ret; - if (md->umr_qp == NULL) { - status = UCS_ERR_UNSUPPORTED; - goto err; + ret = pthread_getaffinity_np(pthread_self(), sizeof(ucs_sys_cpuset_t), + &parent_set); + if (ret != 0) { + ucs_error("pthread_getaffinity_np() failed: %m"); + return UCS_ERR_INVALID_PARAM; } - /* Create and fill memory key */ - memset(&mrin, 0, sizeof(mrin)); - memset(&wr, 0, sizeof(wr)); + thread_num = ucs_min(CPU_COUNT(&parent_set), mr_num); - mrin.pd = md->pd; - wr.exp_opcode = IBV_EXP_WR_UMR_FILL; - wr.exp_send_flags = IBV_EXP_SEND_SIGNALED; - wr.ext_op.umr.exp_access = UCT_IB_MEM_ACCESS_FLAGS; + ucs_trace("multithreaded handle %p..%p access %lx threads %d affinity %s\n", + address, UCS_PTR_BYTE_OFFSET(address, length), access_flags, thread_num, + ucs_make_affinity_str(&parent_set, affinity_str, sizeof(affinity_str))); - reg_length = UCT_IB_MD_MAX_MR_SIZE; -#ifdef HAVE_EXP_UMR_KSM - if ((md->dev.dev_attr.comp_mask & IBV_EXP_DEVICE_ATTR_COMP_MASK_2) && - (md->dev.dev_attr.comp_mask_2 & 
IBV_EXP_DEVICE_ATTR_UMR_FIXED_SIZE_CAPS) && - (md->dev.dev_attr.exp_device_cap_flags & IBV_EXP_DEVICE_UMR_FIXED_SIZE)) - { - reg_length = md->dev.dev_attr.umr_fixed_size_caps.max_entity_size; - list_size = ucs_div_round_up(mr->length, reg_length); - } else if (mr->length < reg_length) { - list_size = 1; - } else { - status = UCS_ERR_UNSUPPORTED; - goto err; + if (thread_num == 1) { + return UCS_ERR_UNSUPPORTED; } - if (list_size > 1) { - mrin.attr.create_flags = IBV_EXP_MR_FIXED_BUFFER_SIZE; - wr.ext_op.umr.umr_type = IBV_EXP_UMR_MR_LIST_FIXED_SIZE; - } else { - mrin.attr.create_flags = IBV_EXP_MR_INDIRECT_KLMS; - wr.ext_op.umr.umr_type = IBV_EXP_UMR_MR_LIST; - } -#else - if (mr->length >= reg_length) { - status = UCS_ERR_UNSUPPORTED; - goto err; + ctxs = ucs_calloc(thread_num, sizeof(*ctxs), "ib mr ctxs"); + if (ctxs == NULL) { + return UCS_ERR_NO_MEMORY; } - list_size = 1; - mrin.attr.create_flags = IBV_EXP_MR_INDIRECT_KLMS; - wr.ext_op.umr.umr_type = IBV_EXP_UMR_MR_LIST; -#endif /* HAVE_EXP_UMR_KSM */ + pthread_attr_init(&attr); - mrin.attr.exp_access_flags = UCT_IB_MEM_ACCESS_FLAGS; - mrin.attr.max_klm_list_size = list_size; - mem_reg = ucs_calloc(list_size, sizeof(mem_reg[0]), "mem_reg"); - if (!mem_reg) { - status = UCS_ERR_NO_MEMORY; - goto err; - } + status = UCS_OK; + for (thread_idx = 0; thread_idx < thread_num; thread_idx++) { + /* calculate number of mrs for each thread so each one will + * get proportional amount */ + thread_num_mrs = ucs_div_round_up(mr_num - mr_idx, thread_num - thread_idx); + + cur_ctx = &ctxs[thread_idx]; + cur_ctx->pd = md->pd; + cur_ctx->addr = UCS_PTR_BYTE_OFFSET(address, mr_idx * chunk); + cur_ctx->len = ucs_min(thread_num_mrs * chunk, length - (mr_idx * chunk)); + cur_ctx->access = access_flags; + cur_ctx->mr = &mrs[mr_idx]; + cur_ctx->chunk = chunk; + + if (md->config.mt_reg_bind) { + while (!CPU_ISSET(cpu_id, &parent_set)) { + cpu_id++; + } - umr = ibv_exp_create_mr(&mrin); - if (!umr) { - ucs_error("Failed to create 
modified_mr: %m"); - status = UCS_ERR_NO_MEMORY; - goto err; - } + CPU_ZERO(&thread_set); + CPU_SET(cpu_id, &thread_set); + cpu_id++; + pthread_attr_setaffinity_np(&attr, sizeof(ucs_sys_cpuset_t), &thread_set); + } - for (i = 0; i < list_size; i++) { - mem_reg[i].base_addr = (uintptr_t)mr->addr + i * reg_length; - mem_reg[i].length = reg_length; - mem_reg[i].mr = mr; + ret = pthread_create(&cur_ctx->thread, &attr, + uct_ib_md_mem_handle_thread_func, cur_ctx); + if (ret) { + ucs_error("pthread_create() failed: %m"); + status = UCS_ERR_IO_ERROR; + thread_num = thread_idx; + break; + } + + mr_idx += thread_num_mrs; } - ucs_assert(list_size >= 1); - mem_reg[list_size - 1].length = mr->length % reg_length; - wr.ext_op.umr.mem_list.mem_reg_list = mem_reg; - wr.ext_op.umr.base_addr = (uintptr_t)base_addr; - wr.ext_op.umr.num_mrs = list_size; - wr.ext_op.umr.modified_mr = umr; + for (thread_idx = 0; thread_idx < thread_num; thread_idx++) { + cur_ctx = &ctxs[thread_idx]; + pthread_join(cur_ctx->thread, &thread_status); + if (UCS_PTR_IS_ERR(UCS_OK)) { + status = UCS_PTR_STATUS(thread_status); + } + } - /* If the list exceeds max inline size, allocate a container object */ - if (list_size > md->config.max_inline_klm_list) { - struct ibv_exp_mkey_list_container_attr in = { - .pd = md->pd, - .mkey_list_type = IBV_EXP_MKEY_LIST_TYPE_INDIRECT_MR, - .max_klm_list_size = list_size - }; + ucs_free(ctxs); + pthread_attr_destroy(&attr); - wr.ext_op.umr.memory_objects = ibv_exp_alloc_mkey_list_memory(&in); - if (wr.ext_op.umr.memory_objects == NULL) { - ucs_error("ibv_exp_alloc_mkey_list_memory(list_size=%d) failed: %m", - list_size); - status = UCS_ERR_IO_ERROR; - goto err_free_umr; + if (status != UCS_OK) { + for (mr_idx = 0; mr_idx < mr_num; mr_idx++) { + /* coverity[check_return] */ + uct_ib_dereg_mr(mrs[mr_idx]); } - } else { - wr.ext_op.umr.memory_objects = NULL; - wr.exp_send_flags |= IBV_EXP_SEND_INLINE; } - ucs_trace_data("UMR_FILL qp 0x%x lkey 0x%x base 0x%lx [addr %lx len 
%zu lkey 0x%x] list_size %d", - md->umr_qp->qp_num, wr.ext_op.umr.modified_mr->lkey, - wr.ext_op.umr.base_addr, mem_reg[0].base_addr, - mem_reg[0].length, mem_reg[0].mr->lkey, list_size); + return status; +} - /* Post UMR */ - ret = ibv_exp_post_send(md->umr_qp, &wr, &bad_wr); - if (ret) { - ucs_error("ibv_exp_post_send(UMR_FILL) failed: %m"); - status = UCS_ERR_IO_ERROR; - goto err_free_klm_container; - } +static ucs_status_t uct_ib_md_reg_mr(uct_ib_md_t *md, void *address, + size_t length, uint64_t access_flags, + int silent, uct_ib_mem_t *memh, + uct_ib_mr_type_t mr_type) +{ + ucs_log_level_t level = silent ? UCS_LOG_LEVEL_DEBUG : UCS_LOG_LEVEL_ERROR; + ucs_status_t status; - /* Wait for send UMR completion */ - for (;;) { - ret = ibv_poll_cq(md->umr_cq, 1, &wc); - if (ret < 0) { - ucs_error("ibv_exp_poll_cq(umr_cq) failed: %m"); - status = UCS_ERR_IO_ERROR; - goto err_free_klm_container; + if (length >= md->config.min_mt_reg) { + UCS_PROFILE_CODE("reg ksm") { + status = md->ops->reg_multithreaded(md, address, length, + access_flags, memh, mr_type); } - if (ret == 1) { - if (wc.status != IBV_WC_SUCCESS) { - ucs_error("UMR_FILL completed with error: %s vendor_err %d", - ibv_wc_status_str(wc.status), wc.vendor_err); - status = UCS_ERR_IO_ERROR; - goto err_free_klm_container; + + if (status != UCS_ERR_UNSUPPORTED) { + if (status == UCS_OK) { + memh->flags |= UCT_IB_MEM_MULTITHREADED; + } else { + uct_ib_md_print_mem_reg_err_msg(level, address, length, + access_flags); } - break; - } - } - if (wr.ext_op.umr.memory_objects != NULL) { - ibv_exp_dealloc_mkey_list_memory(wr.ext_op.umr.memory_objects); + return status; + } /* if unsuported - fallback to regular registration */ } - ucs_debug("UMR registered memory 0x%lx..0x%lx/%p on %s lkey 0x%x rkey 0x%x", - (uintptr_t)mr->addr, (uintptr_t)mr->addr + mr->length, base_addr, - uct_ib_device_name(&md->dev), umr->lkey, umr->rkey); - *indirect_mr_p = umr; + status = md->ops->reg_key(md, address, length, access_flags, memh, 
mr_type); + if (status != UCS_OK) { + uct_ib_md_print_mem_reg_err_msg(level, address, length, access_flags); + return status; + } - ucs_free(mem_reg); return UCS_OK; +} -err_free_klm_container: - if (wr.ext_op.umr.memory_objects != NULL) { - ibv_exp_dealloc_mkey_list_memory(wr.ext_op.umr.memory_objects); +ucs_status_t uct_ib_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t access_flags, struct ibv_mr **mr_p) +{ + struct ibv_mr *mr; +#if HAVE_DECL_IBV_EXP_REG_MR + struct ibv_exp_reg_mr_in in = {}; + + in.pd = pd; + in.addr = addr; + in.length = length; + in.exp_access = access_flags; + mr = UCS_PROFILE_CALL(ibv_exp_reg_mr, &in); +#else + mr = UCS_PROFILE_CALL(ibv_reg_mr, pd, addr, length, access_flags); +#endif + if (mr == NULL) { + return UCS_ERR_IO_ERROR; } -err_free_umr: - UCS_PROFILE_CALL(ibv_dereg_mr, umr); -err: - ucs_free(mem_reg); - return status; + + *mr_p = mr; + return UCS_OK; } -#endif /* HAVE_EXP_UMR */ -static ucs_status_t uct_ib_dereg_mr(struct ibv_mr *mr) +ucs_status_t uct_ib_dereg_mr(struct ibv_mr *mr) { int ret; + if (mr == NULL) { + return UCS_OK; + } + ret = UCS_PROFILE_CALL(ibv_dereg_mr, mr); if (ret != 0) { ucs_error("ibv_dereg_mr() failed: %m"); @@ -663,19 +539,57 @@ static ucs_status_t uct_ib_dereg_mr(struct ibv_mr *mr) return UCS_OK; } +ucs_status_t uct_ib_dereg_mrs(struct ibv_mr **mrs, size_t mr_num) +{ + ucs_status_t s, status = UCS_OK; + int i; + + for (i = 0; i < mr_num; i++) { + s = uct_ib_dereg_mr(mrs[i]); + if (s != UCS_OK) { + status = s; + } + } + + return status; +} + +static ucs_status_t uct_ib_memh_dereg_key(uct_ib_md_t *md, uct_ib_mem_t *memh, + uct_ib_mr_type_t mr_type) +{ + if (memh->flags & UCT_IB_MEM_MULTITHREADED) { + return md->ops->dereg_multithreaded(md, memh, mr_type); + } else { + return md->ops->dereg_key(md, memh, mr_type); + } +} + static ucs_status_t uct_ib_memh_dereg(uct_ib_md_t *md, uct_ib_mem_t *memh) { - ucs_status_t s1, s2; + ucs_status_t s, status = UCS_OK; - s1 = s2 = UCS_OK; if (memh->flags & 
UCT_IB_MEM_FLAG_ATOMIC_MR) { - s2 = md->ops->dereg_atomic_key(md, memh); + s = md->ops->dereg_atomic_key(md, memh); memh->flags &= ~UCT_IB_MEM_FLAG_ATOMIC_MR; + if (s != UCS_OK) { + status = s; + } } - if (memh->mr != NULL) { - s1 = uct_ib_dereg_mr(memh->mr); + + if (memh->flags & UCT_IB_MEM_FLAG_RELAXED_ORDERING) { + s = uct_ib_memh_dereg_key(md, memh, UCT_IB_MR_STRICT_ORDER); + memh->flags &= ~UCT_IB_MEM_FLAG_RELAXED_ORDERING; + if (s != UCS_OK) { + status = s; + } + } + + s = uct_ib_memh_dereg_key(md, memh, UCT_IB_MR_DEFAULT); + if (s != UCS_OK) { + status = s; } - return (s1 != UCS_OK) ? s1 : s2; + + return status; } static void uct_ib_memh_free(uct_ib_mem_t *memh) @@ -685,23 +599,29 @@ static void uct_ib_memh_free(uct_ib_mem_t *memh) static uct_ib_mem_t *uct_ib_memh_alloc(uct_ib_md_t *md) { - return ucs_calloc(1, md->ops->memh_struct_size, "ib_memh"); + return ucs_calloc(1, md->memh_struct_size, "ib_memh"); } static uint64_t uct_ib_md_access_flags(uct_ib_md_t *md, unsigned flags, size_t length) { - uint64_t exp_access = 0; + uint64_t access_flags = UCT_IB_MEM_ACCESS_FLAGS; if ((flags & UCT_MD_MEM_FLAG_NONBLOCK) && (length > 0) && (length <= md->config.odp.max_size)) { - exp_access |= IBV_EXP_ACCESS_ON_DEMAND; + access_flags |= IBV_ACCESS_ON_DEMAND; + } + + if (md->relaxed_order) { + access_flags |= IBV_ACCESS_RELAXED_ORDERING; } - return exp_access; + + return access_flags; } #if HAVE_NUMA -static ucs_status_t uct_ib_mem_set_numa_policy(uct_ib_md_t *md, uct_ib_mem_t *memh) +static ucs_status_t uct_ib_mem_set_numa_policy(uct_ib_md_t *md, void *address, + size_t length, uct_ib_mem_t *memh) { int ret, old_policy, new_policy; struct bitmask *nodemask; @@ -759,8 +679,8 @@ static ucs_status_t uct_ib_mem_set_numa_policy(uct_ib_md_t *md, uct_ib_mem_t *me } if (new_policy != old_policy) { - start = ucs_align_down_pow2((uintptr_t)memh->mr->addr, ucs_get_page_size()); - end = ucs_align_up_pow2((uintptr_t)memh->mr->addr + memh->mr->length, + start = 
ucs_align_down_pow2((uintptr_t)address, ucs_get_page_size()); + end = ucs_align_up_pow2((uintptr_t)address + length, ucs_get_page_size()); ucs_trace("0x%lx..0x%lx: changing numa policy from %d to %d, " "nodemask[0]=0x%lx", start, end, old_policy, new_policy, @@ -785,50 +705,20 @@ static ucs_status_t uct_ib_mem_set_numa_policy(uct_ib_md_t *md, uct_ib_mem_t *me return status; } #else -static ucs_status_t uct_ib_mem_set_numa_policy(uct_ib_md_t *md, uct_ib_mem_t *memh) +static ucs_status_t uct_ib_mem_set_numa_policy(uct_ib_md_t *md, void *address, + size_t length, uct_ib_mem_t *memh) { return UCS_OK; } #endif /* UCT_MD_DISABLE_NUMA */ -static ucs_status_t -uct_ib_mem_prefetch_internal(uct_ib_md_t *md, uct_ib_mem_t *memh, void *addr, size_t length) -{ -#if HAVE_DECL_IBV_EXP_PREFETCH_MR - struct ibv_exp_prefetch_attr attr; - int ret; - - if ((memh->flags & UCT_IB_MEM_FLAG_ODP)) { - if ((addr < memh->mr->addr) || - (addr + length > memh->mr->addr + memh->mr->length)) { - return UCS_ERR_INVALID_PARAM; - } - ucs_debug("memh %p prefetch %p length %llu", memh, addr, - (unsigned long long)length); - attr.flags = IBV_EXP_PREFETCH_WRITE_ACCESS; - attr.addr = addr; - attr.length = length; - attr.comp_mask = 0; - - ret = UCS_PROFILE_CALL(ibv_exp_prefetch_mr, memh->mr, &attr); - if (ret) { - ucs_error("ibv_exp_prefetch_mr(addr=%p length=%zu) returned %d: %m", - attr.addr, attr.length, ret); - return UCS_ERR_IO_ERROR; - } - } -#endif - return UCS_OK; -} - static void uct_ib_mem_init(uct_ib_mem_t *memh, unsigned uct_flags, - uint64_t exp_access) + uint64_t access_flags) { - memh->lkey = memh->mr->lkey; memh->flags = 0; /* coverity[dead_error_condition] */ - if (exp_access & IBV_EXP_ACCESS_ON_DEMAND) { + if (access_flags & IBV_ACCESS_ON_DEMAND) { memh->flags |= UCT_IB_MEM_FLAG_ODP; } @@ -837,323 +727,150 @@ static void uct_ib_mem_init(uct_ib_mem_t *memh, unsigned uct_flags, } } -ucs_status_t uct_ib_md_alloc_device_mem(uct_md_h uct_md, size_t *length_p, - void **address_p, unsigned 
flags, - const char *alloc_name, - uct_ib_device_mem_h *dev_mem_p) +static ucs_status_t uct_ib_mem_reg_internal(uct_md_h uct_md, void *address, + size_t length, unsigned flags, + int silent, uct_ib_mem_t *memh) { -#if HAVE_IBV_EXP_DM uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); - struct ibv_exp_alloc_dm_attr dm_attr; - struct ibv_exp_reg_mr_in mr_in; - uct_ib_device_mem_t *dev_mem; ucs_status_t status; + uint64_t access_flags; - dev_mem = ucs_malloc(sizeof(*dev_mem), "ib_device_mem"); - if (dev_mem == NULL) { - status = UCS_ERR_NO_MEMORY; - goto err; - } - - /* Align the allocation to a potential use of registration cache */ - *length_p = ucs_align_up_pow2(*length_p, md->alloc_align); - - /* Allocate device memory */ - dm_attr.length = *length_p; - dm_attr.comp_mask = 0; - dev_mem->dm = UCS_PROFILE_CALL(ibv_exp_alloc_dm, md->dev.ibv_context, - &dm_attr); - if (dev_mem->dm == NULL) { - ucs_debug("ibv_exp_alloc_dm(dev=%s, length=%zu) failed: %m", - uct_ib_device_name(&md->dev), dm_attr.length); - status = UCS_ERR_NO_RESOURCE; - goto err_free_struct; - } - - /* Register device memory (the resulting key will have address==0) */ - mr_in.addr = 0; - mr_in.pd = md->pd; - mr_in.length = *length_p; - mr_in.exp_access = UCT_IB_MEM_ACCESS_FLAGS; - mr_in.comp_mask = IBV_EXP_REG_MR_DM; - mr_in.dm = dev_mem->dm; - dev_mem->mr = UCS_PROFILE_CALL(ibv_exp_reg_mr, &mr_in); - if (dev_mem->mr == NULL) { - uct_ib_md_print_mem_reg_err_msg(UCS_LOG_LEVEL_ERROR, mr_in.addr, - mr_in.length, mr_in.exp_access, "exp_", - __LINE__); - status = UCS_ERR_IO_ERROR; - goto err_free_dm; - } - - dev_mem->address = ((uct_mlx5_dm_va_t*)dev_mem->dm)->start_va; - *address_p = dev_mem->address; - *dev_mem_p = dev_mem; - ucs_list_add_tail(&md->dm_list, &dev_mem->list); - - ucs_debug("allocated device memory %p..%p on %s lkey 0x%x rkey 0x%x", - dev_mem->address, dev_mem->address + dev_mem->mr->length, - uct_ib_device_name(&md->dev), dev_mem->mr->lkey, dev_mem->mr->rkey); - return UCS_OK; - 
-err_free_dm: - UCS_PROFILE_CALL(ibv_exp_free_dm, dev_mem->dm); -err_free_struct: - ucs_free(dev_mem); -err: - return status; -#else - return UCS_ERR_UNSUPPORTED; -#endif -} - -void uct_ib_md_release_device_mem(uct_ib_device_mem_h dev_mem) -{ -#if HAVE_IBV_EXP_DM - int ret; - - ucs_list_del(&dev_mem->list); - (void)uct_ib_dereg_mr(dev_mem->mr); - - ret = UCS_PROFILE_CALL(ibv_exp_free_dm, dev_mem->dm); - if (ret) { - ucs_warn("ibv_exp_free_dm() failed: %m"); + access_flags = uct_ib_md_access_flags(md, flags, length); + uct_ib_mem_init(memh, flags, access_flags); + status = uct_ib_md_reg_mr(md, address, length, access_flags, silent, memh, + UCT_IB_MR_DEFAULT); + if (status != UCS_OK) { + return status; } - ucs_free(dev_mem); -#endif -} - -static ucs_status_t -uct_ib_md_reg_check_device_mem(uct_ib_md_t *md, void *address, size_t length, - unsigned flags, uct_ib_mem_t *memh) -{ - ucs_status_t status = UCS_ERR_NO_ELEM; -#if HAVE_IBV_EXP_DM - uct_ib_device_mem_t *dev_mem; - off_t offset; - - /* try to find a device memory object which covers the requested address range */ - ucs_list_for_each(dev_mem, &md->dm_list, list) { - if ((address >= dev_mem->address) && - (address + length <= dev_mem->address + dev_mem->mr->length)) { - status = UCS_OK; - break; + if (md->relaxed_order) { + status = uct_ib_md_reg_mr(md, address, length, + access_flags & ~IBV_ACCESS_RELAXED_ORDERING, + silent, memh, UCT_IB_MR_STRICT_ORDER); + if (status != UCS_OK) { + goto err; } - } - if (status != UCS_OK) { - goto err; /* device memory object not found */ - } - /* create access key as indirect key over DM key */ - status = uct_ib_verbs_md_post_umr(md, dev_mem->mr, address, &memh->mr); - if (status != UCS_OK) { - goto err; + memh->flags |= UCT_IB_MEM_FLAG_RELAXED_ORDERING; } - uct_ib_mem_init(memh, flags, 0); + ucs_debug("registered memory %p..%p on %s lkey 0x%x rkey 0x%x " + "access 0x%lx flags 0x%x", address, + UCS_PTR_BYTE_OFFSET(address, length), + uct_ib_device_name(&md->dev), memh->lkey, 
memh->rkey, + access_flags, flags); - /* create atomic key as indirect key over DM key with atomic offset */ - offset = uct_ib_md_atomic_offset(uct_ib_md_get_atomic_mr_id(md)); - status = uct_ib_verbs_md_post_umr(md, dev_mem->mr, address + offset, - &memh->atomic_mr); - if (status != UCS_OK) { - goto err_dereg_mr; + uct_ib_mem_set_numa_policy(md, address, length, memh); + + if (md->config.odp.prefetch) { + md->ops->mem_prefetch(md, memh, address, length); } - memh->flags |= UCT_IB_MEM_FLAG_ATOMIC_MR; - memh->atomic_rkey = memh->atomic_mr->rkey; + UCS_STATS_UPDATE_COUNTER(md->stats, UCT_IB_MD_STAT_MEM_REG, +1); return UCS_OK; -err_dereg_mr: - (void)uct_ib_dereg_mr(memh->mr); err: -#endif + uct_ib_memh_dereg(md, memh); return status; } -static ucs_status_t uct_ib_mem_alloc(uct_md_h uct_md, size_t *length_p, - void **address_p, unsigned flags, - const char *alloc_name, uct_mem_h *memh_p) +static ucs_status_t uct_ib_mem_reg(uct_md_h uct_md, void *address, size_t length, + unsigned flags, uct_mem_h *memh_p) { -#if HAVE_DECL_IBV_EXP_ACCESS_ALLOCATE_MR uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); ucs_status_t status; - uint64_t exp_access; uct_ib_mem_t *memh; - size_t length; - - if (!md->config.enable_contig_pages) { - return UCS_ERR_UNSUPPORTED; - } memh = uct_ib_memh_alloc(md); if (memh == NULL) { - status = UCS_ERR_NO_MEMORY; - goto err; - } - - length = *length_p; - exp_access = uct_ib_md_access_flags(md, flags, length) | - IBV_EXP_ACCESS_ALLOCATE_MR; - status = uct_ib_md_reg_mr(md, NULL, length, exp_access, 0, &memh->mr); - if (status != UCS_OK) { - goto err_free_memh; - } - - ucs_trace("allocated memory %p..%p on %s lkey 0x%x rkey 0x%x", - memh->mr->addr, memh->mr->addr + memh->mr->length, uct_ib_device_name(&md->dev), - memh->mr->lkey, memh->mr->rkey); - - uct_ib_mem_init(memh, flags, exp_access); - uct_ib_mem_set_numa_policy(md, memh); - - if (md->config.odp.prefetch) { - uct_ib_mem_prefetch_internal(md, memh, memh->mr->addr, memh->mr->length); + return 
UCS_ERR_NO_MEMORY; } - UCS_STATS_UPDATE_COUNTER(md->stats, UCT_IB_MD_STAT_MEM_ALLOC, +1); - ucs_memtrack_allocated(memh->mr->addr, memh->mr->length UCS_MEMTRACK_VAL); - - *address_p = memh->mr->addr; - *length_p = memh->mr->length; - *memh_p = memh; - return UCS_OK; - -err_free_memh: - uct_ib_memh_free(memh); -err: - return status; -#else - return UCS_ERR_UNSUPPORTED; -#endif -} - -static ucs_status_t -uct_ib_verbs_reg_atomic_key(struct uct_ib_md *md, uct_ib_mem_t *memh, - off_t offset) -{ -#if HAVE_EXP_UMR - uct_ib_mem_t *ib_memh = memh; - ucs_status_t status; - - status = uct_ib_verbs_md_post_umr(md, ib_memh->mr, memh->mr->addr + offset, - &memh->atomic_mr); + status = uct_ib_mem_reg_internal(uct_md, address, length, flags, 0, memh); if (status != UCS_OK) { + uct_ib_memh_free(memh); return status; } + *memh_p = memh; - memh->atomic_rkey = memh->atomic_mr->rkey; return UCS_OK; -#else - return UCS_ERR_UNSUPPORTED; -#endif } -static ucs_status_t uct_ib_verbs_dereg_atomic_key(uct_ib_md_t *md, - uct_ib_mem_t *memh) -{ -#if HAVE_EXP_UMR - return uct_ib_dereg_mr(memh->atomic_mr); -#else - return UCS_ERR_UNSUPPORTED; -#endif -} - -static ucs_status_t uct_ib_mem_free(uct_md_h uct_md, uct_mem_h memh) +static ucs_status_t uct_ib_mem_dereg(uct_md_h uct_md, uct_mem_h memh) { uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); uct_ib_mem_t *ib_memh = memh; ucs_status_t status; - ucs_memtrack_releasing(ib_memh->mr->addr); + status = uct_ib_memh_dereg(md, ib_memh); + uct_ib_memh_free(ib_memh); + return status; +} - status = UCS_PROFILE_CALL(uct_ib_memh_dereg, md, memh); - if (status != UCS_OK) { - return status; - } +static ucs_status_t uct_ib_verbs_reg_key(uct_ib_md_t *md, void *address, + size_t length, uint64_t access_flags, + uct_ib_mem_t *ib_memh, + uct_ib_mr_type_t mr_type) +{ + uct_ib_verbs_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_verbs_mem_t); - uct_ib_memh_free(ib_memh); - return UCS_OK; + return uct_ib_reg_key_impl(md, address, length, access_flags, + ib_memh, 
&memh->mrs[mr_type], mr_type); } -static ucs_status_t uct_ib_mem_reg_internal(uct_md_h uct_md, void *address, - size_t length, unsigned flags, - int silent, uct_ib_mem_t *memh) +ucs_status_t uct_ib_reg_key_impl(uct_ib_md_t *md, void *address, + size_t length, uint64_t access_flags, + uct_ib_mem_t *memh, uct_ib_mr_t *mr, + uct_ib_mr_type_t mr_type) { - uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); ucs_status_t status; - uint64_t exp_access; - status = uct_ib_md_reg_check_device_mem(md, address, length, flags, memh); - if (status != UCS_ERR_NO_ELEM) { - return status; - } - - exp_access = uct_ib_md_access_flags(md, flags, length); - status = uct_ib_md_reg_mr(md, address, length, exp_access, silent, &memh->mr); + status = uct_ib_reg_mr(md->pd, address, length, access_flags, &mr->ib); if (status != UCS_OK) { return status; } - ucs_debug("registered memory %p..%p on %s lkey 0x%x rkey 0x%x " - "exp_access 0x%lx flags 0x%x", address, address + length, - uct_ib_device_name(&md->dev), memh->mr->lkey, memh->mr->rkey, - exp_access, flags); - - uct_ib_mem_init(memh, flags, exp_access); - uct_ib_mem_set_numa_policy(md, memh); - if (md->config.odp.prefetch) { - uct_ib_mem_prefetch_internal(md, memh, memh->mr->addr, memh->mr->length); + if (mr_type == UCT_IB_MR_DEFAULT) { + uct_ib_memh_init_keys(memh, mr->ib->lkey, mr->ib->rkey); } - UCS_STATS_UPDATE_COUNTER(md->stats, UCT_IB_MD_STAT_MEM_REG, +1); return UCS_OK; } -static ucs_status_t uct_ib_mem_reg(uct_md_h uct_md, void *address, size_t length, - unsigned flags, uct_mem_h *memh_p) +static ucs_status_t uct_ib_verbs_dereg_key(uct_ib_md_t *md, + uct_ib_mem_t *ib_memh, + uct_ib_mr_type_t mr_type) { - uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); - ucs_status_t status; - uct_ib_mem_t *memh; + uct_ib_verbs_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_verbs_mem_t); - memh = uct_ib_memh_alloc(md); - if (memh == NULL) { - return UCS_ERR_NO_MEMORY; - } - - status = uct_ib_mem_reg_internal(uct_md, address, length, flags, 
0, memh); - if (status != UCS_OK) { - uct_ib_memh_free(memh); - return status; - } - *memh_p = memh; - - return UCS_OK; + return uct_ib_dereg_mr(memh->mrs[mr_type].ib); } -static ucs_status_t uct_ib_mem_dereg(uct_md_h uct_md, uct_mem_h memh) +static ucs_status_t uct_ib_verbs_reg_atomic_key(uct_ib_md_t *ibmd, + uct_ib_mem_t *ib_memh) { - uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); - uct_ib_mem_t *ib_memh = memh; - ucs_status_t status; + uct_ib_mr_type_t mr_type = uct_ib_memh_get_atomic_base_mr_type(ib_memh); + uct_ib_verbs_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_verbs_mem_t); - status = uct_ib_memh_dereg(md, ib_memh); - uct_ib_memh_free(ib_memh); - return status; + if (mr_type != UCT_IB_MR_STRICT_ORDER) { + return UCS_ERR_UNSUPPORTED; + } + + memh->super.atomic_rkey = memh->mrs[mr_type].ib->rkey; + return UCS_OK; } -static ucs_status_t -uct_ib_mem_advise(uct_md_h uct_md, uct_mem_h memh, void *addr, size_t length, - unsigned advice) +static ucs_status_t +uct_ib_mem_advise(uct_md_h uct_md, uct_mem_h memh, void *addr, + size_t length, unsigned advice) { uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); ucs_debug("memh %p advice %d", memh, advice); if ((advice == UCT_MADV_WILLNEED) && !md->config.odp.prefetch) { - return uct_ib_mem_prefetch_internal(md, memh, addr, length); + return md->ops->mem_prefetch(md, memh, addr, length); } + return UCS_OK; } @@ -1163,25 +880,24 @@ static ucs_status_t uct_ib_mkey_pack(uct_md_h uct_md, uct_mem_h uct_memh, uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); uct_ib_mem_t *memh = uct_memh; uint32_t atomic_rkey; - uint16_t umr_offset; ucs_status_t status; /* create umr only if a user requested atomic access to the * memory region and the hardware supports it. 
*/ - if ((memh->flags & UCT_IB_MEM_ACCESS_REMOTE_ATOMIC) && + if (((memh->flags & UCT_IB_MEM_ACCESS_REMOTE_ATOMIC) || + (memh->flags & UCT_IB_MEM_FLAG_RELAXED_ORDERING)) && !(memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR) && - (memh != &md->global_odp)) + (memh != md->global_odp)) { /* create UMR on-demand */ - umr_offset = uct_ib_md_atomic_offset(uct_ib_md_get_atomic_mr_id(md)); UCS_PROFILE_CODE("reg atomic key") { - status = md->ops->reg_atomic_key(md, memh, umr_offset); + status = md->ops->reg_atomic_key(md, memh); } if (status == UCS_OK) { memh->flags |= UCT_IB_MEM_FLAG_ATOMIC_MR; ucs_trace("created atomic key 0x%x for 0x%x", memh->atomic_rkey, - memh->mr->lkey); + memh->lkey); } else if (status != UCS_ERR_UNSUPPORTED) { return status; } @@ -1192,11 +908,11 @@ static ucs_status_t uct_ib_mkey_pack(uct_md_h uct_md, uct_mem_h uct_memh, atomic_rkey = UCT_IB_INVALID_RKEY; } - uct_ib_md_pack_rkey(memh->mr->rkey, atomic_rkey, rkey_buffer); + uct_ib_md_pack_rkey(memh->rkey, atomic_rkey, rkey_buffer); return UCS_OK; } -static ucs_status_t uct_ib_rkey_unpack(uct_md_component_t *mdc, +static ucs_status_t uct_ib_rkey_unpack(uct_component_t *component, const void *rkey_buffer, uct_rkey_t *rkey_p, void **handle_p) { @@ -1211,21 +927,13 @@ static ucs_status_t uct_ib_rkey_unpack(uct_md_component_t *mdc, } static uct_md_ops_t uct_ib_md_ops = { - .close = uct_ib_md_close, - .query = uct_ib_md_query, - .mem_alloc = uct_ib_mem_alloc, - .mem_free = uct_ib_mem_free, - .mem_reg = uct_ib_mem_reg, - .mem_dereg = uct_ib_mem_dereg, - .mem_advise = uct_ib_mem_advise, - .mkey_pack = uct_ib_mkey_pack, - .is_mem_type_owned = (void*)ucs_empty_function_return_zero, -}; - -uct_ib_md_ops_t uct_ib_verbs_md_ops = { - .memh_struct_size = sizeof(uct_ib_mem_t), - .reg_atomic_key = uct_ib_verbs_reg_atomic_key, - .dereg_atomic_key = uct_ib_verbs_dereg_atomic_key, + .close = uct_ib_md_close, + .query = uct_ib_md_query, + .mem_reg = uct_ib_mem_reg, + .mem_dereg = uct_ib_mem_dereg, + .mem_advise = 
uct_ib_mem_advise, + .mkey_pack = uct_ib_mkey_pack, + .detect_memory_type = ucs_empty_function_return_unsupported, }; static inline uct_ib_rcache_region_t* uct_ib_rcache_region_from_memh(uct_mem_h memh) @@ -1271,15 +979,13 @@ static ucs_status_t uct_ib_mem_rcache_dereg(uct_md_h uct_md, uct_mem_h memh) } static uct_md_ops_t uct_ib_md_rcache_ops = { - .close = uct_ib_md_close, - .query = uct_ib_md_query, - .mem_alloc = uct_ib_mem_alloc, - .mem_free = uct_ib_mem_free, - .mem_reg = uct_ib_mem_rcache_reg, - .mem_dereg = uct_ib_mem_rcache_dereg, - .mem_advise = uct_ib_mem_advise, - .mkey_pack = uct_ib_mkey_pack, - .is_mem_type_owned = (void*)ucs_empty_function_return_zero, + .close = uct_ib_md_close, + .query = uct_ib_md_query, + .mem_reg = uct_ib_mem_rcache_reg, + .mem_dereg = uct_ib_mem_rcache_dereg, + .mem_advise = uct_ib_mem_advise, + .mkey_pack = uct_ib_mkey_pack, + .detect_memory_type = ucs_empty_function_return_unsupported, }; static ucs_status_t uct_ib_rcache_mem_reg_cb(void *context, ucs_rcache_t *rcache, @@ -1320,7 +1026,7 @@ static void uct_ib_rcache_dump_region_cb(void *context, ucs_rcache_t *rcache, uct_ib_mem_t *memh = ®ion->memh; snprintf(buf, max, "lkey 0x%x rkey 0x%x atomic_rkey 0x%x", - memh->mr->lkey, memh->mr->rkey, + memh->lkey, memh->rkey, (memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR) ? 
memh->atomic_rkey : UCT_IB_INVALID_RKEY ); @@ -1342,7 +1048,7 @@ static ucs_status_t uct_ib_md_odp_query(uct_md_h uct_md, uct_md_attr_t *md_attr) } /* ODP supports only host memory */ - md_attr->cap.reg_mem_types &= UCS_BIT(UCT_MD_MEM_TYPE_HOST); + md_attr->cap.reg_mem_types &= UCS_BIT(UCS_MEMORY_TYPE_HOST); return UCS_OK; } @@ -1351,16 +1057,19 @@ static ucs_status_t uct_ib_mem_global_odp_reg(uct_md_h uct_md, void *address, uct_mem_h *memh_p) { uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); + uct_ib_mem_t *memh = md->global_odp; - ucs_assert(md->global_odp.mr != NULL); + ucs_assert(md->global_odp != NULL); if (flags & UCT_MD_MEM_FLAG_LOCK) { return uct_ib_mem_reg(uct_md, address, length, flags, memh_p); } if (md->config.odp.prefetch) { - uct_ib_mem_prefetch_internal(md, &md->global_odp, address, length); + md->ops->mem_prefetch(md, memh, address, length); } - *memh_p = &md->global_odp; + + /* cppcheck-suppress autoVariables */ + *memh_p = md->global_odp; return UCS_OK; } @@ -1368,7 +1077,7 @@ static ucs_status_t uct_ib_mem_global_odp_dereg(uct_md_h uct_md, uct_mem_h memh) { uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); - if (memh == &md->global_odp) { + if (memh == md->global_odp) { return UCS_OK; } @@ -1376,24 +1085,17 @@ static ucs_status_t uct_ib_mem_global_odp_dereg(uct_md_h uct_md, uct_mem_h memh) } static uct_md_ops_t UCS_V_UNUSED uct_ib_md_global_odp_ops = { - .close = uct_ib_md_close, - .query = uct_ib_md_odp_query, - .mem_alloc = uct_ib_mem_alloc, - .mem_free = uct_ib_mem_free, - .mem_reg = uct_ib_mem_global_odp_reg, - .mem_dereg = uct_ib_mem_global_odp_dereg, - .mem_advise = uct_ib_mem_advise, - .mkey_pack = uct_ib_mkey_pack, - .is_mem_type_owned = (void*)ucs_empty_function_return_zero, + .close = uct_ib_md_close, + .query = uct_ib_md_odp_query, + .mem_reg = uct_ib_mem_global_odp_reg, + .mem_dereg = uct_ib_mem_global_odp_dereg, + .mem_advise = uct_ib_mem_advise, + .mkey_pack = uct_ib_mkey_pack, + .detect_memory_type = 
ucs_empty_function_return_unsupported, }; -void uct_ib_make_md_name(char md_name[UCT_MD_NAME_MAX], struct ibv_device *device) -{ - snprintf(md_name, UCT_MD_NAME_MAX, "%s/%s", UCT_IB_MD_PREFIX, - ibv_get_device_name(device)); -} - -static ucs_status_t uct_ib_query_md_resources(uct_md_resource_desc_t **resources_p, +static ucs_status_t uct_ib_query_md_resources(uct_component_t *component, + uct_md_resource_desc_t **resources_p, unsigned *num_resources_p) { UCS_MODULE_FRAMEWORK_DECLARE(uct_ib); @@ -1408,8 +1110,9 @@ static ucs_status_t uct_ib_query_md_resources(uct_md_resource_desc_t **resources device_list = ibv_get_device_list(&num_devices); if (device_list == NULL) { ucs_debug("Failed to get IB device list, assuming no devices are present"); - status = UCS_ERR_NO_DEVICE; - goto out; + *resources_p = NULL; + *num_resources_p = 0; + return UCS_OK; } resources = ucs_calloc(num_devices, sizeof(*resources), "ib resources"); @@ -1419,7 +1122,8 @@ static ucs_status_t uct_ib_query_md_resources(uct_md_resource_desc_t **resources } for (i = 0; i < num_devices; ++i) { - uct_ib_make_md_name(resources[i].md_name, device_list[i]); + ucs_snprintf_zero(resources[i].md_name, sizeof(resources[i].md_name), + "%s", ibv_get_device_name(device_list[i])); } *resources_p = resources; @@ -1428,7 +1132,6 @@ static ucs_status_t uct_ib_query_md_resources(uct_md_resource_desc_t **resources out_free_device_list: ibv_free_device_list(device_list); -out: return status; } @@ -1463,6 +1166,38 @@ static void uct_ib_md_release_device_config(uct_ib_md_t *md) ucs_free(md->custom_devices.specs); } +static ucs_status_t UCS_V_UNUSED +uct_ib_md_global_odp_init(uct_ib_md_t *md, uct_mem_h *memh_p) +{ + uct_ib_verbs_mem_t *global_odp; + uct_ib_mr_t *mr; + ucs_status_t status; + + global_odp = (uct_ib_verbs_mem_t *)uct_ib_memh_alloc(md); + if (global_odp == NULL) { + return UCS_ERR_NO_MEMORY; + } + + mr = &global_odp->mrs[UCT_IB_MR_DEFAULT]; + status = uct_ib_reg_mr(md->pd, 0, UINT64_MAX, + 
UCT_IB_MEM_ACCESS_FLAGS | IBV_ACCESS_ON_DEMAND, + &mr->ib); + if (status != UCS_OK) { + ucs_debug("%s: failed to register global mr: %m", + uct_ib_device_name(&md->dev)); + goto err; + } + + global_odp->super.flags = UCT_IB_MEM_FLAG_ODP; + uct_ib_memh_init_keys(&global_odp->super, mr->ib->lkey, mr->ib->rkey); + *memh_p = global_odp; + return UCS_OK; + +err: + uct_ib_memh_free(&global_odp->super); + return status; +} + static ucs_status_t uct_ib_md_parse_reg_methods(uct_ib_md_t *md, uct_md_attr_t *md_attr, const uct_ib_md_config_t *md_config) @@ -1474,16 +1209,17 @@ uct_ib_md_parse_reg_methods(uct_ib_md_t *md, uct_md_attr_t *md_attr, for (i = 0; i < md_config->reg_methods.count; ++i) { if (!strcasecmp(md_config->reg_methods.rmtd[i], "rcache")) { rcache_params.region_struct_size = sizeof(ucs_rcache_region_t) + - md->ops->memh_struct_size; + md->memh_struct_size; rcache_params.alignment = md_config->rcache.alignment; rcache_params.max_alignment = ucs_get_page_size(); rcache_params.ucm_events = UCM_EVENT_VM_UNMAPPED; - if (md_attr->cap.reg_mem_types & ~UCS_BIT(UCT_MD_MEM_TYPE_HOST)) { + if (md_attr->cap.reg_mem_types & ~UCS_BIT(UCS_MEMORY_TYPE_HOST)) { rcache_params.ucm_events |= UCM_EVENT_MEM_TYPE_FREE; } rcache_params.ucm_event_priority = md_config->rcache.event_prio; rcache_params.context = md; rcache_params.ops = &uct_ib_rcache_ops; + rcache_params.flags = 0; status = ucs_rcache_create(&rcache_params, uct_ib_device_name(&md->dev), UCS_STATS_RVAL(md->stats), &md->rcache); @@ -1494,47 +1230,32 @@ uct_ib_md_parse_reg_methods(uct_ib_md_t *md, uct_md_attr_t *md_attr, continue; } - md->super.ops = &uct_ib_md_rcache_ops; - md->alloc_align = md_config->rcache.alignment; - md->reg_cost.overhead = md_config->rcache.overhead; - md->reg_cost.growth = 0; /* It's close enough to 0 */ + md->super.ops = &uct_ib_md_rcache_ops; + md->reg_cost = ucs_linear_func_make(md_config->rcache.overhead, 0); ucs_debug("%s: using registration cache", uct_ib_device_name(&md->dev)); return UCS_OK; 
-#if HAVE_DECL_IBV_EXP_REG_MR && HAVE_DECL_IBV_EXP_ODP_SUPPORT_IMPLICIT +#if HAVE_ODP_IMPLICIT } else if (!strcasecmp(md_config->reg_methods.rmtd[i], "odp")) { - if (!uct_ib_device_odp_has_global_mr(&md->dev)) { + if (!(md->dev.flags & UCT_IB_DEVICE_FLAG_ODP_IMPLICIT)) { ucs_debug("%s: on-demand-paging with global memory region is " "not supported", uct_ib_device_name(&md->dev)); continue; } - struct ibv_exp_reg_mr_in in; - memset(&in, 0, sizeof(in)); - in.pd = md->pd; - in.length = IBV_EXP_IMPLICIT_MR_SIZE; - in.exp_access = UCT_IB_MEM_ACCESS_FLAGS | IBV_EXP_ACCESS_ON_DEMAND; - md->global_odp.mr = UCS_PROFILE_CALL(ibv_exp_reg_mr, &in); - if (md->global_odp.mr == NULL) { - ucs_debug("%s: failed to register global mr: %m", - uct_ib_device_name(&md->dev)); + status = uct_ib_md_global_odp_init(md, &md->global_odp); + if (status != UCS_OK) { continue; } - md->global_odp.lkey = md->global_odp.mr->lkey; - md->global_odp.flags = UCT_IB_MEM_FLAG_ODP; - md->super.ops = &uct_ib_md_global_odp_ops; - md->alloc_align = 1; - md->reg_cost.overhead = 10e-9; - md->reg_cost.growth = 0; - uct_ib_mem_init(&md->global_odp, 0, in.exp_access); + md->super.ops = &uct_ib_md_global_odp_ops; + md->reg_cost = ucs_linear_func_make(10e-9, 0); ucs_debug("%s: using odp global key", uct_ib_device_name(&md->dev)); return UCS_OK; #endif } else if (!strcmp(md_config->reg_methods.rmtd[i], "direct")) { - md->super.ops = &uct_ib_md_ops; - md->alloc_align = 1; - md->reg_cost = md_config->uc_reg_cost; + md->super.ops = &uct_ib_md_ops; + md->reg_cost = md_config->uc_reg_cost; ucs_debug("%s: using direct registration", uct_ib_device_name(&md->dev)); return UCS_OK; @@ -1571,7 +1292,7 @@ uct_ib_md_parse_device_config(uct_ib_md_t *md, const uct_ib_md_config_t *md_conf spec = &md->custom_devices.specs[i]; nfields = sscanf(md_config->custom_devices.spec[i], "%hi:%hi:%m[^:]:%m[^:]:%hhu", - &spec->vendor_id, &spec->part_id, &spec->name, + &spec->pci_id.vendor, &spec->pci_id.device, &spec->name, &flags_str, 
&spec->priority); if (nfields < 2) { ucs_error("failed to parse device config '%s' (parsed: %d/%d)", @@ -1586,6 +1307,12 @@ uct_ib_md_parse_device_config(uct_ib_md_t *md, const uct_ib_md_config_t *md_conf spec->flags |= UCT_IB_DEVICE_FLAG_MLX4_PRM; } else if (*p == '5') { spec->flags |= UCT_IB_DEVICE_FLAG_MLX5_PRM; + } else if (*p == 'd') { + spec->flags |= UCT_IB_DEVICE_FLAG_DC_V1; + } else if (*p == 'D') { + spec->flags |= UCT_IB_DEVICE_FLAG_DC_V2; + } else if (*p == 'a') { + spec->flags |= UCT_IB_DEVICE_FLAG_AV; } else { ucs_error("invalid device flag: '%c'", *p); free(flags_str); @@ -1596,8 +1323,8 @@ uct_ib_md_parse_device_config(uct_ib_md_t *md, const uct_ib_md_config_t *md_conf free(flags_str); } - ucs_trace("added device '%s' vendor_id 0x%x part_id %d flags %c%c prio %d", - spec->name, spec->vendor_id, spec->part_id, + ucs_trace("added device '%s' vendor_id 0x%x device_id %d flags %c%c prio %d", + spec->name, spec->pci_id.vendor, spec->pci_id.device, (spec->flags & UCT_IB_DEVICE_FLAG_MLX4_PRM) ? '4' : '-', (spec->flags & UCT_IB_DEVICE_FLAG_MLX5_PRM) ? 
'5' : '-', spec->priority); @@ -1616,7 +1343,9 @@ static void uct_ib_md_release_reg_method(uct_ib_md_t *md) if (md->rcache != NULL) { ucs_rcache_destroy(md->rcache); } - uct_ib_memh_dereg(md, &md->global_odp); + if (md->global_odp != NULL) { + uct_ib_mem_dereg(&md->super, md->global_odp); + } } static ucs_status_t @@ -1655,19 +1384,21 @@ static double uct_ib_md_read_pci_bw(struct ibv_device *ib_device) ssize_t len; size_t i; - len = ucs_read_file(pci_width_str, sizeof(pci_width_str) - 1, 1, UCT_IB_MD_PCI_DATA_PATH_FMT, - ib_device->name, pci_width_file_name); + len = ucs_read_file(pci_width_str, sizeof(pci_width_str) - 1, 1, + UCT_IB_DEVICE_SYSFS_FMT, ib_device->name, + pci_width_file_name); if (len < 1) { - ucs_debug("failed to read file: " UCT_IB_MD_PCI_DATA_PATH_FMT, + ucs_debug("failed to read file: " UCT_IB_DEVICE_SYSFS_FMT, ib_device->name, pci_width_file_name); return DBL_MAX; /* failed to read file */ } pci_width_str[len] = '\0'; - len = ucs_read_file(pci_speed_str, sizeof(pci_speed_str) - 1, 1, UCT_IB_MD_PCI_DATA_PATH_FMT, - ib_device->name, pci_speed_file_name); + len = ucs_read_file(pci_speed_str, sizeof(pci_speed_str) - 1, 1, + UCT_IB_DEVICE_SYSFS_FMT, ib_device->name, + pci_speed_file_name); if (len < 1) { - ucs_debug("failed to read file: " UCT_IB_MD_PCI_DATA_PATH_FMT, + ucs_debug("failed to read file: " UCT_IB_DEVICE_SYSFS_FMT, ib_device->name, pci_speed_file_name); return DBL_MAX; /* failed to read file */ } @@ -1713,6 +1444,9 @@ static double uct_ib_md_pci_bw(const uct_ib_md_config_t *md_config, for (i = 0; i < md_config->pci_bw.count; i++) { if (!strcmp(ib_device->name, md_config->pci_bw.device[i].name)) { + if (UCS_CONFIG_BW_IS_AUTO(md_config->pci_bw.device[i].bw)) { + break; /* read data from system */ + } return md_config->pci_bw.device[i].bw; } } @@ -1720,20 +1454,26 @@ static double uct_ib_md_pci_bw(const uct_ib_md_config_t *md_config, return uct_ib_md_read_pci_bw(ib_device); } -ucs_status_t -uct_ib_md_open(const char *md_name, const 
uct_md_config_t *uct_md_config, uct_md_h *md_p) +ucs_status_t uct_ib_md_open(uct_component_t *component, const char *md_name, + const uct_md_config_t *uct_md_config, uct_md_h *md_p) { const uct_ib_md_config_t *md_config = ucs_derived_of(uct_md_config, uct_ib_md_config_t); ucs_status_t status = UCS_ERR_UNSUPPORTED; uct_ib_md_t *md = NULL; struct ibv_device **ib_device_list, *ib_device; - uct_ib_md_open_entry_t *md_open_entry; - char tmp_md_name[UCT_MD_NAME_MAX]; - int i, num_devices, ret; - uct_md_attr_t md_attr; + uct_ib_md_ops_entry_t *md_ops_entry; + int i, num_devices, ret, fork_init = 0; ucs_trace("opening IB device %s", md_name); +#if !HAVE_DEVX + if (md_config->devx == UCS_YES) { + ucs_error("DEVX requested but not supported"); + status = UCS_ERR_NO_DEVICE; + goto out; + } +#endif + /* Get device list from driver */ ib_device_list = ibv_get_device_list(&num_devices); if (ib_device_list == NULL) { @@ -1744,8 +1484,7 @@ uct_ib_md_open(const char *md_name, const uct_md_config_t *uct_md_config, uct_md ib_device = NULL; for (i = 0; i < num_devices; ++i) { - uct_ib_make_md_name(tmp_md_name, ib_device_list[i]); - if (!strcmp(tmp_md_name, md_name)) { + if (!strcmp(ibv_get_device_name(ib_device_list[i]), md_name)) { ib_device = ib_device_list[i]; break; } @@ -1757,13 +1496,35 @@ uct_ib_md_open(const char *md_name, const uct_md_config_t *uct_md_config, uct_md goto out_free_dev_list; } - ucs_list_for_each(md_open_entry, &uct_ib_md_open_list, list) { - status = md_open_entry->md_open(ib_device, &md); + if (md_config->fork_init != UCS_NO) { + ret = ibv_fork_init(); + if (ret) { + if (md_config->fork_init == UCS_YES) { + ucs_error("ibv_fork_init() failed: %m"); + status = UCS_ERR_IO_ERROR; + goto out_free_dev_list; + } + ucs_debug("ibv_fork_init() failed: %m, continuing, but fork may be unsafe."); + uct_ib_fork_warn_enable(); + } else { + fork_init = 1; + } + } else { + uct_ib_fork_warn_enable(); + } + + ucs_list_for_each(md_ops_entry, &uct_ib_md_ops_list, list) { + status 
= md_ops_entry->ops->open(ib_device, md_config, &md); if (status == UCS_OK) { + ucs_debug("%s: md open by '%s' is successful", md_name, + md_ops_entry->name); + md->ops = md_ops_entry->ops; break; } else if (status != UCS_ERR_UNSUPPORTED) { goto out_free_dev_list; } + ucs_debug("%s: md open by '%s' failed, trying next", md_name, + md_ops_entry->name); } if (status != UCS_OK) { @@ -1772,33 +1533,53 @@ uct_ib_md_open(const char *md_name, const uct_md_config_t *uct_md_config, uct_md goto out_free_dev_list; } - ucs_assert(md != NULL); - md->super.ops = &uct_ib_md_ops; - md->super.component = &uct_ib_mdc; - md->config = md_config->ext; - ucs_list_head_init(&md->dm_list); + /* cppcheck-suppress autoVariables */ + *md_p = &md->super; + md->fork_init = fork_init; + status = UCS_OK; + +out_free_dev_list: + ibv_free_device_list(ib_device_list); +out: + return status; +} + +void uct_ib_md_parse_relaxed_order(uct_ib_md_t *md, + const uct_ib_md_config_t *md_config) +{ + if (md_config->mr_relaxed_order == UCS_CONFIG_ON) { + if (IBV_ACCESS_RELAXED_ORDERING) { + md->relaxed_order = 1; + } else { + ucs_warn("relaxed order memory access requested but not supported"); + } + } else if (md_config->mr_relaxed_order == UCS_CONFIG_AUTO) { + if (ucs_cpu_prefer_relaxed_order()) { + md->relaxed_order = 1; + } + } +} + +ucs_status_t uct_ib_md_open_common(uct_ib_md_t *md, + struct ibv_device *ib_device, + const uct_ib_md_config_t *md_config) +{ + uct_md_attr_t md_attr; + ucs_status_t status; + + md->super.ops = &uct_ib_md_ops; + md->super.component = &uct_ib_component; + + if (md->config.odp.max_size == UCS_MEMUNITS_AUTO) { + md->config.odp.max_size = uct_ib_device_odp_max_size(&md->dev); + } /* Create statistics */ status = UCS_STATS_NODE_ALLOC(&md->stats, &uct_ib_md_stats_class, ucs_stats_get_root(), "%s-%p", ibv_get_device_name(ib_device), md); if (status != UCS_OK) { - goto err_free_md; - } - - if (md_config->fork_init != UCS_NO) { - ret = ibv_fork_init(); - if (ret) { - if 
(md_config->fork_init == UCS_YES) { - ucs_error("ibv_fork_init() failed: %m"); - status = UCS_ERR_IO_ERROR; - goto err_release_stats; - } - ucs_debug("ibv_fork_init() failed: %m, continuing, but fork may be unsafe."); - uct_ib_fork_warn_enable(); - } - } else { - uct_ib_fork_warn_enable(); + goto err; } status = uct_ib_device_init(&md->dev, ib_device, md_config->async_events @@ -1807,16 +1588,10 @@ uct_ib_md_open(const char *md_name, const uct_md_config_t *uct_md_config, uct_md goto err_release_stats; } - /* Disable contig pages allocator for IB transport objects */ - if (!md->config.enable_contig_pages) { - ibv_exp_setenv(md->dev.ibv_context, "MLX_QP_ALLOC_TYPE", "ANON", 0); - ibv_exp_setenv(md->dev.ibv_context, "MLX_CQ_ALLOC_TYPE", "ANON", 0); - } - - if (md->config.odp.max_size == UCS_CONFIG_MEMUNITS_AUTO) { - /* Must be done after we open and query the device */ - md->config.odp.max_size = uct_ib_device_odp_max_size(&md->dev); - } +#if HAVE_DECL_IBV_EXP_SETENV + ibv_exp_setenv(md->dev.ibv_context, "MLX_QP_ALLOC_TYPE", "ANON", 0); + ibv_exp_setenv(md->dev.ibv_context, "MLX_CQ_ALLOC_TYPE", "ANON", 0); +#endif if (strlen(md_config->subnet_prefix) > 0) { status = uct_ib_md_parse_subnet_prefix(md_config->subnet_prefix, @@ -1837,94 +1612,67 @@ uct_ib_md_open(const char *md_name, const uct_md_config_t *uct_md_config, uct_md goto err_cleanup_device; } - status = uct_ib_md_umr_qp_create(md); - if (status == UCS_ERR_UNSUPPORTED) { - md->umr_qp = NULL; - md->umr_cq = NULL; - } else if (status != UCS_OK) { - goto err_dealloc_pd; - } - status = uct_md_query(&md->super, &md_attr); if (status != UCS_OK) { - goto err_destroy_umr_qp; + goto err_dealloc_pd; } status = uct_ib_md_parse_reg_methods(md, &md_attr, md_config); if (status != UCS_OK) { - goto err_destroy_umr_qp; - } - - status = uct_ib_md_parse_device_config(md, md_config); - if (status != UCS_OK) { - goto err_release_reg_method; + goto err_dealloc_pd; } md->dev.max_zcopy_log_sge = INT_MAX; - if 
(md_attr.cap.reg_mem_types & ~UCS_BIT(UCT_MD_MEM_TYPE_HOST)) { + if (md_attr.cap.reg_mem_types & ~UCS_BIT(UCS_MEMORY_TYPE_HOST)) { md->dev.max_zcopy_log_sge = 1; } md->pci_bw = uct_ib_md_pci_bw(md_config, ib_device); + return UCS_OK; - *md_p = &md->super; - status = UCS_OK; - -out_free_dev_list: - ibv_free_device_list(ib_device_list); -out: - return status; - -err_release_reg_method: - uct_ib_md_release_reg_method(md); -err_destroy_umr_qp: - uct_ib_md_umr_qp_destroy(md); err_dealloc_pd: ibv_dealloc_pd(md->pd); err_cleanup_device: uct_ib_device_cleanup(&md->dev); err_release_stats: UCS_STATS_NODE_FREE(md->stats); -err_free_md: - ucs_free(md); - goto out_free_dev_list; +err: + return status; } void uct_ib_md_close(uct_md_h uct_md) { uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); - if (!ucs_list_is_empty(&md->dm_list)) { - ucs_warn("device memory list is not empty during md %s close", - uct_ib_device_name(&md->dev)); - } - + md->ops->cleanup(md); uct_ib_md_release_device_config(md); uct_ib_md_release_reg_method(md); - uct_ib_md_umr_qp_destroy(md); uct_ib_device_cleanup_ah_cached(&md->dev); ibv_dealloc_pd(md->pd); uct_ib_device_cleanup(&md->dev); + ibv_close_device(md->dev.ibv_context); UCS_STATS_NODE_FREE(md->stats); ucs_free(md); } +static uct_ib_md_ops_t uct_ib_verbs_md_ops; + static ucs_status_t uct_ib_verbs_md_open(struct ibv_device *ibv_device, + const uct_ib_md_config_t *md_config, uct_ib_md_t **p_md) { uct_ib_device_t *dev; ucs_status_t status; uct_ib_md_t *md; - int ret; + int num_mrs; md = ucs_calloc(1, sizeof(*md), "ib_md"); if (md == NULL) { return UCS_ERR_NO_MEMORY; } - md->ops = &uct_ib_verbs_md_ops; - dev = &md->dev; /* Open verbs context */ + dev = &md->dev; dev->ibv_context = ibv_open_device(ibv_device); if (dev->ibv_context == NULL) { ucs_error("ibv_open_device(%s) failed: %m", ibv_get_device_name(ibv_device)); @@ -1932,64 +1680,48 @@ static ucs_status_t uct_ib_verbs_md_open(struct ibv_device *ibv_device, goto err; } - /* Read device 
properties */ - IBV_EXP_DEVICE_ATTR_SET_COMP_MASK(&dev->dev_attr); -#if HAVE_DECL_IBV_EXP_QUERY_DEVICE - ret = ibv_exp_query_device(dev->ibv_context, &dev->dev_attr); -#elif HAVE_DECL_IBV_QUERY_DEVICE_EX - if (uct_ib_device_is_hns(ibv_device)) { - memset(&dev->dev_attr, 0, sizeof(dev->dev_attr)); - ret = ibv_query_device(dev->ibv_context, &dev->dev_attr.orig_attr); - } else { - ret = ibv_query_device_ex(dev->ibv_context, NULL, &dev->dev_attr); - } -#else - ret = ibv_query_device(dev->ibv_context, &dev->dev_attr); -#endif - if (ret != 0) { - ucs_error("ibv_query_device(%s) returned %d: %m", ibv_get_device_name(ibv_device), ret); - status = UCS_ERR_IO_ERROR; + md->config = md_config->ext; + + status = uct_ib_device_query(dev, ibv_device); + if (status != UCS_OK) { goto err_free_context; } - if (IBV_EXP_HAVE_ATOMIC_HCA(&dev->dev_attr) || - IBV_EXP_HAVE_ATOMIC_GLOB(&dev->dev_attr) || - IBV_EXP_HAVE_ATOMIC_HCA_REPLY_BE(&dev->dev_attr)) - { -#ifdef HAVE_IB_EXT_ATOMICS - if (dev->dev_attr.comp_mask & IBV_EXP_DEVICE_ATTR_EXT_ATOMIC_ARGS) { - dev->ext_atomic_arg_sizes = dev->dev_attr.ext_atom.log_atomic_arg_sizes; - } -# if HAVE_MASKED_ATOMICS_ENDIANNESS - if (dev->dev_attr.comp_mask & IBV_EXP_DEVICE_ATTR_MASKED_ATOMICS) { - dev->ext_atomic_arg_sizes |= - dev->dev_attr.masked_atomic.masked_log_atomic_arg_sizes; - dev->ext_atomic_arg_sizes_be = - dev->dev_attr.masked_atomic.masked_log_atomic_arg_sizes_network_endianness; - } -# endif - dev->ext_atomic_arg_sizes &= UCS_MASK(dev->dev_attr.ext_atom.log_max_atomic_inline + 1); -#endif + if (UCT_IB_HAVE_ODP_IMPLICIT(&dev->dev_attr)) { + md->dev.flags |= UCT_IB_DEVICE_FLAG_ODP_IMPLICIT; + } + + if (IBV_EXP_HAVE_ATOMIC_HCA(&dev->dev_attr)) { dev->atomic_arg_sizes = sizeof(uint64_t); - if (IBV_EXP_HAVE_ATOMIC_HCA_REPLY_BE(&dev->dev_attr)) { - dev->atomic_arg_sizes_be = sizeof(uint64_t); - } } -#if HAVE_DECL_IBV_EXP_DEVICE_DC_TRANSPORT && HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_EXP_DEVICE_CAP_FLAGS - if (dev->dev_attr.exp_device_cap_flags & 
IBV_EXP_DEVICE_DC_TRANSPORT) { - dev->flags |= UCT_IB_DEVICE_FLAG_DC; + md->ops = &uct_ib_verbs_md_ops; + status = uct_ib_md_parse_device_config(md, md_config); + if (status != UCS_OK) { + goto err_free_context; } -#endif -#if HAVE_DECL_IBV_EXP_DEVICE_ATTR_PCI_ATOMIC_CAPS - dev->pci_fadd_arg_sizes = dev->dev_attr.pci_atomic_caps.fetch_add << 2; - dev->pci_cswap_arg_sizes = dev->dev_attr.pci_atomic_caps.compare_swap << 2; -#endif + uct_ib_md_parse_relaxed_order(md, md_config); + num_mrs = 1; /* UCT_IB_MR_DEFAULT */ + + if (md->relaxed_order) { + ++num_mrs; /* UCT_IB_MR_STRICT_ORDER */ + } + + md->memh_struct_size = sizeof(uct_ib_verbs_mem_t) + + (sizeof(uct_ib_mr_t) * num_mrs); + status = uct_ib_md_open_common(md, ibv_device, md_config); + if (status != UCS_OK) { + goto err_dev_cfg; + } + + md->dev.flags = uct_ib_device_spec(&md->dev)->flags; *p_md = md; return UCS_OK; +err_dev_cfg: + uct_ib_md_release_device_config(md); err_free_context: ibv_close_device(dev->ibv_context); err: @@ -1997,10 +1729,37 @@ static ucs_status_t uct_ib_verbs_md_open(struct ibv_device *ibv_device, return status; } -UCT_IB_MD_OPEN(uct_ib_verbs_md_open, 0); +static uct_ib_md_ops_t uct_ib_verbs_md_ops = { + .open = uct_ib_verbs_md_open, + .cleanup = (uct_ib_md_cleanup_func_t)ucs_empty_function, + .reg_key = uct_ib_verbs_reg_key, + .dereg_key = uct_ib_verbs_dereg_key, + .reg_atomic_key = uct_ib_verbs_reg_atomic_key, + .dereg_atomic_key = (uct_ib_md_dereg_atomic_key_func_t)ucs_empty_function_return_success, + .reg_multithreaded = (uct_ib_md_reg_multithreaded_func_t)ucs_empty_function_return_unsupported, + .dereg_multithreaded = (uct_ib_md_dereg_multithreaded_func_t)ucs_empty_function_return_unsupported, + .mem_prefetch = (uct_ib_md_mem_prefetch_func_t)ucs_empty_function_return_success, + .get_atomic_mr_id = (uct_ib_md_get_atomic_mr_id_func_t)ucs_empty_function_return_unsupported, +}; -UCT_MD_COMPONENT_DEFINE(uct_ib_mdc, UCT_IB_MD_PREFIX, - uct_ib_query_md_resources, uct_ib_md_open, NULL, - 
uct_ib_rkey_unpack, - (void*)ucs_empty_function_return_success /* release */, - "IB_", uct_ib_md_config_table, uct_ib_md_config_t); +UCT_IB_MD_OPS(uct_ib_verbs_md_ops, 0); + +uct_component_t uct_ib_component = { + .query_md_resources = uct_ib_query_md_resources, + .md_open = uct_ib_md_open, + .cm_open = ucs_empty_function_return_unsupported, + .rkey_unpack = uct_ib_rkey_unpack, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = ucs_empty_function_return_success, + .name = "ib", + .md_config = { + .name = "IB memory domain", + .prefix = UCT_IB_CONFIG_PREFIX, + .table = uct_ib_md_config_table, + .size = sizeof(uct_ib_md_config_t), + }, + .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_ib_component), + .flags = 0 +}; +UCT_COMPONENT_REGISTER(&uct_ib_component); diff --git a/src/uct/ib/base/ib_md.h b/src/uct/ib/base/ib_md.h index 1a97aa05a3a..2b657cc1e9b 100644 --- a/src/uct/ib/base/ib_md.h +++ b/src/uct/ib/base/ib_md.h @@ -21,6 +21,15 @@ #define UCT_IB_MD_DEFAULT_GID_INDEX 0 /**< The gid index used by default for an IB/RoCE port */ +#define UCT_IB_MEM_ACCESS_FLAGS (IBV_ACCESS_LOCAL_WRITE | \ + IBV_ACCESS_REMOTE_WRITE | \ + IBV_ACCESS_REMOTE_READ | \ + IBV_ACCESS_REMOTE_ATOMIC) + +#define UCT_IB_MEM_DEREG 0 +#define UCT_IB_CONFIG_PREFIX "IB_" + + /** * IB MD statistics counters */ @@ -32,25 +41,36 @@ enum { enum { - UCT_IB_MEM_FLAG_ODP = UCS_BIT(0), /**< The memory region has on - demand paging enabled */ - UCT_IB_MEM_FLAG_ATOMIC_MR = UCS_BIT(1), /**< The memory region has UMR - for the atomic access */ - UCT_IB_MEM_ACCESS_REMOTE_ATOMIC = UCS_BIT(2) /**< An atomic access was - requested for the memory - region */ + UCT_IB_MEM_FLAG_ODP = UCS_BIT(0), /**< The memory region has on + demand paging enabled */ + UCT_IB_MEM_FLAG_ATOMIC_MR = UCS_BIT(1), /**< The memory region has UMR + for the atomic access */ + UCT_IB_MEM_ACCESS_REMOTE_ATOMIC = UCS_BIT(2), /**< An atomic access was + requested for the 
memory + region */ + UCT_IB_MEM_MULTITHREADED = UCS_BIT(3), /**< The memory region registration + handled by chunks in parallel + threads */ + UCT_IB_MEM_FLAG_RELAXED_ORDERING = UCS_BIT(4), /**< The memory region will issue + PCIe writes with relaxed order + attribute */ }; +enum { + UCT_IB_DEVX_OBJ_RCQP, + UCT_IB_DEVX_OBJ_RCSRQ, + UCT_IB_DEVX_OBJ_DCT, + UCT_IB_DEVX_OBJ_DCSRQ +}; typedef struct uct_ib_md_ext_config { int eth_pause; /**< Whether or not Pause Frame is enabled on the Ethernet network */ int prefer_nearest_device; /**< Give priority for near device */ - int enable_contig_pages; /** Enable contiguous pages */ int enable_indirect_atomic; /** Enable indirect atomic */ int enable_gpudirect_rdma; /** Enable GPUDirect RDMA */ -#if HAVE_EXP_UMR +#ifdef HAVE_EXP_UMR unsigned max_inline_klm_list; /* Maximal length of inline KLM list */ #endif @@ -62,50 +82,34 @@ typedef struct uct_ib_md_ext_config { } odp; size_t gid_index; /**< IB GID index to use */ -} uct_ib_md_ext_config_t; - -#if HAVE_IBV_EXP_DM - -/* uct_mlx5_dm_va is used to get pointer to DM mapped into process address space */ -typedef struct uct_mlx5_dm_va { - struct ibv_exp_dm ibv_dm; - size_t length; - uint64_t *start_va; -} uct_mlx5_dm_va_t; - - -/* Device memory region */ -typedef struct uct_ib_device_mem { - struct ibv_exp_dm *dm; /* Device memory object */ - struct ibv_mr *mr; /* Direct map memory region */ - void *address; /* Virtual memory address */ - ucs_list_link_t list; /* Entry in DM list in memory domain */ -} uct_ib_device_mem_t; - -#endif /* HAVE_IBV_EXP_DM */ + size_t min_mt_reg; /**< Multi-threaded registration threshold */ + size_t mt_reg_chunk; /**< Multi-threaded registration chunk */ + int mt_reg_bind; /**< Multi-threaded registration bind to core */ +} uct_ib_md_ext_config_t; typedef struct uct_ib_mem { uint32_t lkey; + uint32_t rkey; uint32_t atomic_rkey; uint32_t flags; - struct ibv_mr *mr; -#if HAVE_EXP_UMR - struct ibv_mr *atomic_mr; -#endif } uct_ib_mem_t; -struct 
uct_ib_md; -typedef struct uct_ib_md_ops { - size_t memh_struct_size; - ucs_status_t (*reg_atomic_key)(struct uct_ib_md *md, - uct_ib_mem_t *memh, - off_t offset); - ucs_status_t (*dereg_atomic_key)(struct uct_ib_md *md, - uct_ib_mem_t *memh); -} uct_ib_md_ops_t; +typedef union uct_ib_mr { + struct ibv_mr *ib; +} uct_ib_mr_t; + + +typedef enum { + /* Default memory region with either strict or relaxed order */ + UCT_IB_MR_DEFAULT, + /* Additional memory region with strict order, + * if the default region is relaxed order */ + UCT_IB_MR_STRICT_ORDER, + UCT_IB_MR_LAST +} uct_ib_mr_type_t; /** @@ -114,15 +118,12 @@ typedef struct uct_ib_md_ops { typedef struct uct_ib_md { uct_md_t super; ucs_rcache_t *rcache; /**< Registration cache (can be NULL) */ - uct_ib_mem_t global_odp;/**< Implicit ODP memory handle */ + uct_mem_h global_odp;/**< Implicit ODP memory handle */ struct ibv_pd *pd; /**< IB memory domain */ uct_ib_device_t dev; /**< IB device */ - uct_linear_growth_t reg_cost; /**< Memory registration cost */ - uct_ib_md_ops_t *ops; - /* keep it in md because pd is needed to create umr_qp/cq */ - struct ibv_qp *umr_qp; /* special QP for creating UMR */ - struct ibv_cq *umr_cq; /* special CQ for creating UMR */ - UCS_STATS_NODE_DECLARE(stats); + ucs_linear_func_t reg_cost; /**< Memory registration cost */ + struct uct_ib_md_ops *ops; + UCS_STATS_NODE_DECLARE(stats) uct_ib_md_ext_config_t config; /* IB external configuration */ struct { uct_ib_device_spec_t *specs; /* Custom device specifications */ @@ -130,9 +131,10 @@ typedef struct uct_ib_md { } custom_devices; int check_subnet_filter; uint64_t subnet_filter; - size_t alloc_align; double pci_bw; - ucs_list_link_t dm_list; + int relaxed_order; + int fork_init; + size_t memh_struct_size; } uct_ib_md_t; @@ -146,7 +148,7 @@ typedef struct uct_ib_md_config { UCS_CONFIG_STRING_ARRAY_FIELD(rmtd) reg_methods; uct_md_rcache_config_t rcache; /**< Registration cache config */ - uct_linear_growth_t uc_reg_cost; /**< Memory 
registration cost estimation + ucs_linear_func_t uc_reg_cost; /**< Memory registration cost estimation without using the cache */ unsigned fork_init; /**< Use ibv_fork_init() */ int async_events; /**< Whether async events should be delivered */ @@ -158,8 +160,175 @@ typedef struct uct_ib_md_config { char *subnet_prefix; /**< Filter of subnet_prefix for IB ports */ UCS_CONFIG_ARRAY_FIELD(ucs_config_bw_spec_t, device) pci_bw; /**< List of PCI BW for devices */ + + unsigned devx; /**< DEVX support */ + unsigned devx_objs; /**< Objects to be created by DevX */ + ucs_on_off_auto_value_t mr_relaxed_order; /**< Allow reorder memory accesses */ } uct_ib_md_config_t; +/** + * Memory domain constructor. + * + * @param [in] ibv_device IB device. + * + * @param [in] md_config Memory domain configuration parameters. + * + * @param [out] md_p Handle to memory domain. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_ib_md_open_func_t)(struct ibv_device *ibv_device, + const uct_ib_md_config_t *md_config, + struct uct_ib_md **md_p); + +/** + * Memory domain destructor. + * + * @param [in] md Memory domain. + */ +typedef void (*uct_ib_md_cleanup_func_t)(struct uct_ib_md *); + +/** + * Memory domain method to register memory area. + * + * @param [in] md Memory domain. + * + * @param [in] address Memory area start address. + * + * @param [in] length Memory area length. + * + * @param [in] access IB verbs registration access flags + * + * @param [in] memh Memory region handle. + * Method should initialize lkey & rkey. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_ib_md_reg_key_func_t)(struct uct_ib_md *md, + void *address, size_t length, + uint64_t access, + uct_ib_mem_t *memh, + uct_ib_mr_type_t mr_type); + +/** + * Memory domain method to deregister memory area. + * + * @param [in] md Memory domain. 
+ * + * @param [in] memh Memory region handle registered with + * uct_ib_md_reg_key_func_t. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_ib_md_dereg_key_func_t)(struct uct_ib_md *md, + uct_ib_mem_t *memh, + uct_ib_mr_type_t mr_type); + +/** + * Memory domain method to register memory area optimized for atomic ops. + * + * @param [in] md Memory domain. + * + * @param [in] memh Memory region handle registered for regular ops. + * Method should initialize atomic_rkey + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_ib_md_reg_atomic_key_func_t)(struct uct_ib_md *md, + uct_ib_mem_t *memh); + +/** + * Memory domain method to release resources registered for atomic ops. + * + * @param [in] md Memory domain. + * + * @param [in] memh Memory region handle registered with + * uct_ib_md_reg_atomic_key_func_t. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_ib_md_dereg_atomic_key_func_t)(struct uct_ib_md *md, + uct_ib_mem_t *memh); + +/** + * Memory domain method to register memory area using multiple threads. + * + * @param [in] md Memory domain. + * + * @param [in] address Memory area start address. + * + * @param [in] length Memory area length. + * + * @param [in] access IB verbs registration access flags + * + * @param [in] memh Memory region handle. + * Method should initialize lkey & rkey. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_ib_md_reg_multithreaded_func_t)(uct_ib_md_t *md, + void *address, + size_t length, + uint64_t access, + uct_ib_mem_t *memh, + uct_ib_mr_type_t mr_type); + +/** + * Memory domain method to deregister memory area. + * + * @param [in] md Memory domain. + * + * @param [in] memh Memory region handle registered with + * uct_ib_md_reg_key_func_t. + * + * @return UCS_OK on success or error code in case of failure. 
+ */ +typedef ucs_status_t (*uct_ib_md_dereg_multithreaded_func_t)(uct_ib_md_t *md, + uct_ib_mem_t *memh, + uct_ib_mr_type_t mr_type); + +/** + * Memory domain method to prefetch physical memory for virtual memory area. + * + * @param [in] md Memory domain. + * + * @param [in] memh Memory region handle. + * + * @param [in] address Memory area start address. + * + * @param [in] length Memory area length. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_ib_md_mem_prefetch_func_t)(uct_ib_md_t *md, + uct_ib_mem_t *memh, + void *addr, size_t length); + +/** + * Memory domain method to get unique atomic mr id. + * + * @param [in] md Memory domain. + * + * @param [out] mr_id id to access atomic MR. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_ib_md_get_atomic_mr_id_func_t)(uct_ib_md_t *md, + uint8_t *mr_id); + +typedef struct uct_ib_md_ops { + uct_ib_md_open_func_t open; + uct_ib_md_cleanup_func_t cleanup; + uct_ib_md_reg_key_func_t reg_key; + uct_ib_md_dereg_key_func_t dereg_key; + uct_ib_md_reg_atomic_key_func_t reg_atomic_key; + uct_ib_md_dereg_atomic_key_func_t dereg_atomic_key; + uct_ib_md_reg_multithreaded_func_t reg_multithreaded; + uct_ib_md_dereg_multithreaded_func_t dereg_multithreaded; + uct_ib_md_mem_prefetch_func_t mem_prefetch; + uct_ib_md_get_atomic_mr_id_func_t get_atomic_mr_id; +} uct_ib_md_ops_t; + /** * IB memory region in the registration cache. 
@@ -177,34 +346,31 @@ typedef struct uct_ib_rcache_region { * - setup atomic MR ops * - determine device attributes and flags */ -typedef struct uct_ib_md_open_entry { +typedef struct uct_ib_md_ops_entry { ucs_list_link_t list; - ucs_status_t (*md_open)(struct ibv_device *ibv_device, - uct_ib_md_t **p_md); -} uct_ib_md_open_entry_t; + const char *name; + uct_ib_md_ops_t *ops; + int priority; +} uct_ib_md_ops_entry_t; -#define UCT_IB_MD_OPEN(_open_fn, _priority) \ +#define UCT_IB_MD_OPS(_md_ops, _priority) \ + extern ucs_list_link_t uct_ib_md_ops_list; \ UCS_STATIC_INIT { \ - extern ucs_list_link_t uct_ib_md_open_list; \ - static uct_ib_md_open_entry_t entry = { \ - .md_open = _open_fn, \ + static uct_ib_md_ops_entry_t *p, entry = { \ + .name = UCS_PP_MAKE_STRING(_md_ops), \ + .ops = &_md_ops, \ + .priority = _priority, \ }; \ - if (_priority) { \ - ucs_list_add_head(&uct_ib_md_open_list, &entry.list); \ - } else { \ - ucs_list_add_tail(&uct_ib_md_open_list, &entry.list); \ + ucs_list_for_each(p, &uct_ib_md_ops_list, list) { \ + if (p->priority < _priority) { \ + ucs_list_insert_before(&p->list, &entry.list); \ + return; \ + } \ } \ + ucs_list_add_tail(&uct_ib_md_ops_list, &entry.list); \ } - -extern uct_md_component_t uct_ib_mdc; - - -/** - * Calculate unique id for atomic - */ -uint8_t uct_ib_md_get_atomic_mr_id(uct_ib_md_t *md); - +extern uct_component_t uct_ib_component; static inline uint32_t uct_ib_md_direct_rkey(uct_rkey_t uct_rkey) { @@ -251,12 +417,54 @@ static inline uint16_t uct_ib_md_atomic_offset(uint8_t atomic_mr_id) return 8 * atomic_mr_id; } +static inline void +uct_ib_memh_init_keys(uct_ib_mem_t *memh, uint32_t lkey, uint32_t rkey) +{ + memh->lkey = lkey; + memh->rkey = rkey; +} -void uct_ib_make_md_name(char md_name[UCT_MD_NAME_MAX], struct ibv_device *device); +static inline uct_ib_mr_type_t +uct_ib_memh_get_atomic_base_mr_type(uct_ib_mem_t *memh) +{ + if (memh->flags & UCT_IB_MEM_FLAG_RELAXED_ORDERING) { + return UCT_IB_MR_STRICT_ORDER; + } else 
{ + return UCT_IB_MR_DEFAULT; + } +} + +static UCS_F_ALWAYS_INLINE uint32_t uct_ib_memh_get_lkey(uct_mem_h memh) +{ + ucs_assert(memh != UCT_MEM_HANDLE_NULL); + return ((uct_ib_mem_t*)memh)->lkey; +} -ucs_status_t -uct_ib_md_open(const char *md_name, const uct_md_config_t *uct_md_config, uct_md_h *md_p); + +ucs_status_t uct_ib_md_open(uct_component_t *component, const char *md_name, + const uct_md_config_t *uct_md_config, uct_md_h *md_p); + +ucs_status_t uct_ib_md_open_common(uct_ib_md_t *md, + struct ibv_device *ib_device, + const uct_ib_md_config_t *md_config); void uct_ib_md_close(uct_md_h uct_md); +ucs_status_t uct_ib_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t access, struct ibv_mr **mr_p); +ucs_status_t uct_ib_dereg_mr(struct ibv_mr *mr); +ucs_status_t uct_ib_dereg_mrs(struct ibv_mr **mrs, size_t mr_num); + +ucs_status_t +uct_ib_md_handle_mr_list_multithreaded(uct_ib_md_t *md, void *address, + size_t length, uint64_t access, + size_t chunk, struct ibv_mr **mrs); + +void uct_ib_md_parse_relaxed_order(uct_ib_md_t *md, + const uct_ib_md_config_t *md_config); + +ucs_status_t uct_ib_reg_key_impl(uct_ib_md_t *md, void *address, + size_t length, uint64_t access_flags, + uct_ib_mem_t *memh, uct_ib_mr_t *mrs, + uct_ib_mr_type_t mr_type); #endif diff --git a/src/uct/ib/base/ib_verbs.h b/src/uct/ib/base/ib_verbs.h index 5753d61df69..14b2a355bc3 100644 --- a/src/uct/ib/base/ib_verbs.h +++ b/src/uct/ib/base/ib_verbs.h @@ -1,6 +1,7 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2014. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * See file LICENSE for terms. 
*/ @@ -18,6 +19,9 @@ #include +#include +#include + #ifndef HAVE_VERBS_EXP_H # define IBV_EXP_SEND_INLINE IBV_SEND_INLINE # define IBV_EXP_SEND_SIGNALED IBV_SEND_SIGNALED @@ -37,12 +41,6 @@ # define IBV_EXP_ACCESS_REMOTE_ATOMIC IBV_ACCESS_REMOTE_ATOMIC # define ibv_exp_reg_shared_mr ibv_reg_shared_mr_ex # define ibv_exp_reg_shared_mr_in ibv_reg_shared_mr_in -# if HAVE_DECL_IBV_QUERY_DEVICE_EX -# define ibv_exp_device_attr ibv_device_attr_ex -# define IBV_DEV_ATTR(_dev, _attr) ((_dev)->dev_attr.orig_attr._attr) -# else -# define ibv_exp_device_attr ibv_device_attr -# endif # define ibv_exp_send_wr ibv_send_wr # define exp_opcode opcode # define ibv_exp_post_send ibv_post_send @@ -60,23 +58,97 @@ # define IBV_SHARED_MR_ACCESS_FLAGS(_shared_mr) ((_shared_mr)->exp_access) # define IBV_EXP_DEVICE_ATTR_SET_COMP_MASK(_attr) -# define IBV_EXP_PORT_ATTR_SET_COMP_MASK(_attr) #else # define IBV_SHARED_MR_ACCESS_FLAGS(_shared_mr) ((_shared_mr)->access) +#endif /* HAVE_VERBS_EXP_H */ + +/* Read device properties */ +#if HAVE_DECL_IBV_EXP_QUERY_DEVICE + +# define IBV_DEV_ATTR(_dev, _attr) ((_dev)->dev_attr._attr) + +typedef struct ibv_exp_device_attr uct_ib_device_attr; + +static inline ucs_status_t uct_ib_query_device(struct ibv_context *ctx, + uct_ib_device_attr* attr) { + int ret; #if HAVE_DECL_IBV_EXP_DEVICE_ATTR_RESERVED_2 -# define IBV_EXP_DEVICE_ATTR_SET_COMP_MASK(_attr) do { \ - (_attr)->comp_mask = 0xffffffff; \ - (_attr)->comp_mask_2 = (IBV_EXP_DEVICE_ATTR_RESERVED_2 - 1); \ - } while (0) + attr->comp_mask = 0xffffffff; + attr->comp_mask_2 = IBV_EXP_DEVICE_ATTR_RESERVED_2 - 1; +#else + attr->comp_mask = IBV_EXP_DEVICE_ATTR_RESERVED - 1; +#endif + ret = ibv_exp_query_device(ctx, attr); + if (ret != 0) { + ucs_error("ibv_exp_query_device(%s) returned %d: %m", + ibv_get_device_name(ctx->device), ret); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} +#elif HAVE_DECL_IBV_QUERY_DEVICE_EX + +# define IBV_DEV_ATTR(_dev, _attr) ((_dev)->dev_attr.orig_attr._attr) + +typedef 
struct ibv_device_attr_ex uct_ib_device_attr; + + +/** + * @return true if device name begins with "hns". + */ +static inline int uct_ib_device_is_hns(struct ibv_context *ctx) +{ +#if HAVE_HNS_ROCE +#define UCT_IB_DEVICE_HNS "hns" +#define UCT_IB_DEVICE_HNS_LEN 3 + return !strncmp(ibv_get_device_name(ctx->device), UCT_IB_DEVICE_HNS, UCT_IB_DEVICE_HNS_LEN); +#else + return 0; +#endif +} + +static inline ucs_status_t uct_ib_query_device(struct ibv_context *ctx, + uct_ib_device_attr* attr) { + int ret; + + attr->comp_mask = 0; + if (uct_ib_device_is_hns(ctx)) { + memset(attr, 0, sizeof(*attr)); + ret = ibv_query_device(ctx, &attr->orig_attr); + } else { + ret = ibv_query_device_ex(ctx, NULL, attr); + } + if (ret != 0) { + ucs_error("ibv_query_device_ex(%s) returned %d: %m", + ibv_get_device_name(ctx->device), ret); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + #else -# define IBV_EXP_DEVICE_ATTR_SET_COMP_MASK(_attr) (_attr)->comp_mask = (IBV_EXP_DEVICE_ATTR_RESERVED - 1) -#endif /* HAVE_DECL_IBV_EXP_DEVICE_ATTR_RESERVED_2 */ -# define IBV_EXP_PORT_ATTR_SET_COMP_MASK(_attr) (_attr)->comp_mask = 0 -#endif /* HAVE_VERBS_EXP_H */ -#ifndef IBV_DEV_ATTR # define IBV_DEV_ATTR(_dev, _attr) ((_dev)->dev_attr._attr) + +typedef struct ibv_device_attr uct_ib_device_attr; + +static inline ucs_status_t uct_ib_query_device(struct ibv_context *ctx, + uct_ib_device_attr* attr) { + int ret; + + ret = ibv_query_device(ctx, attr); + if (ret != 0) { + ucs_error("ibv_query_device(%s) returned %d: %m", + ibv_get_device_name(ctx->device), ret); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + #endif @@ -93,7 +165,7 @@ /* * On-demand paging support */ -#if HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS +#ifdef HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS # define IBV_EXP_HAVE_ODP(_attr) ((_attr)->odp_caps.general_odp_caps & IBV_EXP_ODP_SUPPORT) # define IBV_EXP_ODP_CAPS(_attr, _xport) ((_attr)->odp_caps.per_transport_caps._xport##_odp_caps) #else @@ -101,12 +173,34 @@ # define 
IBV_EXP_ODP_CAPS(_attr, _xport) 0 #endif -#if !HAVE_DECL_IBV_EXP_ACCESS_ON_DEMAND -# define IBV_EXP_ACCESS_ON_DEMAND 0 +#if HAVE_ODP +# ifdef HAVE_VERBS_EXP_H +# define IBV_ACCESS_ON_DEMAND IBV_EXP_ACCESS_ON_DEMAND +# define ibv_reg_mr_func_name "ibv_exp_reg_mr" +# else +# define ibv_reg_mr_func_name "ibv_reg_mr" +# endif +#else +# define IBV_ACCESS_ON_DEMAND 0 +# define ibv_reg_mr_func_name "ibv_reg_mr" +#endif + +#if HAVE_ODP_IMPLICIT +# ifdef HAVE_VERBS_EXP_H +# define UCT_IB_HAVE_ODP_IMPLICIT(_attr) ((_attr)->odp_caps.general_odp_caps & IBV_EXP_ODP_SUPPORT_IMPLICIT) +# else +# define UCT_IB_HAVE_ODP_IMPLICIT(_attr) ((_attr)->odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT) +# endif +#else +# define UCT_IB_HAVE_ODP_IMPLICIT(_attr) 0 +#endif + +#if !HAVE_DECL_IBV_ACCESS_RELAXED_ORDERING +# define IBV_ACCESS_RELAXED_ORDERING 0 #endif #if !HAVE_DECL_IBV_EXP_PREFETCH_WRITE_ACCESS -# define IBV_EXP_PREFETCH_WRITE_ACCESS IBV_EXP_ACCESS_LOCAL_WRITE +# define IBV_EXP_PREFETCH_WRITE_ACCESS IBV_EXP_ACCESS_LOCAL_WRITE #endif /* @@ -114,15 +208,6 @@ */ #define IBV_DEVICE_HAS_DC(dev) (dev->flags & UCT_IB_DEVICE_FLAG_DC) -/* - * NOP support - */ -#if HAVE_DECL_IBV_EXP_WR_NOP -# define IBV_DEVICE_HAS_NOP(_attr) ((_attr)->exp_device_cap_flags & IBV_EXP_DEVICE_NOP) -#else -# define IBV_DEVICE_HAS_NOP(_attr) 0 -#endif /* HAVE_DECL_IBV_EXP_WR_NOP */ - /* * Adaptive Routing support */ @@ -201,48 +286,10 @@ static inline int ibv_exp_cq_ignore_overrun(struct ibv_cq *cq) # define IBV_PORT_IS_LINK_LAYER_ETHERNET(_attr) 0 #endif -/* - * HW tag matching - */ -#if IBV_HW_TM -# if HAVE_INFINIBAND_TM_TYPES_H -# include -# else -# define ibv_tmh ibv_exp_tmh -# define ibv_rvh ibv_exp_tmh_rvh -# define ibv_ravh ibv_exp_tmh_ravh -# define IBV_TMH_EAGER IBV_EXP_TMH_EAGER -# define IBV_TMH_RNDV IBV_EXP_TMH_RNDV -# define IBV_TMH_FIN IBV_EXP_TMH_FIN -# define IBV_TMH_NO_TAG IBV_EXP_TMH_NO_TAG -# define IBV_TM_CAP_RC IBV_EXP_TM_CAP_RC -# define IBV_TM_CAP_DC IBV_EXP_TM_CAP_DC -# endif -# if 
HAVE_STRUCT_IBV_TM_CAPS_FLAGS -# define IBV_DEVICE_TM_FLAGS(_dev) ((_dev)->dev_attr.tm_caps.flags) -# else -# define IBV_DEVICE_TM_FLAGS(_dev) ((_dev)->dev_attr.tm_caps.capability_flags) -# endif -# define IBV_DEVICE_TM_CAPS(_dev, _field) ((_dev)->dev_attr.tm_caps._field) +#if HAVE_DECL_IBV_QPF_GRH_REQUIRED +# define uct_ib_grh_required(_attr) ((_attr)->flags & IBV_QPF_GRH_REQUIRED) #else -# define IBV_DEVICE_TM_CAPS(_dev, _field) 0 -# define IBV_TM_CAP_RC 0 -# define IBV_TM_CAP_DC 0 -#endif - -#ifndef IBV_EXP_HW_TM_DC -# define IBV_EXP_TM_CAP_DC 0 -#endif - -#define IBV_DEVICE_MAX_UNEXP_COUNT UCS_BIT(14) -#define IBV_DEVICE_MIN_UWQ_POST 33 - -#if !HAVE_DECL_IBV_EXP_CREATE_SRQ -# if HAVE_DECL_IBV_CREATE_SRQ_EX -# define ibv_exp_create_srq_attr ibv_srq_init_attr_ex -# else -# define ibv_exp_create_srq_attr ibv_srq_init_attr -# endif +# define uct_ib_grh_required(_attr) 0 #endif typedef uint8_t uct_ib_uint24_t[3]; @@ -260,6 +307,44 @@ static inline uint32_t uct_ib_unpack_uint24(const uct_ib_uint24_t buf) return buf[0] | ((uint32_t)buf[1] << 8) | ((uint32_t)buf[2] << 16); } +static inline void uct_ib_destroy_qp(struct ibv_qp *qp) +{ + int ret; + + ret = ibv_destroy_qp(qp); + if (ret) { + ucs_warn("ibv_destroy_qp() failed: %m"); + } +} + +static inline void uct_ib_destroy_srq(struct ibv_srq *srq) +{ + int ret; + + ret = ibv_destroy_srq(srq); + if (ret) { + ucs_warn("ibv_destroy_srq() failed: %m"); + } +} + +static inline ucs_status_t uct_ib_qp_max_send_sge(struct ibv_qp *qp, + uint32_t *max_send_sge) +{ + struct ibv_qp_attr qp_attr; + struct ibv_qp_init_attr qp_init_attr; + int ret; + + ret = ibv_query_qp(qp, &qp_attr, IBV_QP_CAP, &qp_init_attr); + if (ret) { + ucs_error("Failed to query UD QP(ret=%d): %m", ret); + return UCS_ERR_IO_ERROR; + } + + *max_send_sge = qp_attr.cap.max_send_sge; + + return UCS_OK; +} + typedef struct uct_ib_qpnum { uct_ib_uint24_t qp_num; } uct_ib_qpnum_t; diff --git a/src/uct/ib/cm/cm_ep.c b/src/uct/ib/cm/cm_ep.c index 
8c72a679a44..9f2435fd2a5 100644 --- a/src/uct/ib/cm/cm_ep.c +++ b/src/uct/ib/cm/cm_ep.c @@ -6,6 +6,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "cm.h" #include @@ -28,8 +32,6 @@ static UCS_CLASS_INIT_FUNC(uct_cm_ep_t, const uct_ep_params_t *params) UCT_EP_PARAMS_CHECK_DEV_IFACE_ADDRS(params); UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super.super); - uct_ib_address_unpack((const uct_ib_address_t*)params->dev_addr, &self->dlid, - &self->dgid); self->dest_service_id = *(const uint32_t*)params->iface_addr; return UCS_OK; } @@ -49,10 +51,10 @@ static ucs_status_t uct_cm_ep_fill_path_rec(uct_cm_ep_t *ep, { uct_cm_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_cm_iface_t); - path->sgid = iface->super.gid; + path->sgid = iface->super.gid_info.gid; path->dlid = htons(ep->dlid); path->slid = htons(uct_ib_iface_port_attr(&iface->super)->lid); - if (iface->super.is_global_addr) { + if (iface->super.config.force_global_addr) { ucs_assert_always(ep->dgid.global.interface_id != 0); path->dgid = ep->dgid; path->hop_limit = iface->super.config.hop_limit; @@ -65,7 +67,7 @@ static ucs_status_t uct_cm_ep_fill_path_rec(uct_cm_ep_t *ep, path->traffic_class = iface->super.config.traffic_class; path->reversible = htonl(1); /* IBCM currently only supports reversible paths */ path->numb_path = 0; - path->pkey = ntohs(iface->super.pkey_value); + path->pkey = ntohs(iface->super.pkey); path->sl = iface->super.config.sl; path->mtu_selector = 2; /* EQ */ path->mtu = uct_ib_iface_port_attr(&iface->super)->active_mtu; @@ -82,8 +84,8 @@ static void uct_cm_dump_path(struct ibv_sa_path_rec *path) char sgid_buf[256]; char dgid_buf[256]; - inet_ntop(AF_INET6, &path->dgid, dgid_buf, sizeof(dgid_buf)); - inet_ntop(AF_INET6, &path->sgid, sgid_buf, sizeof(sgid_buf)); + uct_ib_gid_str(&path->dgid, dgid_buf, sizeof(dgid_buf)); + uct_ib_gid_str(&path->sgid, sgid_buf, sizeof(sgid_buf)); ucs_trace_data("slid %d sgid %s dlid %d dgid %s", 
ntohs(path->slid), sgid_buf, ntohs(path->dlid), dgid_buf); diff --git a/src/uct/ib/cm/cm_iface.c b/src/uct/ib/cm/cm_iface.c index f3643f29b0a..f54d8a07b8a 100644 --- a/src/uct/ib/cm/cm_iface.c +++ b/src/uct/ib/cm/cm_iface.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "cm.h" #include @@ -16,7 +20,7 @@ static ucs_config_field_t uct_cm_iface_config_table[] = { - {"IB_", "RX_INLINE=0", NULL, + {UCT_IB_CONFIG_PREFIX, "RX_INLINE=0", NULL, ucs_offsetof(uct_cm_iface_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_ib_iface_config_table)}, {"TIMEOUT", "300ms", "Timeout for MAD layer", @@ -191,7 +195,7 @@ static void uct_cm_iface_outstanding_purge(uct_cm_iface_t *iface) iface->num_outstanding = 0; } -static void uct_cm_iface_event_handler(int fd, void *arg) +static void uct_cm_iface_event_handler(int fd, int events, void *arg) { uct_cm_iface_t *iface = arg; struct ib_cm_event *event; @@ -260,7 +264,8 @@ static void uct_cm_iface_event_handler(int fd, void *arg) static void uct_cm_iface_release_desc(uct_recv_desc_t *self, void *desc) { uct_ib_iface_t *iface = ucs_container_of(self, uct_ib_iface_t, release_desc); - ucs_free(desc - iface->config.rx_headroom_offset); + /* Don't use UCS_PTR_BYTE_OFFSET here due to coverity false positive report */ + ucs_free((char*)desc - iface->config.rx_headroom_offset); } static UCS_CLASS_INIT_FUNC(uct_cm_iface_t, uct_md_h md, uct_worker_h worker, @@ -274,10 +279,10 @@ static UCS_CLASS_INIT_FUNC(uct_cm_iface_t, uct_md_h md, uct_worker_h worker, ucs_trace_func(""); - init_attr.tx_cq_len = 1; - init_attr.rx_cq_len = config->super.rx.queue_len; - init_attr.seg_size = ucs_min(IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE, - config->super.super.max_bcopy); + init_attr.cq_len[UCT_IB_DIR_TX] = 1; + init_attr.cq_len[UCT_IB_DIR_RX] = config->super.rx.queue_len; + init_attr.seg_size = ucs_min(IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE, + config->super.seg_size); UCS_CLASS_CALL_SUPER_INIT(uct_ib_iface_t, 
&uct_cm_iface_ops, md, worker, params, &config->super, &init_attr); @@ -346,7 +351,7 @@ static UCS_CLASS_INIT_FUNC(uct_cm_iface_t, uct_md_h md, uct_worker_h worker, } status = ucs_async_set_event_handler(self->super.super.worker->async->mode, - self->cmdev->fd, POLLIN, + self->cmdev->fd, UCS_EVENT_SET_EVREAD, uct_cm_iface_event_handler, self, self->super.super.worker->async); if (status != UCS_OK) { @@ -450,20 +455,18 @@ static uct_ib_iface_ops_t uct_cm_iface_ops = { .iface_is_reachable = uct_ib_iface_is_reachable }, .create_cq = uct_ib_verbs_create_cq, - .arm_cq = (void*)ucs_empty_function_return_success, - .init_res_domain = (void*)ucs_empty_function_return_success, - .cleanup_res_domain = (void*)ucs_empty_function, + .arm_cq = ucs_empty_function_return_success, }; -static int uct_cm_is_module_loaded(uct_md_h md) +static int uct_cm_is_module_loaded(uct_ib_md_t *ib_md) { struct ib_cm_device *cmdev = NULL; - cmdev = ib_cm_open_device(ucs_derived_of(md, uct_ib_md_t)->dev.ibv_context); + cmdev = ib_cm_open_device(ib_md->dev.ibv_context); if (cmdev == NULL) { ucs_debug("ib_cm_open_device() for %s failed: %m. 
" "Check if ib_ucm.ko module is loaded.", - uct_ib_device_name(&ucs_derived_of(md, uct_ib_md_t)->dev)); + uct_ib_device_name(&ib_md->dev)); return 0; } @@ -471,26 +474,21 @@ static int uct_cm_is_module_loaded(uct_md_h md) return 1; } -static ucs_status_t uct_cm_query_resources(uct_md_h md, - uct_tl_resource_desc_t **resources_p, - unsigned *num_resources_p) +static ucs_status_t +uct_cm_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) { - if (uct_cm_is_module_loaded(md)) { - return uct_ib_device_query_tl_resources(&ucs_derived_of(md, uct_ib_md_t)->dev, - "cm", UCT_IB_DEVICE_FLAG_LINK_IB, - resources_p, num_resources_p); - } else { - *num_resources_p = 0; - *resources_p = NULL; + uct_ib_md_t *ib_md = ucs_derived_of(md, uct_ib_md_t); + + if (!uct_cm_is_module_loaded(ib_md)) { + *num_tl_devices_p = 0; + *tl_devices_p = NULL; return UCS_OK; } + + return uct_ib_device_query_ports(&ib_md->dev, UCT_IB_DEVICE_FLAG_LINK_IB, + tl_devices_p, num_tl_devices_p); } -UCT_TL_COMPONENT_DEFINE(uct_cm_tl, - uct_cm_query_resources, - uct_cm_iface_t, - "cm", - "CM_", - uct_cm_iface_config_table, - uct_cm_iface_config_t); -UCT_MD_REGISTER_TL(&uct_ib_mdc, &uct_cm_tl); +UCT_TL_DEFINE(&uct_ib_component, cm, uct_cm_query_tl_devices, uct_cm_iface_t, + "CM_", uct_cm_iface_config_table, uct_cm_iface_config_t); diff --git a/src/uct/ib/cm/configure.m4 b/src/uct/ib/cm/configure.m4 index e85d404d998..b7bc0416a05 100644 --- a/src/uct/ib/cm/configure.m4 +++ b/src/uct/ib/cm/configure.m4 @@ -20,7 +20,7 @@ AS_IF([test "x$with_cm" != xno], [save_LIBS="$LIBS" AC_CHECK_LIB([ibcm], [ib_cm_send_req], [AC_SUBST(IBCM_LIBS, [-libcm]) - uct_ib_modules+=":cm" + uct_ib_modules="${uct_ib_modules}:cm" cm_happy="yes"], [AS_IF([test "x$with_cm" = xyes], [AC_MSG_ERROR([CM requested but lib ibcm not found])], diff --git a/src/uct/ib/configure.m4 b/src/uct/ib/configure.m4 index 3832604e9e5..bc204f6a70a 100644 --- a/src/uct/ib/configure.m4 +++ b/src/uct/ib/configure.m4 
@@ -2,7 +2,7 @@ # Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. # Copyright (C) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. # Copyright (C) The University of Tennessee and the University of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED. -# Copyright (C) Huawei Technologies Co.,Ltd. 2020. ALL RIGHTS RESERVED. +# Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. # # See file LICENSE for terms. # @@ -54,10 +54,7 @@ AC_ARG_WITH([mlx5-dv], [AC_HELP_STRING([--with-mlx5-dv], [Compile with mlx5 Direct Verbs support. Direct Verbs (DV) support provides additional acceleration capabilities that are not available in a - regular mode.])], - [], - [with_mlx5_dv=yes]) - + regular mode.])]) # # TM (IB Tag Matching) Support @@ -76,6 +73,10 @@ AC_ARG_WITH([dm], [], [with_dm=yes]) +# +# DEVX Support +# +AC_ARG_WITH([devx], [], [], [with_devx=check]) # # Check basic IB support: User wanted at least one IB transport, and we found @@ -129,7 +130,7 @@ AS_IF([test "x$with_ib" = "xyes"], save_LDFLAGS="$LDFLAGS" save_CFLAGS="$CFLAGS" save_CPPFLAGS="$CPPFLAGS" - LDFLAGS="$IBVERBS_LDFAGS $LDFLAGS" + LDFLAGS="$IBVERBS_LDFLAGS $LDFLAGS" CFLAGS="$IBVERBS_CFLAGS $CFLAGS" CPPFLAGS="$IBVERBS_CPPFLAGS $CPPFLAGS" AC_CHECK_HEADER([infiniband/verbs_exp.h], @@ -203,9 +204,9 @@ AS_IF([test "x$with_ib" = "xyes"], mlx5dv_init_obj, mlx5dv_create_qp, mlx5dv_is_supported, + mlx5dv_devx_subscribe_devx_event, MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE, - MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE, - MLX5DV_CONTEXT_FLAGS_DEVX], + MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE], [], [], [[#include ]]) AC_CHECK_MEMBERS([struct mlx5dv_cq.cq_uar], [], [], [[#include ]]) @@ -219,6 +220,15 @@ AS_IF([test "x$with_ib" = "xyes"], AC_CHECK_DECLS([ibv_alloc_td], [has_res_domain=yes], [], [[#include ]])]) + AS_IF([test "x$with_devx" != xno], [ + AC_CHECK_DECL(MLX5DV_CONTEXT_FLAGS_DEVX, [ + AC_DEFINE([HAVE_DEVX], [1], [DEVX support]) + have_devx=yes + ], [ + AS_IF([test 
"x$with_devx" != xcheck], + [AC_MSG_ERROR([devx requested but not found])]) + ], [[#include ]])]) + AS_IF([test "x$has_res_domain" = "xyes" -a "x$have_cq_io" = "xyes" ], [], [ with_mlx5_hw=no]) @@ -297,6 +307,9 @@ AS_IF([test "x$with_ib" = "xyes"], [], [[#include ]]) + AC_CHECK_MEMBERS([struct ibv_device_attr_ex.pci_atomic_caps], + [], [], [[#include ]]) + # Extended atomics AS_IF([test "x$have_ext_atomics" != xno], [AC_DEFINE([HAVE_IB_EXT_ATOMICS], 1, [IB extended atomics support])], @@ -310,10 +323,44 @@ AS_IF([test "x$with_ib" = "xyes"], AC_CHECK_DECLS(IBV_EXP_ODP_SUPPORT_IMPLICIT, [], [], [[#include ]]) + AC_CHECK_DECLS(IBV_EXP_ACCESS_ON_DEMAND, [with_odp=yes], [], + [[#include ]]) + + AC_CHECK_DECLS(IBV_ACCESS_ON_DEMAND, [with_odp=yes], [], + [[#include ]]) + + AS_IF([test "x$with_odp" = "xyes" ], [ + AC_DEFINE([HAVE_ODP], 1, [ODP support]) + + AC_CHECK_DECLS(IBV_EXP_ODP_SUPPORT_IMPLICIT, [with_odp_i=yes], [], + [[#include ]]) + + AC_CHECK_DECLS(IBV_ODP_SUPPORT_IMPLICIT, [with_odp_i=yes], [], + [[#include ]]) + + AS_IF([test "x$with_odp_i" = "xyes" ], [ + AC_DEFINE([HAVE_ODP_IMPLICIT], 1, [Implicit ODP support])])]) + + AC_CHECK_DECLS([IBV_ACCESS_RELAXED_ORDERING, + IBV_QPF_GRH_REQUIRED], + [], [], [[#include ]]) + + AC_CHECK_DECLS(ibv_exp_prefetch_mr, [with_prefetch=yes], [], + [[#include ]]) + + AC_CHECK_DECLS(ibv_advise_mr, [with_prefetch=yes], [], + [[#include ]]) + + AS_IF([test "x$with_prefetch" = "xyes" ], [ + AC_DEFINE([HAVE_PREFETCH], 1, [Prefetch support])]) + AC_CHECK_MEMBERS([struct mlx5_wqe_av.base, struct mlx5_grh_av.rmac], [], [], [[#include ]]) + AC_CHECK_MEMBERS([struct mlx5_cqe64.ib_stride_index], + [], [], [[#include ]]) + AC_DEFINE([HAVE_IB], 1, [IB support]) AC_CHECK_DECLS([IBV_EXP_QPT_DC_INI], @@ -342,10 +389,9 @@ AS_IF([test "x$with_ib" = "xyes"], [[#include ]]) ]) AS_IF([test "x$with_ib_hw_tm" = xexp], - [AC_DEFINE([IBV_HW_TM], 1, [IB Tag Matching support]) - AC_CHECK_MEMBERS([struct ibv_exp_create_srq_attr.dc_offload_params], - 
[AC_DEFINE([IBV_EXP_HW_TM_DC], 1, [DC Tag Matching support])], - [], [#include ]) + [AC_CHECK_MEMBERS([struct ibv_exp_create_srq_attr.dc_offload_params], [ + AC_DEFINE([IBV_HW_TM], 1, [IB Tag Matching support]) + ], [], [#include ]) ]) AS_IF([test "x$with_ib_hw_tm" = xupstream], [AC_DEFINE([IBV_HW_TM], 1, [IB Tag Matching support]) @@ -353,19 +399,14 @@ AS_IF([test "x$with_ib" = "xyes"], [#include ])]) # Device Memory support - AS_IF([test "x$with_dm" != xno], - [AC_TRY_COMPILE([#include ], - [ - struct ibv_exp_dm ibv_dm; - struct ibv_exp_alloc_dm_attr dm_attr; - void* a1 = ibv_exp_alloc_dm; - void* a2 = ibv_exp_reg_mr; - void* a3 = ibv_dereg_mr; - void* a4 = ibv_exp_free_dm; - ], - [AC_DEFINE([HAVE_IBV_EXP_DM], 1, [Device Memory support])], - []) - ]) + AS_IF([test "x$with_dm" != xno], [ + AC_CHECK_DECLS([ibv_exp_alloc_dm], + [AC_DEFINE([HAVE_IBV_DM], 1, [Device Memory support]) + AC_DEFINE([HAVE_IBV_EXP_DM], 1, [Device Memory support (EXP)])], + [], [[#include ]]) + AC_CHECK_DECLS([ibv_alloc_dm], + [AC_DEFINE([HAVE_IBV_DM], 1, [Device Memory support])], + [], [[#include ]])]) # Hns RoCE support AC_CHECK_FILE(/usr/lib64/libibverbs/libhns-rdmav25.so, @@ -394,7 +435,7 @@ AS_IF([test "x$with_ib" = "xyes"], CFLAGS="$save_CFLAGS" CPPFLAGS="$save_CPPFLAGS" - uct_modules+=":ib" + uct_modules="${uct_modules}:ib" ], [ with_dc=no @@ -402,7 +443,6 @@ AS_IF([test "x$with_ib" = "xyes"], with_ud=no with_mlx5_hw=no with_mlx5_dv=no - with_ib_hw_tm=no ]) # @@ -414,11 +454,12 @@ AM_CONDITIONAL([HAVE_TL_DC], [test "x$with_dc" != xno]) AM_CONDITIONAL([HAVE_DC_DV], [test -n "$have_dc_dv"]) AM_CONDITIONAL([HAVE_DC_EXP], [test -n "$have_dc_exp"]) AM_CONDITIONAL([HAVE_TL_UD], [test "x$with_ud" != xno]) -AM_CONDITIONAL([HAVE_HNS_ROCE], [test "x$with_hns_roce" != xno]) +AM_CONDITIONAL([HAVE_HNS_ROCE],[test "x$with_hns_roce" != xno]) AM_CONDITIONAL([HAVE_MLX5_HW], [test "x$with_mlx5_hw" != xno]) -AM_CONDITIONAL([HAVE_MLX5_DV], [test "x$with_mlx5_dv" != xno]) +AM_CONDITIONAL([HAVE_MLX5_DV], 
[test "x$with_mlx5_dv" = xyes]) +AM_CONDITIONAL([HAVE_DEVX], [test -n "$have_devx"]) +AM_CONDITIONAL([HAVE_EXP], [test "x$verbs_exp" != xno]) AM_CONDITIONAL([HAVE_MLX5_HW_UD], [test "x$with_mlx5_hw" != xno -a "x$has_get_av" != xno]) -AM_CONDITIONAL([HAVE_IBV_EX_HW_TM], [test "x$with_ib_hw_tm" != xno]) uct_ib_modules="" m4_include([src/uct/ib/cm/configure.m4]) diff --git a/src/uct/ib/dc/dc_mlx5.c b/src/uct/ib/dc/dc_mlx5.c index a6d5264abea..2d7f41785ec 100644 --- a/src/uct/ib/dc/dc_mlx5.c +++ b/src/uct/ib/dc/dc_mlx5.c @@ -1,9 +1,13 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "dc_mlx5.h" #include "dc_mlx5_ep.h" @@ -20,6 +24,9 @@ #include +#define UCT_DC_MLX5_MAX_TX_CQ_LEN (16 * UCS_MBYTE) + + static const char *uct_dc_tx_policy_names[] = { [UCT_DC_TX_POLICY_DCS] = "dcs", [UCT_DC_TX_POLICY_DCS_QUOTA] = "dcs_quota", @@ -29,17 +36,21 @@ static const char *uct_dc_tx_policy_names[] = { /* DC specific parameters, expecting DC_ prefix */ ucs_config_field_t uct_dc_mlx5_iface_config_sub_table[] = { - {"RC_", "IB_TX_QUEUE_LEN=128;RC_FC_ENABLE=y;", NULL, + {"RC_", "IB_TX_QUEUE_LEN=128;FC_ENABLE=y;", NULL, ucs_offsetof(uct_dc_mlx5_iface_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_rc_iface_common_config_table)}, + + {"RC_", "", NULL, + ucs_offsetof(uct_dc_mlx5_iface_config_t, rc_mlx5_common), UCS_CONFIG_TYPE_TABLE(uct_rc_mlx5_common_config_table)}, - {"", "", NULL, + {"UD_", "", NULL, ucs_offsetof(uct_dc_mlx5_iface_config_t, ud_common), UCS_CONFIG_TYPE_TABLE(uct_ud_iface_common_config_table)}, {"NUM_DCI", "8", "Number of DC initiator QPs (DCI) used by the interface " - "(up to " UCS_PP_QUOTE(UCT_DC_MLX5_IFACE_MAX_DCIS) ").", + "(up to " UCS_PP_MAKE_STRING(UCT_DC_MLX5_IFACE_MAX_DCIS) ").", ucs_offsetof(uct_dc_mlx5_iface_config_t, ndci), UCS_CONFIG_TYPE_UINT}, 
{"TX_POLICY", "dcs_quota", @@ -75,14 +86,10 @@ ucs_config_field_t uct_dc_mlx5_iface_config_table[] = { {"DC_", "", NULL, 0, UCS_CONFIG_TYPE_TABLE(uct_dc_mlx5_iface_config_sub_table)}, - {"", "", NULL, + {"UD_", "", NULL, ucs_offsetof(uct_dc_mlx5_iface_config_t, mlx5_ud), UCS_CONFIG_TYPE_TABLE(uct_ud_mlx5_iface_common_config_table)}, - {"", "", NULL, - ucs_offsetof(uct_dc_mlx5_iface_config_t, super.mlx5_common), - UCS_CONFIG_TYPE_TABLE(uct_ib_mlx5_iface_config_table)}, - {NULL} }; @@ -98,15 +105,17 @@ uct_dc_mlx5_ep_create_connected(const uct_ep_params_t *params, uct_ep_h* ep_p) int is_global; uct_ib_mlx5_base_av_t av; struct mlx5_grh_av grh_av; + unsigned path_index; ucs_trace_func(""); UCT_EP_PARAMS_CHECK_DEV_IFACE_ADDRS(params); - ib_addr = (const uct_ib_address_t *)params->dev_addr; - if_addr = (const uct_dc_mlx5_iface_addr_t *)params->iface_addr; + ib_addr = (const uct_ib_address_t *)params->dev_addr; + if_addr = (const uct_dc_mlx5_iface_addr_t *)params->iface_addr; + path_index = UCT_EP_PARAMS_GET_PATH_INDEX(params); - status = uct_ud_mlx5_iface_get_av(&iface->super.super.super, &iface->ud_common, - ib_addr, iface->super.super.super.path_bits[0], + status = uct_ud_mlx5_iface_get_av(&iface->super.super.super, + &iface->ud_common, ib_addr, path_index, &av, &grh_av, &is_global); if (status != UCS_OK) { return UCS_ERR_INVALID_ADDR; @@ -131,7 +140,7 @@ static ucs_status_t uct_dc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr size_t max_put_inline = UCT_IB_MLX5_PUT_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE); ucs_status_t status; -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM if (iface->super.dm.dm != NULL) { max_am_inline = ucs_max(iface->super.dm.dm->seg_len, UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE)); @@ -145,22 +154,23 @@ static ucs_status_t uct_dc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr max_am_inline, UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(UCT_IB_MLX5_AV_FULL_SIZE), UCT_IB_MLX5_AM_ZCOPY_MAX_IOV, - UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(UCT_IB_MLX5_AV_FULL_SIZE), 
- sizeof(uct_rc_mlx5_hdr_t)); + sizeof(uct_rc_mlx5_hdr_t), + UCT_RC_MLX5_RMA_MAX_IOV(UCT_IB_MLX5_AV_FULL_SIZE)); if (status != UCS_OK) { return status; } /* fixup flags and address lengths */ - iface_attr->cap.flags &= ~UCT_IFACE_FLAG_CONNECT_TO_EP; - iface_attr->cap.flags |= UCT_IFACE_FLAG_CONNECT_TO_IFACE; - iface_attr->ep_addr_len = 0; - iface_attr->max_conn_priv = 0; - iface_attr->iface_addr_len = sizeof(uct_dc_mlx5_iface_addr_t); - iface_attr->latency.overhead += 60e-9; /* connect packet + cqe */ + iface_attr->cap.flags &= ~UCT_IFACE_FLAG_CONNECT_TO_EP; + iface_attr->cap.flags |= UCT_IFACE_FLAG_CONNECT_TO_IFACE; + iface_attr->ep_addr_len = 0; + iface_attr->max_conn_priv = 0; + iface_attr->iface_addr_len = sizeof(uct_dc_mlx5_iface_addr_t); + iface_attr->latency.c += 60e-9; /* connect packet + cqe */ uct_rc_mlx5_iface_common_query(&iface->super.super.super, iface_attr, - max_am_inline, UCT_IB_MLX5_AV_FULL_SIZE); + max_am_inline, + UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(UCT_IB_MLX5_AV_FULL_SIZE)); /* Error handling is not supported with random dci policy * TODO: Fix */ @@ -208,17 +218,21 @@ uct_dc_mlx5_poll_tx(uct_dc_mlx5_iface_t *iface) qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER); dci = uct_dc_mlx5_iface_dci_find(iface, qp_num); txqp = &iface->tx.dcis[dci].txqp; - txwq = &iface->tx.dci_wqs[dci]; + txwq = &iface->tx.dcis[dci].txwq; hw_ci = ntohs(cqe->wqe_counter); ucs_trace_poll("dc iface %p tx_cqe: dci[%d] qpn 0x%x txqp %p hw_ci %d", iface, dci, qp_num, txqp, hw_ci); - uct_rc_mlx5_common_update_tx_res(&iface->super.super, txwq, txqp, hw_ci); - uct_dc_mlx5_iface_dci_put(iface, dci); - uct_rc_mlx5_txqp_process_tx_cqe(txqp, cqe, hw_ci); + uct_rc_txqp_available_set(txqp, uct_ib_mlx5_txwq_update_bb(txwq, hw_ci)); + ucs_assert(uct_rc_txqp_available(txqp) <= txwq->bb_max); + uct_dc_mlx5_iface_dci_put(iface, dci); + /* process pending elements prior to CQ entries to + * avoid out-of-order transmission in completion + * callbacks */ 
uct_dc_mlx5_iface_progress_pending(iface); + uct_rc_mlx5_txqp_process_tx_cqe(txqp, cqe, hw_ci); return 1; } @@ -234,42 +248,45 @@ static unsigned uct_dc_mlx5_iface_progress(void *arg) return uct_dc_mlx5_poll_tx(iface); } -#if IBV_EXP_HW_TM_DC static unsigned uct_dc_mlx5_iface_progress_tm(void *arg) { uct_dc_mlx5_iface_t *iface = arg; unsigned count; - count = uct_rc_mlx5_iface_common_poll_rx(&iface->super, 1); + count = uct_rc_mlx5_iface_common_poll_rx(&iface->super, + UCT_RC_MLX5_POLL_FLAG_TM); if (count > 0) { return count; } return uct_dc_mlx5_poll_tx(iface); } -#endif static void UCS_CLASS_DELETE_FUNC_NAME(uct_dc_mlx5_iface_t)(uct_iface_t*); -ucs_status_t uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface, int dci) +ucs_status_t uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface, + uct_dc_dci_t *dci) { + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.super.md, + uct_ib_mlx5_md_t); ucs_status_t status; - ucs_debug("iface %p reset dci[%d]", iface, dci); + ucs_assert(dci->txwq.super.type == UCT_IB_MLX5_OBJ_TYPE_VERBS); + ucs_debug("iface %p reset dci[%p]", iface, dci); /* Synchronize CQ index with the driver, since it would remove pending * completions for this QP (both send and receive) during ibv_destroy_qp(). 
*/ uct_rc_mlx5_iface_common_update_cqs_ci(&iface->super, &iface->super.super.super); - status = uct_ib_modify_qp(iface->tx.dcis[dci].txqp.qp, IBV_QPS_RESET); + status = uct_ib_mlx5_modify_qp_state(md, &dci->txwq.super, IBV_QPS_RESET); uct_rc_mlx5_iface_common_sync_cqs_ci(&iface->super, &iface->super.super.super); uct_rc_mlx5_iface_commom_clean(&iface->super.cq[UCT_IB_DIR_TX], NULL, - iface->tx.dcis[dci].txqp.qp->qp_num); + dci->txwq.super.qp_num); /* Resume posting from to the beginning of the QP */ - uct_ib_mlx5_txwq_reset(&iface->tx.dci_wqs[dci]); + uct_ib_mlx5_txwq_reset(&dci->txwq); return status; } @@ -282,45 +299,105 @@ static void uct_dc_mlx5_iface_event_cq(uct_ib_iface_t *ib_iface, iface->super.cq[dir].cq_sn++; } -static ucs_status_t uct_dc_mlx5_iface_create_qp(uct_ib_iface_t *ib_iface, - uct_ib_qp_attr_t *attr, - struct ibv_qp **qp_p) +static ucs_status_t uct_dc_mlx5_iface_create_qp(uct_dc_mlx5_iface_t *iface, + struct ibv_qp_cap *cap, + uct_dc_dci_t *dci) { - uct_dc_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_dc_mlx5_iface_t); + uct_ib_iface_t *ib_iface = &iface->super.super.super; + uct_ib_mlx5_qp_attr_t attr = {}; + ucs_status_t status; #if HAVE_DC_DV uct_ib_device_t *dev = uct_ib_iface_device(ib_iface); struct mlx5dv_qp_init_attr dv_attr = {}; struct ibv_qp *qp; - uct_ib_iface_fill_attr(ib_iface, attr); - uct_ib_mlx5_iface_fill_attr(ib_iface, &iface->super.mlx5_common, attr); - attr->ibv.cap.max_recv_sge = 0; + uct_rc_mlx5_iface_fill_attr(&iface->super, &attr, + iface->super.super.config.tx_qp_len, + &iface->super.rx.srq); + status = uct_ib_mlx5_iface_fill_attr(ib_iface, &dci->txwq.super, &attr); + if (status != UCS_OK) { + return status; + } + + uct_ib_iface_fill_attr(ib_iface, &attr.super); + attr.super.ibv.cap.max_recv_sge = 0; dv_attr.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_DC; dv_attr.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCI; dv_attr.dc_init_attr.dct_access_key = UCT_IB_KEY; - qp = mlx5dv_create_qp(dev->ibv_context, &attr->ibv, &dv_attr); + qp 
= mlx5dv_create_qp(dev->ibv_context, &attr.super.ibv, &dv_attr); if (qp == NULL) { - ucs_error("iface=%p: failed to create DCI: %m", iface); + ucs_error("mlx5dv_create_qp("UCT_IB_IFACE_FMT", DCI): failed: %m", + UCT_IB_IFACE_ARG(ib_iface)); return UCS_ERR_IO_ERROR; } - attr->cap = attr->ibv.cap; - *qp_p = qp; - - return UCS_OK; + dci->txwq.super.verbs.qp = qp; + dci->txwq.super.qp_num = dci->txwq.super.verbs.qp->qp_num; #else - return uct_ib_mlx5_iface_create_qp(ib_iface, &iface->super.mlx5_common, attr, qp_p); + uct_rc_mlx5_iface_fill_attr(&iface->super, &attr, + iface->super.super.config.tx_qp_len, + &iface->super.rx.srq); + status = uct_ib_mlx5_iface_create_qp(ib_iface, &dci->txwq.super, &attr); + if (status != UCS_OK) { + return status; + } +#endif + + status = uct_rc_txqp_init(&dci->txqp, &iface->super.super, + dci->txwq.super.qp_num + UCS_STATS_ARG(iface->super.super.stats)); + if (status != UCS_OK) { + goto err_qp; + } + + status = uct_dc_mlx5_iface_dci_connect(iface, dci); + if (status != UCS_OK) { + goto err; + } + + if (uct_dc_mlx5_iface_is_dci_rand(iface)) { + ucs_arbiter_group_init(&dci->arb_group); + } else { + dci->ep = NULL; + } + +#if UCS_ENABLE_ASSERT + dci->flags = 0; #endif + + status = uct_ib_mlx5_txwq_init(iface->super.super.super.super.worker, + iface->super.tx.mmio_mode, &dci->txwq, + dci->txwq.super.verbs.qp); + if (status != UCS_OK) { + goto err; + } + + uct_rc_txqp_available_set(&dci->txqp, dci->txwq.bb_max); + *cap = attr.super.ibv.cap; + return UCS_OK; + +err: + uct_rc_txqp_cleanup(&dci->txqp); +err_qp: + ibv_destroy_qp(dci->txwq.super.verbs.qp); + return status; } #if HAVE_DC_DV ucs_status_t uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface, - uct_rc_txqp_t *dci) + uct_dc_dci_t *dci) { + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.super.md, + uct_ib_mlx5_md_t); struct ibv_qp_attr attr; long attr_mask; + if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX) { + return uct_dc_mlx5_iface_devx_dci_connect(iface, 
&dci->txwq.super); + } + + ucs_assert(dci->txwq.super.type == UCT_IB_MLX5_OBJ_TYPE_VERBS); memset(&attr, 0, sizeof(attr)); attr.qp_state = IBV_QPS_INIT; attr.pkey_index = iface->super.super.super.pkey_index; @@ -329,24 +406,25 @@ ucs_status_t uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface, IBV_QP_PKEY_INDEX | IBV_QP_PORT; - if (ibv_modify_qp(dci->qp, &attr, attr_mask)) { - ucs_error("error modifying QP to INIT : %m"); + if (ibv_modify_qp(dci->txwq.super.verbs.qp, &attr, attr_mask)) { + ucs_error("ibv_modify_qp(DCI, INIT) failed : %m"); return UCS_ERR_IO_ERROR; } /* Move QP to the RTR state */ memset(&attr, 0, sizeof(attr)); attr.qp_state = IBV_QPS_RTR; - attr.path_mtu = iface->super.super.config.path_mtu; - attr.min_rnr_timer = iface->super.super.config.min_rnr_timer; - attr.max_dest_rd_atomic = 1; - attr.ah_attr.is_global = iface->super.super.super.is_global_addr; + attr.path_mtu = iface->super.super.super.config.path_mtu; + attr.ah_attr.is_global = iface->super.super.super.config.force_global_addr; attr.ah_attr.sl = iface->super.super.super.config.sl; + /* ib_core expects valied ah_attr::port_num when IBV_QP_AV is set */ + attr.ah_attr.port_num = iface->super.super.super.config.port_num; attr_mask = IBV_QP_STATE | - IBV_QP_PATH_MTU; + IBV_QP_PATH_MTU | + IBV_QP_AV; - if (ibv_modify_qp(dci->qp, &attr, attr_mask)) { - ucs_error("error modifying DCI QP to RTR: %m"); + if (ibv_modify_qp(dci->txwq.super.verbs.qp, &attr, attr_mask)) { + ucs_error("ibv_modify_qp(DCI, RTR) failed : %m"); return UCS_ERR_IO_ERROR; } @@ -364,8 +442,8 @@ ucs_status_t uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface, IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC; - if (ibv_modify_qp(dci->qp, &attr, attr_mask)) { - ucs_error("error modifying DCI QP to RTS: %m"); + if (ibv_modify_qp(dci->txwq.super.verbs.qp, &attr, attr_mask)) { + ucs_error("ibv_modify_qp(DCI, RTS) failed : %m"); return UCS_ERR_IO_ERROR; } @@ -374,29 +452,34 @@ ucs_status_t 
uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface, ucs_status_t uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface) { + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.super.md, + uct_ib_mlx5_md_t); uct_ib_device_t *dev = uct_ib_iface_device(&iface->super.super.super); struct mlx5dv_qp_init_attr dv_init_attr = {}; struct ibv_qp_init_attr_ex init_attr = {}; struct ibv_qp_attr attr = {}; int ret; + if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_DCT) { + return uct_dc_mlx5_iface_devx_create_dct(iface); + } + init_attr.comp_mask = IBV_QP_INIT_ATTR_PD; init_attr.pd = uct_ib_iface_md(&iface->super.super.super)->pd; init_attr.recv_cq = iface->super.super.super.cq[UCT_IB_DIR_RX]; /* DCT can't send, but send_cq have to point to valid CQ */ init_attr.send_cq = iface->super.super.super.cq[UCT_IB_DIR_RX]; - init_attr.srq = iface->super.super.rx.srq.srq; + init_attr.srq = iface->super.rx.srq.verbs.srq; init_attr.qp_type = IBV_QPT_DRIVER; - init_attr.cap.max_inline_data = iface->super.super.config.rx_inline; dv_init_attr.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_DC; dv_init_attr.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCT; dv_init_attr.dc_init_attr.dct_access_key = UCT_IB_KEY; - iface->rx_dct = mlx5dv_create_qp(dev->ibv_context, - &init_attr, &dv_init_attr); - if (iface->rx_dct == NULL) { - ucs_error("Failed to created DC target %m"); + iface->rx.dct.verbs.qp = mlx5dv_create_qp(dev->ibv_context, + &init_attr, &dv_init_attr); + if (iface->rx.dct.verbs.qp == NULL) { + ucs_error("mlx5dv_create_qp(DCT) failed: %m"); return UCS_ERR_INVALID_PARAM; } @@ -407,10 +490,10 @@ ucs_status_t uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface) IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; - ret = ibv_modify_qp(iface->rx_dct, &attr, IBV_QP_STATE | - IBV_QP_PKEY_INDEX | - IBV_QP_PORT | - IBV_QP_ACCESS_FLAGS); + ret = ibv_modify_qp(iface->rx.dct.verbs.qp, &attr, IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS); if (ret) { ucs_error("error 
modifying DCT to INIT: %m"); @@ -418,78 +501,62 @@ ucs_status_t uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface) } attr.qp_state = IBV_QPS_RTR; - attr.path_mtu = iface->super.super.config.path_mtu; + attr.path_mtu = iface->super.super.super.config.path_mtu; attr.min_rnr_timer = iface->super.super.config.min_rnr_timer; + attr.ah_attr.is_global = iface->super.super.super.config.force_global_addr; attr.ah_attr.grh.hop_limit = iface->super.super.super.config.hop_limit; attr.ah_attr.grh.traffic_class = iface->super.super.super.config.traffic_class; - attr.ah_attr.grh.sgid_index = uct_ib_iface_md(&iface->super.super.super)->config.gid_index; + attr.ah_attr.grh.sgid_index = iface->super.super.super.gid_info.gid_index; attr.ah_attr.port_num = iface->super.super.super.config.port_num; - ret = ibv_modify_qp(iface->rx_dct, &attr, IBV_QP_STATE | - IBV_QP_MIN_RNR_TIMER | - IBV_QP_AV | - IBV_QP_PATH_MTU); + ret = ibv_modify_qp(iface->rx.dct.verbs.qp, &attr, IBV_QP_STATE | + IBV_QP_MIN_RNR_TIMER | + IBV_QP_AV | + IBV_QP_PATH_MTU); if (ret) { ucs_error("error modifying DCT to RTR: %m"); goto err; } + iface->rx.dct.type = UCT_IB_MLX5_OBJ_TYPE_VERBS; + iface->rx.dct.qp_num = iface->rx.dct.verbs.qp->qp_num; return UCS_OK; err: - ibv_destroy_qp(iface->rx_dct); + uct_ib_destroy_qp(iface->rx.dct.verbs.qp); return UCS_ERR_IO_ERROR; } -int uct_dc_mlx5_get_dct_num(uct_dc_mlx5_iface_t *iface) -{ - return iface->rx_dct->qp_num; -} - void uct_dc_mlx5_destroy_dct(uct_dc_mlx5_iface_t *iface) { - if (iface->rx_dct != NULL) { - ibv_destroy_qp(iface->rx_dct); - iface->rx_dct = NULL; - } -} + switch (iface->rx.dct.type) { + case UCT_IB_MLX5_OBJ_TYPE_VERBS: + uct_ib_destroy_qp(iface->rx.dct.verbs.qp); + break; + case UCT_IB_MLX5_OBJ_TYPE_DEVX: +#if HAVE_DEVX + mlx5dv_devx_obj_destroy(iface->rx.dct.devx.obj); #endif - -static ucs_status_t uct_dc_mlx5_iface_init_dcis(uct_dc_mlx5_iface_t *iface, - uct_ib_mlx5_mmio_mode_t mmio_mode) -{ - ucs_status_t status; - uint16_t bb_max; - int i; - - 
bb_max = 0; - for (i = 0; i < iface->tx.ndci; i++) { - status = uct_ib_mlx5_txwq_init(iface->super.super.super.super.worker, - mmio_mode, &iface->tx.dci_wqs[i], - iface->tx.dcis[i].txqp.qp); - if (status != UCS_OK) { - return status; - } - - - bb_max = iface->tx.dci_wqs[i].bb_max; - uct_rc_txqp_available_set(&iface->tx.dcis[i].txqp, bb_max); + break; + case UCT_IB_MLX5_OBJ_TYPE_LAST: + break; } - - iface->super.super.config.tx_qp_len = bb_max; - return UCS_OK; } +#endif static void uct_dc_mlx5_iface_cleanup_dcis(uct_dc_mlx5_iface_t *iface) { int i; for (i = 0; i < iface->tx.ndci; i++) { - uct_ib_mlx5_txwq_cleanup(&iface->tx.dci_wqs[i]); + if (uct_dc_mlx5_iface_is_dci_rand(iface)) { + ucs_arbiter_group_cleanup(&iface->tx.dcis[i].arb_group); + } + uct_ib_mlx5_txwq_cleanup(&iface->tx.dcis[i].txwq); } } -#if IBV_EXP_HW_TM_DC || HAVE_DC_EXP +#ifdef HAVE_DC_EXP static uint64_t uct_dc_mlx5_iface_ooo_flag(uct_dc_mlx5_iface_t *iface, uint64_t flag, char *str, uint32_t qp_num) @@ -511,43 +578,89 @@ uct_dc_mlx5_iface_ooo_flag(uct_dc_mlx5_iface_t *iface, uint64_t flag, static ucs_status_t uct_dc_mlx5_init_rx(uct_rc_iface_t *rc_iface, - const uct_rc_iface_config_t *rc_config) + const uct_rc_iface_common_config_t *rc_config) { - uct_dc_mlx5_iface_t *iface = ucs_derived_of(rc_iface, uct_dc_mlx5_iface_t); + uct_ib_mlx5_md_t *md = ucs_derived_of(rc_iface->super.super.md, + uct_ib_mlx5_md_t); + uct_dc_mlx5_iface_config_t *config = ucs_derived_of(rc_config, + uct_dc_mlx5_iface_config_t); + uct_dc_mlx5_iface_t *iface = ucs_derived_of(rc_iface, + uct_dc_mlx5_iface_t); + struct ibv_srq_init_attr_ex srq_attr = {}; + ucs_status_t status; -#if IBV_EXP_HW_TM_DC - uct_dc_mlx5_iface_config_t *config = ucs_derived_of(rc_config, - uct_dc_mlx5_iface_config_t); if (UCT_RC_MLX5_TM_ENABLED(&iface->super)) { - struct ibv_exp_create_srq_attr srq_attr = {}; - struct ibv_exp_srq_dc_offload_params dc_op = {}; - - iface->super.super.progress = uct_dc_mlx5_iface_progress_tm; - - dc_op.timeout = 
rc_iface->config.timeout; - dc_op.path_mtu = rc_iface->config.path_mtu; - dc_op.pkey_index = rc_iface->super.pkey_index; - dc_op.sl = rc_iface->super.config.sl; - dc_op.dct_key = UCT_IB_KEY; - dc_op.ooo_caps = uct_dc_mlx5_iface_ooo_flag(iface, - IBV_EXP_OOO_SUPPORT_RW_DATA_PLACEMENT, - "TM XRQ", 0); - - srq_attr.comp_mask = IBV_EXP_CREATE_SRQ_DC_OFFLOAD_PARAMS; - srq_attr.dc_offload_params = &dc_op; - - return uct_rc_mlx5_init_rx_tm(&iface->super, &config->super, - &srq_attr, - sizeof(struct ibv_rvh) + - sizeof(struct ibv_ravh), 0); - } + if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_DC_SRQ) { + status = uct_rc_mlx5_devx_init_rx_tm(&iface->super, &config->super, + 1, UCT_DC_RNDV_HDR_LEN); + if (status != UCS_OK) { + goto err; + } + + status = uct_dc_mlx5_iface_devx_set_srq_dc_params(iface); + if (status != UCS_OK) { + goto err_free_srq; + } + } else { +#ifdef HAVE_STRUCT_IBV_EXP_CREATE_SRQ_ATTR_DC_OFFLOAD_PARAMS + struct ibv_exp_srq_dc_offload_params dc_op = {}; + + dc_op.timeout = rc_iface->config.timeout; + dc_op.path_mtu = rc_iface->super.config.path_mtu; + dc_op.pkey_index = rc_iface->super.pkey_index; + dc_op.sl = rc_iface->super.config.sl; + dc_op.dct_key = UCT_IB_KEY; + dc_op.ooo_caps = uct_dc_mlx5_iface_ooo_flag(iface, + IBV_EXP_OOO_SUPPORT_RW_DATA_PLACEMENT, + "TM XRQ", 0); + + srq_attr.comp_mask = IBV_EXP_CREATE_SRQ_DC_OFFLOAD_PARAMS; + srq_attr.dc_offload_params = &dc_op; #endif + status = uct_rc_mlx5_init_rx_tm(&iface->super, &config->super, + &srq_attr, UCT_DC_RNDV_HDR_LEN); + if (status != UCS_OK) { + goto err; + } + } + + iface->super.super.progress = uct_dc_mlx5_iface_progress_tm; + return status; + } + + /* MP XRQ is supported with HW TM only */ + ucs_assert(!UCT_RC_MLX5_MP_ENABLED(&iface->super)); + + if (ucs_test_all_flags(md->flags, UCT_IB_MLX5_MD_FLAG_RMP | + UCT_IB_MLX5_MD_FLAG_DEVX_DC_SRQ)) { + status = uct_rc_mlx5_devx_init_rx(&iface->super, &config->super); + } else { + status = uct_rc_mlx5_common_iface_init_rx(&iface->super, rc_config); + } + + 
if (status != UCS_OK) { + goto err; + } iface->super.super.progress = uct_dc_mlx5_iface_progress; - return uct_rc_iface_init_rx(rc_iface, rc_config); + return UCS_OK; + +err_free_srq: + uct_rc_mlx5_destroy_srq(md, &iface->super.rx.srq); +err: + return status; } -#if HAVE_DC_EXP +void uct_dc_mlx5_cleanup_rx(uct_rc_iface_t *rc_iface) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(rc_iface->super.super.md, + uct_ib_mlx5_md_t); + uct_dc_mlx5_iface_t *iface = ucs_derived_of(rc_iface, uct_dc_mlx5_iface_t); + + uct_rc_mlx5_destroy_srq(md, &iface->super.rx.srq); +} + +#ifdef HAVE_DC_EXP ucs_status_t uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface) { struct ibv_exp_dct_init_attr init_attr; @@ -556,35 +669,36 @@ ucs_status_t uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface) init_attr.pd = uct_ib_iface_md(&iface->super.super.super)->pd; init_attr.cq = iface->super.super.super.cq[UCT_IB_DIR_RX]; - init_attr.srq = iface->super.super.rx.srq.srq; + init_attr.srq = iface->super.rx.srq.verbs.srq; init_attr.dc_key = UCT_IB_KEY; init_attr.port = iface->super.super.super.config.port_num; - init_attr.mtu = iface->super.super.config.path_mtu; + init_attr.mtu = iface->super.super.super.config.path_mtu; init_attr.access_flags = IBV_EXP_ACCESS_REMOTE_WRITE | IBV_EXP_ACCESS_REMOTE_READ | IBV_EXP_ACCESS_REMOTE_ATOMIC; init_attr.min_rnr_timer = iface->super.super.config.min_rnr_timer; init_attr.tclass = iface->super.super.super.config.traffic_class; init_attr.hop_limit = iface->super.super.super.config.hop_limit; - init_attr.gid_index = iface->super.super.super.config.gid_index; - init_attr.inline_size = iface->super.super.config.rx_inline; + init_attr.gid_index = iface->super.super.super.gid_info.gid_index; + init_attr.inline_size = iface->super.super.super.config.max_inl_cqe[UCT_IB_DIR_RX]; init_attr.pkey_index = iface->super.super.super.pkey_index; init_attr.create_flags |= uct_dc_mlx5_iface_ooo_flag(iface, IBV_EXP_DCT_OOO_RW_DATA_PLACEMENT, "DCT", 0); - iface->rx_dct = 
ibv_exp_create_dct(uct_ib_iface_device(&iface->super.super.super)->ibv_context, - &init_attr); - if (iface->rx_dct == NULL) { + iface->rx.dct.verbs.dct = ibv_exp_create_dct(uct_ib_iface_device(&iface->super.super.super)->ibv_context, + &init_attr); + if (iface->rx.dct.verbs.dct == NULL) { ucs_error("failed to create DC target: %m"); return UCS_ERR_INVALID_PARAM; } + iface->rx.dct.qp_num = iface->rx.dct.verbs.dct->dct_num; return UCS_OK; } /* take dc qp to rts state */ ucs_status_t uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface, - uct_rc_txqp_t *dci) + uct_dc_dci_t *dci) { struct ibv_exp_qp_attr attr; long attr_mask; @@ -600,28 +714,27 @@ ucs_status_t uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface, IBV_EXP_QP_PORT | IBV_EXP_QP_DC_KEY; - if (ibv_exp_modify_qp(dci->qp, &attr, attr_mask)) { - ucs_error("error modifying QP to INIT : %m"); + if (ibv_exp_modify_qp(dci->txwq.super.verbs.qp, &attr, attr_mask)) { + ucs_error("ibv_exp_modify_qp(DCI, INIT) failed : %m"); return UCS_ERR_IO_ERROR; } /* Move QP to the RTR state */ ooo_qp_flag = uct_dc_mlx5_iface_ooo_flag(iface, IBV_EXP_QP_OOO_RW_DATA_PLACEMENT, - "DCI QP 0x", dci->qp->qp_num); + "DCI QP 0x", dci->txwq.super.qp_num); memset(&attr, 0, sizeof(attr)); attr.qp_state = IBV_QPS_RTR; - attr.path_mtu = iface->super.super.config.path_mtu; - attr.max_dest_rd_atomic = 1; - attr.ah_attr.is_global = iface->super.super.super.is_global_addr; + attr.path_mtu = iface->super.super.super.config.path_mtu; + attr.ah_attr.is_global = iface->super.super.super.config.force_global_addr; attr.ah_attr.sl = iface->super.super.super.config.sl; attr_mask = IBV_EXP_QP_STATE | IBV_EXP_QP_PATH_MTU | IBV_EXP_QP_AV | ooo_qp_flag; - if (ibv_exp_modify_qp(dci->qp, &attr, attr_mask)) { - ucs_error("error modifying QP to RTR: %m"); + if (ibv_exp_modify_qp(dci->txwq.super.verbs.qp, &attr, attr_mask)) { + ucs_error("ibv_exp_modify_qp(DCI, RTR) failed : %m"); return UCS_ERR_IO_ERROR; } @@ -638,25 +751,17 @@ ucs_status_t 
uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface, IBV_EXP_QP_RNR_RETRY | IBV_EXP_QP_MAX_QP_RD_ATOMIC; - if (ibv_exp_modify_qp(dci->qp, &attr, attr_mask)) { - ucs_error("error modifying QP to RTS: %m"); + if (ibv_exp_modify_qp(dci->txwq.super.verbs.qp, &attr, attr_mask)) { + ucs_error("ibv_exp_modify_qp(DCI, RTS) failed : %m"); return UCS_ERR_IO_ERROR; } return UCS_OK; } -int uct_dc_mlx5_get_dct_num(uct_dc_mlx5_iface_t *iface) -{ - return iface->rx_dct->dct_num; -} - void uct_dc_mlx5_destroy_dct(uct_dc_mlx5_iface_t *iface) { - if (iface->rx_dct != NULL) { - ibv_exp_destroy_dct(iface->rx_dct); - iface->rx_dct = NULL; - } + ibv_exp_destroy_dct(iface->rx.dct.verbs.dct); } #endif @@ -665,13 +770,14 @@ void uct_dc_mlx5_iface_dcis_destroy(uct_dc_mlx5_iface_t *iface, int max) int i; for (i = 0; i < max; i++) { uct_rc_txqp_cleanup(&iface->tx.dcis[i].txqp); + ucs_assert(iface->tx.dcis[i].txwq.super.type == UCT_IB_MLX5_OBJ_TYPE_VERBS); + uct_ib_destroy_qp(iface->tx.dcis[i].txwq.super.verbs.qp); } } -ucs_status_t uct_dc_mlx5_iface_create_dcis(uct_dc_mlx5_iface_t *iface, - uct_dc_mlx5_iface_config_t *config) +static ucs_status_t uct_dc_mlx5_iface_create_dcis(uct_dc_mlx5_iface_t *iface) { - struct ibv_qp_cap cap; + struct ibv_qp_cap cap = {}; ucs_status_t status; int i; @@ -680,25 +786,17 @@ ucs_status_t uct_dc_mlx5_iface_create_dcis(uct_dc_mlx5_iface_t *iface, iface->tx.stack_top = 0; for (i = 0; i < iface->tx.ndci; i++) { ucs_assert(iface->super.super.super.config.qp_type == UCT_IB_QPT_DCI); - status = uct_rc_txqp_init(&iface->tx.dcis[i].txqp, &iface->super.super, - &cap UCS_STATS_ARG(iface->super.super.stats)); - if (status != UCS_OK) { - goto err; - } - status = uct_dc_mlx5_iface_dci_connect(iface, &iface->tx.dcis[i].txqp); + status = uct_dc_mlx5_iface_create_qp(iface, &cap, &iface->tx.dcis[i]); if (status != UCS_OK) { - uct_rc_txqp_cleanup(&iface->tx.dcis[i].txqp); goto err; } iface->tx.dcis_stack[i] = i; - iface->tx.dcis[i].ep = NULL; -#if ENABLE_ASSERT - 
iface->tx.dcis[i].flags = 0; -#endif } - uct_ib_iface_set_max_iov(&iface->super.super.super, cap.max_send_sge); + + iface->super.super.config.tx_qp_len = iface->tx.dcis[0].txwq.bb_max; + return UCS_OK; err: @@ -753,9 +851,11 @@ uct_dc_mlx5_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t *iface_addr { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_dc_mlx5_iface_t); uct_dc_mlx5_iface_addr_t *addr = (uct_dc_mlx5_iface_addr_t *)iface_addr; + uct_ib_md_t *md = uct_ib_iface_md(ucs_derived_of(iface, + uct_ib_iface_t)); - uct_ib_pack_uint24(addr->qp_num, uct_dc_mlx5_get_dct_num(iface)); - addr->atomic_mr_id = uct_ib_iface_get_atomic_mr_id(&iface->super.super.super); + uct_ib_pack_uint24(addr->qp_num, iface->rx.dct.qp_num); + uct_ib_mlx5_md_get_atomic_mr_id(md, &addr->atomic_mr_id); addr->flags = iface->version_flag; if (UCT_RC_MLX5_TM_ENABLED(&iface->super)) { addr->flags |= UCT_DC_MLX5_IFACE_ADDR_HW_TM; @@ -764,36 +864,23 @@ uct_dc_mlx5_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t *iface_addr return UCS_OK; } -ucs_status_t uct_dc_device_query_tl_resources(uct_ib_device_t *dev, - const char *tl_name, unsigned flags, - uct_tl_resource_desc_t **resources_p, - unsigned *num_resources_p) -{ - return uct_ib_device_query_tl_resources(dev, tl_name, - flags | UCT_IB_DEVICE_FLAG_DC, - resources_p, num_resources_p); -} - static inline ucs_status_t uct_dc_mlx5_iface_flush_dcis(uct_dc_mlx5_iface_t *iface) { - int is_flush_done = 1; - uct_dc_mlx5_ep_t *ep; int i; + if (iface->tx.fc_grants) { + /* If some ep is waiting for grant it may have some pending + * operations, while all QP resources are available. 
*/ + return UCS_INPROGRESS; + } + for (i = 0; i < iface->tx.ndci; i++) { - /* TODO: Remove this check - no need to wait for grant, because we - * use gc_list for removed eps */ - if (!uct_dc_mlx5_iface_is_dci_rand(iface)) { - ep = uct_dc_mlx5_ep_from_dci(iface, i); - if ((ep != NULL) && uct_dc_mlx5_ep_fc_wait_for_grant(ep)) { - return UCS_INPROGRESS; - } - } if (uct_dc_mlx5_iface_flush_dci(iface, i) != UCS_OK) { - is_flush_done = 0; + return UCS_INPROGRESS; } } - return is_flush_done ? UCS_OK : UCS_INPROGRESS; + + return UCS_OK; } ucs_status_t uct_dc_mlx5_iface_flush(uct_iface_h tl_iface, unsigned flags, uct_completion_t *comp) @@ -804,6 +891,12 @@ ucs_status_t uct_dc_mlx5_iface_flush(uct_iface_h tl_iface, unsigned flags, uct_c if (comp != NULL) { return UCS_ERR_UNSUPPORTED; } + + status = uct_rc_iface_fence_relaxed_order(tl_iface); + if (status != UCS_OK) { + return status; + } + status = uct_dc_mlx5_iface_flush_dcis(iface); if (status == UCS_OK) { UCT_TL_IFACE_STAT_FLUSH(&iface->super.super.super.super); @@ -909,15 +1002,18 @@ ucs_status_t uct_dc_mlx5_iface_fc_handler(uct_rc_iface_t *rc_iface, unsigned qp_ status = uct_dc_mlx5_iface_fc_grant(&dc_req->super.super); if (status == UCS_ERR_NO_RESOURCE){ - status = uct_ep_pending_add(&ep->super.super, &dc_req->super.super, - 0); + uct_dc_mlx5_ep_pending_common(iface, ep, &dc_req->super.super, 0, 1); + } else { + ucs_assertv_always(status == UCS_OK, + "Failed to send FC grant msg: %s", + ucs_status_string(status)); } - ucs_assertv_always(status == UCS_OK, "Failed to send FC grant msg: %s", - ucs_status_string(status)); } else if (fc_hdr == UCT_RC_EP_FC_PURE_GRANT) { ep = *((uct_dc_mlx5_ep_t**)(hdr + 1)); if (!(ep->flags & UCT_DC_MLX5_EP_FLAG_VALID)) { + /* Just remove ep now, no need to clear waiting for grant state + * (it was done in destroy_ep func) */ uct_dc_mlx5_ep_release(ep); return UCS_OK; } @@ -928,7 +1024,7 @@ ucs_status_t uct_dc_mlx5_iface_fc_handler(uct_rc_iface_t *rc_iface, unsigned qp_ ep->fc.fc_wnd = 
rc_iface->config.fc_wnd_size; /* Clear the flag for flush to complete */ - ep->fc.flags &= ~UCT_DC_MLX5_EP_FC_FLAG_WAIT_FOR_GRANT; + uct_dc_mlx5_ep_clear_fc_grant_flag(iface, ep); UCS_STATS_UPDATE_COUNTER(ep->fc.stats, UCT_RC_FC_STAT_RX_PURE_GRANT, 1); UCS_STATS_SET_COUNTER(ep->fc.stats, UCT_RC_FC_STAT_FC_WND, ep->fc.fc_wnd); @@ -953,14 +1049,6 @@ ucs_status_t uct_dc_mlx5_iface_fc_handler(uct_rc_iface_t *rc_iface, unsigned qp_ return UCS_OK; } -void uct_dc_mlx5_iface_set_av_sport(uct_dc_mlx5_iface_t *iface, - uct_ib_mlx5_base_av_t *av, - uint32_t remote_dctn) -{ - uct_ib_mlx5_iface_set_av_sport(&iface->super.super.super, av, - remote_dctn ^ uct_dc_mlx5_get_dct_num(iface)); -} - static void uct_dc_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface, void *arg, ucs_status_t status) { @@ -969,67 +1057,25 @@ static void uct_dc_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface, uint32_t qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER); uint8_t dci = uct_dc_mlx5_iface_dci_find(iface, qp_num); - uct_rc_txqp_t *txqp = &iface->tx.dcis[dci].txqp; uct_dc_mlx5_ep_t *ep; - ucs_status_t ep_status; - int16_t outstanding; - - if (uct_dc_mlx5_iface_is_dci_rand(iface) || - (uct_dc_mlx5_ep_from_dci(iface, dci) == NULL)) { - uct_ib_mlx5_completion_with_err(ib_iface, arg, &iface->tx.dci_wqs[dci], - ib_iface->super.config.failure_level); - return; - } - - ep = uct_dc_mlx5_ep_from_dci(iface, dci); - - uct_rc_txqp_purge_outstanding(txqp, status, 0); - - /* poll_cqe for mlx5 returns NULL in case of failure and the cq_avaialble - is not updated for the error cqe and all outstanding wqes*/ - outstanding = (int16_t)iface->super.super.config.tx_qp_len - - uct_rc_txqp_available(txqp); - iface->super.super.tx.cq_available += outstanding; - uct_rc_txqp_available_set(txqp, (int16_t)iface->super.super.config.tx_qp_len); + ucs_log_level_t level; - /* since we removed all outstanding ops on the dci, it should be released */ - ucs_assert(ep->dci != UCT_DC_MLX5_EP_NO_DCI); - 
uct_dc_mlx5_iface_dci_put(iface, dci); - ucs_assert_always(ep->dci == UCT_DC_MLX5_EP_NO_DCI); - - if (ep == iface->tx.fc_ep) { - /* Cannot handle errors on flow-control endpoint. - * Or shall we ignore them? - */ - ucs_debug("got error on DC flow-control endpoint, iface %p: %s", iface, - ucs_status_string(status)); - ep_status = UCS_OK; + if (uct_dc_mlx5_iface_is_dci_rand(iface)) { + ep = NULL; + level = UCS_LOG_LEVEL_FATAL; /* error handling is not supported with rand dci */ } else { - ep_status = iface->super.super.super.ops->set_ep_failed(ib_iface, - &ep->super.super, - status); - if (ep_status != UCS_OK) { - uct_ib_mlx5_completion_with_err(ib_iface, arg, - &iface->tx.dci_wqs[dci], - UCS_LOG_LEVEL_FATAL); - return; - } + ep = uct_dc_mlx5_ep_from_dci(iface, dci); + level = ib_iface->super.config.failure_level; } - uct_ib_mlx5_completion_with_err(ib_iface, arg, &iface->tx.dci_wqs[dci], - ib_iface->super.config.failure_level); - - status = uct_dc_mlx5_iface_reset_dci(iface, dci); - if (status != UCS_OK) { - ucs_fatal("iface %p failed to reset dci[%d] qpn 0x%x: %s", - iface, dci, txqp->qp->qp_num, ucs_status_string(status)); + if (ep == NULL) { + uct_ib_mlx5_completion_with_err(ib_iface, arg, &iface->tx.dcis[dci].txwq, + level); + return; } - status = uct_dc_mlx5_iface_dci_connect(iface, txqp); - if (status != UCS_OK) { - ucs_fatal("iface %p failed to connect dci[%d] qpn 0x%x: %s", - iface, dci, txqp->qp->qp_num, ucs_status_string(status)); - } + ep = uct_dc_mlx5_ep_from_dci(iface, dci); + uct_dc_mlx5_ep_handle_failure(ep, arg, status); } static uct_rc_iface_ops_t uct_dc_mlx5_iface_ops = { @@ -1053,7 +1099,7 @@ static uct_rc_iface_ops_t uct_dc_mlx5_iface_ops = { .ep_pending_purge = uct_dc_mlx5_ep_pending_purge, .ep_flush = uct_dc_mlx5_ep_flush, .ep_fence = uct_dc_mlx5_ep_fence, -#if IBV_EXP_HW_TM_DC +#if IBV_HW_TM .ep_tag_eager_short = uct_dc_mlx5_ep_tag_eager_short, .ep_tag_eager_bcopy = uct_dc_mlx5_ep_tag_eager_bcopy, .ep_tag_eager_zcopy = 
uct_dc_mlx5_ep_tag_eager_zcopy, @@ -1064,7 +1110,7 @@ static uct_rc_iface_ops_t uct_dc_mlx5_iface_ops = { .iface_tag_recv_cancel = uct_dc_mlx5_iface_tag_recv_cancel, #endif .iface_flush = uct_dc_mlx5_iface_flush, - .iface_fence = uct_rc_mlx5_iface_fence, + .iface_fence = uct_rc_iface_fence, .iface_progress_enable = uct_dc_mlx5_iface_progress_enable, .iface_progress_disable = uct_base_iface_progress_disable, .iface_progress = uct_rc_iface_do_progress, @@ -1083,35 +1129,24 @@ static uct_rc_iface_ops_t uct_dc_mlx5_iface_ops = { .event_cq = uct_dc_mlx5_iface_event_cq, .handle_failure = uct_dc_mlx5_iface_handle_failure, .set_ep_failed = uct_dc_mlx5_ep_set_failed, - .create_qp = uct_dc_mlx5_iface_create_qp, - .init_res_domain = uct_rc_mlx5_init_res_domain, - .cleanup_res_domain = uct_rc_mlx5_cleanup_res_domain, }, .init_rx = uct_dc_mlx5_init_rx, + .cleanup_rx = uct_dc_mlx5_cleanup_rx, .fc_ctrl = uct_dc_mlx5_ep_fc_ctrl, .fc_handler = uct_dc_mlx5_iface_fc_handler, }; -static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h md, uct_worker_h worker, +static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h tl_md, uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *tl_config) { uct_dc_mlx5_iface_config_t *config = ucs_derived_of(tl_config, uct_dc_mlx5_iface_config_t); + uct_ib_mlx5_md_t *md = ucs_derived_of(tl_md, uct_ib_mlx5_md_t); uct_ib_iface_init_attr_t init_attr = {}; ucs_status_t status; ucs_trace_func(""); - init_attr.tm_cap_bit = IBV_EXP_TM_CAP_DC; - init_attr.qp_type = UCT_IB_QPT_DCI; - init_attr.flags = UCT_IB_CQ_IGNORE_OVERRUN; - init_attr.fc_req_size = sizeof(uct_dc_fc_request_t); - init_attr.rx_hdr_len = sizeof(uct_rc_mlx5_hdr_t); - - UCS_CLASS_CALL_SUPER_INIT(uct_rc_mlx5_iface_common_t, - &uct_dc_mlx5_iface_ops, - md, worker, params, - &config->super, &init_attr); if (config->ndci < 1) { ucs_error("dc interface must have at least 1 dci (requested: %d)", config->ndci); @@ -1124,10 +1159,36 @@ static 
UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h md, uct_worker_h worker return UCS_ERR_INVALID_PARAM; } - uct_dc_mlx5_iface_init_version(self, md); + init_attr.qp_type = UCT_IB_QPT_DCI; + init_attr.flags = UCT_IB_CQ_IGNORE_OVERRUN; + init_attr.fc_req_size = sizeof(uct_dc_fc_request_t); + init_attr.rx_hdr_len = sizeof(uct_rc_mlx5_hdr_t); + + if (md->flags & UCT_IB_MLX5_MD_FLAG_DC_TM) { + init_attr.flags |= UCT_IB_TM_SUPPORTED; + } + + /* driver will round up to pow of 2 if needed */ + init_attr.cq_len[UCT_IB_DIR_TX] = config->super.super.tx.queue_len * + UCT_IB_MLX5_MAX_BB * config->ndci; + /* TODO check caps instead */ + if (ucs_roundup_pow2(init_attr.cq_len[UCT_IB_DIR_TX]) > UCT_DC_MLX5_MAX_TX_CQ_LEN) { + ucs_error("Can't allocate TX resources, try to decrease dcis number (%d)" + " or tx qp length (%d)", + config->ndci, config->super.super.tx.queue_len); + return UCS_ERR_INVALID_PARAM; + } + + UCS_CLASS_CALL_SUPER_INIT(uct_rc_mlx5_iface_common_t, + &uct_dc_mlx5_iface_ops, + tl_md, worker, params, &config->super, + &config->rc_mlx5_common, &init_attr); + + uct_dc_mlx5_iface_init_version(self, tl_md); self->tx.ndci = config->ndci; - self->tx.policy = config->tx_policy; + self->tx.policy = (uct_dc_tx_policy_t)config->tx_policy; + self->tx.fc_grants = 0; self->super.super.config.tx_moderation = 0; /* disable tx moderation for dcs */ ucs_list_head_init(&self->tx.gc_list); @@ -1143,15 +1204,15 @@ static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h md, uct_worker_h worker } /* create DC initiators */ - status = uct_dc_mlx5_iface_create_dcis(self, config); + status = uct_dc_mlx5_iface_create_dcis(self); if (status != UCS_OK) { goto err_destroy_dct; } - ucs_debug("dc iface %p: using '%s' policy with %d dcis, dct 0x%x", self, - uct_dc_tx_policy_names[self->tx.policy], self->tx.ndci, - UCT_RC_MLX5_TM_ENABLED(&self->super) ? 
- 0 : uct_dc_mlx5_get_dct_num(self)); + ucs_debug("dc iface %p: using '%s' policy with %d dcis and %d cqes, dct 0x%x", + self, uct_dc_tx_policy_names[self->tx.policy], self->tx.ndci, + init_attr.cq_len[UCT_IB_DIR_TX], UCT_RC_MLX5_TM_ENABLED(&self->super) ? + 0 : self->rx.dct.qp_num); /* Create fake endpoint which will be used for sending FC grants */ uct_dc_mlx5_iface_init_fc_ep(self); @@ -1165,20 +1226,8 @@ static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h md, uct_worker_h worker goto err_destroy_dct; } - status = uct_dc_mlx5_iface_init_dcis(self, self->super.tx.mmio_mode); - if (status != UCS_OK) { - goto err_destroy_dct; - } - self->tx.available_quota = self->super.super.config.tx_qp_len - ucs_min(self->super.super.config.tx_qp_len, config->quota); - /* Set max_iov for put_zcopy and get_zcopy */ - uct_ib_iface_set_max_iov(&self->super.super.super, - (UCT_IB_MLX5_MAX_SEND_WQE_SIZE - - sizeof(struct mlx5_wqe_raddr_seg) - - sizeof(struct mlx5_wqe_ctrl_seg) - - UCT_IB_MLX5_AV_FULL_SIZE) / - sizeof(struct mlx5_wqe_data_seg)); uct_rc_mlx5_iface_common_prepost_recvs(&self->super); @@ -1219,24 +1268,19 @@ static UCS_CLASS_DEFINE_NEW_FUNC(uct_dc_mlx5_iface_t, uct_iface_t, uct_md_h, static UCS_CLASS_DEFINE_DELETE_FUNC(uct_dc_mlx5_iface_t, uct_iface_t); -static -ucs_status_t uct_dc_mlx5_query_resources(uct_md_h md, - uct_tl_resource_desc_t **resources_p, - unsigned *num_resources_p) +static ucs_status_t +uct_dc_mlx5_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) { uct_ib_md_t *ib_md = ucs_derived_of(md, uct_ib_md_t); + int flags; - return uct_dc_device_query_tl_resources(&ib_md->dev,"dc_mlx5", - UCT_IB_DEVICE_FLAG_MLX5_PRM | - (ib_md->config.eth_pause ? 0 : UCT_IB_DEVICE_FLAG_LINK_IB), - resources_p, num_resources_p); + flags = UCT_IB_DEVICE_FLAG_MLX5_PRM | UCT_IB_DEVICE_FLAG_DC | + (ib_md->config.eth_pause ? 
0 : UCT_IB_DEVICE_FLAG_LINK_IB); + return uct_ib_device_query_ports(&ib_md->dev, flags, tl_devices_p, + num_tl_devices_p); } -UCT_TL_COMPONENT_DEFINE(uct_dc_mlx5_tl, - uct_dc_mlx5_query_resources, - uct_dc_mlx5_iface_t, - "dc_mlx5", - "DC_MLX5_", - uct_dc_mlx5_iface_config_table, - uct_dc_mlx5_iface_config_t); -UCT_MD_REGISTER_TL(&uct_ib_mdc, &uct_dc_mlx5_tl); +UCT_TL_DEFINE(&uct_ib_component, dc_mlx5, uct_dc_mlx5_query_tl_devices, + uct_dc_mlx5_iface_t, "DC_MLX5_", uct_dc_mlx5_iface_config_table, + uct_dc_mlx5_iface_config_t); diff --git a/src/uct/ib/dc/dc_mlx5.h b/src/uct/ib/dc/dc_mlx5.h index a5d9ca51e54..67d59962dde 100644 --- a/src/uct/ib/dc/dc_mlx5.h +++ b/src/uct/ib/dc/dc_mlx5.h @@ -13,8 +13,29 @@ #include #include #include +#include +/* + * HW tag matching + */ +#if IBV_HW_TM +# if HAVE_INFINIBAND_TM_TYPES_H +/* upstream tm_types.h doesn't provide RAVH header */ +struct ibv_ravh { + uint32_t sl_dct; + uint32_t reserved; /* must be zero */ + uint64_t dc_access_key; +}; +# else +# define ibv_ravh ibv_exp_tmh_ravh +# endif +# define UCT_DC_RNDV_HDR_LEN (sizeof(struct ibv_rvh) + \ + sizeof(struct ibv_ravh)) +#else +# define UCT_DC_RNDV_HDR_LEN 0 +#endif + #define UCT_DC_MLX5_IFACE_MAX_DCIS 16 #define UCT_DC_MLX5_IFACE_ADDR_TM_ENABLED(_addr) \ @@ -81,7 +102,8 @@ typedef enum { typedef struct uct_dc_mlx5_iface_config { - uct_rc_mlx5_iface_common_config_t super; + uct_rc_iface_common_config_t super; + uct_rc_mlx5_iface_common_config_t rc_mlx5_common; uct_ud_iface_common_config_t ud_common; int ndci; int tx_policy; @@ -91,14 +113,9 @@ typedef struct uct_dc_mlx5_iface_config { } uct_dc_mlx5_iface_config_t; -typedef enum { - UCT_DC_DCI_FLAG_EP_CANCELED = UCS_BIT(0), - UCT_DC_DCI_FLAG_EP_DESTROYED = UCS_BIT(1) -} uct_dc_dci_state_t; - - typedef struct uct_dc_dci { uct_rc_txqp_t txqp; /* DCI qp */ + uct_ib_mlx5_txwq_t txwq; /* DCI mlx5 wq */ union { uct_dc_mlx5_ep_t *ep; /* points to an endpoint that currently owns the dci. 
Relevant only for dcs @@ -109,7 +126,7 @@ typedef struct uct_dc_dci { processed. Better have dci num groups scheduled than ep num. */ }; -#if ENABLE_ASSERT +#if UCS_ENABLE_ASSERT uint8_t flags; /* debug state, @ref uct_dc_dci_state_t */ #endif } uct_dc_dci_t; @@ -139,7 +156,6 @@ struct uct_dc_mlx5_iface { struct { /* Array of dcis */ uct_dc_dci_t dcis[UCT_DC_MLX5_IFACE_MAX_DCIS]; - uct_ib_mlx5_txwq_t dci_wqs[UCT_DC_MLX5_IFACE_MAX_DCIS]; uint8_t ndci; /* Number of DCIs */ uct_dc_tx_policy_t policy; /* dci selection algorithm */ @@ -158,17 +174,18 @@ struct uct_dc_mlx5_iface { /* List of destroyed endpoints waiting for credit grant */ ucs_list_link_t gc_list; + /* Number of expected FC grants */ + unsigned fc_grants; + /* Seed used for random dci allocation */ unsigned rand_seed; ucs_arbiter_callback_t pend_cb; } tx; -#if HAVE_DC_EXP - struct ibv_exp_dct *rx_dct; -#elif HAVE_DC_DV - struct ibv_qp *rx_dct; -#endif + struct { + uct_ib_mlx5_qp_t dct; + } rx; uint8_t version_flag; @@ -186,19 +203,12 @@ int uct_dc_mlx5_iface_is_reachable(const uct_iface_h tl_iface, ucs_status_t uct_dc_mlx5_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t *iface_addr); -ucs_status_t uct_dc_device_query_tl_resources(uct_ib_device_t *dev, - const char *tl_name, unsigned flags, - uct_tl_resource_desc_t **resources_p, - unsigned *num_resources_p); - ucs_status_t uct_dc_mlx5_iface_flush(uct_iface_h tl_iface, unsigned flags, uct_completion_t *comp); void uct_dc_mlx5_iface_set_quota(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_iface_config_t *config); ucs_status_t uct_dc_mlx5_iface_init_fc_ep(uct_dc_mlx5_iface_t *iface); -ucs_status_t uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface, uct_rc_txqp_t *dci); - void uct_dc_mlx5_iface_cleanup_fc_ep(uct_dc_mlx5_iface_t *iface); ucs_status_t uct_dc_mlx5_iface_fc_grant(uct_pending_req_t *self); @@ -207,30 +217,53 @@ ucs_status_t uct_dc_mlx5_iface_fc_handler(uct_rc_iface_t *rc_iface, unsigned qp_ uct_rc_hdr_t *hdr, unsigned length, uint32_t 
imm_data, uint16_t lid, unsigned flags); -void uct_dc_mlx5_iface_set_av_sport(uct_dc_mlx5_iface_t *iface, - uct_ib_mlx5_base_av_t *av, - uint32_t remote_dctn); - -int uct_dc_mlx5_get_dct_num(uct_dc_mlx5_iface_t *iface); - void uct_dc_mlx5_destroy_dct(uct_dc_mlx5_iface_t *iface); void uct_dc_mlx5_iface_init_version(uct_dc_mlx5_iface_t *iface, uct_md_h md); -ucs_status_t uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *dc_mlx5_iface, int dci); +ucs_status_t uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface, + uct_dc_dci_t *dci); -ucs_status_t uct_dc_mlx5_iface_create_dcis(uct_dc_mlx5_iface_t *iface, - uct_dc_mlx5_iface_config_t *config); +ucs_status_t uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface, + uct_dc_dci_t *dci); void uct_dc_mlx5_iface_dcis_destroy(uct_dc_mlx5_iface_t *iface, int max); -#if IBV_EXP_HW_TM_DC -void uct_dc_mlx5_iface_fill_xrq_init_attrs(uct_rc_iface_t *rc_iface, - struct ibv_exp_create_srq_attr *srq_attr, - struct ibv_exp_srq_dc_offload_params *dc_op); +#if HAVE_DEVX + +ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface); + +ucs_status_t uct_dc_mlx5_iface_devx_set_srq_dc_params(uct_dc_mlx5_iface_t *iface); +ucs_status_t uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface, + uct_ib_mlx5_qp_t *qp); + +#else + +static UCS_F_MAYBE_UNUSED ucs_status_t +uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface) +{ + return UCS_ERR_UNSUPPORTED; +} + +static UCS_F_MAYBE_UNUSED ucs_status_t +uct_dc_mlx5_iface_devx_set_srq_dc_params(uct_dc_mlx5_iface_t *iface) +{ + return UCS_ERR_UNSUPPORTED; +} + +static UCS_F_MAYBE_UNUSED ucs_status_t +uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface, + uct_ib_mlx5_qp_t *qp) +{ + return UCS_ERR_UNSUPPORTED; +} + +#endif + +#if IBV_HW_TM static UCS_F_ALWAYS_INLINE void -uct_dc_mlx5_iface_fill_ravh(struct ibv_exp_tmh_ravh *ravh, uint32_t dct_num) +uct_dc_mlx5_iface_fill_ravh(struct ibv_ravh *ravh, uint32_t dct_num) { ravh->sl_dct = htobe32(dct_num); 
ravh->dc_access_key = htobe64(UCT_IB_KEY); @@ -250,13 +283,20 @@ static inline uint8_t uct_dc_mlx5_iface_dci_find(uct_dc_mlx5_iface_t *iface, uin int i, ndci = iface->tx.ndci; for (i = 0; i < ndci; i++) { - if (dcis[i].txqp.qp->qp_num == qp_num) { + if (dcis[i].txwq.super.qp_num == qp_num) { return i; } } ucs_fatal("DCI (qpnum=%d) does not exist", qp_num); } +static UCS_F_ALWAYS_INLINE int +uct_dc_mlx5_iface_has_tx_resources(uct_dc_mlx5_iface_t *iface) +{ + return !ucs_mpool_is_empty(&iface->super.super.tx.mp) && + (iface->super.super.tx.reads_available > 0); +} + static inline int uct_dc_mlx5_iface_dci_has_tx_resources(uct_dc_mlx5_iface_t *iface, uint8_t dci) { return uct_rc_txqp_available(&iface->tx.dcis[dci].txqp) > 0; diff --git a/src/uct/ib/dc/dc_mlx5_devx.c b/src/uct/ib/dc/dc_mlx5_devx.c new file mode 100644 index 00000000000..ad65f222cfb --- /dev/null +++ b/src/uct/ib/dc/dc_mlx5_devx.c @@ -0,0 +1,169 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "dc_mlx5.h" + +#include +#include +#include + + +ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface) +{ + uct_ib_device_t *dev = uct_ib_iface_device(&iface->super.super.super); + struct mlx5dv_pd dvpd = {}; + struct mlx5dv_cq dvcq = {}; + struct mlx5dv_obj dv = {}; + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(create_dct_in)] = {}; + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_dct_out)] = {}; + int dvflags; + void *dctc; + + dvflags = MLX5DV_OBJ_PD | MLX5DV_OBJ_CQ; + dv.pd.in = uct_ib_iface_md(&iface->super.super.super)->pd; + dv.pd.out = &dvpd; + dv.cq.in = iface->super.super.super.cq[UCT_IB_DIR_RX]; + dv.cq.out = &dvcq; + mlx5dv_init_obj(&dv, dvflags); + + UCT_IB_MLX5DV_SET(create_dct_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_DCT); + dctc = UCT_IB_MLX5DV_ADDR_OF(create_dct_in, in, dct_context_entry); + UCT_IB_MLX5DV_SET(dctc, dctc, pd, dvpd.pdn); + ucs_assert(iface->super.rx.srq.srq_num != 0); + UCT_IB_MLX5DV_SET(dctc, dctc, srqn_xrqn, iface->super.rx.srq.srq_num); + if (UCT_RC_MLX5_TM_ENABLED(&iface->super)) { + UCT_IB_MLX5DV_SET(dctc, dctc, offload_type, UCT_IB_MLX5_QPC_OFFLOAD_TYPE_RNDV); + } + UCT_IB_MLX5DV_SET(dctc, dctc, cqn, dvcq.cqn); + UCT_IB_MLX5DV_SET64(dctc, dctc, dc_access_key, UCT_IB_KEY); + + UCT_IB_MLX5DV_SET(dctc, dctc, rre, true); + UCT_IB_MLX5DV_SET(dctc, dctc, rwe, true); + UCT_IB_MLX5DV_SET(dctc, dctc, rae, true); + UCT_IB_MLX5DV_SET(dctc, dctc, cs_res, uct_ib_mlx5_qpc_cs_res( + iface->super.super.super.config.max_inl_cqe[UCT_IB_DIR_RX], 1)); + UCT_IB_MLX5DV_SET(dctc, dctc, atomic_mode, UCT_IB_MLX5_ATOMIC_MODE); + UCT_IB_MLX5DV_SET(dctc, dctc, pkey_index, iface->super.super.super.pkey_index); + UCT_IB_MLX5DV_SET(dctc, dctc, port, iface->super.super.super.config.port_num); + + UCT_IB_MLX5DV_SET(dctc, dctc, min_rnr_nak, iface->super.super.config.min_rnr_timer); + UCT_IB_MLX5DV_SET(dctc, dctc, tclass, iface->super.super.super.config.traffic_class); + 
UCT_IB_MLX5DV_SET(dctc, dctc, mtu, iface->super.super.super.config.path_mtu); + UCT_IB_MLX5DV_SET(dctc, dctc, my_addr_index, iface->super.super.super.gid_info.gid_index); + UCT_IB_MLX5DV_SET(dctc, dctc, hop_limit, iface->super.super.super.config.hop_limit); + + iface->rx.dct.devx.obj = mlx5dv_devx_obj_create(dev->ibv_context, in, sizeof(in), + out, sizeof(out)); + if (iface->rx.dct.devx.obj == NULL) { + ucs_error("mlx5dv_devx_obj_create(DCT) failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(create_dct_out, out, syndrome)); + return UCS_ERR_INVALID_PARAM; + } + + iface->rx.dct.type = UCT_IB_MLX5_OBJ_TYPE_DEVX; + iface->rx.dct.qp_num = UCT_IB_MLX5DV_GET(create_dct_out, out, dctn); + return UCS_OK; +} + +ucs_status_t +uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface, + uct_ib_mlx5_qp_t *qp) +{ + char in_2init[UCT_IB_MLX5DV_ST_SZ_BYTES(rst2init_qp_in)] = {}; + char out_2init[UCT_IB_MLX5DV_ST_SZ_BYTES(rst2init_qp_out)] = {}; + char in_2rtr[UCT_IB_MLX5DV_ST_SZ_BYTES(init2rtr_qp_in)] = {}; + char out_2rtr[UCT_IB_MLX5DV_ST_SZ_BYTES(init2rtr_qp_out)] = {}; + char in_2rts[UCT_IB_MLX5DV_ST_SZ_BYTES(rtr2rts_qp_in)] = {}; + char out_2rts[UCT_IB_MLX5DV_ST_SZ_BYTES(rtr2rts_qp_out)] = {}; + ucs_status_t status; + void *qpc; + + UCT_IB_MLX5DV_SET(rst2init_qp_in, in_2init, opcode, UCT_IB_MLX5_CMD_OP_RST2INIT_QP); + UCT_IB_MLX5DV_SET(rst2init_qp_in, in_2init, qpn, qp->qp_num); + + qpc = UCT_IB_MLX5DV_ADDR_OF(rst2init_qp_in, in_2init, qpc); + UCT_IB_MLX5DV_SET(qpc, qpc, pm_state, UCT_IB_MLX5_QPC_PM_STATE_MIGRATED); + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.vhca_port_num, iface->super.super.super.config.port_num); + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.pkey_index, iface->super.super.super.pkey_index); + + status = uct_ib_mlx5_devx_modify_qp(qp, in_2init, sizeof(in_2init), + out_2init, sizeof(out_2init)); + if (status != UCS_OK) { + return status; + } + + UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, opcode, UCT_IB_MLX5_CMD_OP_INIT2RTR_QP); + 
UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, qpn, qp->qp_num); + UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, opt_param_mask, 4); + + qpc = UCT_IB_MLX5DV_ADDR_OF(init2rtr_qp_in, in_2rtr, qpc); + UCT_IB_MLX5DV_SET(qpc, qpc, pm_state, UCT_IB_MLX5_QPC_PM_STATE_MIGRATED); + UCT_IB_MLX5DV_SET(qpc, qpc, mtu, iface->super.super.super.config.path_mtu); + UCT_IB_MLX5DV_SET(qpc, qpc, log_msg_max, UCT_IB_MLX5_LOG_MAX_MSG_SIZE); + UCT_IB_MLX5DV_SET(qpc, qpc, atomic_mode, UCT_IB_MLX5_ATOMIC_MODE); + UCT_IB_MLX5DV_SET(qpc, qpc, rae, true); + if (uct_ib_iface_is_roce(&iface->super.super.super)) { + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.eth_prio, + iface->super.super.super.config.sl); + } else { + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.sl, + iface->super.super.super.config.sl); + } + + status = uct_ib_mlx5_devx_modify_qp(qp, in_2rtr, sizeof(in_2rtr), + out_2rtr, sizeof(out_2rtr)); + if (status != UCS_OK) { + return status; + } + + UCT_IB_MLX5DV_SET(rtr2rts_qp_in, in_2rts, opcode, UCT_IB_MLX5_CMD_OP_RTR2RTS_QP); + UCT_IB_MLX5DV_SET(rtr2rts_qp_in, in_2rts, qpn, qp->qp_num); + + qpc = UCT_IB_MLX5DV_ADDR_OF(rtr2rts_qp_in, in_2rts, qpc); + UCT_IB_MLX5DV_SET(qpc, qpc, pm_state, UCT_IB_MLX5_QPC_PM_STATE_MIGRATED); + /* cppcheck-suppress internalAstError */ + UCT_IB_MLX5DV_SET(qpc, qpc, log_sra_max, ucs_ilog2_or0(iface->super.super.config.max_rd_atomic)); + UCT_IB_MLX5DV_SET(qpc, qpc, retry_count, iface->super.super.config.retry_cnt); + UCT_IB_MLX5DV_SET(qpc, qpc, rnr_retry, iface->super.super.config.rnr_retry); + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.ack_timeout, iface->super.super.config.timeout); + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.log_rtm, iface->super.super.config.exp_backoff); + + return uct_ib_mlx5_devx_modify_qp(qp, in_2rts, sizeof(in_2rts), + out_2rts, sizeof(out_2rts)); +} + +ucs_status_t uct_dc_mlx5_iface_devx_set_srq_dc_params(uct_dc_mlx5_iface_t *iface) +{ + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(set_xrq_dc_params_entry_in)] = {}; + char 
out[UCT_IB_MLX5DV_ST_SZ_BYTES(set_xrq_dc_params_entry_out)] = {}; + int ret; + + UCT_IB_MLX5DV_SET(set_xrq_dc_params_entry_in, in, pkey_table_index, iface->super.super.super.pkey_index); + UCT_IB_MLX5DV_SET(set_xrq_dc_params_entry_in, in, mtu, iface->super.super.super.config.path_mtu); + UCT_IB_MLX5DV_SET(set_xrq_dc_params_entry_in, in, sl, iface->super.super.super.config.sl); + UCT_IB_MLX5DV_SET(set_xrq_dc_params_entry_in, in, reverse_sl, iface->super.super.super.config.sl); + UCT_IB_MLX5DV_SET(set_xrq_dc_params_entry_in, in, cnak_reverse_sl, iface->super.super.super.config.sl); + UCT_IB_MLX5DV_SET(set_xrq_dc_params_entry_in, in, ack_timeout, iface->super.super.config.timeout); + UCT_IB_MLX5DV_SET64(set_xrq_dc_params_entry_in, in, dc_access_key, UCT_IB_KEY); + ucs_assert(iface->super.rx.srq.srq_num != 0); + UCT_IB_MLX5DV_SET(set_xrq_dc_params_entry_in, in, xrqn, iface->super.rx.srq.srq_num); + UCT_IB_MLX5DV_SET(set_xrq_dc_params_entry_in, in, opcode, + UCT_IB_MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY); + + ret = mlx5dv_devx_obj_modify(iface->super.rx.srq.devx.obj, in, sizeof(in), out, sizeof(out)); + if (ret) { + ucs_error("mlx5dv_devx_obj_modify(SET_XRQ_DC_PARAMS) failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(set_xrq_dc_params_entry_out, out, syndrome)); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + diff --git a/src/uct/ib/dc/dc_mlx5_ep.c b/src/uct/ib/dc/dc_mlx5_ep.c index 1ccb0f02a66..9046c207e6e 100644 --- a/src/uct/ib/dc/dc_mlx5_ep.c +++ b/src/uct/ib/dc/dc_mlx5_ep.c @@ -1,9 +1,13 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "dc_mlx5_ep.h" #include "dc_mlx5.h" @@ -15,7 +19,7 @@ uint8_t dci; \ dci = (_ep)->dci; \ _txqp = &(_iface)->tx.dcis[dci].txqp; \ - _txwq = &(_iface)->tx.dci_wqs[dci]; \ + _txwq = &(_iface)->tx.dcis[dci].txwq; \ } static UCS_F_ALWAYS_INLINE void @@ -32,8 +36,7 @@ uct_dc_mlx5_iface_bcopy_post(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep, desc->super.sn = txwq->sw_pi; uct_rc_mlx5_txqp_dptr_post(&iface->super, UCT_IB_QPT_DCI, txqp, txwq, opcode, buffer, length, &desc->lkey, - rdma_raddr, uct_ib_md_direct_rkey(rdma_rkey), - 0, 0, 0, 0, + rdma_raddr, rdma_rkey, 0, 0, 0, 0, &ep->av, uct_dc_mlx5_ep_get_grh(ep), uct_ib_mlx5_wqe_av_size(&ep->av), MLX5_WQE_CTRL_CQ_UPDATE | send_flags, imm_val_be, INT_MAX, @@ -45,10 +48,12 @@ uct_dc_mlx5_iface_bcopy_post(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep, static UCS_F_ALWAYS_INLINE void uct_dc_mlx5_iface_zcopy_post(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep, unsigned opcode, const uct_iov_t *iov, size_t iovcnt, + size_t iov_total_length, /* SEND */ uint8_t am_id, const void *am_hdr, unsigned am_hdr_len, /* RDMA */ uint64_t rdma_raddr, uct_rkey_t rdma_rkey, /* TAG */ uct_tag_t tag, uint32_t app_ctx, uint32_t ib_imm_be, - uct_completion_t *comp, uint8_t send_flags) + uct_rc_send_handler_t handler, uct_completion_t *comp, + uint8_t send_flags) { uint16_t sn; UCT_DC_MLX5_TXQP_DECL(txqp, txwq); @@ -59,15 +64,15 @@ uct_dc_mlx5_iface_zcopy_post(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep, uct_rc_mlx5_txqp_dptr_post_iov(&iface->super, UCT_IB_QPT_DCI, txqp, txwq, opcode, iov, iovcnt, am_id, am_hdr, am_hdr_len, - rdma_raddr, uct_ib_md_direct_rkey(rdma_rkey), + rdma_raddr, rdma_rkey, tag, app_ctx, ib_imm_be, &ep->av, uct_dc_mlx5_ep_get_grh(ep), uct_ib_mlx5_wqe_av_size(&ep->av), MLX5_WQE_CTRL_CQ_UPDATE | send_flags, UCT_IB_MAX_ZCOPY_LOG_SGE(&iface->super.super.super)); - uct_rc_txqp_add_send_comp(&iface->super.super, txqp, comp, sn, - 
UCT_RC_IFACE_SEND_OP_FLAG_ZCOPY); + uct_rc_txqp_add_send_comp(&iface->super.super, txqp, handler, comp, sn, + UCT_RC_IFACE_SEND_OP_FLAG_ZCOPY, iov_total_length); } static UCS_F_ALWAYS_INLINE void @@ -111,7 +116,7 @@ uct_dc_mlx5_ep_atomic_op_post(uct_ep_h tl_ep, unsigned opcode, unsigned size, int ext; /* not used here */ ucs_status_t status; - UCT_DC_MLX5_CHECK_RES(iface, ep); + UCT_DC_MLX5_CHECK_RMA_RES(iface, ep); UCT_RC_MLX5_CHECK_ATOMIC_OPS(opcode, size, UCT_RC_MLX5_ATOMIC_OPS); status = uct_rc_mlx5_iface_common_atomic_data(opcode, size, value, &op, &compare_mask, @@ -135,7 +140,7 @@ uct_dc_mlx5_ep_atomic_fop(uct_dc_mlx5_ep_t *ep, int opcode, void *result, int ex uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_dc_mlx5_iface_t); uct_rc_iface_send_desc_t *desc; - UCT_DC_MLX5_CHECK_RES(iface, ep); + UCT_DC_MLX5_CHECK_RMA_RES(iface, ep); UCT_RC_IFACE_GET_TX_ATOMIC_FETCH_DESC(&iface->super.super, &iface->super.tx.atomic_desc_mp, desc, uct_rc_iface_atomic_handler(&iface->super.super, ext, length), @@ -178,7 +183,7 @@ ucs_status_t uct_dc_mlx5_ep_atomic_cswap64(uct_ep_h tl_ep, uint64_t compare, uin { return uct_dc_mlx5_ep_atomic_fop(ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t), MLX5_OPCODE_ATOMIC_CS, result, 0, sizeof(uint64_t), - remote_addr, rkey, 0, htobe64(compare), -1, + remote_addr, rkey, 0, htobe64(compare), UINT64_MAX, htobe64(swap), comp); } @@ -189,7 +194,7 @@ ucs_status_t uct_dc_mlx5_ep_atomic_cswap32(uct_ep_h tl_ep, uint32_t compare, uin return uct_dc_mlx5_ep_atomic_fop(ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t), MLX5_OPCODE_ATOMIC_MASKED_CS, result, 1, sizeof(uint32_t), remote_addr, rkey, UCS_MASK(32), - htonl(compare), -1, htonl(swap), comp); + htonl(compare), UINT64_MAX, htonl(swap), comp); } ucs_status_t uct_dc_mlx5_ep_atomic32_post(uct_ep_h ep, unsigned opcode, uint32_t value, @@ -226,17 +231,9 @@ ucs_status_t uct_dc_mlx5_ep_fence(uct_ep_h tl_ep, unsigned flags) { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, 
uct_dc_mlx5_iface_t); uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); - uct_ib_md_t *md = uct_ib_iface_md(&iface->super.super.super); - /* in case PCI Atomics are enabled atomic/read operation on target - * are unordered according to PCI specification so we need to - * request atomic fence for next such operation */ - if (ep->dci != UCT_DC_MLX5_EP_NO_DCI) { - uct_rc_mlx5_add_fence(md, &iface->tx.dci_wqs[ep->dci]); - } - - UCT_TL_EP_STAT_FENCE(&ep->super); - return UCS_OK; + return uct_rc_ep_fence(tl_ep, &iface->tx.dcis[ep->dci].txwq.fi, + ep->dci != UCT_DC_MLX5_EP_NO_DCI); } static ucs_status_t UCS_F_ALWAYS_INLINE @@ -266,15 +263,15 @@ uct_dc_mlx5_ep_am_short_inline(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, return UCS_OK; } -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM static ucs_status_t UCS_F_ALWAYS_INLINE uct_dc_mlx5_ep_short_dm(uct_dc_mlx5_ep_t *ep, uct_rc_mlx5_dm_copy_data_t *cache, size_t hdr_len, const void *payload, unsigned length, unsigned opcode, uint8_t fm_ce_se, uint64_t rdma_raddr, uct_rkey_t rdma_rkey) { - uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_dc_mlx5_iface_t); - uct_rc_iface_send_desc_t *desc; + uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_dc_mlx5_iface_t); + uct_rc_iface_send_desc_t *desc = NULL; void *buffer; ucs_status_t status; uct_ib_log_sge_t log_sge; @@ -298,7 +295,7 @@ uct_dc_mlx5_ep_short_dm(uct_dc_mlx5_ep_t *ep, uct_rc_mlx5_dm_copy_data_t *cache, ucs_status_t uct_dc_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, const void *buffer, unsigned length) { -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); ucs_status_t status; @@ -309,7 +306,7 @@ ucs_status_t uct_dc_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, !iface->super.dm.dm)) { #endif return uct_dc_mlx5_ep_am_short_inline(tl_ep, id, hdr, buffer, length); -#if 
HAVE_IBV_EXP_DM +#if HAVE_IBV_DM } UCT_CHECK_AM_ID(id); @@ -372,8 +369,9 @@ ucs_status_t uct_dc_mlx5_ep_am_zcopy(uct_ep_h tl_ep, uint8_t id, const void *hea UCT_IB_MLX5_AV_FULL_SIZE); UCT_DC_CHECK_RES_AND_FC(iface, ep); - uct_dc_mlx5_iface_zcopy_post(iface, ep, MLX5_OPCODE_SEND, iov, iovcnt, + uct_dc_mlx5_iface_zcopy_post(iface, ep, MLX5_OPCODE_SEND, iov, iovcnt, 0ul, id, header, header_length, 0, 0, 0ul, 0, 0, + uct_rc_ep_send_op_completion_handler, comp, MLX5_WQE_CTRL_SOLICITED); UCT_RC_UPDATE_FC_WND(&iface->super.super, &ep->fc); @@ -389,18 +387,18 @@ uct_dc_mlx5_ep_put_short_inline(uct_ep_h tl_ep, const void *buffer, uct_rkey_t rkey) { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); - uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); UCT_DC_MLX5_TXQP_DECL(txqp, txwq); UCT_RC_MLX5_CHECK_PUT_SHORT(length, UCT_IB_MLX5_AV_FULL_SIZE); - UCT_DC_MLX5_CHECK_RES(iface, ep); + UCT_DC_MLX5_CHECK_RMA_RES(iface, ep); UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq); + uct_rc_mlx5_ep_fence_put(&iface->super, txwq, &rkey, &remote_addr, + ep->atomic_mr_offset); uct_rc_mlx5_txqp_inline_post(&iface->super, UCT_IB_QPT_DCI, - txqp, txwq, - MLX5_OPCODE_RDMA_WRITE, - buffer, length, 0, 0, 0, - remote_addr, uct_ib_md_direct_rkey(rkey), + txqp, txwq, MLX5_OPCODE_RDMA_WRITE, + buffer, length, 0, 0, 0, remote_addr, rkey, &ep->av, uct_dc_mlx5_ep_get_grh(ep), uct_ib_mlx5_wqe_av_size(&ep->av), 0, INT_MAX); @@ -413,20 +411,24 @@ ucs_status_t uct_dc_mlx5_ep_put_short(uct_ep_h tl_ep, const void *payload, unsigned length, uint64_t remote_addr, uct_rkey_t rkey) { -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + UCT_DC_MLX5_TXQP_DECL(txqp, txwq); ucs_status_t status; if (ucs_likely((length <= UCT_IB_MLX5_PUT_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE)) || 
!iface->super.dm.dm)) { #endif return uct_dc_mlx5_ep_put_short_inline(tl_ep, payload, length, remote_addr, rkey); -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM } UCT_CHECK_LENGTH(length, 0, iface->super.dm.seg_len, "put_short"); - UCT_DC_MLX5_CHECK_RES(iface, ep); + UCT_DC_MLX5_CHECK_RMA_RES(iface, ep); + UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq); + uct_rc_mlx5_ep_fence_put(&iface->super, txwq, &rkey, &remote_addr, + ep->atomic_mr_offset); status = uct_dc_mlx5_ep_short_dm(ep, NULL, 0, payload, length, MLX5_OPCODE_RDMA_WRITE, MLX5_WQE_CTRL_CQ_UPDATE, @@ -443,13 +445,17 @@ ssize_t uct_dc_mlx5_ep_put_bcopy(uct_ep_h tl_ep, uct_pack_callback_t pack_cb, void *arg, uint64_t remote_addr, uct_rkey_t rkey) { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); - uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + UCT_DC_MLX5_TXQP_DECL(txqp, txwq); uct_rc_iface_send_desc_t *desc; size_t length; - UCT_DC_MLX5_CHECK_RES(iface, ep); + UCT_DC_MLX5_CHECK_RMA_RES(iface, ep); UCT_RC_IFACE_GET_TX_PUT_BCOPY_DESC(&iface->super.super, &iface->super.super.tx.mp, desc, pack_cb, arg, length); + UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq); + uct_rc_mlx5_ep_fence_put(&iface->super, txwq, &rkey, &remote_addr, + ep->atomic_mr_offset); uct_dc_mlx5_iface_bcopy_post(iface, ep, MLX5_OPCODE_RDMA_WRITE, length, remote_addr, rkey, desc, 0, 0, desc + 1, NULL); UCT_TL_EP_STAT_OP(&ep->super, PUT, BCOPY, length); @@ -461,17 +467,21 @@ ucs_status_t uct_dc_mlx5_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size uct_completion_t *comp) { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); - uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + UCT_DC_MLX5_TXQP_DECL(txqp, txwq); - UCT_CHECK_IOV_SIZE(iovcnt, uct_ib_iface_get_max_iov(&iface->super.super.super), + UCT_CHECK_IOV_SIZE(iovcnt, 
UCT_RC_MLX5_RMA_MAX_IOV(UCT_IB_MLX5_AV_FULL_SIZE), "uct_dc_mlx5_ep_put_zcopy"); UCT_CHECK_LENGTH(uct_iov_total_length(iov, iovcnt), 0, UCT_IB_MAX_MESSAGE_SIZE, "put_zcopy"); - UCT_DC_MLX5_CHECK_RES(iface, ep); + UCT_DC_MLX5_CHECK_RMA_RES(iface, ep); + UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq); + uct_rc_mlx5_ep_fence_put(&iface->super, txwq, &rkey, &remote_addr, + ep->atomic_mr_offset); uct_dc_mlx5_iface_zcopy_post(iface, ep, MLX5_OPCODE_RDMA_WRITE, iov, iovcnt, - 0, NULL, 0, remote_addr, rkey, 0ul, 0, 0, - comp, 0); + 0ul, 0, NULL, 0, remote_addr, rkey, 0ul, 0, 0, + uct_rc_ep_send_op_completion_handler, comp, 0); UCT_TL_EP_STAT_OP(&ep->super, PUT, ZCOPY, uct_iov_total_length(iov, iovcnt)); @@ -485,43 +495,64 @@ ucs_status_t uct_dc_mlx5_ep_get_bcopy(uct_ep_h tl_ep, uct_completion_t *comp) { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); - uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + uint8_t fm_ce_se = 0; + UCT_DC_MLX5_TXQP_DECL(txqp, txwq); uct_rc_iface_send_desc_t *desc; - UCT_CHECK_LENGTH(length, 0, iface->super.super.super.config.seg_size, "get_bcopy"); - UCT_DC_MLX5_CHECK_RES(iface, ep); - UCT_RC_IFACE_GET_TX_GET_BCOPY_DESC(&iface->super.super, &iface->super.super.tx.mp, + UCT_CHECK_LENGTH(length, 0, iface->super.super.super.config.seg_size, + "get_bcopy"); + UCT_DC_MLX5_CHECK_RMA_RES(iface, ep); + UCT_RC_IFACE_GET_TX_GET_BCOPY_DESC(&iface->super.super, + &iface->super.super.tx.mp, desc, unpack_cb, comp, arg, length); + UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq); + uct_rc_mlx5_ep_fence_get(&iface->super, txwq, &rkey, &fm_ce_se); + uct_dc_mlx5_iface_bcopy_post(iface, ep, MLX5_OPCODE_RDMA_READ, length, - remote_addr, rkey, desc, 0, 0, desc + 1, NULL); + remote_addr, rkey, desc, fm_ce_se, 0, + desc + 1, NULL); + + UCT_RC_RDMA_READ_POSTED(&iface->super.super, length); UCT_TL_EP_STAT_OP(&ep->super, GET, BCOPY, length); + return 
UCS_INPROGRESS; } -ucs_status_t uct_dc_mlx5_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp) +ucs_status_t uct_dc_mlx5_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, + size_t iovcnt, uint64_t remote_addr, + uct_rkey_t rkey, uct_completion_t *comp) { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); - uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + uint8_t fm_ce_se = 0; + size_t total_length = uct_iov_total_length(iov, iovcnt); + UCT_DC_MLX5_TXQP_DECL(txqp, txwq); - UCT_CHECK_IOV_SIZE(iovcnt, uct_ib_iface_get_max_iov(&iface->super.super.super), + UCT_CHECK_IOV_SIZE(iovcnt, UCT_RC_MLX5_RMA_MAX_IOV(UCT_IB_MLX5_AV_FULL_SIZE), "uct_dc_mlx5_ep_get_zcopy"); - UCT_CHECK_LENGTH(uct_iov_total_length(iov, iovcnt), - iface->super.super.super.config.max_inl_resp + 1, UCT_IB_MAX_MESSAGE_SIZE, - "get_zcopy"); - UCT_DC_MLX5_CHECK_RES(iface, ep); + UCT_CHECK_LENGTH(total_length, + iface->super.super.super.config.max_inl_cqe[UCT_IB_DIR_TX] + 1, + iface->super.super.config.max_get_zcopy, "get_zcopy"); + UCT_DC_MLX5_CHECK_RMA_RES(iface, ep); + UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq); + uct_rc_mlx5_ep_fence_get(&iface->super, txwq, &rkey, &fm_ce_se); uct_dc_mlx5_iface_zcopy_post(iface, ep, MLX5_OPCODE_RDMA_READ, iov, iovcnt, - 0, NULL, 0, remote_addr, rkey, 0ul, 0, 0, - comp, 0); - UCT_TL_EP_STAT_OP(&ep->super, GET, ZCOPY, - uct_iov_total_length(iov, iovcnt)); + total_length, 0, NULL, 0, remote_addr, rkey, + 0ul, 0, 0, + uct_rc_ep_get_zcopy_completion_handler, comp, + fm_ce_se); + + UCT_RC_RDMA_READ_POSTED(&iface->super.super, total_length); + UCT_TL_EP_STAT_OP(&ep->super, GET, ZCOPY, total_length); + return UCS_INPROGRESS; } -ucs_status_t uct_dc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp) +ucs_status_t uct_dc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned 
flags, + uct_completion_t *comp) { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); @@ -529,19 +560,21 @@ ucs_status_t uct_dc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion UCT_DC_MLX5_TXQP_DECL(txqp, txwq); if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) { - if (ep->dci != UCT_DC_MLX5_EP_NO_DCI) { - uct_rc_txqp_purge_outstanding(&iface->tx.dcis[ep->dci].txqp, - UCS_ERR_CANCELED, 0); -#if ENABLE_ASSERT - iface->tx.dcis[ep->dci].flags |= UCT_DC_DCI_FLAG_EP_CANCELED; -#endif + if (uct_dc_mlx5_iface_is_dci_rand(iface)) { + return UCS_ERR_UNSUPPORTED; } uct_ep_pending_purge(tl_ep, NULL, 0); + if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) { + /* No dci -> no WQEs -> HW is clean, nothing to cancel */ + return UCS_OK; + } + + uct_dc_mlx5_ep_handle_failure(ep, NULL, UCS_ERR_CANCELED); return UCS_OK; } - if (!uct_rc_iface_has_tx_resources(&iface->super.super)) { + if (!uct_dc_mlx5_iface_has_tx_resources(iface)) { return UCS_ERR_NO_RESOURCE; } @@ -573,7 +606,7 @@ ucs_status_t uct_dc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion comp, txwq->sig_pi); } -#if IBV_EXP_HW_TM_DC +#if IBV_HW_TM static ucs_status_t UCS_F_ALWAYS_INLINE uct_dc_mlx5_ep_tag_eager_short_inline(uct_ep_h tl_ep, uct_tag_t tag, const void *data, size_t length) @@ -582,7 +615,7 @@ uct_dc_mlx5_ep_tag_eager_short_inline(uct_ep_h tl_ep, uct_tag_t tag, uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); UCT_DC_MLX5_TXQP_DECL(txqp, txwq); - UCT_CHECK_LENGTH(length + sizeof(struct ibv_exp_tmh), 0, + UCT_CHECK_LENGTH(length + sizeof(struct ibv_tmh), 0, UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE), "uct_dc_mlx5_ep_tag_short"); UCT_DC_MLX5_CHECK_RES(iface, ep); @@ -591,7 +624,7 @@ uct_dc_mlx5_ep_tag_eager_short_inline(uct_ep_h tl_ep, uct_tag_t tag, uct_rc_mlx5_txqp_tag_inline_post(&iface->super, UCT_IB_QPT_DCI, txqp, txwq, MLX5_OPCODE_SEND, data, length, - NULL, tag, 0, 
IBV_EXP_TMH_EAGER, 0, + NULL, tag, 0, IBV_TMH_EAGER, 0, &ep->av, uct_dc_mlx5_ep_get_grh(ep), uct_ib_mlx5_wqe_av_size(&ep->av), NULL, 0, MLX5_WQE_CTRL_SOLICITED); @@ -604,25 +637,25 @@ uct_dc_mlx5_ep_tag_eager_short_inline(uct_ep_h tl_ep, uct_tag_t tag, ucs_status_t uct_dc_mlx5_ep_tag_eager_short(uct_ep_h tl_ep, uct_tag_t tag, const void *data, size_t length) { -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); uct_rc_mlx5_dm_copy_data_t cache; ucs_status_t status; - if (ucs_likely((sizeof(struct ibv_exp_tmh) + length <= + if (ucs_likely((sizeof(struct ibv_tmh) + length <= UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE)) || !iface->super.dm.dm)) { #endif return uct_dc_mlx5_ep_tag_eager_short_inline(tl_ep, tag, data, length); -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM } - UCT_CHECK_LENGTH(length + sizeof(struct ibv_exp_tmh), 0, + UCT_CHECK_LENGTH(length + sizeof(struct ibv_tmh), 0, iface->super.dm.seg_len, "tag_short"); UCT_DC_MLX5_CHECK_RES(iface, ep); - uct_rc_mlx5_fill_tmh(ucs_unaligned_ptr(&cache.tm_hdr), tag, 0, IBV_EXP_TMH_EAGER); + uct_rc_mlx5_fill_tmh(ucs_unaligned_ptr(&cache.tm_hdr), tag, 0, IBV_TMH_EAGER); status = uct_dc_mlx5_ep_short_dm(ep, &cache, sizeof(cache.tm_hdr), data, length, MLX5_OPCODE_SEND, @@ -650,15 +683,18 @@ ssize_t uct_dc_mlx5_ep_tag_eager_bcopy(uct_ep_h tl_ep, uct_tag_t tag, UCT_DC_MLX5_CHECK_RES(iface, ep); - UCT_RC_MLX5_FILL_TM_IMM(imm, app_ctx, ib_imm, opcode, MLX5_OPCODE_SEND, _IMM); + UCT_RC_MLX5_FILL_TM_IMM(imm, app_ctx, ib_imm, opcode, MLX5_OPCODE_SEND, + _IMM); UCT_RC_MLX5_IFACE_GET_TM_BCOPY_DESC(&iface->super.super, - &iface->super.super.tx.mp, desc, tag, - app_ctx, pack_cb, arg, length); + iface->super.tm.bcopy_mp, + desc, tag, app_ctx, pack_cb, + arg, length); uct_dc_mlx5_iface_bcopy_post(iface, ep, opcode, - sizeof(struct ibv_exp_tmh) + length, - 0, 0, desc, MLX5_WQE_CTRL_SOLICITED, ib_imm, desc + 1, 
NULL); + sizeof(struct ibv_tmh) + length, + 0, 0, desc, MLX5_WQE_CTRL_SOLICITED, ib_imm, + desc + 1, NULL); UCT_TL_EP_STAT_OP(&ep->super, TAG, BCOPY, length); @@ -677,16 +713,18 @@ ucs_status_t uct_dc_mlx5_ep_tag_eager_zcopy(uct_ep_h tl_ep, uct_tag_t tag, UCT_CHECK_IOV_SIZE(iovcnt, UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(UCT_IB_MLX5_AV_FULL_SIZE), "uct_dc_mlx5_ep_tag_eager_zcopy"); - UCT_RC_CHECK_ZCOPY_DATA(sizeof(struct ibv_exp_tmh), + UCT_RC_CHECK_ZCOPY_DATA(sizeof(struct ibv_tmh), uct_iov_total_length(iov, iovcnt), - iface->super.super.super.config.seg_size); + iface->super.tm.max_zcopy); + UCT_DC_MLX5_CHECK_RES(iface, ep); UCT_RC_MLX5_FILL_TM_IMM(imm, app_ctx, ib_imm, opcode, MLX5_OPCODE_SEND, _IMM); uct_dc_mlx5_iface_zcopy_post(iface, ep, opcode|UCT_RC_MLX5_OPCODE_FLAG_TM, - iov, iovcnt, 0, "", 0, 0, 0, tag, app_ctx, - ib_imm, comp, MLX5_WQE_CTRL_SOLICITED); + iov, iovcnt, 0ul, 0, "", 0, 0, 0, tag, app_ctx, + ib_imm, uct_rc_ep_send_op_completion_handler, + comp, MLX5_WQE_CTRL_SOLICITED); UCT_TL_EP_STAT_OP(&ep->super, TAG, ZCOPY, uct_iov_total_length(iov, iovcnt)); @@ -703,10 +741,10 @@ ucs_status_ptr_t uct_dc_mlx5_ep_tag_rndv_zcopy(uct_ep_h tl_ep, uct_tag_t tag, { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); - unsigned tm_hdr_len = sizeof(struct ibv_exp_tmh) + - sizeof(struct ibv_exp_tmh_rvh) + - sizeof(struct ibv_exp_tmh_ravh); - struct ibv_exp_tmh_ravh ravh; + unsigned tm_hdr_len = sizeof(struct ibv_tmh) + + sizeof(struct ibv_rvh) + + sizeof(struct ibv_ravh); + struct ibv_ravh ravh; uint32_t op_index; UCT_DC_MLX5_TXQP_DECL(txqp, txwq); @@ -718,14 +756,14 @@ ucs_status_ptr_t uct_dc_mlx5_ep_tag_rndv_zcopy(uct_ep_h tl_ep, uct_tag_t tag, op_index = uct_rc_mlx5_tag_get_op_id(&iface->super, comp); - uct_dc_mlx5_iface_fill_ravh(&ravh, uct_dc_mlx5_get_dct_num(iface)); + uct_dc_mlx5_iface_fill_ravh(&ravh, iface->rx.dct.qp_num); UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, 
txwq); uct_rc_mlx5_txqp_tag_inline_post(&iface->super, UCT_IB_QPT_DCI, txqp, txwq, MLX5_OPCODE_SEND, header, header_length, iov, tag, op_index, - IBV_EXP_TMH_RNDV, 0, &ep->av, + IBV_TMH_RNDV, 0, &ep->av, uct_dc_mlx5_ep_get_grh(ep), uct_ib_mlx5_wqe_av_size(&ep->av), &ravh, sizeof(ravh), MLX5_WQE_CTRL_SOLICITED); @@ -742,7 +780,7 @@ ucs_status_t uct_dc_mlx5_ep_tag_rndv_request(uct_ep_h tl_ep, uct_tag_t tag, uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); UCT_DC_MLX5_TXQP_DECL(txqp, txwq); - UCT_CHECK_LENGTH(header_length + sizeof(struct ibv_exp_tmh), 0, + UCT_CHECK_LENGTH(header_length + sizeof(struct ibv_tmh), 0, UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE), "tag_rndv_request"); UCT_DC_MLX5_CHECK_RES(iface, ep); @@ -752,7 +790,7 @@ ucs_status_t uct_dc_mlx5_ep_tag_rndv_request(uct_ep_h tl_ep, uct_tag_t tag, uct_rc_mlx5_txqp_tag_inline_post(&iface->super, UCT_IB_QPT_DCI, txqp, txwq, MLX5_OPCODE_SEND_IMM, header, header_length, NULL, tag, 0, - IBV_EXP_TMH_EAGER, 0, &ep->av, + IBV_TMH_EAGER, 0, &ep->av, uct_dc_mlx5_ep_get_grh(ep), uct_ib_mlx5_wqe_av_size(&ep->av), NULL, 0, MLX5_WQE_CTRL_SOLICITED); @@ -817,7 +855,8 @@ ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, if (dc_req->sender.global.is_global) { uct_ib_iface_fill_ah_attr_from_gid_lid(ib_iface, dc_req->lid, ucs_unaligned_ptr(&dc_req->sender.global.gid), - ib_iface->path_bits[0], &ah_attr); + iface->super.super.super.gid_info.gid_index, + 0, &ah_attr); status = uct_ib_iface_create_ah(ib_iface, &ah_attr, &ah); if (status != UCS_OK) { @@ -832,10 +871,12 @@ ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, av.fl_mlid = ib_iface->path_bits[0] & 0x7f; /* lid in dc_req is in BE already */ - av.rlid = uct_ib_iface_is_roce(ib_iface) ? 
0 : - (dc_req->lid | htons(ib_iface->path_bits[0])); + if (uct_ib_iface_is_roce(ib_iface)) { + av.rlid = htons(UCT_IB_ROCE_UDP_SRC_PORT_BASE); + } else { + av.rlid = dc_req->lid | htons(ib_iface->path_bits[0]); + } av.dqp_dct = htonl(dc_req->dct_num); - uct_dc_mlx5_iface_set_av_sport(iface, &av, dc_req->dct_num); if (!iface->ud_common.config.compact_av || ah_attr.is_global) { av.dqp_dct |= UCT_IB_MLX5_EXTENDED_UD_AV; @@ -850,7 +891,7 @@ ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, } else { ucs_assert(op == UCT_RC_EP_FC_FLAG_HARD_REQ); sender.ep = (uint64_t)dc_ep; - sender.global.gid = ib_iface->gid; + sender.global.gid = ib_iface->gid_info.gid; sender.global.is_global = dc_ep->flags & UCT_DC_MLX5_EP_FLAG_GRH; UCS_STATS_UPDATE_COUNTER(dc_ep->fc.stats, @@ -859,7 +900,7 @@ ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, uct_rc_mlx5_txqp_inline_post(&iface->super, UCT_IB_QPT_DCI, txqp, txwq, MLX5_OPCODE_SEND_IMM, &sender.global, sizeof(sender.global), op, sender.ep, - uct_dc_mlx5_get_dct_num(iface), + iface->rx.dct.qp_num, 0, 0, &dc_ep->av, uct_dc_mlx5_ep_get_grh(dc_ep), @@ -871,7 +912,8 @@ ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, } -UCS_CLASS_INIT_FUNC(uct_dc_mlx5_ep_t, uct_dc_mlx5_iface_t *iface, const uct_dc_mlx5_iface_addr_t *if_addr, +UCS_CLASS_INIT_FUNC(uct_dc_mlx5_ep_t, uct_dc_mlx5_iface_t *iface, + const uct_dc_mlx5_iface_addr_t *if_addr, uct_ib_mlx5_base_av_t *av) { uint32_t remote_dctn; @@ -884,8 +926,7 @@ UCS_CLASS_INIT_FUNC(uct_dc_mlx5_ep_t, uct_dc_mlx5_iface_t *iface, const uct_dc_m remote_dctn = uct_ib_unpack_uint24(if_addr->qp_num); memcpy(&self->av, av, sizeof(*av)); - self->av.dqp_dct |= htonl(remote_dctn); - uct_dc_mlx5_iface_set_av_sport(iface, &self->av, remote_dctn); + self->av.dqp_dct |= htonl(remote_dctn); return uct_dc_mlx5_ep_basic_init(iface, self); } @@ -895,7 +936,6 @@ static UCS_CLASS_CLEANUP_FUNC(uct_dc_mlx5_ep_t) uct_dc_mlx5_iface_t *iface = 
ucs_derived_of(self->super.super.iface, uct_dc_mlx5_iface_t); uct_dc_mlx5_ep_pending_purge(&self->super.super, NULL, NULL); - ucs_arbiter_group_cleanup(uct_dc_mlx5_ep_arb_group(iface, self)); uct_rc_fc_cleanup(&self->fc); ucs_assert_always(self->flags & UCT_DC_MLX5_EP_FLAG_VALID); @@ -908,6 +948,7 @@ static UCS_CLASS_CLEANUP_FUNC(uct_dc_mlx5_ep_t) /* TODO: this is good for dcs policy only. * Need to change if eps share dci */ + ucs_arbiter_group_cleanup(uct_dc_mlx5_ep_arb_group(iface, self)); ucs_assertv_always(uct_dc_mlx5_iface_dci_has_outstanding(iface, self->dci), "iface (%p) ep (%p) dci leak detected: dci=%d", iface, self, self->dci); @@ -917,10 +958,7 @@ static UCS_CLASS_CLEANUP_FUNC(uct_dc_mlx5_ep_t) self, (int16_t)iface->super.super.config.tx_qp_len - uct_rc_txqp_available(&iface->tx.dcis[self->dci].txqp)); uct_rc_txqp_purge_outstanding(&iface->tx.dcis[self->dci].txqp, UCS_ERR_CANCELED, 1); - iface->tx.dcis[self->dci].ep = NULL; -#if ENABLE_ASSERT - iface->tx.dcis[self->dci].flags |= UCT_DC_DCI_FLAG_EP_DESTROYED; -#endif + iface->tx.dcis[self->dci].ep = NULL; } UCS_CLASS_DEFINE(uct_dc_mlx5_ep_t, uct_base_ep_t); @@ -962,6 +1000,8 @@ void uct_dc_mlx5_ep_cleanup(uct_ep_h tl_ep, ucs_class_t *cls) if (uct_dc_mlx5_ep_fc_wait_for_grant(ep)) { ucs_trace("not releasing dc_mlx5_ep %p - waiting for grant", ep); ep->flags &= ~UCT_DC_MLX5_EP_FLAG_VALID; + /* No need to wait for grant on this ep anymore */ + uct_dc_mlx5_ep_clear_fc_grant_flag(iface, ep); ucs_list_add_tail(&iface->tx.gc_list, &ep->list); } else { ucs_free(ep); @@ -976,6 +1016,47 @@ void uct_dc_mlx5_ep_release(uct_dc_mlx5_ep_t *ep) ucs_free(ep); } +void uct_dc_mlx5_ep_pending_common(uct_dc_mlx5_iface_t *iface, + uct_dc_mlx5_ep_t *ep, uct_pending_req_t *r, + unsigned flags, int push_to_head) +{ + int no_dci = (ep->dci == UCT_DC_MLX5_EP_NO_DCI); + ucs_arbiter_group_t *group; + + UCS_STATIC_ASSERT(sizeof(uct_dc_mlx5_pending_req_priv) <= + UCT_PENDING_REQ_PRIV_LEN); + + if (uct_dc_mlx5_iface_is_dci_rand(iface)) 
{ + uct_dc_mlx5_pending_req_priv(r)->ep = ep; + group = uct_dc_mlx5_ep_rand_arb_group(iface, ep); + } else { + group = &ep->arb_group; + } + + if (push_to_head) { + uct_pending_req_arb_group_push_head(no_dci ? + uct_dc_mlx5_iface_dci_waitq(iface) : + uct_dc_mlx5_iface_tx_waitq(iface), + group, r); + } else { + uct_pending_req_arb_group_push(group, r); + } + + if (no_dci) { + /* no dci: + * Do not grab dci here. Instead put the group on dci allocation arbiter. + * This way we can assure fairness between all eps waiting for + * dci allocation. Relevant for dcs and dcs_quota policies. + */ + uct_dc_mlx5_iface_schedule_dci_alloc(iface, ep); + } else { + uct_dc_mlx5_iface_dci_sched_tx(iface, ep); + } + + UCT_TL_EP_STAT_PEND(&ep->super); +} + + /* TODO: currently pending code supports only dcs policy support hash/random policies @@ -984,15 +1065,14 @@ ucs_status_t uct_dc_mlx5_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *r, unsigned flags) { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); - uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); - ucs_arbiter_group_t *group; + uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); /* ep can tx iff * - iface has resources: cqe and tx skb * - dci is either assigned or can be assigned * - dci has resources */ - if (uct_rc_iface_has_tx_resources(&iface->super.super)) { + if (uct_dc_mlx5_iface_has_tx_resources(iface)) { if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) { if (uct_dc_mlx5_iface_dci_can_alloc(iface) && (ep->fc.fc_wnd > 0)) { return UCS_ERR_BUSY; @@ -1004,31 +1084,8 @@ ucs_status_t uct_dc_mlx5_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *r, } } - UCS_STATIC_ASSERT(sizeof(uct_dc_mlx5_pending_req_priv) <= - UCT_PENDING_REQ_PRIV_LEN); - - if (uct_dc_mlx5_iface_is_dci_rand(iface)) { - ucs_assert(ep->dci != UCT_DC_MLX5_EP_NO_DCI); - uct_dc_mlx5_pending_req_priv(r)->ep = ep; - group = uct_dc_mlx5_ep_rand_arb_group(iface, ep); - } else { - group = &ep->arb_group; - } - 
uct_pending_req_arb_group_push(group, r); - - if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) { - /* no dci: - * Do not grab dci here. Instead put the group on dci allocation arbiter. - * This way we can assure fairness between all eps waiting for - * dci allocation. Relevant for dcs and dcs_quota policies. - */ - uct_dc_mlx5_iface_schedule_dci_alloc(iface, ep); - UCT_TL_EP_STAT_PEND(&ep->super); - return UCS_OK; - } + uct_dc_mlx5_ep_pending_common(iface, ep, r, flags, 0); - uct_dc_mlx5_iface_dci_sched_tx(iface, ep); - UCT_TL_EP_STAT_PEND(&ep->super); return UCS_OK; } @@ -1038,10 +1095,11 @@ ucs_status_t uct_dc_mlx5_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *r, */ ucs_arbiter_cb_result_t uct_dc_mlx5_iface_dci_do_pending_wait(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { - uct_dc_mlx5_ep_t *ep = ucs_container_of(ucs_arbiter_elem_group(elem), uct_dc_mlx5_ep_t, arb_group); + uct_dc_mlx5_ep_t *ep = ucs_container_of(group, uct_dc_mlx5_ep_t, arb_group); uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_dc_mlx5_iface_t); ucs_assert(!uct_dc_mlx5_iface_is_dci_rand(iface)); @@ -1068,13 +1126,15 @@ uct_dc_mlx5_iface_dci_do_common_pending_tx(uct_dc_mlx5_ep_t *ep, uct_dc_mlx5_iface_t); ucs_status_t status; - if (!uct_rc_iface_has_tx_resources(&iface->super.super)) { + if (!uct_dc_mlx5_iface_has_tx_resources(iface)) { return UCS_ARBITER_CB_RESULT_STOP; } + ucs_trace_data("progressing pending request %p", req); status = req->func(req); - ucs_trace_data("progress pending request %p returned: %s", req, + ucs_trace_data("status returned from progress pending: %s", ucs_status_string(status)); + if (status == UCS_OK) { return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; } else if (status == UCS_INPROGRESS) { @@ -1085,7 +1145,7 @@ uct_dc_mlx5_iface_dci_do_common_pending_tx(uct_dc_mlx5_ep_t *ep, return UCS_ARBITER_CB_RESULT_DESCHED_GROUP; } - ucs_assertv(!uct_rc_iface_has_tx_resources(&iface->super.super), + 
ucs_assertv(!uct_dc_mlx5_iface_has_tx_resources(iface), "pending callback returned error but send resources are available"); return UCS_ARBITER_CB_RESULT_STOP; } @@ -1095,23 +1155,25 @@ uct_dc_mlx5_iface_dci_do_common_pending_tx(uct_dc_mlx5_ep_t *ep, */ ucs_arbiter_cb_result_t uct_dc_mlx5_iface_dci_do_dcs_pending_tx(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { - uct_dc_mlx5_ep_t *ep = ucs_container_of(ucs_arbiter_elem_group(elem), - uct_dc_mlx5_ep_t, arb_group); + uct_dc_mlx5_ep_t *ep = ucs_container_of(group, uct_dc_mlx5_ep_t, + arb_group); uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_dc_mlx5_iface_t); + int is_only = ucs_arbiter_elem_is_only(elem); ucs_arbiter_cb_result_t res; - res = uct_dc_mlx5_iface_dci_do_common_pending_tx(ep, elem); + res = uct_dc_mlx5_iface_dci_do_common_pending_tx(ep, elem); if (res == UCS_ARBITER_CB_RESULT_REMOVE_ELEM) { /* For dcs* policies release dci if this is the last elem in the group * and the dci has no outstanding operations. For example pending * callback did not send anything. 
(uct_ep_flush or just return ok) */ - if (ucs_arbiter_elem_is_last(&ep->arb_group, elem)) { + if (is_only) { uct_dc_mlx5_iface_dci_free(iface, ep); } } @@ -1124,6 +1186,7 @@ uct_dc_mlx5_iface_dci_do_dcs_pending_tx(ucs_arbiter_t *arbiter, */ ucs_arbiter_cb_result_t uct_dc_mlx5_iface_dci_do_rand_pending_tx(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { @@ -1145,7 +1208,7 @@ uct_dc_mlx5_iface_dci_do_rand_pending_tx(ucs_arbiter_t *arbiter, } static ucs_arbiter_cb_result_t -uct_dc_mlx5_ep_abriter_purge_cb(ucs_arbiter_t *arbiter, +uct_dc_mlx5_ep_abriter_purge_cb(ucs_arbiter_t *arbiter, ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { uct_purge_cb_args_t *cb_args = arg; @@ -1209,7 +1272,7 @@ ucs_status_t uct_dc_mlx5_ep_check_fc(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_ if (iface->super.super.config.fc_enabled) { UCT_RC_CHECK_FC_WND(&ep->fc, ep->super.stats); if ((ep->fc.fc_wnd == iface->super.super.config.fc_hard_thresh) && - !(ep->fc.flags & UCT_DC_MLX5_EP_FC_FLAG_WAIT_FOR_GRANT)) { + !uct_dc_mlx5_ep_fc_wait_for_grant(ep)) { status = uct_rc_fc_ctrl(&ep->super.super, UCT_RC_EP_FC_FLAG_HARD_REQ, NULL); @@ -1217,6 +1280,7 @@ ucs_status_t uct_dc_mlx5_ep_check_fc(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_ return status; } ep->fc.flags |= UCT_DC_MLX5_EP_FC_FLAG_WAIT_FOR_GRANT; + ++iface->tx.fc_grants; } } else { /* Set fc_wnd to max, to send as much as possible without checks */ @@ -1224,3 +1288,72 @@ ucs_status_t uct_dc_mlx5_ep_check_fc(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_ } return UCS_OK; } + +void uct_dc_mlx5_ep_handle_failure(uct_dc_mlx5_ep_t *ep, void *arg, + ucs_status_t ep_status) +{ + uct_iface_h tl_iface = ep->super.super.iface; + uint8_t dci = ep->dci; + uct_ib_iface_t *ib_iface = ucs_derived_of(tl_iface, uct_ib_iface_t); + uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_dc_mlx5_iface_t); + uct_rc_txqp_t *txqp = &iface->tx.dcis[dci].txqp; + uct_ib_mlx5_txwq_t *txwq = 
&iface->tx.dcis[dci].txwq; + int16_t outstanding; + ucs_status_t status; + + ucs_assert(!uct_dc_mlx5_iface_is_dci_rand(iface)); + + uct_rc_txqp_purge_outstanding(txqp, ep_status, 0); + + /* poll_cqe for mlx5 returns NULL in case of failure and the cq_avaialble + is not updated for the error cqe and all outstanding wqes*/ + outstanding = (int16_t)iface->super.super.config.tx_qp_len - + uct_rc_txqp_available(txqp); + iface->super.super.tx.cq_available += outstanding; + uct_rc_txqp_available_set(txqp, (int16_t)iface->super.super.config.tx_qp_len); + + /* since we removed all outstanding ops on the dci, it should be released */ + ucs_assert(ep->dci != UCT_DC_MLX5_EP_NO_DCI); + uct_dc_mlx5_iface_dci_put(iface, dci); + ucs_assert_always(ep->dci == UCT_DC_MLX5_EP_NO_DCI); + + if (uct_dc_mlx5_ep_fc_wait_for_grant(ep)) { + /* No need to wait for grant on this ep anymore */ + uct_dc_mlx5_ep_clear_fc_grant_flag(iface, ep); + } + + if (ep == iface->tx.fc_ep) { + ucs_assert(ep_status != UCS_ERR_CANCELED); + /* Cannot handle errors on flow-control endpoint. + * Or shall we ignore them? 
+ */ + ucs_debug("got error on DC flow-control endpoint, iface %p: %s", iface, + ucs_status_string(ep_status)); + } else { + status = ib_iface->ops->set_ep_failed(ib_iface, &ep->super.super, + ep_status); + if (status != UCS_OK) { + uct_ib_mlx5_completion_with_err(ib_iface, arg, + &iface->tx.dcis[dci].txwq, + UCS_LOG_LEVEL_FATAL); + return; + } + } + + if (ep_status != UCS_ERR_CANCELED) { + uct_ib_mlx5_completion_with_err(ib_iface, arg, &iface->tx.dcis[dci].txwq, + ib_iface->super.config.failure_level); + } + + status = uct_dc_mlx5_iface_reset_dci(iface, &iface->tx.dcis[dci]); + if (status != UCS_OK) { + ucs_fatal("iface %p failed to reset dci[%d] qpn 0x%x: %s", + iface, dci, txwq->super.qp_num, ucs_status_string(status)); + } + + status = uct_dc_mlx5_iface_dci_connect(iface, &iface->tx.dcis[dci]); + if (status != UCS_OK) { + ucs_fatal("iface %p failed to connect dci[%d] qpn 0x%x: %s", + iface, dci, txwq->super.qp_num, ucs_status_string(status)); + } +} diff --git a/src/uct/ib/dc/dc_mlx5_ep.h b/src/uct/ib/dc/dc_mlx5_ep.h index 982a06d728c..037002d6b61 100644 --- a/src/uct/ib/dc/dc_mlx5_ep.h +++ b/src/uct/ib/dc/dc_mlx5_ep.h @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2016-2018. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2016-2020. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -13,11 +13,15 @@ #include "dc_mlx5.h" +#define UCT_DC_MLX5_EP_NO_DCI ((uint8_t)-1) + + enum { /* Indicates that FC grant has been requested, but is not received yet. * Flush will not complete until an outgoing grant request is acked. - * It is needed to avoid the case when grant arrives for the recently - * deleted ep. */ + * It is needed to avoid the following cases: + * 1) Grant arrives for the recently deleted ep. + * 2) QP resources are available, but there are some pending requests. 
*/ UCT_DC_MLX5_EP_FC_FLAG_WAIT_FOR_GRANT = UCS_BIT(0) }; @@ -118,7 +122,7 @@ ucs_status_t uct_dc_mlx5_ep_atomic32_fetch(uct_ep_h ep, uct_atomic_op_t opcode, uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp); -#if IBV_EXP_HW_TM_DC +#if IBV_HW_TM ucs_status_t uct_dc_mlx5_ep_tag_eager_short(uct_ep_h tl_ep, uct_tag_t tag, const void *data, size_t length); @@ -165,16 +169,19 @@ ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, ucs_arbiter_cb_result_t uct_dc_mlx5_iface_dci_do_pending_wait(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg); ucs_arbiter_cb_result_t uct_dc_mlx5_iface_dci_do_dcs_pending_tx(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg); ucs_arbiter_cb_result_t uct_dc_mlx5_iface_dci_do_rand_pending_tx(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg); @@ -182,6 +189,10 @@ ucs_status_t uct_dc_mlx5_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *r, unsigned flags); void uct_dc_mlx5_ep_pending_purge(uct_ep_h tl_ep, uct_pending_purge_callback_t cb, void *arg); +void uct_dc_mlx5_ep_pending_common(uct_dc_mlx5_iface_t *iface, + uct_dc_mlx5_ep_t *ep, uct_pending_req_t *r, + unsigned flags, int push_to_head); + void uct_dc_mlx5_ep_cleanup(uct_ep_h tl_ep, ucs_class_t *cls); void uct_dc_mlx5_ep_release(uct_dc_mlx5_ep_t *ep); @@ -200,7 +211,9 @@ static UCS_F_ALWAYS_INLINE int uct_dc_mlx5_iface_is_dci_rand(uct_dc_mlx5_iface_t static UCS_F_ALWAYS_INLINE ucs_arbiter_group_t* uct_dc_mlx5_ep_rand_arb_group(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep) { - ucs_assert(uct_dc_mlx5_iface_is_dci_rand(iface)); + ucs_assert(uct_dc_mlx5_iface_is_dci_rand(iface) && + (ep->dci != UCT_DC_MLX5_EP_NO_DCI)); + /* If DCI random policy is used, DCI is always assigned to EP */ return &iface->tx.dcis[ep->dci].arb_group; } @@ -232,6 +245,16 @@ uct_dc_mlx5_ep_from_dci(uct_dc_mlx5_iface_t *iface, uint8_t dci) return iface->tx.dcis[dci].ep; 
} +static UCS_F_ALWAYS_INLINE void +uct_dc_mlx5_ep_clear_fc_grant_flag(uct_dc_mlx5_iface_t *iface, + uct_dc_mlx5_ep_t *ep) +{ + ucs_assert((ep->fc.flags & UCT_DC_MLX5_EP_FC_FLAG_WAIT_FOR_GRANT) && + iface->tx.fc_grants); + ep->fc.flags &= ~UCT_DC_MLX5_EP_FC_FLAG_WAIT_FOR_GRANT; + --iface->tx.fc_grants; +} + enum uct_dc_mlx5_ep_flags { UCT_DC_MLX5_EP_FLAG_TX_WAIT = UCS_BIT(0), /* ep is in the tx_wait state. See description of the dcs+quota dci @@ -242,7 +265,8 @@ enum uct_dc_mlx5_ep_flags { }; -#define UCT_DC_MLX5_EP_NO_DCI ((uint8_t)-1) +void uct_dc_mlx5_ep_handle_failure(uct_dc_mlx5_ep_t *ep, void *arg, + ucs_status_t status); static UCS_F_ALWAYS_INLINE ucs_status_t uct_dc_mlx5_ep_basic_init(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep) @@ -277,7 +301,7 @@ uct_dc_mlx5_iface_progress_pending(uct_dc_mlx5_iface_t *iface) * Pending op on the tx_waitq can complete with the UCS_OK * status without actually sending anything on the dci. * In this case pending ops on the waitq may never be - * scdeduled. + * scheduled. * * So we keep progressing pending while dci_waitq is not * empty and it is possible to allocate a dci. 
@@ -314,7 +338,18 @@ void uct_dc_mlx5_iface_schedule_dci_alloc(uct_dc_mlx5_iface_t *iface, uct_dc_mlx } } -static inline void uct_dc_mlx5_iface_dci_put(uct_dc_mlx5_iface_t *iface, uint8_t dci) +static UCS_F_ALWAYS_INLINE void + uct_dc_mlx5_iface_dci_release(uct_dc_mlx5_iface_t *iface, uint8_t dci) +{ + iface->tx.stack_top--; + iface->tx.dcis_stack[iface->tx.stack_top] = dci; +#if UCS_ENABLE_ASSERT + iface->tx.dcis[dci].flags = 0; +#endif +} + +static UCS_F_ALWAYS_INLINE void + uct_dc_mlx5_iface_dci_put(uct_dc_mlx5_iface_t *iface, uint8_t dci) { uct_dc_mlx5_ep_t *ep; @@ -326,14 +361,14 @@ static inline void uct_dc_mlx5_iface_dci_put(uct_dc_mlx5_iface_t *iface, uint8_t ucs_assert(iface->tx.stack_top > 0); - if (uct_dc_mlx5_iface_dci_has_outstanding(iface, dci)) { - if (ep == NULL) { - /* The EP was destroyed after flush cancel */ - ucs_assert(ucs_test_all_flags(iface->tx.dcis[dci].flags, - (UCT_DC_DCI_FLAG_EP_CANCELED | - UCT_DC_DCI_FLAG_EP_DESTROYED))); - return; + if (ucs_unlikely(ep == NULL)) { + if (!uct_dc_mlx5_iface_dci_has_outstanding(iface, dci)) { + uct_dc_mlx5_iface_dci_release(iface, dci); } + return; + } + + if (uct_dc_mlx5_iface_dci_has_outstanding(iface, dci)) { if (iface->tx.policy == UCT_DC_TX_POLICY_DCS_QUOTA) { /* in tx_wait state: * - if there are no eps are waiting for dci allocation @@ -349,15 +384,8 @@ static inline void uct_dc_mlx5_iface_dci_put(uct_dc_mlx5_iface_t *iface, uint8_t ucs_arbiter_group_schedule(uct_dc_mlx5_iface_tx_waitq(iface), &ep->arb_group); return; } - iface->tx.stack_top--; - iface->tx.dcis_stack[iface->tx.stack_top] = dci; -#if ENABLE_ASSERT - iface->tx.dcis[dci].flags = 0; -#endif - if (ucs_unlikely(ep == NULL)) { - return; - } + uct_dc_mlx5_iface_dci_release(iface, dci); ucs_assert(uct_dc_mlx5_ep_from_dci(iface, dci)->dci != UCT_DC_MLX5_EP_NO_DCI); ep->dci = UCT_DC_MLX5_EP_NO_DCI; @@ -371,13 +399,6 @@ static inline void uct_dc_mlx5_iface_dci_put(uct_dc_mlx5_iface_t *iface, uint8_t 
uct_dc_mlx5_iface_schedule_dci_alloc(iface, ep); } -static inline ucs_status_t -uct_dc_mlx5_iface_check_txqp(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep, uct_rc_txqp_t *txqp) -{ - UCT_RC_CHECK_TXQP(&iface->super.super, ep, txqp); - return UCS_OK; -} - static inline void uct_dc_mlx5_iface_dci_alloc(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep) { /* take a first available dci from stack. @@ -410,30 +431,28 @@ static inline void uct_dc_mlx5_iface_dci_free(uct_dc_mlx5_iface_t *iface, uct_dc return; } - iface->tx.stack_top--; - iface->tx.dcis_stack[iface->tx.stack_top] = dci; - iface->tx.dcis[dci].ep = NULL; -#if ENABLE_ASSERT - iface->tx.dcis[ep->dci].flags = 0; -#endif + uct_dc_mlx5_iface_dci_release(iface, dci); - ep->dci = UCT_DC_MLX5_EP_NO_DCI; - ep->flags &= ~UCT_DC_MLX5_EP_FLAG_TX_WAIT; + iface->tx.dcis[dci].ep = NULL; + ep->dci = UCT_DC_MLX5_EP_NO_DCI; + ep->flags &= ~UCT_DC_MLX5_EP_FLAG_TX_WAIT; } -static inline ucs_status_t uct_dc_mlx5_iface_dci_get(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep) +static UCS_F_ALWAYS_INLINE ucs_status_t +uct_dc_mlx5_iface_dci_get(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep) { uct_rc_txqp_t *txqp; int16_t available; + ucs_assert(!iface->super.super.config.tx_moderation); + if (uct_dc_mlx5_iface_is_dci_rand(iface)) { if (uct_dc_mlx5_iface_dci_has_tx_resources(iface, ep->dci)) { return UCS_OK; } else { - txqp = &iface->tx.dcis[ep->dci].txqp; - UCS_STATS_UPDATE_COUNTER(txqp->stats, UCT_RC_TXQP_STAT_QP_FULL, 1); - UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1); - return UCS_ERR_NO_RESOURCE; + UCS_STATS_UPDATE_COUNTER(iface->tx.dcis[ep->dci].txqp.stats, + UCT_RC_TXQP_STAT_QP_FULL, 1); + goto out_no_res; } } @@ -441,8 +460,7 @@ static inline ucs_status_t uct_dc_mlx5_iface_dci_get(uct_dc_mlx5_iface_t *iface, /* dci is already assigned - keep using it */ if ((iface->tx.policy == UCT_DC_TX_POLICY_DCS_QUOTA) && (ep->flags & UCT_DC_MLX5_EP_FLAG_TX_WAIT)) { - UCS_STATS_UPDATE_COUNTER(ep->super.stats, 
UCT_EP_STAT_NO_RES, 1); - return UCS_ERR_NO_RESOURCE; + goto out_no_res; } /* if dci has sent more than quota, and there are eps waiting for dci @@ -455,24 +473,26 @@ static inline ucs_status_t uct_dc_mlx5_iface_dci_get(uct_dc_mlx5_iface_t *iface, !ucs_arbiter_is_empty(uct_dc_mlx5_iface_dci_waitq(iface))) { ep->flags |= UCT_DC_MLX5_EP_FLAG_TX_WAIT; - UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1); - return UCS_ERR_NO_RESOURCE; + goto out_no_res; } if (available <= 0) { UCS_STATS_UPDATE_COUNTER(txqp->stats, UCT_RC_TXQP_STAT_QP_FULL, 1); - UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1); - return UCS_ERR_NO_RESOURCE; + goto out_no_res; } return UCS_OK; } - if (uct_dc_mlx5_iface_dci_can_alloc(iface)) { + /* Do not alloc dci if no TX desc resources, + * otherwise this dci may never be released. */ + if (uct_dc_mlx5_iface_dci_can_alloc(iface) && + uct_dc_mlx5_iface_has_tx_resources(iface)) { uct_dc_mlx5_iface_dci_alloc(iface, ep); return UCS_OK; } +out_no_res: /* we will have to wait until someone releases dci */ UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1); return UCS_ERR_NO_RESOURCE; @@ -493,32 +513,37 @@ static inline struct mlx5_grh_av *uct_dc_mlx5_ep_get_grh(uct_dc_mlx5_ep_t *ep) #define UCT_DC_MLX5_TXQP_DECL(_txqp, _txwq) \ - uct_rc_txqp_t *_txqp; \ - uct_ib_mlx5_txwq_t *_txwq; + uct_rc_txqp_t UCS_V_UNUSED *_txqp; \ + uct_ib_mlx5_txwq_t UCS_V_UNUSED *_txwq; #define UCT_DC_MLX5_CHECK_RES(_iface, _ep) \ { \ - ucs_status_t status; \ - status = uct_dc_mlx5_iface_dci_get(_iface, _ep); \ - if (ucs_unlikely(status != UCS_OK)) { \ - return status; \ + ucs_status_t _status = uct_dc_mlx5_iface_dci_get(_iface, _ep); \ + if (ucs_unlikely(_status != UCS_OK)) { \ + return _status; \ } \ - UCT_RC_CHECK_CQE(&(_iface)->super.super, _ep, \ - &(_iface)->tx.dcis[(_ep)->dci].txqp); \ } #define UCT_DC_CHECK_RES_PTR(_iface, _ep) \ { \ - ucs_status_t status; \ - status = uct_dc_mlx5_iface_dci_get(_iface, _ep); \ + ucs_status_t status 
= uct_dc_mlx5_iface_dci_get(_iface, _ep); \ if (ucs_unlikely(status != UCS_OK)) { \ return UCS_STATUS_PTR(status); \ } \ - UCT_RC_CHECK_CQE_RET(&(_iface)->super.super, _ep, \ - &(_iface)->tx.dcis[(_ep)->dci].txqp, \ - UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE)); \ + } + + +/** + * All RMA and AMO operations are not allowed if no RDMA_READ credits. + * Otherwise operations ordering can be broken (which fence operation + * relies on). + */ +#define UCT_DC_MLX5_CHECK_RMA_RES(_iface, _ep) \ + { \ + UCT_RC_CHECK_NUM_RDMA_READ(&(_iface)->super.super) \ + UCT_DC_MLX5_CHECK_RES(_iface, _ep) \ } @@ -530,15 +555,16 @@ static inline struct mlx5_grh_av *uct_dc_mlx5_ep_get_grh(uct_dc_mlx5_ep_t *ep) { \ if (ucs_unlikely((_ep)->fc.fc_wnd <= \ (_iface)->super.super.config.fc_hard_thresh)) { \ - ucs_status_t status = uct_dc_mlx5_ep_check_fc(_iface, _ep); \ - if (ucs_unlikely(status != UCS_OK)) { \ + ucs_status_t _status = uct_dc_mlx5_ep_check_fc(_iface, _ep); \ + if (ucs_unlikely(_status != UCS_OK)) { \ if (((_ep)->dci != UCT_DC_MLX5_EP_NO_DCI) && \ !uct_dc_mlx5_iface_is_dci_rand(_iface)) { \ - ucs_assertv_always(uct_dc_mlx5_iface_dci_has_outstanding(_iface, (_ep)->dci), \ + ucs_assertv_always(uct_dc_mlx5_iface_dci_has_outstanding(_iface, \ + (_ep)->dci), \ "iface (%p) ep (%p) dci leak detected: dci=%d", \ _iface, _ep, (_ep)->dci); \ } \ - return status; \ + return _status; \ } \ } \ UCT_DC_MLX5_CHECK_RES(_iface, _ep) \ diff --git a/src/uct/ib/mlx5/dv/ib_mlx5_dv.c b/src/uct/ib/mlx5/dv/ib_mlx5_dv.c new file mode 100644 index 00000000000..92cd87a495f --- /dev/null +++ b/src/uct/ib/mlx5/dv/ib_mlx5_dv.c @@ -0,0 +1,318 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "ib_mlx5_ifc.h" + +#include +#include + +#if HAVE_DECL_MLX5DV_INIT_OBJ +ucs_status_t uct_ib_mlx5dv_init_obj(uct_ib_mlx5dv_t *obj, uint64_t type) +{ + int ret; + + ret = mlx5dv_init_obj(&obj->dv, type); +#ifdef HAVE_IBV_EXP_DM + if (!ret && (type & MLX5DV_OBJ_DM)) { + ret = uct_ib_mlx5_get_dm_info(obj->dv_dm.in, obj->dv_dm.out); + } +#endif + if (ret != 0) { + ucs_error("DV failed to get mlx5 information. Type %lx.", type); + return UCS_ERR_NO_DEVICE; + } + + return UCS_OK; +} +#endif + +#if HAVE_DEVX +ucs_status_t uct_ib_mlx5_devx_create_qp(uct_ib_iface_t *iface, + uct_ib_mlx5_qp_t *qp, + uct_ib_mlx5_txwq_t *tx, + uct_ib_mlx5_qp_attr_t *attr) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.md, uct_ib_mlx5_md_t); + uct_ib_device_t *dev = &md->super.dev; + struct mlx5dv_pd dvpd = {}; + struct mlx5dv_cq dvscq = {}; + struct mlx5dv_cq dvrcq = {}; + struct mlx5dv_obj dv = {}; + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(create_qp_in)] = {}; + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_qp_out)] = {}; + char in_2init[UCT_IB_MLX5DV_ST_SZ_BYTES(rst2init_qp_in)] = {}; + char out_2init[UCT_IB_MLX5DV_ST_SZ_BYTES(rst2init_qp_out)] = {}; + uct_ib_mlx5_mmio_mode_t mmio_mode; + int max_tx, max_rx, len_tx, len; + uct_ib_mlx5_devx_uar_t *uar; + ucs_status_t status; + int wqe_size; + int dvflags; + void *qpc; + int ret; + + uct_ib_iface_fill_attr(iface, &attr->super); + + status = uct_ib_mlx5_get_mmio_mode(iface->super.worker, attr->mmio_mode, + UCT_IB_MLX5_BF_REG_SIZE, &mmio_mode); + if (status != UCS_OK) { + goto err; + } + + uar = uct_worker_tl_data_get(iface->super.worker, + UCT_IB_MLX5_DEVX_UAR_KEY, + uct_ib_mlx5_devx_uar_t, + uct_ib_mlx5_devx_uar_cmp, + uct_ib_mlx5_devx_uar_init, + md, mmio_mode); + if (UCS_PTR_IS_ERR(uar)) { + status = UCS_PTR_STATUS(uar); + goto err; + } + + wqe_size = sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_wqe_mkey_context_seg) + + 
ucs_max(sizeof(struct mlx5_wqe_umr_klm_seg), 64) + + ucs_max(attr->super.cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg), + ucs_align_up(sizeof(struct mlx5_wqe_inl_data_seg) + + attr->super.cap.max_inline_data, 16)); + len_tx = ucs_roundup_pow2_or0(attr->super.cap.max_send_wr * wqe_size); + max_tx = len_tx / MLX5_SEND_WQE_BB; + max_rx = ucs_roundup_pow2_or0(attr->super.cap.max_recv_wr); + len = len_tx + max_rx * UCT_IB_MLX5_MAX_BB * UCT_IB_MLX5_WQE_SEG_SIZE; + + if (tx != NULL) { + status = uct_ib_mlx5_md_buf_alloc(md, len, 0, &qp->devx.wq_buf, + &qp->devx.mem, "qp umem"); + if (status != UCS_OK) { + goto err_uar; + } + } else { + qp->devx.wq_buf = NULL; + } + + qp->devx.dbrec = uct_ib_mlx5_get_dbrec(md); + if (!qp->devx.dbrec) { + status = UCS_ERR_NO_MEMORY; + goto err_free_mem; + } + + dv.pd.in = attr->super.ibv.pd; + dv.pd.out = &dvpd; + dv.cq.in = attr->super.ibv.send_cq; + dv.cq.out = &dvscq; + dvflags = MLX5DV_OBJ_PD | MLX5DV_OBJ_CQ; + mlx5dv_init_obj(&dv, dvflags); + dv.cq.in = attr->super.ibv.recv_cq; + dv.cq.out = &dvrcq; + mlx5dv_init_obj(&dv, MLX5DV_OBJ_CQ); + + UCT_IB_MLX5DV_SET(create_qp_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_QP); + qpc = UCT_IB_MLX5DV_ADDR_OF(create_qp_in, in, qpc); + UCT_IB_MLX5DV_SET(qpc, qpc, st, UCT_IB_MLX5_QPC_ST_RC); + UCT_IB_MLX5DV_SET(qpc, qpc, pm_state, UCT_IB_MLX5_QPC_PM_STATE_MIGRATED); + UCT_IB_MLX5DV_SET(qpc, qpc, pd, dvpd.pdn); + UCT_IB_MLX5DV_SET(qpc, qpc, uar_page, uar->uar->page_id); + ucs_assert((attr->super.srq == NULL) || (attr->super.srq_num != 0)); + UCT_IB_MLX5DV_SET(qpc, qpc, rq_type, !!attr->super.srq_num); + UCT_IB_MLX5DV_SET(qpc, qpc, srqn_rmpn_xrqn, attr->super.srq_num); + UCT_IB_MLX5DV_SET(qpc, qpc, cqn_snd, dvscq.cqn); + UCT_IB_MLX5DV_SET(qpc, qpc, cqn_rcv, dvrcq.cqn); + /* cppcheck-suppress internalAstError */ + UCT_IB_MLX5DV_SET(qpc, qpc, log_sq_size, ucs_ilog2_or0(max_tx)); + UCT_IB_MLX5DV_SET(qpc, qpc, log_rq_size, ucs_ilog2_or0(max_rx)); + UCT_IB_MLX5DV_SET(qpc, qpc, cs_req, + 
uct_ib_mlx5_qpc_cs_req(attr->super.max_inl_cqe[UCT_IB_DIR_TX])); + UCT_IB_MLX5DV_SET(qpc, qpc, cs_res, + uct_ib_mlx5_qpc_cs_res(attr->super.max_inl_cqe[UCT_IB_DIR_RX], 0)); + UCT_IB_MLX5DV_SET64(qpc, qpc, dbr_addr, qp->devx.dbrec->offset); + UCT_IB_MLX5DV_SET(qpc, qpc, dbr_umem_id, qp->devx.dbrec->mem_id); + + if (qp->devx.wq_buf == NULL) { + UCT_IB_MLX5DV_SET(qpc, qpc, no_sq, true); + UCT_IB_MLX5DV_SET(qpc, qpc, offload_type, true); + UCT_IB_MLX5DV_SET(create_qp_in, in, wq_umem_id, md->zero_mem.mem->umem_id); + } else { + UCT_IB_MLX5DV_SET(create_qp_in, in, wq_umem_id, qp->devx.mem.mem->umem_id); + } + + qp->devx.obj = mlx5dv_devx_obj_create(dev->ibv_context, in, sizeof(in), + out, sizeof(out)); + if (!qp->devx.obj) { + ucs_error("mlx5dv_devx_obj_create(QP) failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(create_qp_out, out, syndrome)); + status = UCS_ERR_IO_ERROR; + goto err_free_db; + } + + qp->qp_num = UCT_IB_MLX5DV_GET(create_qp_out, out, qpn); + + qpc = UCT_IB_MLX5DV_ADDR_OF(rst2init_qp_in, in_2init, qpc); + UCT_IB_MLX5DV_SET(rst2init_qp_in, in_2init, opcode, UCT_IB_MLX5_CMD_OP_RST2INIT_QP); + UCT_IB_MLX5DV_SET(rst2init_qp_in, in_2init, qpn, qp->qp_num); + UCT_IB_MLX5DV_SET(qpc, qpc, pm_state, UCT_IB_MLX5_QPC_PM_STATE_MIGRATED); + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->super.port); + UCT_IB_MLX5DV_SET(qpc, qpc, rwe, true); + + ret = mlx5dv_devx_obj_modify(qp->devx.obj, in_2init, sizeof(in_2init), + out_2init, sizeof(out_2init)); + if (ret) { + ucs_error("mlx5dv_devx_obj_modify(2INIT_QP) failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(rst2init_qp_out, out_2init, syndrome)); + status = UCS_ERR_IO_ERROR; + goto err_free; + } + + qp->type = UCT_IB_MLX5_OBJ_TYPE_DEVX; + + attr->super.cap.max_send_wr = max_tx; + attr->super.cap.max_recv_wr = max_rx; + + if (tx != NULL) { + tx->reg = &uar->super; + tx->qstart = qp->devx.wq_buf; + tx->qend = UCS_PTR_BYTE_OFFSET(qp->devx.wq_buf, len_tx); + tx->dbrec = &qp->devx.dbrec->db[MLX5_SND_DBR]; + 
tx->bb_max = max_tx - 2 * UCT_IB_MLX5_MAX_BB; + ucs_assert(*tx->dbrec == 0); + uct_ib_mlx5_txwq_reset(tx); + } else { + uct_worker_tl_data_put(uar, uct_ib_mlx5_devx_uar_cleanup); + } + + return UCS_OK; + +err_free: + mlx5dv_devx_obj_destroy(qp->devx.obj); +err_free_db: + uct_ib_mlx5_put_dbrec(qp->devx.dbrec); +err_free_mem: + uct_ib_mlx5_md_buf_free(md, qp->devx.wq_buf, &qp->devx.mem); +err_uar: + uct_worker_tl_data_put(uar, uct_ib_mlx5_devx_uar_cleanup); +err: + return status; +} + +ucs_status_t uct_ib_mlx5_devx_modify_qp(uct_ib_mlx5_qp_t *qp, + const void *in, size_t inlen, + void *out, size_t outlen) +{ + int ret; + + switch (qp->type) { + case UCT_IB_MLX5_OBJ_TYPE_VERBS: + ret = mlx5dv_devx_qp_modify(qp->verbs.qp, in, inlen, out, outlen); + if (ret) { + ucs_error("mlx5dv_devx_qp_modify(%x) failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(modify_qp_in, in, opcode), + UCT_IB_MLX5DV_GET(modify_qp_out, out, syndrome)); + return UCS_ERR_IO_ERROR; + } + break; + case UCT_IB_MLX5_OBJ_TYPE_DEVX: + ret = mlx5dv_devx_obj_modify(qp->devx.obj, in, inlen, out, outlen); + if (ret) { + ucs_error("mlx5dv_devx_obj_modify(%x) failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(modify_qp_in, in, opcode), + UCT_IB_MLX5DV_GET(modify_qp_out, out, syndrome)); + return UCS_ERR_IO_ERROR; + } + break; + case UCT_IB_MLX5_OBJ_TYPE_LAST: + return UCS_ERR_UNSUPPORTED; + } + + return UCS_OK; +} + +ucs_status_t uct_ib_mlx5_devx_modify_qp_state(uct_ib_mlx5_qp_t *qp, + enum ibv_qp_state state) +{ + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(modify_qp_in)] = {}; + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(modify_qp_out)] = {}; + + switch (state) { + case IBV_QPS_ERR: + UCT_IB_MLX5DV_SET(modify_qp_in, in, opcode, UCT_IB_MLX5_CMD_OP_2ERR_QP); + break; + case IBV_QPS_RESET: + UCT_IB_MLX5DV_SET(modify_qp_in, in, opcode, UCT_IB_MLX5_CMD_OP_2RST_QP); + break; + default: + return UCS_ERR_UNSUPPORTED; + } + + UCT_IB_MLX5DV_SET(modify_qp_in, in, qpn, qp->qp_num); + return uct_ib_mlx5_devx_modify_qp(qp, in, sizeof(in), out, 
sizeof(out)); +} + +void uct_ib_mlx5_devx_destroy_qp(uct_ib_mlx5_md_t *md, uct_ib_mlx5_qp_t *qp) +{ + int ret = mlx5dv_devx_obj_destroy(qp->devx.obj); + if (ret) { + ucs_error("mlx5dv_devx_obj_destroy(QP) failed: %m"); + } + uct_ib_mlx5_put_dbrec(qp->devx.dbrec); + uct_ib_mlx5_md_buf_free(md, qp->devx.wq_buf, &qp->devx.mem); +} +#endif + +ucs_status_t uct_ib_mlx5dv_arm_cq(uct_ib_mlx5_cq_t *cq, int solicited) +{ + uint64_t doorbell, sn_ci_cmd; + uint32_t sn, ci, cmd; + + sn = cq->cq_sn & 3; + ci = cq->cq_ci & 0xffffff; + cmd = solicited ? MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT; + sn_ci_cmd = (sn << 28) | cmd | ci; + + cq->dbrec[UCT_IB_MLX5_CQ_ARM_DB] = htobe32(sn_ci_cmd); + + ucs_memory_cpu_fence(); + + doorbell = (sn_ci_cmd << 32) | cq->cq_num; + + *(uint64_t *)((uint8_t *)cq->uar + MLX5_CQ_DOORBELL) = htobe64(doorbell); + + ucs_memory_bus_store_fence(); + + return UCS_OK; +} + +#if HAVE_DECL_MLX5DV_OBJ_AH +void uct_ib_mlx5_get_av(struct ibv_ah *ah, struct mlx5_wqe_av *av) +{ + struct mlx5dv_obj dv; + struct mlx5dv_ah dah; + + dv.ah.in = ah; + dv.ah.out = &dah; + mlx5dv_init_obj(&dv, MLX5DV_OBJ_AH); + + *av = *(dah.av); + av->dqp_dct |= UCT_IB_MLX5_EXTENDED_UD_AV; +} +#elif !defined (HAVE_INFINIBAND_MLX5_HW_H) +void uct_ib_mlx5_get_av(struct ibv_ah *ah, struct mlx5_wqe_av *av) +{ + ucs_bug("MLX5DV_OBJ_AH not supported"); +} +#endif + +#if HAVE_DEVX +ucs_status_t uct_ib_mlx5_get_compact_av(uct_ib_iface_t *iface, int *compact_av) +{ + *compact_av = !!(uct_ib_iface_device(iface)->flags & UCT_IB_DEVICE_FLAG_AV); + return UCS_OK; +} +#endif diff --git a/src/uct/ib/mlx5/dv/ib_mlx5_dv.h b/src/uct/ib/mlx5/dv/ib_mlx5_dv.h new file mode 100644 index 00000000000..2d00acd9fb7 --- /dev/null +++ b/src/uct/ib/mlx5/dv/ib_mlx5_dv.h @@ -0,0 +1,116 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifndef UCT_IB_MLX5_DV_H_ +#define UCT_IB_MLX5_DV_H_ + +#ifndef UCT_IB_MLX5_H_ +# error "Never include directly; use instead." +#endif + +#include +#include + +typedef struct { + struct mlx5dv_obj dv; +#ifdef HAVE_IBV_EXP_DM + struct { + struct ibv_exp_dm *in; + struct mlx5dv_dm *out; + } dv_dm; +#endif +} uct_ib_mlx5dv_t; + +typedef struct { + struct mlx5dv_qp dv; +} uct_ib_mlx5dv_qp_t; + +typedef struct { + struct mlx5dv_srq dv; +} uct_ib_mlx5dv_srq_t; + +/* Completion queue */ +typedef struct { + struct mlx5dv_cq dv; +} uct_ib_mlx5dv_cq_t; + +/** + * Get internal verbs information. + */ +ucs_status_t uct_ib_mlx5dv_init_obj(uct_ib_mlx5dv_t *obj, uint64_t type); + +/** + * Update CI to support req_notify_cq + */ +void uct_ib_mlx5_update_cq_ci(struct ibv_cq *cq, unsigned cq_ci); + +/** + * Retrieve CI from the driver + */ +unsigned uct_ib_mlx5_get_cq_ci(struct ibv_cq *cq); + +/** + * Get internal AV information. + */ +void uct_ib_mlx5_get_av(struct ibv_ah *ah, struct mlx5_wqe_av *av); + +/** + * Backports for legacy bare-metal support + */ +struct ibv_qp *uct_dv_get_cmd_qp(struct ibv_srq *srq); + +void *uct_dv_get_info_uar0(void *uar); + +/* + * DM backports + */ +#ifdef HAVE_IBV_EXP_DM +# define ibv_dm ibv_exp_dm +# define ibv_alloc_dm_attr ibv_exp_alloc_dm_attr +# define ibv_alloc_dm ibv_exp_alloc_dm +# define ibv_free_dm ibv_exp_free_dm + +struct mlx5dv_dm { + void *buf; + uint64_t length; + uint64_t comp_mask; +}; + +enum { + MLX5DV_OBJ_DM = 1 << 4, +}; + +static struct ibv_mr * UCS_F_MAYBE_UNUSED +ibv_reg_dm_mr(struct ibv_pd *pd, struct ibv_dm *dm, + uint64_t dm_offset, size_t length, unsigned int access_flags) +{ + struct ibv_exp_reg_mr_in mr_in = {}; + mr_in.pd = pd; + mr_in.comp_mask = IBV_EXP_REG_MR_DM; + mr_in.dm = dm; + mr_in.length = length; + return ibv_exp_reg_mr(&mr_in); +} + +typedef struct uct_mlx5_dm_va { + struct ibv_dm ibv_dm; + size_t length; + uint64_t *start_va; +} uct_mlx5_dm_va_t; + +static ucs_status_t UCS_F_MAYBE_UNUSED 
+uct_ib_mlx5_get_dm_info(struct ibv_exp_dm *dm, struct mlx5dv_dm *dm_info) +{ + dm_info->buf = ((uct_mlx5_dm_va_t*)dm)->start_va; + return UCS_OK; +} + +# define UCT_IB_MLX5_DV_DM(_obj) _obj.dv_dm +#else +# define UCT_IB_MLX5_DV_DM(_obj) _obj.dv.dm +#endif + +#endif diff --git a/src/uct/ib/mlx5/ib_mlx5_ifc.h b/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h similarity index 90% rename from src/uct/ib/mlx5/ib_mlx5_ifc.h rename to src/uct/ib/mlx5/dv/ib_mlx5_ifc.h index a4b10feaacf..9835ba1e377 100644 --- a/src/uct/ib/mlx5/ib_mlx5_ifc.h +++ b/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h @@ -9,6 +9,7 @@ #include +#include #include #include @@ -34,12 +35,15 @@ /* insert a value to a struct */ #define UCT_IB_MLX5DV_SET(_typ, _p, _fld, _v) \ do { \ + char *___p = _p; \ uint32_t ___v = _v; \ + uint32_t ___h; \ UCS_STATIC_ASSERT(__uct_st_sz_bits(_typ) % 32 == 0); \ - *((__be32 *)(_p) + __uct_dw_off(_typ, _fld)) = \ - htobe32((be32toh(*((__be32 *)(_p) + __uct_dw_off(_typ, _fld))) & \ - (~__uct_dw_mask(_typ, _fld))) | (((___v) & __uct_mask(_typ, _fld)) \ - << __uct_dw_bit_off(_typ, _fld))); \ + ___h = (be32toh(*((__be32 *)(___p) + __uct_dw_off(_typ, _fld))) & \ + (~__uct_dw_mask(_typ, _fld))) | \ + (((___v) & __uct_mask(_typ, _fld)) << \ + __uct_dw_bit_off(_typ, _fld)); \ + *((__be32 *)(___p) + __uct_dw_off(_typ, _fld)) = htobe32(___h); \ } while (0) #define UCT_IB_MLX5DV_GET(_typ, _p, _fld) \ @@ -64,6 +68,9 @@ enum { UCT_IB_MLX5_CMD_OP_RST2INIT_QP = 0x502, UCT_IB_MLX5_CMD_OP_INIT2RTR_QP = 0x503, UCT_IB_MLX5_CMD_OP_RTR2RTS_QP = 0x504, + UCT_IB_MLX5_CMD_OP_2ERR_QP = 0x507, + UCT_IB_MLX5_CMD_OP_2RST_QP = 0x50a, + UCT_IB_MLX5_CMD_OP_CREATE_RMP = 0x90c, UCT_IB_MLX5_CMD_OP_CREATE_DCT = 0x710, UCT_IB_MLX5_CMD_OP_DRAIN_DCT = 0x712, UCT_IB_MLX5_CMD_OP_CREATE_XRQ = 0x717, @@ -77,7 +84,8 @@ enum { enum { UCT_IB_MLX5_CAP_GENERAL = 0, - UCT_IB_MLX5_CAP_ATOMIC = 3 + UCT_IB_MLX5_CAP_ODP = 2, + UCT_IB_MLX5_CAP_ATOMIC = 3, }; struct uct_ib_mlx5_cmd_hca_cap_bits { @@ -423,8 +431,40 @@ struct 
uct_ib_mlx5_atomic_caps_bits { uint8_t reserved_at_2c0[0x540]; }; +struct uct_ib_mlx5_odp_per_transport_service_cap_bits { + uint8_t send[0x1]; + uint8_t receive[0x1]; + uint8_t write[0x1]; + uint8_t read[0x1]; + uint8_t atomic[0x1]; + uint8_t srq_receive[0x1]; + uint8_t reserved_at_6[0x1a]; +}; + +struct uct_ib_mlx5_odp_cap_bits { + uint8_t reserved_at_0[0x40]; + + uint8_t sig[0x1]; + uint8_t reserved_at_41[0x1f]; + + uint8_t reserved_at_60[0x20]; + + struct uct_ib_mlx5_odp_per_transport_service_cap_bits rc_odp_caps; + + struct uct_ib_mlx5_odp_per_transport_service_cap_bits uc_odp_caps; + + struct uct_ib_mlx5_odp_per_transport_service_cap_bits ud_odp_caps; + + struct uct_ib_mlx5_odp_per_transport_service_cap_bits xrc_odp_caps; + + struct uct_ib_mlx5_odp_per_transport_service_cap_bits dc_odp_caps; + + uint8_t reserved_at_100[0x700]; +}; + union uct_ib_mlx5_hca_cap_union_bits { struct uct_ib_mlx5_cmd_hca_cap_bits cmd_hca_cap; + struct uct_ib_mlx5_odp_cap_bits odp_cap; struct uct_ib_mlx5_atomic_caps_bits atomic_caps; uint8_t reserved_at_0[0x8000]; }; @@ -872,6 +912,48 @@ struct uct_ib_mlx5_create_xrq_in_bits { struct uct_ib_mlx5_xrqc_bits xrq_context; }; +enum { + UCT_IB_MLX5_RMPC_STATE_RDY = 0x1, + UCT_IB_MLX5_RMPC_STATE_ERR = 0x3 +}; + +struct uct_ib_mlx5_rmpc_bits { + uint8_t reserved_at_0[0x8]; + uint8_t state[0x4]; + uint8_t reserved_at_c[0x14]; + + uint8_t basic_cyclic_rcv_wqe[0x1]; + uint8_t reserved_at_21[0x1f]; + + uint8_t reserved_at_40[0x140]; + + struct uct_ib_mlx5_wq_bits wq; +}; + +struct uct_ib_mlx5_create_rmp_out_bits { + uint8_t status[0x8]; + uint8_t reserved_at_8[0x18]; + + uint8_t syndrome[0x20]; + + uint8_t reserved_at_40[0x8]; + uint8_t rmpn[0x18]; + + uint8_t reserved_at_60[0x20]; +}; + +struct uct_ib_mlx5_create_rmp_in_bits { + uint8_t opcode[0x10]; + uint8_t uid[0x10]; + + uint8_t reserved_at_20[0x10]; + uint8_t op_mod[0x10]; + + uint8_t reserved_at_40[0xc0]; + + struct uct_ib_mlx5_rmpc_bits rmp_context; +}; + enum { 
UCT_IB_MLX5_ADS_STAT_RATE_NO_LIMIT = 0x0, UCT_IB_MLX5_ADS_STAT_RATE_2_5GBPS = 0x7, @@ -899,7 +981,7 @@ struct uct_ib_mlx5_ads_bits { uint8_t ack_timeout[0x5]; uint8_t reserved_at_45[0x3]; uint8_t src_addr_index[0x8]; - uint8_t reserved_at_50[0x4]; + uint8_t log_rtm[0x4]; uint8_t stat_rate[0x4]; uint8_t hop_limit[0x8]; @@ -992,12 +1074,26 @@ enum { UCT_IB_MLX5_QPC_CS_REQ_UP_TO_64B = 0x22 }; +static inline unsigned uct_ib_mlx5_qpc_cs_req(unsigned size) +{ + return (size > 32) ? UCT_IB_MLX5_QPC_CS_REQ_UP_TO_64B : + size ? UCT_IB_MLX5_QPC_CS_REQ_UP_TO_32B : + UCT_IB_MLX5_QPC_CS_REQ_DISABLE; +} + enum { UCT_IB_MLX5_QPC_CS_RES_DISABLE = 0x0, UCT_IB_MLX5_QPC_CS_RES_UP_TO_32B = 0x1, UCT_IB_MLX5_QPC_CS_RES_UP_TO_64B = 0x2 }; +static inline unsigned uct_ib_mlx5_qpc_cs_res(unsigned size, int dc) +{ + return (size > 32) ? UCT_IB_MLX5_QPC_CS_RES_UP_TO_64B : + (size && !dc) ? UCT_IB_MLX5_QPC_CS_RES_UP_TO_32B : + UCT_IB_MLX5_QPC_CS_RES_DISABLE; +} + struct uct_ib_mlx5_qpc_bits { uint8_t state[0x4]; uint8_t lag_tx_port_affinity[0x4]; @@ -1261,4 +1357,31 @@ struct uct_ib_mlx5_rst2init_qp_in_bits { uint8_t reserved_at_800[0x80]; }; + +struct uct_ib_mlx5_modify_qp_out_bits { + uint8_t status[0x8]; + uint8_t reserved_at_8[0x18]; + + uint8_t syndrome[0x20]; + + uint8_t reserved_at_40[0x40]; +}; + +struct uct_ib_mlx5_modify_qp_in_bits { + uint8_t opcode[0x10]; + uint8_t uid[0x10]; + + uint8_t reserved_at_20[0x10]; + uint8_t op_mod[0x10]; + + uint8_t reserved_at_40[0x8]; + uint8_t qpn[0x18]; + + uint8_t reserved_at_60[0x20]; +}; + +enum { + UCT_IB_MLX5_EVENT_TYPE_SRQ_LAST_WQE = 0x13 +}; + #endif diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c new file mode 100644 index 00000000000..0391f9d1870 --- /dev/null +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -0,0 +1,1004 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include "ib_mlx5_ifc.h" + +#include +#include + +typedef struct { + struct mlx5dv_devx_obj *dvmr; + int mr_num; + size_t length; + struct ibv_mr *mrs[]; +} uct_ib_mlx5_ksm_data_t; + +typedef union uct_ib_mlx5_mr { + uct_ib_mr_t super; + uct_ib_mlx5_ksm_data_t *ksm_data; +} uct_ib_mlx5_mr_t; + +typedef struct uct_ib_mlx5_mem { + uct_ib_mem_t super; +#if HAVE_DEVX + struct mlx5dv_devx_obj *atomic_dvmr; +#endif + uct_ib_mlx5_mr_t mrs[]; +} uct_ib_mlx5_mem_t; + + +static ucs_status_t uct_ib_mlx5_reg_key(uct_ib_md_t *md, void *address, + size_t length, uint64_t access_flags, + uct_ib_mem_t *ib_memh, uct_ib_mr_type_t mr_type) +{ + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + + return uct_ib_reg_key_impl(md, address, length, access_flags, ib_memh, + &memh->mrs[mr_type].super, mr_type); +} + +static ucs_status_t uct_ib_mlx5_dereg_key(uct_ib_md_t *md, + uct_ib_mem_t *ib_memh, + uct_ib_mr_type_t mr_type) +{ + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + + return uct_ib_dereg_mr(memh->mrs[mr_type].super.ib); +} + +static ucs_status_t uct_ib_mlx5_reg_atomic_key(uct_ib_md_t *ibmd, + uct_ib_mem_t *ib_memh) +{ + uct_ib_mr_type_t mr_type = uct_ib_memh_get_atomic_base_mr_type(ib_memh); + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + + if (mr_type != UCT_IB_MR_STRICT_ORDER) { + return UCS_ERR_UNSUPPORTED; + } + + memh->super.atomic_rkey = memh->mrs[mr_type].super.ib->rkey; + return UCS_OK; +} + +static ucs_status_t +uct_ib_mlx5_mem_prefetch(uct_ib_md_t *md, uct_ib_mem_t *ib_memh, void *addr, + size_t length) +{ +#if HAVE_DECL_IBV_ADVISE_MR + struct ibv_sge sg_list; + int ret; + + if (!(ib_memh->flags & UCT_IB_MEM_FLAG_ODP)) { + return UCS_OK; + } + + ucs_debug("memh %p prefetch %p length %zu", ib_memh, addr, length); + + sg_list.lkey = ib_memh->lkey; + sg_list.addr = (uintptr_t)addr; + sg_list.length = length; + + ret = 
UCS_PROFILE_CALL(ibv_advise_mr, md->pd, + IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE, + IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sg_list, 1); + if (ret) { + ucs_error("ibv_advise_mr(addr=%p length=%zu) returned %d: %m", + addr, length, ret); + return UCS_ERR_IO_ERROR; + } +#endif + return UCS_OK; +} + +static int uct_ib_mlx5_has_roce_port(uct_ib_device_t *dev) +{ + int port_num; + + for (port_num = dev->first_port; + port_num < dev->first_port + dev->num_ports; + port_num++) + { + if (uct_ib_device_is_port_roce(dev, port_num)) { + return 1; + } + } + + return 0; +} + +static void uct_ib_mlx5_parse_relaxed_order(uct_ib_mlx5_md_t *md, + const uct_ib_md_config_t *md_config) +{ + int num_mrs = 1; /* UCT_IB_MR_DEFAULT */ + + uct_ib_md_parse_relaxed_order(&md->super, md_config); + + if (md->super.relaxed_order) { + ++num_mrs; /* UCT_IB_MR_STRICT_ORDER */ + } + + md->super.memh_struct_size = sizeof(uct_ib_mlx5_mem_t) + + (sizeof(uct_ib_mlx5_mr_t) * num_mrs); +} + +#if HAVE_DEVX + +typedef struct uct_ib_mlx5_dbrec_page { + uct_ib_mlx5_devx_umem_t mem; +} uct_ib_mlx5_dbrec_page_t; + + +static size_t uct_ib_mlx5_calc_mkey_inlen(int list_size) +{ + return UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_in) + + UCT_IB_MLX5DV_ST_SZ_BYTES(klm) * list_size; +} + +static ucs_status_t uct_ib_mlx5_alloc_mkey_inbox(int list_size, char **in_p) +{ + size_t inlen; + char *in; + + inlen = uct_ib_mlx5_calc_mkey_inlen(list_size); + in = ucs_calloc(1, inlen, "mkey mailbox"); + if (in == NULL) { + return UCS_ERR_NO_MEMORY; + } + + *in_p = in; + return UCS_OK; +} + +static ucs_status_t uct_ib_mlx5_devx_reg_ksm(uct_ib_mlx5_md_t *md, + intptr_t addr, size_t length, + int list_size, size_t entity_size, + char *in, + struct mlx5dv_devx_obj **mr_p, + uint32_t *mkey) +{ + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_out)] = {}; + struct mlx5dv_pd dvpd = {}; + struct mlx5dv_obj dv = {}; + struct mlx5dv_devx_obj *mr; + void *mkc; + + dv.pd.in = md->super.pd; + dv.pd.out = &dvpd; + mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD); + + 
UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY); + mkc = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, UCT_IB_MLX5_MKC_ACCESS_MODE_KSM); + UCT_IB_MLX5DV_SET(mkc, mkc, a, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, rr, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, lw, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, lr, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, pd, dvpd.pdn); + UCT_IB_MLX5DV_SET(mkc, mkc, translations_octword_size, list_size); + UCT_IB_MLX5DV_SET(mkc, mkc, log_entity_size, ucs_ilog2(entity_size)); + UCT_IB_MLX5DV_SET(mkc, mkc, qpn, 0xffffff); + UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, addr & 0xff); + UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, addr); + UCT_IB_MLX5DV_SET64(mkc, mkc, len, length); + UCT_IB_MLX5DV_SET(create_mkey_in, in, translations_octword_actual_size, list_size); + + mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, + uct_ib_mlx5_calc_mkey_inlen(list_size), + out, sizeof(out)); + if (mr == NULL) { + ucs_debug("mlx5dv_devx_obj_create(CREATE_MKEY, mode=KSM) failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(create_mkey_out, out, syndrome)); + return UCS_ERR_UNSUPPORTED; + } + + *mr_p = mr; + *mkey = (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) << 8) | + (addr & 0xff); + + return UCS_OK; +} + +static ucs_status_t +uct_ib_mlx5_devx_reg_ksm_data(uct_ib_mlx5_md_t *md, + uct_ib_mlx5_ksm_data_t *ksm_data, + size_t length, off_t off, + struct mlx5dv_devx_obj **mr_p, + uint32_t *mkey) +{ + ucs_status_t status; + char *in; + void *klm; + int i; + + status = uct_ib_mlx5_alloc_mkey_inbox(ksm_data->mr_num, &in); + if (status != UCS_OK) { + return UCS_ERR_NO_MEMORY; + } + + klm = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, klm_pas_mtt); + for (i = 0; i < ksm_data->mr_num; i++) { + UCT_IB_MLX5DV_SET64(klm, klm, address, (intptr_t)ksm_data->mrs[i]->addr); + UCT_IB_MLX5DV_SET(klm, klm, byte_count, ksm_data->mrs[i]->length); + UCT_IB_MLX5DV_SET(klm, klm, 
mkey, ksm_data->mrs[i]->lkey); + klm = UCS_PTR_BYTE_OFFSET(klm, UCT_IB_MLX5DV_ST_SZ_BYTES(klm)); + } + + status = uct_ib_mlx5_devx_reg_ksm(md, (intptr_t)ksm_data->mrs[0]->addr + off, + length, ksm_data->mr_num, + ksm_data->mrs[0]->length, in, mr_p, mkey); + ucs_free(in); + return status; +} + +static ucs_status_t uct_ib_mlx5_devx_reg_atomic_key(uct_ib_md_t *ibmd, + uct_ib_mem_t *ib_memh) +{ + uct_ib_mr_type_t mr_type = uct_ib_memh_get_atomic_base_mr_type(ib_memh); + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + uct_ib_mlx5_md_t *md = ucs_derived_of(ibmd, uct_ib_mlx5_md_t); + uct_ib_mlx5_mr_t *mr = &memh->mrs[mr_type]; + size_t reg_length, length; + ucs_status_t status; + int list_size, i; + void *klm; + char *in; + intptr_t addr; + uint8_t mr_id; + + if (!(md->flags & UCT_IB_MLX5_MD_FLAG_KSM)) { + return uct_ib_mlx5_reg_atomic_key(ibmd, ib_memh); + } + + status = uct_ib_mlx5_md_get_atomic_mr_id(ibmd, &mr_id); + if (status != UCS_OK) { + return status; + } + + if (memh->super.flags & UCT_IB_MEM_MULTITHREADED) { + return uct_ib_mlx5_devx_reg_ksm_data(md, mr->ksm_data, mr->ksm_data->length, + uct_ib_md_atomic_offset(mr_id), + &memh->atomic_dvmr, + &memh->super.atomic_rkey); + } + + reg_length = UCT_IB_MD_MAX_MR_SIZE; + addr = (intptr_t)mr->super.ib->addr & ~(reg_length - 1); + /* FW requires indirect atomic MR addr and length to be aligned + * to max supported atomic argument size */ + length = ucs_align_up(mr->super.ib->length + + (intptr_t)mr->super.ib->addr - addr, + md->super.dev.atomic_align); + list_size = ucs_div_round_up(length, reg_length); + + status = uct_ib_mlx5_alloc_mkey_inbox(list_size, &in); + if (status != UCS_OK) { + return status; + } + + klm = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, klm_pas_mtt); + for (i = 0; i < list_size; i++) { + if (i == list_size - 1) { + UCT_IB_MLX5DV_SET(klm, klm, byte_count, length % reg_length); + } else { + UCT_IB_MLX5DV_SET(klm, klm, byte_count, reg_length); + } + UCT_IB_MLX5DV_SET(klm, klm, 
mkey, mr->super.ib->lkey); + UCT_IB_MLX5DV_SET64(klm, klm, address, addr + (i * reg_length)); + klm = UCS_PTR_BYTE_OFFSET(klm, UCT_IB_MLX5DV_ST_SZ_BYTES(klm)); + } + + status = uct_ib_mlx5_devx_reg_ksm(md, addr + uct_ib_md_atomic_offset(mr_id), + length, list_size, reg_length, in, + &memh->atomic_dvmr, + &memh->super.atomic_rkey); + if (status != UCS_OK) { + if (status == UCS_ERR_UNSUPPORTED) { + md->flags &= ~UCT_IB_MLX5_MD_FLAG_KSM; + } + goto out; + } + + ucs_debug("KSM registered memory %p..%p offset 0x%x on %s rkey 0x%x", + mr->super.ib->addr, UCS_PTR_BYTE_OFFSET(mr->super.ib->addr, + mr->super.ib->length), uct_ib_md_atomic_offset(mr_id), + uct_ib_device_name(&md->super.dev), memh->super.atomic_rkey); +out: + ucs_free(in); + return status; +} + +static ucs_status_t uct_ib_mlx5_devx_dereg_atomic_key(uct_ib_md_t *ibmd, + uct_ib_mem_t *ib_memh) +{ + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + uct_ib_mlx5_md_t *md = ucs_derived_of(ibmd, uct_ib_mlx5_md_t); + int ret; + + if (!(md->flags & UCT_IB_MLX5_MD_FLAG_KSM)) { + return UCS_OK; + } + + ret = mlx5dv_devx_obj_destroy(memh->atomic_dvmr); + if (ret != 0) { + ucs_error("mlx5dv_devx_obj_destroy(MKEY, ATOMIC) failed: %m"); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +static ucs_status_t uct_ib_mlx5_devx_reg_multithreaded(uct_ib_md_t *ibmd, + void *address, size_t length, + uint64_t access_flags, + uct_ib_mem_t *ib_memh, + uct_ib_mr_type_t mr_type) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(ibmd, uct_ib_mlx5_md_t); + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + uct_ib_mlx5_mr_t *mr = &memh->mrs[mr_type]; + size_t chunk = md->super.config.mt_reg_chunk; + uct_ib_mlx5_ksm_data_t *ksm_data; + size_t ksm_data_size; + ucs_status_t status; + uint32_t mkey; + int mr_num; + + if (!(md->flags & UCT_IB_MLX5_MD_FLAG_KSM) || + !(md->flags & UCT_IB_MLX5_MD_FLAG_INDIRECT_ATOMICS)) { + return UCS_ERR_UNSUPPORTED; + } + + mr_num = ucs_div_round_up(length, chunk); + 
ksm_data_size = (mr_num * sizeof(*ksm_data->mrs)) + sizeof(*ksm_data); + ksm_data = ucs_calloc(1, ksm_data_size, "ksm_data"); + if (!ksm_data) { + status = UCS_ERR_NO_MEMORY; + goto err; + } + + ucs_trace("multithreaded register memory %p..%p chunks %d", + address, UCS_PTR_BYTE_OFFSET(address, length), mr_num); + + ksm_data->mr_num = mr_num; + status = uct_ib_md_handle_mr_list_multithreaded(ibmd, address, length, + access_flags, chunk, + ksm_data->mrs); + if (status != UCS_OK) { + goto err; + } + + status = uct_ib_mlx5_devx_reg_ksm_data(md, ksm_data, length, 0, + &ksm_data->dvmr, &mkey); + if (status != UCS_OK) { + goto err_dereg; + } + + ksm_data->length = length; + mr->ksm_data = ksm_data; + + if (mr_type == UCT_IB_MR_DEFAULT) { + uct_ib_memh_init_keys(ib_memh, mkey, mkey); + } + return UCS_OK; + +err_dereg: + uct_ib_md_handle_mr_list_multithreaded(ibmd, address, length, UCT_IB_MEM_DEREG, + chunk, ksm_data->mrs); +err: + ucs_free(ksm_data); + return status; +} + +static ucs_status_t uct_ib_mlx5_devx_dereg_multithreaded(uct_ib_md_t *ibmd, + uct_ib_mem_t *ib_memh, + uct_ib_mr_type_t mr_type) +{ + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + uct_ib_mlx5_mr_t *mr = &memh->mrs[mr_type]; + size_t chunk = ibmd->config.mt_reg_chunk; + ucs_status_t s, status = UCS_OK; + int ret; + + s = uct_ib_md_handle_mr_list_multithreaded(ibmd, 0, mr->ksm_data->length, + UCT_IB_MEM_DEREG, chunk, + mr->ksm_data->mrs); + if (s == UCS_ERR_UNSUPPORTED) { + s = uct_ib_dereg_mrs(mr->ksm_data->mrs, mr->ksm_data->mr_num); + if (s != UCS_OK) { + status = s; + } + } else if (s != UCS_OK) { + status = s; + } + + ret = mlx5dv_devx_obj_destroy(mr->ksm_data->dvmr); + if (ret != 0) { + ucs_error("mlx5dv_devx_obj_destroy(MKEY, KSM) failed: %m"); + status = UCS_ERR_IO_ERROR; + } + + ucs_free(mr->ksm_data); + + return status; +} + +static ucs_status_t uct_ib_mlx5_add_page(ucs_mpool_t *mp, size_t *size_p, void **page_p) +{ + uct_ib_mlx5_md_t *md = ucs_container_of(mp, 
uct_ib_mlx5_md_t, dbrec_pool); + uct_ib_mlx5_dbrec_page_t *page; + size_t size = ucs_align_up(*size_p + sizeof(*page), ucs_get_page_size()); + uct_ib_mlx5_devx_umem_t mem; + ucs_status_t status; + + status = uct_ib_mlx5_md_buf_alloc(md, size, 1, (void **)&page, &mem, "devx dbrec"); + if (status != UCS_OK) { + return status; + } + + page->mem = mem; + *size_p = size - sizeof(*page); + *page_p = page + 1; + return UCS_OK; +} + +static void uct_ib_mlx5_init_dbrec(ucs_mpool_t *mp, void *obj, void *chunk) +{ + uct_ib_mlx5_dbrec_page_t *page = (uct_ib_mlx5_dbrec_page_t*)chunk - 1; + uct_ib_mlx5_dbrec_t *dbrec = obj; + + dbrec->mem_id = page->mem.mem->umem_id; + dbrec->offset = UCS_PTR_BYTE_DIFF(chunk, obj) + sizeof(*page); +} + +static void uct_ib_mlx5_free_page(ucs_mpool_t *mp, void *chunk) +{ + uct_ib_mlx5_md_t *md = ucs_container_of(mp, uct_ib_mlx5_md_t, dbrec_pool); + uct_ib_mlx5_dbrec_page_t *page = (uct_ib_mlx5_dbrec_page_t*)chunk - 1; + uct_ib_mlx5_md_buf_free(md, page, &page->mem); +} + +static ucs_mpool_ops_t uct_ib_mlx5_dbrec_ops = { + .chunk_alloc = uct_ib_mlx5_add_page, + .chunk_release = uct_ib_mlx5_free_page, + .obj_init = uct_ib_mlx5_init_dbrec, + .obj_cleanup = NULL +}; + +static ucs_status_t +uct_ib_mlx5_devx_check_odp(uct_ib_mlx5_md_t *md, + const uct_ib_md_config_t *md_config, void *cap) +{ + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_out)] = {}; + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_in)] = {}; + void *odp; + int ret; + + if (md_config->devx_objs & UCS_BIT(UCT_IB_DEVX_OBJ_RCQP)) { + ucs_debug("%s: disable ODP because it's not supported for DevX QP", + uct_ib_device_name(&md->super.dev)); + goto no_odp; + } + + if (uct_ib_mlx5_has_roce_port(&md->super.dev)) { + ucs_debug("%s: disable ODP on RoCE", uct_ib_device_name(&md->super.dev)); + goto no_odp; + } + + if (!UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, pg)) { + goto no_odp; + } + + odp = UCT_IB_MLX5DV_ADDR_OF(query_hca_cap_out, out, capability); + UCT_IB_MLX5DV_SET(query_hca_cap_in, in, 
opcode, UCT_IB_MLX5_CMD_OP_QUERY_HCA_CAP); + UCT_IB_MLX5DV_SET(query_hca_cap_in, in, op_mod, UCT_IB_MLX5_HCA_CAP_OPMOD_GET_CUR | + (UCT_IB_MLX5_CAP_ODP << 1)); + ret = mlx5dv_devx_general_cmd(md->super.dev.ibv_context, in, sizeof(in), + out, sizeof(out)); + if (ret != 0) { + ucs_error("mlx5dv_devx_general_cmd(QUERY_HCA_CAP, ODP) failed: %m"); + return UCS_ERR_IO_ERROR; + } + + if (!UCT_IB_MLX5DV_GET(odp_cap, odp, ud_odp_caps.send) || + !UCT_IB_MLX5DV_GET(odp_cap, odp, rc_odp_caps.send) || + !UCT_IB_MLX5DV_GET(odp_cap, odp, rc_odp_caps.write) || + !UCT_IB_MLX5DV_GET(odp_cap, odp, rc_odp_caps.read)) { + goto no_odp; + } + + if ((md->super.dev.flags & UCT_IB_DEVICE_FLAG_DC) && + (!UCT_IB_MLX5DV_GET(odp_cap, odp, dc_odp_caps.send) || + !UCT_IB_MLX5DV_GET(odp_cap, odp, dc_odp_caps.write) || + !UCT_IB_MLX5DV_GET(odp_cap, odp, dc_odp_caps.read))) { + goto no_odp; + } + + if (md->super.config.odp.max_size == UCS_MEMUNITS_AUTO) { + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, umr_extended_translation_offset)) { + md->super.config.odp.max_size = 1ul << 55; + } else { + md->super.config.odp.max_size = 1ul << 28; + } + } + + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, fixed_buffer_size) && + UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, null_mkey) && + UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, umr_extended_translation_offset)) { + md->super.dev.flags |= UCT_IB_DEVICE_FLAG_ODP_IMPLICIT; + } + + return UCS_OK; + +no_odp: + md->super.config.odp.max_size = 0; + return UCS_OK; +} + +static struct ibv_context * +uct_ib_mlx5_devx_open_device(struct ibv_device *ibv_device, + struct mlx5dv_context_attr *dv_attr) +{ + struct ibv_context *ctx; + struct ibv_cq *cq; + + ctx = mlx5dv_open_device(ibv_device, dv_attr); + if (ctx == NULL) { + return NULL; + } + + cq = ibv_create_cq(ctx, 1, NULL, NULL, 0); + if (cq == NULL) { + ibv_close_device(ctx); + return NULL; + } + + ibv_destroy_cq(cq); + return ctx; +} + +static uct_ib_md_ops_t uct_ib_mlx5_devx_md_ops; + +static ucs_status_t uct_ib_mlx5_devx_md_open(struct 
ibv_device *ibv_device, + const uct_ib_md_config_t *md_config, + uct_ib_md_t **p_md) +{ + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_out)] = {}; + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_in)] = {}; + struct mlx5dv_context_attr dv_attr = {}; + ucs_status_t status = UCS_OK; + struct ibv_context *ctx; + uct_ib_device_t *dev; + uct_ib_mlx5_md_t *md; + void *cap; + int ret; + +#if HAVE_DECL_MLX5DV_IS_SUPPORTED + if (!mlx5dv_is_supported(ibv_device)) { + return UCS_ERR_UNSUPPORTED; + } +#endif + + if (md_config->devx == UCS_NO) { + return UCS_ERR_UNSUPPORTED; + } + + dv_attr.flags |= MLX5DV_CONTEXT_FLAGS_DEVX; + ctx = uct_ib_mlx5_devx_open_device(ibv_device, &dv_attr); + if (ctx == NULL) { + if (md_config->devx == UCS_YES) { + status = UCS_ERR_IO_ERROR; + ucs_error("DEVX requested but not supported by %s", + ibv_get_device_name(ibv_device)); + } else { + status = UCS_ERR_UNSUPPORTED; + ucs_debug("mlx5dv_open_device(%s) failed: %m", + ibv_get_device_name(ibv_device)); + } + goto err; + } + + md = ucs_calloc(1, sizeof(*md), "ib_mlx5_md"); + if (md == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err_free_context; + } + + dev = &md->super.dev; + dev->ibv_context = ctx; + md->super.config = md_config->ext; + + status = uct_ib_device_query(dev, ibv_device); + if (status != UCS_OK) { + goto err_free; + } + + cap = UCT_IB_MLX5DV_ADDR_OF(query_hca_cap_out, out, capability); + UCT_IB_MLX5DV_SET(query_hca_cap_in, in, opcode, UCT_IB_MLX5_CMD_OP_QUERY_HCA_CAP); + UCT_IB_MLX5DV_SET(query_hca_cap_in, in, op_mod, UCT_IB_MLX5_HCA_CAP_OPMOD_GET_CUR | + (UCT_IB_MLX5_CAP_GENERAL << 1)); + ret = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); + if (ret != 0) { + if ((errno == EPERM) || (errno == EPROTONOSUPPORT) || + (errno == EOPNOTSUPP)) { + status = UCS_ERR_UNSUPPORTED; + ucs_debug("mlx5dv_devx_general_cmd(QUERY_HCA_CAP) failed: %m"); + } else { + ucs_error("mlx5dv_devx_general_cmd(QUERY_HCA_CAP) failed: %m"); + status = UCS_ERR_IO_ERROR; + } + goto err_free; 
+ } + + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, log_max_msg) != + UCT_IB_MLX5_LOG_MAX_MSG_SIZE) { + status = UCS_ERR_UNSUPPORTED; + ucs_debug("Unexpected QUERY_HCA_CAP.log_max_msg %d\n", + UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, log_max_msg)); + goto err_free; + } + + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, dct)) { + dev->flags |= UCT_IB_DEVICE_FLAG_DC; + } + + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, rndv_offload_dc)) { + md->flags |= UCT_IB_MLX5_MD_FLAG_DC_TM; + } + + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, compact_address_vector)) { + dev->flags |= UCT_IB_DEVICE_FLAG_AV; + } + + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, fixed_buffer_size)) { + md->flags |= UCT_IB_MLX5_MD_FLAG_KSM; + } + + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, ext_stride_num_range)) { + md->flags |= UCT_IB_MLX5_MD_FLAG_MP_RQ; + } + + if (!UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, umr_modify_atomic_disabled)) { + md->flags |= UCT_IB_MLX5_MD_FLAG_INDIRECT_ATOMICS; + } + + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, log_max_rmp) > 0) { + md->flags |= UCT_IB_MLX5_MD_FLAG_RMP; + } + + status = uct_ib_mlx5_devx_check_odp(md, md_config, cap); + if (status != UCS_OK) { + goto err_free; + } + + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, atomic)) { + int ops = UCT_IB_MLX5_ATOMIC_OPS_CMP_SWAP | + UCT_IB_MLX5_ATOMIC_OPS_FETCH_ADD; + uint8_t arg_size; + int cap_ops, mode8b; + + UCT_IB_MLX5DV_SET(query_hca_cap_in, in, op_mod, UCT_IB_MLX5_HCA_CAP_OPMOD_GET_CUR | + (UCT_IB_MLX5_CAP_ATOMIC << 1)); + ret = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); + if (ret != 0) { + ucs_error("mlx5dv_devx_general_cmd(QUERY_HCA_CAP, ATOMIC) failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err_free; + } + + arg_size = UCT_IB_MLX5DV_GET(atomic_caps, cap, atomic_size_qp); + cap_ops = UCT_IB_MLX5DV_GET(atomic_caps, cap, atomic_operations); + mode8b = UCT_IB_MLX5DV_GET(atomic_caps, cap, atomic_req_8B_endianness_mode); + + if ((cap_ops & ops) == ops) { + dev->atomic_arg_sizes = sizeof(uint64_t); + if (!mode8b) { + 
dev->atomic_arg_sizes_be = sizeof(uint64_t); + } + } + + dev->atomic_align = ucs_rounddown_pow2(arg_size); + + ops |= UCT_IB_MLX5_ATOMIC_OPS_MASKED_CMP_SWAP | + UCT_IB_MLX5_ATOMIC_OPS_MASKED_FETCH_ADD; + + arg_size &= UCT_IB_MLX5DV_GET(query_hca_cap_out, out, + capability.atomic_caps.atomic_size_dc); + + if ((cap_ops & ops) == ops) { + dev->ext_atomic_arg_sizes = arg_size; + if (mode8b) { + arg_size &= ~(sizeof(uint64_t)); + } + dev->ext_atomic_arg_sizes_be = arg_size; + } + + dev->pci_fadd_arg_sizes = UCT_IB_MLX5DV_GET(atomic_caps, cap, fetch_add_pci_atomic) << 2; + dev->pci_cswap_arg_sizes = UCT_IB_MLX5DV_GET(atomic_caps, cap, compare_swap_pci_atomic) << 2; + } + + md->super.ops = &uct_ib_mlx5_devx_md_ops; + + uct_ib_mlx5_parse_relaxed_order(md, md_config); + status = uct_ib_md_open_common(&md->super, ibv_device, md_config); + if (status != UCS_OK) { + goto err_free; + } + + ucs_recursive_spinlock_init(&md->dbrec_lock, 0); + status = ucs_mpool_init(&md->dbrec_pool, 0, + sizeof(uct_ib_mlx5_dbrec_t), 0, + UCS_SYS_CACHE_LINE_SIZE, + ucs_get_page_size() / UCS_SYS_CACHE_LINE_SIZE - 1, + UINT_MAX, &uct_ib_mlx5_dbrec_ops, "devx dbrec"); + if (status != UCS_OK) { + goto err_free; + } + + status = uct_ib_mlx5_md_buf_alloc(md, ucs_get_page_size(), 0, &md->zero_buf, + &md->zero_mem, "zero umem"); + if (status != UCS_OK) { + goto err_release_dbrec; + } + + dev->flags |= UCT_IB_DEVICE_FLAG_MLX5_PRM; + md->flags |= UCT_IB_MLX5_MD_FLAG_DEVX; + md->flags |= UCT_IB_MLX5_MD_FLAGS_DEVX_OBJS(md_config->devx_objs); + *p_md = &md->super; + return status; + +err_release_dbrec: + ucs_mpool_cleanup(&md->dbrec_pool, 1); +err_free: + ucs_free(md); +err_free_context: + ibv_close_device(ctx); +err: + return status; +} + +void uct_ib_mlx5_devx_md_cleanup(uct_ib_md_t *ibmd) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(ibmd, uct_ib_mlx5_md_t); + ucs_status_t status; + + uct_ib_mlx5_md_buf_free(md, md->zero_buf, &md->zero_mem); + ucs_mpool_cleanup(&md->dbrec_pool, 1); + status = 
ucs_recursive_spinlock_destroy(&md->dbrec_lock); + if (status != UCS_OK) { + ucs_warn("ucs_recursive_spinlock_destroy() failed (%d)", status); + } +} + +static uct_ib_md_ops_t uct_ib_mlx5_devx_md_ops = { + .open = uct_ib_mlx5_devx_md_open, + .cleanup = uct_ib_mlx5_devx_md_cleanup, + .reg_key = uct_ib_mlx5_reg_key, + .dereg_key = uct_ib_mlx5_dereg_key, + .reg_atomic_key = uct_ib_mlx5_devx_reg_atomic_key, + .dereg_atomic_key = uct_ib_mlx5_devx_dereg_atomic_key, + .reg_multithreaded = uct_ib_mlx5_devx_reg_multithreaded, + .dereg_multithreaded = uct_ib_mlx5_devx_dereg_multithreaded, + .mem_prefetch = uct_ib_mlx5_mem_prefetch, + .get_atomic_mr_id = uct_ib_mlx5_md_get_atomic_mr_id, +}; + +UCT_IB_MD_OPS(uct_ib_mlx5_devx_md_ops, 2); + +#endif + +static ucs_status_t uct_ib_mlx5dv_check_dc(uct_ib_device_t *dev) +{ + ucs_status_t status = UCS_OK; +#if HAVE_DC_DV + struct ibv_srq_init_attr srq_attr = {}; + struct ibv_context *ctx = dev->ibv_context; + struct ibv_qp_init_attr_ex qp_attr = {}; + struct mlx5dv_qp_init_attr dv_attr = {}; + struct ibv_qp_attr attr = {}; + struct ibv_srq *srq; + struct ibv_pd *pd; + struct ibv_cq *cq; + struct ibv_qp *qp; + int ret; + + ucs_debug("checking for DC support on %s", uct_ib_device_name(dev)); + + pd = ibv_alloc_pd(ctx); + if (pd == NULL) { + ucs_error("ibv_alloc_pd() failed: %m"); + return UCS_ERR_IO_ERROR; + } + + cq = ibv_create_cq(ctx, 1, NULL, NULL, 0); + if (cq == NULL) { + ucs_error("ibv_create_cq() failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err_cq; + } + + srq_attr.attr.max_sge = 1; + srq_attr.attr.max_wr = 1; + srq = ibv_create_srq(pd, &srq_attr); + if (srq == NULL) { + ucs_error("ibv_create_srq() failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err_srq; + } + + qp_attr.send_cq = cq; + qp_attr.recv_cq = cq; + qp_attr.qp_type = IBV_QPT_DRIVER; + qp_attr.comp_mask = IBV_QP_INIT_ATTR_PD; + qp_attr.pd = pd; + qp_attr.srq = srq; + + dv_attr.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_DC; + dv_attr.dc_init_attr.dc_type = 
MLX5DV_DCTYPE_DCT; + dv_attr.dc_init_attr.dct_access_key = UCT_IB_KEY; + + /* create DCT qp successful means DC is supported */ + qp = mlx5dv_create_qp(ctx, &qp_attr, &dv_attr); + if (qp == NULL) { + ucs_debug("failed to create DCT on %s: %m", uct_ib_device_name(dev)); + goto err_qp; + } + + attr.qp_state = IBV_QPS_INIT; + attr.port_num = 1; + attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_ATOMIC; + ret = ibv_modify_qp(qp, &attr, IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS); + if (ret != 0) { + ucs_debug("failed to ibv_modify_qp(DCT, INIT) on %s: %m", + uct_ib_device_name(dev)); + goto err; + } + + /* always set global address parameters, in case the port is RoCE or SRIOV */ + attr.qp_state = IBV_QPS_RTR; + attr.min_rnr_timer = 1; + attr.path_mtu = IBV_MTU_256; + attr.ah_attr.port_num = 1; + attr.ah_attr.sl = 0; + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.traffic_class = 0; + attr.ah_attr.grh.sgid_index = 0; + + ret = ibv_modify_qp(qp, &attr, IBV_QP_STATE | + IBV_QP_MIN_RNR_TIMER | + IBV_QP_AV | + IBV_QP_PATH_MTU); + + if (ret == 0) { + ucs_debug("DC is supported on %s", uct_ib_device_name(dev)); + dev->flags |= UCT_IB_DEVICE_FLAG_DC; + } else { + ucs_debug("failed to ibv_modify_qp(DCT, RTR) on %s: %m", + uct_ib_device_name(dev)); + } + +err: + uct_ib_destroy_qp(qp); +err_qp: + uct_ib_destroy_srq(srq); +err_srq: + ibv_destroy_cq(cq); +err_cq: + ibv_dealloc_pd(pd); +#endif + return status; +} + +static uct_ib_md_ops_t uct_ib_mlx5_md_ops; + +static ucs_status_t uct_ib_mlx5dv_md_open(struct ibv_device *ibv_device, + const uct_ib_md_config_t *md_config, + uct_ib_md_t **p_md) +{ + ucs_status_t status = UCS_OK; + struct ibv_context *ctx; + uct_ib_device_t *dev; + uct_ib_mlx5_md_t *md; + +#if HAVE_DECL_MLX5DV_IS_SUPPORTED + if (!mlx5dv_is_supported(ibv_device)) { + return UCS_ERR_UNSUPPORTED; + } +#endif + + ctx = ibv_open_device(ibv_device); + if (ctx == 
NULL) { + ucs_debug("ibv_open_device(%s) failed: %m", ibv_get_device_name(ibv_device)); + status = UCS_ERR_UNSUPPORTED; + goto err; + } + + md = ucs_calloc(1, sizeof(*md), "ib_mlx5_md"); + if (md == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err_free_context; + } + + dev = &md->super.dev; + dev->ibv_context = ctx; + md->super.config = md_config->ext; + + status = uct_ib_device_query(dev, ibv_device); + if (status != UCS_OK) { + goto err_free; + } + + if (!(uct_ib_device_spec(dev)->flags & UCT_IB_DEVICE_FLAG_MLX5_PRM)) { + status = UCS_ERR_UNSUPPORTED; + goto err_free; + } + + if (UCT_IB_HAVE_ODP_IMPLICIT(&dev->dev_attr) && + !uct_ib_mlx5_has_roce_port(dev)) { + dev->flags |= UCT_IB_DEVICE_FLAG_ODP_IMPLICIT; + } + + if (IBV_EXP_HAVE_ATOMIC_HCA(&dev->dev_attr)) { + dev->atomic_arg_sizes = sizeof(uint64_t); + +#if HAVE_STRUCT_IBV_DEVICE_ATTR_EX_PCI_ATOMIC_CAPS + dev->pci_fadd_arg_sizes = dev->dev_attr.pci_atomic_caps.fetch_add << 2; + dev->pci_cswap_arg_sizes = dev->dev_attr.pci_atomic_caps.compare_swap << 2; +#endif + } + + status = uct_ib_mlx5dv_check_dc(dev); + if (status != UCS_OK) { + goto err_free; + } + + md->super.ops = &uct_ib_mlx5_md_ops; + + uct_ib_mlx5_parse_relaxed_order(md, md_config); + status = uct_ib_md_open_common(&md->super, ibv_device, md_config); + if (status != UCS_OK) { + goto err_free; + } + + dev->flags |= UCT_IB_DEVICE_FLAG_MLX5_PRM; + /* cppcheck-suppress autoVariables */ + *p_md = &md->super; + return UCS_OK; + +err_free: + ucs_free(md); +err_free_context: + ibv_close_device(ctx); +err: + return status; +} + +static uct_ib_md_ops_t uct_ib_mlx5_md_ops = { + .open = uct_ib_mlx5dv_md_open, + .cleanup = (uct_ib_md_cleanup_func_t)ucs_empty_function, + .reg_key = uct_ib_mlx5_reg_key, + .dereg_key = uct_ib_mlx5_dereg_key, + .reg_atomic_key = uct_ib_mlx5_reg_atomic_key, + .dereg_atomic_key = (uct_ib_md_dereg_atomic_key_func_t)ucs_empty_function_return_success, + .reg_multithreaded = 
(uct_ib_md_reg_multithreaded_func_t)ucs_empty_function_return_unsupported, + .dereg_multithreaded = (uct_ib_md_dereg_multithreaded_func_t)ucs_empty_function_return_unsupported, + .mem_prefetch = uct_ib_mlx5_mem_prefetch, + .get_atomic_mr_id = (uct_ib_md_get_atomic_mr_id_func_t)ucs_empty_function_return_unsupported, +}; + +UCT_IB_MD_OPS(uct_ib_mlx5_md_ops, 1); + diff --git a/src/uct/ib/mlx5/exp/ib_exp.c b/src/uct/ib/mlx5/exp/ib_exp.c new file mode 100644 index 00000000000..778f1453125 --- /dev/null +++ b/src/uct/ib/mlx5/exp/ib_exp.c @@ -0,0 +1,44 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include + +void uct_ib_exp_qp_fill_attr(uct_ib_iface_t *iface, uct_ib_qp_attr_t *attr) +{ +#if HAVE_DECL_IBV_EXP_CREATE_QP + if (!(attr->ibv.comp_mask & IBV_EXP_QP_INIT_ATTR_PD)) { + attr->ibv.comp_mask = IBV_EXP_QP_INIT_ATTR_PD; + attr->ibv.pd = uct_ib_iface_md(iface)->pd; + } +#endif + + if (attr->qp_type == IBV_QPT_UD) { + return; + } + +#if HAVE_IB_EXT_ATOMICS + attr->ibv.comp_mask |= IBV_EXP_QP_INIT_ATTR_ATOMICS_ARG; + attr->ibv.max_atomic_arg = UCT_IB_MAX_ATOMIC_SIZE; +#endif + +#if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE + if (uct_ib_iface_device(iface)->dev_attr.exp_atomic_cap == + IBV_EXP_ATOMIC_HCA_REPLY_BE) { + attr->ibv.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS; + attr->ibv.exp_create_flags = IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY; + } +#endif + +#if HAVE_STRUCT_IBV_EXP_QP_INIT_ATTR_MAX_INL_RECV + attr->ibv.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV; + attr->ibv.max_inl_recv = attr->max_inl_cqe[UCT_IB_DIR_RX]; +#endif +} + diff --git a/src/uct/ib/mlx5/exp/ib_exp.h b/src/uct/ib/mlx5/exp/ib_exp.h new file mode 100644 index 00000000000..2743d393452 --- /dev/null +++ b/src/uct/ib/mlx5/exp/ib_exp.h @@ -0,0 +1,20 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifndef UCT_IB_MLX5_EXP_H_ +#define UCT_IB_MLX5_EXP_H_ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#if defined (HAVE_MLX5_HW) && defined (HAVE_VERBS_EXP_H) +void uct_ib_exp_qp_fill_attr(uct_ib_iface_t *iface, uct_ib_qp_attr_t *attr); +#else +static inline void uct_ib_exp_qp_fill_attr(uct_ib_iface_t *iface, uct_ib_qp_attr_t *attr) { } +#endif + +#endif diff --git a/src/uct/ib/mlx5/exp/ib_exp_md.c b/src/uct/ib/mlx5/exp/ib_exp_md.c new file mode 100644 index 00000000000..4b7b9de1d93 --- /dev/null +++ b/src/uct/ib/mlx5/exp/ib_exp_md.c @@ -0,0 +1,748 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include + +#include +#include + + +typedef struct { + struct ibv_mr *atomic_mr; + int mr_num; + struct ibv_mr *mrs[]; +} uct_ib_mlx5_ksm_data_t; + +typedef struct uct_ib_mlx5_mem { + uct_ib_mem_t super; + struct ibv_mr *mr; +#ifdef HAVE_EXP_UMR + union { + struct ibv_mr *atomic_mr; + uct_ib_mlx5_ksm_data_t *ksm_data; + }; +#endif +} uct_ib_mlx5_mem_t; + + +static ucs_status_t uct_ib_mlx5_reg_key(uct_ib_md_t *md, void *address, + size_t length, uint64_t access_flags, + uct_ib_mem_t *ib_memh, + uct_ib_mr_type_t mr_type) +{ + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + ucs_status_t status; + + ucs_assert(mr_type == UCT_IB_MR_DEFAULT); + status = uct_ib_reg_mr(md->pd, address, length, access_flags, &memh->mr); + if (status != UCS_OK) { + return status; + } + + uct_ib_memh_init_keys(ib_memh, memh->mr->lkey, memh->mr->rkey); + return UCS_OK; +} + +static ucs_status_t uct_ib_mlx5_dereg_key(uct_ib_md_t *md, + uct_ib_mem_t *ib_memh, + uct_ib_mr_type_t mr_type) +{ + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + + return uct_ib_dereg_mr(memh->mr); +} + +static ucs_status_t +uct_ib_mlx5_mem_prefetch(uct_ib_md_t *md, uct_ib_mem_t *ib_memh, void *addr, + size_t length) +{ +#if 
HAVE_DECL_IBV_EXP_PREFETCH_MR + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + struct ibv_exp_prefetch_attr attr = {}; + int ret; + + if (!(memh->super.flags & UCT_IB_MEM_FLAG_ODP)) { + return UCS_OK; + } + + ucs_debug("memh %p prefetch %p length %zu", memh, addr, length); + + attr.flags = IBV_EXP_PREFETCH_WRITE_ACCESS; + attr.addr = addr; + attr.length = length; + + ret = UCS_PROFILE_CALL(ibv_exp_prefetch_mr, memh->mr, &attr); + if (ret) { + ucs_error("ibv_exp_prefetch_mr(addr=%p length=%zu) returned %d: %m", + addr, length, ret); + return UCS_ERR_IO_ERROR; + } +#endif + return UCS_OK; +} + +static ucs_status_t uct_ib_mlx5_exp_md_umr_qp_create(uct_ib_mlx5_md_t *md) +{ +#ifdef HAVE_EXP_UMR + struct ibv_exp_qp_init_attr qp_init_attr; + struct ibv_qp_attr qp_attr; + uint8_t port_num; + int ret; + uct_ib_device_t *ibdev; + struct ibv_port_attr *port_attr; + + ibdev = &md->super.dev; + + if (!(ibdev->dev_attr.exp_device_cap_flags & IBV_EXP_DEVICE_UMR) || + !md->super.config.enable_indirect_atomic) { + return UCS_ERR_UNSUPPORTED; + } + + /* TODO: fix port selection. 
It looks like active port should be used */ + port_num = ibdev->first_port; + port_attr = uct_ib_device_port_attr(ibdev, port_num); + + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + + md->umr_cq = ibv_create_cq(ibdev->ibv_context, 1, NULL, NULL, 0); + if (md->umr_cq == NULL) { + ucs_error("failed to create UMR CQ: %m"); + goto err; + } + + md->super.config.max_inline_klm_list = + ucs_min(md->super.config.max_inline_klm_list, + ibdev->dev_attr.umr_caps.max_send_wqe_inline_klms); + + qp_init_attr.qp_type = IBV_QPT_RC; + qp_init_attr.send_cq = md->umr_cq; + qp_init_attr.recv_cq = md->umr_cq; + qp_init_attr.cap.max_inline_data = 0; + qp_init_attr.cap.max_recv_sge = 1; + qp_init_attr.cap.max_send_sge = 1; + qp_init_attr.srq = NULL; + qp_init_attr.cap.max_recv_wr = 16; + qp_init_attr.cap.max_send_wr = 16; + qp_init_attr.pd = md->super.pd; + qp_init_attr.comp_mask = IBV_EXP_QP_INIT_ATTR_PD|IBV_EXP_QP_INIT_ATTR_MAX_INL_KLMS; + qp_init_attr.max_inl_recv = 0; + qp_init_attr.max_inl_send_klms = md->super.config.max_inline_klm_list; + +#if HAVE_IBV_EXP_QP_CREATE_UMR + qp_init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS; + qp_init_attr.exp_create_flags = IBV_EXP_QP_CREATE_UMR; +#endif + + md->umr_qp = ibv_exp_create_qp(ibdev->ibv_context, &qp_init_attr); + if (md->umr_qp == NULL) { + ucs_error("failed to create UMR QP: %m"); + goto err_destroy_cq; + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + + /* Modify QP to INIT state */ + qp_attr.qp_state = IBV_QPS_INIT; + qp_attr.pkey_index = 0; + qp_attr.port_num = port_num; + qp_attr.qp_access_flags = UCT_IB_MEM_ACCESS_FLAGS; + ret = ibv_modify_qp(md->umr_qp, &qp_attr, + IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS); + if (ret) { + ucs_error("Failed to modify UMR QP to INIT: %m"); + goto err_destroy_qp; + } + + /* Modify to RTR */ + qp_attr.qp_state = IBV_QPS_RTR; + qp_attr.dest_qp_num = md->umr_qp->qp_num; + + memset(&qp_attr.ah_attr, 0, sizeof(qp_attr.ah_attr)); + qp_attr.ah_attr.port_num = 
port_num; + qp_attr.ah_attr.dlid = port_attr->lid; + qp_attr.ah_attr.is_global = 1; + if (uct_ib_device_query_gid(ibdev, port_num, UCT_IB_MD_DEFAULT_GID_INDEX, + &qp_attr.ah_attr.grh.dgid) != UCS_OK) { + goto err_destroy_qp; + } + + qp_attr.rq_psn = 0; + qp_attr.path_mtu = IBV_MTU_512; + qp_attr.min_rnr_timer = 7; + qp_attr.max_dest_rd_atomic = 1; + ret = ibv_modify_qp(md->umr_qp, &qp_attr, + IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER); + if (ret) { + ucs_error("Failed to modify UMR QP to RTR: %m"); + goto err_destroy_qp; + } + + /* Modify to RTS */ + qp_attr.qp_state = IBV_QPS_RTS; + qp_attr.sq_psn = 0; + qp_attr.timeout = 7; + qp_attr.rnr_retry = 7; + qp_attr.retry_cnt = 7; + qp_attr.max_rd_atomic = 1; + ret = ibv_modify_qp(md->umr_qp, &qp_attr, + IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | + IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC); + if (ret) { + ucs_error("Failed to modify UMR QP to RTS: %m"); + goto err_destroy_qp; + } + + ucs_debug("initialized UMR QP 0x%x, max_inline_klm_list %u", + md->umr_qp->qp_num, md->super.config.max_inline_klm_list); + return UCS_OK; + +err_destroy_qp: + uct_ib_destroy_qp(md->umr_qp); +err_destroy_cq: + ibv_destroy_cq(md->umr_cq); +err: + return UCS_ERR_IO_ERROR; +#else + return UCS_ERR_UNSUPPORTED; +#endif +} + +#ifdef HAVE_EXP_UMR +static ucs_status_t +uct_ib_mlx5_exp_reg_indirect_mr(uct_ib_mlx5_md_t *md, + void *addr, size_t length, + struct ibv_exp_mem_region *mem_reg, + int list_size, uint32_t create_flags, + uint32_t umr_type, struct ibv_mr **mr_p) +{ + struct ibv_exp_send_wr wr, *bad_wr; + struct ibv_exp_create_mr_in mrin; + ucs_status_t status; + struct ibv_mr *umr; + struct ibv_wc wc; + int ret; + + if (md->umr_qp == NULL) { + status = UCS_ERR_UNSUPPORTED; + goto err; + } + + /* Create and fill memory key */ + memset(&mrin, 0, sizeof(mrin)); + memset(&wr, 0, sizeof(wr)); + + mrin.pd = 
md->super.pd; + wr.exp_opcode = IBV_EXP_WR_UMR_FILL; + wr.exp_send_flags = IBV_EXP_SEND_SIGNALED; + wr.ext_op.umr.exp_access = UCT_IB_MEM_ACCESS_FLAGS; + + mrin.attr.create_flags = create_flags; + wr.ext_op.umr.umr_type = umr_type; + + mrin.attr.exp_access_flags = UCT_IB_MEM_ACCESS_FLAGS; + mrin.attr.max_klm_list_size = list_size; + + umr = ibv_exp_create_mr(&mrin); + if (!umr) { + ucs_error("ibv_exp_create_mr() failed: %m"); + status = UCS_ERR_NO_MEMORY; + goto err; + } + + wr.ext_op.umr.mem_list.mem_reg_list = mem_reg; + wr.ext_op.umr.base_addr = (uint64_t)(uintptr_t)addr; + wr.ext_op.umr.num_mrs = list_size; + wr.ext_op.umr.modified_mr = umr; + + /* If the list exceeds max inline size, allocate a container object */ + if (list_size > md->super.config.max_inline_klm_list) { + struct ibv_exp_mkey_list_container_attr in = { + .pd = md->super.pd, + .mkey_list_type = IBV_EXP_MKEY_LIST_TYPE_INDIRECT_MR, + .max_klm_list_size = list_size + }; + + wr.ext_op.umr.memory_objects = ibv_exp_alloc_mkey_list_memory(&in); + if (wr.ext_op.umr.memory_objects == NULL) { + ucs_error("ibv_exp_alloc_mkey_list_memory(list_size=%d) failed: %m", + list_size); + status = UCS_ERR_IO_ERROR; + goto err_free_umr; + } + } else { + wr.ext_op.umr.memory_objects = NULL; + wr.exp_send_flags |= IBV_EXP_SEND_INLINE; + } + + ucs_trace_data("UMR_FILL qp 0x%x lkey 0x%x base 0x%lx [addr %lx len %zu lkey 0x%x] list_size %d", + md->umr_qp->qp_num, wr.ext_op.umr.modified_mr->lkey, + wr.ext_op.umr.base_addr, mem_reg[0].base_addr, + mem_reg[0].length, mem_reg[0].mr->lkey, list_size); + + /* Post UMR */ + ret = ibv_exp_post_send(md->umr_qp, &wr, &bad_wr); + if (ret) { + ucs_error("ibv_exp_post_send(UMR_FILL) failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err_free_klm_container; + } + + /* Wait for send UMR completion */ + for (;;) { + ret = ibv_poll_cq(md->umr_cq, 1, &wc); + if (ret < 0) { + ucs_error("ibv_exp_poll_cq(umr_cq) failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err_free_klm_container; + } + 
if (ret == 1) { + if (wc.status != IBV_WC_SUCCESS) { + ucs_error("UMR_FILL completed with error: %s vendor_err %d", + ibv_wc_status_str(wc.status), wc.vendor_err); + status = UCS_ERR_IO_ERROR; + goto err_free_klm_container; + } + break; + } + } + + if (wr.ext_op.umr.memory_objects != NULL) { + ibv_exp_dealloc_mkey_list_memory(wr.ext_op.umr.memory_objects); + } + + umr->addr = addr; + umr->length = length; + ucs_debug("UMR registered memory %p..%p on %s lkey 0x%x rkey 0x%x", + umr->addr, UCS_PTR_BYTE_OFFSET(umr->addr, length), + uct_ib_device_name(&md->super.dev), + umr->lkey, umr->rkey); + + *mr_p = umr; + + return UCS_OK; + +err_free_klm_container: + if (wr.ext_op.umr.memory_objects != NULL) { + ibv_exp_dealloc_mkey_list_memory(wr.ext_op.umr.memory_objects); + } +err_free_umr: + UCS_PROFILE_CALL(ibv_dereg_mr, umr); +err: + return status; +} +#endif + +ucs_status_t uct_ib_mlx5_exp_reg_ksm(uct_ib_mlx5_md_t *md, + uct_ib_mlx5_ksm_data_t *ksm_data, + size_t length, off_t off, + struct ibv_mr **mr_p) +{ +#if HAVE_EXP_UMR_KSM + struct ibv_exp_mem_region *mem_reg; + ucs_status_t status; + int i; + + mem_reg = ucs_calloc(ksm_data->mr_num, sizeof(mem_reg[0]), "mem_reg"); + if (!mem_reg) { + return UCS_ERR_NO_MEMORY; + } + + for (i = 0; i < ksm_data->mr_num; i++) { + mem_reg[i].base_addr = (uint64_t) (uintptr_t) ksm_data->mrs[i]->addr; + mem_reg[i].length = ksm_data->mrs[i]->length; + mem_reg[i].mr = ksm_data->mrs[i]; + } + + status = uct_ib_mlx5_exp_reg_indirect_mr(md, + UCS_PTR_BYTE_OFFSET(ksm_data->mrs[0]->addr, + off), + length, mem_reg, ksm_data->mr_num, + IBV_EXP_MR_FIXED_BUFFER_SIZE, + IBV_EXP_UMR_MR_LIST_FIXED_SIZE, + mr_p); + + ucs_free(mem_reg); + return status; +#else + return UCS_ERR_UNSUPPORTED; +#endif +} + +#if HAVE_EXP_UMR_KSM +static UCS_F_ALWAYS_INLINE int +uct_ib_mlx5_md_is_ksm_supported(uct_ib_mlx5_md_t *md) +{ + return (md->super.dev.dev_attr.comp_mask & (uint64_t)IBV_EXP_DEVICE_ATTR_COMP_MASK_2) && + (md->super.dev.dev_attr.comp_mask_2 & 
(uint64_t)IBV_EXP_DEVICE_ATTR_UMR_FIXED_SIZE_CAPS) && + (md->super.dev.dev_attr.exp_device_cap_flags & (uint64_t)IBV_EXP_DEVICE_UMR_FIXED_SIZE); +} +#endif + +static ucs_status_t uct_ib_mlx5_exp_reg_atomic_key(uct_ib_md_t *ibmd, + uct_ib_mem_t *ib_memh) +{ +#ifdef HAVE_EXP_UMR + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + uct_ib_mlx5_md_t *md = ucs_derived_of(ibmd, uct_ib_mlx5_md_t); + struct ibv_exp_mem_region *mem_reg = NULL; + struct ibv_mr *mr = memh->mr; + uint32_t create_flags, umr_type; + ucs_status_t status; + struct ibv_mr *umr; + int i, list_size; + size_t reg_length; + uint8_t mr_id; + + status = uct_ib_mlx5_md_get_atomic_mr_id(ibmd, &mr_id); + if (status != UCS_OK) { + return status; + } + + if (memh->super.flags & UCT_IB_MEM_MULTITHREADED) { + status = uct_ib_mlx5_exp_reg_ksm(md, memh->ksm_data, memh->mr->length, + uct_ib_md_atomic_offset(mr_id), + &memh->ksm_data->atomic_mr); + if (status == UCS_OK) { + memh->super.atomic_rkey = memh->ksm_data->atomic_mr->rkey; + } + + return status; + } + + reg_length = UCT_IB_MD_MAX_MR_SIZE; +#if HAVE_EXP_UMR_KSM + if (uct_ib_mlx5_md_is_ksm_supported(md)) + { + reg_length = md->super.dev.dev_attr.umr_fixed_size_caps.max_entity_size; + list_size = ucs_div_round_up(mr->length, reg_length); + } else if (mr->length < reg_length) { + list_size = 1; + } else { + status = UCS_ERR_UNSUPPORTED; + goto err; + } + + if (list_size > 1) { + create_flags = IBV_EXP_MR_FIXED_BUFFER_SIZE; + umr_type = IBV_EXP_UMR_MR_LIST_FIXED_SIZE; + } else { + create_flags = IBV_EXP_MR_INDIRECT_KLMS; + umr_type = IBV_EXP_UMR_MR_LIST; + } +#else + if (mr->length >= reg_length) { + status = UCS_ERR_UNSUPPORTED; + goto err; + } + + list_size = 1; + create_flags = IBV_EXP_MR_INDIRECT_KLMS; + umr_type = IBV_EXP_UMR_MR_LIST; +#endif + + mem_reg = ucs_calloc(list_size, sizeof(mem_reg[0]), "mem_reg"); + if (!mem_reg) { + status = UCS_ERR_NO_MEMORY; + goto err; + } + + for (i = 0; i < list_size; i++) { + mem_reg[i].base_addr = 
(uintptr_t) mr->addr + i * reg_length; + mem_reg[i].length = reg_length; + mem_reg[i].mr = mr; + } + + ucs_assert(list_size >= 1); + mem_reg[list_size - 1].length = mr->length % reg_length; + + status = uct_ib_mlx5_exp_reg_indirect_mr(md, + UCS_PTR_BYTE_OFFSET(mr->addr, uct_ib_md_atomic_offset(mr_id)), + mr->length, mem_reg, list_size, create_flags, umr_type, &umr); + if (status != UCS_OK) { + goto err; + } + + memh->atomic_mr = umr; + memh->super.atomic_rkey = umr->rkey; + +err: + ucs_free(mem_reg); + return status; +#else + return UCS_ERR_UNSUPPORTED; +#endif +} + +static ucs_status_t uct_ib_mlx5_exp_dereg_atomic_key(uct_ib_md_t *ibmd, + uct_ib_mem_t *ib_memh) +{ +#ifdef HAVE_EXP_UMR + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + int ret; + + ret = UCS_PROFILE_CALL(ibv_dereg_mr, memh->atomic_mr); + if (ret != 0) { + ucs_error("ibv_dereg_mr() failed: %m"); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +#else + return UCS_ERR_UNSUPPORTED; +#endif +} + +static ucs_status_t uct_ib_mlx5_exp_reg_multithreaded(uct_ib_md_t *ibmd, + void *address, size_t length, + uint64_t access_flags, + uct_ib_mem_t *ib_memh, + uct_ib_mr_type_t mr_type) +{ +#if HAVE_EXP_UMR_KSM + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + uct_ib_mlx5_md_t *md = ucs_derived_of(ibmd, uct_ib_mlx5_md_t); + size_t chunk = md->super.config.mt_reg_chunk; + uct_ib_mlx5_ksm_data_t *ksm_data; + size_t ksm_data_size; + ucs_status_t status; + struct ibv_mr *umr; + int mr_num; + + ucs_assert(mr_type == UCT_IB_MR_DEFAULT); + + if (!uct_ib_mlx5_md_is_ksm_supported(md)) { + return UCS_ERR_UNSUPPORTED; + } + + mr_num = ucs_div_round_up(length, chunk); + ksm_data_size = (mr_num * sizeof(*ksm_data->mrs)) + sizeof(*ksm_data); + ksm_data = ucs_calloc(1, ksm_data_size, "ksm_data"); + if (!ksm_data) { + status = UCS_ERR_NO_MEMORY; + goto err; + } + + ucs_trace("multithreaded register memory %p..%p chunks %d", + address, UCS_PTR_BYTE_OFFSET(address, length), mr_num); 
+ + ksm_data->mr_num = mr_num; + status = uct_ib_md_handle_mr_list_multithreaded(ibmd, address, length, + access_flags, chunk, + ksm_data->mrs); + if (status != UCS_OK) { + goto err; + } + + status = uct_ib_mlx5_exp_reg_ksm(md, ksm_data, length, 0, &umr); + if (status != UCS_OK) { + goto err_dereg; + } + + memh->mr = umr; + memh->ksm_data = ksm_data; + memh->super.lkey = umr->lkey; + memh->super.rkey = umr->rkey; + return UCS_OK; + +err_dereg: + uct_ib_md_handle_mr_list_multithreaded(ibmd, address, length, UCT_IB_MEM_DEREG, + chunk, ksm_data->mrs); +err: + ucs_free(ksm_data); + return status; +#else + return UCS_ERR_UNSUPPORTED; +#endif +} + +static ucs_status_t uct_ib_mlx5_exp_dereg_multithreaded(uct_ib_md_t *ibmd, + uct_ib_mem_t *ib_memh, + uct_ib_mr_type_t mr_type) +{ +#if HAVE_EXP_UMR_KSM + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + size_t chunk = ibmd->config.mt_reg_chunk; + ucs_status_t s, status = UCS_OK; + + if (memh->super.flags & UCT_IB_MEM_FLAG_ATOMIC_MR) { + s = uct_ib_dereg_mr(memh->ksm_data->atomic_mr); + if (s != UCS_OK) { + status = s; + } + } + + s = uct_ib_md_handle_mr_list_multithreaded(ibmd, memh->mr->addr, + memh->mr->length, + UCT_IB_MEM_DEREG, chunk, + memh->ksm_data->mrs); + if (s == UCS_ERR_UNSUPPORTED) { + s = uct_ib_dereg_mrs(memh->ksm_data->mrs, memh->ksm_data->mr_num); + if (s != UCS_OK) { + status = s; + } + } else if (s != UCS_OK) { + status = s; + } + + s = uct_ib_dereg_mr(memh->mr); + if (s != UCS_OK) { + status = s; + } + + ucs_free(memh->ksm_data); + + return status; +#else + return UCS_ERR_UNSUPPORTED; +#endif +} + +static uct_ib_md_ops_t uct_ib_mlx5_md_ops; + +static ucs_status_t uct_ib_mlx5_exp_md_open(struct ibv_device *ibv_device, + const uct_ib_md_config_t *md_config, + uct_ib_md_t **p_md) +{ + ucs_status_t status = UCS_OK; + struct ibv_context *ctx; + uct_ib_device_t *dev; + uct_ib_mlx5_md_t *md; + + ctx = ibv_open_device(ibv_device); + if (ctx == NULL) { + ucs_debug("ibv_open_device(%s) failed: 
%m", ibv_get_device_name(ibv_device)); + status = UCS_ERR_UNSUPPORTED; + goto err; + } + + md = ucs_calloc(1, sizeof(*md), "ib_mlx5_md"); + if (md == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err_free_context; + } + + dev = &md->super.dev; + dev->ibv_context = ctx; + md->super.config = md_config->ext; + + status = uct_ib_device_query(dev, ibv_device); + if (status != UCS_OK) { + goto err_free; + } + + if (!(uct_ib_device_spec(dev)->flags & UCT_IB_DEVICE_FLAG_MLX5_PRM)) { + status = UCS_ERR_UNSUPPORTED; + goto err_free; + } + +#if HAVE_DECL_IBV_EXP_DEVICE_DC_TRANSPORT && HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_EXP_DEVICE_CAP_FLAGS + if (dev->dev_attr.exp_device_cap_flags & IBV_EXP_DEVICE_DC_TRANSPORT) { + dev->flags |= UCT_IB_DEVICE_FLAG_DC; + } +#endif + +#if IBV_HW_TM + if (dev->dev_attr.tm_caps.capability_flags & IBV_EXP_TM_CAP_DC) { + md->flags |= UCT_IB_MLX5_MD_FLAG_DC_TM; + } +#endif + + if (UCT_IB_HAVE_ODP_IMPLICIT(&dev->dev_attr)) { + dev->flags |= UCT_IB_DEVICE_FLAG_ODP_IMPLICIT; + } + + if (IBV_EXP_HAVE_ATOMIC_HCA(&dev->dev_attr) || + IBV_EXP_HAVE_ATOMIC_GLOB(&dev->dev_attr) || + IBV_EXP_HAVE_ATOMIC_HCA_REPLY_BE(&dev->dev_attr)) + { +#ifdef HAVE_IB_EXT_ATOMICS + if (dev->dev_attr.comp_mask & IBV_EXP_DEVICE_ATTR_EXT_ATOMIC_ARGS) { + dev->ext_atomic_arg_sizes = dev->dev_attr.ext_atom.log_atomic_arg_sizes; + } +# if HAVE_MASKED_ATOMICS_ENDIANNESS + if (dev->dev_attr.comp_mask & IBV_EXP_DEVICE_ATTR_MASKED_ATOMICS) { + dev->ext_atomic_arg_sizes |= + dev->dev_attr.masked_atomic.masked_log_atomic_arg_sizes; + dev->ext_atomic_arg_sizes_be = + dev->dev_attr.masked_atomic.masked_log_atomic_arg_sizes_network_endianness; + } +# endif + dev->ext_atomic_arg_sizes &= UCS_MASK(dev->dev_attr.ext_atom.log_max_atomic_inline + 1); +#endif + dev->atomic_arg_sizes = sizeof(uint64_t); + if (IBV_EXP_HAVE_ATOMIC_HCA_REPLY_BE(&dev->dev_attr)) { + dev->atomic_arg_sizes_be = sizeof(uint64_t); + } + } + +#if HAVE_DECL_IBV_EXP_DEVICE_ATTR_PCI_ATOMIC_CAPS + dev->pci_fadd_arg_sizes = 
dev->dev_attr.pci_atomic_caps.fetch_add << 2; + dev->pci_cswap_arg_sizes = dev->dev_attr.pci_atomic_caps.compare_swap << 2; +#endif + + md->super.ops = &uct_ib_mlx5_md_ops; + + uct_ib_md_parse_relaxed_order(&md->super, md_config); + ucs_assert(!md->super.relaxed_order); + md->super.memh_struct_size = sizeof(uct_ib_mlx5_mem_t), + + status = uct_ib_md_open_common(&md->super, ibv_device, md_config); + if (status != UCS_OK) { + goto err_free; + } + + status = uct_ib_mlx5_exp_md_umr_qp_create(md); + if (status != UCS_OK && status != UCS_ERR_UNSUPPORTED) { + goto err_free; + } + + dev->flags |= UCT_IB_DEVICE_FLAG_MLX5_PRM; + *p_md = &md->super; + return UCS_OK; + +err_free: + ucs_free(md); +err_free_context: + ibv_close_device(ctx); +err: + return status; +} + +void uct_ib_mlx5_exp_md_cleanup(uct_ib_md_t *ibmd) +{ +#ifdef HAVE_EXP_UMR + uct_ib_mlx5_md_t *md = ucs_derived_of(ibmd, uct_ib_mlx5_md_t); + + if (md->umr_qp != NULL) { + uct_ib_destroy_qp(md->umr_qp); + } + if (md->umr_cq != NULL) { + ibv_destroy_cq(md->umr_cq); + } +#endif +} + +static uct_ib_md_ops_t uct_ib_mlx5_md_ops = { + .open = uct_ib_mlx5_exp_md_open, + .cleanup = uct_ib_mlx5_exp_md_cleanup, + .reg_key = uct_ib_mlx5_reg_key, + .dereg_key = uct_ib_mlx5_dereg_key, + .reg_atomic_key = uct_ib_mlx5_exp_reg_atomic_key, + .dereg_atomic_key = uct_ib_mlx5_exp_dereg_atomic_key, + .reg_multithreaded = uct_ib_mlx5_exp_reg_multithreaded, + .dereg_multithreaded = uct_ib_mlx5_exp_dereg_multithreaded, + .mem_prefetch = uct_ib_mlx5_mem_prefetch, + .get_atomic_mr_id = uct_ib_mlx5_md_get_atomic_mr_id, +}; + +UCT_IB_MD_OPS(uct_ib_mlx5_md_ops, 1); + diff --git a/src/uct/ib/mlx5/ib_mlx5_hw.c b/src/uct/ib/mlx5/exp/ib_mlx5_hw.c similarity index 96% rename from src/uct/ib/mlx5/ib_mlx5_hw.c rename to src/uct/ib/mlx5/exp/ib_mlx5_hw.c index 2da7e236f0b..1d0170ff5d9 100644 --- a/src/uct/ib/mlx5/ib_mlx5_hw.c +++ b/src/uct/ib/mlx5/exp/ib_mlx5_hw.c @@ -7,7 +7,7 @@ # include "config.h" #endif -#if HAVE_INFINIBAND_MLX5_HW_H +#ifdef 
HAVE_INFINIBAND_MLX5_HW_H #include "ib_mlx5_hw.h" @@ -21,7 +21,7 @@ /* Since this file intended to emulate DV using legacy mlx5_hw.h definitions * we include DV declarations. */ #define UCT_IB_MLX5_H_ -#include "ib_mlx5_dv.h" +#include static void UCS_F_MAYBE_UNUSED uct_ib_mlx5_obj_error(const char *obj_name) { @@ -173,6 +173,12 @@ ucs_status_t uct_ib_mlx5dv_init_obj(uct_ib_mlx5dv_t *obj, uint64_t obj_type) ucs_container_of(obj->dv.srq.out, uct_ib_mlx5dv_srq_t, dv)); } +#ifdef HAVE_IBV_EXP_DM + if (!ret && (obj_type & MLX5DV_OBJ_DM)) { + ret = uct_ib_mlx5_get_dm_info(obj->dv_dm.in, obj->dv_dm.out); + } +#endif + return ret; } #endif @@ -202,7 +208,7 @@ void uct_ib_mlx5_get_av(struct ibv_ah *ah, struct mlx5_wqe_av *av) struct ibv_qp *uct_dv_get_cmd_qp(struct ibv_srq *srq) { -#if HAVE_STRUCT_MLX5_SRQ_CMD_QP +#ifdef HAVE_STRUCT_MLX5_SRQ_CMD_QP struct mlx5_srq *msrq; if (srq->handle == LEGACY_XRC_SRQ_HANDLE) { diff --git a/src/uct/ib/mlx5/ib_mlx5_hw.h b/src/uct/ib/mlx5/exp/ib_mlx5_hw.h similarity index 100% rename from src/uct/ib/mlx5/ib_mlx5_hw.h rename to src/uct/ib/mlx5/exp/ib_mlx5_hw.h diff --git a/src/uct/ib/mlx5/ib_mlx5.c b/src/uct/ib/mlx5/ib_mlx5.c index b2e08fe913a..4bb06c78037 100644 --- a/src/uct/ib/mlx5/ib_mlx5.c +++ b/src/uct/ib/mlx5/ib_mlx5.c @@ -4,10 +4,14 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ib_mlx5.h" #include "ib_mlx5.inl" #include "ib_mlx5_log.h" - +#include #include #include #include @@ -26,7 +30,7 @@ static const char *uct_ib_mlx5_mmio_modes[] = { }; ucs_config_field_t uct_ib_mlx5_iface_config_table[] = { -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM {"DM_SIZE", "2k", "Device Memory segment size (0 - disabled)", ucs_offsetof(uct_ib_mlx5_iface_config_t, dm.seg_len), UCS_CONFIG_TYPE_MEMUNITS}, @@ -49,37 +53,36 @@ ucs_config_field_t uct_ib_mlx5_iface_config_table[] = { {NULL} }; -ucs_status_t uct_ib_mlx5_create_cq(struct ibv_context *context, int cqe, - struct ibv_comp_channel *channel, - int comp_vector, int ignore_overrun, - size_t *inl, struct ibv_cq **cq_p) +ucs_status_t uct_ib_mlx5_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir, + const uct_ib_iface_init_attr_t *init_attr, + int preferred_cpu, size_t inl) { #if HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE + uct_ib_device_t *dev = uct_ib_iface_device(iface); struct ibv_cq *cq; struct ibv_cq_init_attr_ex cq_attr = {}; struct mlx5dv_cq_init_attr dv_attr = {}; - cq_attr.cqe = cqe; - cq_attr.channel = channel; - cq_attr.comp_vector = comp_vector; - if (ignore_overrun) { + cq_attr.cqe = init_attr->cq_len[dir]; + cq_attr.channel = iface->comp_channel; + cq_attr.comp_vector = preferred_cpu; + if (init_attr->flags & UCT_IB_CQ_IGNORE_OVERRUN) { cq_attr.comp_mask = IBV_CQ_INIT_ATTR_MASK_FLAGS; - cq_attr.flags = IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN; + cq_attr.flags = IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN; } dv_attr.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE; - dv_attr.cqe_size = uct_ib_get_cqe_size(*inl > 32 ? 128 : 64); - cq = ibv_cq_ex_to_cq(mlx5dv_create_cq(context, &cq_attr, &dv_attr)); + dv_attr.cqe_size = uct_ib_get_cqe_size(inl > 32 ? 
128 : 64); + cq = ibv_cq_ex_to_cq(mlx5dv_create_cq(dev->ibv_context, &cq_attr, &dv_attr)); if (!cq) { - ucs_error("mlx5dv_create_cq(cqe=%d) failed: %m", cqe); + ucs_error("mlx5dv_create_cq(cqe=%d) failed: %m", cq_attr.cqe); return UCS_ERR_IO_ERROR; } - *cq_p = cq; - *inl = dv_attr.cqe_size / 2; + iface->cq[dir] = cq; + iface->config.max_inl_cqe[dir] = dv_attr.cqe_size / 2; return UCS_OK; #else - return uct_ib_verbs_create_cq(context, cqe, channel, comp_vector, - ignore_overrun, inl, cq_p); + return uct_ib_verbs_create_cq(iface, dir, init_attr, preferred_cpu, inl); #endif } @@ -116,7 +119,8 @@ ucs_status_t uct_ib_mlx5_get_cq(struct ibv_cq *cq, uct_ib_mlx5_cq_t *mlx5_cq) /* Move buffer forward for 128b CQE, so we would get pointer to the 2nd * 64b when polling. */ - mlx5_cq->cq_buf += cqe_size - sizeof(struct mlx5_cqe64); + mlx5_cq->cq_buf = UCS_PTR_BYTE_OFFSET(mlx5_cq->cq_buf, + cqe_size - sizeof(struct mlx5_cqe64)); ret = ibv_exp_cq_ignore_overrun(cq); if (ret != 0) { @@ -125,7 +129,7 @@ ucs_status_t uct_ib_mlx5_get_cq(struct ibv_cq *cq, uct_ib_mlx5_cq_t *mlx5_cq) } mlx5_cq->cqe_size_log = ucs_ilog2(cqe_size); - ucs_assert_always((1<cqe_size_log) == cqe_size); + ucs_assert_always((1ul << mlx5_cq->cqe_size_log) == cqe_size); /* Set owner bit for all CQEs, so that CQE would look like it is in HW * ownership. 
In this case CQ polling functions will return immediately if @@ -141,10 +145,10 @@ ucs_status_t uct_ib_mlx5_get_cq(struct ibv_cq *cq, uct_ib_mlx5_cq_t *mlx5_cq) } static int -uct_ib_mlx5_iface_res_domain_cmp(uct_ib_mlx5_iface_res_domain_t *res_domain, - uct_ib_md_t *md, uct_priv_worker_t *worker) +uct_ib_mlx5_res_domain_cmp(uct_ib_mlx5_res_domain_t *res_domain, + uct_ib_md_t *md, uct_priv_worker_t *worker) { -#if HAVE_IBV_EXP_RES_DOMAIN +#ifdef HAVE_IBV_EXP_RES_DOMAIN return res_domain->ibv_domain->context == md->dev.ibv_context; #elif HAVE_DECL_IBV_ALLOC_TD return res_domain->pd->context == md->dev.ibv_context; @@ -154,10 +158,10 @@ uct_ib_mlx5_iface_res_domain_cmp(uct_ib_mlx5_iface_res_domain_t *res_domain, } static ucs_status_t -uct_ib_mlx5_iface_res_domain_init(uct_ib_mlx5_iface_res_domain_t *res_domain, - uct_ib_md_t *md, uct_priv_worker_t *worker) +uct_ib_mlx5_res_domain_init(uct_ib_mlx5_res_domain_t *res_domain, + uct_ib_md_t *md, uct_priv_worker_t *worker) { -#if HAVE_IBV_EXP_RES_DOMAIN +#ifdef HAVE_IBV_EXP_RES_DOMAIN struct ibv_exp_res_domain_init_attr attr; attr.comp_mask = IBV_EXP_RES_DOMAIN_THREAD_MODEL | @@ -214,9 +218,9 @@ uct_ib_mlx5_iface_res_domain_init(uct_ib_mlx5_iface_res_domain_t *res_domain, return UCS_OK; } -static void uct_ib_mlx5_iface_res_domain_cleanup(uct_ib_mlx5_iface_res_domain_t *res_domain) +static void uct_ib_mlx5_res_domain_cleanup(uct_ib_mlx5_res_domain_t *res_domain) { -#if HAVE_IBV_EXP_RES_DOMAIN +#ifdef HAVE_IBV_EXP_RES_DOMAIN struct ibv_exp_destroy_res_domain_attr attr; int ret; @@ -244,38 +248,54 @@ static void uct_ib_mlx5_iface_res_domain_cleanup(uct_ib_mlx5_iface_res_domain_t #endif } -ucs_status_t uct_ib_mlx5_iface_init_res_domain(uct_ib_iface_t *iface, - uct_ib_mlx5_iface_common_t *mlx5) +ucs_status_t uct_ib_mlx5_iface_get_res_domain(uct_ib_iface_t *iface, + uct_ib_mlx5_qp_t *qp) { - mlx5->res_domain = uct_worker_tl_data_get(iface->super.worker, - UCT_IB_MLX5_RES_DOMAIN_KEY, - uct_ib_mlx5_iface_res_domain_t, - 
uct_ib_mlx5_iface_res_domain_cmp, - uct_ib_mlx5_iface_res_domain_init, - uct_ib_iface_md(iface), - iface->super.worker); - if (UCS_PTR_IS_ERR(mlx5->res_domain)) { - return UCS_PTR_STATUS(mlx5->res_domain); + qp->verbs.rd = uct_worker_tl_data_get(iface->super.worker, + UCT_IB_MLX5_RES_DOMAIN_KEY, + uct_ib_mlx5_res_domain_t, + uct_ib_mlx5_res_domain_cmp, + uct_ib_mlx5_res_domain_init, + uct_ib_iface_md(iface), + iface->super.worker); + if (UCS_PTR_IS_ERR(qp->verbs.rd)) { + return UCS_PTR_STATUS(qp->verbs.rd); } + qp->type = UCT_IB_MLX5_OBJ_TYPE_VERBS; + return UCS_OK; } -void uct_ib_mlx5_iface_cleanup_res_domain(uct_ib_mlx5_iface_common_t *mlx5) +void uct_ib_mlx5_iface_put_res_domain(uct_ib_mlx5_qp_t *qp) { - uct_worker_tl_data_put(mlx5->res_domain, uct_ib_mlx5_iface_res_domain_cleanup); + if (qp->type == UCT_IB_MLX5_OBJ_TYPE_VERBS) { + uct_worker_tl_data_put(qp->verbs.rd, uct_ib_mlx5_res_domain_cleanup); + } } ucs_status_t uct_ib_mlx5_iface_create_qp(uct_ib_iface_t *iface, - uct_ib_mlx5_iface_common_t *mlx5, - uct_ib_qp_attr_t *attr, - struct ibv_qp **qp_p) + uct_ib_mlx5_qp_t *qp, + uct_ib_mlx5_qp_attr_t *attr) { - uct_ib_mlx5_iface_fill_attr(iface, mlx5, attr); - return uct_ib_iface_create_qp(iface, attr, qp_p); + ucs_status_t status; + + status = uct_ib_mlx5_iface_fill_attr(iface, qp, attr); + if (status != UCS_OK) { + return status; + } + + uct_ib_exp_qp_fill_attr(iface, &attr->super); + status = uct_ib_iface_create_qp(iface, &attr->super, &qp->verbs.qp); + if (status != UCS_OK) { + return status; + } + + qp->qp_num = qp->verbs.qp->qp_num; + return UCS_OK; } -#if !HAVE_DECL_MLX5DV_CONTEXT_FLAGS_DEVX +#if !HAVE_DEVX ucs_status_t uct_ib_mlx5_get_compact_av(uct_ib_iface_t *iface, int *compact_av) { struct mlx5_wqe_av mlx5_av; @@ -283,6 +303,7 @@ ucs_status_t uct_ib_mlx5_get_compact_av(uct_ib_iface_t *iface, int *compact_av) uct_ib_address_t *ib_addr; ucs_status_t status; struct ibv_ah_attr ah_attr; + enum ibv_mtu path_mtu; /* coverity[result_independent_of_operands] 
*/ ib_addr = ucs_alloca((size_t)iface->addr_size); @@ -293,8 +314,8 @@ ucs_status_t uct_ib_mlx5_get_compact_av(uct_ib_iface_t *iface, int *compact_av) return status; } - uct_ib_iface_fill_ah_attr_from_addr(iface, ib_addr, iface->path_bits[0], &ah_attr); - ah_attr.is_global = iface->is_global_addr; + uct_ib_iface_fill_ah_attr_from_addr(iface, ib_addr, 0, &ah_attr, &path_mtu); + ah_attr.is_global = iface->config.force_global_addr; status = uct_ib_iface_create_ah(iface, &ah_attr, &ah); if (status != UCS_OK) { return status; @@ -357,15 +378,77 @@ static void uct_ib_mlx5_mmio_cleanup(uct_ib_mlx5_mmio_reg_t *reg) { } +int uct_ib_mlx5_devx_uar_cmp(uct_ib_mlx5_devx_uar_t *uar, + uct_ib_mlx5_md_t *md, + uct_ib_mlx5_mmio_mode_t mmio_mode) +{ + return uar->ctx == md->super.dev.ibv_context; +} + +ucs_status_t uct_ib_mlx5_devx_uar_init(uct_ib_mlx5_devx_uar_t *uar, + uct_ib_mlx5_md_t *md, + uct_ib_mlx5_mmio_mode_t mmio_mode) +{ +#if HAVE_DEVX + uar->uar = mlx5dv_devx_alloc_uar(md->super.dev.ibv_context, 0); + if (uar->uar == NULL) { + ucs_error("mlx5dv_devx_alloc_uar() failed: %m"); + return UCS_ERR_NO_MEMORY; + } + + uar->super.addr.ptr = uar->uar->reg_addr; + uar->super.mode = mmio_mode; + uar->ctx = md->super.dev.ibv_context; + + return UCS_OK; +#else + return UCS_ERR_UNSUPPORTED; +#endif +} + +void uct_ib_mlx5_devx_uar_cleanup(uct_ib_mlx5_devx_uar_t *uar) +{ +#if HAVE_DEVX + mlx5dv_devx_free_uar(uar->uar); +#endif +} + void uct_ib_mlx5_txwq_reset(uct_ib_mlx5_txwq_t *txwq) { txwq->curr = txwq->qstart; txwq->sw_pi = 0; - txwq->prev_sw_pi = -1; -#if ENABLE_ASSERT + txwq->prev_sw_pi = UINT16_MAX; +#if UCS_ENABLE_ASSERT txwq->hw_ci = 0xFFFF; #endif - memset(txwq->qstart, 0, txwq->qend - txwq->qstart); + uct_ib_fence_info_init(&txwq->fi); + memset(txwq->qstart, 0, UCS_PTR_BYTE_DIFF(txwq->qstart, txwq->qend)); +} + +ucs_status_t +uct_ib_mlx5_get_mmio_mode(uct_priv_worker_t *worker, + uct_ib_mlx5_mmio_mode_t cfg_mmio_mode, + unsigned bf_size, + uct_ib_mlx5_mmio_mode_t *mmio_mode) +{ 
+ ucs_assert(cfg_mmio_mode < UCT_IB_MLX5_MMIO_MODE_LAST); + + if (cfg_mmio_mode != UCT_IB_MLX5_MMIO_MODE_AUTO) { + *mmio_mode = cfg_mmio_mode; + } else if (bf_size > 0) { + if (worker->thread_mode == UCS_THREAD_MODE_SINGLE) { + *mmio_mode = UCT_IB_MLX5_MMIO_MODE_BF_POST; + } else if (worker->thread_mode == UCS_THREAD_MODE_SERIALIZED) { + *mmio_mode = UCT_IB_MLX5_MMIO_MODE_BF_POST_MT; + } else { + ucs_error("unsupported thread mode for mlx5: %d", worker->thread_mode); + return UCS_ERR_UNSUPPORTED; + } + } else { + *mmio_mode = UCT_IB_MLX5_MMIO_MODE_DB; + } + + return UCS_OK; } ucs_status_t uct_ib_mlx5_txwq_init(uct_priv_worker_t *worker, @@ -396,19 +479,10 @@ ucs_status_t uct_ib_mlx5_txwq_init(uct_priv_worker_t *worker, return UCS_ERR_IO_ERROR; } - if (cfg_mmio_mode != UCT_IB_MLX5_MMIO_MODE_AUTO) { - mmio_mode = cfg_mmio_mode; - } else if (qp_info.dv.bf.size > 0) { - if (worker->thread_mode == UCS_THREAD_MODE_SINGLE) { - mmio_mode = UCT_IB_MLX5_MMIO_MODE_BF_POST; - } else if (worker->thread_mode == UCS_THREAD_MODE_SERIALIZED) { - mmio_mode = UCT_IB_MLX5_MMIO_MODE_BF_POST_MT; - } else { - ucs_error("unsupported thread mode for mlx5: %d", worker->thread_mode); - return UCS_ERR_UNSUPPORTED; - } - } else { - mmio_mode = UCT_IB_MLX5_MMIO_MODE_DB; + status = uct_ib_mlx5_get_mmio_mode(worker, cfg_mmio_mode, + qp_info.dv.bf.size, &mmio_mode); + if (status != UCS_OK) { + return status; } ucs_debug("tx wq %d bytes [bb=%d, nwqe=%d] mmio_mode %s", @@ -417,7 +491,8 @@ ucs_status_t uct_ib_mlx5_txwq_init(uct_priv_worker_t *worker, uct_ib_mlx5_mmio_modes[mmio_mode]); txwq->qstart = qp_info.dv.sq.buf; - txwq->qend = qp_info.dv.sq.buf + (qp_info.dv.sq.stride * qp_info.dv.sq.wqe_cnt); + txwq->qend = UCS_PTR_BYTE_OFFSET(qp_info.dv.sq.buf, + qp_info.dv.sq.stride * qp_info.dv.sq.wqe_cnt); txwq->reg = uct_worker_tl_data_get(worker, UCT_IB_MLX5_WORKER_BF_KEY, uct_ib_mlx5_mmio_reg_t, @@ -429,6 +504,7 @@ ucs_status_t uct_ib_mlx5_txwq_init(uct_priv_worker_t *worker, return 
UCS_PTR_STATUS(txwq->reg); } + /* cppcheck-suppress autoVariables */ txwq->dbrec = &qp_info.dv.dbrec[MLX5_SND_DBR]; /* need to reserve 2x because: * - on completion we only get the index of last wqe and we do not @@ -438,8 +514,6 @@ ucs_status_t uct_ib_mlx5_txwq_init(uct_priv_worker_t *worker, */ txwq->bb_max = qp_info.dv.sq.wqe_cnt - 2 * UCT_IB_MLX5_MAX_BB; ucs_assert_always(txwq->bb_max > 0); - txwq->next_fm = 0; - txwq->fence_beat = 0; uct_ib_mlx5_txwq_reset(txwq); return UCS_OK; @@ -447,7 +521,21 @@ ucs_status_t uct_ib_mlx5_txwq_init(uct_priv_worker_t *worker, void uct_ib_mlx5_txwq_cleanup(uct_ib_mlx5_txwq_t* txwq) { - uct_worker_tl_data_put(txwq->reg, uct_ib_mlx5_mmio_cleanup); + uct_ib_mlx5_devx_uar_t *uar = ucs_derived_of(txwq->reg, + uct_ib_mlx5_devx_uar_t); + switch (txwq->super.type) { + case UCT_IB_MLX5_OBJ_TYPE_DEVX: + uct_worker_tl_data_put(uar, uct_ib_mlx5_devx_uar_cleanup); + break; + case UCT_IB_MLX5_OBJ_TYPE_VERBS: + uct_ib_mlx5_iface_put_res_domain(&txwq->super); + uct_worker_tl_data_put(txwq->reg, uct_ib_mlx5_mmio_cleanup); + break; + case UCT_IB_MLX5_OBJ_TYPE_LAST: + if (txwq->reg != NULL) { + uct_worker_tl_data_put(txwq->reg, uct_ib_mlx5_mmio_cleanup); + } + } } ucs_status_t uct_ib_mlx5_get_rxwq(struct ibv_qp *verbs_qp, uct_ib_mlx5_rxwq_t *rxwq) @@ -475,37 +563,48 @@ ucs_status_t uct_ib_mlx5_get_rxwq(struct ibv_qp *verbs_qp, uct_ib_mlx5_rxwq_t *r rxwq->rq_wqe_counter = 0; rxwq->cq_wqe_counter = 0; rxwq->mask = qp_info.dv.rq.wqe_cnt - 1; + /* cppcheck-suppress autoVariables */ rxwq->dbrec = &qp_info.dv.dbrec[MLX5_RCV_DBR]; memset(rxwq->wqes, 0, qp_info.dv.rq.wqe_cnt * sizeof(struct mlx5_wqe_data_seg)); return UCS_OK; } -ucs_status_t uct_ib_mlx5_srq_init(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs_srq, - size_t sg_byte_count) +ucs_status_t +uct_ib_mlx5_verbs_srq_init(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs_srq, + size_t sg_byte_count, int sge_num) { uct_ib_mlx5dv_srq_t srq_info = {}; - uct_ib_mlx5_srq_seg_t *seg; - uct_ib_mlx5dv_t obj = 
{}; + uct_ib_mlx5dv_t obj = {}; ucs_status_t status; - unsigned i; + uint16_t stride; - obj.dv.srq.in = verbs_srq; - obj.dv.srq.out = &srq_info.dv; + obj.dv.srq.in = verbs_srq; + obj.dv.srq.out = &srq_info.dv; +#if HAVE_DEVX + srq_info.dv.comp_mask = MLX5DV_SRQ_MASK_SRQN; +#endif status = uct_ib_mlx5dv_init_obj(&obj, MLX5DV_OBJ_SRQ); if (status != UCS_OK) { return status; } +#if HAVE_DEVX + srq->srq_num = srq_info.dv.srqn; +#else + srq->srq_num = 0; +#endif + if (srq_info.dv.head != 0) { ucs_error("SRQ head is not 0 (%d)", srq_info.dv.head); return UCS_ERR_NO_DEVICE; } - if (srq_info.dv.stride != UCT_IB_MLX5_SRQ_STRIDE) { - ucs_error("SRQ stride is not %lu (%d)", UCT_IB_MLX5_SRQ_STRIDE, - srq_info.dv.stride); + stride = uct_ib_mlx5_srq_stride(sge_num); + if (srq_info.dv.stride != stride) { + ucs_error("SRQ stride is not %u (%d), sgenum %d", + stride, srq_info.dv.stride, sge_num); return UCS_ERR_NO_DEVICE; } @@ -514,30 +613,52 @@ ucs_status_t uct_ib_mlx5_srq_init(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs_ return UCS_ERR_NO_DEVICE; } - srq->buf = srq_info.dv.buf; - srq->db = srq_info.dv.dbrec; - srq->free_idx = srq_info.dv.tail; - srq->ready_idx = -1; - srq->sw_pi = -1; - srq->mask = srq_info.dv.tail; - srq->tail = srq_info.dv.tail; + srq->buf = srq_info.dv.buf; + srq->db = srq_info.dv.dbrec; + uct_ib_mlx5_srq_buff_init(srq, srq_info.dv.head, srq_info.dv.tail, + sg_byte_count, sge_num); + + return UCS_OK; +} + +void uct_ib_mlx5_srq_buff_init(uct_ib_mlx5_srq_t *srq, uint32_t head, + uint32_t tail, size_t sg_byte_count, int sge_num) +{ + uct_ib_mlx5_srq_seg_t *seg; + unsigned i, j; + + srq->free_idx = tail; + srq->ready_idx = UINT16_MAX; + srq->sw_pi = UINT16_MAX; + srq->mask = tail; + srq->tail = tail; + srq->stride = uct_ib_mlx5_srq_stride(sge_num); - for (i = srq_info.dv.head; i <= srq_info.dv.tail; ++i) { + for (i = head; i <= tail; ++i) { seg = uct_ib_mlx5_srq_get_wqe(srq, i); - seg->srq.free = 0; - seg->srq.desc = NULL; - seg->dptr.byte_count = 
htonl(sg_byte_count); + seg->srq.next_wqe_index = htons((i + 1) & tail); + seg->srq.ptr_mask = 0; + seg->srq.free = 0; + seg->srq.desc = NULL; + seg->srq.strides = sge_num; + for (j = 0; j < sge_num; ++j) { + seg->dptr[j].byte_count = htonl(sg_byte_count); + } } - - return UCS_OK; } -void uct_ib_mlx5_srq_cleanup(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs_srq) +void uct_ib_mlx5_verbs_srq_cleanup(uct_ib_mlx5_srq_t *srq, + struct ibv_srq *verbs_srq) { uct_ib_mlx5dv_srq_t srq_info = {}; uct_ib_mlx5dv_t obj = {}; ucs_status_t status; + if (srq->type != UCT_IB_MLX5_OBJ_TYPE_VERBS) { + return; + } + + /* check if mlx5 driver didn't modified SRQ */ obj.dv.srq.in = verbs_srq; obj.dv.srq.out = &srq_info.dv; @@ -546,3 +667,43 @@ void uct_ib_mlx5_srq_cleanup(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs_srq) ucs_assertv_always(srq->tail == srq_info.dv.tail, "srq->tail=%d srq_info.tail=%d", srq->tail, srq_info.dv.tail); } + +ucs_status_t uct_ib_mlx5_modify_qp_state(uct_ib_mlx5_md_t *md, + uct_ib_mlx5_qp_t *qp, + enum ibv_qp_state state) +{ + if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX) { + return uct_ib_mlx5_devx_modify_qp_state(qp, state); + } else { + return uct_ib_modify_qp(qp->verbs.qp, state); + } +} + +ucs_status_t uct_ib_mlx5_md_get_atomic_mr_id(uct_ib_md_t *ibmd, uint8_t *mr_id) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(ibmd, uct_ib_mlx5_md_t); + +#if HAVE_EXP_UMR + if ((md->umr_qp == NULL) || (md->umr_cq == NULL)) { + goto unsupported; + } +#else + if (!(md->flags & UCT_IB_MLX5_MD_FLAG_DEVX)) { + goto unsupported; + } +#endif + + /* Generate atomic UMR id. We want umrs for same virtual addresses to have + * different ids across processes. + * + * Usually parallel processes running on the same node as part of a single + * job will have consecutive PIDs. For example MPI ranks, slurm spawned tasks... 
+ */ + *mr_id = getpid() % 256; + return UCS_OK; + +unsupported: + *mr_id = 0; + return UCS_ERR_UNSUPPORTED; +} + diff --git a/src/uct/ib/mlx5/ib_mlx5.h b/src/uct/ib/mlx5/ib_mlx5.h index 232c1cd79ba..253180dbdcb 100644 --- a/src/uct/ib/mlx5/ib_mlx5.h +++ b/src/uct/ib/mlx5/ib_mlx5.h @@ -36,9 +36,9 @@ # include #else # include -# include "ib_mlx5_hw.h" +# include #endif -#include "ib_mlx5_dv.h" +#include #include #include @@ -52,6 +52,7 @@ #define UCT_IB_MLX5_CQE128_SIZE_LOG 7 #define UCT_IB_MLX5_MAX_BB 4 #define UCT_IB_MLX5_WORKER_BF_KEY 0x00c1b7e8u +#define UCT_IB_MLX5_DEVX_UAR_KEY 0xdea1ab1eU #define UCT_IB_MLX5_RES_DOMAIN_KEY 0x1b1bda7aU #define UCT_IB_MLX5_WORKER_DM_KEY 0xacdf1245u #define UCT_IB_MLX5_EXTENDED_UD_AV 0x80 /* htonl(0x80000000) */ @@ -62,13 +63,16 @@ #define UCT_IB_MLX5_MAX_SEND_WQE_SIZE (UCT_IB_MLX5_MAX_BB * MLX5_SEND_WQE_BB) #define UCT_IB_MLX5_CQ_SET_CI 0 #define UCT_IB_MLX5_CQ_ARM_DB 1 -#define UCT_IB_MLX5_ROCE_SRC_PORT_MIN 0xC000 +#define UCT_IB_MLX5_LOG_MAX_MSG_SIZE 30 +#define UCT_IB_MLX5_ATOMIC_MODE 3 +#define UCT_IB_MLX5_CQE_FLAG_L3_IN_DATA UCS_BIT(28) /* GRH/IP in the receive buffer */ +#define UCT_IB_MLX5_CQE_FLAG_L3_IN_CQE UCS_BIT(29) /* GRH/IP in the CQE */ #define UCT_IB_MLX5_OPMOD_EXT_ATOMIC(_log_arg_size) \ ((8) | ((_log_arg_size) - 2)) -#if HAVE_STRUCT_MLX5_WQE_AV_BASE +#ifdef HAVE_STRUCT_MLX5_WQE_AV_BASE # define mlx5_av_base(_av) (&(_av)->base) # define mlx5_av_grh(_av) (&(_av)->grh_sec) @@ -95,13 +99,16 @@ struct mlx5_grh_av { uint8_t rgid[16]; }; +# define HAVE_STRUCT_MLX5_GRH_AV_RMAC 1 + #endif -#if !(HAVE_MLX5_WQE_CTRL_SOLICITED) +#ifndef MLX5_WQE_CTRL_SOLICITED # define MLX5_WQE_CTRL_SOLICITED (1<<1) #endif -#define UCT_IB_MLX5_WQE_CTRL_FENCE_ATOMIC (2<<5) +#define UCT_IB_MLX5_WQE_CTRL_FLAG_FENCE (2<<5) +#define UCT_IB_MLX5_WQE_CTRL_FLAG_STRONG_ORDER (3<<5) #define UCT_IB_MLX5_AM_ZCOPY_MAX_IOV 3UL @@ -118,20 +125,71 @@ struct mlx5_grh_av { #define UCT_IB_MLX5_PUT_MAX_SHORT(_av_size) \ (UCT_IB_MLX5_AM_MAX_SHORT(_av_size) - 
sizeof(struct mlx5_wqe_raddr_seg)) -#define UCT_IB_MLX5_SRQ_STRIDE (sizeof(struct mlx5_wqe_srq_next_seg) + \ - sizeof(struct mlx5_wqe_data_seg)) +#define UCT_IB_MLX5_XRQ_MIN_UWQ_POST 33 + +#define UCT_IB_MLX5_MD_FLAGS_DEVX_OBJS(_devx_objs) \ + ((_devx_objs) << UCT_IB_MLX5_MD_FLAG_DEVX_OBJS_SHIFT) + +#define UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(_obj) \ + UCT_IB_MLX5_MD_FLAGS_DEVX_OBJS(UCS_BIT(UCT_IB_DEVX_OBJ_ ## _obj)) + +#define UCT_IB_MLX5_DEVX_EVENT_TYPE_MASK 0xffff +#define UCT_IB_MLX5_DEVX_EVENT_DATA_SHIFT 16 + +enum { + /* Device supports KSM */ + UCT_IB_MLX5_MD_FLAG_KSM = UCS_BIT(0), + /* Device supports DEVX */ + UCT_IB_MLX5_MD_FLAG_DEVX = UCS_BIT(1), + /* Device supports TM DC */ + UCT_IB_MLX5_MD_FLAG_DC_TM = UCS_BIT(2), + /* Device supports MP RQ */ + UCT_IB_MLX5_MD_FLAG_MP_RQ = UCS_BIT(3), + /* Device supports creation of indirect MR with atomics access rights */ + UCT_IB_MLX5_MD_FLAG_INDIRECT_ATOMICS = UCS_BIT(4), + /* Device supports RMP to create SRQ for AM */ + UCT_IB_MLX5_MD_FLAG_RMP = UCS_BIT(5), + + /* Object to be created by DevX */ + UCT_IB_MLX5_MD_FLAG_DEVX_OBJS_SHIFT = 6, + UCT_IB_MLX5_MD_FLAG_DEVX_RC_QP = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(RCQP), + UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(RCSRQ), + UCT_IB_MLX5_MD_FLAG_DEVX_DCT = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(DCT), + UCT_IB_MLX5_MD_FLAG_DEVX_DC_SRQ = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(DCSRQ), +}; enum { - UCT_IB_MLX5_MD_FLAG_KSM = UCS_BIT(0) /* Device supports KSM */ + UCT_IB_MLX5_SRQ_TOPO_LIST = 0x0, + UCT_IB_MLX5_SRQ_TOPO_CYCLIC = 0x1, + UCT_IB_MLX5_SRQ_TOPO_LIST_MP_RQ = 0x2, + UCT_IB_MLX5_SRQ_TOPO_CYCLIC_MP_RQ = 0x3 }; +#if HAVE_DEVX +typedef struct uct_ib_mlx5_devx_umem { + struct mlx5dv_devx_umem *mem; + size_t size; +} uct_ib_mlx5_devx_umem_t; +#endif + /** * MLX5 IB memory domain. 
*/ typedef struct uct_ib_mlx5_md { uct_ib_md_t super; uint32_t flags; + ucs_mpool_t dbrec_pool; + ucs_recursive_spinlock_t dbrec_lock; +#if HAVE_EXP_UMR + struct ibv_qp *umr_qp; /* special QP for creating UMR */ + struct ibv_cq *umr_cq; /* special CQ for creating UMR */ +#endif + +#if HAVE_DEVX + void *zero_buf; + uct_ib_mlx5_devx_umem_t zero_mem; +#endif } uct_ib_mlx5_md_t; @@ -148,7 +206,7 @@ typedef enum { typedef struct uct_ib_mlx5_iface_config { -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM struct { size_t seg_len; unsigned count; @@ -158,15 +216,48 @@ typedef struct uct_ib_mlx5_iface_config { } uct_ib_mlx5_iface_config_t; +/** + * MLX5 DoorBell record + */ +typedef struct uct_ib_mlx5_dbrec { + volatile uint32_t db[2]; + uint32_t mem_id; + size_t offset; + uct_ib_mlx5_md_t *md; +} uct_ib_mlx5_dbrec_t; + + +typedef enum { + UCT_IB_MLX5_OBJ_TYPE_VERBS, + UCT_IB_MLX5_OBJ_TYPE_DEVX, + UCT_IB_MLX5_OBJ_TYPE_LAST +} uct_ib_mlx5_obj_type_t; + + /* Shared receive queue */ typedef struct uct_ib_mlx5_srq { - void *buf; - volatile uint32_t *db; - uint16_t free_idx; /* what is completed contiguously */ - uint16_t ready_idx; /* what is ready to be posted to hw */ - uint16_t sw_pi; /* what is posted to hw */ - uint16_t mask; - uint16_t tail; /* tail in the driver */ + uct_ib_mlx5_obj_type_t type; + uint32_t srq_num; + void *buf; + volatile uint32_t *db; + uint16_t free_idx; /* what is completed contiguously */ + uint16_t ready_idx; /* what is ready to be posted to hw */ + uint16_t sw_pi; /* what is posted to hw */ + uint16_t mask; + uint16_t tail; /* tail in the driver */ + uint16_t stride; + union { + struct { + struct ibv_srq *srq; + } verbs; +#if HAVE_DEVX + struct { + uct_ib_mlx5_dbrec_t *dbrec; + uct_ib_mlx5_devx_umem_t mem; + struct mlx5dv_devx_obj *obj; + } devx; +#endif + }; } uct_ib_mlx5_srq_t; @@ -194,8 +285,61 @@ typedef struct uct_ib_mlx5_mmio_reg { } uct_ib_mlx5_mmio_reg_t; +typedef struct uct_ib_mlx5_devx_uar { + uct_ib_mlx5_mmio_reg_t super; +#if HAVE_DEVX + struct 
mlx5dv_devx_uar *uar; +#endif + struct ibv_context *ctx; +} uct_ib_mlx5_devx_uar_t; + + +/* resource domain */ +typedef struct uct_ib_mlx5_res_domain { + uct_worker_tl_data_t super; +#ifdef HAVE_IBV_EXP_RES_DOMAIN + struct ibv_exp_res_domain *ibv_domain; +#elif HAVE_DECL_IBV_ALLOC_TD + struct ibv_td *td; + struct ibv_pd *pd; +#endif +} uct_ib_mlx5_res_domain_t; + + +typedef struct uct_ib_mlx5_qp_attr { + uct_ib_qp_attr_t super; + uct_ib_mlx5_mmio_mode_t mmio_mode; +} uct_ib_mlx5_qp_attr_t; + + +/* MLX5 QP wrapper */ +typedef struct uct_ib_mlx5_qp { + uct_ib_mlx5_obj_type_t type; + uint32_t qp_num; + union { + struct { + union { + struct ibv_qp *qp; +#ifdef HAVE_DC_EXP + struct ibv_exp_dct *dct; +#endif + }; + uct_ib_mlx5_res_domain_t *rd; + } verbs; +#if HAVE_DEVX + struct { + void *wq_buf; + uct_ib_mlx5_dbrec_t *dbrec; + uct_ib_mlx5_devx_umem_t mem; + struct mlx5dv_devx_obj *obj; + } devx; +#endif + }; +} uct_ib_mlx5_qp_t; + /* Send work-queue */ typedef struct uct_ib_mlx5_txwq { + uct_ib_mlx5_qp_t super; uint16_t sw_pi; /* PI for next WQE */ uint16_t prev_sw_pi; /* PI where last WQE *started* */ uct_ib_mlx5_mmio_reg_t *reg; @@ -205,11 +349,10 @@ typedef struct uct_ib_mlx5_txwq { void *qend; uint16_t bb_max; uint16_t sig_pi; /* PI for last signaled WQE */ -#if ENABLE_ASSERT +#if UCS_ENABLE_ASSERT uint16_t hw_ci; #endif - uint16_t fence_beat; - uint8_t next_fm; + uct_ib_fence_info_t fi; } uct_ib_mlx5_txwq_t; @@ -255,23 +398,26 @@ typedef struct uct_ib_mlx5_err_cqe { * SRQ segment * * We add some SW book-keeping information in the unused HW fields: - * - next_hole - points to the next out-of-order completed segment - * - desc - the receive descriptor. - * + * - desc - the receive descriptor. + * - strides - Number of available strides in this WQE. When it is 0, + * this segment can be reposted to the HW. Relevant for + * Multi-Packet SRQ only. + * - free - points to the next out-of-order completed segment. 
*/ typedef struct uct_rc_mlx5_srq_seg { union { struct mlx5_wqe_srq_next_seg mlx5_srq; struct { - uint8_t rsvd0[2]; + uint16_t ptr_mask; uint16_t next_wqe_index; /* Network byte order */ uint8_t signature; - uint8_t rsvd1[2]; + uint8_t rsvd1[1]; + uint8_t strides; uint8_t free; /* Released but not posted */ uct_ib_iface_recv_desc_t *desc; /* Host byte order */ } srq; }; - struct mlx5_wqe_data_seg dptr; + struct mlx5_wqe_data_seg dptr[0]; } uct_ib_mlx5_srq_seg_t; @@ -301,44 +447,27 @@ struct uct_ib_mlx5_atomic_masked_fadd64_seg { uint64_t filed_boundary; } UCS_S_PACKED; +ucs_status_t uct_ib_mlx5_md_get_atomic_mr_id(uct_ib_md_t *md, uint8_t *mr_id); -typedef struct uct_ib_mlx5_iface_res_domain { - uct_worker_tl_data_t super; -#if HAVE_IBV_EXP_RES_DOMAIN - struct ibv_exp_res_domain *ibv_domain; -#elif HAVE_DECL_IBV_ALLOC_TD - struct ibv_td *td; - struct ibv_pd *pd; -#endif -} uct_ib_mlx5_iface_res_domain_t; - - -/** - * MLX5 common iface part - */ -typedef struct uct_ib_mlx5_iface_common { - uct_ib_mlx5_iface_res_domain_t *res_domain; -} uct_ib_mlx5_iface_common_t; - - -ucs_status_t uct_ib_mlx5_iface_init_res_domain(uct_ib_iface_t *iface, - uct_ib_mlx5_iface_common_t *mlx5); - -void uct_ib_mlx5_iface_cleanup_res_domain(uct_ib_mlx5_iface_common_t *mlx5); +ucs_status_t uct_ib_mlx5_iface_get_res_domain(uct_ib_iface_t *iface, + uct_ib_mlx5_qp_t *txwq); +void uct_ib_mlx5_iface_put_res_domain(uct_ib_mlx5_qp_t *qp); ucs_status_t uct_ib_mlx5_iface_create_qp(uct_ib_iface_t *iface, - uct_ib_mlx5_iface_common_t *mlx5, - uct_ib_qp_attr_t *attr, - struct ibv_qp **qp_p); + uct_ib_mlx5_qp_t *qp, + uct_ib_mlx5_qp_attr_t *attr); + +ucs_status_t uct_ib_mlx5_modify_qp_state(uct_ib_mlx5_md_t *md, + uct_ib_mlx5_qp_t *qp, + enum ibv_qp_state state); /** * Create CQ with DV */ -ucs_status_t uct_ib_mlx5_create_cq(struct ibv_context *context, int cqe, - struct ibv_comp_channel *channel, - int comp_vector, int ignore_overrun, - size_t *inl, struct ibv_cq **cq_p); +ucs_status_t 
uct_ib_mlx5_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir, + const uct_ib_iface_init_attr_t *init_attr, + int preferred_cpu, size_t inl); extern ucs_config_field_t uct_ib_mlx5_iface_config_table[]; @@ -355,7 +484,7 @@ ucs_status_t uct_ib_mlx5_get_compact_av(uct_ib_iface_t *iface, int *compact_av); /** * Requests completion notification. */ -int uct_ib_mlx5dv_arm_cq(uct_ib_mlx5_cq_t *cq, int solicited); +ucs_status_t uct_ib_mlx5dv_arm_cq(uct_ib_mlx5_cq_t *cq, int solicited); /** * Check for completion with error. @@ -363,12 +492,19 @@ int uct_ib_mlx5dv_arm_cq(uct_ib_mlx5_cq_t *cq, int solicited); void uct_ib_mlx5_check_completion(uct_ib_iface_t *iface, uct_ib_mlx5_cq_t *cq, struct mlx5_cqe64 *cqe); +ucs_status_t +uct_ib_mlx5_get_mmio_mode(uct_priv_worker_t *worker, + uct_ib_mlx5_mmio_mode_t cfg_mmio_mode, + unsigned bf_size, + uct_ib_mlx5_mmio_mode_t *mmio_mode); + /** * Initialize txwq structure. */ ucs_status_t uct_ib_mlx5_txwq_init(uct_priv_worker_t *worker, uct_ib_mlx5_mmio_mode_t cfg_mmio_mode, uct_ib_mlx5_txwq_t *txwq, struct ibv_qp *verbs_qp); + void uct_ib_mlx5_txwq_cleanup(uct_ib_mlx5_txwq_t* txwq); /** @@ -384,8 +520,164 @@ ucs_status_t uct_ib_mlx5_get_rxwq(struct ibv_qp *qp, uct_ib_mlx5_rxwq_t *wq); /** * Initialize srq structure. 
*/ -ucs_status_t uct_ib_mlx5_srq_init(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs_srq, - size_t sg_byte_count); -void uct_ib_mlx5_srq_cleanup(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs_srq); +ucs_status_t +uct_ib_mlx5_verbs_srq_init(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs_srq, + size_t sg_byte_count, int num_sge); + +void uct_ib_mlx5_srq_buff_init(uct_ib_mlx5_srq_t *srq, uint32_t head, + uint32_t tail, size_t sg_byte_count, int num_sge); + +void uct_ib_mlx5_verbs_srq_cleanup(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs_srq); + +/** + * DEVX UAR API + */ +int uct_ib_mlx5_devx_uar_cmp(uct_ib_mlx5_devx_uar_t *uar, + uct_ib_mlx5_md_t *md, + uct_ib_mlx5_mmio_mode_t mmio_mode); + +ucs_status_t uct_ib_mlx5_devx_uar_init(uct_ib_mlx5_devx_uar_t *uar, + uct_ib_mlx5_md_t *md, + uct_ib_mlx5_mmio_mode_t mmio_mode); + +void uct_ib_mlx5_devx_uar_cleanup(uct_ib_mlx5_devx_uar_t *uar); + +/** + * DEVX QP API + */ + +#if HAVE_DEVX + +ucs_status_t uct_ib_mlx5_devx_create_qp(uct_ib_iface_t *iface, + uct_ib_mlx5_qp_t *qp, + uct_ib_mlx5_txwq_t *tx, + uct_ib_mlx5_qp_attr_t *attr); + +ucs_status_t uct_ib_mlx5_devx_modify_qp(uct_ib_mlx5_qp_t *qp, + const void *in, size_t inlen, + void *out, size_t outlen); + +ucs_status_t uct_ib_mlx5_devx_modify_qp_state(uct_ib_mlx5_qp_t *qp, + enum ibv_qp_state state); + +void uct_ib_mlx5_devx_destroy_qp(uct_ib_mlx5_md_t *md, uct_ib_mlx5_qp_t *qp); + +static inline ucs_status_t +uct_ib_mlx5_md_buf_alloc(uct_ib_mlx5_md_t *md, size_t size, int silent, + void **buf_p, uct_ib_mlx5_devx_umem_t *mem, + char *name) +{ + ucs_log_level_t level = silent ? 
UCS_LOG_LEVEL_DEBUG : UCS_LOG_LEVEL_ERROR; + ucs_status_t status; + void *buf; + int ret; + + ret = ucs_posix_memalign(&buf, ucs_get_page_size(), size, name); + if (ret != 0) { + ucs_log(level, "failed to allocate buffer of %zu bytes: %m", size); + return UCS_ERR_NO_MEMORY; + } + + if (md->super.fork_init) { + ret = madvise(buf, size, MADV_DONTFORK); + if (ret != 0) { + ucs_log(level, "madvise(DONTFORK, buf=%p, len=%zu) failed: %m", buf, size); + status = UCS_ERR_IO_ERROR; + goto err_free; + } + } + + mem->size = size; + mem->mem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, buf, size, 0); + if (mem->mem == NULL) { + ucs_log(level, "mlx5dv_devx_umem_reg() failed: %m"); + status = UCS_ERR_NO_MEMORY; + goto err_dofork; + } + + *buf_p = buf; + return UCS_OK; + +err_dofork: + if (md->super.fork_init) { + madvise(buf, size, MADV_DOFORK); + } +err_free: + ucs_free(buf); + + return status; +} + +static inline void +uct_ib_mlx5_md_buf_free(uct_ib_mlx5_md_t *md, void *buf, uct_ib_mlx5_devx_umem_t *mem) +{ + int ret; + + if (buf == NULL) { + return; + } + + mlx5dv_devx_umem_dereg(mem->mem); + if (md->super.fork_init) { + ret = madvise(buf, mem->size, MADV_DOFORK); + if (ret != 0) { + ucs_warn("madvise(DOFORK, buf=%p, len=%zu) failed: %m", buf, mem->size); + } + } + ucs_free(buf); +} + +#else + +static inline ucs_status_t +uct_ib_mlx5_devx_create_qp(uct_ib_iface_t *iface, + uct_ib_mlx5_qp_t *qp, + uct_ib_mlx5_txwq_t *tx, + uct_ib_mlx5_qp_attr_t *attr) +{ + return UCS_ERR_UNSUPPORTED; +} + +static inline ucs_status_t +uct_ib_mlx5_devx_modify_qp(uct_ib_mlx5_qp_t *qp, + enum ibv_qp_state state) +{ + return UCS_ERR_UNSUPPORTED; +} + +static inline ucs_status_t +uct_ib_mlx5_devx_modify_qp_state(uct_ib_mlx5_qp_t *qp, enum ibv_qp_state state) +{ + return UCS_ERR_UNSUPPORTED; +} + +static inline void uct_ib_mlx5_devx_destroy_qp(uct_ib_mlx5_md_t *md, uct_ib_mlx5_qp_t *qp) { } + +#endif + +static inline uct_ib_mlx5_dbrec_t *uct_ib_mlx5_get_dbrec(uct_ib_mlx5_md_t *md) +{ + 
uct_ib_mlx5_dbrec_t *dbrec; + + ucs_recursive_spin_lock(&md->dbrec_lock); + dbrec = (uct_ib_mlx5_dbrec_t *)ucs_mpool_get_inline(&md->dbrec_pool); + ucs_recursive_spin_unlock(&md->dbrec_lock); + if (dbrec != NULL) { + dbrec->db[MLX5_SND_DBR] = 0; + dbrec->db[MLX5_RCV_DBR] = 0; + dbrec->md = md; + } + + return dbrec; +} + +static inline void uct_ib_mlx5_put_dbrec(uct_ib_mlx5_dbrec_t *dbrec) +{ + uct_ib_mlx5_md_t *md = dbrec->md; + + ucs_recursive_spin_lock(&md->dbrec_lock); + ucs_mpool_put_inline(dbrec); + ucs_recursive_spin_unlock(&md->dbrec_lock); +} #endif diff --git a/src/uct/ib/mlx5/ib_mlx5.inl b/src/uct/ib/mlx5/ib_mlx5.inl index 6a117d55fb6..e28e0013fb4 100644 --- a/src/uct/ib/mlx5/ib_mlx5.inl +++ b/src/uct/ib/mlx5/ib_mlx5.inl @@ -7,30 +7,73 @@ #include "ib_mlx5.h" -static UCS_F_ALWAYS_INLINE struct mlx5_cqe64* -uct_ib_mlx5_get_cqe(uct_ib_mlx5_cq_t *cq, unsigned index) +static UCS_F_ALWAYS_INLINE UCS_F_NON_NULL struct mlx5_cqe64* +uct_ib_mlx5_get_cqe(uct_ib_mlx5_cq_t *cq, unsigned cqe_index) +{ + return UCS_PTR_BYTE_OFFSET(cq->cq_buf, ((cqe_index & (cq->cq_length - 1)) << + cq->cqe_size_log)); +} + +static UCS_F_ALWAYS_INLINE int +uct_ib_mlx5_cqe_is_hw_owned(uint8_t op_own, unsigned cqe_index, unsigned mask) { - return cq->cq_buf + ((index & (cq->cq_length - 1)) << cq->cqe_size_log); + return (op_own & MLX5_CQE_OWNER_MASK) == !(cqe_index & mask); } static UCS_F_ALWAYS_INLINE int -uct_ib_mlx5_cqe_is_hw_owned(uint8_t op_own, unsigned index, unsigned mask) +uct_ib_mlx5_cqe_stride_index(struct mlx5_cqe64* cqe) +{ +#ifdef HAVE_STRUCT_MLX5_CQE64_IB_STRIDE_INDEX + return ntohs(cqe->ib_stride_index); +#else + uint16_t *stride = (uint16_t*)&cqe->rsvd20[2]; + return ntohs(*stride); +#endif +} + +static UCS_F_ALWAYS_INLINE int uct_ib_mlx5_srq_stride(int num_sge) +{ + int stride; + + stride = sizeof(struct mlx5_wqe_srq_next_seg) + + (num_sge * sizeof(struct mlx5_wqe_data_seg)); + + return ucs_roundup_pow2(stride); +} + +static UCS_F_ALWAYS_INLINE int 
+uct_ib_mlx5_srq_max_wrs(int rxq_len, int num_sge) +{ + return ucs_max(rxq_len / num_sge, UCT_IB_MLX5_XRQ_MIN_UWQ_POST); +} + +static UCS_F_ALWAYS_INLINE int +uct_ib_mlx5_cqe_is_grh_present(struct mlx5_cqe64* cqe) +{ + return cqe->flags_rqpn & htonl(UCT_IB_MLX5_CQE_FLAG_L3_IN_DATA | + UCT_IB_MLX5_CQE_FLAG_L3_IN_CQE); +} + +static UCS_F_ALWAYS_INLINE void* +uct_ib_mlx5_gid_from_cqe(struct mlx5_cqe64* cqe) { - return (op_own & MLX5_CQE_OWNER_MASK) == !(index & mask); + ucs_assert(uct_ib_mlx5_cqe_is_grh_present(cqe) == + htonl(UCT_IB_MLX5_CQE_FLAG_L3_IN_CQE)); /* GRH is in CQE */ + return UCS_PTR_BYTE_OFFSET(cqe, -UCT_IB_GRH_LEN); } static UCS_F_ALWAYS_INLINE struct mlx5_cqe64* uct_ib_mlx5_poll_cq(uct_ib_iface_t *iface, uct_ib_mlx5_cq_t *cq) { struct mlx5_cqe64 *cqe; - unsigned index; + unsigned cqe_index; uint8_t op_own; - index = cq->cq_ci; - cqe = uct_ib_mlx5_get_cqe(cq, index); - op_own = cqe->op_own; + cqe_index = cq->cq_ci; + cqe = uct_ib_mlx5_get_cqe(cq, cqe_index); + op_own = cqe->op_own; - if (ucs_unlikely(uct_ib_mlx5_cqe_is_hw_owned(op_own, index, cq->cq_length))) { + if (ucs_unlikely(uct_ib_mlx5_cqe_is_hw_owned(op_own, cqe_index, cq->cq_length))) { return NULL; } else if (ucs_unlikely(op_own & UCT_IB_MLX5_CQE_OP_OWN_ERR_MASK)) { UCS_STATIC_ASSERT(MLX5_CQE_INVALID & (UCT_IB_MLX5_CQE_OP_OWN_ERR_MASK >> 4)); @@ -39,15 +82,15 @@ uct_ib_mlx5_poll_cq(uct_ib_iface_t *iface, uct_ib_mlx5_cq_t *cq) return NULL; /* No CQE */ } - cq->cq_ci = index + 1; - return cqe; /* TODO optimize - let complier know cqe is not null */ + cq->cq_ci = cqe_index + 1; + return cqe; } static UCS_F_ALWAYS_INLINE uint16_t uct_ib_mlx5_txwq_update_bb(uct_ib_mlx5_txwq_t *wq, uint16_t hw_ci) { -#if ENABLE_ASSERT +#if UCS_ENABLE_ASSERT wq->hw_ci = hw_ci; #endif return wq->bb_max - (wq->prev_sw_pi - hw_ci); @@ -59,7 +102,7 @@ static inline void uct_ib_mlx5_txwq_validate(uct_ib_mlx5_txwq_t *wq, uint16_t num_bb) { -#if ENABLE_ASSERT +#if UCS_ENABLE_ASSERT uint16_t wqe_s, wqe_e; uint16_t hw_ci, 
sw_pi; uint16_t wqe_cnt; @@ -69,12 +112,12 @@ uct_ib_mlx5_txwq_validate(uct_ib_mlx5_txwq_t *wq, uint16_t num_bb) return; } - wqe_cnt = (wq->qend - wq->qstart) / MLX5_SEND_WQE_BB; + wqe_cnt = UCS_PTR_BYTE_DIFF(wq->qstart, wq->qend) / MLX5_SEND_WQE_BB; if (wqe_cnt < wq->bb_max) { ucs_fatal("wqe count (%u) < bb_max (%u)", wqe_cnt, wq->bb_max); } - wqe_s = (wq->curr - wq->qstart) / MLX5_SEND_WQE_BB; + wqe_s = UCS_PTR_BYTE_DIFF(wq->qstart, wq->curr) / MLX5_SEND_WQE_BB; wqe_e = (wqe_s + num_bb) % wqe_cnt; sw_pi = wq->prev_sw_pi % wqe_cnt; @@ -118,12 +161,16 @@ uct_ib_mlx5_inline_copy(void *restrict dest, const void *restrict src, unsigned { ptrdiff_t n; - if (dest + length <= wq->qend) { + ucs_assert(dest != NULL); + ucs_assert((src != NULL) || (length == 0)); + + if (UCS_PTR_BYTE_OFFSET(dest, length) <= wq->qend) { + /* cppcheck-suppress nullPointer */ memcpy(dest, src, length); } else { - n = wq->qend - dest; + n = UCS_PTR_BYTE_DIFF(dest, wq->qend); memcpy(dest, src, n); - memcpy(wq->qstart, src + n, length - n); + memcpy(wq->qstart, UCS_PTR_BYTE_OFFSET(src, n), length - n); } } @@ -156,7 +203,8 @@ static UCS_F_ALWAYS_INLINE void * uct_ib_mlx5_txwq_wrap_any(uct_ib_mlx5_txwq_t *txwq, void *seg) { if (ucs_unlikely(seg >= txwq->qend)) { - seg -= (txwq->qend - txwq->qstart); + seg = UCS_PTR_BYTE_OFFSET(seg, -UCS_PTR_BYTE_DIFF(txwq->qstart, + txwq->qend)); } return uct_ib_mlx5_txwq_wrap_none(txwq, seg); } @@ -168,7 +216,8 @@ static UCS_F_ALWAYS_INLINE void * uct_ib_mlx5_txwq_wrap_data(uct_ib_mlx5_txwq_t *txwq, void *data) { if (ucs_unlikely(data >= txwq->qend)) { - data -= (txwq->qend - txwq->qstart); + data = UCS_PTR_BYTE_OFFSET(data, -UCS_PTR_BYTE_DIFF(txwq->qstart, + txwq->qend)); } return data; } @@ -210,6 +259,8 @@ uct_ib_mlx5_set_dgram_seg(struct mlx5_wqe_datagram_seg *seg, mlx5_av_base(&seg->av)->key.dc_key = htobe64(UCT_IB_KEY); #endif } + ucs_assert(av != NULL); + /* cppcheck-suppress ctunullpointer */ mlx5_av_base(&seg->av)->dqp_dct = av->dqp_dct; 
mlx5_av_base(&seg->av)->stat_rate_sl = av->stat_rate_sl; mlx5_av_base(&seg->av)->fl_mlid = av->fl_mlid; @@ -334,12 +385,12 @@ uct_ib_mlx5_set_data_seg(struct mlx5_wqe_data_seg *dptr, static UCS_F_ALWAYS_INLINE -unsigned uct_ib_mlx5_set_data_seg_iov(uct_ib_mlx5_txwq_t *txwq, - struct mlx5_wqe_data_seg *dptr, - const uct_iov_t *iov, size_t iovcnt) +size_t uct_ib_mlx5_set_data_seg_iov(uct_ib_mlx5_txwq_t *txwq, + struct mlx5_wqe_data_seg *dptr, + const uct_iov_t *iov, size_t iovcnt) { - unsigned len = 0; - size_t iov_it; + size_t wqe_size = 0; + size_t iov_it; for (iov_it = 0; iov_it < iovcnt; ++iov_it) { if (!iov[iov_it].length) { /* Skip zero length WQE*/ @@ -349,12 +400,14 @@ unsigned uct_ib_mlx5_set_data_seg_iov(uct_ib_mlx5_txwq_t *txwq, /* place data into the buffer */ dptr = uct_ib_mlx5_txwq_wrap_any(txwq, dptr); - uct_ib_mlx5_set_data_seg(dptr, iov[iov_it].buffer, iov[iov_it].length, - ((uct_ib_mem_t*)iov[iov_it].memh)->lkey); - len += sizeof(*dptr); + uct_ib_mlx5_set_data_seg(dptr, iov[iov_it].buffer, + uct_iov_get_length(iov + iov_it), + uct_ib_memh_get_lkey(iov[iov_it].memh)); + wqe_size += sizeof(*dptr); ++dptr; } - return len; + + return wqe_size; } @@ -378,8 +431,8 @@ void *uct_ib_mlx5_bf_copy(void *dst, void *src, uint16_t num_bb, for (n = 0; n < num_bb; ++n) { uct_ib_mlx5_bf_copy_bb(dst, src); - dst += MLX5_SEND_WQE_BB; - src += MLX5_SEND_WQE_BB; + dst = UCS_PTR_BYTE_OFFSET(dst, MLX5_SEND_WQE_BB); + src = UCS_PTR_BYTE_OFFSET(src, MLX5_SEND_WQE_BB); if (ucs_unlikely(src == wq->qend)) { src = wq->qstart; } @@ -416,7 +469,7 @@ uct_ib_mlx5_post_send(uct_ib_mlx5_txwq_t *wq, ucs_assert(num_bb <= UCT_IB_MLX5_MAX_BB); if (ucs_likely(wq->reg->mode == UCT_IB_MLX5_MMIO_MODE_BF_POST)) { src = uct_ib_mlx5_bf_copy(dst, src, num_bb, wq); - ucs_memory_bus_wc_flush(); + ucs_memory_bus_cacheline_wc_flush(); } else if (wq->reg->mode == UCT_IB_MLX5_MMIO_MODE_BF_POST_MT) { src = uct_ib_mlx5_bf_copy(dst, src, num_bb, wq); /* Make sure that HW observes WC writes in order, in 
case of multiple @@ -427,7 +480,8 @@ uct_ib_mlx5_post_send(uct_ib_mlx5_txwq_t *wq, ucs_assert(wq->reg->mode == UCT_IB_MLX5_MMIO_MODE_DB); *(volatile uint64_t*)dst = *(volatile uint64_t*)src; ucs_memory_bus_store_fence(); - src = uct_ib_mlx5_txwq_wrap_any(wq, src + (num_bb * MLX5_SEND_WQE_BB)); + src = UCS_PTR_BYTE_OFFSET(src, num_bb * MLX5_SEND_WQE_BB); + src = uct_ib_mlx5_txwq_wrap_any(wq, src); } /* We don't want the compiler to reorder instructions and hurt latency */ @@ -453,43 +507,54 @@ uct_ib_mlx5_post_send(uct_ib_mlx5_txwq_t *wq, static inline uct_ib_mlx5_srq_seg_t * -uct_ib_mlx5_srq_get_wqe(uct_ib_mlx5_srq_t *srq, uint16_t index) +uct_ib_mlx5_srq_get_wqe(uct_ib_mlx5_srq_t *srq, uint16_t wqe_index) { - ucs_assert(index <= srq->mask); - return srq->buf + index * UCT_IB_MLX5_SRQ_STRIDE; + return UCS_PTR_BYTE_OFFSET(srq->buf, (wqe_index & srq->mask) * srq->stride); } -static inline void uct_ib_mlx5_iface_set_av_sport(uct_ib_iface_t *iface, - uct_ib_mlx5_base_av_t *av, - uint32_t flow_id) +static ucs_status_t UCS_F_MAYBE_UNUSED +uct_ib_mlx5_iface_fill_attr(uct_ib_iface_t *iface, + uct_ib_mlx5_qp_t *qp, + uct_ib_mlx5_qp_attr_t *attr) { - uint16_t sport; + ucs_status_t status; - if (!uct_ib_iface_is_roce(iface) || - (ntohs(av->rlid) >= UCT_IB_MLX5_ROCE_SRC_PORT_MIN)) { - return; + status = uct_ib_mlx5_iface_get_res_domain(iface, qp); + if (status != UCS_OK) { + return status; } - sport = flow_id ^ (flow_id >> 16); - av->rlid = htons(UCT_IB_MLX5_ROCE_SRC_PORT_MIN | sport); -} - -static void UCS_F_MAYBE_UNUSED -uct_ib_mlx5_iface_fill_attr(uct_ib_iface_t *iface, - uct_ib_mlx5_iface_common_t *mlx5, - uct_ib_qp_attr_t *attr) - -{ #if HAVE_DECL_IBV_EXP_CREATE_QP - attr->ibv.comp_mask = IBV_EXP_QP_INIT_ATTR_PD; - attr->ibv.pd = uct_ib_iface_md(iface)->pd; + attr->super.ibv.comp_mask = IBV_EXP_QP_INIT_ATTR_PD; + attr->super.ibv.pd = uct_ib_iface_md(iface)->pd; #elif HAVE_DECL_IBV_CREATE_QP_EX - attr->ibv.comp_mask = IBV_QP_INIT_ATTR_PD; - attr->ibv.pd = 
mlx5->res_domain->pd; + attr->super.ibv.comp_mask = IBV_QP_INIT_ATTR_PD; + if (qp->verbs.rd->pd != NULL) { + attr->super.ibv.pd = qp->verbs.rd->pd; + } else { + attr->super.ibv.pd = uct_ib_iface_md(iface)->pd; + } #endif -#if HAVE_IBV_EXP_RES_DOMAIN - attr->ibv.comp_mask |= IBV_EXP_QP_INIT_ATTR_RES_DOMAIN; - attr->ibv.res_domain = mlx5->res_domain->ibv_domain; +#ifdef HAVE_IBV_EXP_RES_DOMAIN + attr->super.ibv.comp_mask |= IBV_EXP_QP_INIT_ATTR_RES_DOMAIN; + attr->super.ibv.res_domain = qp->verbs.rd->ibv_domain; #endif + + return UCS_OK; +} + +static void UCS_F_MAYBE_UNUSED +uct_ib_mlx5_destroy_qp(uct_ib_mlx5_md_t *md, uct_ib_mlx5_qp_t *qp) +{ + switch (qp->type) { + case UCT_IB_MLX5_OBJ_TYPE_VERBS: + uct_ib_destroy_qp(qp->verbs.qp); + break; + case UCT_IB_MLX5_OBJ_TYPE_DEVX: + uct_ib_mlx5_devx_destroy_qp(md, qp); + break; + case UCT_IB_MLX5_OBJ_TYPE_LAST: + break; + } } diff --git a/src/uct/ib/mlx5/ib_mlx5_dv.c b/src/uct/ib/mlx5/ib_mlx5_dv.c deleted file mode 100644 index 0d6b86e70e3..00000000000 --- a/src/uct/ib/mlx5/ib_mlx5_dv.c +++ /dev/null @@ -1,373 +0,0 @@ -/** -* Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. -* -* See file LICENSE for terms. -*/ - -#include -#include "ib_mlx5.h" -#include "ib_mlx5_log.h" -#include "ib_mlx5_ifc.h" - -typedef struct uct_ib_mlx5_mem { - uct_ib_mem_t super; - struct mlx5dv_devx_obj *atomic_dvmr; -} uct_ib_mlx5_mem_t; - -#if HAVE_DECL_MLX5DV_INIT_OBJ -ucs_status_t uct_ib_mlx5dv_init_obj(uct_ib_mlx5dv_t *obj, uint64_t type) -{ - int ret; - - ret = mlx5dv_init_obj(&obj->dv, type); - if (ret != 0) { - ucs_error("DV failed to get mlx5 information. 
Type %lx.", type); - return UCS_ERR_NO_DEVICE; - } - - return UCS_OK; -} -#endif - -#if HAVE_DECL_MLX5DV_CONTEXT_FLAGS_DEVX -static ucs_status_t uct_ib_mlx5dv_create_ksm(uct_ib_md_t *ibmd, - uct_ib_mem_t *ib_memh, - off_t offset) -{ - uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); - uct_ib_mlx5_md_t *md = ucs_derived_of(ibmd, uct_ib_mlx5_md_t); - uint32_t out[UCT_IB_MLX5DV_ST_SZ_DW(create_mkey_out)] = {}; - struct ibv_mr *mr = memh->super.mr; - ucs_status_t status = UCS_OK; - struct mlx5dv_pd dvpd = {}; - struct mlx5dv_obj dv = {}; - size_t reg_length, length, inlen; - int list_size, i; - void *mkc, *klm; - uint32_t *in; - intptr_t addr; - - if (!(md->flags & UCT_IB_MLX5_MD_FLAG_KSM)) { - return UCS_ERR_UNSUPPORTED; - } - - reg_length = UCT_IB_MD_MAX_MR_SIZE; - addr = (intptr_t)mr->addr & ~(reg_length - 1); - length = mr->length + (intptr_t)mr->addr - addr; - list_size = ucs_div_round_up(length, reg_length); - inlen = UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_in) + - UCT_IB_MLX5DV_ST_SZ_BYTES(klm) * list_size; - - in = ucs_calloc(1, inlen, "mkey mailbox"); - if (in == NULL) { - return UCS_ERR_NO_MEMORY; - } - - dv.pd.in = md->super.pd; - dv.pd.out = &dvpd; - mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD); - - UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY); - mkc = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); - UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, UCT_IB_MLX5_MKC_ACCESS_MODE_KSM); - UCT_IB_MLX5DV_SET(mkc, mkc, a, 1); - UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1); - UCT_IB_MLX5DV_SET(mkc, mkc, rr, 1); - UCT_IB_MLX5DV_SET(mkc, mkc, lw, 1); - UCT_IB_MLX5DV_SET(mkc, mkc, lr, 1); - UCT_IB_MLX5DV_SET(mkc, mkc, pd, dvpd.pdn); - UCT_IB_MLX5DV_SET(mkc, mkc, translations_octword_size, list_size); - UCT_IB_MLX5DV_SET(mkc, mkc, log_entity_size, ucs_ilog2(reg_length)); - UCT_IB_MLX5DV_SET(mkc, mkc, qpn, 0xffffff); - UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, offset & 0xff); - UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, addr + 
offset); - UCT_IB_MLX5DV_SET64(mkc, mkc, len, length); - UCT_IB_MLX5DV_SET(create_mkey_in, in, translations_octword_actual_size, list_size); - - klm = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - for (i = 0; i < list_size; i++) { - if (i == list_size - 1) { - UCT_IB_MLX5DV_SET(klm, klm, byte_count, length % reg_length); - } else { - UCT_IB_MLX5DV_SET(klm, klm, byte_count, reg_length); - } - UCT_IB_MLX5DV_SET(klm, klm, mkey, mr->lkey); - UCT_IB_MLX5DV_SET64(klm, klm, address, addr + (i * reg_length)); - klm += UCT_IB_MLX5DV_ST_SZ_BYTES(klm); - } - - memh->atomic_dvmr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, inlen, - out, sizeof(out)); - if (memh->atomic_dvmr == NULL) { - ucs_debug("CREATE_MKEY KSM failed: %m"); - status = UCS_ERR_UNSUPPORTED; - md->flags &= ~UCT_IB_MLX5_MD_FLAG_KSM; - goto out; - } - - memh->super.atomic_rkey = - (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) << 8) | - (offset & 0xff); - - ucs_debug("KSM registered memory %p..%p offset 0x%lx on %s rkey 0x%x", - mr->addr, mr->addr + mr->length, offset, uct_ib_device_name(&md->super.dev), - memh->super.atomic_rkey); -out: - ucs_free(in); - return status; -} - -static ucs_status_t uct_ib_mlx5dv_memh_dereg(uct_ib_md_t *ibmd, - uct_ib_mem_t *ib_memh) -{ - uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); - int ret; - - ret = mlx5dv_devx_obj_destroy(memh->atomic_dvmr); - if (ret != 0) { - return UCS_ERR_IO_ERROR; - } - return UCS_OK; -} - -static uct_ib_md_ops_t uct_ib_mlx5dv_md_ops = { - .memh_struct_size = sizeof(uct_ib_mlx5_mem_t), - .reg_atomic_key = uct_ib_mlx5dv_create_ksm, - .dereg_atomic_key = uct_ib_mlx5dv_memh_dereg, -}; - -static ucs_status_t uct_ib_mlx5_check_dc(uct_ib_device_t *dev) -{ - ucs_status_t status = UCS_OK; - struct ibv_context *ctx = dev->ibv_context; - struct ibv_qp_init_attr_ex qp_attr = {}; - struct mlx5dv_qp_init_attr dv_attr = {}; - struct ibv_pd *pd; - struct ibv_cq *cq; - struct ibv_qp *qp; - - pd = ibv_alloc_pd(ctx); - 
if (pd == NULL) { - ucs_error("ibv_alloc_pd() failed: %m"); - return UCS_ERR_IO_ERROR; - } - - cq = ibv_create_cq(ctx, 1, NULL, NULL, 0); - if (cq == NULL) { - ucs_error("ibv_create_cq() failed: %m"); - status = UCS_ERR_IO_ERROR; - goto err_cq; - } - - qp_attr.send_cq = cq; - qp_attr.recv_cq = cq; - qp_attr.cap.max_send_wr = 1; - qp_attr.cap.max_send_sge = 1; - qp_attr.qp_type = IBV_QPT_DRIVER; - qp_attr.comp_mask = IBV_QP_INIT_ATTR_PD; - qp_attr.pd = pd; - - dv_attr.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_DC; - dv_attr.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCI; - - /* create DCI qp successful means DC is supported */ - qp = mlx5dv_create_qp(ctx, &qp_attr, &dv_attr); - if (qp) { - ibv_destroy_qp(qp); - dev->flags |= UCT_IB_DEVICE_FLAG_DC; - } - - ibv_destroy_cq(cq); -err_cq: - ibv_dealloc_pd(pd); - return status; -} - -static ucs_status_t uct_ib_mlx5dv_md_open(struct ibv_device *ibv_device, - uct_ib_md_t **p_md) -{ - uint32_t out[UCT_IB_MLX5DV_ST_SZ_DW(query_hca_cap_out)] = {}; - uint32_t in[UCT_IB_MLX5DV_ST_SZ_DW(query_hca_cap_in)] = {}; - struct mlx5dv_context_attr dv_attr = {}; - ucs_status_t status = UCS_OK; - int atomic = 0, has_dc = 1; - struct ibv_context *ctx; - uct_ib_device_t *dev; - uct_ib_mlx5_md_t *md; - void *cap; - int ret; - -#if HAVE_DECL_MLX5DV_IS_SUPPORTED - if (!mlx5dv_is_supported(ibv_device)) { - return UCS_ERR_UNSUPPORTED; - } -#endif - - dv_attr.flags |= MLX5DV_CONTEXT_FLAGS_DEVX; - ctx = mlx5dv_open_device(ibv_device, &dv_attr); - if (ctx == NULL) { - ucs_debug("mlx5dv_open_device(%s) failed: %m", ibv_get_device_name(ibv_device)); - status = UCS_ERR_UNSUPPORTED; - goto err; - } - - md = ucs_calloc(1, sizeof(*md), "ib_mlx5_md"); - if (md == NULL) { - status = UCS_ERR_NO_MEMORY; - goto err_free_context; - } - - md->super.ops = &uct_ib_mlx5dv_md_ops; - dev = &md->super.dev; - dev->ibv_context = ctx; - - IBV_EXP_DEVICE_ATTR_SET_COMP_MASK(&dev->dev_attr); - ret = ibv_query_device_ex(dev->ibv_context, NULL, &dev->dev_attr); - if (ret != 0) { - 
ucs_error("ibv_query_device() returned %d: %m", ret); - status = UCS_ERR_IO_ERROR; - goto err_free; - } - - cap = UCT_IB_MLX5DV_ADDR_OF(query_hca_cap_out, out, capability); - UCT_IB_MLX5DV_SET(query_hca_cap_in, in, opcode, UCT_IB_MLX5_CMD_OP_QUERY_HCA_CAP); - UCT_IB_MLX5DV_SET(query_hca_cap_in, in, op_mod, UCT_IB_MLX5_HCA_CAP_OPMOD_GET_MAX | - (UCT_IB_MLX5_CAP_GENERAL << 1)); - ret = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); - if (ret == 0) { - if (!UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, dct)) { - has_dc = 0; - } - if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, compact_address_vector)) { - dev->flags |= UCT_IB_DEVICE_FLAG_AV; - } - if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, fixed_buffer_size)) { - md->flags |= UCT_IB_MLX5_MD_FLAG_KSM; - } - if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, atomic)) { - atomic = 1; - } - } else if ((errno != EPERM) && - (errno != EPROTONOSUPPORT) && - (errno != EOPNOTSUPP)) { - ucs_error("MLX5_CMD_OP_QUERY_HCA_CAP failed: %m"); - status = UCS_ERR_IO_ERROR; - goto err_free; - } else { - status = UCS_ERR_UNSUPPORTED; - goto err_free; - } - - if (atomic) { - int ops = UCT_IB_MLX5_ATOMIC_OPS_CMP_SWAP | - UCT_IB_MLX5_ATOMIC_OPS_FETCH_ADD; - uint8_t arg_size; - int cap_ops, mode8b; - - UCT_IB_MLX5DV_SET(query_hca_cap_in, in, op_mod, UCT_IB_MLX5_HCA_CAP_OPMOD_GET_MAX | - (UCT_IB_MLX5_CAP_ATOMIC << 1)); - ret = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); - if (ret != 0) { - ucs_error("MLX5_CMD_OP_QUERY_HCA_CAP failed: %m"); - return UCS_ERR_IO_ERROR; - } - - arg_size = UCT_IB_MLX5DV_GET(atomic_caps, cap, atomic_size_qp); - cap_ops = UCT_IB_MLX5DV_GET(atomic_caps, cap, atomic_operations); - mode8b = UCT_IB_MLX5DV_GET(atomic_caps, cap, atomic_req_8B_endianness_mode); - - if ((cap_ops & ops) == ops) { - dev->atomic_arg_sizes = sizeof(uint64_t); - if (!mode8b) { - dev->atomic_arg_sizes_be = sizeof(uint64_t); - } - } - - ops |= UCT_IB_MLX5_ATOMIC_OPS_MASKED_CMP_SWAP | - UCT_IB_MLX5_ATOMIC_OPS_MASKED_FETCH_ADD; - - if (has_dc) 
{ - arg_size &= UCT_IB_MLX5DV_GET(query_hca_cap_out, out, - capability.atomic_caps.atomic_size_dc); - } - - if ((cap_ops & ops) == ops) { - dev->ext_atomic_arg_sizes = arg_size; - if (mode8b) { - arg_size &= ~(sizeof(uint64_t)); - } - dev->ext_atomic_arg_sizes_be = arg_size; - } - - dev->pci_fadd_arg_sizes = UCT_IB_MLX5DV_GET(atomic_caps, cap, fetch_add_pci_atomic) << 2; - dev->pci_cswap_arg_sizes = UCT_IB_MLX5DV_GET(atomic_caps, cap, compare_swap_pci_atomic) << 2; - } - - if (has_dc) { - status = uct_ib_mlx5_check_dc(dev); - } - - dev->flags |= UCT_IB_DEVICE_FLAG_MLX5_PRM; - *p_md = &md->super; - return status; - -err_free: - ucs_free(md); -err_free_context: - ibv_close_device(ctx); -err: - return status; -} - -UCT_IB_MD_OPEN(uct_ib_mlx5dv_md_open, 1); - -ucs_status_t uct_ib_mlx5_get_compact_av(uct_ib_iface_t *iface, int *compact_av) -{ - *compact_av = !!(uct_ib_iface_device(iface)->flags & UCT_IB_DEVICE_FLAG_AV); - return UCS_OK; -} -#endif - -int uct_ib_mlx5dv_arm_cq(uct_ib_mlx5_cq_t *cq, int solicited) -{ - uint64_t doorbell, sn_ci_cmd; - uint32_t sn, ci, cmd; - - sn = cq->cq_sn & 3; - ci = cq->cq_ci & 0xffffff; - cmd = solicited ? 
MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT; - sn_ci_cmd = (sn << 28) | cmd | ci; - - cq->dbrec[UCT_IB_MLX5_CQ_ARM_DB] = htobe32(sn_ci_cmd); - - ucs_memory_cpu_fence(); - - doorbell = (sn_ci_cmd << 32) | cq->cq_num; - - *(uint64_t *)((uint8_t *)cq->uar + MLX5_CQ_DOORBELL) = htobe64(doorbell); - - ucs_memory_bus_store_fence(); - - return 0; -} - -#if HAVE_DECL_MLX5DV_OBJ_AH -void uct_ib_mlx5_get_av(struct ibv_ah *ah, struct mlx5_wqe_av *av) -{ - struct mlx5dv_obj dv; - struct mlx5dv_ah dah; - - dv.ah.in = ah; - dv.ah.out = &dah; - mlx5dv_init_obj(&dv, MLX5DV_OBJ_AH); - - *av = *(dah.av); - av->dqp_dct |= UCT_IB_MLX5_EXTENDED_UD_AV; -} -#elif !HAVE_INFINIBAND_MLX5_HW_H -void uct_ib_mlx5_get_av(struct ibv_ah *ah, struct mlx5_wqe_av *av) -{ - ucs_bug("MLX5DV_OBJ_AH not supported"); -} -#endif - diff --git a/src/uct/ib/mlx5/ib_mlx5_dv.h b/src/uct/ib/mlx5/ib_mlx5_dv.h deleted file mode 100644 index a4798061fb6..00000000000 --- a/src/uct/ib/mlx5/ib_mlx5_dv.h +++ /dev/null @@ -1,63 +0,0 @@ -/** -* Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. -* -* See file LICENSE for terms. -*/ - -#ifndef UCT_IB_MLX5_DV_H_ -#define UCT_IB_MLX5_DV_H_ - -#ifndef UCT_IB_MLX5_H_ -# error "Never include directly; use instead." -#endif - -#include -#include - -#include - -typedef struct { - struct mlx5dv_obj dv; -} uct_ib_mlx5dv_t; - -typedef struct { - struct mlx5dv_qp dv; -} uct_ib_mlx5dv_qp_t; - -typedef struct { - struct mlx5dv_srq dv; -} uct_ib_mlx5dv_srq_t; - -/* Completion queue */ -typedef struct { - struct mlx5dv_cq dv; -} uct_ib_mlx5dv_cq_t; - -/** - * Get internal verbs information. - */ -ucs_status_t uct_ib_mlx5dv_init_obj(uct_ib_mlx5dv_t *obj, uint64_t type); - -/** - * Update CI to support req_notify_cq - */ -void uct_ib_mlx5_update_cq_ci(struct ibv_cq *cq, unsigned cq_ci); - -/** - * Retrieve CI from the driver - */ -unsigned uct_ib_mlx5_get_cq_ci(struct ibv_cq *cq); - -/** - * Get internal AV information. 
- */ -void uct_ib_mlx5_get_av(struct ibv_ah *ah, struct mlx5_wqe_av *av); - -/** - * Backports for legacy bare-metal support - */ -struct ibv_qp *uct_dv_get_cmd_qp(struct ibv_srq *srq); - -void *uct_dv_get_info_uar0(void *uar); - -#endif diff --git a/src/uct/ib/mlx5/ib_mlx5_log.c b/src/uct/ib/mlx5/ib_mlx5_log.c index 667470cd72a..4b835d5ff3b 100644 --- a/src/uct/ib/mlx5/ib_mlx5_log.c +++ b/src/uct/ib/mlx5/ib_mlx5_log.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ib_mlx5_log.h" #include @@ -64,7 +68,7 @@ ucs_status_t uct_ib_mlx5_completion_with_err(uct_ib_iface_t *iface, wqe_index = ntohs(ecqe->wqe_counter); qp_num = ntohl(ecqe->s_wqe_opcode_qpn) & UCS_MASK(UCT_IB_QPN_ORDER); if (txwq != NULL) { - wqe_index %= (txwq->qend - txwq->qstart) / MLX5_SEND_WQE_BB; + wqe_index %= UCS_PTR_BYTE_DIFF(txwq->qstart, txwq->qend) / MLX5_SEND_WQE_BB; } if (ecqe->syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR) { @@ -121,7 +125,7 @@ ucs_status_t uct_ib_mlx5_completion_with_err(uct_ib_iface_t *iface, } if ((txwq != NULL) && ((ecqe->op_own >> 4) == MLX5_CQE_REQ_ERR)) { - wqe = txwq->qstart + (MLX5_SEND_WQE_BB * wqe_index); + wqe = UCS_PTR_BYTE_OFFSET(txwq->qstart, MLX5_SEND_WQE_BB * wqe_index); uct_ib_mlx5_wqe_dump(iface, wqe, txwq->qstart, txwq->qend, INT_MAX, 0, NULL, wqe_info, sizeof(wqe_info) - 1, NULL); } else { @@ -141,12 +145,12 @@ ucs_status_t uct_ib_mlx5_completion_with_err(uct_ib_iface_t *iface, } static unsigned uct_ib_mlx5_parse_dseg(void **dseg_p, void *qstart, void *qend, - struct ibv_sge *sg_list, int *index, + struct ibv_sge *sg_list, int *sg_index, int *is_inline) { struct mlx5_wqe_data_seg *dpseg; struct mlx5_wqe_inl_data_seg *inl; - struct ibv_sge *sg = &sg_list[*index]; + struct ibv_sge *sg = &sg_list[*sg_index]; int byte_count; void *addr; int ds; @@ -160,19 +164,19 @@ static unsigned uct_ib_mlx5_parse_dseg(void **dseg_p, void *qstart, void *qend, sg->addr = (uintptr_t)addr; sg->lkey = 0; byte_count 
= ntohl(inl->byte_count) & ~MLX5_INLINE_SEG; - if (addr + byte_count > qend) { - sg->length = qend - addr; + if (UCS_PTR_BYTE_OFFSET(addr, byte_count) > qend) { + sg->length = UCS_PTR_BYTE_DIFF(addr, qend); (sg + 1)->addr = (uintptr_t)qstart; (sg + 1)->lkey = 0; (sg + 1)->length = byte_count - sg->length; - ++(*index); + ++(*sg_index); } else { - sg->length = byte_count; + sg->length = byte_count; } *is_inline = 1; ds = ucs_div_round_up(sizeof(*inl) + byte_count, UCT_IB_MLX5_WQE_SEG_SIZE); - ++(*index); + ++(*sg_index); } else { dpseg = *dseg_p; sg->addr = be64toh(dpseg->addr); @@ -180,12 +184,12 @@ static unsigned uct_ib_mlx5_parse_dseg(void **dseg_p, void *qstart, void *qend, sg->lkey = ntohl(dpseg->lkey); *is_inline = 0; ds = 1; - ++(*index); + ++(*sg_index); } - *dseg_p += ds * UCT_IB_MLX5_WQE_SEG_SIZE; + *dseg_p = UCS_PTR_BYTE_OFFSET(*dseg_p, ds * UCT_IB_MLX5_WQE_SEG_SIZE); if (*dseg_p >= qend) { - *dseg_p -= (qend - qstart); + *dseg_p = UCS_PTR_BYTE_OFFSET(*dseg_p, -UCS_PTR_BYTE_DIFF(qstart, qend)); } return ds; } @@ -234,8 +238,11 @@ static size_t uct_ib_mlx5_dump_dgram(char *buf, size_t max, void *seg, int is_et sgid_index = (htonl(grh_av->grh_gid_fl) >> 20) & UCS_MASK(8); snprintf(p, endp - p, " sgix %d dgid %s tc %d]", sgid_index, - inet_ntop(AF_INET6, grh_av->rgid, gid_buf, sizeof(gid_buf)), + uct_ib_gid_str((union ibv_gid *)grh_av->rgid, gid_buf, + sizeof(gid_buf)), grh_av->tclass); + } else { + snprintf(p, endp - p, "]"); } return UCT_IB_MLX5_AV_FULL_SIZE; } else { @@ -293,7 +300,7 @@ static void uct_ib_mlx5_wqe_dump(uct_ib_iface_t *iface, void *wqe, void *qstart, /* QP and WQE index */ if (dump_qp) { snprintf(s, ends - s, "QP 0x%x [%03ld] ", qp_num, - (wqe - qstart) / MLX5_SEND_WQE_BB); + UCS_PTR_BYTE_DIFF(qstart, wqe) / MLX5_SEND_WQE_BB); s += strlen(s); } @@ -361,24 +368,26 @@ static void uct_ib_mlx5_wqe_dump(uct_ib_iface_t *iface, void *wqe, void *qstart, int size = 1 << ((opmod & 7) + 2); if (opcode == MLX5_OPCODE_ATOMIC_MASKED_FA) { - add = 
network_to_host(seg, size); - boundary = network_to_host(seg + size, size); - seg += ucs_align_up_pow2(size * 2, UCT_IB_MLX5_WQE_SEG_SIZE); + add = network_to_host(seg, size); + boundary = network_to_host(UCS_PTR_BYTE_OFFSET(seg, size), size); + seg = UCS_PTR_BYTE_OFFSET(seg, + ucs_align_up_pow2(size * 2, + UCT_IB_MLX5_WQE_SEG_SIZE)); ds -= ucs_div_round_up(2 * size, UCT_IB_MLX5_WQE_SEG_SIZE); uct_ib_log_dump_atomic_masked_fadd(size, add, boundary, s, ends - s); } else if (opcode == MLX5_OPCODE_ATOMIC_MASKED_CS) { - swap = network_to_host(seg, size); - compare = network_to_host(seg + size, size); + swap = network_to_host(seg, size); + compare = network_to_host(UCS_PTR_BYTE_OFFSET(seg, size), size); - seg += size * 2; + seg = UCS_PTR_BYTE_OFFSET(seg, size * 2); if (seg == qend) { seg = qstart; } - swap_mask = network_to_host(seg, size); - compare_mask = network_to_host(seg + size, size); - seg += size * 2; + swap_mask = network_to_host(seg, size); + compare_mask = network_to_host(UCS_PTR_BYTE_OFFSET(seg, size), size); + seg = UCS_PTR_BYTE_OFFSET(seg, size * 2); if (seg == qend) { seg = qstart; } @@ -450,7 +459,7 @@ void __uct_ib_mlx5_log_rx(const char *file, int line, const char *function, length = ntohl(cqe->byte_cnt); if (iface->config.qp_type == IBV_QPT_UD) { length -= UCT_IB_GRH_LEN; - data += UCT_IB_GRH_LEN; + data = UCS_PTR_BYTE_OFFSET(data, UCT_IB_GRH_LEN); } uct_ib_log_dump_recv_completion(iface, ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER), diff --git a/src/uct/ib/rc/accel/rc_mlx5.h b/src/uct/ib/rc/accel/rc_mlx5.h index 95c8c7d7621..be341bf1c2b 100644 --- a/src/uct/ib/rc/accel/rc_mlx5.h +++ b/src/uct/ib/rc/accel/rc_mlx5.h @@ -16,16 +16,23 @@ #include +#define UCT_RC_MLX5_CHECK_RES_PTR(_iface, _ep) \ + UCT_RC_CHECK_CQE_RET(&(_iface)->super, &(_ep)->super, \ + UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE)) \ + UCT_RC_CHECK_TXQP_RET(&(_iface)->super, &(_ep)->super, \ + UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE)) + + /** * RC remote endpoint */ typedef struct 
uct_rc_mlx5_ep { - uct_rc_ep_t super; - unsigned qp_num; + uct_rc_ep_t super; struct { - uct_ib_mlx5_txwq_t wq; + uct_ib_mlx5_txwq_t wq; } tx; - struct ibv_qp *tm_qp; + uct_ib_mlx5_qp_t tm_qp; + uct_rc_mlx5_mp_context_t mp; } uct_rc_mlx5_ep_t; typedef struct uct_rc_mlx5_ep_address { @@ -107,6 +114,11 @@ ucs_status_t uct_rc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion ucs_status_t uct_rc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, uct_rc_fc_request_t *req); +ucs_status_t uct_rc_mlx5_iface_create_qp(uct_rc_mlx5_iface_common_t *iface, + uct_ib_mlx5_qp_t *qp, + uct_ib_mlx5_txwq_t *txwq, + uct_ib_mlx5_qp_attr_t *attr); + ucs_status_t uct_rc_mlx5_ep_connect_to_ep(uct_ep_h tl_ep, const uct_device_addr_t *dev_addr, const uct_ep_addr_t *ep_addr); @@ -140,4 +152,10 @@ ucs_status_t uct_rc_mlx5_ep_tag_rndv_request(uct_ep_h tl_ep, uct_tag_t tag, ucs_status_t uct_rc_mlx5_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr); +ucs_status_t uct_rc_mlx5_ep_handle_failure(uct_rc_mlx5_ep_t *ep, + ucs_status_t status); + +ucs_status_t uct_rc_mlx5_ep_set_failed(uct_ib_iface_t *iface, uct_ep_h ep, + ucs_status_t status); + #endif diff --git a/src/uct/ib/rc/accel/rc_mlx5.inl b/src/uct/ib/rc/accel/rc_mlx5.inl index 3393ac7aea6..49ac3d986b7 100644 --- a/src/uct/ib/rc/accel/rc_mlx5.inl +++ b/src/uct/ib/rc/accel/rc_mlx5.inl @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -7,6 +7,7 @@ #include "rc_mlx5.h" #include "rc_mlx5_common.h" +#include #include #include @@ -15,6 +16,22 @@ uct_rc_mlx5_iface_common_t *_iface = ucs_derived_of(_tl_ep->iface, \ uct_rc_mlx5_iface_common_t) + +static UCS_F_ALWAYS_INLINE void +uct_rc_mlx5_ep_fence_put(uct_rc_mlx5_iface_common_t *iface, uct_ib_mlx5_txwq_t *txwq, + uct_rkey_t *rkey, uint64_t *addr, uint16_t offset) +{ + uct_rc_ep_fence_put(&iface->super, &txwq->fi, rkey, addr, offset); +} + +static UCS_F_ALWAYS_INLINE void +uct_rc_mlx5_ep_fence_get(uct_rc_mlx5_iface_common_t *iface, uct_ib_mlx5_txwq_t *txwq, + uct_rkey_t *rkey, uint8_t *fm_ce_se) +{ + *rkey = uct_ib_md_direct_rkey(*rkey); + *fm_ce_se |= uct_rc_ep_fm(&iface->super, &txwq->fi, iface->config.atomic_fence_flag); +} + static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_common_update_tx_res(uct_rc_iface_t *rc_iface, uct_ib_mlx5_txwq_t *txwq, uct_rc_txqp_t *txqp, uint16_t hw_ci) @@ -66,7 +83,7 @@ uct_rc_mlx5_iface_common_rx_inline(uct_rc_mlx5_iface_common_t *iface, static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_srq_prefetch_setup(uct_rc_mlx5_iface_common_t *iface) { - unsigned wqe_ctr = (iface->rx.srq.free_idx + 2) & iface->rx.srq.mask; + unsigned wqe_ctr = iface->rx.srq.free_idx + 2; uct_ib_mlx5_srq_seg_t *seg; seg = uct_ib_mlx5_srq_get_wqe(&iface->rx.srq, wqe_ctr); @@ -74,33 +91,82 @@ uct_rc_mlx5_srq_prefetch_setup(uct_rc_mlx5_iface_common_t *iface) uct_ib_iface_recv_desc_hdr(&iface->super.super, seg->srq.desc); } +static UCS_F_NOINLINE void +uct_rc_mlx5_iface_hold_srq_desc(uct_rc_mlx5_iface_common_t *iface, + uct_ib_mlx5_srq_seg_t *seg, + struct mlx5_cqe64 *cqe, uint16_t wqe_ctr, + ucs_status_t status, unsigned offset, + uct_recv_desc_t *release_desc) +{ + void *udesc; + int stride_idx; + int desc_offset; + + if (UCT_RC_MLX5_MP_ENABLED(iface)) { + /* stride_idx is valid in non inline CQEs only. 
+ * We can assume that stride_idx is correct here, because CQE + * with data would always force upper layer to save the data and + * return UCS_OK from the corresponding callback. */ + stride_idx = uct_ib_mlx5_cqe_stride_index(cqe); + ucs_assert(stride_idx < iface->tm.mp.num_strides); + ucs_assert(!(cqe->op_own & (MLX5_INLINE_SCATTER_32 | + MLX5_INLINE_SCATTER_64))); + + udesc = (void*)be64toh(seg->dptr[stride_idx].addr); + desc_offset = offset - iface->super.super.config.rx_hdr_offset; + udesc = UCS_PTR_BYTE_OFFSET(udesc, desc_offset); + uct_recv_desc(udesc) = release_desc; + seg->srq.ptr_mask &= ~UCS_BIT(stride_idx); + } else { + udesc = UCS_PTR_BYTE_OFFSET(seg->srq.desc, offset); + uct_recv_desc(udesc) = release_desc; + seg->srq.ptr_mask &= ~1; + seg->srq.desc = NULL; + } +} + static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_iface_release_srq_seg(uct_rc_mlx5_iface_common_t *iface, - uct_ib_mlx5_srq_seg_t *seg, uint16_t wqe_ctr, + uct_ib_mlx5_srq_seg_t *seg, + struct mlx5_cqe64 *cqe, uint16_t wqe_ctr, ucs_status_t status, unsigned offset, uct_recv_desc_t *release_desc) { - void *udesc; + uint16_t wqe_index; + int seg_free; + + /* Need to wrap wqe_ctr, because in case of cyclic srq topology + * it is wrapped around 0xFFFF regardless of real SRQ size. + * But it respects srq size when srq topology is a linked-list. 
*/ + wqe_index = wqe_ctr & iface->rx.srq.mask; - if (ucs_likely((status == UCS_OK) && - (wqe_ctr == ((iface->rx.srq.ready_idx + 1) & - iface->rx.srq.mask)))) { + if (ucs_unlikely(status != UCS_OK)) { + uct_rc_mlx5_iface_hold_srq_desc(iface, seg, cqe, wqe_ctr, status, + offset, release_desc); + } + + if (UCT_RC_MLX5_MP_ENABLED(iface)) { + if (--seg->srq.strides) { + /* Segment can't be freed until all strides are consumed */ + return; + } + seg->srq.strides = iface->tm.mp.num_strides; + } + + seg_free = (seg->srq.ptr_mask == UCS_MASK(iface->tm.mp.num_strides)); + + if (ucs_likely(seg_free && (wqe_index == ((iface->rx.srq.ready_idx + 1) & + iface->rx.srq.mask)))) { /* If the descriptor was not used - if there are no "holes", we can just * reuse it on the receive queue. Otherwise, ready pointer will stay behind * until post_recv allocated more descriptors from the memory pool, fills * the holes, and moves it forward. */ - ucs_assert(wqe_ctr == ((iface->rx.srq.free_idx + 1) & - iface->rx.srq.mask)); + ucs_assert(wqe_index == ((iface->rx.srq.free_idx + 1) & iface->rx.srq.mask)); ++iface->rx.srq.ready_idx; ++iface->rx.srq.free_idx; } else { - if (status != UCS_OK) { - udesc = (char*)seg->srq.desc + offset; - uct_recv_desc(udesc) = release_desc; - seg->srq.desc = NULL; - } - if (wqe_ctr == ((iface->rx.srq.free_idx + 1) & iface->rx.srq.mask)) { + if (wqe_index == ((iface->rx.srq.free_idx + 1) & iface->rx.srq.mask)) { ++iface->rx.srq.free_idx; } else { /* Mark the segment as out-of-order, post_recv will advance free */ @@ -111,29 +177,119 @@ uct_rc_mlx5_iface_release_srq_seg(uct_rc_mlx5_iface_common_t *iface, ++iface->super.rx.srq.available; } +#define uct_rc_mlx5_iface_mp_hash_lookup(_h_name, _h_ptr, _key, _last, _flags, \ + _iface) \ + ({ \ + uct_rc_mlx5_mp_context_t *ctx; \ + khiter_t h_it; \ + int ret; \ + h_it = kh_get(_h_name, _h_ptr, _key); \ + if (h_it == kh_end(_h_ptr)) { \ + /* No data from this sender - this must be the first fragment */ \ + *(_flags) |= 
UCT_CB_PARAM_FLAG_FIRST; \ + if (ucs_likely(_last)) { \ + /* fast path - single fragment message */ \ + return &(_iface)->tm.mp.last_frag_ctx; \ + } \ + h_it = kh_put(_h_name, _h_ptr, _key, &ret); \ + ucs_assert(ret != 0); \ + ctx = &kh_value(_h_ptr, h_it); \ + } else { \ + ctx = &kh_value(_h_ptr, h_it); \ + if (_last) { \ + (_iface)->tm.mp.last_frag_ctx = *ctx; \ + kh_del(_h_name, _h_ptr, h_it); \ + return &(_iface)->tm.mp.last_frag_ctx; \ + } \ + } \ + *(_flags) |= UCT_CB_PARAM_FLAG_MORE; \ + ctx; \ + }) + +static UCS_F_ALWAYS_INLINE uct_rc_mlx5_mp_context_t* +uct_rc_mlx5_iface_rx_mp_context_from_ep(uct_rc_mlx5_iface_common_t *iface, + struct mlx5_cqe64 *cqe, unsigned *flags) +{ + uint32_t qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER); + uct_rc_mlx5_ep_t *ep = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, + qp_num), + uct_rc_mlx5_ep_t); + ucs_assert(ep != NULL); + if (ep->mp.free) { + *flags |= UCT_CB_PARAM_FLAG_FIRST; + ep->mp.free = 0; + } + + if (cqe->byte_cnt & htonl(UCT_RC_MLX5_MP_RQ_LAST_MSG_FIELD)) { + ucs_assert(!ep->mp.free); + ep->mp.free = 1; + } else { + *flags |= UCT_CB_PARAM_FLAG_MORE; + } + + return &ep->mp; +} + +static UCS_F_ALWAYS_INLINE uct_rc_mlx5_mp_context_t* +uct_rc_mlx5_iface_rx_mp_context_from_hash(uct_rc_mlx5_iface_common_t *iface, + struct mlx5_cqe64 *cqe, + unsigned *flags) +{ + uct_rc_mlx5_mp_context_t *mp_ctx; + uct_rc_mlx5_mp_hash_key_t key_gid; + uint64_t key_lid; + void *gid; + int last; + + last = cqe->byte_cnt & htonl(UCT_RC_MLX5_MP_RQ_LAST_MSG_FIELD); + + if (uct_ib_mlx5_cqe_is_grh_present(cqe)) { + gid = uct_ib_mlx5_gid_from_cqe(cqe); + /* Use guid and QP as a key. No need to fetch just qp + * and convert to le. */ + key_gid.guid = *(uint64_t*)UCS_PTR_BYTE_OFFSET(gid, 8); + key_gid.qp_num = cqe->flags_rqpn; + mp_ctx = uct_rc_mlx5_iface_mp_hash_lookup(uct_rc_mlx5_mp_hash_gid, + &iface->tm.mp.hash_gid, + key_gid, last, flags, + iface); + } else { + /* Combine QP and SLID as a key. 
No need to fetch just qp + * and convert to le. */ + key_lid = (uint64_t)cqe->flags_rqpn << 32 | cqe->slid; + mp_ctx = uct_rc_mlx5_iface_mp_hash_lookup(uct_rc_mlx5_mp_hash_lid, + &iface->tm.mp.hash_lid, + key_lid, last, flags, + iface); + } + + ucs_assert(mp_ctx != NULL); + return mp_ctx; +} + static UCS_F_ALWAYS_INLINE struct mlx5_cqe64* uct_rc_mlx5_iface_poll_rx_cq(uct_rc_mlx5_iface_common_t *iface) { uct_ib_mlx5_cq_t *cq = &iface->cq[UCT_IB_DIR_RX]; struct mlx5_cqe64 *cqe; - unsigned index; + unsigned idx; uint8_t op_own; /* Prefetch the descriptor if it was scheduled */ ucs_prefetch(iface->rx.pref_ptr); - index = cq->cq_ci; - cqe = uct_ib_mlx5_get_cqe(cq, index); + idx = cq->cq_ci; + cqe = uct_ib_mlx5_get_cqe(cq, idx); op_own = cqe->op_own; - if (ucs_unlikely(uct_ib_mlx5_cqe_is_hw_owned(op_own, index, cq->cq_length))) { + if (ucs_unlikely(uct_ib_mlx5_cqe_is_hw_owned(op_own, idx, cq->cq_length))) { return NULL; } else if (ucs_unlikely(op_own & UCT_IB_MLX5_CQE_OP_OWN_ERR_MASK)) { uct_rc_mlx5_iface_check_rx_completion(iface, cqe); return NULL; } - cq->cq_ci = index + 1; + cq->cq_ci = idx + 1; return cqe; /* TODO optimize - let complier know cqe is not null */ } @@ -177,6 +333,69 @@ uct_rc_mlx5_iface_common_data(uct_rc_mlx5_iface_common_t *iface, return hdr; } +static UCS_F_ALWAYS_INLINE uct_rc_mlx5_mp_context_t* +uct_rc_mlx5_iface_single_frag_context(uct_rc_mlx5_iface_common_t *iface, + unsigned *flags) +{ + *flags |= UCT_CB_PARAM_FLAG_FIRST; + return &iface->tm.mp.last_frag_ctx; +} + +static UCS_F_ALWAYS_INLINE void* +uct_rc_mlx5_iface_tm_common_data(uct_rc_mlx5_iface_common_t *iface, + struct mlx5_cqe64 *cqe, unsigned byte_len, + unsigned *flags, int poll_flags, + uct_rc_mlx5_mp_context_t **context_p) +{ + uct_ib_mlx5_srq_seg_t *seg; + void *hdr; + int stride_idx; + + if (!UCT_RC_MLX5_MP_ENABLED(iface)) { + /* uct_rc_mlx5_iface_common_data will initialize flags value */ + hdr = uct_rc_mlx5_iface_common_data(iface, cqe, byte_len, flags); + *context_p = 
uct_rc_mlx5_iface_single_frag_context(iface, flags); + return hdr; + } + + ucs_assert(byte_len <= UCT_RC_MLX5_MP_RQ_BYTE_CNT_FIELD_MASK); + *flags = 0; + + if (ucs_test_all_flags(poll_flags, UCT_RC_MLX5_POLL_FLAG_HAS_EP | + UCT_RC_MLX5_POLL_FLAG_TAG_CQE)) { + *context_p = uct_rc_mlx5_iface_rx_mp_context_from_ep(iface, cqe, flags); + } else if (poll_flags & UCT_RC_MLX5_POLL_FLAG_TAG_CQE) { + *context_p = uct_rc_mlx5_iface_rx_mp_context_from_hash(iface, cqe, flags); + } else { + /* Non-tagged messages (AM, RNDV Fin) should always arrive in + * a single frgament */ + *context_p = uct_rc_mlx5_iface_single_frag_context(iface, flags); + } + + /* Get a pointer to the tag header or the payload (if it is not the first + * fragment). */ + if (cqe->op_own & MLX5_INLINE_SCATTER_32) { + hdr = cqe; + uct_rc_mlx5_iface_common_rx_inline(iface, NULL, + UCT_RC_MLX5_IFACE_STAT_RX_INL_32, + byte_len); + } else if (cqe->op_own & MLX5_INLINE_SCATTER_64) { + hdr = cqe - 1; + uct_rc_mlx5_iface_common_rx_inline(iface, NULL, + UCT_RC_MLX5_IFACE_STAT_RX_INL_64, + byte_len); + } else { + *flags |= UCT_CB_PARAM_FLAG_DESC; + seg = uct_ib_mlx5_srq_get_wqe(&iface->rx.srq, ntohs(cqe->wqe_counter)); + stride_idx = uct_ib_mlx5_cqe_stride_index(cqe); + ucs_assert(stride_idx < iface->tm.mp.num_strides); + hdr = (void*)be64toh(seg->dptr[stride_idx].addr); + VALGRIND_MAKE_MEM_DEFINED(hdr, byte_len); + } + + return hdr; +} + static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_iface_common_am_handler(uct_rc_mlx5_iface_common_t *iface, struct mlx5_cqe64 *cqe, @@ -209,34 +428,18 @@ uct_rc_mlx5_iface_common_am_handler(uct_rc_mlx5_iface_common_t *iface, flags); } - uct_rc_mlx5_iface_release_srq_seg(iface, seg, wqe_ctr, status, - iface->super.super.config.rx_headroom_offset, - &iface->super.super.release_desc); -} - -static UCS_F_ALWAYS_INLINE void -uct_rc_mlx5_add_fence(uct_ib_md_t *md, uct_ib_mlx5_txwq_t *wq) -{ - if (md->dev.pci_fadd_arg_sizes || md->dev.pci_cswap_arg_sizes) { - wq->next_fm = 
UCT_IB_MLX5_WQE_CTRL_FENCE_ATOMIC; - } + uct_rc_mlx5_iface_release_srq_seg(iface, seg, cqe, wqe_ctr, status, + iface->tm.am_desc.offset, + &iface->tm.am_desc.super); } static UCS_F_ALWAYS_INLINE uint8_t -uct_rc_mlx5_ep_fm(uct_rc_mlx5_iface_common_t *iface, uct_ib_mlx5_txwq_t *txwq) +uct_rc_mlx5_ep_fm_cq_update(uct_rc_mlx5_iface_common_t *iface, + uct_ib_mlx5_txwq_t *txwq, int flag) { uint8_t fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; - fm_ce_se |= txwq->next_fm; - txwq->next_fm = 0; - - /* a call to iface_fence increases beat, so if endpoint beat is not in - * sync with iface beat it means the endpoint did not post any WQE with - * fence flag yet */ - if (txwq->fence_beat != iface->tx.fence_beat) { - txwq->fence_beat = iface->tx.fence_beat; - fm_ce_se |= iface->tx.next_fm; - } + fm_ce_se |= uct_rc_ep_fm(&iface->super, &txwq->fi, flag); return fm_ce_se; } @@ -256,11 +459,11 @@ uct_rc_mlx5_common_post_send(uct_rc_mlx5_iface_common_t *iface, int qp_type, if (opcode == MLX5_OPCODE_SEND_IMM) { uct_ib_mlx5_set_ctrl_seg_with_imm(ctrl, txwq->sw_pi, opcode, opmod, - txqp->qp->qp_num, fm_ce_se, wqe_size, + txwq->super.qp_num, fm_ce_se, wqe_size, imm); } else { uct_ib_mlx5_set_ctrl_seg(ctrl, txwq->sw_pi, opcode, opmod, - txqp->qp->qp_num, fm_ce_se, wqe_size); + txwq->super.qp_num, fm_ce_se, wqe_size); } ucs_assert(qp_type == iface->super.super.config.qp_type); @@ -280,6 +483,14 @@ uct_rc_mlx5_common_post_send(uct_rc_mlx5_iface_common_t *iface, int qp_type, if (fm_ce_se & MLX5_WQE_CTRL_CQ_UPDATE) { txwq->sig_pi = txwq->prev_sw_pi; } + +#if HAVE_TL_DC + if (qp_type == UCT_IB_QPT_DCI) { + txqp->available -= res_count; + return; + } +#endif + uct_rc_txqp_posted(txqp, &iface->super, res_count, fm_ce_se & MLX5_WQE_CTRL_CQ_UPDATE); } @@ -319,7 +530,8 @@ uct_rc_mlx5_txqp_inline_post(uct_rc_mlx5_iface_common_t *iface, int qp_type, ctrl = txwq->curr; ctrl_av_size = sizeof(*ctrl) + av_size; - next_seg = uct_ib_mlx5_txwq_wrap_exact(txwq, (void*)ctrl + ctrl_av_size); + next_seg = 
UCS_PTR_BYTE_OFFSET(ctrl, ctrl_av_size); + next_seg = uct_ib_mlx5_txwq_wrap_exact(txwq, next_seg); switch (opcode) { case MLX5_OPCODE_SEND_IMM: @@ -423,24 +635,27 @@ uct_rc_mlx5_txqp_dptr_post(uct_rc_mlx5_iface_common_t *iface, int qp_type, fm_ce_se |= uct_rc_iface_tx_moderation(&iface->super, txqp, MLX5_WQE_CTRL_CQ_UPDATE); } - opmod = 0; + opmod = 0; ctrl = txwq->curr; ctrl_av_size = sizeof(*ctrl) + av_size; - next_seg = uct_ib_mlx5_txwq_wrap_exact(txwq, (void*)ctrl + ctrl_av_size); + next_seg = UCS_PTR_BYTE_OFFSET(ctrl, ctrl_av_size); + next_seg = uct_ib_mlx5_txwq_wrap_exact(txwq, next_seg); switch (opcode_flags) { case MLX5_OPCODE_SEND_IMM: /* Used by tag offload */ case MLX5_OPCODE_SEND: /* Data segment only */ ucs_assert(length < (2ul << 30)); - ucs_assert(length <= iface->super.super.config.seg_size); + + /* TODO: make proper check for all cases TM, MP, etc + * ucs_assert(length <= iface->super.super.config.seg_size); */ wqe_size = ctrl_av_size + sizeof(struct mlx5_wqe_data_seg); uct_ib_mlx5_set_data_seg(next_seg, buffer, length, *lkey_p); break; case MLX5_OPCODE_RDMA_READ: - fm_ce_se |= uct_rc_mlx5_ep_fm(iface, txwq); + fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE; /* Fall through */ case MLX5_OPCODE_RDMA_WRITE: /* Set RDMA segment */ @@ -462,7 +677,8 @@ uct_rc_mlx5_txqp_dptr_post(uct_rc_mlx5_iface_common_t *iface, int qp_type, case MLX5_OPCODE_ATOMIC_FA: case MLX5_OPCODE_ATOMIC_CS: - fm_ce_se |= uct_rc_mlx5_ep_fm(iface, txwq); + fm_ce_se |= uct_rc_mlx5_ep_fm_cq_update(iface, txwq, + iface->config.atomic_fence_flag); ucs_assert(length == sizeof(uint64_t)); raddr = next_seg; uct_ib_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey); @@ -481,8 +697,9 @@ uct_rc_mlx5_txqp_dptr_post(uct_rc_mlx5_iface_common_t *iface, int qp_type, break; case MLX5_OPCODE_ATOMIC_MASKED_CS: - fm_ce_se |= uct_rc_mlx5_ep_fm(iface, txwq); - raddr = next_seg; + fm_ce_se |= uct_rc_mlx5_ep_fm_cq_update(iface, txwq, + iface->config.atomic_fence_flag); + raddr = next_seg; 
uct_ib_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey); switch (length) { @@ -519,8 +736,8 @@ uct_rc_mlx5_txqp_dptr_post(uct_rc_mlx5_iface_common_t *iface, int qp_type, break; case MLX5_OPCODE_ATOMIC_MASKED_FA: - fm_ce_se |= uct_rc_mlx5_ep_fm(iface, txwq); - raddr = next_seg; + fm_ce_se |= uct_rc_mlx5_ep_fm_cq_update(iface, txwq, iface->config.atomic_fence_flag); + raddr = next_seg; uct_ib_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey); switch (length) { @@ -585,7 +802,8 @@ void uct_rc_mlx5_txqp_dptr_post_iov(uct_rc_mlx5_iface_common_t *iface, int qp_ty ctrl = txwq->curr; ctrl_av_size = sizeof(*ctrl) + av_size; - next_seg = uct_ib_mlx5_txwq_wrap_exact(txwq, (void*)ctrl + ctrl_av_size); + next_seg = UCS_PTR_BYTE_OFFSET(ctrl, ctrl_av_size); + next_seg = uct_ib_mlx5_txwq_wrap_exact(txwq, next_seg); switch (opcode_flags) { case MLX5_OPCODE_SEND: @@ -629,7 +847,7 @@ void uct_rc_mlx5_txqp_dptr_post_iov(uct_rc_mlx5_iface_common_t *iface, int qp_ty #endif case MLX5_OPCODE_RDMA_READ: - fm_ce_se |= uct_rc_mlx5_ep_fm(iface, txwq); + fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE; /* Fall through */ case MLX5_OPCODE_RDMA_WRITE: /* Set RDMA segment */ @@ -657,7 +875,7 @@ void uct_rc_mlx5_txqp_dptr_post_iov(uct_rc_mlx5_iface_common_t *iface, int qp_ty #if IBV_HW_TM static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_set_tm_seg(uct_ib_mlx5_txwq_t *txwq, - uct_rc_mlx5_wqe_tm_seg_t *tmseg, int op, int index, + uct_rc_mlx5_wqe_tm_seg_t *tmseg, int op, int tag_index, uint32_t unexp_cnt, uint64_t tag, uint64_t mask, unsigned tm_flags) { @@ -669,7 +887,7 @@ uct_rc_mlx5_set_tm_seg(uct_ib_mlx5_txwq_t *txwq, return; } - tmseg->index = htons(index); + tmseg->index = htons(tag_index); if (op == UCT_RC_MLX5_TM_OPCODE_REMOVE) { return; @@ -691,7 +909,7 @@ uct_rc_mlx5_release_tag_entry(uct_rc_mlx5_iface_common_t *iface, } static UCS_F_ALWAYS_INLINE void -uct_rc_mlx5_add_cmd_qp_op(uct_rc_mlx5_iface_common_t *iface, +uct_rc_mlx5_add_cmd_wq_op(uct_rc_mlx5_iface_common_t *iface, uct_rc_mlx5_tag_entry_t *tag) { 
uct_rc_mlx5_srq_op_t *op; @@ -737,9 +955,12 @@ uct_rc_mlx5_txqp_tag_inline_post(uct_rc_mlx5_iface_common_t *iface, int qp_type, break; case IBV_TMH_RNDV: + ucs_assert(iov != NULL); /* RVH can be wrapped */ + /* cppcheck-suppress nullPointer */ uct_rc_mlx5_fill_rvh(&rvh, iov->buffer, - ((uct_ib_mem_t*)iov->memh)->mr->rkey, iov->length); + /* cppcheck-suppress nullPointer */ + ((uct_ib_mem_t*)iov->memh)->rkey, iov->length); uct_ib_mlx5_inline_copy(tmh + 1, &rvh, sizeof(rvh), txwq); tm_hdr_len = sizeof(*tmh) + sizeof(rvh); @@ -796,7 +1017,7 @@ uct_rc_mlx5_iface_common_post_srq_op(uct_rc_mlx5_cmd_wq_t *cmd_wq, tm = uct_ib_mlx5_txwq_wrap_none(txwq, ctrl + 1); uct_ib_mlx5_set_ctrl_seg(ctrl, txwq->sw_pi, UCT_RC_MLX5_OPCODE_TAG_MATCHING, - 0, cmd_wq->qp_num, 0, wqe_size); + 0, txwq->super.qp_num, 0, wqe_size); uct_rc_mlx5_set_tm_seg(txwq, tm, op_code, next_idx, unexp_cnt, tag, tag_mask, tm_flags); @@ -817,10 +1038,19 @@ uct_rc_mlx5_iface_common_tag_recv(uct_rc_mlx5_iface_common_t *iface, uct_rc_mlx5_tag_entry_t *tag_entry; uint16_t next_idx; unsigned ctrl_size; + int ret; UCT_CHECK_IOV_SIZE(iovcnt, 1ul, "uct_rc_mlx5_iface_common_tag_recv"); UCT_RC_MLX5_CHECK_TAG(iface); + kh_put(uct_rc_mlx5_tag_addrs, &iface->tm.tag_addrs, iov->buffer, &ret); + if (ucs_unlikely(ret == 0)) { + /* Do not post the same buffer more than once (even with different tags) + * to avoid memory corruption. 
*/ + return UCS_ERR_ALREADY_EXISTS; + } + ucs_assert(ret > 0); + ctrl_size = sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(uct_rc_mlx5_wqe_tm_seg_t); tag_entry = iface->tm.head; @@ -836,11 +1066,11 @@ uct_rc_mlx5_iface_common_tag_recv(uct_rc_mlx5_iface_common_t *iface, priv->buffer = iov->buffer; /* Only one iov is supported so far */ priv->length = iov->length; - uct_rc_mlx5_add_cmd_qp_op(iface, tag_entry); + uct_rc_mlx5_add_cmd_wq_op(iface, tag_entry); dptr = uct_ib_mlx5_txwq_wrap_none(txwq, (char*)txwq->curr + ctrl_size); uct_ib_mlx5_set_data_seg(dptr, iov->buffer, iov->length, - ((uct_ib_mem_t *)(iov->memh))->lkey); + uct_ib_memh_get_lkey(iov->memh)); uct_rc_mlx5_iface_common_post_srq_op(&iface->tm.cmd_wq, sizeof(*dptr), UCT_RC_MLX5_TM_OPCODE_APPEND, next_idx, @@ -854,27 +1084,39 @@ uct_rc_mlx5_iface_common_tag_recv(uct_rc_mlx5_iface_common_t *iface, return UCS_OK; } +static UCS_F_ALWAYS_INLINE void +uct_rc_mlx5_iface_tag_del_from_hash(uct_rc_mlx5_iface_common_t *iface, + void *buffer) +{ + khiter_t iter; + + iter = kh_get(uct_rc_mlx5_tag_addrs, &iface->tm.tag_addrs, buffer); + ucs_assert(iter != kh_end(&iface->tm.tag_addrs)); + kh_del(uct_rc_mlx5_tag_addrs, &iface->tm.tag_addrs, iter); +} + static UCS_F_ALWAYS_INLINE ucs_status_t uct_rc_mlx5_iface_common_tag_recv_cancel(uct_rc_mlx5_iface_common_t *iface, uct_tag_context_t *ctx, int force) { uct_rc_mlx5_ctx_priv_t *priv = uct_rc_mlx5_ctx_priv(ctx); - uint16_t index = priv->tag_handle; + uint16_t idx = priv->tag_handle; uct_rc_mlx5_tag_entry_t *tag_entry; unsigned flags; - tag_entry = &iface->tm.list[index]; + tag_entry = &iface->tm.list[idx]; if (ucs_likely(force)) { flags = UCT_RC_MLX5_SRQ_FLAG_TM_SW_CNT; uct_rc_mlx5_release_tag_entry(iface, tag_entry); + uct_rc_mlx5_iface_tag_del_from_hash(iface, priv->buffer); } else { flags = UCT_RC_MLX5_SRQ_FLAG_TM_CQE_REQ | UCT_RC_MLX5_SRQ_FLAG_TM_SW_CNT; - uct_rc_mlx5_add_cmd_qp_op(iface, tag_entry); + uct_rc_mlx5_add_cmd_wq_op(iface, tag_entry); } 
uct_rc_mlx5_iface_common_post_srq_op(&iface->tm.cmd_wq, 0, - UCT_RC_MLX5_TM_OPCODE_REMOVE, index, + UCT_RC_MLX5_TM_OPCODE_REMOVE, idx, iface->tm.unexpected_cnt, 0ul, 0ul, flags); @@ -898,6 +1140,7 @@ uct_rc_mlx5_iface_handle_tm_list_op(uct_rc_mlx5_iface_common_t *iface, int opcod if (opcode == UCT_RC_MLX5_CQE_APP_OP_TM_REMOVE) { ctx = op->tag->ctx; priv = uct_rc_mlx5_ctx_priv(ctx); + uct_rc_mlx5_iface_tag_del_from_hash(iface, priv->buffer); ctx->completed_cb(ctx, priv->tag, 0, priv->length, UCS_ERR_CANCELED); } } @@ -911,6 +1154,7 @@ uct_rc_mlx5_iface_tag_consumed(uct_rc_mlx5_iface_common_t *iface, uct_tag_context_t *ctx; uct_rc_mlx5_ctx_priv_t *priv; + /* coverity[tainted_data] */ tag = &iface->tm.list[ntohs(cqe->app_info)]; ctx = tag->ctx; ctx->tag_consumed_cb(ctx); @@ -926,18 +1170,23 @@ uct_rc_mlx5_iface_tag_consumed(uct_rc_mlx5_iface_common_t *iface, static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_iface_handle_expected(uct_rc_mlx5_iface_common_t *iface, struct mlx5_cqe64 *cqe, - unsigned byte_len, uint64_t tag, uint32_t app_ctx) + uint64_t tag, uint32_t app_ctx) { uint64_t imm_data; uct_rc_mlx5_tag_entry_t *tag_entry; uct_tag_context_t *ctx; uct_rc_mlx5_ctx_priv_t *priv; + unsigned byte_len; + /* coverity[tainted_data] */ tag_entry = &iface->tm.list[ntohs(cqe->app_info)]; ctx = tag_entry->ctx; priv = uct_rc_mlx5_ctx_priv(tag_entry->ctx); + /* Tag expected CQEs use all bits of byte_cnt even if MP XRQ is configured */ + byte_len = ntohl(cqe->byte_cnt); uct_rc_mlx5_release_tag_entry(iface, tag_entry); + uct_rc_mlx5_iface_tag_del_from_hash(iface, priv->buffer); if (cqe->op_own & MLX5_INLINE_SCATTER_64) { ucs_assert(byte_len <= priv->length); @@ -961,17 +1210,18 @@ uct_rc_mlx5_iface_handle_expected(uct_rc_mlx5_iface_common_t *iface, struct mlx5 static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_iface_unexp_consumed(uct_rc_mlx5_iface_common_t *iface, - uct_rc_mlx5_release_desc_t *release, - ucs_status_t status, uint16_t wqe_ctr) + unsigned offset, uct_recv_desc_t 
*release_desc, + struct mlx5_cqe64 *cqe, ucs_status_t status, + uint16_t wqe_ctr) { uct_ib_mlx5_srq_seg_t *seg; seg = uct_ib_mlx5_srq_get_wqe(&iface->rx.srq, wqe_ctr); - uct_rc_mlx5_iface_release_srq_seg(iface, seg, wqe_ctr, - status, release->offset, &release->super); + uct_rc_mlx5_iface_release_srq_seg(iface, seg, cqe, wqe_ctr, + status, offset, release_desc); - if (ucs_unlikely(!(++iface->tm.unexpected_cnt % IBV_DEVICE_MAX_UNEXP_COUNT))) { + if (ucs_unlikely(!(iface->tm.unexpected_cnt % IBV_DEVICE_MAX_UNEXP_COUNT))) { uct_rc_mlx5_iface_common_post_srq_op(&iface->tm.cmd_wq, 0, UCT_RC_MLX5_TM_OPCODE_NOP, 0, iface->tm.unexpected_cnt, 0ul, 0ul, @@ -983,65 +1233,130 @@ uct_rc_mlx5_iface_unexp_consumed(uct_rc_mlx5_iface_common_t *iface, static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_iface_tag_handle_unexp(uct_rc_mlx5_iface_common_t *iface, - struct mlx5_cqe64 *cqe, unsigned byte_len) + struct mlx5_cqe64 *cqe, unsigned byte_len, + int poll_flags) { - struct ibv_tmh *tmh; - uint64_t imm_data; - ucs_status_t status; - unsigned flags; - - tmh = uct_rc_mlx5_iface_common_data(iface, cqe, - byte_len, &flags); - - if (ucs_likely((tmh->opcode == IBV_TMH_EAGER) && + struct ibv_tmh *tmh; + uint64_t imm_data; + ucs_status_t status; + unsigned flags; + uct_rc_mlx5_mp_context_t *msg_ctx; + + tmh = uct_rc_mlx5_iface_tm_common_data(iface, cqe, byte_len, &flags, + poll_flags | + UCT_RC_MLX5_POLL_FLAG_TAG_CQE, + &msg_ctx); + + /* Fast path: single fragment eager message */ + if (ucs_likely(UCT_RC_MLX5_SINGLE_FRAG_MSG(flags) && + (tmh->opcode == IBV_TMH_EAGER) && !UCT_RC_MLX5_TM_CQE_WITH_IMM(cqe))) { - status = iface->tm.eager_unexp.cb(iface->tm.eager_unexp.arg, - tmh + 1, byte_len - sizeof(*tmh), - flags, tmh->tag, 0); + status = iface->tm.eager_unexp.cb(iface->tm.eager_unexp.arg, tmh + 1, + byte_len - sizeof(*tmh), flags, + tmh->tag, 0, &msg_ctx->context); - uct_rc_mlx5_iface_unexp_consumed(iface, &iface->tm.eager_desc, + ++iface->tm.unexpected_cnt; + 
uct_rc_mlx5_iface_unexp_consumed(iface, iface->tm.eager_desc.offset, + &iface->tm.eager_desc.super, cqe, status, ntohs(cqe->wqe_counter)); UCT_RC_MLX5_TM_STAT(iface, RX_EAGER_UNEXP); + return; - } else if (tmh->opcode == IBV_TMH_EAGER) { + } + + if (ucs_unlikely(!(flags & UCT_CB_PARAM_FLAG_FIRST))) { + /* Either middle or last fragment. Can pass zero tag, because it was + * already provided in the first fragment. If it is last fragment and + * CQE contains immediate value, construct user's immediate data using + * imm value and TMH->app_ctx (saved in message context when the first + * message arrived). Note, in case of send with immediate, only last + * fragment CQE contains immediate data. */ + ucs_assert(!UCT_RC_MLX5_TM_CQE_WITH_IMM(cqe) || + !(flags & UCT_CB_PARAM_FLAG_MORE)); imm_data = uct_rc_mlx5_tag_imm_data_unpack(cqe->imm_inval_pkey, - tmh->app_ctx, 1); + msg_ctx->app_ctx, + UCT_RC_MLX5_TM_CQE_WITH_IMM(cqe)); + status = iface->tm.eager_unexp.cb(iface->tm.eager_unexp.arg, tmh, + byte_len, flags, msg_ctx->tag, + imm_data, &msg_ctx->context); + + /* Do not increase unexpected_cnt count here, because it is counter per + * message rather than per every fragment */ + uct_rc_mlx5_iface_unexp_consumed(iface, + iface->super.super.config.rx_headroom_offset, + &iface->super.super.release_desc, + cqe, status, ntohs(cqe->wqe_counter)); + return; + } - if (ucs_unlikely(!imm_data)) { - /* Opcode is WITH_IMM, but imm_data is 0 - this must be SW RNDV */ - status = iface->tm.rndv_unexp.cb(iface->tm.rndv_unexp.arg, - flags, tmh->tag, tmh + 1, - byte_len - sizeof(*tmh), - 0ul, 0, NULL); + ++iface->tm.unexpected_cnt; - UCT_RC_MLX5_TM_STAT(iface, RX_RNDV_REQ_UNEXP); - } else { - status = iface->tm.eager_unexp.cb(iface->tm.eager_unexp.arg, - tmh + 1, byte_len - sizeof(*tmh), - flags, tmh->tag, imm_data); + if (ucs_unlikely(tmh->opcode == IBV_TMH_RNDV)) { + uct_rc_mlx5_handle_unexp_rndv(iface, tmh, tmh->tag, cqe, flags, byte_len); + return; + } - UCT_RC_MLX5_TM_STAT(iface, 
RX_EAGER_UNEXP); - } + ucs_assertv_always(tmh->opcode == IBV_TMH_EAGER, + "Unsupported packet arrived %d", tmh->opcode); - uct_rc_mlx5_iface_unexp_consumed(iface, &iface->tm.eager_desc, - status, ntohs(cqe->wqe_counter)); + /* Eager sync only, eager sync first or eager first. CQE can contain + immediate value if it is eager sync only or sw rndv messages */ + imm_data = uct_rc_mlx5_tag_imm_data_unpack(cqe->imm_inval_pkey, + tmh->app_ctx, + UCT_RC_MLX5_TM_CQE_WITH_IMM(cqe)); + + if (UCT_RC_MLX5_TM_CQE_WITH_IMM(cqe) && !imm_data) { + ucs_assert(UCT_RC_MLX5_SINGLE_FRAG_MSG(flags)); + /* Opcode is WITH_IMM, but imm_data is 0 - this must be SW RNDV */ + status = iface->tm.rndv_unexp.cb(iface->tm.rndv_unexp.arg, 0, tmh->tag, + tmh + 1, byte_len - sizeof(*tmh), + 0ul, 0, NULL); + + UCT_RC_MLX5_TM_STAT(iface, RX_RNDV_REQ_UNEXP); } else { - ucs_assertv_always(tmh->opcode == IBV_TMH_RNDV, - "Unsupported packet arrived %d", tmh->opcode); - status = uct_rc_mlx5_handle_rndv(iface, tmh, tmh->tag, byte_len); - uct_rc_mlx5_iface_unexp_consumed(iface, &iface->tm.rndv_desc, - status, ntohs(cqe->wqe_counter)); + /* Save app_context to assemble eager immediate data when the last + fragment arrives (and contains imm value) */ + msg_ctx->app_ctx = tmh->app_ctx; + + /* Save tag to pass it with non-first fragments */ + msg_ctx->tag = tmh->tag; - UCT_RC_MLX5_TM_STAT(iface, RX_RNDV_UNEXP); + status = iface->tm.eager_unexp.cb(iface->tm.eager_unexp.arg, + tmh + 1, byte_len - sizeof(*tmh), + flags, tmh->tag, imm_data, + &msg_ctx->context); + + UCT_RC_MLX5_TM_STAT(iface, RX_EAGER_UNEXP); } + + uct_rc_mlx5_iface_unexp_consumed(iface, iface->tm.eager_desc.offset, + &iface->tm.eager_desc.super, cqe, + status, ntohs(cqe->wqe_counter)); +} + +static UCS_F_NOINLINE void +uct_rc_mlx5_iface_handle_filler_cqe(uct_rc_mlx5_iface_common_t *iface, + struct mlx5_cqe64 *cqe) +{ + uct_ib_mlx5_srq_seg_t *seg; + + /* filler CQE is relevant for MP XRQ only */ + ucs_assert_always(UCT_RC_MLX5_MP_ENABLED(iface)); + + 
seg = uct_ib_mlx5_srq_get_wqe(&iface->rx.srq, ntohs(cqe->wqe_counter)); + + /* at least one stride should be in HW ownership when filler CQE arrives */ + ucs_assert(seg->srq.strides); + uct_rc_mlx5_iface_release_srq_seg(iface, seg, cqe, ntohs(cqe->wqe_counter), + UCS_OK, 0, NULL); } #endif /* IBV_HW_TM */ static UCS_F_ALWAYS_INLINE unsigned uct_rc_mlx5_iface_common_poll_rx(uct_rc_mlx5_iface_common_t *iface, - int is_tag_enabled) + int poll_flags) { uct_ib_mlx5_srq_seg_t UCS_V_UNUSED *seg; struct mlx5_cqe64 *cqe; @@ -1055,6 +1370,7 @@ uct_rc_mlx5_iface_common_poll_rx(uct_rc_mlx5_iface_common_t *iface, uct_rc_mlx5_tag_entry_t *tag; uct_tag_context_t *ctx; uct_rc_mlx5_ctx_priv_t *priv; + uct_rc_mlx5_mp_context_t UCS_V_UNUSED *dummy_ctx; #endif ucs_assert(uct_ib_mlx5_srq_get_wqe(&iface->rx.srq, @@ -1070,12 +1386,11 @@ uct_rc_mlx5_iface_common_poll_rx(uct_rc_mlx5_iface_common_t *iface, ucs_memory_cpu_load_fence(); UCS_STATS_UPDATE_COUNTER(iface->super.stats, UCT_RC_IFACE_STAT_RX_COMPLETION, 1); - byte_len = ntohl(cqe->byte_cnt); + byte_len = ntohl(cqe->byte_cnt) & UCT_RC_MLX5_MP_RQ_BYTE_CNT_FIELD_MASK; count = 1; - if (!is_tag_enabled) { - rc_hdr = uct_rc_mlx5_iface_common_data(iface, cqe, - byte_len, &flags); + if (!(poll_flags & UCT_RC_MLX5_POLL_FLAG_TM)) { + rc_hdr = uct_rc_mlx5_iface_common_data(iface, cqe, byte_len, &flags); uct_rc_mlx5_iface_common_am_handler(iface, cqe, rc_hdr, flags, byte_len); goto done; } @@ -1083,11 +1398,18 @@ uct_rc_mlx5_iface_common_poll_rx(uct_rc_mlx5_iface_common_t *iface, #if IBV_HW_TM ucs_assert(cqe->app == UCT_RC_MLX5_CQE_APP_TAG_MATCHING); + if (ucs_unlikely(byte_len & UCT_RC_MLX5_MP_RQ_FILLER_CQE)) { + /* TODO: Check if cqe->app_op is valid for filler CQE. Then this check + * could be done for specific CQE types only. */ + uct_rc_mlx5_iface_handle_filler_cqe(iface, cqe); + count = 0; + goto done; + } + /* Should be a fast path, because small (latency-critical) messages * are not supposed to be offloaded to the HW. 
*/ if (ucs_likely(cqe->app_op == UCT_RC_MLX5_CQE_APP_OP_TM_UNEXPECTED)) { - uct_rc_mlx5_iface_tag_handle_unexp(iface, cqe, - byte_len); + uct_rc_mlx5_iface_tag_handle_unexp(iface, cqe, byte_len, poll_flags); goto done; } @@ -1103,7 +1425,13 @@ uct_rc_mlx5_iface_common_poll_rx(uct_rc_mlx5_iface_common_t *iface, break; case UCT_RC_MLX5_CQE_APP_OP_TM_NO_TAG: - tmh = uct_rc_mlx5_iface_common_data(iface, cqe, byte_len, &flags); + /* TODO: optimize */ + tmh = uct_rc_mlx5_iface_tm_common_data(iface, cqe, byte_len, &flags, + poll_flags, &dummy_ctx); + + /* With MP XRQ, AM can be single-fragment only */ + ucs_assert(UCT_RC_MLX5_SINGLE_FRAG_MSG(flags)); + if (tmh->opcode == IBV_TMH_NO_TAG) { uct_rc_mlx5_iface_common_am_handler(iface, cqe, (uct_rc_mlx5_hdr_t*)tmh, @@ -1114,7 +1442,7 @@ uct_rc_mlx5_iface_common_poll_rx(uct_rc_mlx5_iface_common_t *iface, seg = uct_ib_mlx5_srq_get_wqe(&iface->rx.srq, ntohs(cqe->wqe_counter)); - uct_rc_mlx5_iface_release_srq_seg(iface, seg, + uct_rc_mlx5_iface_release_srq_seg(iface, seg, cqe, ntohs(cqe->wqe_counter), UCS_OK, 0, NULL); @@ -1133,16 +1461,15 @@ uct_rc_mlx5_iface_common_poll_rx(uct_rc_mlx5_iface_common_t *iface, uct_rc_mlx5_iface_tag_consumed(iface, cqe, UCT_RC_MLX5_CQE_APP_OP_TM_CONSUMED_MSG); - uct_rc_mlx5_iface_handle_expected(iface, cqe, byte_len, - tmh->tag, tmh->app_ctx); + uct_rc_mlx5_iface_handle_expected(iface, cqe, tmh->tag, tmh->app_ctx); break; case UCT_RC_MLX5_CQE_APP_OP_TM_EXPECTED: + /* coverity[tainted_data] */ tag = &iface->tm.list[ntohs(cqe->app_info)]; ctx = tag->ctx; priv = uct_rc_mlx5_ctx_priv(ctx); - uct_rc_mlx5_iface_handle_expected(iface, cqe, byte_len, - priv->tag, priv->app_ctx); + uct_rc_mlx5_iface_handle_expected(iface, cqe, priv->tag, priv->app_ctx); break; default: @@ -1154,12 +1481,12 @@ uct_rc_mlx5_iface_common_poll_rx(uct_rc_mlx5_iface_common_t *iface, done: max_batch = iface->super.super.config.rx_max_batch; if (ucs_unlikely(iface->super.rx.srq.available >= max_batch)) { - 
uct_rc_mlx5_iface_srq_post_recv(&iface->super, &iface->rx.srq); + uct_rc_mlx5_iface_srq_post_recv(iface); } return count; } -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM /* DM memory should be written by 8 bytes to eliminate * processor cache issues. To make this used uct_rc_mlx5_dm_copy_data_t * datatype where first hdr_len bytes are filled by message header @@ -1208,10 +1535,11 @@ uct_rc_mlx5_iface_common_copy_to_dm(uct_rc_mlx5_dm_copy_data_t *cache, size_t hd log_sge->num_sge = i; /* copy payload to DM */ - UCS_WORD_COPY(volatile uint64_t, dst, misaligned_t, payload + head, body); + UCS_WORD_COPY(volatile uint64_t, dst, misaligned_t, + UCS_PTR_BYTE_OFFSET(payload, head), body); if (tail) { dst += body; - memcpy(&padding, payload + head + body, tail); + memcpy(&padding, UCS_PTR_BYTE_OFFSET(payload, head + body), tail); /* use uint64_t for source datatype because it is aligned buffer on stack */ UCS_WORD_COPY(volatile uint64_t, dst, uint64_t, &padding, sizeof(padding)); } @@ -1249,7 +1577,7 @@ uct_rc_mlx5_common_dm_make_data(uct_rc_mlx5_iface_common_t *iface, * hint to valgrind to make it defined */ VALGRIND_MAKE_MEM_DEFINED(desc, sizeof(*desc)); ucs_assert(desc->super.buffer != NULL); - buffer = (void*)(desc->super.buffer - iface->dm.dm->start_va); + buffer = (void*)UCS_PTR_BYTE_DIFF(iface->dm.dm->start_va, desc->super.buffer); uct_rc_mlx5_iface_common_copy_to_dm(cache, hdr_len, payload, length, desc->super.buffer, log_sge); @@ -1311,7 +1639,7 @@ uct_rc_mlx5_iface_common_atomic_data(unsigned opcode, unsigned size, uint64_t va case UCT_ATOMIC_OP_XOR: *op = MLX5_OPCODE_ATOMIC_MASKED_FA; *compare_mask = 0; - *compare = -1; + *compare = UINT64_MAX; *swap_mask = 0; *swap = UCT_RC_MLX5_TO_BE(value, size); *ext = 1; @@ -1320,7 +1648,7 @@ uct_rc_mlx5_iface_common_atomic_data(unsigned opcode, unsigned size, uint64_t va *op = MLX5_OPCODE_ATOMIC_MASKED_CS; *compare_mask = 0; *compare = 0; - *swap_mask = -1; + *swap_mask = UINT64_MAX; *swap = UCT_RC_MLX5_TO_BE(value, size); *ext = 
1; break; @@ -1331,4 +1659,3 @@ uct_rc_mlx5_iface_common_atomic_data(unsigned opcode, unsigned size, uint64_t va return UCS_OK; } - diff --git a/src/uct/ib/rc/accel/rc_mlx5_common.c b/src/uct/ib/rc/accel/rc_mlx5_common.c index cb1740cd638..1b371219d84 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_common.c +++ b/src/uct/ib/rc/accel/rc_mlx5_common.c @@ -4,20 +4,20 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif #include "rc_mlx5.inl" #include #include +#include ucs_config_field_t uct_rc_mlx5_common_config_table[] = { - {"RC_", "", NULL, + {UCT_IB_CONFIG_PREFIX, "", NULL, ucs_offsetof(uct_rc_mlx5_iface_common_config_t, super), - UCS_CONFIG_TYPE_TABLE(uct_rc_iface_config_table)}, - - {"", "", NULL, - ucs_offsetof(uct_rc_mlx5_iface_common_config_t, mlx5_common), UCS_CONFIG_TYPE_TABLE(uct_ib_mlx5_iface_config_table)}, {"TX_MAX_BB", "-1", @@ -36,20 +36,53 @@ ucs_config_field_t uct_rc_mlx5_common_config_table[] = { "-1 means no limit.", ucs_offsetof(uct_rc_mlx5_iface_common_config_t, tm.list_size), UCS_CONFIG_TYPE_UINT}, - {"TM_MAX_BCOPY", "48k", - "Maximal size of copy-out sends when tag-matching offload is enabled", - ucs_offsetof(uct_rc_mlx5_iface_common_config_t, tm.max_bcopy), UCS_CONFIG_TYPE_MEMUNITS}, + {"TM_SEG_SIZE", "48k", + "Maximal size of copy-out sends when tag-matching offload is enabled.", + ucs_offsetof(uct_rc_mlx5_iface_common_config_t, tm.seg_size), + UCS_CONFIG_TYPE_MEMUNITS}, + + {"TM_MP_SRQ_ENABLE", "try", + "Enable multi-packet SRQ support. Relevant for hardware tag-matching only.", + ucs_offsetof(uct_rc_mlx5_iface_common_config_t, tm.mp_enable), + UCS_CONFIG_TYPE_TERNARY}, + + {"TM_MP_NUM_STRIDES", "8", + "Number of strides used per single receive WQE for hardware tag-matching\n" + "unexpected messages. Can be 8 or 16 only. 
Relevant when MP SRQ is enabled.", + ucs_offsetof(uct_rc_mlx5_iface_common_config_t, tm.mp_num_strides), + UCS_CONFIG_TYPE_ULUNITS}, + + {"TM_MAX_BCOPY", NULL, "", + ucs_offsetof(uct_rc_mlx5_iface_common_config_t, tm.seg_size), + UCS_CONFIG_TYPE_MEMUNITS}, + + {"EXP_BACKOFF", "0", + "Exponential Backoff Timeout Multiplier. ACK timeout will be multiplied \n" + "by 2^EXP_BACKOFF every consecutive retry.", + ucs_offsetof(uct_rc_mlx5_iface_common_config_t, exp_backoff), + UCS_CONFIG_TYPE_UINT}, + + {"CYCLIC_SRQ_ENABLE", "try", + "Enable using the \"cyclic\" SRQ type (SRQ is organized as a continuous \n" + "array of WQEs), otherwise - using the \"list\" SRQ type (SRQ is organized \n" + "as a buffer containing linked list of WQEs.", + ucs_offsetof(uct_rc_mlx5_iface_common_config_t, cyclic_srq_enable), + UCS_CONFIG_TYPE_TERNARY}, {NULL} }; -unsigned uct_rc_mlx5_iface_srq_post_recv(uct_rc_iface_t *iface, uct_ib_mlx5_srq_t *srq) +unsigned uct_rc_mlx5_iface_srq_post_recv(uct_rc_mlx5_iface_common_t *iface) { + uct_ib_mlx5_srq_t *srq = &iface->rx.srq; + uct_rc_iface_t *rc_iface = &iface->super; uct_ib_mlx5_srq_seg_t *seg; uct_ib_iface_recv_desc_t *desc; - uint16_t count, index, next_index; + uint16_t count, wqe_index, next_index; + uint64_t desc_map; void *hdr; + int i; /* Make sure the union is right */ UCS_STATIC_ASSERT(ucs_offsetof(uct_ib_mlx5_srq_seg_t, mlx5_srq.next_wqe_index) == @@ -59,10 +92,10 @@ unsigned uct_rc_mlx5_iface_srq_post_recv(uct_rc_iface_t *iface, uct_ib_mlx5_srq_ ucs_assert(UCS_CIRCULAR_COMPARE16(srq->ready_idx, <=, srq->free_idx)); - index = srq->ready_idx; + wqe_index = srq->ready_idx; for (;;) { - next_index = index + 1; - seg = uct_ib_mlx5_srq_get_wqe(srq, next_index & srq->mask); + next_index = wqe_index + 1; + seg = uct_ib_mlx5_srq_get_wqe(srq, next_index); if (UCS_CIRCULAR_COMPARE16(next_index, >, srq->free_idx)) { if (!seg->srq.free) { break; @@ -73,30 +106,33 @@ unsigned uct_rc_mlx5_iface_srq_post_recv(uct_rc_iface_t *iface, uct_ib_mlx5_srq_ 
srq->free_idx = next_index; } - if (seg->srq.desc == NULL) { - UCT_TL_IFACE_GET_RX_DESC(&iface->super.super, &iface->rx.mp, - desc, break); + desc_map = ~seg->srq.ptr_mask & UCS_MASK(iface->tm.mp.num_strides); + ucs_for_each_bit(i, desc_map) { + UCT_TL_IFACE_GET_RX_DESC(&rc_iface->super.super, &rc_iface->rx.mp, + desc, goto out); /* Set receive data segment pointer. Length is pre-initialized. */ - hdr = uct_ib_iface_recv_desc_hdr(&iface->super, desc); - seg->srq.desc = desc; - seg->dptr.lkey = htonl(desc->lkey); - seg->dptr.addr = htobe64((uintptr_t)hdr); - VALGRIND_MAKE_MEM_NOACCESS(hdr, iface->super.config.seg_size); + hdr = uct_ib_iface_recv_desc_hdr(&rc_iface->super, desc); + seg->srq.ptr_mask |= UCS_BIT(i); + seg->srq.desc = desc; /* Optimization for non-MP case (1 stride) */ + seg->dptr[i].lkey = htonl(desc->lkey); + seg->dptr[i].addr = htobe64((uintptr_t)hdr); + VALGRIND_MAKE_MEM_NOACCESS(hdr, rc_iface->super.config.seg_size); } - index = next_index; + wqe_index = next_index; } - count = index - srq->sw_pi; - ucs_assert(iface->rx.srq.available >= count); +out: + count = wqe_index - srq->sw_pi; + ucs_assert(rc_iface->rx.srq.available >= count); if (count > 0) { - srq->ready_idx = index; - srq->sw_pi = index; - iface->rx.srq.available -= count; + srq->ready_idx = wqe_index; + srq->sw_pi = wqe_index; + rc_iface->rx.srq.available -= count; ucs_memory_cpu_store_fence(); - *srq->db = htonl(srq->sw_pi); + *srq->db = htonl(srq->sw_pi); ucs_assert(uct_ib_mlx5_srq_get_wqe(srq, srq->mask)->srq.next_wqe_index == 0); } return count; @@ -106,7 +142,7 @@ void uct_rc_mlx5_iface_common_prepost_recvs(uct_rc_mlx5_iface_common_t *iface) { iface->super.rx.srq.available = iface->super.rx.srq.quota; iface->super.rx.srq.quota = 0; - uct_rc_mlx5_iface_srq_post_recv(&iface->super, &iface->rx.srq); + uct_rc_mlx5_iface_srq_post_recv(iface); } #define UCT_RC_MLX5_DEFINE_ATOMIC_LE_HANDLER(_bits) \ @@ -135,7 +171,7 @@ UCT_RC_MLX5_DEFINE_ATOMIC_LE_HANDLER(32) 
UCT_RC_MLX5_DEFINE_ATOMIC_LE_HANDLER(64) #if IBV_HW_TM -# if ENABLE_STATS +# ifdef ENABLE_STATS static ucs_stats_class_t uct_rc_mlx5_tag_stats_class = { .name = "tag", .num_counters = UCT_RC_MLX5_STAT_TAG_LAST, @@ -153,18 +189,63 @@ static ucs_stats_class_t uct_rc_mlx5_tag_stats_class = { }; # endif -static struct ibv_qp * -uct_rc_mlx5_get_cmd_qp(uct_rc_mlx5_iface_common_t *iface) + +static ucs_status_t UCS_F_MAYBE_UNUSED +uct_rc_mlx5_devx_create_cmd_qp(uct_rc_mlx5_iface_common_t *iface) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.md, + uct_ib_mlx5_md_t); + uct_ib_device_t *dev = &md->super.dev; + struct ibv_ah_attr ah_attr = {}; + uct_ib_mlx5_qp_attr_t attr = {}; + ucs_status_t status; + + ucs_assert(iface->tm.cmd_wq.super.super.type == UCT_IB_MLX5_OBJ_TYPE_LAST); + + attr.super.cap.max_send_wr = iface->tm.cmd_qp_len; + attr.super.cap.max_send_sge = 1; + attr.super.ibv.pd = md->super.pd; + attr.super.ibv.send_cq = iface->super.super.cq[UCT_IB_DIR_RX]; + attr.super.ibv.recv_cq = iface->super.super.cq[UCT_IB_DIR_RX]; + attr.super.srq_num = iface->rx.srq.srq_num; + attr.super.port = dev->first_port; + attr.mmio_mode = iface->tx.mmio_mode; + status = uct_ib_mlx5_devx_create_qp(&iface->super.super, + &iface->tm.cmd_wq.super.super, + &iface->tm.cmd_wq.super, + &attr); + if (status != UCS_OK) { + return status; + } + + ah_attr.is_global = 1; + ah_attr.grh.dgid = iface->super.super.gid_info.gid; + ah_attr.dlid = uct_ib_device_port_attr(dev, attr.super.port)->lid; + ah_attr.port_num = dev->first_port; + status = uct_rc_mlx5_iface_common_devx_connect_qp( + iface, &iface->tm.cmd_wq.super.super, + iface->tm.cmd_wq.super.super.qp_num, &ah_attr, + iface->super.super.config.path_mtu); + if (status != UCS_OK) { + goto err_destroy_qp; + } + + return UCS_OK; + +err_destroy_qp: + uct_ib_mlx5_devx_destroy_qp(md, &iface->tm.cmd_wq.super.super); + return status; +} + +static struct ibv_qp * UCS_F_MAYBE_UNUSED 
+uct_rc_mlx5_verbs_create_cmd_qp(uct_rc_mlx5_iface_common_t *iface) { -#if HAVE_STRUCT_MLX5_SRQ_CMD_QP - iface->tm.cmd_qp = NULL; - return uct_dv_get_cmd_qp(iface->super.rx.srq.srq); -#else uct_ib_md_t *md = uct_ib_iface_md(&iface->super.super); struct ibv_qp_init_attr qp_init_attr = {}; struct ibv_qp_attr qp_attr = {}; uct_ib_device_t *ibdev = &md->dev; struct ibv_port_attr *port_attr; + ucs_status_t status; struct ibv_qp *qp; uint8_t port_num; int ret; @@ -172,17 +253,23 @@ uct_rc_mlx5_get_cmd_qp(uct_rc_mlx5_iface_common_t *iface) port_num = ibdev->first_port; port_attr = uct_ib_device_port_attr(ibdev, port_num); + status = uct_ib_mlx5_iface_get_res_domain(&iface->super.super, + &iface->tm.cmd_wq.super.super); + if (status != UCS_OK) { + goto err; + } + qp_init_attr.qp_type = IBV_QPT_RC; qp_init_attr.send_cq = iface->super.super.cq[UCT_IB_DIR_RX]; qp_init_attr.recv_cq = iface->super.super.cq[UCT_IB_DIR_RX]; qp_init_attr.cap.max_send_sge = 1; - qp_init_attr.srq = iface->super.rx.srq.srq; + qp_init_attr.srq = iface->rx.srq.verbs.srq; qp_init_attr.cap.max_send_wr = iface->tm.cmd_qp_len; qp = ibv_create_qp(md->pd, &qp_init_attr); if (qp == NULL) { ucs_error("failed to create TM control QP: %m"); - goto err; + goto err_rd; } @@ -203,7 +290,7 @@ uct_rc_mlx5_get_cmd_qp(uct_rc_mlx5_iface_common_t *iface) qp_attr.ah_attr.port_num = port_num; qp_attr.ah_attr.dlid = port_attr->lid; qp_attr.ah_attr.is_global = 1; - qp_attr.ah_attr.grh.dgid = iface->super.super.gid; + qp_attr.ah_attr.grh.dgid = iface->super.super.gid_info.gid; ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER); @@ -223,44 +310,55 @@ uct_rc_mlx5_get_cmd_qp(uct_rc_mlx5_iface_common_t *iface) goto err_destroy_qp; } - iface->tm.cmd_qp = qp; + iface->tm.cmd_wq.super.super.verbs.qp = qp; return qp; err_destroy_qp: - ibv_destroy_qp(qp); + uct_ib_destroy_qp(qp); +err_rd: + 
uct_ib_mlx5_iface_put_res_domain(&iface->tm.cmd_wq.super.super); err: return NULL; +} + +static ucs_status_t +uct_rc_mlx5_get_cmd_qp(uct_rc_mlx5_iface_common_t *iface) +{ + struct ibv_qp *qp; +#ifdef HAVE_STRUCT_MLX5_SRQ_CMD_QP + iface->tm.cmd_wq.super.super.verbs.qp = NULL; + iface->tm.cmd_wq.super.super.verbs.rd = NULL; + iface->tm.cmd_wq.super.super.type = UCT_IB_MLX5_OBJ_TYPE_LAST; + qp = uct_dv_get_cmd_qp(iface->rx.srq.verbs.srq); +#else + if (iface->rx.srq.type == UCT_IB_MLX5_OBJ_TYPE_DEVX) { + return uct_rc_mlx5_devx_create_cmd_qp(iface); + } else { + qp = uct_rc_mlx5_verbs_create_cmd_qp(iface); + } #endif + iface->tm.cmd_wq.super.super.qp_num = qp->qp_num; + return uct_ib_mlx5_txwq_init(iface->super.super.super.worker, + iface->tx.mmio_mode, + &iface->tm.cmd_wq.super, qp); } #endif -ucs_status_t -uct_rc_mlx5_iface_common_tag_init(uct_rc_mlx5_iface_common_t *iface, - uct_rc_mlx5_iface_common_config_t *config) +ucs_status_t uct_rc_mlx5_iface_common_tag_init(uct_rc_mlx5_iface_common_t *iface) { ucs_status_t status = UCS_OK; #if IBV_HW_TM - struct ibv_qp *cmd_qp; int i; if (!UCT_RC_MLX5_TM_ENABLED(iface)) { return UCS_OK; } - cmd_qp = uct_rc_mlx5_get_cmd_qp(iface); - if (!cmd_qp) { - status = UCS_ERR_NO_DEVICE; - goto err_tag_cleanup; - } - - status = uct_ib_mlx5_txwq_init(iface->super.super.super.worker, - iface->tx.mmio_mode, - &iface->tm.cmd_wq.super, cmd_qp); + status = uct_rc_mlx5_get_cmd_qp(iface); if (status != UCS_OK) { goto err_tag_cleanup; } - iface->tm.cmd_wq.qp_num = cmd_qp->qp_num; iface->tm.cmd_wq.ops_mask = iface->tm.cmd_qp_len - 1; iface->tm.cmd_wq.ops_head = iface->tm.cmd_wq.ops_tail = 0; iface->tm.cmd_wq.ops = ucs_calloc(iface->tm.cmd_qp_len, @@ -307,19 +405,136 @@ uct_rc_mlx5_iface_common_tag_init(uct_rc_mlx5_iface_common_t *iface, void uct_rc_mlx5_iface_common_tag_cleanup(uct_rc_mlx5_iface_common_t *iface) { - if (UCT_RC_MLX5_TM_ENABLED(iface)) { - if (iface->tm.cmd_qp) { - ibv_destroy_qp(iface->tm.cmd_qp); + uct_ib_mlx5_md_t *md = 
ucs_derived_of(iface->super.super.super.md, + uct_ib_mlx5_md_t); + uct_rc_mlx5_mp_hash_key_t key_gid; + uint64_t key_lid; + void *recv_buffer; + + if (!UCT_RC_MLX5_TM_ENABLED(iface)) { + return; + } + + uct_ib_mlx5_destroy_qp(md, &iface->tm.cmd_wq.super.super); + uct_ib_mlx5_txwq_cleanup(&iface->tm.cmd_wq.super); + ucs_free(iface->tm.list); + ucs_free(iface->tm.cmd_wq.ops); + uct_rc_mlx5_tag_cleanup(iface); + + kh_foreach_key(&iface->tm.tag_addrs, recv_buffer, { + ucs_debug("destroying iface %p, with recv buffer %p offloaded to the HW", + iface, recv_buffer); + }); + kh_destroy_inplace(uct_rc_mlx5_tag_addrs, &iface->tm.tag_addrs); + + if (!UCT_RC_MLX5_MP_ENABLED(iface)) { + return; + } + + kh_foreach_key(&iface->tm.mp.hash_lid, key_lid, { + ucs_debug("destroying iface %p with partially received rx msg (key: %lu)", + iface, key_lid); + }); + kh_destroy_inplace(uct_rc_mlx5_mp_hash_lid, &iface->tm.mp.hash_lid); + + kh_foreach_key(&iface->tm.mp.hash_gid, key_gid, { + ucs_debug("destroying iface %p with partially received rx msg (key: %lu-%u)", + iface, key_gid.guid, key_gid.qp_num); + }); + kh_destroy_inplace(uct_rc_mlx5_mp_hash_gid, &iface->tm.mp.hash_gid); + + ucs_mpool_cleanup(&iface->tm.mp.tx_mp, 1); +} + +void uct_rc_mlx5_iface_fill_attr(uct_rc_mlx5_iface_common_t *iface, + uct_ib_mlx5_qp_attr_t *qp_attr, + unsigned max_send_wr, + uct_ib_mlx5_srq_t *srq) +{ + switch (srq->type) { + case UCT_IB_MLX5_OBJ_TYPE_VERBS: + uct_rc_iface_fill_attr(&iface->super, &qp_attr->super, max_send_wr, + srq->verbs.srq); + break; + case UCT_IB_MLX5_OBJ_TYPE_DEVX: + uct_rc_iface_fill_attr(&iface->super, &qp_attr->super, max_send_wr, NULL); + break; + case UCT_IB_MLX5_OBJ_TYPE_LAST: + break; + } + + qp_attr->super.srq_num = srq->srq_num; +} + +static ucs_status_t +uct_rc_mlx5_iface_check_no_devx_rx(uct_rc_mlx5_iface_common_t *iface) +{ + if (iface->config.cyclic_srq_enable == UCS_YES) { + ucs_error(UCT_IB_IFACE_FMT ": cyclic SRQ type is not supported", + 
UCT_IB_IFACE_ARG(&iface->super.super)); + return UCS_ERR_UNSUPPORTED; + } + + return UCS_OK; +} + +ucs_status_t +uct_rc_mlx5_common_iface_init_rx(uct_rc_mlx5_iface_common_t *iface, + const uct_rc_iface_common_config_t *rc_config) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.md, uct_ib_mlx5_md_t); + ucs_status_t status; + + status = uct_rc_mlx5_iface_check_no_devx_rx(iface); + if (status != UCS_OK) { + return status; + } + + status = uct_rc_iface_init_rx(&iface->super, rc_config, + &iface->rx.srq.verbs.srq); + if (status != UCS_OK) { + goto err; + } + + status = uct_ib_mlx5_verbs_srq_init(&iface->rx.srq, iface->rx.srq.verbs.srq, + iface->super.super.config.seg_size, + iface->tm.mp.num_strides); + if (status != UCS_OK) { + goto err_free_srq; + } + + iface->rx.srq.type = UCT_IB_MLX5_OBJ_TYPE_VERBS; + return UCS_OK; + +err_free_srq: + uct_rc_mlx5_destroy_srq(md, &iface->rx.srq); +err: + return status; +} + +void uct_rc_mlx5_destroy_srq(uct_ib_mlx5_md_t *md, uct_ib_mlx5_srq_t *srq) +{ + int UCS_V_UNUSED ret; + + switch (srq->type) { + case UCT_IB_MLX5_OBJ_TYPE_VERBS: + uct_ib_destroy_srq(srq->verbs.srq); + break; + case UCT_IB_MLX5_OBJ_TYPE_DEVX: +#if HAVE_DEVX + ret = mlx5dv_devx_obj_destroy(srq->devx.obj); + if (ret) { + ucs_warn("mlx5dv_devx_obj_destroy(SRQ) failed: %m"); } - uct_ib_mlx5_txwq_cleanup(&iface->tm.cmd_wq.super); - ucs_free(iface->tm.list); - ucs_free(iface->tm.cmd_wq.ops); - uct_rc_mlx5_tag_cleanup(iface); + uct_rc_mlx5_devx_cleanup_srq(md, srq); +#endif + break; + case UCT_IB_MLX5_OBJ_TYPE_LAST: + break; } } -#if IBV_HW_TM -static void uct_rc_mlx5_release_desc(uct_recv_desc_t *self, void *desc) +void uct_rc_mlx5_release_desc(uct_recv_desc_t *self, void *desc) { uct_rc_mlx5_release_desc_t *release = ucs_derived_of(self, uct_rc_mlx5_release_desc_t); @@ -327,11 +542,13 @@ static void uct_rc_mlx5_release_desc(uct_recv_desc_t *self, void *desc) ucs_mpool_put_inline(ib_desc); } +#if IBV_HW_TM /* tag is passed as parameter, because some 
(but not all!) transports may need * to translate TMH to LE */ -ucs_status_t uct_rc_mlx5_handle_rndv(uct_rc_mlx5_iface_common_t *iface, - struct ibv_tmh *tmh, uct_tag_t tag, - unsigned byte_len) +void uct_rc_mlx5_handle_unexp_rndv(uct_rc_mlx5_iface_common_t *iface, + struct ibv_tmh *tmh, uct_tag_t tag, + struct mlx5_cqe64 *cqe, unsigned flags, + unsigned byte_len) { uct_rc_mlx5_tmh_priv_data_t *priv = (uct_rc_mlx5_tmh_priv_data_t*)tmh->reserved; uct_ib_md_t *ib_md = uct_ib_iface_md(&iface->super.super); @@ -341,7 +558,8 @@ ucs_status_t uct_rc_mlx5_handle_rndv(uct_rc_mlx5_iface_common_t *iface, size_t rndv_data_len; void *rndv_usr_hdr; void *rb; - char packed_rkey[UCT_MD_COMPONENT_NAME_MAX + UCT_IB_MD_PACKED_RKEY_SIZE]; + ucs_status_t status; + char packed_rkey[UCT_COMPONENT_NAME_MAX + UCT_IB_MD_PACKED_RKEY_SIZE]; rvh = (struct ibv_rvh*)(tmh + 1); @@ -359,6 +577,9 @@ ucs_status_t uct_rc_mlx5_handle_rndv(uct_rc_mlx5_iface_common_t *iface, will be overwritten. That's why we saved rvh->length before. 
*/ ucs_assert(priv->length <= UCT_RC_MLX5_TMH_PRIV_LEN); + /* When MP XRQ is configured, RTS is always a single fragment message */ + ucs_assert(UCT_RC_MLX5_SINGLE_FRAG_MSG(flags)); + memcpy((char*)rndv_usr_hdr - priv->length, &priv->data, priv->length); /* Create "packed" rkey to pass it in the callback */ @@ -366,15 +587,21 @@ ucs_status_t uct_rc_mlx5_handle_rndv(uct_rc_mlx5_iface_common_t *iface, uct_ib_md_pack_rkey(ntohl(rvh->rkey), UCT_IB_INVALID_RKEY, rb); /* Do not pass flags to cb, because rkey is allocated on stack */ - return iface->tm.rndv_unexp.cb(iface->tm.rndv_unexp.arg, 0, tag, + status = iface->tm.rndv_unexp.cb(iface->tm.rndv_unexp.arg, 0, tag, (char *)rndv_usr_hdr - priv->length, rndv_usr_hdr_len + priv->length, be64toh(rvh->va), rndv_data_len, packed_rkey); + + uct_rc_mlx5_iface_unexp_consumed(iface, iface->tm.rndv_desc.offset, + &iface->tm.rndv_desc.super, cqe, + status, ntohs(cqe->wqe_counter)); + + UCT_RC_MLX5_TM_STAT(iface, RX_RNDV_UNEXP); } #endif -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM static ucs_status_t uct_rc_mlx5_iface_common_dm_mpool_chunk_malloc(ucs_mpool_t *mp, size_t *size_p, void **chunk_p) { @@ -424,9 +651,10 @@ uct_rc_mlx5_iface_common_dm_tl_init(uct_mlx5_dm_data_t *data, uct_rc_iface_t *iface, const uct_ib_mlx5_iface_config_t *config) { + struct ibv_alloc_dm_attr dm_attr = {}; + struct mlx5dv_dm dvdm = {}; + uct_ib_mlx5dv_t obj = {}; ucs_status_t status; - struct ibv_exp_alloc_dm_attr dm_attr; - struct ibv_exp_reg_mr_in mr_in; data->seg_len = ucs_min(ucs_align_up(config->dm.seg_len, sizeof(uct_rc_mlx5_dm_copy_data_t)), @@ -437,28 +665,32 @@ uct_rc_mlx5_iface_common_dm_tl_init(uct_mlx5_dm_data_t *data, dm_attr.length = data->seg_len * data->seg_count; dm_attr.comp_mask = 0; - data->dm = ibv_exp_alloc_dm(data->device->ibv_context, &dm_attr); + data->dm = ibv_alloc_dm(data->device->ibv_context, &dm_attr); if (data->dm == NULL) { /* TODO: prompt warning? 
*/ - ucs_debug("ibv_exp_alloc_dm(dev=%s length=%zu) failed: %m", + ucs_debug("ibv_alloc_dm(dev=%s length=%zu) failed: %m", uct_ib_device_name(data->device), dm_attr.length); return UCS_ERR_NO_RESOURCE; } - memset(&mr_in, 0, sizeof(mr_in)); - mr_in.pd = uct_ib_iface_md(&iface->super)->pd; - mr_in.comp_mask = IBV_EXP_REG_MR_DM; - mr_in.dm = data->dm; - mr_in.length = dm_attr.length; - data->mr = ibv_exp_reg_mr(&mr_in); + data->mr = ibv_reg_dm_mr(uct_ib_iface_md(&iface->super)->pd, + data->dm, 0, dm_attr.length, + IBV_ACCESS_ZERO_BASED | + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_ATOMIC); if (data->mr == NULL) { - ucs_warn("ibv_exp_reg_mr() error - On Device Memory registration failed, %d %m", errno); + ucs_warn("ibv_reg_mr_dm() error - On Device Memory registration failed, %d %m", errno); status = UCS_ERR_NO_RESOURCE; goto failed_mr; } - data->start_va = ((uct_mlx5_dm_va_t*)data->dm)->start_va; + UCT_IB_MLX5_DV_DM(obj).in = data->dm; + UCT_IB_MLX5_DV_DM(obj).out = &dvdm; + uct_ib_mlx5dv_init_obj(&obj, MLX5DV_OBJ_DM); + data->start_va = dvdm.buf; status = ucs_mpool_init(&data->mp, 0, sizeof(uct_rc_iface_send_desc_t), 0, UCS_SYS_CACHE_LINE_SIZE, @@ -475,7 +707,7 @@ uct_rc_mlx5_iface_common_dm_tl_init(uct_mlx5_dm_data_t *data, failed_mpool: ibv_dereg_mr(data->mr); failed_mr: - ibv_exp_free_dm(data->dm); + ibv_free_dm(data->dm); data->dm = NULL; return status; } @@ -487,26 +719,59 @@ static void uct_rc_mlx5_iface_common_dm_tl_cleanup(uct_mlx5_dm_data_t *data) ucs_mpool_cleanup(&data->mp, 1); ibv_dereg_mr(data->mr); - ibv_exp_free_dm(data->dm); + ibv_free_dm(data->dm); } #endif #if IBV_HW_TM -ucs_status_t uct_rc_mlx5_init_rx_tm(uct_rc_mlx5_iface_common_t *iface, - const uct_rc_mlx5_iface_common_config_t *config, - struct ibv_exp_create_srq_attr *srq_init_attr, - unsigned rndv_hdr_len, - unsigned max_cancel_sync_ops) + +void uct_rc_mlx5_init_rx_tm_common(uct_rc_mlx5_iface_common_t *iface, + const 
uct_rc_iface_common_config_t *config, + unsigned rndv_hdr_len) { uct_ib_md_t *md = uct_ib_iface_md(&iface->super.super); unsigned tmh_hdrs_len = sizeof(struct ibv_tmh) + rndv_hdr_len; + ucs_status_t status; iface->tm.eager_desc.super.cb = uct_rc_mlx5_release_desc; - iface->tm.eager_desc.offset = sizeof(struct ibv_tmh) - - sizeof(uct_rc_mlx5_hdr_t) - + iface->super.super.config.rx_headroom_offset; - iface->tm.rndv_desc.super.cb = uct_rc_mlx5_release_desc; + + if (UCT_RC_MLX5_MP_ENABLED(iface)) { + iface->tm.eager_desc.offset = sizeof(struct ibv_tmh) + + iface->super.super.config.rx_headroom_offset; + iface->tm.am_desc.offset = sizeof(uct_rc_mlx5_hdr_t) + + iface->super.super.config.rx_headroom_offset; + status = uct_iface_mpool_init(&iface->super.super.super, + &iface->tm.mp.tx_mp, + sizeof(uct_rc_iface_send_desc_t) + + iface->tm.max_bcopy, + sizeof(uct_rc_iface_send_desc_t), + UCS_SYS_CACHE_LINE_SIZE, + &config->super.tx.mp, + iface->super.config.tx_qp_len, + uct_rc_iface_send_desc_init, + "tag_eager_send_desc"); + if (status != UCS_OK) { + return; + } + + kh_init_inplace(uct_rc_mlx5_mp_hash_lid, &iface->tm.mp.hash_lid); + kh_init_inplace(uct_rc_mlx5_mp_hash_gid, &iface->tm.mp.hash_gid); + + iface->tm.bcopy_mp = &iface->tm.mp.tx_mp; + iface->tm.max_zcopy = uct_ib_iface_port_attr(&iface->super.super)->max_msg_sz; + + ucs_debug("MP WQ config: iface %p stride size %d, strides per WQE %d", + iface, iface->super.super.config.seg_size, + iface->tm.mp.num_strides); + } else { + iface->tm.eager_desc.offset = sizeof(struct ibv_tmh) - + sizeof(uct_rc_mlx5_hdr_t) + + iface->super.super.config.rx_headroom_offset; + iface->tm.bcopy_mp = &iface->super.tx.mp; + iface->tm.max_zcopy = iface->super.super.config.seg_size; + } + iface->tm.rndv_desc.offset = iface->tm.eager_desc.offset + rndv_hdr_len; ucs_assert(IBV_DEVICE_TM_CAPS(&md->dev, max_rndv_hdr_size) >= tmh_hdrs_len); @@ -516,71 +781,96 @@ ucs_status_t uct_rc_mlx5_init_rx_tm(uct_rc_mlx5_iface_common_t *iface, /* Init ptr array 
to store completions of RNDV operations. Index in * ptr_array is used as operation ID and is passed in "app_context" * of TM header. */ - ucs_ptr_array_init(&iface->tm.rndv_comps, 0, "rm_rndv_completions"); + ucs_ptr_array_init(&iface->tm.rndv_comps, "tm_rndv_completions"); + + /* Set of addresses posted to the HW. Used to avoid posting of the same + * address more than once. */ + kh_init_inplace(uct_rc_mlx5_tag_addrs, &iface->tm.tag_addrs); +} + +ucs_status_t uct_rc_mlx5_init_rx_tm(uct_rc_mlx5_iface_common_t *iface, + const uct_rc_iface_common_config_t *config, + struct ibv_srq_init_attr_ex *srq_attr, + unsigned rndv_hdr_len) +{ + uct_ib_md_t *md = uct_ib_iface_md(&iface->super.super); + ucs_status_t status; + status = uct_rc_mlx5_iface_check_no_devx_rx(iface); + if (status != UCS_OK) { + return status; + } + + uct_rc_mlx5_init_rx_tm_common(iface, config, rndv_hdr_len); + + ucs_assert(iface->tm.mp.num_strides == 1); /* MP XRQ is supported with DEVX only */ #if HAVE_DECL_IBV_EXP_CREATE_SRQ /* Create TM-capable XRQ */ - srq_init_attr->base.attr.max_sge = 1; - srq_init_attr->base.attr.max_wr = ucs_max(IBV_DEVICE_MIN_UWQ_POST, - config->super.super.rx.queue_len); - srq_init_attr->base.attr.srq_limit = 0; - srq_init_attr->base.srq_context = iface; - srq_init_attr->srq_type = IBV_EXP_SRQT_TAG_MATCHING; - srq_init_attr->pd = md->pd; - srq_init_attr->cq = iface->super.super.cq[UCT_IB_DIR_RX]; - srq_init_attr->tm_cap.max_num_tags = iface->tm.num_tags; - - /* 2 ops for each tag (ADD + DEL) and extra ops for SYNC. - * There can be up to "max_cancel_sync_ops" SYNC ops during cancellation. - * Also we assume that there can be up to two pending SYNC ops during - * unexpected messages flow. 
*/ - iface->tm.cmd_qp_len = (2 * iface->tm.num_tags) + max_cancel_sync_ops + 2; - srq_init_attr->tm_cap.max_ops = iface->tm.cmd_qp_len; - srq_init_attr->comp_mask |= IBV_EXP_CREATE_SRQ_CQ | + srq_attr->base.attr.max_sge = 1; + srq_attr->base.attr.max_wr = ucs_max(UCT_IB_MLX5_XRQ_MIN_UWQ_POST, + config->super.rx.queue_len); + srq_attr->base.attr.srq_limit = 0; + srq_attr->base.srq_context = iface; + srq_attr->srq_type = IBV_EXP_SRQT_TAG_MATCHING; + srq_attr->pd = md->pd; + srq_attr->cq = iface->super.super.cq[UCT_IB_DIR_RX]; + srq_attr->tm_cap.max_num_tags = iface->tm.num_tags; + + uct_rc_mlx5_iface_tm_set_cmd_qp_len(iface); + srq_attr->tm_cap.max_ops = iface->tm.cmd_qp_len; + srq_attr->comp_mask |= IBV_EXP_CREATE_SRQ_CQ | IBV_EXP_CREATE_SRQ_TM; - iface->super.rx.srq.srq = ibv_exp_create_srq(md->dev.ibv_context, srq_init_attr); - if (iface->super.rx.srq.srq == NULL) { + iface->rx.srq.verbs.srq = ibv_exp_create_srq(md->dev.ibv_context, srq_attr); + if (iface->rx.srq.verbs.srq == NULL) { ucs_error("ibv_exp_create_srq(device=%s) failed: %m", uct_ib_device_name(&md->dev)); return UCS_ERR_IO_ERROR; } - iface->super.rx.srq.quota = srq_init_attr->base.attr.max_wr; + iface->super.rx.srq.quota = srq_attr->base.attr.max_wr; #elif HAVE_DECL_IBV_CREATE_SRQ_EX - srq_init_attr->attr.max_sge = 1; - srq_init_attr->attr.max_wr = ucs_max(IBV_DEVICE_MIN_UWQ_POST, - config->super.super.rx.queue_len); - srq_init_attr->attr.srq_limit = 0; - srq_init_attr->srq_context = iface; - srq_init_attr->srq_type = IBV_SRQT_TM; - srq_init_attr->pd = md->pd; - srq_init_attr->cq = iface->super.super.cq[UCT_IB_DIR_RX]; - srq_init_attr->tm_cap.max_num_tags = iface->tm.num_tags; - - /* 2 ops for each tag (ADD + DEL) and extra ops for SYNC. - * There can be up to "max_cancel_sync_ops" SYNC ops during cancellation. - * Also we assume that there can be up to two pending SYNC ops during - * unexpected messages flow. 
*/ - iface->tm.cmd_qp_len = (2 * iface->tm.num_tags) + max_cancel_sync_ops + 2; - srq_init_attr->tm_cap.max_ops = iface->tm.cmd_qp_len; - srq_init_attr->comp_mask |= IBV_SRQ_INIT_ATTR_TYPE | + srq_attr->attr.max_sge = 1; + srq_attr->attr.max_wr = ucs_max(UCT_IB_MLX5_XRQ_MIN_UWQ_POST, + config->super.rx.queue_len); + srq_attr->attr.srq_limit = 0; + srq_attr->srq_context = iface; + srq_attr->srq_type = IBV_SRQT_TM; + srq_attr->pd = md->pd; + srq_attr->cq = iface->super.super.cq[UCT_IB_DIR_RX]; + srq_attr->tm_cap.max_num_tags = iface->tm.num_tags; + + uct_rc_mlx5_iface_tm_set_cmd_qp_len(iface); + srq_attr->tm_cap.max_ops = iface->tm.cmd_qp_len; + srq_attr->comp_mask |= IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_PD | IBV_SRQ_INIT_ATTR_CQ | IBV_SRQ_INIT_ATTR_TM; - iface->super.rx.srq.srq = ibv_create_srq_ex(md->dev.ibv_context, srq_init_attr); - if (iface->super.rx.srq.srq == NULL) { + iface->rx.srq.verbs.srq = ibv_create_srq_ex(md->dev.ibv_context, srq_attr); + if (iface->rx.srq.verbs.srq == NULL) { ucs_error("ibv_create_srq_ex(device=%s) failed: %m", uct_ib_device_name(&md->dev)); return UCS_ERR_IO_ERROR; } - iface->super.rx.srq.quota = srq_init_attr->attr.max_wr; + iface->super.rx.srq.quota = srq_attr->attr.max_wr; #endif + status = uct_ib_mlx5_verbs_srq_init(&iface->rx.srq, iface->rx.srq.verbs.srq, + iface->super.super.config.seg_size, + iface->tm.mp.num_strides); + if (status != UCS_OK) { + goto err_free_srq; + } + + iface->rx.srq.type = UCT_IB_MLX5_OBJ_TYPE_VERBS; ucs_debug("Tag Matching enabled: tag list size %d", iface->tm.num_tags); return UCS_OK; + +err_free_srq: + uct_ib_destroy_srq(iface->rx.srq.verbs.srq); + return status; } #endif @@ -596,11 +886,11 @@ void uct_rc_mlx5_tag_cleanup(uct_rc_mlx5_iface_common_t *iface) static void uct_rc_mlx5_tag_query(uct_rc_mlx5_iface_common_t *iface, uct_iface_attr_t *iface_attr, - size_t max_inline, size_t max_iov) + size_t max_inline, size_t max_tag_eager_iov) { #if IBV_HW_TM unsigned eager_hdr_size = sizeof(struct 
ibv_tmh); - struct ibv_exp_port_attr* port_attr; + struct ibv_port_attr* port_attr; if (!UCT_RC_MLX5_TM_ENABLED(iface)) { return; @@ -615,24 +905,21 @@ static void uct_rc_mlx5_tag_query(uct_rc_mlx5_iface_common_t *iface, iface_attr->cap.flags |= UCT_IFACE_FLAG_TAG_EAGER_SHORT; } - iface_attr->cap.tag.eager.max_bcopy = iface->super.super.config.seg_size - - eager_hdr_size; - iface_attr->cap.tag.eager.max_zcopy = iface->super.super.config.seg_size - - eager_hdr_size; - iface_attr->cap.tag.eager.max_iov = max_iov; - port_attr = uct_ib_iface_port_attr(&iface->super.super); - iface_attr->cap.tag.rndv.max_zcopy = port_attr->max_msg_sz; + iface_attr->cap.tag.rndv.max_zcopy = port_attr->max_msg_sz; /* TMH can carry 2 additional bytes of private data */ - iface_attr->cap.tag.rndv.max_hdr = iface->tm.max_rndv_data + - UCT_RC_MLX5_TMH_PRIV_LEN; - iface_attr->cap.tag.rndv.max_iov = 1; - + iface_attr->cap.tag.rndv.max_hdr = iface->tm.max_rndv_data + + UCT_RC_MLX5_TMH_PRIV_LEN; + iface_attr->cap.tag.rndv.max_iov = 1; iface_attr->cap.tag.recv.max_zcopy = port_attr->max_msg_sz; iface_attr->cap.tag.recv.max_iov = 1; - iface_attr->cap.tag.recv.min_recv = 0; + iface_attr->cap.tag.recv.min_recv = + iface->super.super.config.max_inl_cqe[UCT_IB_DIR_RX] + 1; iface_attr->cap.tag.recv.max_outstanding = iface->tm.num_tags; + iface_attr->cap.tag.eager.max_iov = max_tag_eager_iov; + iface_attr->cap.tag.eager.max_bcopy = iface->tm.max_bcopy - eager_hdr_size; + iface_attr->cap.tag.eager.max_zcopy = iface->tm.max_zcopy - eager_hdr_size; #endif } @@ -641,7 +928,7 @@ uct_rc_mlx5_iface_common_dm_init(uct_rc_mlx5_iface_common_t *iface, uct_rc_iface_t *rc_iface, const uct_ib_mlx5_iface_config_t *mlx5_config) { -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM if ((mlx5_config->dm.seg_len * mlx5_config->dm.count) == 0) { goto fallback; } @@ -668,7 +955,7 @@ uct_rc_mlx5_iface_common_dm_init(uct_rc_mlx5_iface_common_t *iface, void uct_rc_mlx5_iface_common_dm_cleanup(uct_rc_mlx5_iface_common_t *iface) { -#if 
HAVE_IBV_EXP_DM +#if HAVE_IBV_DM if (iface->dm.dm) { uct_worker_tl_data_put(iface->dm.dm, uct_rc_mlx5_iface_common_dm_tl_cleanup); } @@ -677,7 +964,7 @@ void uct_rc_mlx5_iface_common_dm_cleanup(uct_rc_mlx5_iface_common_t *iface) void uct_rc_mlx5_iface_common_query(uct_ib_iface_t *ib_iface, uct_iface_attr_t *iface_attr, - size_t max_inline, size_t av_size) + size_t max_inline, size_t max_tag_eager_iov) { uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(ib_iface, uct_rc_mlx5_iface_common_t); @@ -687,7 +974,7 @@ void uct_rc_mlx5_iface_common_query(uct_ib_iface_t *ib_iface, iface_attr->cap.flags |= UCT_IFACE_FLAG_ERRHANDLE_ZCOPY_BUF | UCT_IFACE_FLAG_ERRHANDLE_REMOTE_MEM; - if (dev->pci_fadd_arg_sizes || dev->pci_cswap_arg_sizes) { + if (uct_ib_device_has_pci_atomics(dev)) { if (dev->pci_fadd_arg_sizes & sizeof(uint64_t)) { iface_attr->cap.atomic64.op_flags |= UCS_BIT(UCT_ATOMIC_OP_ADD); iface_attr->cap.atomic64.fop_flags |= UCS_BIT(UCT_ATOMIC_OP_ADD); @@ -695,6 +982,7 @@ void uct_rc_mlx5_iface_common_query(uct_ib_iface_t *ib_iface, if (dev->pci_cswap_arg_sizes & sizeof(uint64_t)) { iface_attr->cap.atomic64.fop_flags |= UCS_BIT(UCT_ATOMIC_OP_CSWAP); } + if (dev->pci_fadd_arg_sizes & sizeof(uint32_t)) { iface_attr->cap.atomic32.op_flags |= UCS_BIT(UCT_ATOMIC_OP_ADD); iface_attr->cap.atomic32.fop_flags |= UCS_BIT(UCT_ATOMIC_OP_ADD); @@ -739,11 +1027,10 @@ void uct_rc_mlx5_iface_common_query(uct_ib_iface_t *ib_iface, } /* Software overhead */ - iface_attr->overhead = 40e-9; + iface_attr->overhead = 40e-9; /* Tag Offload */ - uct_rc_mlx5_tag_query(iface, iface_attr, max_inline, - UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(av_size)); + uct_rc_mlx5_tag_query(iface, iface_attr, max_inline, max_tag_eager_iov); } void uct_rc_mlx5_iface_common_update_cqs_ci(uct_rc_mlx5_iface_common_t *iface, @@ -809,7 +1096,8 @@ int uct_rc_mlx5_iface_commom_clean(uct_ib_mlx5_cq_t *mlx5_cq, } else if (nfreed) { dest = uct_ib_mlx5_get_cqe(mlx5_cq, pi + nfreed); owner_bit = dest->op_own & 
MLX5_CQE_OWNER_MASK; - memcpy((void*)(dest + 1) - cqe_sz, (void*)(cqe + 1) - cqe_sz, cqe_sz); + memcpy(UCS_PTR_BYTE_OFFSET(dest + 1, -cqe_sz), + UCS_PTR_BYTE_OFFSET(cqe + 1, -cqe_sz), cqe_sz); dest->op_own = (dest->op_own & ~MLX5_CQE_OWNER_MASK) | owner_bit; } } @@ -819,30 +1107,3 @@ int uct_rc_mlx5_iface_commom_clean(uct_ib_mlx5_cq_t *mlx5_cq, return nfreed; } -ucs_status_t uct_rc_mlx5_iface_fence(uct_iface_h tl_iface, unsigned flags) -{ - uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_iface, uct_rc_mlx5_iface_common_t); - uct_ib_md_t *md = uct_ib_iface_md(&iface->super.super); - - if (md->dev.pci_fadd_arg_sizes || md->dev.pci_cswap_arg_sizes) { - iface->tx.next_fm = UCT_IB_MLX5_WQE_CTRL_FENCE_ATOMIC; - iface->tx.fence_beat++; - } - - UCT_TL_IFACE_STAT_FENCE(&iface->super.super.super); - return UCS_OK; -} - -ucs_status_t uct_rc_mlx5_init_res_domain(uct_ib_iface_t *ib_iface) -{ - uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(ib_iface, uct_rc_mlx5_iface_common_t); - - return uct_ib_mlx5_iface_init_res_domain(ib_iface, &iface->mlx5_common); -} - -void uct_rc_mlx5_cleanup_res_domain(uct_ib_iface_t *ib_iface) -{ - uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(ib_iface, uct_rc_mlx5_iface_common_t); - - uct_ib_mlx5_iface_cleanup_res_domain(&iface->mlx5_common); -} diff --git a/src/uct/ib/rc/accel/rc_mlx5_common.h b/src/uct/ib/rc/accel/rc_mlx5_common.h index dc1ab84f216..40bd7a791aa 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_common.h +++ b/src/uct/ib/rc/accel/rc_mlx5_common.h @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -13,9 +13,50 @@ #include -#define UCT_RC_MLX5_OPCODE_FLAG_RAW 0x100 -#define UCT_RC_MLX5_OPCODE_FLAG_TM 0x200 -#define UCT_RC_MLX5_OPCODE_MASK 0xff +/* + * HW tag matching + */ +#if IBV_HW_TM +# define UCT_RC_RNDV_HDR_LEN sizeof(struct ibv_rvh) +#else +# define UCT_RC_RNDV_HDR_LEN 0 +#endif + +#if IBV_HW_TM +# if HAVE_INFINIBAND_TM_TYPES_H +# include +# else +# define ibv_tmh ibv_exp_tmh +# define ibv_rvh ibv_exp_tmh_rvh +# define IBV_TM_CAP_RC IBV_EXP_TM_CAP_RC +# define IBV_TMH_EAGER IBV_EXP_TMH_EAGER +# define IBV_TMH_RNDV IBV_EXP_TMH_RNDV +# define IBV_TMH_FIN IBV_EXP_TMH_FIN +# define IBV_TMH_NO_TAG IBV_EXP_TMH_NO_TAG +# endif +# define IBV_DEVICE_TM_CAPS(_dev, _field) ((_dev)->dev_attr.tm_caps._field) +#else +# define IBV_TM_CAP_RC 0 +# define IBV_DEVICE_TM_CAPS(_dev, _field) 0 +#endif + +#if HAVE_STRUCT_IBV_TM_CAPS_FLAGS +# define IBV_DEVICE_TM_FLAGS(_dev) IBV_DEVICE_TM_CAPS(_dev, flags) +#else +# define IBV_DEVICE_TM_FLAGS(_dev) IBV_DEVICE_TM_CAPS(_dev, capability_flags) +#endif + +#define IBV_DEVICE_MAX_UNEXP_COUNT UCS_BIT(14) + +#if HAVE_DECL_IBV_EXP_CREATE_SRQ +# define ibv_srq_init_attr_ex ibv_exp_create_srq_attr +#endif + +#define UCT_RC_MLX5_OPCODE_FLAG_RAW 0x100 +#define UCT_RC_MLX5_OPCODE_FLAG_TM 0x200 +#define UCT_RC_MLX5_OPCODE_MASK 0xff +#define UCT_RC_MLX5_SINGLE_FRAG_MSG(_flags) \ + (((_flags) & UCT_CB_PARAM_FLAG_FIRST) && !((_flags) & UCT_CB_PARAM_FLAG_MORE)) #define UCT_RC_MLX5_CHECK_AM_ZCOPY(_id, _header_length, _length, _seg_size, _av_size) \ UCT_CHECK_AM_ID(_id); \ @@ -76,37 +117,64 @@ enum { /* TODO: Remove/replace this enum when mlx5dv.h is included */ enum { - UCT_RC_MLX5_OPCODE_TAG_MATCHING = 0x28, - UCT_RC_MLX5_CQE_APP_TAG_MATCHING = 1, + UCT_RC_MLX5_OPCODE_TAG_MATCHING = 0x28, + UCT_RC_MLX5_CQE_APP_TAG_MATCHING = 1, + + /* last packet flag for multi-packet RQs */ + UCT_RC_MLX5_MP_RQ_LAST_MSG_FIELD = 0x40000000, + + /* byte count mask for multi-packet RQs */ + UCT_RC_MLX5_MP_RQ_BYTE_CNT_FIELD_MASK = 0x0000FFFF, + + 
UCT_RC_MLX5_MP_RQ_NUM_STRIDES_FIELD_MASK = 0x3FFF0000, + + /* filler cqe indicator */ + UCT_RC_MLX5_MP_RQ_FILLER_CQE = UCS_BIT(31), /* tag segment flags */ - UCT_RC_MLX5_SRQ_FLAG_TM_SW_CNT = (1 << 6), - UCT_RC_MLX5_SRQ_FLAG_TM_CQE_REQ = (1 << 7), + UCT_RC_MLX5_SRQ_FLAG_TM_SW_CNT = (1 << 6), + UCT_RC_MLX5_SRQ_FLAG_TM_CQE_REQ = (1 << 7), /* tag CQE codes */ - UCT_RC_MLX5_CQE_APP_OP_TM_CONSUMED = 0x1, - UCT_RC_MLX5_CQE_APP_OP_TM_EXPECTED = 0x2, - UCT_RC_MLX5_CQE_APP_OP_TM_UNEXPECTED = 0x3, - UCT_RC_MLX5_CQE_APP_OP_TM_NO_TAG = 0x4, - UCT_RC_MLX5_CQE_APP_OP_TM_APPEND = 0x5, - UCT_RC_MLX5_CQE_APP_OP_TM_REMOVE = 0x6, - UCT_RC_MLX5_CQE_APP_OP_TM_CONSUMED_MSG = 0xA + UCT_RC_MLX5_CQE_APP_OP_TM_CONSUMED = 0x1, + UCT_RC_MLX5_CQE_APP_OP_TM_EXPECTED = 0x2, + UCT_RC_MLX5_CQE_APP_OP_TM_UNEXPECTED = 0x3, + UCT_RC_MLX5_CQE_APP_OP_TM_NO_TAG = 0x4, + UCT_RC_MLX5_CQE_APP_OP_TM_APPEND = 0x5, + UCT_RC_MLX5_CQE_APP_OP_TM_REMOVE = 0x6, + UCT_RC_MLX5_CQE_APP_OP_TM_CONSUMED_MSG = 0xA }; +enum { + UCT_RC_MLX5_POLL_FLAG_TM = UCS_BIT(0), + UCT_RC_MLX5_POLL_FLAG_HAS_EP = UCS_BIT(1), + UCT_RC_MLX5_POLL_FLAG_TAG_CQE = UCS_BIT(2) +}; + + +#define UCT_RC_MLX5_RMA_MAX_IOV(_av_size) \ + ((UCT_IB_MLX5_MAX_SEND_WQE_SIZE - ((_av_size) + \ + sizeof(struct mlx5_wqe_raddr_seg) + sizeof(struct mlx5_wqe_ctrl_seg))) / \ + sizeof(struct mlx5_wqe_data_seg)) + + #if IBV_HW_TM # define UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(_av_size) \ (UCT_IB_MLX5_AM_MAX_SHORT(_av_size + sizeof(struct ibv_tmh))/ \ sizeof(struct mlx5_wqe_data_seg)) -# else +#else # define UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(_av_size) 0 #endif /* IBV_HW_TM */ + #define UCT_RC_MLX5_TM_CQE_WITH_IMM(_cqe64) \ (((_cqe64)->op_own >> 4) == MLX5_CQE_RESP_SEND_IMM) + #define UCT_RC_MLX5_TM_IS_SW_RNDV(_cqe64, _imm_data) \ (ucs_unlikely(UCT_RC_MLX5_TM_CQE_WITH_IMM(_cqe64) && !(_imm_data))) + #define UCT_RC_MLX5_CHECK_TAG(_mlx5_common_iface) \ if (ucs_unlikely((_mlx5_common_iface)->tm.head->next == NULL)) { \ return UCS_ERR_EXCEEDS_LIMIT; \ @@ -157,7 +225,6 @@ 
typedef struct uct_rc_mlx5_srq_op { /* Command QP work-queue. All tag matching list operations are posted on it. */ typedef struct uct_rc_mlx5_cmd_wq { uct_ib_mlx5_txwq_t super; - uint32_t qp_num; /* command QP num */ uct_rc_mlx5_srq_op_t *ops; /* array of operations on command QP */ int ops_head; /* points to the next operation to be completed */ int ops_tail; /* points to the last adde operation*/ @@ -165,6 +232,62 @@ typedef struct uct_rc_mlx5_cmd_wq { ops array size */ } uct_rc_mlx5_cmd_wq_t; + +/* Message context used with multi-packet XRQ */ +typedef struct uct_rc_mlx5_mp_context { + /* Storage for a per-message user-defined context. Must be passed unchanged + * to the user in uct_tag_unexp_eager_cb_t. */ + void *context; + + /* Tag is saved when first fragment (with TMH) arrives and then passed to + * the eager unexpected callback for subsequent fragments. */ + uct_tag_t tag; + + /* With MP XRQ immediate value is delivered with the last fragment, while + * TMH is present in the first fragment only. Need to save app_context + * from TMH in this field and construct immediate data for unexpected + * eager callback when the last message fragment arrives. */ + uint32_t app_ctx; + + /* Used when local EP can be found by sender QP number (rc_mlx5 tl). + * When 0, it means that tag eager unexpected multi-fragmented message + * is being processed (not all fragments are delivered to the user via + * uct_tag_unexp_eager_cb_t callback yet). Otherwise, any incoming tag + * eager message should be either a single fragment message or the first + * fragment of multi-fragmeneted message. 
*/ + uint8_t free; +} uct_rc_mlx5_mp_context_t; + + +typedef struct uct_rc_mlx5_mp_hash_key { + uint64_t guid; + uint32_t qp_num; +} uct_rc_mlx5_mp_hash_key_t; + + +static UCS_F_ALWAYS_INLINE int +uct_rc_mlx5_mp_hash_equal(uct_rc_mlx5_mp_hash_key_t key1, + uct_rc_mlx5_mp_hash_key_t key2) +{ + return (key1.qp_num == key2.qp_num) && (key1.guid == key2.guid); +} + + +static UCS_F_ALWAYS_INLINE khint32_t +uct_rc_mlx5_mp_hash_func(uct_rc_mlx5_mp_hash_key_t key) +{ + return kh_int64_hash_func(key.guid ^ key.qp_num); +} + + +KHASH_MAP_INIT_INT64(uct_rc_mlx5_mp_hash_lid, uct_rc_mlx5_mp_context_t); + + +KHASH_INIT(uct_rc_mlx5_mp_hash_gid, uct_rc_mlx5_mp_hash_key_t, + uct_rc_mlx5_mp_context_t, 1, uct_rc_mlx5_mp_hash_func, + uct_rc_mlx5_mp_hash_equal); + + #if IBV_HW_TM # define UCT_RC_MLX5_IFACE_GET_TM_BCOPY_DESC(_iface, _mp, _desc, _tag, _app_ctx, \ _pack_cb, _arg, _length) \ @@ -174,7 +297,7 @@ typedef struct uct_rc_mlx5_cmd_wq { (_desc)->super.handler = (uct_rc_send_handler_t)ucs_mpool_put; \ hdr = (_desc) + 1; \ uct_rc_mlx5_fill_tmh(hdr, _tag, _app_ctx, IBV_TMH_EAGER); \ - hdr += sizeof(struct ibv_tmh); \ + hdr = UCS_PTR_BYTE_OFFSET(hdr, sizeof(struct ibv_tmh)); \ _length = _pack_cb(hdr, _arg); \ } #endif @@ -197,6 +320,7 @@ typedef struct uct_rc_mlx5_tmh_priv_data { uint16_t data; } UCS_S_PACKED uct_rc_mlx5_tmh_priv_data_t; +void uct_rc_mlx5_release_desc(uct_recv_desc_t *self, void *desc); typedef struct uct_rc_mlx5_release_desc { uct_recv_desc_t super; @@ -212,12 +336,12 @@ typedef struct uct_rc_mlx5_ctx_priv { uint32_t tag_handle; } uct_rc_mlx5_ctx_priv_t; -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM typedef struct uct_mlx5_dm_data { uct_worker_tl_data_t super; ucs_mpool_t mp; struct ibv_mr *mr; - struct ibv_exp_dm *dm; + struct ibv_dm *dm; void *start_va; size_t seg_len; unsigned seg_count; @@ -232,81 +356,104 @@ typedef union uct_rc_mlx5_dm_copy_data { } UCS_S_PACKED uct_rc_mlx5_dm_copy_data_t; #endif +#define uct_rc_mlx5_tag_addr_hash(_ptr) 
kh_int64_hash_func((uintptr_t)(_ptr)) +KHASH_INIT(uct_rc_mlx5_tag_addrs, void*, char, 0, uct_rc_mlx5_tag_addr_hash, + kh_int64_hash_equal) + typedef struct uct_rc_mlx5_iface_common { - uct_rc_iface_t super; - uct_ib_mlx5_iface_common_t mlx5_common; + uct_rc_iface_t super; struct { - ucs_mpool_t atomic_desc_mp; - uct_ib_mlx5_mmio_mode_t mmio_mode; - uint16_t bb_max; /* limit number of outstanding WQE BBs */ - uint16_t fence_beat; /* 16bit is enough because if it wraps around, - * it means the older ops are already completed - * because QP size is less than 64k */ - uint8_t next_fm; + ucs_mpool_t atomic_desc_mp; + uct_ib_mlx5_mmio_mode_t mmio_mode; + uint16_t bb_max; /* limit number of outstanding WQE BBs */ } tx; struct { - uct_ib_mlx5_srq_t srq; - void *pref_ptr; + uct_ib_mlx5_srq_t srq; + void *pref_ptr; } rx; - uct_ib_mlx5_cq_t cq[UCT_IB_DIR_NUM]; + uct_ib_mlx5_cq_t cq[UCT_IB_DIR_NUM]; struct { - struct ibv_qp *cmd_qp; /* set if QP was created by UCX */ - uct_rc_mlx5_cmd_wq_t cmd_wq; - uct_rc_mlx5_tag_entry_t *head; - uct_rc_mlx5_tag_entry_t *tail; - uct_rc_mlx5_tag_entry_t *list; - - ucs_ptr_array_t rndv_comps; - unsigned num_tags; - unsigned num_outstanding; - unsigned max_rndv_data; - uint16_t unexpected_cnt; - uint16_t cmd_qp_len; - uint8_t enabled; + uct_rc_mlx5_cmd_wq_t cmd_wq; + uct_rc_mlx5_tag_entry_t *head; + uct_rc_mlx5_tag_entry_t *tail; + uct_rc_mlx5_tag_entry_t *list; + ucs_mpool_t *bcopy_mp; + khash_t(uct_rc_mlx5_tag_addrs) tag_addrs; + + ucs_ptr_array_t rndv_comps; + size_t max_bcopy; + size_t max_zcopy; + unsigned num_tags; + unsigned num_outstanding; + unsigned max_rndv_data; + uint16_t unexpected_cnt; + uint16_t cmd_qp_len; + uint8_t enabled; + struct { + uint8_t num_strides; + ucs_mpool_t tx_mp; + uct_rc_mlx5_mp_context_t last_frag_ctx; + khash_t(uct_rc_mlx5_mp_hash_lid) hash_lid; + khash_t(uct_rc_mlx5_mp_hash_gid) hash_gid; + } mp; struct { - void *arg; /* User defined arg */ - uct_tag_unexp_eager_cb_t cb; /* Callback for unexpected eager 
messages */ + void *arg; /* User defined arg */ + uct_tag_unexp_eager_cb_t cb; /* Callback for unexpected eager messages */ } eager_unexp; struct { - void *arg; /* User defined arg */ - uct_tag_unexp_rndv_cb_t cb; /* Callback for unexpected rndv messages */ + void *arg; /* User defined arg */ + uct_tag_unexp_rndv_cb_t cb; /* Callback for unexpected rndv messages */ } rndv_unexp; - uct_rc_mlx5_release_desc_t eager_desc; - uct_rc_mlx5_release_desc_t rndv_desc; - UCS_STATS_NODE_DECLARE(stats); + uct_rc_mlx5_release_desc_t eager_desc; + uct_rc_mlx5_release_desc_t rndv_desc; + uct_rc_mlx5_release_desc_t am_desc; + UCS_STATS_NODE_DECLARE(stats) } tm; -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM struct { - uct_mlx5_dm_data_t *dm; - size_t seg_len; /* cached value to avoid double-pointer access */ - ucs_status_t (*am_short)(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, - const void *payload, unsigned length); - ucs_status_t (*tag_short)(uct_ep_h tl_ep, uct_tag_t tag, - const void *data, size_t length); + uct_mlx5_dm_data_t *dm; + size_t seg_len; /* cached value to avoid double-pointer access */ + ucs_status_t (*am_short)(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, + const void *payload, unsigned length); + ucs_status_t (*tag_short)(uct_ep_h tl_ep, uct_tag_t tag, + const void *data, size_t length); } dm; #endif - UCS_STATS_NODE_DECLARE(stats); +#if HAVE_DECL_MLX5DV_DEVX_SUBSCRIBE_DEVX_EVENT + struct mlx5dv_devx_event_channel *event_channel; +#endif + struct { + uint8_t atomic_fence_flag; + ucs_ternary_value_t cyclic_srq_enable; + } config; + UCS_STATS_NODE_DECLARE(stats) } uct_rc_mlx5_iface_common_t; /** * Common RC/DC mlx5 interface configuration */ typedef struct uct_rc_mlx5_iface_common_config { - uct_rc_iface_config_t super; - uct_ib_mlx5_iface_config_t mlx5_common; - unsigned tx_max_bb; + uct_ib_mlx5_iface_config_t super; + unsigned tx_max_bb; struct { - int enable; - unsigned list_size; - size_t max_bcopy; + int enable; + unsigned list_size; + size_t seg_size; + 
ucs_ternary_value_t mp_enable; + size_t mp_num_strides; } tm; + unsigned exp_backoff; + ucs_ternary_value_t cyclic_srq_enable; } uct_rc_mlx5_iface_common_config_t; -UCS_CLASS_DECLARE(uct_rc_mlx5_iface_common_t, uct_rc_iface_ops_t*, +UCS_CLASS_DECLARE(uct_rc_mlx5_iface_common_t, + uct_rc_iface_ops_t*, uct_md_h, uct_worker_h, - const uct_iface_params_t*, uct_rc_mlx5_iface_common_config_t*, + const uct_iface_params_t*, + uct_rc_iface_common_config_t*, + uct_rc_mlx5_iface_common_config_t*, uct_ib_iface_init_attr_t*); @@ -315,16 +462,12 @@ UCS_CLASS_DECLARE(uct_rc_mlx5_iface_common_t, uct_rc_iface_ops_t*, #define UCT_RC_MLX5_TM_ENABLED(_iface) (_iface)->tm.enabled +#define UCT_RC_MLX5_MP_ENABLED(_iface) ((_iface)->tm.mp.num_strides > 1) + /* TMH can carry 2 bytes of data in its reserved filed */ #define UCT_RC_MLX5_TMH_PRIV_LEN ucs_field_sizeof(uct_rc_mlx5_tmh_priv_data_t, \ data) -#define UCT_RC_MLX5_CHECK_RES_PTR(_iface, _ep) \ - UCT_RC_CHECK_CQE_RET(&(_iface)->super, _ep, &(_ep)->txqp, \ - UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE)) \ - UCT_RC_CHECK_TXQP_RET(&(_iface)->super, _ep, &(_ep)->txqp, \ - UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE)) - #define UCT_RC_MLX5_CHECK_RNDV_PARAMS(_iovcnt, _header_len, _tm_len, \ _max_inline, _max_rndv_hdr) \ { \ @@ -365,9 +508,10 @@ UCS_CLASS_DECLARE(uct_rc_mlx5_iface_common_t, uct_rc_iface_ops_t*, } #if IBV_HW_TM -ucs_status_t uct_rc_mlx5_handle_rndv(uct_rc_mlx5_iface_common_t *iface, - struct ibv_tmh *tmh, uct_tag_t tag, - unsigned byte_len); +void uct_rc_mlx5_handle_unexp_rndv(uct_rc_mlx5_iface_common_t *iface, + struct ibv_tmh *tmh, uct_tag_t tag, + struct mlx5_cqe64 *cqe, unsigned flags, + unsigned byte_len); static UCS_F_ALWAYS_INLINE void @@ -391,8 +535,7 @@ uct_rc_mlx5_fill_rvh(struct ibv_rvh *rvh, const void *vaddr, static UCS_F_ALWAYS_INLINE unsigned uct_rc_mlx5_tag_get_op_id(uct_rc_mlx5_iface_common_t *iface, uct_completion_t *comp) { - uint32_t prev_ph; - return ucs_ptr_array_insert(&iface->tm.rndv_comps, comp, &prev_ph); + return 
ucs_ptr_array_insert(&iface->tm.rndv_comps, comp); } @@ -439,18 +582,18 @@ uct_rc_mlx5_ctx_priv(uct_tag_context_t *ctx) static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_handle_rndv_fin(uct_rc_mlx5_iface_common_t *iface, uint32_t app_ctx) { + void *rndv_comp = NULL; int found; - void *rndv_comp; found = ucs_ptr_array_lookup(&iface->tm.rndv_comps, app_ctx, rndv_comp); ucs_assert_always(found > 0); uct_invoke_completion((uct_completion_t*)rndv_comp, UCS_OK); - ucs_ptr_array_remove(&iface->tm.rndv_comps, app_ctx, 0); + ucs_ptr_array_remove(&iface->tm.rndv_comps, app_ctx); } extern ucs_config_field_t uct_rc_mlx5_common_config_table[]; -unsigned uct_rc_mlx5_iface_srq_post_recv(uct_rc_iface_t *iface, uct_ib_mlx5_srq_t *srq); +unsigned uct_rc_mlx5_iface_srq_post_recv(uct_rc_mlx5_iface_common_t *iface); void uct_rc_mlx5_iface_common_prepost_recvs(uct_rc_mlx5_iface_common_t *iface); @@ -469,7 +612,7 @@ void uct_rc_mlx5_iface_common_dm_cleanup(uct_rc_mlx5_iface_common_t *iface); void uct_rc_mlx5_iface_common_query(uct_ib_iface_t *ib_iface, uct_iface_attr_t *iface_attr, - size_t max_inline, size_t av_size); + size_t max_inline, size_t max_tag_eager_iov); void uct_rc_mlx5_iface_common_update_cqs_ci(uct_rc_mlx5_iface_common_t *iface, uct_ib_iface_t *ib_iface); @@ -480,27 +623,74 @@ void uct_rc_mlx5_iface_common_sync_cqs_ci(uct_rc_mlx5_iface_common_t *iface, int uct_rc_mlx5_iface_commom_clean(uct_ib_mlx5_cq_t *mlx5_cq, uct_ib_mlx5_srq_t *srq, uint32_t qpn); +static UCS_F_MAYBE_UNUSED void +uct_rc_mlx5_iface_tm_set_cmd_qp_len(uct_rc_mlx5_iface_common_t *iface) +{ + /* 2 ops for each tag (ADD + DEL) and extra ops for SYNC. 
*/ + iface->tm.cmd_qp_len = (2 * iface->tm.num_tags) + 2; +} + +#if IBV_HW_TM +void uct_rc_mlx5_init_rx_tm_common(uct_rc_mlx5_iface_common_t *iface, + const uct_rc_iface_common_config_t *config, + unsigned rndv_hdr_len); + ucs_status_t uct_rc_mlx5_init_rx_tm(uct_rc_mlx5_iface_common_t *iface, - const uct_rc_mlx5_iface_common_config_t *config, - struct ibv_exp_create_srq_attr *srq_init_attr, - unsigned rndv_hdr_len, - unsigned max_cancel_sync_ops); + const uct_rc_iface_common_config_t *config, + struct ibv_srq_init_attr_ex *srq_init_attr, + unsigned rndv_hdr_len); +#else +static UCS_F_MAYBE_UNUSED ucs_status_t +uct_rc_mlx5_init_rx_tm(uct_rc_mlx5_iface_common_t *iface, + const uct_rc_iface_common_config_t *config, + struct ibv_srq_init_attr_ex *srq_init_attr, + unsigned rndv_hdr_len) +{ + return UCS_ERR_UNSUPPORTED; +} +#endif -void uct_rc_mlx5_tag_cleanup(uct_rc_mlx5_iface_common_t *iface); +#if IBV_HW_TM && HAVE_DEVX +ucs_status_t uct_rc_mlx5_devx_init_rx_tm(uct_rc_mlx5_iface_common_t *iface, + const uct_rc_iface_common_config_t *config, + int dc, unsigned rndv_hdr_len); +#else +static UCS_F_MAYBE_UNUSED ucs_status_t +uct_rc_mlx5_devx_init_rx_tm(uct_rc_mlx5_iface_common_t *iface, + const uct_rc_iface_common_config_t *config, + int dc, unsigned rndv_hdr_len) +{ + return UCS_ERR_UNSUPPORTED; +} +#endif -ucs_status_t -uct_rc_mlx5_iface_common_tag_init(uct_rc_mlx5_iface_common_t *iface, - uct_rc_mlx5_iface_common_config_t *config); +#if HAVE_DEVX +ucs_status_t uct_rc_mlx5_devx_init_rx(uct_rc_mlx5_iface_common_t *iface, + const uct_rc_iface_common_config_t *config); -void uct_rc_mlx5_iface_common_tag_cleanup(uct_rc_mlx5_iface_common_t *iface); +void uct_rc_mlx5_devx_cleanup_srq(uct_ib_mlx5_md_t *md, uct_ib_mlx5_srq_t *srq); +#else +static UCS_F_MAYBE_UNUSED ucs_status_t +uct_rc_mlx5_devx_init_rx(uct_rc_mlx5_iface_common_t *iface, + const uct_rc_iface_common_config_t *config) +{ + return UCS_ERR_UNSUPPORTED; +} -ucs_status_t uct_rc_mlx5_ep_tag_rndv_cancel(uct_ep_h tl_ep, 
void *op); +static UCS_F_MAYBE_UNUSED void +uct_rc_mlx5_devx_cleanup_srq(uct_ib_mlx5_md_t *md, uct_ib_mlx5_srq_t *srq) +{ + ucs_bug("DevX SRQ cleanup has to be done only if DevX support is enabled"); +} +#endif -ucs_status_t uct_rc_mlx5_iface_fence(uct_iface_h tl_iface, unsigned flags); +void uct_rc_mlx5_tag_cleanup(uct_rc_mlx5_iface_common_t *iface); -ucs_status_t uct_rc_mlx5_init_res_domain(uct_ib_iface_t *ib_iface); +ucs_status_t uct_rc_mlx5_iface_common_tag_init(uct_rc_mlx5_iface_common_t *iface); -void uct_rc_mlx5_cleanup_res_domain(uct_ib_iface_t *ib_iface); +void uct_rc_mlx5_iface_common_tag_cleanup(uct_rc_mlx5_iface_common_t *iface); + +ucs_status_t uct_rc_mlx5_ep_tag_rndv_cancel(uct_ep_h tl_ep, void *op); void uct_rc_mlx5_common_packet_dump(uct_base_iface_t *iface, uct_am_trace_type_t type, void *data, size_t length, size_t valid_length, @@ -514,4 +704,46 @@ uct_rc_mlx5_am_hdr_fill(uct_rc_mlx5_hdr_t *rch, uint8_t id) #endif rch->rc_hdr.am_id = id; } + +#if HAVE_DEVX +ucs_status_t +uct_rc_mlx5_iface_common_devx_connect_qp(uct_rc_mlx5_iface_common_t *iface, + uct_ib_mlx5_qp_t *qp, + uint32_t dest_qp_num, + struct ibv_ah_attr *ah_attr, + enum ibv_mtu path_mtu); + +#else +static UCS_F_MAYBE_UNUSED ucs_status_t +uct_rc_mlx5_iface_common_devx_connect_qp(uct_rc_mlx5_iface_common_t *iface, + uct_ib_mlx5_qp_t *qp, + uint32_t dest_qp_num, + struct ibv_ah_attr *ah_attr, + enum ibv_mtu path_mtu) +{ + return UCS_ERR_UNSUPPORTED; +} +#endif + +ucs_status_t uct_rc_mlx5_devx_iface_init_events(uct_rc_mlx5_iface_common_t *iface); + +void uct_rc_mlx5_devx_iface_free_events(uct_rc_mlx5_iface_common_t *iface); + +ucs_status_t uct_rc_mlx5_devx_iface_subscribe_event(uct_rc_mlx5_iface_common_t *iface, + uct_ib_mlx5_qp_t *qp, + unsigned event_num, + enum ibv_event_type event_type, + unsigned event_data); + +void uct_rc_mlx5_iface_fill_attr(uct_rc_mlx5_iface_common_t *iface, + uct_ib_mlx5_qp_attr_t *qp_attr, + unsigned max_send_wr, + uct_ib_mlx5_srq_t *srq); + +ucs_status_t 
+uct_rc_mlx5_common_iface_init_rx(uct_rc_mlx5_iface_common_t *iface, + const uct_rc_iface_common_config_t *rc_config); + +void uct_rc_mlx5_destroy_srq(uct_ib_mlx5_md_t *md, uct_ib_mlx5_srq_t *srq); + #endif diff --git a/src/uct/ib/rc/accel/rc_mlx5_devx.c b/src/uct/ib/rc/accel/rc_mlx5_devx.c new file mode 100644 index 00000000000..abb04a3330d --- /dev/null +++ b/src/uct/ib/rc/accel/rc_mlx5_devx.c @@ -0,0 +1,434 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "rc_mlx5.inl" + +#include +#include +#include +#include +#include + +ucs_status_t uct_rc_mlx5_devx_iface_subscribe_event(uct_rc_mlx5_iface_common_t *iface, + uct_ib_mlx5_qp_t *qp, + unsigned event_num, + enum ibv_event_type event_type, + unsigned event_data) +{ +#if HAVE_DECL_MLX5DV_DEVX_SUBSCRIBE_DEVX_EVENT + uint64_t cookie; + uint16_t event; + int ret; + + if (iface->event_channel == NULL) { + return UCS_OK; + } + + event = event_num; + cookie = event_type | ((uint64_t)event_data << UCT_IB_MLX5_DEVX_EVENT_DATA_SHIFT); + ret = mlx5dv_devx_subscribe_devx_event(iface->event_channel, qp->devx.obj, + sizeof(event), &event, cookie); + if (ret) { + ucs_error("mlx5dv_devx_subscribe_devx_event() failed: %m"); + return UCS_ERR_IO_ERROR; + } +#endif + + return UCS_OK; +} + +#if HAVE_DECL_MLX5DV_DEVX_SUBSCRIBE_DEVX_EVENT +static void uct_rc_mlx5_devx_iface_event_handler(int fd, int events, void *arg) +{ + uct_rc_mlx5_iface_common_t *iface = arg; + uct_ib_md_t *md = uct_ib_iface_md(&iface->super.super); + struct mlx5dv_devx_async_event_hdr devx_event; + uct_ib_async_event_t event; + int ret; + + ret = mlx5dv_devx_get_event(iface->event_channel, &devx_event, sizeof(devx_event)); + if (ret < 0) { + ucs_warn("mlx5dv_devx_get_event() failed: %m"); + return; + } + + event.event_type = devx_event.cookie & UCT_IB_MLX5_DEVX_EVENT_TYPE_MASK; + switch (event.event_type) { + case 
IBV_EVENT_QP_LAST_WQE_REACHED: + event.qp_num = devx_event.cookie >> UCT_IB_MLX5_DEVX_EVENT_DATA_SHIFT; + break; + default: + ucs_warn("unexpected async event: %d", event.event_type); + return; + } + + uct_ib_handle_async_event(&md->dev, &event); +} +#endif + +ucs_status_t uct_rc_mlx5_devx_iface_init_events(uct_rc_mlx5_iface_common_t *iface) +{ + ucs_status_t status = UCS_OK; +#if HAVE_DECL_MLX5DV_DEVX_SUBSCRIBE_DEVX_EVENT + uct_ib_mlx5_md_t *md = ucs_derived_of(uct_ib_iface_md(&iface->super.super), + uct_ib_mlx5_md_t); + struct mlx5dv_devx_event_channel *event_channel; + + if (!(md->flags & UCT_IB_MLX5_MD_FLAG_DEVX) || !md->super.dev.async_events) { + iface->event_channel = NULL; + return UCS_OK; + } + + event_channel = mlx5dv_devx_create_event_channel( + md->super.dev.ibv_context, + MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA); + + if (event_channel == NULL) { + ucs_error("mlx5dv_devx_create_event_channel() failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err; + + } + + status = ucs_sys_fcntl_modfl(event_channel->fd, O_NONBLOCK, 0); + if (status != UCS_OK) { + goto err_destroy_channel; + } + + status = ucs_async_set_event_handler(iface->super.super.super.worker->async->mode, + event_channel->fd, UCS_EVENT_SET_EVREAD, + uct_rc_mlx5_devx_iface_event_handler, iface, + iface->super.super.super.worker->async); + if (status != UCS_OK) { + goto err_destroy_channel; + } + + iface->event_channel = event_channel; + return UCS_OK; + +err_destroy_channel: + mlx5dv_devx_destroy_event_channel(event_channel); + iface->event_channel = NULL; +err: +#endif + return status; +} + +void uct_rc_mlx5_devx_iface_free_events(uct_rc_mlx5_iface_common_t *iface) +{ +#if HAVE_DECL_MLX5DV_DEVX_SUBSCRIBE_DEVX_EVENT + if (iface->event_channel == NULL) { + return; + } + + ucs_async_remove_handler(iface->event_channel->fd, 1); + mlx5dv_devx_destroy_event_channel(iface->event_channel); +#endif +} + +static ucs_status_t +uct_rc_mlx5_devx_init_rx_common(uct_rc_mlx5_iface_common_t *iface, + 
uct_ib_mlx5_md_t *md, + const uct_rc_iface_common_config_t *config, + const struct mlx5dv_pd *dvpd, void *wq) +{ + ucs_status_t status = UCS_ERR_NO_MEMORY; + int len, max, stride, log_num_of_strides, wq_type; + + stride = uct_ib_mlx5_srq_stride(iface->tm.mp.num_strides); + max = uct_ib_mlx5_srq_max_wrs(config->super.rx.queue_len, + iface->tm.mp.num_strides); + max = ucs_roundup_pow2(max); + len = max * stride; + + status = uct_ib_mlx5_md_buf_alloc(md, len, 0, &iface->rx.srq.buf, + &iface->rx.srq.devx.mem, "srq buf"); + if (status != UCS_OK) { + return status; + } + + iface->rx.srq.devx.dbrec = uct_ib_mlx5_get_dbrec(md); + if (!iface->rx.srq.devx.dbrec) { + goto err_free_mem; + } + + iface->rx.srq.db = &iface->rx.srq.devx.dbrec->db[MLX5_RCV_DBR]; + + if (iface->config.cyclic_srq_enable == UCS_NO) { + wq_type = UCT_RC_MLX5_MP_ENABLED(iface) ? + UCT_IB_MLX5_SRQ_TOPO_LIST_MP_RQ : + UCT_IB_MLX5_SRQ_TOPO_LIST; + } else { + wq_type = UCT_RC_MLX5_MP_ENABLED(iface) ? + UCT_IB_MLX5_SRQ_TOPO_CYCLIC_MP_RQ : + UCT_IB_MLX5_SRQ_TOPO_CYCLIC; + } + + UCT_IB_MLX5DV_SET (wq, wq, wq_type, wq_type); + UCT_IB_MLX5DV_SET (wq, wq, log_wq_sz, ucs_ilog2(max)); + UCT_IB_MLX5DV_SET (wq, wq, log_wq_stride, ucs_ilog2(stride)); + UCT_IB_MLX5DV_SET (wq, wq, pd, dvpd->pdn); + UCT_IB_MLX5DV_SET (wq, wq, dbr_umem_id, iface->rx.srq.devx.dbrec->mem_id); + UCT_IB_MLX5DV_SET64(wq, wq, dbr_addr, iface->rx.srq.devx.dbrec->offset); + UCT_IB_MLX5DV_SET (wq, wq, wq_umem_id, iface->rx.srq.devx.mem.mem->umem_id); + + if (UCT_RC_MLX5_MP_ENABLED(iface)) { + /* Normalize to device's interface values (range of (-6) - 7) */ + /* cppcheck-suppress internalAstError */ + log_num_of_strides = ucs_ilog2(iface->tm.mp.num_strides) - 9; + + UCT_IB_MLX5DV_SET(wq, wq, log_wqe_num_of_strides, + log_num_of_strides & 0xF); + UCT_IB_MLX5DV_SET(wq, wq, log_wqe_stride_size, + (ucs_ilog2(iface->super.super.config.seg_size) - 6)); + } + + iface->rx.srq.type = UCT_IB_MLX5_OBJ_TYPE_DEVX; + uct_ib_mlx5_srq_buff_init(&iface->rx.srq, 0, 
max - 1, + iface->super.super.config.seg_size, + iface->tm.mp.num_strides); + iface->super.rx.srq.quota = max - 1; + + return UCS_OK; + +err_free_mem: + uct_ib_mlx5_md_buf_free(md, iface->rx.srq.buf, &iface->rx.srq.devx.mem); + return status; +} + +ucs_status_t +uct_rc_mlx5_devx_init_rx_tm(uct_rc_mlx5_iface_common_t *iface, + const uct_rc_iface_common_config_t *config, + int dc, unsigned rndv_hdr_len) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(uct_ib_iface_md(&iface->super.super), + uct_ib_mlx5_md_t); + uct_ib_device_t *dev = &md->super.dev; + struct mlx5dv_pd dvpd = {}; + struct mlx5dv_cq dvcq = {}; + struct mlx5dv_obj dv = {}; + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(create_xrq_in)] = {}; + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_xrq_out)] = {}; + ucs_status_t status; + void *xrqc; + + uct_rc_mlx5_init_rx_tm_common(iface, config, rndv_hdr_len); + + dv.pd.in = uct_ib_iface_md(&iface->super.super)->pd; + dv.cq.in = iface->super.super.cq[UCT_IB_DIR_RX]; + dv.pd.out = &dvpd; + dv.cq.out = &dvcq; + mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD | MLX5DV_OBJ_CQ); + + UCT_IB_MLX5DV_SET(create_xrq_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_XRQ); + xrqc = UCT_IB_MLX5DV_ADDR_OF(create_xrq_in, in, xrq_context); + + UCT_IB_MLX5DV_SET(xrqc, xrqc, topology, UCT_IB_MLX5_XRQC_TOPOLOGY_TAG_MATCHING); + UCT_IB_MLX5DV_SET(xrqc, xrqc, offload, UCT_IB_MLX5_XRQC_OFFLOAD_RNDV); + UCT_IB_MLX5DV_SET(xrqc, xrqc, tag_matching_topology_context.log_matching_list_sz, + ucs_ilog2(iface->tm.num_tags) + 1); + UCT_IB_MLX5DV_SET(xrqc, xrqc, dc, dc); + UCT_IB_MLX5DV_SET(xrqc, xrqc, cqn, dvcq.cqn); + + status = uct_rc_mlx5_devx_init_rx_common(iface, md, config, &dvpd, + UCT_IB_MLX5DV_ADDR_OF(xrqc, xrqc, wq)); + if (status != UCS_OK) { + return UCS_OK; + } + + iface->rx.srq.devx.obj = mlx5dv_devx_obj_create(dev->ibv_context, + in, sizeof(in), + out, sizeof(out)); + if (iface->rx.srq.devx.obj == NULL) { + ucs_error("mlx5dv_devx_obj_create(XRQ) failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(create_xrq_out, out, 
syndrome)); + status = UCS_ERR_IO_ERROR; + goto err_cleanup_srq; + } + + iface->rx.srq.srq_num = UCT_IB_MLX5DV_GET(create_xrq_out, out, xrqn); + uct_rc_mlx5_iface_tm_set_cmd_qp_len(iface); + + return UCS_OK; + +err_cleanup_srq: + uct_rc_mlx5_devx_cleanup_srq(md, &iface->rx.srq); + return status; +} + +ucs_status_t uct_rc_mlx5_devx_init_rx(uct_rc_mlx5_iface_common_t *iface, + const uct_rc_iface_common_config_t *config) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(uct_ib_iface_md(&iface->super.super), + uct_ib_mlx5_md_t); + uct_ib_device_t *dev = &md->super.dev; + struct mlx5dv_pd dvpd = {}; + struct mlx5dv_obj dv = {}; + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(create_rmp_in)] = {}; + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_rmp_out)] = {}; + ucs_status_t status; + void *rmpc; + + dv.pd.in = uct_ib_iface_md(&iface->super.super)->pd; + dv.pd.out = &dvpd; + mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD); + + UCT_IB_MLX5DV_SET(create_rmp_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_RMP); + rmpc = UCT_IB_MLX5DV_ADDR_OF(create_rmp_in, in, rmp_context); + + UCT_IB_MLX5DV_SET(rmpc, rmpc, state, UCT_IB_MLX5_RMPC_STATE_RDY); + + status = uct_rc_mlx5_devx_init_rx_common(iface, md, config, &dvpd, + UCT_IB_MLX5DV_ADDR_OF(rmpc, rmpc, wq)); + if (status != UCS_OK) { + return status; + } + + iface->rx.srq.devx.obj = mlx5dv_devx_obj_create(dev->ibv_context, + in, sizeof(in), + out, sizeof(out)); + if (iface->rx.srq.devx.obj == NULL) { + ucs_error("mlx5dv_devx_obj_create(RMP) failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(create_rmp_out, out, syndrome)); + status = UCS_ERR_IO_ERROR; + goto err_cleanup_srq; + } + + iface->rx.srq.srq_num = UCT_IB_MLX5DV_GET(create_rmp_out, out, rmpn); + + return UCS_OK; + +err_cleanup_srq: + uct_rc_mlx5_devx_cleanup_srq(md, &iface->rx.srq); + return status; +} + +void uct_rc_mlx5_devx_cleanup_srq(uct_ib_mlx5_md_t *md, uct_ib_mlx5_srq_t *srq) +{ + uct_ib_mlx5_put_dbrec(srq->devx.dbrec); + uct_ib_mlx5_md_buf_free(md, srq->buf, &srq->devx.mem); +} + +ucs_status_t 
+uct_rc_mlx5_iface_common_devx_connect_qp(uct_rc_mlx5_iface_common_t *iface, + uct_ib_mlx5_qp_t *qp, + uint32_t dest_qp_num, + struct ibv_ah_attr *ah_attr, + enum ibv_mtu path_mtu) +{ + char in_2rtr[UCT_IB_MLX5DV_ST_SZ_BYTES(init2rtr_qp_in)] = {}; + char out_2rtr[UCT_IB_MLX5DV_ST_SZ_BYTES(init2rtr_qp_out)] = {}; + char in_2rts[UCT_IB_MLX5DV_ST_SZ_BYTES(rtr2rts_qp_in)] = {}; + char out_2rts[UCT_IB_MLX5DV_ST_SZ_BYTES(rtr2rts_qp_out)] = {}; + uct_ib_device_t *dev = uct_ib_iface_device(&iface->super.super); + struct mlx5_wqe_av mlx5_av; + ucs_status_t status; + struct ibv_ah *ah; + void *qpc; + + UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, opcode, + UCT_IB_MLX5_CMD_OP_INIT2RTR_QP); + UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, qpn, qp->qp_num); + UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, opt_param_mask, 14); + + ucs_assert(path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU); + qpc = UCT_IB_MLX5DV_ADDR_OF(init2rtr_qp_in, in_2rtr, qpc); + UCT_IB_MLX5DV_SET(qpc, qpc, mtu, path_mtu); + UCT_IB_MLX5DV_SET(qpc, qpc, log_msg_max, UCT_IB_MLX5_LOG_MAX_MSG_SIZE); + UCT_IB_MLX5DV_SET(qpc, qpc, remote_qpn, dest_qp_num); + if (uct_ib_iface_is_roce(&iface->super.super)) { + status = uct_ib_iface_create_ah(&iface->super.super, ah_attr, &ah); + if (status != UCS_OK) { + return status; + } + + uct_ib_mlx5_get_av(ah, &mlx5_av); + memcpy(UCT_IB_MLX5DV_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32), + &mlx5_av.rmac, sizeof(mlx5_av.rmac)); + memcpy(UCT_IB_MLX5DV_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip), + &mlx5_av.rgid, sizeof(mlx5_av.rgid)); + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.hop_limit, + mlx5_av.hop_limit); + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.src_addr_index, + ah_attr->grh.sgid_index); + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.eth_prio, + iface->super.super.config.sl); + if (uct_ib_iface_is_roce_v2(&iface->super.super, dev)) { + ucs_assert(ah_attr->dlid >= UCT_IB_ROCE_UDP_SRC_PORT_BASE); + UCT_IB_MLX5DV_SET(qpc, qpc, 
primary_address_path.udp_sport, + ah_attr->dlid); + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.dscp, + iface->super.super.config.traffic_class >> 2); + } + } else { + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.grh, ah_attr->is_global); + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.rlid, ah_attr->dlid); + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.mlid, + ah_attr->src_path_bits & 0x7f); + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.hop_limit, + ah_attr->grh.hop_limit); + memcpy(UCT_IB_MLX5DV_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip), + &ah_attr->grh.dgid, + UCT_IB_MLX5DV_FLD_SZ_BYTES(qpc, primary_address_path.rgid_rip)); + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.sl, + iface->super.super.config.sl); + /* TODO add flow_label support */ + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.tclass, + iface->super.super.config.traffic_class); + } + + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.vhca_port_num, ah_attr->port_num); + UCT_IB_MLX5DV_SET(qpc, qpc, log_rra_max, + ucs_ilog2_or0(iface->super.config.max_rd_atomic)); + UCT_IB_MLX5DV_SET(qpc, qpc, atomic_mode, UCT_IB_MLX5_ATOMIC_MODE); + UCT_IB_MLX5DV_SET(qpc, qpc, rre, true); + UCT_IB_MLX5DV_SET(qpc, qpc, rwe, true); + UCT_IB_MLX5DV_SET(qpc, qpc, rae, true); + UCT_IB_MLX5DV_SET(qpc, qpc, min_rnr_nak, iface->super.config.min_rnr_timer); + + status = uct_ib_mlx5_devx_modify_qp(qp, in_2rtr, sizeof(in_2rtr), + out_2rtr, sizeof(out_2rtr)); + if (status != UCS_OK) { + return status; + } + + UCT_IB_MLX5DV_SET(rtr2rts_qp_in, in_2rts, opcode, + UCT_IB_MLX5_CMD_OP_RTR2RTS_QP); + UCT_IB_MLX5DV_SET(rtr2rts_qp_in, in_2rts, qpn, qp->qp_num); + + qpc = UCT_IB_MLX5DV_ADDR_OF(rtr2rts_qp_in, in_2rts, qpc); + UCT_IB_MLX5DV_SET(qpc, qpc, log_sra_max, + ucs_ilog2_or0(iface->super.config.max_rd_atomic)); + UCT_IB_MLX5DV_SET(qpc, qpc, retry_count, iface->super.config.retry_cnt); + UCT_IB_MLX5DV_SET(qpc, qpc, rnr_retry, iface->super.config.rnr_retry); + UCT_IB_MLX5DV_SET(qpc, qpc, 
primary_address_path.ack_timeout, + iface->super.config.timeout); + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.log_rtm, + iface->super.config.exp_backoff); + + status = uct_ib_mlx5_devx_modify_qp(qp, in_2rts, sizeof(in_2rts), + out_2rts, sizeof(out_2rts)); + if (status != UCS_OK) { + return status; + } + + ucs_debug("connected rc devx qp 0x%x on "UCT_IB_IFACE_FMT" to lid %d(+%d) sl %d " + "remote_qp 0x%x mtu %zu timer %dx%d rnr %dx%d rd_atom %d", + qp->qp_num, UCT_IB_IFACE_ARG(&iface->super.super), ah_attr->dlid, + ah_attr->src_path_bits, ah_attr->sl, dest_qp_num, + uct_ib_mtu_value(iface->super.super.config.path_mtu), + iface->super.config.timeout, + iface->super.config.retry_cnt, + iface->super.config.min_rnr_timer, + iface->super.config.rnr_retry, + iface->super.config.max_rd_atomic); + return UCS_OK; +} + diff --git a/src/uct/ib/rc/accel/rc_mlx5_ep.c b/src/uct/ib/rc/accel/rc_mlx5_ep.c index 25b21f078d2..f8b3172151b 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_ep.c +++ b/src/uct/ib/rc/accel/rc_mlx5_ep.c @@ -4,11 +4,17 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rc_mlx5.h" #if HAVE_DECL_IBV_CMD_MODIFY_QP -#include +# include #endif + #include +#include #include #include #include /* For htonl */ @@ -33,8 +39,7 @@ uct_rc_mlx5_txqp_bcopy_post(uct_rc_mlx5_iface_common_t *iface, desc->super.sn = txwq->sw_pi; uct_rc_mlx5_txqp_dptr_post(iface, IBV_QPT_RC, txqp, txwq, opcode, buffer, length, &desc->lkey, - rdma_raddr, uct_ib_md_direct_rkey(rdma_rkey), - 0, 0, 0, 0, + rdma_raddr, rdma_rkey, 0, 0, 0, 0, NULL, NULL, 0, fm_ce_se, imm_val_be, INT_MAX, log_sge); uct_rc_txqp_add_send_op(txqp, &desc->super); } @@ -44,12 +49,13 @@ uct_rc_mlx5_txqp_bcopy_post(uct_rc_mlx5_iface_common_t *iface, * Adds user completion to the callback queue. 
*/ static UCS_F_ALWAYS_INLINE ucs_status_t -uct_rc_mlx5_ep_zcopy_post(uct_rc_mlx5_ep_t *ep, - unsigned opcode, const uct_iov_t *iov, size_t iovcnt, +uct_rc_mlx5_ep_zcopy_post(uct_rc_mlx5_ep_t *ep, unsigned opcode, + const uct_iov_t *iov, size_t iovcnt, size_t iov_total_length, /* SEND */ uint8_t am_id, const void *am_hdr, unsigned am_hdr_len, /* RDMA */ uint64_t rdma_raddr, uct_rkey_t rdma_rkey, /* TAG */ uct_tag_t tag, uint32_t app_ctx, uint32_t ib_imm_be, - int force_sig, uct_completion_t *comp) + int force_sig, uct_rc_send_handler_t handler, + uct_completion_t *comp) { uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(ep->super.super.super.iface, uct_rc_mlx5_iface_common_t); @@ -68,8 +74,9 @@ uct_rc_mlx5_ep_zcopy_post(uct_rc_mlx5_ep_t *ep, (comp == NULL) ? force_sig : MLX5_WQE_CTRL_CQ_UPDATE, UCT_IB_MAX_ZCOPY_LOG_SGE(&iface->super.super)); - uct_rc_txqp_add_send_comp(&iface->super, &ep->super.txqp, comp, sn, - UCT_RC_IFACE_SEND_OP_FLAG_ZCOPY); + uct_rc_txqp_add_send_comp(&iface->super, &ep->super.txqp, handler, comp, sn, + UCT_RC_IFACE_SEND_OP_FLAG_ZCOPY, iov_total_length); + return UCS_INPROGRESS; } @@ -79,13 +86,14 @@ uct_rc_mlx5_ep_put_short_inline(uct_ep_h tl_ep, const void *buffer, unsigned len { UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); UCT_RC_MLX5_CHECK_PUT_SHORT(length, 0); - UCT_RC_CHECK_RES(&iface->super, &ep->super); + UCT_RC_CHECK_RMA_RES(&iface->super, &ep->super); + uct_rc_mlx5_ep_fence_put(iface, &ep->tx.wq, &rkey, &remote_addr, + ep->super.atomic_mr_offset); uct_rc_mlx5_txqp_inline_post(iface, IBV_QPT_RC, &ep->super.txqp, &ep->tx.wq, MLX5_OPCODE_RDMA_WRITE, - buffer, length, 0, 0, 0, - remote_addr, uct_ib_md_direct_rkey(rkey), + buffer, length, 0, 0, 0, remote_addr, rkey, NULL, NULL, 0, 0, INT_MAX); UCT_TL_EP_STAT_OP(&ep->super.super, PUT, SHORT, length); return UCS_OK; @@ -114,7 +122,7 @@ uct_rc_mlx5_ep_am_short_inline(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, return UCS_OK; } -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM static ucs_status_t 
UCS_F_ALWAYS_INLINE uct_rc_mlx5_ep_short_dm(uct_rc_mlx5_ep_t *ep, uct_rc_mlx5_dm_copy_data_t *cache, size_t hdr_len, const void *payload, unsigned length, @@ -123,7 +131,7 @@ uct_rc_mlx5_ep_short_dm(uct_rc_mlx5_ep_t *ep, uct_rc_mlx5_dm_copy_data_t *cache, { uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(ep->super.super.super.iface, uct_rc_mlx5_iface_common_t); - uct_rc_iface_send_desc_t *desc; + uct_rc_iface_send_desc_t *desc = NULL; void *buffer; ucs_status_t status; uct_ib_log_sge_t log_sge; @@ -147,7 +155,7 @@ ucs_status_t uct_rc_mlx5_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length, uint64_t remote_addr, uct_rkey_t rkey) { -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_mlx5_iface_common_t); uct_rc_iface_t *rc_iface = &iface->super; uct_rc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_mlx5_ep_t); @@ -156,15 +164,17 @@ uct_rc_mlx5_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length, if (ucs_likely((length <= UCT_IB_MLX5_PUT_MAX_SHORT(0)) || !iface->dm.dm)) { #endif return uct_rc_mlx5_ep_put_short_inline(tl_ep, buffer, length, remote_addr, rkey); -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM } UCT_CHECK_LENGTH(length, 0, iface->dm.seg_len, "put_short"); - UCT_RC_CHECK_RES(rc_iface, &ep->super); - status = uct_rc_mlx5_ep_short_dm(ep, NULL, 0, buffer, length, - MLX5_OPCODE_RDMA_WRITE, - MLX5_WQE_CTRL_CQ_UPDATE, - remote_addr, rkey); + UCT_RC_CHECK_RMA_RES(rc_iface, &ep->super); + uct_rc_mlx5_ep_fence_put(iface, &ep->tx.wq, &rkey, &remote_addr, + ep->super.atomic_mr_offset); + status = uct_rc_mlx5_ep_short_dm(ep, NULL, 0, buffer, length, + MLX5_OPCODE_RDMA_WRITE, + MLX5_WQE_CTRL_CQ_UPDATE, + remote_addr, rkey); if (UCS_STATUS_IS_ERR(status)) { return status; } @@ -181,9 +191,11 @@ ssize_t uct_rc_mlx5_ep_put_bcopy(uct_ep_h tl_ep, uct_pack_callback_t pack_cb, uct_rc_iface_send_desc_t *desc; size_t length; - UCT_RC_CHECK_RES(&iface->super, &ep->super); + 
UCT_RC_CHECK_RMA_RES(&iface->super, &ep->super); UCT_RC_IFACE_GET_TX_PUT_BCOPY_DESC(&iface->super, &iface->super.tx.mp, desc, pack_cb, arg, length); + uct_rc_mlx5_ep_fence_put(iface, &ep->tx.wq, &rkey, &remote_addr, + ep->super.atomic_mr_offset); uct_rc_mlx5_txqp_bcopy_post(iface, &ep->super.txqp, &ep->tx.wq, MLX5_OPCODE_RDMA_WRITE, length, remote_addr, @@ -197,18 +209,22 @@ ucs_status_t uct_rc_mlx5_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp) { - uct_ib_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ib_iface_t); - uct_rc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_mlx5_ep_t); + UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); ucs_status_t status; - UCT_CHECK_IOV_SIZE(iovcnt, uct_ib_iface_get_max_iov(iface), + UCT_CHECK_IOV_SIZE(iovcnt, UCT_RC_MLX5_RMA_MAX_IOV(0), "uct_rc_mlx5_ep_put_zcopy"); UCT_CHECK_LENGTH(uct_iov_total_length(iov, iovcnt), 0, UCT_IB_MAX_MESSAGE_SIZE, "put_zcopy"); + UCT_RC_CHECK_NUM_RDMA_READ(&iface->super); + uct_rc_mlx5_ep_fence_put(iface, &ep->tx.wq, &rkey, &remote_addr, + ep->super.atomic_mr_offset); status = uct_rc_mlx5_ep_zcopy_post(ep, MLX5_OPCODE_RDMA_WRITE, iov, iovcnt, - 0, NULL, 0, remote_addr, rkey, 0ul, 0, 0, - MLX5_WQE_CTRL_CQ_UPDATE, comp); + 0ul, 0, NULL, 0, remote_addr, rkey, + 0ul, 0, 0, MLX5_WQE_CTRL_CQ_UPDATE, + uct_rc_ep_send_op_completion_handler, + comp); UCT_TL_EP_STAT_OP_IF_SUCCESS(status, &ep->super.super, PUT, ZCOPY, uct_iov_total_length(iov, iovcnt)); return status; @@ -220,19 +236,21 @@ ucs_status_t uct_rc_mlx5_ep_get_bcopy(uct_ep_h tl_ep, uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp) { + uint8_t fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); uct_rc_iface_send_desc_t *desc; UCT_CHECK_LENGTH(length, 0, iface->super.super.config.seg_size, "get_bcopy"); - UCT_RC_CHECK_RES(&iface->super, &ep->super); + UCT_RC_CHECK_RMA_RES(&iface->super, &ep->super); UCT_RC_IFACE_GET_TX_GET_BCOPY_DESC(&iface->super, 
&iface->super.tx.mp, desc, unpack_cb, comp, arg, length); + uct_rc_mlx5_ep_fence_get(iface, &ep->tx.wq, &rkey, &fm_ce_se); uct_rc_mlx5_txqp_bcopy_post(iface, &ep->super.txqp, &ep->tx.wq, MLX5_OPCODE_RDMA_READ, length, remote_addr, - rkey, MLX5_WQE_CTRL_CQ_UPDATE, 0, desc, desc + 1, - NULL); + rkey, fm_ce_se, 0, desc, desc + 1, NULL); UCT_TL_EP_STAT_OP(&ep->super.super, GET, BCOPY, length); + UCT_RC_RDMA_READ_POSTED(&iface->super, length); return UCS_INPROGRESS; } @@ -240,20 +258,28 @@ ucs_status_t uct_rc_mlx5_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp) { + uint8_t fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; + size_t total_length = uct_iov_total_length(iov, iovcnt); UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); ucs_status_t status; - UCT_CHECK_IOV_SIZE(iovcnt, uct_ib_iface_get_max_iov(&iface->super.super), + UCT_CHECK_IOV_SIZE(iovcnt, UCT_RC_MLX5_RMA_MAX_IOV(0), "uct_rc_mlx5_ep_get_zcopy"); - UCT_CHECK_LENGTH(uct_iov_total_length(iov, iovcnt), - iface->super.super.config.max_inl_resp + 1, - UCT_IB_MAX_MESSAGE_SIZE, "get_zcopy"); + UCT_CHECK_LENGTH(total_length, + iface->super.super.config.max_inl_cqe[UCT_IB_DIR_TX] + 1, + iface->super.config.max_get_zcopy, "get_zcopy"); + UCT_RC_CHECK_NUM_RDMA_READ(&iface->super); + uct_rc_mlx5_ep_fence_get(iface, &ep->tx.wq, &rkey, &fm_ce_se); status = uct_rc_mlx5_ep_zcopy_post(ep, MLX5_OPCODE_RDMA_READ, iov, iovcnt, - 0, NULL, 0, remote_addr, rkey, 0ul, 0, 0, - MLX5_WQE_CTRL_CQ_UPDATE, comp); - UCT_TL_EP_STAT_OP_IF_SUCCESS(status, &ep->super.super, GET, ZCOPY, - uct_iov_total_length(iov, iovcnt)); + total_length, 0, NULL, 0, remote_addr, rkey, + 0ul, 0, 0, fm_ce_se, + uct_rc_ep_get_zcopy_completion_handler, + comp); + if (!UCS_STATUS_IS_ERR(status)) { + UCT_TL_EP_STAT_OP(&ep->super.super, GET, ZCOPY, total_length); + UCT_RC_RDMA_READ_POSTED(&iface->super, total_length); + } return status; } @@ -261,7 +287,7 @@ ucs_status_t uct_rc_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t 
id, uint64_t hdr, const void *payload, unsigned length) { -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_mlx5_iface_common_t); uct_rc_iface_t *rc_iface = &iface->super; uct_rc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_mlx5_ep_t); @@ -272,7 +298,7 @@ uct_rc_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, !iface->dm.dm)) { #endif return uct_rc_mlx5_ep_am_short_inline(tl_ep, id, hdr, payload, length); -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM } UCT_CHECK_LENGTH(length + sizeof(uct_rc_mlx5_am_short_hdr_t), 0, @@ -336,9 +362,11 @@ ucs_status_t uct_rc_mlx5_ep_am_zcopy(uct_ep_h tl_ep, uint8_t id, const void *hea iface->super.super.config.seg_size, 0); UCT_RC_CHECK_FC(&iface->super, &ep->super, id); - status = uct_rc_mlx5_ep_zcopy_post(ep, MLX5_OPCODE_SEND, iov, iovcnt, + status = uct_rc_mlx5_ep_zcopy_post(ep, MLX5_OPCODE_SEND, iov, iovcnt, 0ul, id, header, header_length, 0, 0, 0ul, 0, 0, - MLX5_WQE_CTRL_SOLICITED, comp); + MLX5_WQE_CTRL_SOLICITED, + uct_rc_ep_send_op_completion_handler, + comp); if (ucs_likely(status >= 0)) { UCT_TL_EP_STAT_OP(&ep->super.super, AM, ZCOPY, header_length + uct_iov_total_length(iov, iovcnt)); @@ -375,15 +403,17 @@ static UCS_F_ALWAYS_INLINE ucs_status_t uct_rc_mlx5_ep_atomic_fop(uct_ep_h tl_ep, int opcode, void *result, int ext, unsigned length, uint64_t remote_addr, uct_rkey_t rkey, uint64_t compare_mask, uint64_t compare, - uint64_t swap_mask, uint64_t swap_add, uct_completion_t *comp) + uint64_t swap_mask, uint64_t swap_add, + uct_completion_t *comp) { UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); uct_rc_iface_send_desc_t *desc; - UCT_RC_CHECK_RES(&iface->super, &ep->super); - UCT_RC_IFACE_GET_TX_ATOMIC_FETCH_DESC(&iface->super, &iface->tx.atomic_desc_mp, - desc, uct_rc_iface_atomic_handler(&iface->super, ext, - length), + UCT_RC_CHECK_RMA_RES(&iface->super, &ep->super); + UCT_RC_IFACE_GET_TX_ATOMIC_FETCH_DESC(&iface->super, + &iface->tx.atomic_desc_mp, desc, + 
uct_rc_iface_atomic_handler(&iface->super, + ext, length), result, comp); uct_rc_mlx5_ep_atomic_post(tl_ep, opcode, desc, length, remote_addr, rkey, compare_mask, compare, swap_mask, swap_add); @@ -392,7 +422,8 @@ uct_rc_mlx5_ep_atomic_fop(uct_ep_h tl_ep, int opcode, void *result, int ext, static ucs_status_t UCS_F_ALWAYS_INLINE uct_rc_mlx5_ep_atomic_op_post(uct_ep_h tl_ep, unsigned opcode, unsigned size, - uint64_t value, uint64_t remote_addr, uct_rkey_t rkey) + uint64_t value, uint64_t remote_addr, + uct_rkey_t rkey) { UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); uct_rc_iface_send_desc_t *desc; @@ -404,16 +435,18 @@ uct_rc_mlx5_ep_atomic_op_post(uct_ep_h tl_ep, unsigned opcode, unsigned size, int ext; /* not used here */ ucs_status_t status; - UCT_RC_CHECK_RES(&iface->super, &ep->super); + UCT_RC_CHECK_RMA_RES(&iface->super, &ep->super); UCT_RC_MLX5_CHECK_ATOMIC_OPS(opcode, size, UCT_RC_MLX5_ATOMIC_OPS); - status = uct_rc_mlx5_iface_common_atomic_data(opcode, size, value, &op, &compare_mask, - &compare, &swap_mask, &swap, &ext); + status = uct_rc_mlx5_iface_common_atomic_data(opcode, size, value, &op, + &compare_mask, &compare, + &swap_mask, &swap, &ext); if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) { return status; } - UCT_RC_IFACE_GET_TX_ATOMIC_DESC(&iface->super, &iface->tx.atomic_desc_mp, desc); + UCT_RC_IFACE_GET_TX_ATOMIC_DESC(&iface->super, &iface->tx.atomic_desc_mp, + desc); uct_rc_mlx5_ep_atomic_post(tl_ep, op, desc, size, remote_addr, rkey, compare_mask, compare, swap_mask, swap); @@ -481,8 +514,8 @@ ucs_status_t uct_rc_mlx5_ep_atomic_cswap64(uct_ep_h tl_ep, uint64_t compare, uin uint64_t *result, uct_completion_t *comp) { return uct_rc_mlx5_ep_atomic_fop(tl_ep, MLX5_OPCODE_ATOMIC_CS, result, 0, sizeof(uint64_t), - remote_addr, rkey, 0, htobe64(compare), -1, htobe64(swap), - comp); + remote_addr, rkey, 0, htobe64(compare), + UINT64_MAX, htobe64(swap), comp); } ucs_status_t uct_rc_mlx5_ep_atomic_cswap32(uct_ep_h tl_ep, uint32_t compare, uint32_t swap, @@ 
-491,17 +524,14 @@ ucs_status_t uct_rc_mlx5_ep_atomic_cswap32(uct_ep_h tl_ep, uint32_t compare, uin { return uct_rc_mlx5_ep_atomic_fop(tl_ep, MLX5_OPCODE_ATOMIC_MASKED_CS, result, 1, sizeof(uint32_t), remote_addr, rkey, UCS_MASK(32), - htonl(compare), -1, htonl(swap), comp); + htonl(compare), UINT64_MAX, htonl(swap), comp); } ucs_status_t uct_rc_mlx5_ep_fence(uct_ep_h tl_ep, unsigned flags) { - UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); - uct_ib_md_t *md = uct_ib_iface_md(&iface->super.super); + uct_rc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_mlx5_ep_t); - uct_rc_mlx5_add_fence(md, &ep->tx.wq); - UCT_TL_EP_STAT_FENCE(&ep->super.super); - return UCS_OK; + return uct_rc_ep_fence(tl_ep, &ep->tx.wq.fi, 1); } ucs_status_t uct_rc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags, @@ -511,6 +541,12 @@ ucs_status_t uct_rc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags, ucs_status_t status; uint16_t sn; + if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) { + uct_ep_pending_purge(&ep->super.super.super, NULL, 0); + uct_rc_mlx5_ep_handle_failure(ep, UCS_ERR_CANCELED); + return UCS_OK; + } + status = uct_rc_ep_flush(&ep->super, ep->tx.wq.bb_max, flags); if (status != UCS_INPROGRESS) { return status; @@ -559,13 +595,14 @@ ucs_status_t uct_rc_mlx5_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr) { UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); uct_rc_mlx5_ep_address_t *rc_addr = (uct_rc_mlx5_ep_address_t*)addr; + uct_ib_md_t *md = uct_ib_iface_md(ucs_derived_of( + tl_ep->iface, uct_ib_iface_t)); - ucs_assert(ep->qp_num == ep->super.txqp.qp->qp_num); - uct_ib_pack_uint24(rc_addr->qp_num, ep->qp_num); - rc_addr->atomic_mr_id = uct_ib_iface_get_atomic_mr_id(&iface->super.super); + uct_ib_pack_uint24(rc_addr->qp_num, ep->tx.wq.super.qp_num); + uct_ib_mlx5_md_get_atomic_mr_id(md, &rc_addr->atomic_mr_id); if (UCT_RC_MLX5_TM_ENABLED(iface)) { - uct_ib_pack_uint24(rc_addr->tm_qp_num, ep->tm_qp->qp_num); + uct_ib_pack_uint24(rc_addr->tm_qp_num, ep->tm_qp.qp_num); } return UCS_OK; @@ -606,10 +643,27 
@@ void uct_rc_mlx5_common_packet_dump(uct_base_iface_t *iface, uct_am_trace_type_t data = &rch->rc_hdr; /* coverity[overrun-buffer-val] */ - uct_rc_ep_packet_dump(iface, type, data, length - (data - (void *)rch), + uct_rc_ep_packet_dump(iface, type, data, length - UCS_PTR_BYTE_DIFF(rch, data), valid_length, buffer, max); } +static ucs_status_t UCS_F_ALWAYS_INLINE +uct_rc_mlx5_ep_connect_qp(uct_rc_mlx5_iface_common_t *iface, + uct_ib_mlx5_qp_t *qp, uint32_t qp_num, + struct ibv_ah_attr *ah_attr, enum ibv_mtu path_mtu) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.md, uct_ib_mlx5_md_t); + + ucs_assert(path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU); + if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX) { + return uct_rc_mlx5_iface_common_devx_connect_qp(iface, qp, qp_num, + ah_attr, path_mtu); + } else { + return uct_rc_iface_qp_connect(&iface->super, qp->verbs.qp, qp_num, + ah_attr, path_mtu); + } +} + ucs_status_t uct_rc_mlx5_ep_connect_to_ep(uct_ep_h tl_ep, const uct_device_addr_t *dev_addr, const uct_ep_addr_t *ep_addr) @@ -619,18 +673,21 @@ ucs_status_t uct_rc_mlx5_ep_connect_to_ep(uct_ep_h tl_ep, const uct_rc_mlx5_ep_address_t *rc_addr = (const uct_rc_mlx5_ep_address_t*)ep_addr; uint32_t qp_num; struct ibv_ah_attr ah_attr; + enum ibv_mtu path_mtu; ucs_status_t status; uct_ib_iface_fill_ah_attr_from_addr(&iface->super.super, ib_addr, - ep->super.path_bits, &ah_attr); + ep->super.path_index, &ah_attr, + &path_mtu); + ucs_assert(path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU); if (UCT_RC_MLX5_TM_ENABLED(iface)) { /* For HW TM we need 2 QPs, one of which will be used by the device for * RNDV offload (for issuing RDMA reads and sending RNDV ACK). No WQEs * should be posted to the send side of the QP which is owned by device. 
*/ - status = uct_rc_iface_qp_connect(&iface->super, ep->tm_qp, - uct_ib_unpack_uint24(rc_addr->qp_num), - &ah_attr); + status = uct_rc_mlx5_ep_connect_qp(iface, &ep->tm_qp, + uct_ib_unpack_uint24(rc_addr->qp_num), + &ah_attr, path_mtu); if (status != UCS_OK) { return status; } @@ -642,7 +699,8 @@ ucs_status_t uct_rc_mlx5_ep_connect_to_ep(uct_ep_h tl_ep, qp_num = uct_ib_unpack_uint24(rc_addr->qp_num); } - status = uct_rc_iface_qp_connect(&iface->super, ep->super.txqp.qp, qp_num, &ah_attr); + status = uct_rc_mlx5_ep_connect_qp(iface, &ep->tx.wq.super, qp_num, + &ah_attr, path_mtu); if (status != UCS_OK) { return status; } @@ -660,7 +718,7 @@ ucs_status_t uct_rc_mlx5_ep_tag_rndv_cancel(uct_ep_h tl_ep, void *op) uct_rc_mlx5_iface_common_t); uint32_t op_index = (uint32_t)((uint64_t)op); - ucs_ptr_array_remove(&iface->tm.rndv_comps, op_index, 0); + ucs_ptr_array_remove(&iface->tm.rndv_comps, op_index); return UCS_OK; } @@ -686,7 +744,7 @@ uct_rc_mlx5_ep_tag_eager_short_inline(uct_ep_h tl_ep, uct_tag_t tag, ucs_status_t uct_rc_mlx5_ep_tag_eager_short(uct_ep_h tl_ep, uct_tag_t tag, const void *data, size_t length) { -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); uct_rc_mlx5_dm_copy_data_t cache; ucs_status_t status; @@ -695,14 +753,14 @@ ucs_status_t uct_rc_mlx5_ep_tag_eager_short(uct_ep_h tl_ep, uct_tag_t tag, !iface->dm.dm)) { #endif return uct_rc_mlx5_ep_tag_eager_short_inline(tl_ep, tag, data, length); -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM } UCT_CHECK_LENGTH(length + sizeof(struct ibv_tmh), 0, iface->dm.seg_len, "tag_short"); UCT_RC_CHECK_RES(&iface->super, &ep->super); - uct_rc_mlx5_fill_tmh(ucs_unaligned_ptr(&cache.tm_hdr), tag, 0, IBV_EXP_TMH_EAGER); + uct_rc_mlx5_fill_tmh(ucs_unaligned_ptr(&cache.tm_hdr), tag, 0, IBV_TMH_EAGER); status = uct_rc_mlx5_ep_short_dm(ep, &cache, sizeof(cache.tm_hdr), data, length, MLX5_OPCODE_SEND, @@ -732,8 +790,8 @@ ssize_t uct_rc_mlx5_ep_tag_eager_bcopy(uct_ep_h tl_ep, uct_tag_t tag, 
UCT_RC_MLX5_FILL_TM_IMM(imm, app_ctx, ib_imm, opcode, MLX5_OPCODE_SEND, _IMM); - UCT_RC_MLX5_IFACE_GET_TM_BCOPY_DESC(&iface->super, &iface->super.tx.mp, desc, - tag, app_ctx, pack_cb, arg, length); + UCT_RC_MLX5_IFACE_GET_TM_BCOPY_DESC(&iface->super, iface->tm.bcopy_mp, + desc, tag, app_ctx, pack_cb, arg, length); uct_rc_mlx5_txqp_bcopy_post(iface, &ep->super.txqp, &ep->tx.wq, opcode, sizeof(struct ibv_tmh) + length, @@ -758,7 +816,7 @@ ucs_status_t uct_rc_mlx5_ep_tag_eager_zcopy(uct_ep_h tl_ep, uct_tag_t tag, "uct_rc_mlx5_ep_tag_eager_zcopy"); UCT_RC_CHECK_ZCOPY_DATA(sizeof(struct ibv_tmh), uct_iov_total_length(iov, iovcnt), - iface->super.super.config.seg_size); + iface->tm.max_zcopy); UCT_RC_MLX5_FILL_TM_IMM(imm, app_ctx, ib_imm, opcode, MLX5_OPCODE_SEND, _IMM); @@ -767,9 +825,11 @@ ucs_status_t uct_rc_mlx5_ep_tag_eager_zcopy(uct_ep_h tl_ep, uct_tag_t tag, uct_iov_total_length(iov, iovcnt)); return uct_rc_mlx5_ep_zcopy_post(ep, opcode|UCT_RC_MLX5_OPCODE_FLAG_TM, - iov, iovcnt, 0, "", 0, 0, 0, + iov, iovcnt, 0ul, 0, "", 0, 0, 0, tag, app_ctx, ib_imm, - MLX5_WQE_CTRL_SOLICITED, comp); + MLX5_WQE_CTRL_SOLICITED, + uct_rc_ep_send_op_completion_handler, + comp); } ucs_status_ptr_t uct_rc_mlx5_ep_tag_rndv_zcopy(uct_ep_h tl_ep, uct_tag_t tag, @@ -788,7 +848,7 @@ ucs_status_ptr_t uct_rc_mlx5_ep_tag_rndv_zcopy(uct_ep_h tl_ep, uct_tag_t tag, UCT_IB_MLX5_AM_MAX_SHORT(0), iface->tm.max_rndv_data + UCT_RC_MLX5_TMH_PRIV_LEN); - UCT_RC_MLX5_CHECK_RES_PTR(iface, &ep->super); + UCT_RC_MLX5_CHECK_RES_PTR(iface, ep); op_index = uct_rc_mlx5_tag_get_op_id(iface, comp); @@ -820,70 +880,71 @@ ucs_status_t uct_rc_mlx5_ep_tag_rndv_request(uct_ep_h tl_ep, uct_tag_t tag, } #endif /* IBV_HW_TM */ -static ucs_status_t uct_rc_mlx5_ep_tag_qp_create(uct_rc_mlx5_iface_common_t *iface, - uct_rc_mlx5_ep_t *ep) +UCS_CLASS_INIT_FUNC(uct_rc_mlx5_ep_t, const uct_ep_params_t *params) { - struct ibv_qp_cap cap; + uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(params->iface, + 
uct_rc_mlx5_iface_common_t); + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.md, + uct_ib_mlx5_md_t); + uct_ib_mlx5_qp_attr_t attr = {}; ucs_status_t status; - int ret; - if (UCT_RC_MLX5_TM_ENABLED(iface)) { - /* Send queue of this QP will be used by FW for HW RNDV. Driver requires - * such a QP to be initialized with zero send queue length. */ - status = uct_rc_iface_qp_create(&iface->super, &ep->tm_qp, &cap, 0); - if (status != UCS_OK) { - return status; - } + /* Need to create QP before super constructor to get QP number */ + uct_rc_mlx5_iface_fill_attr(iface, &attr, iface->super.config.tx_qp_len, + &iface->rx.srq); + uct_ib_exp_qp_fill_attr(&iface->super.super, &attr.super); + status = uct_rc_mlx5_iface_create_qp(iface, &self->tx.wq.super, &self->tx.wq, &attr); + if (status != UCS_OK) { + return status; + } + + UCS_CLASS_CALL_SUPER_INIT(uct_rc_ep_t, &iface->super, + self->tx.wq.super.qp_num, params); - status = uct_rc_iface_qp_init(&iface->super, ep->tm_qp); + if (self->tx.wq.super.type == UCT_IB_MLX5_OBJ_TYPE_VERBS) { + status = uct_rc_iface_qp_init(&iface->super, self->tx.wq.super.verbs.qp); if (status != UCS_OK) { - ret = ibv_destroy_qp(ep->tm_qp); - if (ret) { - ucs_warn("ibv_destroy_qp() returned %d: %m", ret); - } - return status; + goto err; } - uct_rc_iface_add_qp(&iface->super, &ep->super, ep->tm_qp->qp_num); } - return UCS_OK; -} - -UCS_CLASS_INIT_FUNC(uct_rc_mlx5_ep_t, const uct_ep_params_t *params) -{ - uct_rc_mlx5_iface_common_t *iface; - ucs_status_t status; - iface = ucs_derived_of(params->iface, uct_rc_mlx5_iface_common_t); - UCS_CLASS_CALL_SUPER_INIT(uct_rc_ep_t, &iface->super); + uct_rc_iface_add_qp(&iface->super, &self->super, self->tx.wq.super.qp_num); - status = uct_rc_mlx5_ep_tag_qp_create(iface, self); - if (status != UCS_OK) { - return status; - } + if (UCT_RC_MLX5_TM_ENABLED(iface)) { + /* Send queue of this QP will be used by FW for HW RNDV. Driver requires + * such a QP to be initialized with zero send queue length. 
*/ + memset(&attr, 0, sizeof(attr)); + uct_rc_mlx5_iface_fill_attr(iface, &attr, 0, &iface->rx.srq); + uct_ib_exp_qp_fill_attr(&iface->super.super, &attr.super); + status = uct_rc_mlx5_iface_create_qp(iface, &self->tm_qp, NULL, &attr); + if (status != UCS_OK) { + goto err; + } - status = uct_ib_mlx5_txwq_init(iface->super.super.super.worker, - iface->tx.mmio_mode, &self->tx.wq, - self->super.txqp.qp); - if (status != UCS_OK) { - ucs_error("Failed to get mlx5 QP information"); - return status; + uct_rc_iface_add_qp(&iface->super, &self->super, self->tm_qp.qp_num); } - self->qp_num = self->super.txqp.qp->qp_num; self->tx.wq.bb_max = ucs_min(self->tx.wq.bb_max, iface->tx.bb_max); + self->mp.free = 1; uct_rc_txqp_available_set(&self->super.txqp, self->tx.wq.bb_max); return UCS_OK; + +err: + uct_ib_mlx5_destroy_qp(md, &self->tx.wq.super); + return status; } -static void uct_rc_mlx5_ep_clean_qp(uct_rc_mlx5_ep_t *ep, struct ibv_qp *qp) +static void uct_rc_mlx5_ep_clean_qp(uct_rc_mlx5_ep_t *ep, uct_ib_mlx5_qp_t *qp) { uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(ep->super.super.super.iface, uct_rc_mlx5_iface_common_t); + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.md, + uct_ib_mlx5_md_t); /* Make the HW generate CQEs for all in-progress SRQ receives from the QP, * so we clean them all before ibv_modify_qp() can see them. 
*/ -#if HAVE_DECL_IBV_CMD_MODIFY_QP +#if HAVE_DECL_IBV_CMD_MODIFY_QP && !HAVE_DEVX struct ibv_qp_attr qp_attr; struct ibv_modify_qp cmd; int ret; @@ -893,12 +954,12 @@ static void uct_rc_mlx5_ep_clean_qp(uct_rc_mlx5_ep_t *ep, struct ibv_qp *qp) */ memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.qp_state = IBV_QPS_RESET; - ret = ibv_cmd_modify_qp(qp, &qp_attr, IBV_QP_STATE, &cmd, sizeof(cmd)); + ret = ibv_cmd_modify_qp(qp->verbs.qp, &qp_attr, IBV_QP_STATE, &cmd, sizeof(cmd)); if (ret) { ucs_warn("modify qp 0x%x to RESET failed: %m", qp->qp_num); } #else - (void)uct_ib_modify_qp(qp, IBV_QPS_ERR); + (void)uct_ib_mlx5_modify_qp_state(md, qp, IBV_QPS_ERR); #endif iface->super.rx.srq.available += uct_rc_mlx5_iface_commom_clean( @@ -909,7 +970,7 @@ static void uct_rc_mlx5_ep_clean_qp(uct_rc_mlx5_ep_t *ep, struct ibv_qp *qp) * completions for this QP (both send and receive) during ibv_destroy_qp(). */ uct_rc_mlx5_iface_common_update_cqs_ci(iface, &iface->super.super); - (void)uct_ib_modify_qp(qp, IBV_QPS_RESET); + (void)uct_ib_mlx5_modify_qp_state(md, qp, IBV_QPS_RESET); uct_rc_mlx5_iface_common_sync_cqs_ci(iface, &iface->super.super); } @@ -917,19 +978,22 @@ static UCS_CLASS_CLEANUP_FUNC(uct_rc_mlx5_ep_t) { uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(self->super.super.super.iface, uct_rc_mlx5_iface_common_t); + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.md, + uct_ib_mlx5_md_t); uct_ib_mlx5_txwq_cleanup(&self->tx.wq); - uct_rc_mlx5_ep_clean_qp(self, self->super.txqp.qp); + uct_rc_mlx5_ep_clean_qp(self, &self->tx.wq.super); #if IBV_HW_TM if (UCT_RC_MLX5_TM_ENABLED(iface)) { - uct_rc_mlx5_ep_clean_qp(self, self->tm_qp); - uct_rc_iface_remove_qp(&iface->super, self->tm_qp->qp_num); - if (ibv_destroy_qp(self->tm_qp)) { - ucs_warn("failed to destroy TM RNDV QP: %m"); - } + uct_rc_mlx5_ep_clean_qp(self, &self->tm_qp); + uct_ib_mlx5_iface_put_res_domain(&self->tm_qp); + uct_rc_iface_remove_qp(&iface->super, self->tm_qp.qp_num); + 
uct_ib_mlx5_destroy_qp(md, &self->tm_qp); } #endif + ucs_assert(self->mp.free == 1); + /* Return all credits if user do flush(UCT_FLUSH_FLAG_CANCEL) before * ep_destroy. */ @@ -937,7 +1001,33 @@ static UCS_CLASS_CLEANUP_FUNC(uct_rc_mlx5_ep_t) self->tx.wq.bb_max - uct_rc_txqp_available(&self->super.txqp)); - uct_ib_mlx5_srq_cleanup(&iface->rx.srq, iface->super.rx.srq.srq); + uct_ib_mlx5_verbs_srq_cleanup(&iface->rx.srq, iface->rx.srq.verbs.srq); + + uct_rc_iface_remove_qp(&iface->super, self->tx.wq.super.qp_num); + uct_ib_mlx5_destroy_qp(md, &self->tx.wq.super); +} + +ucs_status_t uct_rc_mlx5_ep_handle_failure(uct_rc_mlx5_ep_t *ep, + ucs_status_t status) +{ + uct_ib_iface_t *ib_iface = ucs_derived_of(ep->super.super.super.iface, + uct_ib_iface_t); + uct_rc_iface_t *rc_iface = ucs_derived_of(ib_iface, uct_rc_iface_t); + + uct_rc_txqp_purge_outstanding(&ep->super.txqp, status, 0); + /* poll_cqe for mlx5 returns NULL in case of failure and the cq_avaialble + is not updated for the error cqe and all outstanding wqes*/ + rc_iface->tx.cq_available += ep->tx.wq.bb_max - + uct_rc_txqp_available(&ep->super.txqp); + return ib_iface->ops->set_ep_failed(ib_iface, &ep->super.super.super, + status); +} + +ucs_status_t uct_rc_mlx5_ep_set_failed(uct_ib_iface_t *iface, uct_ep_h ep, + ucs_status_t status) +{ + return uct_set_ep_failed(&UCS_CLASS_NAME(uct_rc_mlx5_ep_t), ep, + &iface->super.super, status); } UCS_CLASS_DEFINE(uct_rc_mlx5_ep_t, uct_rc_ep_t); diff --git a/src/uct/ib/rc/accel/rc_mlx5_iface.c b/src/uct/ib/rc/accel/rc_mlx5_iface.c index 06b4630a70e..87a99a8f01b 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_iface.c +++ b/src/uct/ib/rc/accel/rc_mlx5_iface.c @@ -4,9 +4,14 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include -#include +#include +#include #include #include #include @@ -15,12 +20,21 @@ #include "rc_mlx5.inl" +enum { + UCT_RC_MLX5_IFACE_ADDR_TYPE_BASIC, + + /* Tag Matching address. 
It additionaly contains QP number which + * is used for hardware offloads. */ + UCT_RC_MLX5_IFACE_ADDR_TYPE_TM +}; + + /** * RC mlx5 interface configuration */ typedef struct uct_rc_mlx5_iface_config { - uct_rc_mlx5_iface_common_config_t super; - uct_rc_fc_config_t fc; + uct_rc_iface_config_t super; + uct_rc_mlx5_iface_common_config_t rc_mlx5_common; /* TODO wc_mode, UAR mode SnB W/A... */ } uct_rc_mlx5_iface_config_t; @@ -28,11 +42,11 @@ typedef struct uct_rc_mlx5_iface_config { ucs_config_field_t uct_rc_mlx5_iface_config_table[] = { {"RC_", "", NULL, ucs_offsetof(uct_rc_mlx5_iface_config_t, super), - UCS_CONFIG_TYPE_TABLE(uct_rc_mlx5_common_config_table)}, + UCS_CONFIG_TYPE_TABLE(uct_rc_iface_config_table)}, - {"", "", NULL, - ucs_offsetof(uct_rc_mlx5_iface_config_t, fc), - UCS_CONFIG_TYPE_TABLE(uct_rc_fc_config_table)}, + {"RC_", "", NULL, + ucs_offsetof(uct_rc_mlx5_iface_config_t, rc_mlx5_common), + UCS_CONFIG_TYPE_TABLE(uct_rc_mlx5_common_config_table)}, {NULL} }; @@ -40,7 +54,7 @@ ucs_config_field_t uct_rc_mlx5_iface_config_table[] = { static uct_rc_iface_ops_t uct_rc_mlx5_iface_ops; -#if ENABLE_STATS +#ifdef ENABLE_STATS ucs_stats_class_t uct_rc_mlx5_iface_stats_class = { .name = "mlx5", .num_counters = UCT_RC_MLX5_IFACE_STAT_LAST, @@ -69,7 +83,8 @@ void uct_rc_mlx5_iface_check_rx_completion(uct_rc_mlx5_iface_common_t *iface, wqe_ctr = ntohs(ecqe->wqe_counter); seg = uct_ib_mlx5_srq_get_wqe(&iface->rx.srq, wqe_ctr); ++cq->cq_ci; - uct_rc_mlx5_iface_release_srq_seg(iface, seg, wqe_ctr, UCS_OK, + /* TODO: Check if ib_stride_index valid for error CQE */ + uct_rc_mlx5_iface_release_srq_seg(iface, seg, cqe, wqe_ctr, UCS_OK, iface->super.super.config.rx_headroom_offset, &iface->super.super.release_desc); } else { @@ -96,12 +111,11 @@ uct_rc_mlx5_iface_poll_tx(uct_rc_mlx5_iface_common_t *iface) ucs_memory_cpu_load_fence(); qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER); - ep = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, qp_num), 
uct_rc_mlx5_ep_t); + ep = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, qp_num), + uct_rc_mlx5_ep_t); /* TODO: temporary workaround for uct_ep_flush(cancel) case when EP has been * destroyed but successful CQE was not polled out from the CQ */ if (ucs_unlikely(ep == NULL)) { - ucs_debug(UCT_IB_IFACE_FMT": qp_num %x not found", - UCT_IB_IFACE_ARG(&iface->super.super), qp_num); return 1; } @@ -111,10 +125,14 @@ uct_rc_mlx5_iface_poll_tx(uct_rc_mlx5_iface_common_t *iface) uct_rc_mlx5_common_update_tx_res(&iface->super, &ep->tx.wq, &ep->super.txqp, hw_ci); - uct_rc_mlx5_txqp_process_tx_cqe(&ep->super.txqp, cqe, hw_ci); + /* process pending elements prior to CQ entries to avoid out-of-order + * transmission in completion callbacks */ ucs_arbiter_group_schedule(&iface->super.tx.arbiter, &ep->super.arb_group); - ucs_arbiter_dispatch(&iface->super.tx.arbiter, 1, uct_rc_ep_process_pending, NULL); + ucs_arbiter_dispatch(&iface->super.tx.arbiter, 1, uct_rc_ep_process_pending, + NULL); + + uct_rc_mlx5_txqp_process_tx_cqe(&ep->super.txqp, cqe, hw_ci); return 1; } @@ -124,7 +142,7 @@ unsigned uct_rc_mlx5_iface_progress(void *arg) uct_rc_mlx5_iface_common_t *iface = arg; unsigned count; - count = uct_rc_mlx5_iface_common_poll_rx(iface, 0); + count = uct_rc_mlx5_iface_common_poll_rx(iface, UCT_RC_MLX5_POLL_FLAG_HAS_EP); if (count > 0) { return count; } @@ -139,7 +157,7 @@ static ucs_status_t uct_rc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr size_t max_put_inline = UCT_IB_MLX5_PUT_MAX_SHORT(0); ucs_status_t status; -#if HAVE_IBV_EXP_DM +#if HAVE_IBV_DM if (iface->dm.dm != NULL) { max_am_inline = ucs_max(iface->dm.dm->seg_len, UCT_IB_MLX5_AM_MAX_SHORT(0)); max_put_inline = ucs_max(iface->dm.dm->seg_len, UCT_IB_MLX5_PUT_MAX_SHORT(0)); @@ -151,15 +169,17 @@ static ucs_status_t uct_rc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr max_am_inline, UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(0), UCT_IB_MLX5_AM_ZCOPY_MAX_IOV, - UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(0), - 
sizeof(uct_rc_mlx5_hdr_t)); + sizeof(uct_rc_mlx5_hdr_t), + UCT_RC_MLX5_RMA_MAX_IOV(0)); if (status != UCS_OK) { return status; } - uct_rc_mlx5_iface_common_query(&rc_iface->super, iface_attr, max_am_inline, 0); - iface_attr->latency.growth += 1e-9; /* 1 ns per each extra QP */ - iface_attr->ep_addr_len = sizeof(uct_rc_mlx5_ep_address_t); + uct_rc_mlx5_iface_common_query(&rc_iface->super, iface_attr, max_am_inline, + UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(0)); + iface_attr->latency.m += 1e-9; /* 1 ns per each extra QP */ + iface_attr->ep_addr_len = sizeof(uct_rc_mlx5_ep_address_t); + iface_attr->iface_addr_len = sizeof(uint8_t); return UCS_OK; } @@ -181,15 +201,16 @@ static void uct_rc_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface, void *arg, ucs_status_t status) { - struct mlx5_cqe64 *cqe = arg; - uct_rc_iface_t *iface = ucs_derived_of(ib_iface, uct_rc_iface_t); - unsigned qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER); - uct_rc_mlx5_ep_t *ep = ucs_derived_of(uct_rc_iface_lookup_ep(iface, qp_num), - uct_rc_mlx5_ep_t); - ucs_log_level_t log_lvl = UCS_LOG_LEVEL_FATAL; + struct mlx5_cqe64 *cqe = arg; + uct_rc_iface_t *iface = ucs_derived_of(ib_iface, uct_rc_iface_t); + unsigned qp_num = ntohl(cqe->sop_drop_qpn) & + UCS_MASK(UCT_IB_QPN_ORDER); + uct_rc_mlx5_ep_t *ep = ucs_derived_of(uct_rc_iface_lookup_ep(iface, + qp_num), + uct_rc_mlx5_ep_t); + ucs_log_level_t log_lvl = UCS_LOG_LEVEL_FATAL; uct_ib_mlx5_txwq_t txwq_copy; - size_t txwq_size; - ucs_status_t ep_status; + size_t txwq_size; if (!ep) { return; @@ -198,21 +219,14 @@ uct_rc_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface, void *arg, /* Create a copy of RC txwq for completion error reporting, since the QP * would be released by set_ep_failed()*/ txwq_copy = ep->tx.wq; - txwq_size = ep->tx.wq.qend - ep->tx.wq.qstart; + txwq_size = UCS_PTR_BYTE_DIFF(ep->tx.wq.qstart, ep->tx.wq.qend); txwq_copy.qstart = ucs_malloc(txwq_size, "rc_txwq_copy"); if (txwq_copy.qstart != NULL) { 
memcpy(txwq_copy.qstart, ep->tx.wq.qstart, txwq_size); - txwq_copy.qend = txwq_copy.qstart + txwq_size; + txwq_copy.qend = UCS_PTR_BYTE_OFFSET(txwq_copy.qstart, txwq_size); } - uct_rc_txqp_purge_outstanding(&ep->super.txqp, status, 0); - /* poll_cqe for mlx5 returns NULL in case of failure and the cq_avaialble - is not updated for the error cqe and all outstanding wqes*/ - iface->tx.cq_available += ep->tx.wq.bb_max - - uct_rc_txqp_available(&ep->super.txqp); - ep_status = ib_iface->ops->set_ep_failed(ib_iface, &ep->super.super.super, - status); - if (ep_status == UCS_OK) { + if (uct_rc_mlx5_ep_handle_failure(ep, status) == UCS_OK) { log_lvl = ib_iface->super.config.failure_level; } @@ -222,13 +236,6 @@ uct_rc_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface, void *arg, ucs_free(txwq_copy.qstart); } -static ucs_status_t uct_rc_mlx5_ep_set_failed(uct_ib_iface_t *iface, - uct_ep_h ep, ucs_status_t status) -{ - return uct_set_ep_failed(&UCS_CLASS_NAME(uct_rc_mlx5_ep_t), ep, - &iface->super.super, status); -} - static void uct_rc_mlx5_iface_progress_enable(uct_iface_h tl_iface, unsigned flags) { uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_iface, uct_rc_mlx5_iface_common_t); @@ -241,50 +248,100 @@ static void uct_rc_mlx5_iface_progress_enable(uct_iface_h tl_iface, unsigned fla iface->super.progress, flags); } -static ucs_status_t uct_rc_mlx5_iface_create_qp(uct_ib_iface_t *ib_iface, - uct_ib_qp_attr_t *attr, - struct ibv_qp **qp_p) +ucs_status_t uct_rc_mlx5_iface_create_qp(uct_rc_mlx5_iface_common_t *iface, + uct_ib_mlx5_qp_t *qp, + uct_ib_mlx5_txwq_t *txwq, + uct_ib_mlx5_qp_attr_t *attr) { - uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(ib_iface, uct_rc_mlx5_iface_common_t); + uct_ib_iface_t *ib_iface = &iface->super.super; + ucs_status_t status; + uct_ib_mlx5_md_t *md = ucs_derived_of(ib_iface->super.md, + uct_ib_mlx5_md_t); #if HAVE_DECL_MLX5DV_CREATE_QP - uct_ib_device_t *dev = uct_ib_iface_device(ib_iface); + uct_ib_device_t *dev = &md->super.dev; 
struct mlx5dv_qp_init_attr dv_attr = {}; - struct ibv_qp *qp; - uct_ib_iface_fill_attr(ib_iface, attr); - uct_ib_mlx5_iface_fill_attr(ib_iface, &iface->mlx5_common, attr); + if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_RC_QP) { + attr->mmio_mode = iface->tx.mmio_mode; + status = uct_ib_mlx5_devx_create_qp(ib_iface, qp, txwq, attr); + if (status != UCS_OK) { + return status; + } + + status = uct_rc_mlx5_devx_iface_subscribe_event(iface, qp, + UCT_IB_MLX5_EVENT_TYPE_SRQ_LAST_WQE, + IBV_EVENT_QP_LAST_WQE_REACHED, qp->qp_num); + if (status != UCS_OK) { + goto err_destory_qp; + } + + return UCS_OK; + } + + status = uct_ib_mlx5_iface_fill_attr(ib_iface, qp, attr); + if (status != UCS_OK) { + return status; + } + + uct_ib_iface_fill_attr(ib_iface, &attr->super); #if HAVE_DECL_MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE dv_attr.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS; dv_attr.create_flags = MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE; #endif - qp = mlx5dv_create_qp(dev->ibv_context, &attr->ibv, &dv_attr); - if (qp == NULL) { - ucs_error("iface=%p: failed to create QP: %m", iface); - return UCS_ERR_IO_ERROR; + qp->verbs.qp = mlx5dv_create_qp(dev->ibv_context, &attr->super.ibv, &dv_attr); + if (qp->verbs.qp == NULL) { + ucs_error("mlx5dv_create_qp("UCT_IB_IFACE_FMT"): failed: %m", + UCT_IB_IFACE_ARG(ib_iface)); + status = UCS_ERR_IO_ERROR; + goto err; } - attr->cap = attr->ibv.cap; - *qp_p = qp; - - return UCS_OK; + qp->qp_num = qp->verbs.qp->qp_num; #else - return uct_ib_mlx5_iface_create_qp(ib_iface, &iface->mlx5_common, attr, qp_p); + status = uct_ib_mlx5_iface_create_qp(ib_iface, qp, attr); + if (status != UCS_OK) { + goto err; + } #endif + + status = uct_rc_iface_qp_init(&iface->super, qp->verbs.qp); + if (status != UCS_OK) { + goto err_destory_qp; + } + + if (attr->super.cap.max_send_wr) { + status = uct_ib_mlx5_txwq_init(iface->super.super.super.worker, + iface->tx.mmio_mode, txwq, + qp->verbs.qp); + if (status != UCS_OK) { + ucs_error("Failed to get mlx5 QP 
information"); + goto err_destory_qp; + } + } + + return UCS_OK; + +err_destory_qp: + uct_ib_mlx5_destroy_qp(md, qp); +err: + return status; } -#if IBV_HW_TM -static unsigned uct_rc_mlx5_iface_progress_tm(void *arg) +static UCS_F_MAYBE_UNUSED unsigned uct_rc_mlx5_iface_progress_tm(void *arg) { uct_rc_mlx5_iface_common_t *iface = arg; unsigned count; - count = uct_rc_mlx5_iface_common_poll_rx(iface, 1); + count = uct_rc_mlx5_iface_common_poll_rx(iface, + UCT_RC_MLX5_POLL_FLAG_HAS_EP | + UCT_RC_MLX5_POLL_FLAG_TM); if (count > 0) { return count; } return uct_rc_mlx5_iface_poll_tx(iface); } +#if IBV_HW_TM static ucs_status_t uct_rc_mlx5_iface_tag_recv_zcopy(uct_iface_h tl_iface, uct_tag_t tag, uct_tag_t tag_mask, @@ -308,88 +365,179 @@ static ucs_status_t uct_rc_mlx5_iface_tag_recv_cancel(uct_iface_h tl_iface, } #endif -static void uct_rc_mlx5_iface_preinit(uct_rc_mlx5_iface_common_t *iface, uct_md_h md, - uct_rc_mlx5_iface_common_config_t *config, - const uct_iface_params_t *params, - uct_ib_iface_init_attr_t *init_attr) +static ucs_status_t uct_rc_mlx5_iface_preinit(uct_rc_mlx5_iface_common_t *iface, + uct_md_h tl_md, + uct_rc_iface_common_config_t *rc_config, + uct_rc_mlx5_iface_common_config_t *mlx5_config, + const uct_iface_params_t *params, + uct_ib_iface_init_attr_t *init_attr) { + uct_ib_mlx5_md_t *md = ucs_derived_of(tl_md, uct_ib_mlx5_md_t); #if IBV_HW_TM - uct_ib_device_t UCS_V_UNUSED *dev = &ucs_derived_of(md, uct_ib_md_t)->dev; - uint32_t cap_flags = IBV_DEVICE_TM_FLAGS(dev); + uct_ib_device_t UCS_V_UNUSED *dev = &md->super.dev; struct ibv_tmh tmh; + int mtu; + int tm_params; + ucs_status_t status; +#endif - iface->tm.enabled = config->tm.enable && - (cap_flags & init_attr->tm_cap_bit); + iface->config.cyclic_srq_enable = mlx5_config->cyclic_srq_enable; +#if IBV_HW_TM + /* Both eager and rndv callbacks should be provided for + * tag matching support */ + tm_params = ucs_test_all_flags(params->field_mask, + UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_CB | + 
UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_CB); + + iface->tm.enabled = mlx5_config->tm.enable && tm_params && + (init_attr->flags & UCT_IB_TM_SUPPORTED); if (!iface->tm.enabled) { goto out_tm_disabled; } - /* Compile-time check that THM and uct_rc_mlx5_hdr_t are wire-compatible for the - * case of no-tag protocol. + /* Compile-time check that THM and uct_rc_mlx5_hdr_t are wire-compatible + * for the case of no-tag protocol. */ - UCS_STATIC_ASSERT(sizeof(tmh.opcode) == sizeof(((uct_rc_mlx5_hdr_t*)0)->tmh_opcode)); + UCS_STATIC_ASSERT(sizeof(tmh.opcode) == + sizeof(((uct_rc_mlx5_hdr_t*)0)->tmh_opcode)); UCS_STATIC_ASSERT(ucs_offsetof(struct ibv_tmh, opcode) == ucs_offsetof(uct_rc_mlx5_hdr_t, tmh_opcode)); UCS_STATIC_ASSERT(sizeof(uct_rc_mlx5_ctx_priv_t) <= UCT_TAG_PRIV_LEN); - iface->tm.eager_unexp.cb = (params->field_mask & - UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_CB) ? - params->eager_cb : NULL; + iface->tm.eager_unexp.cb = params->eager_cb; + iface->tm.rndv_unexp.cb = params->rndv_cb; iface->tm.eager_unexp.arg = (params->field_mask & UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_ARG) ? params->eager_arg : NULL; - iface->tm.rndv_unexp.cb = (params->field_mask & - UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_CB) ? - params->rndv_cb : NULL; iface->tm.rndv_unexp.arg = (params->field_mask & UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_ARG) ? 
params->rndv_arg : NULL; iface->tm.unexpected_cnt = 0; iface->tm.num_outstanding = 0; iface->tm.num_tags = ucs_min(IBV_DEVICE_TM_CAPS(dev, max_num_tags), - config->tm.list_size); + mlx5_config->tm.list_size); /* There can be: * - up to rx.queue_len RX CQEs * - up to 3 CQEs for every posted tag: ADD, TM_CONSUMED and MSG_ARRIVED * - one SYNC CQE per every IBV_DEVICE_MAX_UNEXP_COUNT unexpected receives */ UCS_STATIC_ASSERT(IBV_DEVICE_MAX_UNEXP_COUNT); - init_attr->rx_cq_len = config->super.super.rx.queue_len + iface->tm.num_tags * 3 + - config->super.super.rx.queue_len / - IBV_DEVICE_MAX_UNEXP_COUNT; - init_attr->seg_size = ucs_max(config->tm.max_bcopy, - config->super.super.super.max_bcopy); - return; + init_attr->cq_len[UCT_IB_DIR_RX] = rc_config->super.rx.queue_len + + iface->tm.num_tags * 3 + + rc_config->super.rx.queue_len / + IBV_DEVICE_MAX_UNEXP_COUNT; + init_attr->seg_size = ucs_max(mlx5_config->tm.seg_size, + rc_config->super.seg_size); + iface->tm.mp.num_strides = 1; + iface->tm.max_bcopy = init_attr->seg_size; + + if (mlx5_config->tm.mp_enable == UCS_NO) { + return UCS_OK; + } + + /* Multi-Packet XRQ initialization */ + if (!ucs_test_all_flags(md->flags, UCT_IB_MLX5_MD_FLAG_MP_RQ | + UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ | + UCT_IB_MLX5_MD_FLAG_DEVX_RC_QP)) { + goto out_mp_disabled; + } + + if ((mlx5_config->tm.mp_num_strides != 8) && + (mlx5_config->tm.mp_num_strides != 16)) { + ucs_error("invalid value of TM_MP_NUM_STRIDES: %lu, must be 8 or 16", + mlx5_config->tm.mp_num_strides); + return UCS_ERR_INVALID_PARAM; + } else { + iface->tm.mp.num_strides = mlx5_config->tm.mp_num_strides; + } + + status = uct_ib_device_mtu(params->mode.device.dev_name, tl_md, &mtu); + if (status != UCS_OK) { + ucs_error("failed to get port MTU: %s", ucs_status_string(status)); + return UCS_ERR_IO_ERROR; + } + + init_attr->seg_size = mtu; + + return UCS_OK; out_tm_disabled: #else - iface->tm.enabled = 0; + iface->tm.enabled = 0; #endif - init_attr->rx_cq_len = 
config->super.super.rx.queue_len; - init_attr->seg_size = config->super.super.super.max_bcopy; + init_attr->cq_len[UCT_IB_DIR_RX] = rc_config->super.rx.queue_len; + init_attr->seg_size = rc_config->super.seg_size; + iface->tm.mp.num_strides = 1; + +#if IBV_HW_TM +out_mp_disabled: +#endif + if (mlx5_config->tm.mp_enable == UCS_YES) { + ucs_error("%s: MP SRQ is requested, but not supported: (md flags 0x%x), " + "hardware tag-matching is %s", + uct_ib_device_name(&md->super.dev), md->flags, + iface->tm.enabled ? "enabled" : "disabled"); + return UCS_ERR_INVALID_PARAM; + } + + return UCS_OK; } static ucs_status_t -uct_rc_mlx5_init_rx(uct_rc_iface_t *rc_iface, - const uct_rc_iface_config_t *rc_config) +uct_rc_mlx5_iface_init_rx(uct_rc_iface_t *rc_iface, + const uct_rc_iface_common_config_t *rc_config) { - uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(rc_iface, uct_rc_mlx5_iface_common_t); -#if IBV_HW_TM - uct_rc_mlx5_iface_common_config_t *config = ucs_derived_of(rc_config, - uct_rc_mlx5_iface_common_config_t); + uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(rc_iface, + uct_rc_mlx5_iface_common_t); + uct_ib_mlx5_md_t *md = ucs_derived_of(rc_iface->super.super.md, + uct_ib_mlx5_md_t); + struct ibv_srq_init_attr_ex srq_attr = {}; + ucs_status_t status; if (UCT_RC_MLX5_TM_ENABLED(iface)) { - struct ibv_exp_create_srq_attr srq_init_attr = {}; + if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ) { + status = uct_rc_mlx5_devx_init_rx_tm(iface, rc_config, 0, + UCT_RC_RNDV_HDR_LEN); + } else { + status = uct_rc_mlx5_init_rx_tm(iface, rc_config, &srq_attr, + UCT_RC_RNDV_HDR_LEN); + } + + if (status != UCS_OK) { + return status; + } iface->super.progress = uct_rc_mlx5_iface_progress_tm; - return uct_rc_mlx5_init_rx_tm(iface, config, &srq_init_attr, - sizeof(struct ibv_rvh), 0); + return UCS_OK; } -#endif + + /* MP XRQ is supported with HW TM only */ + ucs_assert(!UCT_RC_MLX5_MP_ENABLED(iface)); + + if (ucs_test_all_flags(md->flags, UCT_IB_MLX5_MD_FLAG_RMP | + 
UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ)) { + status = uct_rc_mlx5_devx_init_rx(iface, rc_config); + } else { + status = uct_rc_mlx5_common_iface_init_rx(iface, rc_config); + } + + if (status != UCS_OK) { + return status; + } + iface->super.progress = uct_rc_mlx5_iface_progress; - return uct_rc_iface_init_rx(rc_iface, rc_config); + return UCS_OK; +} + +static void uct_rc_mlx5_iface_cleanup_rx(uct_rc_iface_t *rc_iface) +{ + uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(rc_iface, uct_rc_mlx5_iface_common_t); + uct_ib_mlx5_md_t *md = ucs_derived_of(rc_iface->super.super.md, + uct_ib_mlx5_md_t); + + uct_rc_mlx5_destroy_srq(md, &iface->rx.srq); } static void uct_rc_mlx5_iface_event_cq(uct_ib_iface_t *ib_iface, @@ -400,33 +548,79 @@ static void uct_rc_mlx5_iface_event_cq(uct_ib_iface_t *ib_iface, iface->cq[dir].cq_sn++; } +static uint8_t uct_rc_mlx5_iface_get_address_type(uct_iface_h tl_iface) +{ + uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_iface, + uct_rc_mlx5_iface_common_t); + + return UCT_RC_MLX5_TM_ENABLED(iface) ? 
UCT_RC_MLX5_IFACE_ADDR_TYPE_TM : + UCT_RC_MLX5_IFACE_ADDR_TYPE_BASIC; +} + +static ucs_status_t uct_rc_mlx5_iface_get_address(uct_iface_h tl_iface, + uct_iface_addr_t *addr) +{ + *(uint8_t*)addr = uct_rc_mlx5_iface_get_address_type(tl_iface); + + return UCS_OK; +} + +int uct_rc_mlx5_iface_is_reachable(const uct_iface_h tl_iface, + const uct_device_addr_t *dev_addr, + const uct_iface_addr_t *iface_addr) +{ + uint8_t my_type = uct_rc_mlx5_iface_get_address_type(tl_iface); + + if ((iface_addr != NULL) && (my_type != *(uint8_t*)iface_addr)) { + return 0; + } + + return uct_ib_iface_is_reachable(tl_iface, dev_addr, iface_addr); +} + UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_common_t, uct_rc_iface_ops_t *ops, - uct_md_h md, uct_worker_h worker, + uct_md_h tl_md, uct_worker_h worker, const uct_iface_params_t *params, - uct_rc_mlx5_iface_common_config_t *config, + uct_rc_iface_common_config_t *rc_config, + uct_rc_mlx5_iface_common_config_t *mlx5_config, uct_ib_iface_init_attr_t *init_attr) { + uct_ib_mlx5_md_t *md = ucs_derived_of(tl_md, uct_ib_mlx5_md_t); + uct_ib_device_t *dev; ucs_status_t status; - uct_rc_mlx5_iface_preinit(self, md, config, params, init_attr); + status = uct_rc_mlx5_iface_preinit(self, tl_md, rc_config, mlx5_config, + params, init_attr); + if (status != UCS_OK) { + return status; + } + + self->rx.srq.type = UCT_IB_MLX5_OBJ_TYPE_LAST; + self->tm.cmd_wq.super.super.type = UCT_IB_MLX5_OBJ_TYPE_LAST; + init_attr->rx_hdr_len = UCT_RC_MLX5_MP_ENABLED(self) ? 
+ 0 : sizeof(uct_rc_mlx5_hdr_t); - UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, ops, md, worker, params, - &config->super, init_attr); + UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, ops, tl_md, worker, params, + rc_config, init_attr); - self->tx.mmio_mode = config->mlx5_common.mmio_mode; - self->tx.bb_max = ucs_min(config->tx_max_bb, UINT16_MAX); - self->super.config.tx_moderation = ucs_min(self->super.config.tx_moderation, - self->tx.bb_max / 4); - self->tx.next_fm = 0; - self->tx.fence_beat = 0; + dev = uct_ib_iface_device(&self->super.super); + self->tx.mmio_mode = mlx5_config->super.mmio_mode; + self->tx.bb_max = ucs_min(mlx5_config->tx_max_bb, UINT16_MAX); + self->tm.am_desc.super.cb = uct_rc_mlx5_release_desc; - status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_TX], &self->cq[UCT_IB_DIR_TX]); + if (!UCT_RC_MLX5_MP_ENABLED(self)) { + self->tm.am_desc.offset = self->super.super.config.rx_headroom_offset; + } + + status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_TX], + &self->cq[UCT_IB_DIR_TX]); if (status != UCS_OK) { return status; } - status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_RX], &self->cq[UCT_IB_DIR_RX]); + status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_RX], + &self->cq[UCT_IB_DIR_RX]); if (status != UCS_OK) { return status; } @@ -437,24 +631,41 @@ UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_common_t, return status; } - status = uct_rc_mlx5_iface_common_tag_init(self, config); + status = uct_rc_mlx5_iface_common_tag_init(self); if (status != UCS_OK) { goto cleanup_stats; } - status = uct_ib_mlx5_srq_init(&self->rx.srq, self->super.rx.srq.srq, - self->super.super.config.seg_size); + status = uct_rc_mlx5_iface_common_dm_init(self, &self->super, + &mlx5_config->super); if (status != UCS_OK) { goto cleanup_tm; } - status = uct_rc_mlx5_iface_common_dm_init(self, &self->super, &config->mlx5_common); - if (status != UCS_OK) { + self->super.config.fence_mode = (uct_rc_fence_mode_t)rc_config->fence_mode; + 
self->super.rx.srq.quota = self->rx.srq.mask + 1; + self->super.config.exp_backoff = mlx5_config->exp_backoff; + + if ((rc_config->fence_mode == UCT_RC_FENCE_MODE_WEAK) || + ((rc_config->fence_mode == UCT_RC_FENCE_MODE_AUTO) && + (uct_ib_device_has_pci_atomics(dev) || md->super.relaxed_order))) { + if (uct_ib_device_has_pci_atomics(dev)) { + self->config.atomic_fence_flag = UCT_IB_MLX5_WQE_CTRL_FLAG_FENCE; + } else { + self->config.atomic_fence_flag = 0; + } + self->super.config.fence_mode = UCT_RC_FENCE_MODE_WEAK; + } else if ((rc_config->fence_mode == UCT_RC_FENCE_MODE_NONE) || + ((rc_config->fence_mode == UCT_RC_FENCE_MODE_AUTO) && + !uct_ib_device_has_pci_atomics(dev))) { + self->config.atomic_fence_flag = 0; + self->super.config.fence_mode = UCT_RC_FENCE_MODE_NONE; + } else { + ucs_error("incorrect fence value: %d", self->super.config.fence_mode); + status = UCS_ERR_INVALID_PARAM; goto cleanup_tm; } - self->super.rx.srq.quota = self->rx.srq.mask + 1; - /* By default set to something that is always in cache */ self->rx.pref_ptr = self; @@ -463,7 +674,7 @@ UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_common_t, sizeof(uct_rc_iface_send_desc_t) + UCT_IB_MAX_ATOMIC_SIZE, sizeof(uct_rc_iface_send_desc_t) + UCT_IB_MAX_ATOMIC_SIZE, UCS_SYS_CACHE_LINE_SIZE, - &config->super.super.tx.mp, + &rc_config->super.tx.mp, self->super.config.tx_qp_len, uct_rc_iface_send_desc_init, "rc_mlx5_atomic_desc"); @@ -471,6 +682,13 @@ UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_common_t, goto cleanup_dm; } +#if HAVE_DEVX + status = uct_rc_mlx5_devx_iface_init_events(self); + if (status != UCS_OK) { + goto cleanup_dm; + } +#endif + /* For little-endian atomic reply, override the default functions, to still * treat the response as big-endian when it arrives in the CQE. 
*/ @@ -497,6 +715,9 @@ UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_common_t, static UCS_CLASS_CLEANUP_FUNC(uct_rc_mlx5_iface_common_t) { +#if HAVE_DEVX + uct_rc_mlx5_devx_iface_free_events(self); +#endif ucs_mpool_cleanup(&self->tx.atomic_desc_mp, 1); uct_rc_mlx5_iface_common_dm_cleanup(self); uct_rc_mlx5_iface_common_tag_cleanup(self); @@ -510,36 +731,38 @@ typedef struct { } uct_rc_mlx5_iface_t; UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_t, - uct_md_h md, uct_worker_h worker, + uct_md_h tl_md, uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *tl_config) { uct_rc_mlx5_iface_config_t *config = ucs_derived_of(tl_config, uct_rc_mlx5_iface_config_t); + uct_ib_mlx5_md_t UCS_V_UNUSED *md = ucs_derived_of(tl_md, uct_ib_mlx5_md_t); uct_ib_iface_init_attr_t init_attr = {}; ucs_status_t status; - init_attr.tm_cap_bit = IBV_TM_CAP_RC; - init_attr.fc_req_size = sizeof(uct_rc_fc_request_t); - init_attr.flags = UCT_IB_CQ_IGNORE_OVERRUN; - init_attr.rx_hdr_len = sizeof(uct_rc_mlx5_hdr_t); - init_attr.qp_type = IBV_QPT_RC; + init_attr.fc_req_size = sizeof(uct_rc_fc_request_t); + init_attr.flags = UCT_IB_CQ_IGNORE_OVERRUN; + init_attr.rx_hdr_len = sizeof(uct_rc_mlx5_hdr_t); + init_attr.cq_len[UCT_IB_DIR_TX] = config->super.tx_cq_len; + init_attr.qp_type = IBV_QPT_RC; + + if (IBV_DEVICE_TM_FLAGS(&md->super.dev)) { + init_attr.flags |= UCT_IB_TM_SUPPORTED; + } UCS_CLASS_CALL_SUPER_INIT(uct_rc_mlx5_iface_common_t, &uct_rc_mlx5_iface_ops, - md, worker, params, &config->super, &init_attr); + tl_md, worker, params, &config->super.super, + &config->rc_mlx5_common, &init_attr); + + self->super.super.config.tx_moderation = ucs_min(config->super.tx_cq_moderation, + self->super.tx.bb_max / 4); - status = uct_rc_init_fc_thresh(&config->fc, &config->super.super, &self->super.super); + status = uct_rc_init_fc_thresh(&config->super, &self->super.super); if (status != UCS_OK) { return status; } - /* Set max_iov for put_zcopy and get_zcopy */ - 
uct_ib_iface_set_max_iov(&self->super.super.super, - (UCT_IB_MLX5_MAX_SEND_WQE_SIZE - - sizeof(struct mlx5_wqe_raddr_seg) - - sizeof(struct mlx5_wqe_ctrl_seg)) / - sizeof(struct mlx5_wqe_data_seg)); - return UCS_OK; } @@ -593,7 +816,7 @@ static uct_rc_iface_ops_t uct_rc_mlx5_iface_ops = { .iface_tag_recv_cancel = uct_rc_mlx5_iface_tag_recv_cancel, #endif .iface_flush = uct_rc_iface_flush, - .iface_fence = uct_rc_mlx5_iface_fence, + .iface_fence = uct_rc_iface_fence, .iface_progress_enable = uct_rc_mlx5_iface_progress_enable, .iface_progress_disable = uct_base_iface_progress_disable, .iface_progress = uct_rc_iface_do_progress, @@ -601,42 +824,35 @@ static uct_rc_iface_ops_t uct_rc_mlx5_iface_ops = { .iface_event_arm = uct_rc_iface_event_arm, .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_rc_mlx5_iface_t), .iface_query = uct_rc_mlx5_iface_query, - .iface_get_address = uct_rc_iface_get_address, + .iface_get_address = uct_rc_mlx5_iface_get_address, .iface_get_device_address = uct_ib_iface_get_device_address, - .iface_is_reachable = uct_rc_iface_is_reachable + .iface_is_reachable = uct_rc_mlx5_iface_is_reachable }, .create_cq = uct_ib_mlx5_create_cq, .arm_cq = uct_rc_mlx5_iface_arm_cq, .event_cq = uct_rc_mlx5_iface_event_cq, .handle_failure = uct_rc_mlx5_iface_handle_failure, .set_ep_failed = uct_rc_mlx5_ep_set_failed, - .create_qp = uct_rc_mlx5_iface_create_qp, - .init_res_domain = uct_rc_mlx5_init_res_domain, - .cleanup_res_domain = uct_rc_mlx5_cleanup_res_domain, }, - .init_rx = uct_rc_mlx5_init_rx, + .init_rx = uct_rc_mlx5_iface_init_rx, + .cleanup_rx = uct_rc_mlx5_iface_cleanup_rx, .fc_ctrl = uct_rc_mlx5_ep_fc_ctrl, .fc_handler = uct_rc_iface_fc_handler, }; - -static ucs_status_t uct_rc_mlx5_query_resources(uct_md_h md, - uct_tl_resource_desc_t **resources_p, - unsigned *num_resources_p) +static ucs_status_t +uct_rc_mlx5_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) { uct_ib_md_t *ib_md = ucs_derived_of(md, 
uct_ib_md_t); + int flags; - return uct_ib_device_query_tl_resources(&ib_md->dev, "rc_mlx5", - UCT_IB_DEVICE_FLAG_MLX5_PRM | - (ib_md->config.eth_pause ? 0 : UCT_IB_DEVICE_FLAG_LINK_IB), - resources_p, num_resources_p); + flags = UCT_IB_DEVICE_FLAG_MLX5_PRM | + (ib_md->config.eth_pause ? 0 : UCT_IB_DEVICE_FLAG_LINK_IB); + return uct_ib_device_query_ports(&ib_md->dev, flags, tl_devices_p, + num_tl_devices_p); } -UCT_TL_COMPONENT_DEFINE(uct_rc_mlx5_tl, - uct_rc_mlx5_query_resources, - uct_rc_mlx5_iface_t, - "rc_mlx5", - "RC_MLX5_", - uct_rc_mlx5_iface_config_table, - uct_rc_mlx5_iface_config_t); -UCT_MD_REGISTER_TL(&uct_ib_mdc, &uct_rc_mlx5_tl); +UCT_TL_DEFINE(&uct_ib_component, rc_mlx5, uct_rc_mlx5_query_tl_devices, + uct_rc_mlx5_iface_t, "RC_MLX5_", uct_rc_mlx5_iface_config_table, + uct_rc_mlx5_iface_config_t); diff --git a/src/uct/ib/rc/base/rc_ep.c b/src/uct/ib/rc/base/rc_ep.c index 96659c420d3..f3970f793ab 100644 --- a/src/uct/ib/rc/base/rc_ep.c +++ b/src/uct/ib/rc/base/rc_ep.c @@ -5,6 +5,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rc_ep.h" #include "rc_iface.h" @@ -14,7 +18,7 @@ #include #include -#if ENABLE_STATS +#ifdef ENABLE_STATS static ucs_stats_class_t uct_rc_fc_stats_class = { .name = "rc_fc", .num_counters = UCT_RC_FC_STAT_LAST, @@ -43,46 +47,21 @@ static ucs_stats_class_t uct_rc_txqp_stats_class = { #endif ucs_status_t uct_rc_txqp_init(uct_rc_txqp_t *txqp, uct_rc_iface_t *iface, - struct ibv_qp_cap *cap + uint32_t qp_num UCS_STATS_ARG(ucs_stats_node_t* stats_parent)) { - ucs_status_t status; - txqp->unsignaled = 0; txqp->unsignaled_store = 0; txqp->unsignaled_store_count = 0; txqp->available = 0; ucs_queue_head_init(&txqp->outstanding); - status = uct_rc_iface_qp_create(iface, &txqp->qp, cap, - iface->config.tx_qp_len); - if (status != UCS_OK) { - goto err; - } - - status = UCS_STATS_NODE_ALLOC(&txqp->stats, &uct_rc_txqp_stats_class, - stats_parent, "-0x%x", txqp->qp->qp_num); - if (status != UCS_OK) { - goto err_destroy_qp; - } - - return UCS_OK; - -err_destroy_qp: - ibv_destroy_qp(txqp->qp); -err: - return status; + return UCS_STATS_NODE_ALLOC(&txqp->stats, &uct_rc_txqp_stats_class, + stats_parent, "-0x%x", qp_num); } void uct_rc_txqp_cleanup(uct_rc_txqp_t *txqp) { - int ret; - - ret = ibv_destroy_qp(txqp->qp); - if (ret != 0) { - ucs_warn("ibv_destroy_qp() returned %d: %m", ret); - } - uct_rc_txqp_purge_outstanding(txqp, UCS_ERR_CANCELED, 1); UCS_STATS_NODE_FREE(txqp->stats); } @@ -111,23 +90,20 @@ void uct_rc_fc_cleanup(uct_rc_fc_t *fc) UCS_STATS_NODE_FREE(fc->stats); } -UCS_CLASS_INIT_FUNC(uct_rc_ep_t, uct_rc_iface_t *iface) +UCS_CLASS_INIT_FUNC(uct_rc_ep_t, uct_rc_iface_t *iface, uint32_t qp_num, + const uct_ep_params_t *params) { - struct ibv_qp_cap cap; ucs_status_t status; UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super.super); - status = uct_rc_txqp_init(&self->txqp, iface, &cap + status = uct_rc_txqp_init(&self->txqp, iface, qp_num UCS_STATS_ARG(self->super.stats)); if (status != UCS_OK) { - goto 
err; + return status; } - status = uct_rc_iface_qp_init(iface, self->txqp.qp); - if (status != UCS_OK) { - goto err_txqp_cleanup; - } + self->path_index = UCT_EP_PARAMS_GET_PATH_INDEX(params); status = uct_rc_fc_init(&self->fc, iface->config.fc_wnd_size UCS_STATS_ARG(self->super.stats)); @@ -135,33 +111,25 @@ UCS_CLASS_INIT_FUNC(uct_rc_ep_t, uct_rc_iface_t *iface) goto err_txqp_cleanup; } - self->sl = iface->super.config.sl; /* TODO multi-rail */ - self->path_bits = iface->super.path_bits[0]; /* TODO multi-rail */ - /* Check that FC protocol fits AM id * (just in case AM id space gets extended) */ UCS_STATIC_ASSERT(UCT_RC_EP_FC_MASK < UINT8_MAX); ucs_arbiter_group_init(&self->arb_group); - uct_rc_iface_add_qp(iface, self, self->txqp.qp->qp_num); ucs_list_add_head(&iface->ep_list, &self->list); return UCS_OK; err_txqp_cleanup: uct_rc_txqp_cleanup(&self->txqp); -err: return status; } static UCS_CLASS_CLEANUP_FUNC(uct_rc_ep_t) { - uct_rc_iface_t *iface = ucs_derived_of(self->super.super.iface, - uct_rc_iface_t); ucs_debug("destroy rc ep %p", self); ucs_list_del(&self->list); - uct_rc_iface_remove_qp(iface, self->txqp.qp->qp_num); uct_rc_ep_pending_purge(&self->super.super, NULL, NULL); uct_rc_fc_cleanup(&self->fc); uct_rc_txqp_cleanup(&self->txqp); @@ -169,42 +137,6 @@ static UCS_CLASS_CLEANUP_FUNC(uct_rc_ep_t) UCS_CLASS_DEFINE(uct_rc_ep_t, uct_base_ep_t) -ucs_status_t uct_rc_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr) -{ - uct_rc_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_ep_t); - uct_rc_ep_address_t *rc_addr = (uct_rc_ep_address_t*)addr; - uct_rc_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_iface_t); - - uct_ib_pack_uint24(rc_addr->qp_num, ep->txqp.qp->qp_num); - rc_addr->atomic_mr_id = uct_ib_iface_get_atomic_mr_id(&iface->super); - - return UCS_OK; -} - -ucs_status_t uct_rc_ep_connect_to_ep(uct_ep_h tl_ep, const uct_device_addr_t *dev_addr, - const uct_ep_addr_t *ep_addr) -{ - uct_rc_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_ep_t); - uct_rc_iface_t 
*iface = ucs_derived_of(ep->super.super.iface, uct_rc_iface_t); - const uct_ib_address_t *ib_addr = (const uct_ib_address_t *)dev_addr; - const uct_rc_ep_address_t *rc_addr = (const uct_rc_ep_address_t*)ep_addr; - uint32_t qp_num; - struct ibv_ah_attr ah_attr; - ucs_status_t status; - - uct_ib_iface_fill_ah_attr_from_addr(&iface->super, ib_addr, ep->path_bits, &ah_attr); - qp_num = uct_ib_unpack_uint24(rc_addr->qp_num); - - status = uct_rc_iface_qp_connect(iface, ep->txqp.qp, qp_num, &ah_attr); - if (status != UCS_OK) { - return status; - } - - ep->atomic_mr_offset = uct_ib_md_atomic_offset(rc_addr->atomic_mr_id); - - return UCS_OK; -} - void uct_rc_ep_packet_dump(uct_base_iface_t *iface, uct_am_trace_type_t type, void *data, size_t length, size_t valid_length, char *buffer, size_t max) @@ -228,6 +160,22 @@ void uct_rc_ep_packet_dump(uct_base_iface_t *iface, uct_am_trace_type_t type, } } +static UCS_F_ALWAYS_INLINE void +uct_rc_op_release_iface_resources(uct_rc_iface_send_op_t *op, int is_get_zcopy) +{ + uct_rc_iface_send_desc_t *desc; + uct_rc_iface_t *iface; + + if (is_get_zcopy) { + op->iface->tx.reads_available += op->length; + return; + } + + desc = ucs_derived_of(op, uct_rc_iface_send_desc_t); + iface = ucs_container_of(ucs_mpool_obj_owner(desc), uct_rc_iface_t, tx.mp); + iface->tx.reads_available += op->length; +} + void uct_rc_ep_get_bcopy_handler(uct_rc_iface_send_op_t *op, const void *resp) { uct_rc_iface_send_desc_t *desc = ucs_derived_of(op, uct_rc_iface_send_desc_t); @@ -236,8 +184,8 @@ void uct_rc_ep_get_bcopy_handler(uct_rc_iface_send_op_t *op, const void *resp) desc->unpack_cb(desc->super.unpack_arg, resp, desc->super.length); + uct_rc_op_release_iface_resources(op, 0); uct_invoke_completion(desc->super.user_comp, UCS_OK); - ucs_mpool_put(desc); } @@ -249,10 +197,17 @@ void uct_rc_ep_get_bcopy_handler_no_completion(uct_rc_iface_send_op_t *op, VALGRIND_MAKE_MEM_DEFINED(resp, desc->super.length); desc->unpack_cb(desc->super.unpack_arg, resp, 
desc->super.length); - + uct_rc_op_release_iface_resources(op, 0); ucs_mpool_put(desc); } +void uct_rc_ep_get_zcopy_completion_handler(uct_rc_iface_send_op_t *op, + const void *resp) +{ + uct_rc_op_release_iface_resources(op, 1); + uct_rc_ep_send_op_completion_handler(op, resp); +} + void uct_rc_ep_send_op_completion_handler(uct_rc_iface_send_op_t *op, const void *resp) { @@ -292,6 +247,7 @@ ucs_status_t uct_rc_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *n, } ucs_arbiter_cb_result_t uct_rc_ep_process_pending(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { @@ -300,8 +256,9 @@ ucs_arbiter_cb_result_t uct_rc_ep_process_pending(ucs_arbiter_t *arbiter, ucs_status_t status; uct_rc_ep_t *ep; + ucs_trace_data("progressing pending request %p", req); status = req->func(req); - ucs_trace_data("progress pending request %p returned: %s", req, + ucs_trace_data("status returned from progress pending: %s", ucs_status_string(status)); if (status == UCS_OK) { @@ -309,7 +266,7 @@ ucs_arbiter_cb_result_t uct_rc_ep_process_pending(ucs_arbiter_t *arbiter, } else if (status == UCS_INPROGRESS) { return UCS_ARBITER_CB_RESULT_NEXT_GROUP; } else { - ep = ucs_container_of(ucs_arbiter_elem_group(elem), uct_rc_ep_t, arb_group); + ep = ucs_container_of(group, uct_rc_ep_t, arb_group); iface = ucs_derived_of(ep->super.super.iface, uct_rc_iface_t); if (!uct_rc_iface_has_tx_resources(iface)) { /* No iface resources */ @@ -324,15 +281,17 @@ ucs_arbiter_cb_result_t uct_rc_ep_process_pending(ucs_arbiter_t *arbiter, } static ucs_arbiter_cb_result_t uct_rc_ep_abriter_purge_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { - uct_rc_fc_request_t *freq; uct_purge_cb_args_t *cb_args = arg; uct_pending_purge_callback_t cb = cb_args->cb; - uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, priv); - uct_rc_ep_t *ep = ucs_container_of(ucs_arbiter_elem_group(elem), - uct_rc_ep_t, arb_group); + 
uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, + priv); + uct_rc_ep_t UCS_V_UNUSED *ep = ucs_container_of(group, uct_rc_ep_t, + arb_group); + uct_rc_fc_request_t *freq; /* Invoke user's callback only if it is not internal FC message */ if (ucs_likely(req->func != uct_rc_ep_fc_grant)){ @@ -391,15 +350,26 @@ void uct_rc_txqp_purge_outstanding(uct_rc_txqp_t *txqp, ucs_status_t status, if (op->user_comp != NULL) { /* This must be uct_rc_ep_get_bcopy_handler, - * uct_rc_ep_send_completion_proxy_handler, + * uct_rc_ep_get_bcopy_handler_no_completion, + * uct_rc_ep_get_zcopy_completion_handler, + * uct_rc_ep_flush_op_completion_handler or * one of the atomic handlers, * so invoke user completion */ uct_invoke_completion(op->user_comp, status); } + + /* Need to release rdma_read resources taken by get operations */ + if ((op->handler == uct_rc_ep_get_bcopy_handler) || + (op->handler == uct_rc_ep_get_bcopy_handler_no_completion)) { + uct_rc_op_release_iface_resources(op, 0); + } else if (op->handler == uct_rc_ep_get_zcopy_completion_handler) { + uct_rc_op_release_iface_resources(op, 1); + } } op->flags &= ~(UCT_RC_IFACE_SEND_OP_FLAG_INUSE | UCT_RC_IFACE_SEND_OP_FLAG_ZCOPY); - if (op->handler == uct_rc_ep_send_op_completion_handler) { + if ((op->handler == uct_rc_ep_send_op_completion_handler) || + (op->handler == uct_rc_ep_get_zcopy_completion_handler)) { uct_rc_iface_put_send_op(op); } else if (op->handler == uct_rc_ep_flush_op_completion_handler) { ucs_mpool_put(op); @@ -413,15 +383,11 @@ void uct_rc_txqp_purge_outstanding(uct_rc_txqp_t *txqp, ucs_status_t status, ucs_status_t uct_rc_ep_flush(uct_rc_ep_t *ep, int16_t max_available, unsigned flags) { - uct_rc_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_rc_iface_t); - - if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) { - uct_rc_txqp_purge_outstanding(&ep->txqp, UCS_ERR_CANCELED, 0); - uct_ep_pending_purge(&ep->super.super, NULL, 0); - return UCS_OK; - } + uct_rc_iface_t *iface = 
ucs_derived_of(ep->super.super.iface, + uct_rc_iface_t); - if (!uct_rc_iface_has_tx_resources(iface) || !uct_rc_ep_has_tx_resources(ep)) { + if (!uct_rc_iface_has_tx_resources(iface) || + !uct_rc_ep_has_tx_resources(ep)) { return UCS_ERR_NO_RESOURCE; } @@ -433,6 +399,28 @@ ucs_status_t uct_rc_ep_flush(uct_rc_ep_t *ep, int16_t max_available, return UCS_INPROGRESS; } +ucs_status_t uct_rc_ep_check_cqe(uct_rc_iface_t *iface, uct_rc_ep_t *ep) +{ + uct_rc_txqp_t *txqp; + + if (!uct_rc_iface_have_tx_cqe_avail(iface)) { + UCS_STATS_UPDATE_COUNTER(iface->stats, UCT_RC_IFACE_STAT_NO_CQE, 1); + UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1); + return UCS_ERR_NO_RESOURCE; + } + + txqp = &ep->txqp; + /* if unsignaled == RC_UNSIGNALED_INF this value was already saved and \ + next operation will be defenitly signaled */ + if (txqp->unsignaled != RC_UNSIGNALED_INF) { + txqp->unsignaled_store_count++; + txqp->unsignaled_store += txqp->unsignaled; + txqp->unsignaled = RC_UNSIGNALED_INF; + } + + return UCS_OK; +} + #define UCT_RC_DEFINE_ATOMIC_HANDLER_FUNC(_num_bits, _is_be) \ void UCT_RC_DEFINE_ATOMIC_HANDLER_FUNC_NAME(_num_bits, _is_be) \ (uct_rc_iface_send_op_t *op, const void *resp) \ diff --git a/src/uct/ib/rc/base/rc_ep.h b/src/uct/ib/rc/base/rc_ep.h index a3e56bd2fb2..74846f1e9e8 100644 --- a/src/uct/ib/rc/base/rc_ep.h +++ b/src/uct/ib/rc/base/rc_ep.h @@ -76,36 +76,47 @@ enum { /* * Check for send resources */ -#define UCT_RC_CHECK_CQE_RET(_iface, _ep, _txqp, _ret) \ +#define UCT_RC_CHECK_CQE_RET(_iface, _ep, _ret) \ /* tx_moderation == 0 for TLs which don't support it */ \ if (ucs_unlikely((_iface)->tx.cq_available <= \ (signed)(_iface)->config.tx_moderation)) { \ - if (!uct_rc_iface_have_tx_cqe_avail(_iface)) { \ - UCS_STATS_UPDATE_COUNTER((_iface)->stats, UCT_RC_IFACE_STAT_NO_CQE, 1); \ - UCS_STATS_UPDATE_COUNTER((_ep)->super.stats, UCT_EP_STAT_NO_RES, 1); \ + if (uct_rc_ep_check_cqe(_iface, _ep) != UCS_OK) { \ return _ret; \ } \ - /* if unsignaled == 
RC_UNSIGNALED_INF this value was already saved and \ - next operation will be defenitly signaled */ \ - if ((_txqp)->unsignaled != RC_UNSIGNALED_INF) { \ - (_txqp)->unsignaled_store_count++; \ - (_txqp)->unsignaled_store += (_txqp)->unsignaled; \ - (_txqp)->unsignaled = RC_UNSIGNALED_INF; \ - } \ } -#define UCT_RC_CHECK_TXQP_RET(_iface, _ep, _txqp, _ret) \ - if (uct_rc_txqp_available(_txqp) <= 0) { \ - UCS_STATS_UPDATE_COUNTER((_txqp)->stats, UCT_RC_TXQP_STAT_QP_FULL, 1); \ +#define UCT_RC_CHECK_TXQP_RET(_iface, _ep, _ret) \ + if (uct_rc_txqp_available(&(_ep)->txqp) <= 0) { \ + UCS_STATS_UPDATE_COUNTER((_ep)->txqp.stats, UCT_RC_TXQP_STAT_QP_FULL, 1); \ UCS_STATS_UPDATE_COUNTER((_ep)->super.stats, UCT_EP_STAT_NO_RES, 1); \ return _ret; \ } -#define UCT_RC_CHECK_CQE(_iface, _ep, _txqp) \ - UCT_RC_CHECK_CQE_RET(_iface, _ep, _txqp, UCS_ERR_NO_RESOURCE) +#define UCT_RC_CHECK_NUM_RDMA_READ(_iface) \ + if (ucs_unlikely((_iface)->tx.reads_available <= 0)) { \ + UCS_STATS_UPDATE_COUNTER((_iface)->stats, \ + UCT_RC_IFACE_STAT_NO_READS, 1); \ + return UCS_ERR_NO_RESOURCE; \ + } + +#define UCT_RC_RDMA_READ_POSTED(_iface, _length) \ + { \ + ucs_assert((_iface)->tx.reads_available > 0); \ + (_iface)->tx.reads_available -= (_length); \ + } + +#define UCT_RC_CHECK_RES(_iface, _ep) \ + UCT_RC_CHECK_CQE_RET(_iface, _ep, UCS_ERR_NO_RESOURCE) \ + UCT_RC_CHECK_TXQP_RET(_iface, _ep, UCS_ERR_NO_RESOURCE) -#define UCT_RC_CHECK_TXQP(_iface, _ep, _txqp) \ - UCT_RC_CHECK_TXQP_RET(_iface, _ep, _txqp, UCS_ERR_NO_RESOURCE) \ +/** + * All RMA and AMO operations are not allowed if no RDMA_READ credits. + * Otherwise operations ordering can be broken (which fence operation + * relies on). 
+ */ +#define UCT_RC_CHECK_RMA_RES(_iface, _ep) \ + UCT_RC_CHECK_RES(_iface, _ep) \ + UCT_RC_CHECK_NUM_RDMA_READ(_iface) /* * check for FC credits and add FC protocol bits (if any) @@ -159,14 +170,9 @@ enum { UCT_RC_UPDATE_FC_WND(_iface, &(_ep)->fc) \ } -#define UCT_RC_CHECK_RES(_iface, _ep) \ - UCT_RC_CHECK_CQE(_iface, (_ep), &(_ep)->txqp) \ - UCT_RC_CHECK_TXQP(_iface, (_ep), &(_ep)->txqp); - /* this is a common type for all rc and dc transports */ struct uct_rc_txqp { - struct ibv_qp *qp; ucs_queue_head_t outstanding; /* RC_UNSIGNALED_INF value forces signaled in moderation logic when * CQ credits are close to zero (less tx_moderation value) */ @@ -179,7 +185,7 @@ struct uct_rc_txqp { * exact value on each signaled completion */ uint16_t unsignaled_store_count; int16_t available; - UCS_STATS_NODE_DECLARE(stats); + UCS_STATS_NODE_DECLARE(stats) }; typedef struct uct_rc_fc { @@ -187,32 +193,21 @@ typedef struct uct_rc_fc { int16_t fc_wnd; /* used only for FC protocol at this point (3 higher bits) */ uint8_t flags; - UCS_STATS_NODE_DECLARE(stats); + UCS_STATS_NODE_DECLARE(stats) } uct_rc_fc_t; struct uct_rc_ep { uct_base_ep_t super; uct_rc_txqp_t txqp; - uint16_t atomic_mr_offset; - uint8_t sl; - uint8_t path_bits; ucs_list_link_t list; ucs_arbiter_group_t arb_group; uct_rc_fc_t fc; + uint16_t atomic_mr_offset; + uint8_t path_index; }; -UCS_CLASS_DECLARE(uct_rc_ep_t, uct_rc_iface_t*); - +UCS_CLASS_DECLARE(uct_rc_ep_t, uct_rc_iface_t*, uint32_t, const uct_ep_params_t*); -typedef struct uct_rc_ep_address { - uct_ib_uint24_t qp_num; - uint8_t atomic_mr_id; -} UCS_S_PACKED uct_rc_ep_address_t; - -ucs_status_t uct_rc_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr); - -ucs_status_t uct_rc_ep_connect_to_ep(uct_ep_h tl_ep, const uct_device_addr_t *dev_addr, - const uct_ep_addr_t *ep_addr); void uct_rc_ep_packet_dump(uct_base_iface_t *iface, uct_am_trace_type_t type, void *data, size_t length, size_t valid_length, @@ -223,6 +218,9 @@ void 
uct_rc_ep_get_bcopy_handler(uct_rc_iface_send_op_t *op, const void *resp); void uct_rc_ep_get_bcopy_handler_no_completion(uct_rc_iface_send_op_t *op, const void *resp); +void uct_rc_ep_get_zcopy_completion_handler(uct_rc_iface_send_op_t *op, + const void *resp); + void uct_rc_ep_send_op_completion_handler(uct_rc_iface_send_op_t *op, const void *resp); @@ -236,6 +234,7 @@ void uct_rc_ep_pending_purge(uct_ep_h ep, uct_pending_purge_callback_t cb, void*arg); ucs_arbiter_cb_result_t uct_rc_ep_process_pending(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg); @@ -251,6 +250,8 @@ void uct_rc_txqp_purge_outstanding(uct_rc_txqp_t *txqp, ucs_status_t status, ucs_status_t uct_rc_ep_flush(uct_rc_ep_t *ep, int16_t max_available, unsigned flags); +ucs_status_t uct_rc_ep_check_cqe(uct_rc_iface_t *iface, uct_rc_ep_t *ep); + void UCT_RC_DEFINE_ATOMIC_HANDLER_FUNC_NAME(32, 0)(uct_rc_iface_send_op_t *op, const void *resp); void UCT_RC_DEFINE_ATOMIC_HANDLER_FUNC_NAME(32, 1)(uct_rc_iface_send_op_t *op, @@ -261,7 +262,7 @@ void UCT_RC_DEFINE_ATOMIC_HANDLER_FUNC_NAME(64, 1)(uct_rc_iface_send_op_t *op, const void *resp); ucs_status_t uct_rc_txqp_init(uct_rc_txqp_t *txqp, uct_rc_iface_t *iface, - struct ibv_qp_cap *cap + uint32_t qp_num UCS_STATS_ARG(ucs_stats_node_t* stats_parent)); void uct_rc_txqp_cleanup(uct_rc_txqp_t *txqp); @@ -285,12 +286,6 @@ static inline uint16_t uct_rc_txqp_unsignaled(uct_rc_txqp_t *txqp) return txqp->unsignaled; } -static UCS_F_ALWAYS_INLINE void uct_rc_txqp_check(uct_rc_txqp_t *txqp) -{ - ucs_assertv(txqp->qp->state == IBV_QPS_RTS, "QP 0x%x state is %d", - txqp->qp->qp_num, txqp->qp->state); -} - static UCS_F_ALWAYS_INLINE int uct_rc_fc_has_resources(uct_rc_iface_t *iface, uct_rc_fc_t *fc) { @@ -331,7 +326,8 @@ uct_rc_txqp_add_send_op_sn(uct_rc_txqp_t *txqp, uct_rc_iface_send_op_t *op, uint static UCS_F_ALWAYS_INLINE void uct_rc_txqp_add_send_comp(uct_rc_iface_t *iface, uct_rc_txqp_t *txqp, - uct_completion_t *comp, 
uint16_t sn, uint16_t flags) + uct_rc_send_handler_t handler, uct_completion_t *comp, + uint16_t sn, uint16_t flags, size_t length) { uct_rc_iface_send_op_t *op; @@ -340,8 +336,10 @@ uct_rc_txqp_add_send_comp(uct_rc_iface_t *iface, uct_rc_txqp_t *txqp, } op = uct_rc_iface_get_send_op(iface); + op->handler = handler; op->user_comp = comp; op->flags |= flags; + op->length = length; uct_rc_txqp_add_send_op_sn(txqp, op, sn); } @@ -445,4 +443,43 @@ uct_rc_fc_req_moderation(uct_rc_fc_t *fc, uct_rc_iface_t *iface) UCT_RC_EP_FC_FLAG_SOFT_REQ : 0; } +static UCS_F_ALWAYS_INLINE int +uct_rc_ep_fm(uct_rc_iface_t *iface, uct_ib_fence_info_t* fi, int flag) +{ + int fence; + + /* a call to iface_fence increases beat, so if endpoint beat is not in + * sync with iface beat it means the endpoint did not post any WQE with + * fence flag yet */ + fence = (fi->fence_beat != iface->tx.fi.fence_beat) ? flag : 0; + fi->fence_beat = iface->tx.fi.fence_beat; + return fence; +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +uct_rc_ep_fence(uct_ep_h tl_ep, uct_ib_fence_info_t* fi, int fence) +{ + uct_rc_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_iface_t); + + /* in case if fence is requested and enabled by configuration + * we need to schedule fence for next RDMA operation */ + if (fence && (iface->config.fence_mode != UCT_RC_FENCE_MODE_NONE)) { + fi->fence_beat = iface->tx.fi.fence_beat - 1; + } + + UCT_TL_EP_STAT_FENCE(ucs_derived_of(tl_ep, uct_base_ep_t)); + return UCS_OK; +} + +static UCS_F_ALWAYS_INLINE void +uct_rc_ep_fence_put(uct_rc_iface_t *iface, uct_ib_fence_info_t *fi, + uct_rkey_t *rkey, uint64_t *addr, uint16_t offset) +{ + if (uct_rc_ep_fm(iface, fi, 1)) { + *rkey = uct_ib_resolve_atomic_rkey(*rkey, offset, addr); + } else { + *rkey = uct_ib_md_direct_rkey(*rkey); + } +} + #endif diff --git a/src/uct/ib/rc/base/rc_iface.c b/src/uct/ib/rc/base/rc_iface.c index dbd530d2d0f..22a51fe6388 100644 --- a/src/uct/ib/rc/base/rc_iface.c +++ b/src/uct/ib/rc/base/rc_iface.c @@ -4,6 
+4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rc_iface.h" #include "rc_ep.h" @@ -13,81 +17,115 @@ #include -ucs_config_field_t uct_rc_iface_config_table[] = { - {"IB_", "RX_INLINE=64;RX_QUEUE_LEN=4095", NULL, - ucs_offsetof(uct_rc_iface_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_ib_iface_config_table)}, +static const char *uct_rc_fence_mode_values[] = { + [UCT_RC_FENCE_MODE_NONE] = "none", + [UCT_RC_FENCE_MODE_WEAK] = "weak", + [UCT_RC_FENCE_MODE_AUTO] = "auto", + [UCT_RC_FENCE_MODE_LAST] = NULL +}; - {"PATH_MTU", "default", - "Path MTU. \"default\" will select the best MTU for the device.", - ucs_offsetof(uct_rc_iface_config_t, path_mtu), UCS_CONFIG_TYPE_ENUM(uct_ib_mtu_values)}, +ucs_config_field_t uct_rc_iface_common_config_table[] = { + {UCT_IB_CONFIG_PREFIX, "RX_INLINE=64;TX_INLINE_RESP=64;RX_QUEUE_LEN=4095;SEG_SIZE=8256", NULL, + ucs_offsetof(uct_rc_iface_common_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_ib_iface_config_table)}, {"MAX_RD_ATOMIC", "4", "Maximal number of outstanding read or atomic replies", - ucs_offsetof(uct_rc_iface_config_t, max_rd_atomic), UCS_CONFIG_TYPE_UINT}, + ucs_offsetof(uct_rc_iface_common_config_t, max_rd_atomic), UCS_CONFIG_TYPE_UINT}, {"TIMEOUT", "1.0s", "Transport timeout", - ucs_offsetof(uct_rc_iface_config_t, tx.timeout), UCS_CONFIG_TYPE_TIME}, + ucs_offsetof(uct_rc_iface_common_config_t, tx.timeout), UCS_CONFIG_TYPE_TIME}, {"RETRY_COUNT", "7", "Transport retries", - ucs_offsetof(uct_rc_iface_config_t, tx.retry_count), UCS_CONFIG_TYPE_UINT}, + ucs_offsetof(uct_rc_iface_common_config_t, tx.retry_count), UCS_CONFIG_TYPE_UINT}, - {"RNR_TIMEOUT", "30ms", + {"RNR_TIMEOUT", "1ms", "RNR timeout", - ucs_offsetof(uct_rc_iface_config_t,tx. 
rnr_timeout), UCS_CONFIG_TYPE_TIME}, + ucs_offsetof(uct_rc_iface_common_config_t, tx.rnr_timeout), UCS_CONFIG_TYPE_TIME}, {"RNR_RETRY_COUNT", "7", "RNR retries", - ucs_offsetof(uct_rc_iface_config_t, tx.rnr_retry_count), UCS_CONFIG_TYPE_UINT}, - - {"TX_CQ_LEN", "4096", - "Length of send completion queue. This limits the total number of outstanding signaled sends.", - ucs_offsetof(uct_rc_iface_config_t, tx.cq_len), UCS_CONFIG_TYPE_UINT}, + ucs_offsetof(uct_rc_iface_common_config_t, tx.rnr_retry_count), UCS_CONFIG_TYPE_UINT}, {"FC_ENABLE", "y", "Enable flow control protocol to prevent sender from overwhelming the receiver,\n" "thus avoiding RC RnR backoff timer.", - ucs_offsetof(uct_rc_iface_config_t, fc.enable), UCS_CONFIG_TYPE_BOOL}, + ucs_offsetof(uct_rc_iface_common_config_t, fc.enable), UCS_CONFIG_TYPE_BOOL}, {"FC_WND_SIZE", "512", "The size of flow control window per endpoint. limits the number of AM\n" "which can be sent w/o acknowledgment.", - ucs_offsetof(uct_rc_iface_config_t, fc.wnd_size), UCS_CONFIG_TYPE_UINT}, + ucs_offsetof(uct_rc_iface_common_config_t, fc.wnd_size), UCS_CONFIG_TYPE_UINT}, {"FC_HARD_THRESH", "0.25", "Threshold for sending hard request for FC credits to the peer. This value\n" "refers to the percentage of the FC_WND_SIZE value. 
(must be > 0 and < 1)", - ucs_offsetof(uct_rc_iface_config_t, fc.hard_thresh), UCS_CONFIG_TYPE_DOUBLE}, + ucs_offsetof(uct_rc_iface_common_config_t, fc.hard_thresh), UCS_CONFIG_TYPE_DOUBLE}, #if HAVE_DECL_IBV_EXP_QP_OOO_RW_DATA_PLACEMENT {"OOO_RW", "n", "Enable out-of-order RDMA data placement", - ucs_offsetof(uct_rc_iface_config_t, ooo_rw), UCS_CONFIG_TYPE_BOOL}, + ucs_offsetof(uct_rc_iface_common_config_t, ooo_rw), UCS_CONFIG_TYPE_BOOL}, #endif + {"FENCE", "auto", + "IB fence type when API fence requested:\n" + " none - fence is a no-op\n" + " weak - fence makes sure remote reads are ordered with respect to remote writes\n" + " auto - select fence mode based on hardware capabilities", + ucs_offsetof(uct_rc_iface_common_config_t, fence_mode), + UCS_CONFIG_TYPE_ENUM(uct_rc_fence_mode_values)}, + + {"TX_NUM_GET_OPS", "", + "The configuration parameter replaced by UCX_RC_TX_NUM_GET_BYTES.", + UCS_CONFIG_DEPRECATED_FIELD_OFFSET, UCS_CONFIG_TYPE_DEPRECATED}, + + {"MAX_GET_ZCOPY", "auto", + "Maximal size of get operation with zcopy protocol.", + ucs_offsetof(uct_rc_iface_common_config_t, tx.max_get_zcopy), UCS_CONFIG_TYPE_MEMUNITS}, + + {"TX_NUM_GET_BYTES", "inf", + "Maximal number of bytes simultaneously transferred by get/RDMA_READ operations.", + ucs_offsetof(uct_rc_iface_common_config_t, tx.max_get_bytes), UCS_CONFIG_TYPE_MEMUNITS}, + {NULL} }; -ucs_config_field_t uct_rc_fc_config_table[] = { +/* Config relevant for rc_mlx5 and rc_verbs only (not for dc) */ +ucs_config_field_t uct_rc_iface_config_table[] = { + {"RC_", "MAX_NUM_EPS=256", NULL, + ucs_offsetof(uct_rc_iface_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_rc_iface_common_config_table)}, + {"FC_SOFT_THRESH", "0.5", "Threshold for sending soft request for FC credits to the peer. This value\n" "refers to the percentage of the FC_WND_SIZE value. 
(must be > HARD_THRESH and < 1)", - ucs_offsetof(uct_rc_fc_config_t, soft_thresh), UCS_CONFIG_TYPE_DOUBLE}, + ucs_offsetof(uct_rc_iface_config_t, soft_thresh), UCS_CONFIG_TYPE_DOUBLE}, + + {"TX_CQ_MODERATION", "64", + "Maximum number of send WQEs which can be posted without requesting a completion.", + ucs_offsetof(uct_rc_iface_config_t, tx_cq_moderation), UCS_CONFIG_TYPE_UINT}, + + {"TX_CQ_LEN", "4096", + "Length of send completion queue. This limits the total number of outstanding signaled sends.", + ucs_offsetof(uct_rc_iface_config_t, tx_cq_len), UCS_CONFIG_TYPE_UINT}, {NULL} }; -#if ENABLE_STATS +#ifdef ENABLE_STATS static ucs_stats_class_t uct_rc_iface_stats_class = { .name = "rc_iface", .num_counters = UCT_RC_IFACE_STAT_LAST, .counter_names = { [UCT_RC_IFACE_STAT_RX_COMPLETION] = "rx_completion", [UCT_RC_IFACE_STAT_TX_COMPLETION] = "tx_completion", - [UCT_RC_IFACE_STAT_NO_CQE] = "no_cqe" + [UCT_RC_IFACE_STAT_NO_CQE] = "no_cqe", + [UCT_RC_IFACE_STAT_NO_READS] = "no_reads" } }; @@ -123,7 +161,7 @@ ucs_status_t uct_rc_iface_query(uct_rc_iface_t *iface, uct_iface_attr_t *iface_attr, size_t put_max_short, size_t max_inline, size_t am_max_hdr, size_t am_max_iov, - size_t tag_max_iov, size_t tag_min_hdr) + size_t am_min_hdr, size_t rma_max_iov) { uct_ib_device_t *dev = uct_ib_iface_device(&iface->super); ucs_status_t status; @@ -136,7 +174,6 @@ ucs_status_t uct_rc_iface_query(uct_rc_iface_t *iface, } iface_attr->iface_addr_len = 0; - iface_attr->ep_addr_len = sizeof(uct_rc_ep_address_t); iface_attr->max_conn_priv = 0; iface_attr->cap.flags = UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_AM_ZCOPY | @@ -146,51 +183,37 @@ ucs_status_t uct_rc_iface_query(uct_rc_iface_t *iface, UCT_IFACE_FLAG_GET_ZCOPY | UCT_IFACE_FLAG_PENDING | UCT_IFACE_FLAG_CONNECT_TO_EP | - UCT_IFACE_FLAG_CB_SYNC | - UCT_IFACE_FLAG_EVENT_SEND_COMP | - UCT_IFACE_FLAG_EVENT_RECV; - - if (dev->atomic_arg_sizes & sizeof(uint64_t)) { - /* TODO: remove deprecated flags */ - iface_attr->cap.flags |= 
UCT_IFACE_FLAG_ATOMIC_DEVICE; - - iface_attr->cap.atomic64.op_flags |= UCS_BIT(UCT_ATOMIC_OP_ADD); - iface_attr->cap.atomic64.fop_flags |= UCS_BIT(UCT_ATOMIC_OP_ADD) | - UCS_BIT(UCT_ATOMIC_OP_CSWAP); - } - -#if HAVE_IB_EXT_ATOMICS - if (dev->ext_atomic_arg_sizes & sizeof(uint64_t)) { - /* TODO: remove deprecated flags */ - iface_attr->cap.flags |= UCT_IFACE_FLAG_ATOMIC_DEVICE; - - iface_attr->cap.atomic64.fop_flags |= UCS_BIT(UCT_ATOMIC_OP_SWAP); - } - - if (dev->ext_atomic_arg_sizes & sizeof(uint32_t)) { - /* TODO: remove deprecated flags */ - iface_attr->cap.flags |= UCT_IFACE_FLAG_ATOMIC_DEVICE; - - iface_attr->cap.atomic32.op_flags |= UCS_BIT(UCT_ATOMIC_OP_ADD); - iface_attr->cap.atomic32.fop_flags |= UCS_BIT(UCT_ATOMIC_OP_ADD) | - UCS_BIT(UCT_ATOMIC_OP_SWAP) | - UCS_BIT(UCT_ATOMIC_OP_CSWAP); - } -#endif + UCT_IFACE_FLAG_CB_SYNC; + iface_attr->cap.event_flags = UCT_IFACE_FLAG_EVENT_SEND_COMP | + UCT_IFACE_FLAG_EVENT_RECV | + UCT_IFACE_FLAG_EVENT_FD; + + if (uct_ib_device_has_pci_atomics(dev)) { + if (dev->pci_fadd_arg_sizes & sizeof(uint64_t)) { + iface_attr->cap.atomic64.op_flags |= UCS_BIT(UCT_ATOMIC_OP_ADD); + iface_attr->cap.atomic64.fop_flags |= UCS_BIT(UCT_ATOMIC_OP_ADD); + } + if (dev->pci_cswap_arg_sizes & sizeof(uint64_t)) { + iface_attr->cap.atomic64.fop_flags |= UCS_BIT(UCT_ATOMIC_OP_CSWAP); + } + iface_attr->cap.flags |= UCT_IFACE_FLAG_ATOMIC_CPU; + } else { + if (dev->atomic_arg_sizes & sizeof(uint64_t)) { + /* TODO: remove deprecated flags */ + iface_attr->cap.flags |= UCT_IFACE_FLAG_ATOMIC_DEVICE; - if (dev->pci_fadd_arg_sizes || dev->pci_cswap_arg_sizes) { - iface_attr->cap.atomic32.op_flags = 0; - iface_attr->cap.atomic32.fop_flags = 0; - iface_attr->cap.atomic64.op_flags = 0; - iface_attr->cap.atomic64.fop_flags = 0; + iface_attr->cap.atomic64.op_flags |= UCS_BIT(UCT_ATOMIC_OP_ADD); + iface_attr->cap.atomic64.fop_flags |= UCS_BIT(UCT_ATOMIC_OP_ADD) | + UCS_BIT(UCT_ATOMIC_OP_CSWAP); + } } iface_attr->cap.put.opt_zcopy_align = 
UCS_SYS_PCI_MAX_PAYLOAD; iface_attr->cap.get.opt_zcopy_align = UCS_SYS_PCI_MAX_PAYLOAD; iface_attr->cap.am.opt_zcopy_align = UCS_SYS_PCI_MAX_PAYLOAD; - iface_attr->cap.put.align_mtu = uct_ib_mtu_value(iface->config.path_mtu); - iface_attr->cap.get.align_mtu = uct_ib_mtu_value(iface->config.path_mtu); - iface_attr->cap.am.align_mtu = uct_ib_mtu_value(iface->config.path_mtu); + iface_attr->cap.put.align_mtu = uct_ib_mtu_value(iface->super.config.path_mtu); + iface_attr->cap.get.align_mtu = uct_ib_mtu_value(iface->super.config.path_mtu); + iface_attr->cap.am.align_mtu = uct_ib_mtu_value(iface->super.config.path_mtu); /* PUT */ @@ -198,20 +221,20 @@ ucs_status_t uct_rc_iface_query(uct_rc_iface_t *iface, iface_attr->cap.put.max_bcopy = iface->super.config.seg_size; iface_attr->cap.put.min_zcopy = 0; iface_attr->cap.put.max_zcopy = uct_ib_iface_port_attr(&iface->super)->max_msg_sz; - iface_attr->cap.put.max_iov = uct_ib_iface_get_max_iov(&iface->super); + iface_attr->cap.put.max_iov = rma_max_iov; /* GET */ iface_attr->cap.get.max_bcopy = iface->super.config.seg_size; - iface_attr->cap.get.min_zcopy = iface->super.config.max_inl_resp + 1; - iface_attr->cap.get.max_zcopy = uct_ib_iface_port_attr(&iface->super)->max_msg_sz; - iface_attr->cap.get.max_iov = uct_ib_iface_get_max_iov(&iface->super); + iface_attr->cap.get.min_zcopy = iface->super.config.max_inl_cqe[UCT_IB_DIR_TX] + 1; + iface_attr->cap.get.max_zcopy = iface->config.max_get_zcopy; + iface_attr->cap.get.max_iov = rma_max_iov; /* AM */ - iface_attr->cap.am.max_short = uct_ib_iface_hdr_size(max_inline, tag_min_hdr); - iface_attr->cap.am.max_bcopy = iface->super.config.seg_size - tag_min_hdr; + iface_attr->cap.am.max_short = uct_ib_iface_hdr_size(max_inline, am_min_hdr); + iface_attr->cap.am.max_bcopy = iface->super.config.seg_size - am_min_hdr; iface_attr->cap.am.min_zcopy = 0; - iface_attr->cap.am.max_zcopy = iface->super.config.seg_size - tag_min_hdr; - iface_attr->cap.am.max_hdr = am_max_hdr - tag_min_hdr; + 
iface_attr->cap.am.max_zcopy = iface->super.config.seg_size - am_min_hdr; + iface_attr->cap.am.max_hdr = am_max_hdr - am_min_hdr; iface_attr->cap.am.max_iov = am_max_iov; /* Error Handling */ @@ -228,26 +251,6 @@ ucs_status_t uct_rc_iface_query(uct_rc_iface_t *iface, return UCS_OK; } -ucs_status_t uct_rc_iface_get_address(uct_iface_h tl_iface, - uct_iface_addr_t *addr) -{ - *(uint8_t*)addr = UCT_RC_IFACE_ADDR_TYPE_BASIC; - return UCS_OK; -} - -int uct_rc_iface_is_reachable(const uct_iface_h tl_iface, - const uct_device_addr_t *dev_addr, - const uct_iface_addr_t *iface_addr) -{ - uint8_t my_type = UCT_RC_IFACE_ADDR_TYPE_BASIC; - - if ((iface_addr != NULL) && (my_type != *(uint8_t*)iface_addr)) { - return 0; - } - - return uct_ib_iface_is_reachable(tl_iface, dev_addr, iface_addr); -} - void uct_rc_iface_add_qp(uct_rc_iface_t *iface, uct_rc_ep_t *ep, unsigned qp_num) { @@ -286,6 +289,11 @@ ucs_status_t uct_rc_iface_flush(uct_iface_h tl_iface, unsigned flags, return UCS_ERR_UNSUPPORTED; } + status = uct_rc_iface_fence_relaxed_order(tl_iface); + if (status != UCS_OK) { + return status; + } + count = 0; ucs_list_for_each(ep, &iface->ep_list, list) { status = uct_ep_flush(&ep->super.super, 0, NULL); @@ -308,48 +316,26 @@ ucs_status_t uct_rc_iface_flush(uct_iface_h tl_iface, unsigned flags, void uct_rc_iface_send_desc_init(uct_iface_h tl_iface, void *obj, uct_mem_h memh) { uct_rc_iface_send_desc_t *desc = obj; - uct_ib_mem_t *ib_memh = memh; - desc->lkey = ib_memh->lkey; + desc->lkey = uct_ib_memh_get_lkey(memh); desc->super.flags = 0; } -static void uct_rc_iface_set_path_mtu(uct_rc_iface_t *iface, - const uct_rc_iface_config_t *config) -{ - enum ibv_mtu port_mtu = uct_ib_iface_port_attr(&iface->super)->active_mtu; - uct_ib_device_t *dev = uct_ib_iface_device(&iface->super); - - /* MTU is set by user configuration */ - if (config->path_mtu != UCT_IB_MTU_DEFAULT) { - iface->config.path_mtu = config->path_mtu + (IBV_MTU_512 - UCT_IB_MTU_512); - } else if ((port_mtu > 
IBV_MTU_2048) && (IBV_DEV_ATTR(dev, vendor_id) == 0x02c9) && - ((IBV_DEV_ATTR(dev, vendor_part_id) == 4099) || (IBV_DEV_ATTR(dev, vendor_part_id) == 4100) || - (IBV_DEV_ATTR(dev, vendor_part_id) == 4103) || (IBV_DEV_ATTR(dev, vendor_part_id) == 4104))) - { - /* On some devices optimal path_mtu is 2048 */ - iface->config.path_mtu = IBV_MTU_2048; - } else { - iface->config.path_mtu = port_mtu; - } -} - -ucs_status_t uct_rc_init_fc_thresh(uct_rc_fc_config_t *fc_cfg, - uct_rc_iface_config_t *rc_cfg, +ucs_status_t uct_rc_init_fc_thresh(uct_rc_iface_config_t *config, uct_rc_iface_t *iface) { /* Check FC parameters correctness */ - if ((fc_cfg->soft_thresh <= rc_cfg->fc.hard_thresh) || - (fc_cfg->soft_thresh >= 1)) { + if ((config->soft_thresh <= config->super.fc.hard_thresh) || + (config->soft_thresh >= 1)) { ucs_error("The factor for soft FC threshold should be bigger" " than FC_HARD_THRESH value and less than 1 (s=%f, h=%f)", - fc_cfg->soft_thresh, rc_cfg->fc.hard_thresh); + config->soft_thresh, config->super.fc.hard_thresh); return UCS_ERR_INVALID_PARAM; } - if (rc_cfg->fc.enable) { + if (config->super.fc.enable) { iface->config.fc_soft_thresh = ucs_max((int)(iface->config.fc_wnd_size * - fc_cfg->soft_thresh), 1); + config->soft_thresh), 1); } else { iface->config.fc_soft_thresh = 0; } @@ -490,61 +476,95 @@ unsigned uct_rc_iface_do_progress(uct_iface_h tl_iface) } ucs_status_t uct_rc_iface_init_rx(uct_rc_iface_t *iface, - const uct_rc_iface_config_t *config) + const uct_rc_iface_common_config_t *config, + struct ibv_srq **srq_p) { struct ibv_srq_init_attr srq_init_attr; struct ibv_pd *pd = uct_ib_iface_md(&iface->super)->pd; + struct ibv_srq *srq; srq_init_attr.attr.max_sge = 1; srq_init_attr.attr.max_wr = config->super.rx.queue_len; srq_init_attr.attr.srq_limit = 0; srq_init_attr.srq_context = iface; - iface->rx.srq.srq = ibv_create_srq(pd, &srq_init_attr); - if (iface->rx.srq.srq == NULL) { + srq = ibv_create_srq(pd, &srq_init_attr); + if (srq == NULL) { 
ucs_error("ibv_create_srq() failed: %m"); return UCS_ERR_IO_ERROR; } iface->rx.srq.quota = srq_init_attr.attr.max_wr; + *srq_p = srq; return UCS_OK; } +static int uct_rc_iface_config_limit_value(const char *name, + int provided, int limit) +{ + if (provided > limit) { + ucs_warn("using maximal value for %s (%d) instead of %d", + name, limit, provided); + return limit; + } else { + return provided; + } +} + UCS_CLASS_INIT_FUNC(uct_rc_iface_t, uct_rc_iface_ops_t *ops, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, - const uct_rc_iface_config_t *config, + const uct_rc_iface_common_config_t *config, uct_ib_iface_init_attr_t *init_attr) { uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev; + uint32_t max_ib_msg_size; ucs_status_t status; - init_attr->tx_cq_len = config->tx.cq_len; - UCS_CLASS_CALL_SUPER_INIT(uct_ib_iface_t, &ops->super, md, worker, params, &config->super, init_attr); - self->tx.cq_available = init_attr->tx_cq_len - 1; + self->tx.cq_available = init_attr->cq_len[UCT_IB_DIR_TX] - 1; self->rx.srq.available = 0; self->rx.srq.quota = 0; self->config.tx_qp_len = config->super.tx.queue_len; self->config.tx_min_sge = config->super.tx.min_sge; self->config.tx_min_inline = config->super.tx.min_inline; - self->config.tx_moderation = ucs_min(config->super.tx.cq_moderation, - config->super.tx.queue_len / 4); - self->config.tx_ops_count = init_attr->tx_cq_len; - self->config.rx_inline = config->super.rx.inl; - self->config.min_rnr_timer = uct_ib_to_fabric_time(config->tx.rnr_timeout); - self->config.timeout = uct_ib_to_fabric_time(config->tx.timeout); - self->config.rnr_retry = ucs_min(config->tx.rnr_retry_count, - UCT_RC_QP_MAX_RETRY_COUNT); - self->config.retry_cnt = ucs_min(config->tx.retry_count, - UCT_RC_QP_MAX_RETRY_COUNT); + self->config.tx_ops_count = init_attr->cq_len[UCT_IB_DIR_TX]; + self->config.min_rnr_timer = uct_ib_to_rnr_fabric_time(config->tx.rnr_timeout); + self->config.timeout = 
uct_ib_to_qp_fabric_time(config->tx.timeout); + self->config.rnr_retry = uct_rc_iface_config_limit_value( + "RNR_RETRY_COUNT", + config->tx.rnr_retry_count, + UCT_RC_QP_MAX_RETRY_COUNT); + self->config.retry_cnt = uct_rc_iface_config_limit_value( + "RETRY_COUNT", + config->tx.retry_count, + UCT_RC_QP_MAX_RETRY_COUNT); self->config.max_rd_atomic = config->max_rd_atomic; self->config.ooo_rw = config->ooo_rw; -#if ENABLE_ASSERT - self->config.tx_cq_len = init_attr->tx_cq_len; +#if UCS_ENABLE_ASSERT + self->config.tx_cq_len = init_attr->cq_len[UCT_IB_DIR_TX]; #endif + max_ib_msg_size = uct_ib_iface_port_attr(&self->super)->max_msg_sz; + + if (config->tx.max_get_zcopy == UCS_MEMUNITS_AUTO) { + self->config.max_get_zcopy = max_ib_msg_size; + } else if (config->tx.max_get_zcopy <= max_ib_msg_size) { + self->config.max_get_zcopy = config->tx.max_get_zcopy; + } else { + ucs_warn("rc_iface on %s:%d: reduced max_get_zcopy to %u", + uct_ib_device_name(dev), self->super.config.port_num, + max_ib_msg_size); + self->config.max_get_zcopy = max_ib_msg_size; + } - uct_rc_iface_set_path_mtu(self, config); + if ((config->tx.max_get_bytes == UCS_MEMUNITS_INF) || + (config->tx.max_get_bytes == UCS_MEMUNITS_AUTO)) { + self->tx.reads_available = SSIZE_MAX; + } else { + self->tx.reads_available = config->tx.max_get_bytes; + } + + uct_ib_fence_info_init(&self->tx.fi); memset(self->eps, 0, sizeof(self->eps)); ucs_arbiter_init(&self->tx.arbiter); ucs_list_head_init(&self->ep_list); @@ -598,7 +618,7 @@ UCS_CLASS_INIT_FUNC(uct_rc_iface_t, uct_rc_iface_ops_t *ops, uct_md_h md, status = UCS_STATS_NODE_ALLOC(&self->stats, &uct_rc_iface_stats_class, self->super.super.stats); if (status != UCS_OK) { - goto err_destroy_tx_mp; + goto err_cleanup_tx_ops; } /* Initialize RX resources (SRQ) */ @@ -630,7 +650,7 @@ UCS_CLASS_INIT_FUNC(uct_rc_iface_t, uct_rc_iface_ops_t *ops, uct_md_h md, &uct_rc_fc_pending_mpool_ops, "pending-fc-grants-only"); if (status != UCS_OK) { - goto err_destroy_srq; + goto 
err_cleanup_rx; } } else { self->config.fc_wnd_size = INT16_MAX; @@ -639,12 +659,11 @@ UCS_CLASS_INIT_FUNC(uct_rc_iface_t, uct_rc_iface_ops_t *ops, uct_md_h md, return UCS_OK; -err_destroy_srq: - if (self->rx.srq.srq != NULL) { - ibv_destroy_srq(self->rx.srq.srq); - } +err_cleanup_rx: + ops->cleanup_rx(self); err_destroy_stats: UCS_STATS_NODE_FREE(self->stats); +err_cleanup_tx_ops: uct_rc_iface_tx_ops_cleanup(self); err_destroy_tx_mp: ucs_mpool_cleanup(&self->tx.mp, 1); @@ -656,8 +675,8 @@ UCS_CLASS_INIT_FUNC(uct_rc_iface_t, uct_rc_iface_ops_t *ops, uct_md_h md, static UCS_CLASS_CLEANUP_FUNC(uct_rc_iface_t) { + uct_rc_iface_ops_t *ops = ucs_derived_of(self->super.ops, uct_rc_iface_ops_t); unsigned i; - int ret; /* Release table. TODO release on-demand when removing ep. */ for (i = 0; i < UCT_RC_QP_TABLE_SIZE; ++i) { @@ -672,13 +691,7 @@ static UCS_CLASS_CLEANUP_FUNC(uct_rc_iface_t) UCS_STATS_NODE_FREE(self->stats); - if (self->rx.srq.srq != NULL) { - /* TODO flush RX buffers */ - ret = ibv_destroy_srq(self->rx.srq.srq); - if (ret) { - ucs_warn("failed to destroy SRQ: %m"); - } - } + ops->cleanup_rx(self); uct_rc_iface_tx_ops_cleanup(self); ucs_mpool_cleanup(&self->tx.mp, 1); ucs_mpool_cleanup(&self->rx.mp, 0); /* Cannot flush SRQ */ @@ -689,31 +702,29 @@ static UCS_CLASS_CLEANUP_FUNC(uct_rc_iface_t) UCS_CLASS_DEFINE(uct_rc_iface_t, uct_ib_iface_t); +void uct_rc_iface_fill_attr(uct_rc_iface_t *iface, uct_ib_qp_attr_t *attr, + unsigned max_send_wr, struct ibv_srq *srq) +{ + attr->srq = srq; + attr->cap.max_send_wr = max_send_wr; + attr->cap.max_recv_wr = 0; + attr->cap.max_send_sge = iface->config.tx_min_sge; + attr->cap.max_recv_sge = 1; + attr->cap.max_inline_data = iface->config.tx_min_inline; + attr->qp_type = iface->super.config.qp_type; + attr->sq_sig_all = !iface->config.tx_moderation; + attr->max_inl_cqe[UCT_IB_DIR_RX] = iface->super.config.max_inl_cqe[UCT_IB_DIR_RX]; + attr->max_inl_cqe[UCT_IB_DIR_TX] = iface->super.config.max_inl_cqe[UCT_IB_DIR_TX]; +} 
ucs_status_t uct_rc_iface_qp_create(uct_rc_iface_t *iface, struct ibv_qp **qp_p, - struct ibv_qp_cap *cap, unsigned max_send_wr) + uct_ib_qp_attr_t *attr, unsigned max_send_wr, + struct ibv_srq *srq) { - uct_ib_qp_attr_t qp_init_attr = {}; - static ucs_status_t status; + uct_rc_iface_fill_attr(iface, attr, max_send_wr, srq); + uct_ib_iface_fill_attr(&iface->super, attr); - if (iface->super.config.qp_type == IBV_QPT_RC) { - qp_init_attr.srq = iface->rx.srq.srq; - } - qp_init_attr.cap.max_send_wr = max_send_wr; - qp_init_attr.cap.max_recv_wr = 0; - qp_init_attr.cap.max_send_sge = iface->config.tx_min_sge; - qp_init_attr.cap.max_recv_sge = 1; - qp_init_attr.cap.max_inline_data = iface->config.tx_min_inline; - qp_init_attr.qp_type = iface->super.config.qp_type; - qp_init_attr.sq_sig_all = !iface->config.tx_moderation; - qp_init_attr.max_inl_recv = iface->config.rx_inline; - - status = iface->super.ops->create_qp(&iface->super, &qp_init_attr, qp_p); - if (status == UCS_OK) { - *cap = qp_init_attr.cap; - } - - return status; + return uct_ib_iface_create_qp(&iface->super, attr, qp_p); } ucs_status_t uct_rc_iface_qp_init(uct_rc_iface_t *iface, struct ibv_qp *qp) @@ -745,7 +756,8 @@ ucs_status_t uct_rc_iface_qp_init(uct_rc_iface_t *iface, struct ibv_qp *qp) ucs_status_t uct_rc_iface_qp_connect(uct_rc_iface_t *iface, struct ibv_qp *qp, const uint32_t dest_qp_num, - struct ibv_ah_attr *ah_attr) + struct ibv_ah_attr *ah_attr, + enum ibv_mtu path_mtu) { #if HAVE_DECL_IBV_EXP_QP_OOO_RW_DATA_PLACEMENT struct ibv_exp_qp_attr qp_attr; @@ -756,12 +768,14 @@ ucs_status_t uct_rc_iface_qp_connect(uct_rc_iface_t *iface, struct ibv_qp *qp, long qp_attr_mask; int ret; + ucs_assert(path_mtu != 0); + memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.qp_state = IBV_QPS_RTR; qp_attr.dest_qp_num = dest_qp_num; qp_attr.rq_psn = 0; - qp_attr.path_mtu = iface->config.path_mtu; + qp_attr.path_mtu = path_mtu; qp_attr.max_dest_rd_atomic = iface->config.max_rd_atomic; qp_attr.min_rnr_timer = 
iface->config.min_rnr_timer; qp_attr.ah_attr = *ah_attr; @@ -868,3 +882,16 @@ ucs_status_t uct_rc_iface_event_arm(uct_iface_h tl_iface, unsigned events) { return uct_rc_iface_common_event_arm(tl_iface, events, 0); } + +ucs_status_t uct_rc_iface_fence(uct_iface_h tl_iface, unsigned flags) +{ + uct_rc_iface_t *iface = ucs_derived_of(tl_iface, uct_rc_iface_t); + + if (iface->config.fence_mode != UCT_RC_FENCE_MODE_NONE) { + iface->tx.fi.fence_beat++; + } + + UCT_TL_IFACE_STAT_FENCE(&iface->super.super); + return UCS_OK; +} + diff --git a/src/uct/ib/rc/base/rc_iface.h b/src/uct/ib/rc/base/rc_iface.h index da598445b23..b59c9416409 100644 --- a/src/uct/ib/rc/base/rc_iface.h +++ b/src/uct/ib/rc/base/rc_iface.h @@ -87,27 +87,18 @@ _desc->super.user_comp = _comp; -enum { - UCT_RC_IFACE_ADDR_TYPE_BASIC, - - /* Tag Matching address. It additionaly contains QP number which - * is used for hardware offloads. */ - UCT_RC_IFACE_ADDR_TYPE_TM, - UCT_RC_IFACE_ADDR_TYPE_LAST -}; - - enum { UCT_RC_IFACE_STAT_RX_COMPLETION, UCT_RC_IFACE_STAT_TX_COMPLETION, UCT_RC_IFACE_STAT_NO_CQE, + UCT_RC_IFACE_STAT_NO_READS, UCT_RC_IFACE_STAT_LAST }; /* flags for uct_rc_iface_send_op_t */ enum { -#if ENABLE_ASSERT +#if UCS_ENABLE_ASSERT UCT_RC_IFACE_SEND_OP_FLAG_ZCOPY = UCS_BIT(13), /* zcopy */ UCT_RC_IFACE_SEND_OP_FLAG_IFACE = UCS_BIT(14), /* belongs to iface ops buffer */ UCT_RC_IFACE_SEND_OP_FLAG_INUSE = UCS_BIT(15) /* queued on a txqp */ @@ -136,23 +127,31 @@ typedef struct uct_rc_fc_request { } uct_rc_fc_request_t; -typedef struct uct_rc_fc_config { - double soft_thresh; -} uct_rc_fc_config_t; +/** + * RC fence type. 
+ */ +typedef enum uct_rc_fence_mode { + UCT_RC_FENCE_MODE_NONE, + UCT_RC_FENCE_MODE_WEAK, + UCT_RC_FENCE_MODE_AUTO, + UCT_RC_FENCE_MODE_LAST +} uct_rc_fence_mode_t; -struct uct_rc_iface_config { +/* Common configuration used for rc verbs, rcx and dc transports */ +typedef struct uct_rc_iface_common_config { uct_ib_iface_config_t super; - uct_ib_mtu_t path_mtu; unsigned max_rd_atomic; int ooo_rw; /* Enable out-of-order RDMA data placement */ + int fence_mode; struct { double timeout; unsigned retry_count; double rnr_timeout; unsigned rnr_retry_count; - unsigned cq_len; + size_t max_get_zcopy; + size_t max_get_bytes; } tx; struct { @@ -160,13 +159,24 @@ struct uct_rc_iface_config { double hard_thresh; unsigned wnd_size; } fc; +} uct_rc_iface_common_config_t; + + +/* RC specific configuration used for rc verbs and rcx transports only */ +struct uct_rc_iface_config { + uct_rc_iface_common_config_t super; + double soft_thresh; + unsigned tx_cq_moderation; /* How many TX messages are + batched to one CQE */ + unsigned tx_cq_len; }; typedef struct uct_rc_iface_ops { uct_ib_iface_ops_t super; ucs_status_t (*init_rx)(uct_rc_iface_t *iface, - const uct_rc_iface_config_t *config); + const uct_rc_iface_common_config_t *config); + void (*cleanup_rx)(uct_rc_iface_t *iface); ucs_status_t (*fc_ctrl)(uct_ep_t *ep, unsigned op, uct_rc_fc_request_t *req); ucs_status_t (*fc_handler)(uct_rc_iface_t *iface, unsigned qp_num, @@ -177,7 +187,6 @@ typedef struct uct_rc_iface_ops { typedef struct uct_rc_srq { - struct ibv_srq *srq; unsigned available; unsigned quota; } uct_rc_srq_t; @@ -196,9 +205,11 @@ struct uct_rc_iface { * In case of verbs TL we use QWE number, so 1 post always takes 1 * credit */ signed cq_available; + ssize_t reads_available; uct_rc_iface_send_op_t *free_ops; /* stack of free send operations */ ucs_arbiter_t arbiter; uct_rc_iface_send_op_t *ops_buffer; + uct_ib_fence_info_t fi; } tx; struct { @@ -211,7 +222,6 @@ struct uct_rc_iface { unsigned tx_min_sge; unsigned 
tx_min_inline; unsigned tx_ops_count; - unsigned rx_inline; uint16_t tx_moderation; /* Threshold to send "soft" FC credit request. The peer will try to @@ -230,12 +240,14 @@ struct uct_rc_iface { uint8_t rnr_retry; uint8_t retry_cnt; uint8_t max_rd_atomic; - enum ibv_mtu path_mtu; /* Enable out-of-order RDMA data placement */ uint8_t ooo_rw; -#if ENABLE_ASSERT +#if UCS_ENABLE_ASSERT int tx_cq_len; #endif + uct_rc_fence_mode_t fence_mode; + unsigned exp_backoff; + size_t max_get_zcopy; /* Atomic callbacks */ uct_rc_send_handler_t atomic64_handler; /* 64bit ib-spec */ @@ -243,7 +255,7 @@ struct uct_rc_iface { uct_rc_send_handler_t atomic64_ext_handler; /* 64bit extended */ } config; - UCS_STATS_NODE_DECLARE(stats); + UCS_STATS_NODE_DECLARE(stats) uct_rc_ep_t **eps[UCT_RC_QP_TABLE_SIZE]; ucs_list_link_t ep_list; @@ -252,7 +264,7 @@ struct uct_rc_iface { ucs_callback_t progress; }; UCS_CLASS_DECLARE(uct_rc_iface_t, uct_rc_iface_ops_t*, uct_md_h, uct_worker_h, - const uct_iface_params_t*, const uct_rc_iface_config_t*, + const uct_iface_params_t*, const uct_rc_iface_common_config_t*, uct_ib_iface_init_attr_t*); @@ -266,9 +278,10 @@ struct uct_rc_iface_send_op { uint16_t flags; unsigned length; union { - void *buffer; /* atomics / desc */ - void *unpack_arg; /* get_bcopy / desc */ - uct_rc_iface_t *iface; /* zcopy / op */ + void *buffer; /* atomics / desc */ + void *unpack_arg; /* get_bcopy / desc */ + uct_rc_iface_t *iface; /* should not be used with + get_bcopy completions */ }; uct_completion_t *user_comp; }; @@ -291,7 +304,7 @@ typedef struct uct_rc_am_short_hdr { extern ucs_config_field_t uct_rc_iface_config_table[]; -extern ucs_config_field_t uct_rc_fc_config_table[]; +extern ucs_config_field_t uct_rc_iface_common_config_table[]; unsigned uct_rc_iface_do_progress(uct_iface_h tl_iface); @@ -299,14 +312,7 @@ ucs_status_t uct_rc_iface_query(uct_rc_iface_t *iface, uct_iface_attr_t *iface_attr, size_t put_max_short, size_t max_inline, size_t am_max_hdr, size_t 
am_max_iov, - size_t tag_max_iov, size_t tag_min_hdr); - -ucs_status_t uct_rc_iface_get_address(uct_iface_h tl_iface, - uct_iface_addr_t *addr); - -int uct_rc_iface_is_reachable(const uct_iface_h tl_iface, - const uct_device_addr_t *dev_addr, - const uct_iface_addr_t *iface_addr); + size_t am_min_hdr, size_t rma_max_iov); void uct_rc_iface_add_qp(uct_rc_iface_t *iface, uct_rc_ep_t *ep, unsigned qp_num); @@ -321,23 +327,29 @@ void uct_rc_iface_send_desc_init(uct_iface_h tl_iface, void *obj, uct_mem_h memh void uct_rc_ep_am_zcopy_handler(uct_rc_iface_send_op_t *op, const void *resp); /** - * Creates an RC or DCI QP and fills 'cap' with QP capabilities; + * Creates an RC or DCI QP */ ucs_status_t uct_rc_iface_qp_create(uct_rc_iface_t *iface, struct ibv_qp **qp_p, - struct ibv_qp_cap *cap, unsigned max_send_wr); + uct_ib_qp_attr_t *attr, unsigned max_send_wr, + struct ibv_srq *srq); + +void uct_rc_iface_fill_attr(uct_rc_iface_t *iface, + uct_ib_qp_attr_t *qp_init_attr, + unsigned max_send_wr, + struct ibv_srq *srq); ucs_status_t uct_rc_iface_qp_init(uct_rc_iface_t *iface, struct ibv_qp *qp); ucs_status_t uct_rc_iface_qp_connect(uct_rc_iface_t *iface, struct ibv_qp *qp, const uint32_t qp_num, - struct ibv_ah_attr *ah_attr); + struct ibv_ah_attr *ah_attr, + enum ibv_mtu path_mtu); ucs_status_t uct_rc_iface_fc_handler(uct_rc_iface_t *iface, unsigned qp_num, uct_rc_hdr_t *hdr, unsigned length, uint32_t imm_data, uint16_t lid, unsigned flags); -ucs_status_t uct_rc_init_fc_thresh(uct_rc_fc_config_t *fc_cfg, - uct_rc_iface_config_t *rc_cfg, +ucs_status_t uct_rc_init_fc_thresh(uct_rc_iface_config_t *rc_cfg, uct_rc_iface_t *iface); ucs_status_t uct_rc_iface_event_arm(uct_iface_h tl_iface, unsigned events); @@ -346,7 +358,10 @@ ucs_status_t uct_rc_iface_common_event_arm(uct_iface_h tl_iface, unsigned events, int force_rx_all); ucs_status_t uct_rc_iface_init_rx(uct_rc_iface_t *iface, - const uct_rc_iface_config_t *config); + const uct_rc_iface_common_config_t *config, + struct 
ibv_srq **p_srq); + +ucs_status_t uct_rc_iface_fence(uct_iface_h tl_iface, unsigned flags); static UCS_F_ALWAYS_INLINE ucs_status_t uct_rc_fc_ctrl(uct_ep_t *ep, unsigned op, uct_rc_fc_request_t *req) @@ -421,7 +436,8 @@ static inline void uct_rc_zcopy_desc_set_header(uct_rc_hdr_t *rch, static inline int uct_rc_iface_has_tx_resources(uct_rc_iface_t *iface) { return uct_rc_iface_have_tx_cqe_avail(iface) && - !ucs_mpool_is_empty(&iface->tx.mp); + !ucs_mpool_is_empty(&iface->tx.mp) && + (iface->tx.reads_available > 0); } static UCS_F_ALWAYS_INLINE uct_rc_send_handler_t @@ -438,4 +454,18 @@ uct_rc_iface_atomic_handler(uct_rc_iface_t *iface, int ext, unsigned length) return NULL; } +static UCS_F_ALWAYS_INLINE ucs_status_t +uct_rc_iface_fence_relaxed_order(uct_iface_h tl_iface) +{ + uct_base_iface_t *iface = ucs_derived_of(tl_iface, uct_base_iface_t); + uct_ib_md_t *md = ucs_derived_of(iface->md, uct_ib_md_t); + + ucs_assert(tl_iface->ops.iface_fence == uct_rc_iface_fence); + + if (!md->relaxed_order) { + return UCS_OK; + } + + return uct_rc_iface_fence(tl_iface, 0); +} #endif diff --git a/src/uct/ib/rc/verbs/rc_verbs.h b/src/uct/ib/rc/verbs/rc_verbs.h index bec9b3dd4de..9e3918326ee 100644 --- a/src/uct/ib/rc/verbs/rc_verbs.h +++ b/src/uct/ib/rc/verbs/rc_verbs.h @@ -11,6 +11,7 @@ #include #include + #define UCT_RC_VERBS_IFACE_FOREACH_TXWQE(_iface, _i, _wc, _num_wcs) \ status = uct_ib_poll_cq((_iface)->super.cq[UCT_IB_DIR_TX], &_num_wcs, _wc); \ if (status != UCS_OK) { \ @@ -21,17 +22,37 @@ for (_i = 0; _i < _num_wcs; ++_i) +enum { + UCT_RC_VERBS_ADDR_HAS_ATOMIC_MR = UCS_BIT(0) +}; + + +typedef struct uct_rc_verbs_ep_address { + uint8_t flags; + uct_ib_uint24_t qp_num; + uint64_t flush_addr; + uint32_t flush_rkey; +} UCS_S_PACKED uct_rc_verbs_ep_address_t; + + typedef struct uct_rc_verbs_txcnt { uint16_t pi; /* producer (post_send) count */ uint16_t ci; /* consumer (ibv_poll_cq) completion count */ } uct_rc_verbs_txcnt_t; + /** * RC verbs communication context. 
*/ typedef struct uct_rc_verbs_ep { uct_rc_ep_t super; uct_rc_verbs_txcnt_t txcnt; + uct_ib_fence_info_t fi; + struct ibv_qp *qp; + struct { + uintptr_t remote_addr; + uint32_t rkey; + } flush; } uct_rc_verbs_ep_t; @@ -42,7 +63,6 @@ typedef struct uct_rc_verbs_iface_config { uct_rc_iface_config_t super; size_t max_am_hdr; unsigned tx_max_wr; - uct_rc_fc_config_t fc; } uct_rc_verbs_iface_config_t; @@ -51,20 +71,26 @@ typedef struct uct_rc_verbs_iface_config { */ typedef struct uct_rc_verbs_iface { uct_rc_iface_t super; + struct ibv_srq *srq; struct ibv_send_wr inl_am_wr; struct ibv_send_wr inl_rwrite_wr; struct ibv_sge inl_sge[2]; uct_rc_am_short_hdr_t am_inl_hdr; ucs_mpool_t short_desc_mp; - uct_rc_iface_send_desc_t *fc_desc; /* used when max_inline is zero */ + uct_rc_iface_send_desc_t *fc_desc; /* used when max_inline is zero */ + struct ibv_mr *flush_mr; /* MR for writing dummy value to flush */ + void *flush_mem; struct { size_t short_desc_size; size_t max_inline; + size_t max_send_sge; unsigned tx_max_wr; } config; } uct_rc_verbs_iface_t; +ucs_status_t uct_rc_verbs_iface_flush_mem_create(uct_rc_verbs_iface_t *iface); + UCS_CLASS_DECLARE(uct_rc_verbs_ep_t, const uct_ep_params_t *); UCS_CLASS_DECLARE_NEW_FUNC(uct_rc_verbs_ep_t, uct_ep_t, const uct_ep_params_t *); UCS_CLASS_DECLARE_DELETE_FUNC(uct_rc_verbs_ep_t, uct_ep_t); @@ -109,30 +135,29 @@ ucs_status_t uct_rc_verbs_ep_atomic_cswap64(uct_ep_h tl_ep, uint64_t compare, ui uint64_t remote_addr, uct_rkey_t rkey, uint64_t *result, uct_completion_t *comp); -ucs_status_t uct_rc_verbs_ep_atomic_cswap32(uct_ep_h tl_ep, uint32_t compare, uint32_t swap, - uint64_t remote_addr, uct_rkey_t rkey, - uint32_t *result, uct_completion_t *comp); - ucs_status_t uct_rc_verbs_ep_atomic64_post(uct_ep_h tl_ep, unsigned opcode, uint64_t value, uint64_t remote_addr, uct_rkey_t rkey); -ucs_status_t uct_rc_verbs_ep_atomic32_post(uct_ep_h tl_ep, unsigned opcode, uint32_t value, - uint64_t remote_addr, uct_rkey_t rkey); - ucs_status_t 
uct_rc_verbs_ep_atomic64_fetch(uct_ep_h tl_ep, uct_atomic_op_t opcode, uint64_t value, uint64_t *result, uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp); -ucs_status_t uct_rc_verbs_ep_atomic32_fetch(uct_ep_h tl_ep, uct_atomic_op_t opcode, - uint32_t value, uint32_t *result, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp); - ucs_status_t uct_rc_verbs_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp); +ucs_status_t uct_rc_verbs_ep_fence(uct_ep_h tl_ep, unsigned flags); + ucs_status_t uct_rc_verbs_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, uct_rc_fc_request_t *req); +ucs_status_t uct_rc_verbs_ep_handle_failure(uct_rc_verbs_ep_t *ep, + ucs_status_t status); + +ucs_status_t uct_rc_verbs_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr); + +ucs_status_t uct_rc_verbs_ep_connect_to_ep(uct_ep_h tl_ep, + const uct_device_addr_t *dev_addr, + const uct_ep_addr_t *ep_addr); + #endif diff --git a/src/uct/ib/rc/verbs/rc_verbs_ep.c b/src/uct/ib/rc/verbs/rc_verbs_ep.c index cce921f8007..69701407280 100644 --- a/src/uct/ib/rc/verbs/rc_verbs_ep.c +++ b/src/uct/ib/rc/verbs/rc_verbs_ep.c @@ -5,6 +5,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rc_verbs.h" #include "rc_verbs_impl.h" @@ -16,6 +20,14 @@ void uct_rc_verbs_txcnt_init(uct_rc_verbs_txcnt_t *txcnt) txcnt->pi = txcnt->ci = 0; } +static UCS_F_ALWAYS_INLINE void +uct_rc_verbs_ep_fence_put(uct_rc_verbs_iface_t *iface, uct_rc_verbs_ep_t *ep, + uct_rkey_t *rkey, uint64_t *addr) +{ + uct_rc_ep_fence_put(&iface->super, &ep->fi, rkey, addr, + ep->super.atomic_mr_offset); +} + static UCS_F_ALWAYS_INLINE void uct_rc_verbs_ep_post_send(uct_rc_verbs_iface_t* iface, uct_rc_verbs_ep_t* ep, struct ibv_send_wr *wr, int send_flags, int max_log_sge) @@ -23,19 +35,24 @@ uct_rc_verbs_ep_post_send(uct_rc_verbs_iface_t* iface, uct_rc_verbs_ep_t* ep, struct ibv_send_wr *bad_wr; int ret; - uct_rc_txqp_check(&ep->super.txqp); + ucs_assertv(ep->qp->state == IBV_QPS_RTS, "QP 0x%x state is %d", + ep->qp->qp_num, ep->qp->state); if (!(send_flags & IBV_SEND_SIGNALED)) { send_flags |= uct_rc_iface_tx_moderation(&iface->super, &ep->super.txqp, IBV_SEND_SIGNALED); } + if (wr->opcode == IBV_WR_RDMA_READ) { + send_flags |= uct_rc_ep_fm(&iface->super, &ep->fi, IBV_SEND_FENCE); + } + wr->send_flags = send_flags; wr->wr_id = uct_rc_txqp_unsignaled(&ep->super.txqp); - uct_ib_log_post_send(&iface->super.super, ep->super.txqp.qp, wr, max_log_sge, + uct_ib_log_post_send(&iface->super.super, ep->qp, wr, max_log_sge, (wr->opcode == IBV_WR_SEND) ? 
uct_rc_ep_packet_dump : NULL); - ret = ibv_post_send(ep->super.txqp.qp, wr, &bad_wr); + ret = ibv_post_send(ep->qp, wr, &bad_wr); if (ret != 0) { ucs_fatal("ibv_post_send() returned %d (%m)", ret); } @@ -43,36 +60,6 @@ uct_rc_verbs_ep_post_send(uct_rc_verbs_iface_t* iface, uct_rc_verbs_ep_t* ep, uct_rc_verbs_txqp_posted(&ep->super.txqp, &ep->txcnt, &iface->super, send_flags & IBV_SEND_SIGNALED); } -#if HAVE_DECL_IBV_EXP_POST_SEND && (HAVE_DECL_IBV_EXP_WR_NOP || HAVE_IB_EXT_ATOMICS) -static UCS_F_ALWAYS_INLINE void -uct_rc_verbs_exp_post_send(uct_rc_verbs_ep_t *ep, struct ibv_exp_send_wr *wr, - uint64_t signal, int max_log_sge) -{ - uct_rc_verbs_iface_t *iface = ucs_derived_of(ep->super.super.super.iface, - uct_rc_verbs_iface_t); - uct_rc_txqp_check(&ep->super.txqp); - - struct ibv_exp_send_wr *bad_wr; - int ret; - - signal |= uct_rc_iface_tx_moderation(&iface->super, &ep->super.txqp, - IBV_EXP_SEND_SIGNALED); - wr->exp_send_flags = signal; - wr->wr_id = uct_rc_txqp_unsignaled(&ep->super.txqp); - - uct_ib_log_exp_post_send(&iface->super.super, ep->super.txqp.qp, wr, max_log_sge, - (wr->exp_opcode == IBV_EXP_WR_SEND) ? - uct_rc_ep_packet_dump : NULL); - - ret = ibv_exp_post_send(ep->super.txqp.qp, wr, &bad_wr); - if (ret != 0) { - ucs_fatal("ibv_exp_post_send() returned %d (%m)", ret); - } - - uct_rc_verbs_txqp_posted(&ep->super.txqp, &ep->txcnt, &iface->super, signal); -} -#endif - /* * Helper function for posting sends with a descriptor. 
* User needs to fill: wr.opcode, wr.sg_list, wr.num_sge, first sge length, and @@ -92,8 +79,10 @@ uct_rc_verbs_ep_post_send_desc(uct_rc_verbs_ep_t* ep, struct ibv_send_wr *wr, static inline ucs_status_t uct_rc_verbs_ep_rdma_zcopy(uct_rc_verbs_ep_t *ep, const uct_iov_t *iov, - size_t iovcnt, uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp, int opcode) + size_t iovcnt, size_t iov_total_length, + uint64_t remote_addr, uct_rkey_t rkey, + uct_completion_t *comp, uct_rc_send_handler_t handler, + int opcode) { uct_rc_verbs_iface_t *iface = ucs_derived_of(ep->super.super.super.iface, uct_rc_verbs_iface_t); @@ -101,15 +90,22 @@ uct_rc_verbs_ep_rdma_zcopy(uct_rc_verbs_ep_t *ep, const uct_iov_t *iov, struct ibv_send_wr wr; size_t sge_cnt; - UCT_RC_CHECK_RES(&iface->super, &ep->super); + ucs_assertv(iovcnt <= ucs_min(UCT_IB_MAX_IOV, iface->config.max_send_sge), + "iovcnt %zu, maxcnt (%zu, %zu)", + iovcnt, UCT_IB_MAX_IOV, iface->config.max_send_sge); + + UCT_RC_CHECK_RMA_RES(&iface->super, &ep->super); sge_cnt = uct_ib_verbs_sge_fill_iov(sge, iov, iovcnt); + /* cppcheck-suppress syntaxError */ UCT_SKIP_ZERO_LENGTH(sge_cnt); - UCT_RC_VERBS_FILL_RDMA_WR_IOV(wr, wr.opcode, opcode, sge, sge_cnt, remote_addr, rkey); + UCT_RC_VERBS_FILL_RDMA_WR_IOV(wr, wr.opcode, (enum ibv_wr_opcode)opcode, + sge, sge_cnt, remote_addr, rkey); wr.next = NULL; uct_rc_verbs_ep_post_send(iface, ep, &wr, IBV_SEND_SIGNALED, INT_MAX); - uct_rc_txqp_add_send_comp(&iface->super, &ep->super.txqp, comp, ep->txcnt.pi, - UCT_RC_IFACE_SEND_OP_FLAG_ZCOPY); + uct_rc_txqp_add_send_comp(&iface->super, &ep->super.txqp, handler, comp, + ep->txcnt.pi, UCT_RC_IFACE_SEND_OP_FLAG_ZCOPY, + iov_total_length); return UCS_INPROGRESS; } @@ -118,13 +114,12 @@ uct_rc_verbs_ep_atomic_post(uct_rc_verbs_ep_t *ep, int opcode, uint64_t compare_ uint64_t swap, uint64_t remote_addr, uct_rkey_t rkey, uct_rc_iface_send_desc_t *desc, int force_sig) { - uint32_t ib_rkey = uct_ib_resolve_atomic_rkey(rkey, 
ep->super.atomic_mr_offset, - &remote_addr); struct ibv_send_wr wr; struct ibv_sge sge; - UCT_RC_VERBS_FILL_ATOMIC_WR(wr, wr.opcode, sge, opcode, compare_add, swap, - remote_addr, ib_rkey); + UCT_RC_VERBS_FILL_ATOMIC_WR(wr, wr.opcode, sge, (enum ibv_wr_opcode)opcode, + compare_add, swap, remote_addr, + uct_ib_md_direct_rkey(rkey)); UCT_TL_EP_STAT_ATOMIC(&ep->super.super); uct_rc_verbs_ep_post_send_desc(ep, &wr, desc, force_sig, INT_MAX); } @@ -138,65 +133,27 @@ uct_rc_verbs_ep_atomic(uct_rc_verbs_ep_t *ep, int opcode, void *result, uct_rc_verbs_iface_t); uct_rc_iface_send_desc_t *desc; - UCT_RC_CHECK_RES(&iface->super, &ep->super); + UCT_RC_CHECK_RMA_RES(&iface->super, &ep->super); UCT_RC_IFACE_GET_TX_ATOMIC_FETCH_DESC(&iface->super, &iface->short_desc_mp, desc, iface->super.config.atomic64_handler, result, comp); uct_rc_verbs_ep_atomic_post(ep, opcode, compare_add, swap, remote_addr, - rkey, desc, IBV_SEND_SIGNALED); - return UCS_INPROGRESS; -} - -#if HAVE_IB_EXT_ATOMICS -static inline void -uct_rc_verbs_ep_ext_atomic_post(uct_rc_verbs_ep_t *ep, int opcode, uint32_t length, - uint64_t compare_mask, uint64_t compare_add, - uint64_t swap, uint64_t remote_addr, uct_rkey_t rkey, - uct_rc_iface_send_desc_t *desc, uint64_t force_sig) -{ - struct ibv_exp_send_wr wr; - struct ibv_sge sge; - - uct_rc_verbs_fill_ext_atomic_wr(&wr, &sge, opcode, length, compare_mask, - compare_add, swap, remote_addr, rkey, ep->super.atomic_mr_offset); - UCT_RC_VERBS_FILL_DESC_WR(&wr, desc); - UCT_TL_EP_STAT_ATOMIC(&ep->super.super); - uct_rc_verbs_exp_post_send(ep, &wr, force_sig|IBV_EXP_SEND_EXT_ATOMIC_INLINE, INT_MAX); - uct_rc_txqp_add_send_op_sn(&ep->super.txqp, &desc->super, ep->txcnt.pi); -} - -static inline ucs_status_t -uct_rc_verbs_ep_ext_atomic(uct_rc_verbs_ep_t *ep, int opcode, void *result, - uint32_t length, uint64_t compare_mask, - uint64_t compare_add, uint64_t swap, uint64_t remote_addr, - uct_rkey_t rkey, uct_completion_t *comp) -{ - uct_rc_verbs_iface_t *iface = 
ucs_derived_of(ep->super.super.super.iface, - uct_rc_verbs_iface_t); - uct_rc_send_handler_t handler = uct_rc_iface_atomic_handler(&iface->super, 1, - length); - uct_rc_iface_send_desc_t *desc; - - UCT_RC_CHECK_RES(&iface->super, &ep->super); - UCT_RC_IFACE_GET_TX_ATOMIC_FETCH_DESC(&iface->super, &iface->short_desc_mp, - desc, handler, result, comp); - uct_rc_verbs_ep_ext_atomic_post(ep, opcode, length, compare_mask, compare_add, - swap, remote_addr, rkey, desc, - IBV_EXP_SEND_SIGNALED); + rkey, desc, IBV_SEND_SIGNALED | + uct_rc_ep_fm(&iface->super, &ep->fi, IBV_SEND_FENCE)); return UCS_INPROGRESS; } -#endif ucs_status_t uct_rc_verbs_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length, uint64_t remote_addr, uct_rkey_t rkey) { uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_verbs_iface_t); - uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); + uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); UCT_CHECK_LENGTH(length, 0, iface->config.max_inline, "put_short"); - UCT_RC_CHECK_RES(&iface->super, &ep->super); + UCT_RC_CHECK_RMA_RES(&iface->super, &ep->super); + uct_rc_verbs_ep_fence_put(iface, ep, &rkey, &remote_addr); UCT_RC_VERBS_FILL_INL_PUT_WR(iface, remote_addr, rkey, buffer, length); UCT_TL_EP_STAT_OP(&ep->super.super, PUT, SHORT, length); uct_rc_verbs_ep_post_send(iface, ep, &iface->inl_rwrite_wr, @@ -208,15 +165,16 @@ ssize_t uct_rc_verbs_ep_put_bcopy(uct_ep_h tl_ep, uct_pack_callback_t pack_cb, void *arg, uint64_t remote_addr, uct_rkey_t rkey) { uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_verbs_iface_t); - uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); + uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); uct_rc_iface_send_desc_t *desc; struct ibv_send_wr wr; struct ibv_sge sge; size_t length; - UCT_RC_CHECK_RES(&iface->super, &ep->super); + UCT_RC_CHECK_RMA_RES(&iface->super, &ep->super); UCT_RC_IFACE_GET_TX_PUT_BCOPY_DESC(&iface->super, 
&iface->super.tx.mp, desc, pack_cb, arg, length); + uct_rc_verbs_ep_fence_put(iface, ep, &rkey, &remote_addr); UCT_RC_VERBS_FILL_RDMA_WR(wr, wr.opcode, IBV_WR_RDMA_WRITE, sge, length, remote_addr, rkey); UCT_TL_EP_STAT_OP(&ep->super.super, PUT, BCOPY, length); @@ -228,14 +186,18 @@ ucs_status_t uct_rc_verbs_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, siz uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp) { - uct_ib_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ib_iface_t); - uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); + uct_rc_verbs_iface_t UCS_V_UNUSED *iface = ucs_derived_of(tl_ep->iface, + uct_rc_verbs_iface_t); + uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, + uct_rc_verbs_ep_t); ucs_status_t status; - UCT_CHECK_IOV_SIZE(iovcnt, uct_ib_iface_get_max_iov(iface), + UCT_CHECK_IOV_SIZE(iovcnt, iface->config.max_send_sge, "uct_rc_verbs_ep_put_zcopy"); - status = uct_rc_verbs_ep_rdma_zcopy(ep, iov, iovcnt, remote_addr, - rkey, comp, IBV_WR_RDMA_WRITE); + uct_rc_verbs_ep_fence_put(iface, ep, &rkey, &remote_addr); + status = uct_rc_verbs_ep_rdma_zcopy(ep, iov, iovcnt, 0ul, remote_addr, rkey, + comp, uct_rc_ep_send_op_completion_handler, + IBV_WR_RDMA_WRITE); UCT_TL_EP_STAT_OP_IF_SUCCESS(status, &ep->super.super, PUT, ZCOPY, uct_iov_total_length(iov, iovcnt)); return status; @@ -254,33 +216,42 @@ ucs_status_t uct_rc_verbs_ep_get_bcopy(uct_ep_h tl_ep, struct ibv_sge sge; UCT_CHECK_LENGTH(length, 0, iface->super.super.config.seg_size, "get_bcopy"); - UCT_RC_CHECK_RES(&iface->super, &ep->super); + UCT_RC_CHECK_RMA_RES(&iface->super, &ep->super); UCT_RC_IFACE_GET_TX_GET_BCOPY_DESC(&iface->super, &iface->super.tx.mp, desc, unpack_cb, comp, arg, length); UCT_RC_VERBS_FILL_RDMA_WR(wr, wr.opcode, IBV_WR_RDMA_READ, sge, length, remote_addr, - rkey); + uct_ib_md_direct_rkey(rkey)); UCT_TL_EP_STAT_OP(&ep->super.super, GET, BCOPY, length); uct_rc_verbs_ep_post_send_desc(ep, &wr, desc, IBV_SEND_SIGNALED, INT_MAX); + 
UCT_RC_RDMA_READ_POSTED(&iface->super, length); return UCS_INPROGRESS; } -ucs_status_t uct_rc_verbs_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp) +ucs_status_t uct_rc_verbs_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, + size_t iovcnt, uint64_t remote_addr, + uct_rkey_t rkey, uct_completion_t *comp) { - uct_ib_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ib_iface_t); - uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); + uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface, + uct_rc_verbs_iface_t); + uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); + size_t total_length = uct_iov_total_length(iov, iovcnt); ucs_status_t status; - UCT_CHECK_IOV_SIZE(iovcnt, uct_ib_iface_get_max_iov(iface), + UCT_CHECK_IOV_SIZE(iovcnt, iface->config.max_send_sge, "uct_rc_verbs_ep_get_zcopy"); - status = uct_rc_verbs_ep_rdma_zcopy(ep, iov, iovcnt, remote_addr, - rkey, comp, IBV_WR_RDMA_READ); - if (status == UCS_INPROGRESS) { - UCT_TL_EP_STAT_OP(&ep->super.super, GET, ZCOPY, - uct_iov_total_length(iov, iovcnt)); + UCT_CHECK_LENGTH(total_length, + iface->super.super.config.max_inl_cqe[UCT_IB_DIR_TX] + 1, + iface->super.config.max_get_zcopy, "get_zcopy"); + + status = uct_rc_verbs_ep_rdma_zcopy(ep, iov, iovcnt, total_length, remote_addr, + uct_ib_md_direct_rkey(rkey), comp, + uct_rc_ep_get_zcopy_completion_handler, + IBV_WR_RDMA_READ); + if (!UCS_STATUS_IS_ERR(status)) { + UCT_RC_RDMA_READ_POSTED(&iface->super, total_length); + UCT_TL_EP_STAT_OP(&ep->super.super, GET, ZCOPY, total_length); } return status; } @@ -343,7 +314,8 @@ ucs_status_t uct_rc_verbs_ep_am_zcopy(uct_ep_h tl_ep, uint8_t id, const void *he int send_flags; size_t sge_cnt; - UCT_CHECK_IOV_SIZE(iovcnt, uct_ib_iface_get_max_iov(&iface->super.super) - 1, + /* 1 iov consumed by am header */ + UCT_CHECK_IOV_SIZE(iovcnt, iface->config.max_send_sge - 1, "uct_rc_verbs_ep_am_zcopy"); 
UCT_RC_CHECK_AM_ZCOPY(id, header_length, uct_iov_total_length(iov, iovcnt), iface->config.short_desc_size, @@ -379,7 +351,7 @@ ucs_status_t uct_rc_verbs_ep_atomic64_post(uct_ep_h tl_ep, unsigned opcode, uint } /* TODO don't allocate descriptor - have dummy buffer */ - UCT_RC_CHECK_RES(&iface->super, &ep->super); + UCT_RC_CHECK_RMA_RES(&iface->super, &ep->super); UCT_RC_IFACE_GET_TX_ATOMIC_DESC(&iface->super, &iface->short_desc_mp, desc); uct_rc_verbs_ep_atomic_post(ep, @@ -389,76 +361,18 @@ ucs_status_t uct_rc_verbs_ep_atomic64_post(uct_ep_h tl_ep, unsigned opcode, uint return UCS_OK; } -ucs_status_t uct_rc_verbs_ep_atomic32_post(uct_ep_h tl_ep, unsigned opcode, uint32_t value, - uint64_t remote_addr, uct_rkey_t rkey) -{ -#if HAVE_IB_EXT_ATOMICS - uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_verbs_iface_t); - uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); - uct_rc_iface_send_desc_t *desc; - - if (opcode != UCT_ATOMIC_OP_ADD) { - return UCS_ERR_UNSUPPORTED; - } - - UCT_RC_CHECK_RES(&iface->super, &ep->super); - UCT_RC_IFACE_GET_TX_ATOMIC_DESC(&iface->super, &iface->short_desc_mp, desc); - - uct_rc_verbs_ep_ext_atomic_post(ep, IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD, - sizeof(uint32_t), 0, value, 0, remote_addr, - rkey, desc, IBV_EXP_SEND_SIGNALED); - return UCS_OK; -#else - return UCS_ERR_UNSUPPORTED; -#endif -} - ucs_status_t uct_rc_verbs_ep_atomic64_fetch(uct_ep_h tl_ep, uct_atomic_op_t opcode, uint64_t value, uint64_t *result, uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp) { - switch (opcode) { - case UCT_ATOMIC_OP_ADD: - return uct_rc_verbs_ep_atomic(ucs_derived_of(tl_ep, uct_rc_verbs_ep_t), - IBV_WR_ATOMIC_FETCH_AND_ADD, result, value, 0, - remote_addr, rkey, comp); -#if HAVE_IB_EXT_ATOMICS - case UCT_ATOMIC_OP_SWAP: - return uct_rc_verbs_ep_ext_atomic(ucs_derived_of(tl_ep, uct_rc_verbs_ep_t), - IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP, - result, sizeof(uint64_t), 0, 0, value, remote_addr, - rkey, 
comp); -#endif - default: - break; - } - - return UCS_ERR_UNSUPPORTED; -} - -ucs_status_t uct_rc_verbs_ep_atomic32_fetch(uct_ep_h tl_ep, uct_atomic_op_t opcode, - uint32_t value, uint32_t *result, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp) -{ -#if HAVE_IB_EXT_ATOMICS - int op; - uint32_t add; - uint32_t swap; - ucs_status_t status; - - status = uct_rc_verbs_ep_atomic32_data(opcode, value, &op, &add, &swap); - if (UCS_STATUS_IS_ERR(status)) { - return status; + if (opcode != UCT_ATOMIC_OP_ADD) { + return UCS_ERR_UNSUPPORTED; } - return uct_rc_verbs_ep_ext_atomic(ucs_derived_of(tl_ep, uct_rc_verbs_ep_t), op, - result, sizeof(uint32_t), 0, add, swap, - remote_addr, rkey, comp); -#else - return UCS_ERR_UNSUPPORTED; -#endif + return uct_rc_verbs_ep_atomic(ucs_derived_of(tl_ep, uct_rc_verbs_ep_t), + IBV_WR_ATOMIC_FETCH_AND_ADD, result, value, 0, + remote_addr, rkey, comp); } ucs_status_t uct_rc_verbs_ep_atomic_cswap64(uct_ep_h tl_ep, uint64_t compare, uint64_t swap, @@ -470,38 +384,34 @@ ucs_status_t uct_rc_verbs_ep_atomic_cswap64(uct_ep_h tl_ep, uint64_t compare, ui remote_addr, rkey, comp); } -ucs_status_t uct_rc_verbs_ep_atomic_cswap32(uct_ep_h tl_ep, uint32_t compare, uint32_t swap, - uint64_t remote_addr, uct_rkey_t rkey, - uint32_t *result, uct_completion_t *comp) -{ -#if HAVE_IB_EXT_ATOMICS - return uct_rc_verbs_ep_ext_atomic(ucs_derived_of(tl_ep, uct_rc_verbs_ep_t), - IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP, - result, sizeof(uint32_t), (uint32_t)(-1), - compare, swap, remote_addr, rkey, comp); -#else - return UCS_ERR_UNSUPPORTED; -#endif -} - -static ucs_status_t uct_rc_verbs_ep_nop(uct_rc_verbs_ep_t *ep) +static ucs_status_t uct_rc_verbs_ep_post_flush(uct_rc_verbs_ep_t *ep) { -#if HAVE_DECL_IBV_EXP_WR_NOP uct_rc_verbs_iface_t *iface = ucs_derived_of(ep->super.super.super.iface, uct_rc_verbs_iface_t); - struct ibv_exp_send_wr wr; + struct ibv_send_wr wr; + struct ibv_sge sge; + int inl_flag; - wr.next = NULL; - wr.num_sge = 0; - 
wr.exp_opcode = IBV_EXP_WR_NOP; - wr.exp_send_flags = IBV_EXP_SEND_FENCE; - wr.comp_mask = 0; UCT_RC_CHECK_RES(&iface->super, &ep->super); - uct_rc_verbs_exp_post_send(ep, &wr, IBV_EXP_SEND_SIGNALED, INT_MAX); + + /* + * Send small RDMA_WRITE as a flush operation + * (some adapters do not support 0-size RDMA_WRITE or inline sends) + */ + sge.addr = (uintptr_t)iface->flush_mem; + sge.length = 1; + sge.lkey = iface->flush_mr->lkey; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_RDMA_WRITE; + wr.wr.rdma.remote_addr = ep->flush.remote_addr; + wr.wr.rdma.rkey = ep->flush.rkey; + inl_flag = (iface->config.max_inline >= sge.length) ? + IBV_SEND_INLINE : 0; + + uct_rc_verbs_ep_post_send(iface, ep, &wr, inl_flag | IBV_SEND_SIGNALED, 1); return UCS_OK; -#else - return UCS_ERR_UNSUPPORTED; -#endif } ucs_status_t uct_rc_verbs_ep_flush(uct_ep_h tl_ep, unsigned flags, @@ -511,17 +421,19 @@ ucs_status_t uct_rc_verbs_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); ucs_status_t status; + if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) { + uct_ep_pending_purge(&ep->super.super.super, NULL, 0); + uct_rc_verbs_ep_handle_failure(ep, UCS_ERR_CANCELED); + return UCS_OK; + } + status = uct_rc_ep_flush(&ep->super, iface->config.tx_max_wr, flags); if (status != UCS_INPROGRESS) { return status; } if (uct_rc_txqp_unsignaled(&ep->super.txqp) != 0) { - if (IBV_DEVICE_HAS_NOP(&uct_ib_iface_device(&iface->super.super)->dev_attr)) { - status = uct_rc_verbs_ep_nop(ep); - } else { - status = uct_rc_verbs_ep_put_short(tl_ep, NULL, 0, 0, 0); - } + status = uct_rc_verbs_ep_post_flush(ep); if (status != UCS_OK) { return status; } @@ -531,6 +443,13 @@ ucs_status_t uct_rc_verbs_ep_flush(uct_ep_h tl_ep, unsigned flags, &ep->super.txqp, comp, ep->txcnt.pi); } +ucs_status_t uct_rc_verbs_ep_fence(uct_ep_h tl_ep, unsigned flags) +{ + uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); + + return 
uct_rc_ep_fence(tl_ep, &ep->fi, 1); +} + ucs_status_t uct_rc_verbs_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, uct_rc_fc_request_t *req) { @@ -574,16 +493,119 @@ ucs_status_t uct_rc_verbs_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, return UCS_OK; } +ucs_status_t uct_rc_verbs_ep_handle_failure(uct_rc_verbs_ep_t *ep, + ucs_status_t status) +{ + uct_rc_iface_t *iface = ucs_derived_of(ep->super.super.super.iface, + uct_rc_iface_t); + + iface->tx.cq_available += ep->txcnt.pi - ep->txcnt.ci; + /* Reset CI to prevent cq_available overrun on ep_destroy */ + ep->txcnt.ci = ep->txcnt.pi; + uct_rc_txqp_purge_outstanding(&ep->super.txqp, status, 0); + + return iface->super.ops->set_ep_failed(&iface->super, &ep->super.super.super, + status); +} + +ucs_status_t uct_rc_verbs_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr) +{ + uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface, + uct_rc_verbs_iface_t); + uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); + uct_ib_md_t *md = uct_ib_iface_md(&iface->super.super); + uct_rc_verbs_ep_address_t *rc_addr = (uct_rc_verbs_ep_address_t*)addr; + ucs_status_t status; + uint8_t mr_id; + + status = uct_rc_verbs_iface_flush_mem_create(iface); + if (status != UCS_OK) { + return status; + } + + rc_addr->flags = 0; + rc_addr->flush_addr = (uintptr_t)iface->flush_mem; + rc_addr->flush_rkey = iface->flush_mr->rkey; + uct_ib_pack_uint24(rc_addr->qp_num, ep->qp->qp_num); + + if (md->ops->get_atomic_mr_id(md, &mr_id) == UCS_OK) { + rc_addr->flags |= UCT_RC_VERBS_ADDR_HAS_ATOMIC_MR; + *(uint8_t*)(rc_addr + 1) = mr_id; + } + return UCS_OK; +} + +ucs_status_t uct_rc_verbs_ep_connect_to_ep(uct_ep_h tl_ep, + const uct_device_addr_t *dev_addr, + const uct_ep_addr_t *ep_addr) +{ + uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, + uct_rc_verbs_ep_t); + uct_rc_iface_t *iface = ucs_derived_of(tl_ep->iface, + uct_rc_iface_t); + const uct_ib_address_t *ib_addr = (const uct_ib_address_t *)dev_addr; + const uct_rc_verbs_ep_address_t *rc_addr = + 
(const uct_rc_verbs_ep_address_t*)ep_addr; + ucs_status_t status; + uint32_t qp_num; + struct ibv_ah_attr ah_attr; + enum ibv_mtu path_mtu; + + uct_ib_iface_fill_ah_attr_from_addr(&iface->super, ib_addr, + ep->super.path_index, &ah_attr, + &path_mtu); + ucs_assert(path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU); + + qp_num = uct_ib_unpack_uint24(rc_addr->qp_num); + status = uct_rc_iface_qp_connect(iface, ep->qp, qp_num, &ah_attr, path_mtu); + if (status != UCS_OK) { + return status; + } + + ep->flush.remote_addr = rc_addr->flush_addr; + ep->flush.rkey = rc_addr->flush_rkey; + + if (rc_addr->flags & UCT_RC_VERBS_ADDR_HAS_ATOMIC_MR) { + ep->super.atomic_mr_offset = *(uint8_t*)(rc_addr + 1); + } else { + ep->super.atomic_mr_offset = 0; + } + + return UCS_OK; +} + UCS_CLASS_INIT_FUNC(uct_rc_verbs_ep_t, const uct_ep_params_t *params) { uct_rc_verbs_iface_t *iface = ucs_derived_of(params->iface, uct_rc_verbs_iface_t); + uct_ib_qp_attr_t attr = {}; + ucs_status_t status; + + status = uct_rc_iface_qp_create(&iface->super, &self->qp, &attr, + iface->super.config.tx_qp_len, iface->srq); + if (status != UCS_OK) { + goto err; + } + + UCS_CLASS_CALL_SUPER_INIT(uct_rc_ep_t, &iface->super, self->qp->qp_num, + params); - UCS_CLASS_CALL_SUPER_INIT(uct_rc_ep_t, &iface->super); + status = uct_rc_iface_qp_init(&iface->super, self->qp); + if (status != UCS_OK) { + goto err_qp_cleanup; + } + + uct_rc_iface_add_qp(&iface->super, &self->super, self->qp->qp_num); uct_rc_txqp_available_set(&self->super.txqp, iface->config.tx_max_wr); uct_rc_verbs_txcnt_init(&self->txcnt); + uct_ib_fence_info_init(&self->fi); return UCS_OK; + +err_qp_cleanup: + uct_ib_destroy_qp(self->qp); +err: + return status; } static UCS_CLASS_CLEANUP_FUNC(uct_rc_verbs_ep_t) @@ -600,6 +622,8 @@ static UCS_CLASS_CLEANUP_FUNC(uct_rc_verbs_ep_t) ucs_assert(self->txcnt.pi >= self->txcnt.ci); iface->super.tx.cq_available += self->txcnt.pi - self->txcnt.ci; ucs_assert(iface->super.tx.cq_available < 
iface->super.config.tx_ops_count); + uct_rc_iface_remove_qp(&iface->super, self->qp->qp_num); + uct_ib_destroy_qp(self->qp); } UCS_CLASS_DEFINE(uct_rc_verbs_ep_t, uct_rc_ep_t); diff --git a/src/uct/ib/rc/verbs/rc_verbs_iface.c b/src/uct/ib/rc/verbs/rc_verbs_iface.c index 186942f08fd..a3f247db9fc 100644 --- a/src/uct/ib/rc/verbs/rc_verbs_iface.c +++ b/src/uct/ib/rc/verbs/rc_verbs_iface.c @@ -4,10 +4,15 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rc_verbs.h" #include "rc_verbs_impl.h" #include +#include #include #include #include @@ -34,10 +39,6 @@ static ucs_config_field_t uct_rc_verbs_iface_config_table[] = { "a minimum between this value and the TX queue length. -1 means no limit.", ucs_offsetof(uct_rc_verbs_iface_config_t, tx_max_wr), UCS_CONFIG_TYPE_UINT}, - {"", "", NULL, - ucs_offsetof(uct_rc_verbs_iface_config_t, fc), - UCS_CONFIG_TYPE_TABLE(uct_rc_fc_config_table)}, - {NULL} }; @@ -55,18 +56,13 @@ static void uct_rc_verbs_handle_failure(uct_ib_iface_t *ib_iface, void *arg, return; } - iface->tx.cq_available += ep->txcnt.pi - ep->txcnt.ci; - /* Reset CI to prevent cq_available overrun on ep_destoroy */ - ep->txcnt.ci = ep->txcnt.pi; - uct_rc_txqp_purge_outstanding(&ep->super.txqp, status, 0); - - if (ib_iface->ops->set_ep_failed(ib_iface, &ep->super.super.super, - status) == UCS_OK) { + if (uct_rc_verbs_ep_handle_failure(ep, status) == UCS_OK) { log_lvl = iface->super.super.config.failure_level; } - ucs_log(log_lvl, "send completion with error: %s", - ibv_wc_status_str(wc->status)); + ucs_log(log_lvl, + "send completion with error: %s qpn 0x%x wrid 0x%lx vendor_err 0x%x", + ibv_wc_status_str(wc->status), wc->qp_num, wc->wr_id, wc->vendor_err); } static ucs_status_t uct_rc_verbs_ep_set_failed(uct_ib_iface_t *iface, @@ -111,15 +107,21 @@ uct_rc_verbs_iface_poll_tx(uct_rc_verbs_iface_t *iface) } count = uct_rc_verbs_txcq_get_comp_count(&wc[i], &ep->super.txqp); - ucs_trace_poll("rc_verbs iface %p tx_wc: ep 
%p qpn 0x%x count %d", - iface, ep, wc[i].qp_num, count); + ucs_trace_poll("rc_verbs iface %p tx_wc wrid 0x%lx ep %p qpn 0x%x count %d", + iface, wc[i].wr_id, ep, wc[i].qp_num, count); uct_rc_verbs_txqp_completed(&ep->super.txqp, &ep->txcnt, count); iface->super.tx.cq_available += count; + /* process pending elements prior to CQ entries to avoid out-of-order + * transmission in completion callbacks */ + ucs_arbiter_group_schedule(&iface->super.tx.arbiter, + &ep->super.arb_group); + ucs_arbiter_dispatch(&iface->super.tx.arbiter, 1, + uct_rc_ep_process_pending, NULL); + uct_rc_txqp_completion_desc(&ep->super.txqp, ep->txcnt.ci); - ucs_arbiter_group_schedule(&iface->super.tx.arbiter, &ep->super.arb_group); } - ucs_arbiter_dispatch(&iface->super.tx.arbiter, 1, uct_rc_ep_process_pending, NULL); + return num_wcs; } @@ -128,7 +130,7 @@ static unsigned uct_rc_verbs_iface_progress(void *arg) uct_rc_verbs_iface_t *iface = arg; unsigned count; - count = uct_rc_verbs_iface_poll_rx_common(&iface->super); + count = uct_rc_verbs_iface_poll_rx_common(iface); if (count > 0) { return count; } @@ -154,53 +156,131 @@ static void uct_rc_verbs_iface_init_inl_wrs(uct_rc_verbs_iface_t *iface) static ucs_status_t uct_rc_verbs_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_iface, uct_rc_verbs_iface_t); + uct_ib_md_t *md = uct_ib_iface_md(ucs_derived_of(iface, uct_ib_iface_t)); + uint8_t mr_id; ucs_status_t status; status = uct_rc_iface_query(&iface->super, iface_attr, iface->config.max_inline, iface->config.max_inline, iface->config.short_desc_size, - uct_ib_iface_get_max_iov(&iface->super.super) - 1, - uct_ib_iface_get_max_iov(&iface->super.super) - 1, - sizeof(uct_rc_hdr_t)); + iface->config.max_send_sge - 1, + sizeof(uct_rc_hdr_t), + iface->config.max_send_sge); if (status != UCS_OK) { return status; } - iface_attr->latency.growth += 1e-9; /* 1 ns per each extra QP */ - iface_attr->iface_addr_len = sizeof(uint8_t); /* 
overwrite */ - iface_attr->overhead = 75e-9; /* Software overhead */ + iface_attr->latency.m += 1e-9; /* 1 ns per each extra QP */ + iface_attr->overhead = 75e-9; /* Software overhead */ + + iface_attr->ep_addr_len = sizeof(uct_rc_verbs_ep_address_t); + if (md->ops->get_atomic_mr_id(md, &mr_id) == UCS_OK) { + iface_attr->ep_addr_len += sizeof(mr_id); + } + + return UCS_OK; +} + +ucs_status_t uct_rc_verbs_iface_flush_mem_create(uct_rc_verbs_iface_t *iface) +{ + uct_ib_md_t *md = uct_ib_iface_md(&iface->super.super); + ucs_status_t status; + struct ibv_mr *mr; + void *mem; + + if (iface->flush_mr != NULL) { + ucs_assert(iface->flush_mem != NULL); + return UCS_OK; + } + + /* + * Map a whole page for the remote side to issue a dummy RDMA_WRITE on it, + * to flush its outstanding operations. A whole page is used to prevent any + * other allocations from using same page, so it would be fork-safe. + */ + mem = ucs_mmap(NULL, ucs_get_page_size(), PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0, "flush_mem"); + if (mem == MAP_FAILED) { + ucs_error("failed to allocate page for remote flush: %m"); + status = UCS_ERR_NO_MEMORY; + goto err; + } + + status = uct_ib_reg_mr(md->pd, mem, ucs_get_page_size(), + UCT_IB_MEM_ACCESS_FLAGS, &mr); + if (status != UCS_OK) { + goto err_munmap; + } + iface->flush_mem = mem; + iface->flush_mr = mr; return UCS_OK; + +err_munmap: + ucs_munmap(mem, ucs_get_page_size()); +err: + return status; +} + +static ucs_status_t +uct_rc_iface_verbs_init_rx(uct_rc_iface_t *rc_iface, + const uct_rc_iface_common_config_t *config) +{ + uct_rc_verbs_iface_t *iface = ucs_derived_of(rc_iface, uct_rc_verbs_iface_t); + + return uct_rc_iface_init_rx(rc_iface, config, &iface->srq); +} + +void uct_rc_iface_verbs_cleanup_rx(uct_rc_iface_t *rc_iface) +{ + uct_rc_verbs_iface_t *iface = ucs_derived_of(rc_iface, uct_rc_verbs_iface_t); + + /* TODO flush RX buffers */ + uct_ib_destroy_srq(iface->srq); } -static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_md_h md, 
uct_worker_h worker, - const uct_iface_params_t *params, +static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_md_h tl_md, + uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *tl_config) { uct_rc_verbs_iface_config_t *config = ucs_derived_of(tl_config, uct_rc_verbs_iface_config_t); ucs_status_t status; uct_ib_iface_init_attr_t init_attr = {}; - struct ibv_qp_cap cap; + uct_ib_qp_attr_t attr = {}; struct ibv_qp *qp; uct_rc_hdr_t *hdr; - init_attr.fc_req_size = sizeof(uct_rc_fc_request_t); - init_attr.rx_hdr_len = sizeof(uct_rc_hdr_t); - init_attr.qp_type = IBV_QPT_RC; - init_attr.rx_cq_len = config->super.super.rx.queue_len; - init_attr.seg_size = config->super.super.super.max_bcopy; + init_attr.fc_req_size = sizeof(uct_rc_fc_request_t); + init_attr.rx_hdr_len = sizeof(uct_rc_hdr_t); + init_attr.qp_type = IBV_QPT_RC; + init_attr.cq_len[UCT_IB_DIR_RX] = config->super.super.super.rx.queue_len; + init_attr.cq_len[UCT_IB_DIR_TX] = config->super.tx_cq_len; + init_attr.seg_size = config->super.super.super.seg_size; - UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, &uct_rc_verbs_iface_ops, md, - worker, params, &config->super, &init_attr); + UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, &uct_rc_verbs_iface_ops, tl_md, + worker, params, &config->super.super, &init_attr); self->config.tx_max_wr = ucs_min(config->tx_max_wr, self->super.config.tx_qp_len); - self->super.config.tx_moderation = ucs_min(self->super.config.tx_moderation, + self->super.config.tx_moderation = ucs_min(config->super.tx_cq_moderation, self->config.tx_max_wr / 4); - - self->super.progress = uct_rc_verbs_iface_progress; + self->super.config.fence_mode = (uct_rc_fence_mode_t)config->super.super.fence_mode; + self->super.progress = uct_rc_verbs_iface_progress; + self->flush_mem = NULL; + self->flush_mr = NULL; + + if ((config->super.super.fence_mode == UCT_RC_FENCE_MODE_WEAK) || + (config->super.super.fence_mode == UCT_RC_FENCE_MODE_AUTO)) { + self->super.config.fence_mode = 
UCT_RC_FENCE_MODE_WEAK; + } else if (config->super.super.fence_mode == UCT_RC_FENCE_MODE_NONE) { + self->super.config.fence_mode = UCT_RC_FENCE_MODE_NONE; + } else { + ucs_error("incorrect fence value: %d", self->super.config.fence_mode); + status = UCS_ERR_INVALID_PARAM; + goto err; + } memset(self->inl_sge, 0, sizeof(self->inl_sge)); uct_rc_am_hdr_fill(&self->am_inl_hdr.rc_hdr, 0); @@ -218,7 +298,7 @@ static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_md_h md, uct_worker_h worke self->config.short_desc_size, sizeof(uct_rc_iface_send_desc_t), UCS_SYS_CACHE_LINE_SIZE, - &config->super.super.tx.mp, + &config->super.super.super.tx.mp, self->super.config.tx_qp_len, uct_rc_iface_send_desc_init, "rc_verbs_short_desc"); @@ -229,21 +309,25 @@ static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_md_h md, uct_worker_h worke uct_rc_verbs_iface_init_inl_wrs(self); /* Check FC parameters correctness */ - status = uct_rc_init_fc_thresh(&config->fc, &config->super, &self->super); + status = uct_rc_init_fc_thresh(&config->super, &self->super); if (status != UCS_OK) { goto err_common_cleanup; } /* Create a dummy QP in order to find out max_inline */ - status = uct_rc_iface_qp_create(&self->super, &qp, &cap, - self->super.config.tx_qp_len); + uct_ib_exp_qp_fill_attr(&self->super.super, &attr); + status = uct_rc_iface_qp_create(&self->super, &qp, &attr, + self->super.config.tx_qp_len, + self->srq); if (status != UCS_OK) { goto err_common_cleanup; } - ibv_destroy_qp(qp); + uct_ib_destroy_qp(qp); - self->config.max_inline = cap.max_inline_data; - uct_ib_iface_set_max_iov(&self->super.super, cap.max_send_sge); + self->config.max_inline = attr.cap.max_inline_data; + self->config.max_send_sge = ucs_min(UCT_IB_MAX_IOV, attr.cap.max_send_sge); + ucs_assertv_always(self->config.max_send_sge > 1, /* need 1 iov for am header*/ + "max_send_sge %zu", self->config.max_send_sge); if (self->config.max_inline < sizeof(*hdr)) { self->fc_desc = ucs_mpool_get(&self->short_desc_mp); @@ -262,15 
+346,15 @@ static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_md_h md, uct_worker_h worke return status; } -ucs_status_t uct_rc_verbs_iface_common_prepost_recvs(uct_rc_iface_t *iface, +ucs_status_t uct_rc_verbs_iface_common_prepost_recvs(uct_rc_verbs_iface_t *iface, unsigned max) { unsigned count; - count = ucs_min(max, iface->rx.srq.quota); - iface->rx.srq.available += count; - iface->rx.srq.quota -= count; - while (iface->rx.srq.available > 0) { + count = ucs_min(max, iface->super.rx.srq.quota); + iface->super.rx.srq.available += count; + iface->super.rx.srq.quota -= count; + while (iface->super.rx.srq.available > 0) { if (uct_rc_verbs_iface_post_recv_common(iface, 1) == 0) { ucs_error("failed to post receives"); return UCS_ERR_NO_MEMORY; @@ -281,7 +365,7 @@ ucs_status_t uct_rc_verbs_iface_common_prepost_recvs(uct_rc_iface_t *iface, void uct_rc_verbs_iface_common_progress_enable(uct_iface_h tl_iface, unsigned flags) { - uct_rc_iface_t *iface = ucs_derived_of(tl_iface, uct_rc_iface_t); + uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_iface, uct_rc_verbs_iface_t); if (flags & UCT_PROGRESS_RECV) { /* ignore return value from prepost_recv, since it's not really possible @@ -291,11 +375,12 @@ void uct_rc_verbs_iface_common_progress_enable(uct_iface_h tl_iface, unsigned fl uct_rc_verbs_iface_common_prepost_recvs(iface, UINT_MAX); } - uct_base_iface_progress_enable_cb(&iface->super.super, iface->progress, + uct_base_iface_progress_enable_cb(&iface->super.super.super, + iface->super.progress, flags); } -unsigned uct_rc_verbs_iface_post_recv_always(uct_rc_iface_t *iface, unsigned max) +unsigned uct_rc_verbs_iface_post_recv_always(uct_rc_verbs_iface_t *iface, unsigned max) { struct ibv_recv_wr *bad_wr; uct_ib_recv_wr_t *wrs; @@ -304,17 +389,17 @@ unsigned uct_rc_verbs_iface_post_recv_always(uct_rc_iface_t *iface, unsigned max wrs = ucs_alloca(sizeof *wrs * max); - count = uct_ib_iface_prepare_rx_wrs(&iface->super, &iface->rx.mp, + count = 
uct_ib_iface_prepare_rx_wrs(&iface->super.super, &iface->super.rx.mp, wrs, max); if (ucs_unlikely(count == 0)) { return 0; } - ret = ibv_post_srq_recv(iface->rx.srq.srq, &wrs[0].ibwr, &bad_wr); + ret = ibv_post_srq_recv(iface->srq, &wrs[0].ibwr, &bad_wr); if (ret != 0) { ucs_fatal("ibv_post_srq_recv() returned %d: %m", ret); } - iface->rx.srq.available -= count; + iface->super.rx.srq.available -= count; return count; } @@ -323,6 +408,12 @@ static UCS_CLASS_CLEANUP_FUNC(uct_rc_verbs_iface_t) { uct_base_iface_progress_disable(&self->super.super.super.super, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); + + if (self->flush_mr != NULL) { + uct_ib_dereg_mr(self->flush_mr); + ucs_assert(self->flush_mem != NULL); + ucs_munmap(self->flush_mem, ucs_get_page_size()); + } if (self->fc_desc != NULL) { ucs_mpool_put(self->fc_desc); } @@ -349,19 +440,19 @@ static uct_rc_iface_ops_t uct_rc_verbs_iface_ops = { .ep_atomic_cswap64 = uct_rc_verbs_ep_atomic_cswap64, .ep_atomic64_post = uct_rc_verbs_ep_atomic64_post, .ep_atomic64_fetch = uct_rc_verbs_ep_atomic64_fetch, - .ep_atomic_cswap32 = uct_rc_verbs_ep_atomic_cswap32, - .ep_atomic32_post = uct_rc_verbs_ep_atomic32_post, - .ep_atomic32_fetch = uct_rc_verbs_ep_atomic32_fetch, + .ep_atomic_cswap32 = (uct_ep_atomic_cswap32_func_t)ucs_empty_function_return_unsupported, + .ep_atomic32_post = (uct_ep_atomic32_post_func_t)ucs_empty_function_return_unsupported, + .ep_atomic32_fetch = (uct_ep_atomic32_fetch_func_t)ucs_empty_function_return_unsupported, .ep_pending_add = uct_rc_ep_pending_add, .ep_pending_purge = uct_rc_ep_pending_purge, .ep_flush = uct_rc_verbs_ep_flush, - .ep_fence = uct_base_ep_fence, + .ep_fence = uct_rc_verbs_ep_fence, .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_rc_verbs_ep_t), .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_rc_verbs_ep_t), - .ep_get_address = uct_rc_ep_get_address, - .ep_connect_to_ep = uct_rc_ep_connect_to_ep, + .ep_get_address = uct_rc_verbs_ep_get_address, + .ep_connect_to_ep = uct_rc_verbs_ep_connect_to_ep, 
.iface_flush = uct_rc_iface_flush, - .iface_fence = uct_base_iface_fence, + .iface_fence = uct_rc_iface_fence, .iface_progress_enable = uct_rc_verbs_iface_common_progress_enable, .iface_progress_disable = uct_base_iface_progress_disable, .iface_progress = uct_rc_iface_do_progress, @@ -369,40 +460,35 @@ static uct_rc_iface_ops_t uct_rc_verbs_iface_ops = { .iface_event_arm = uct_rc_iface_event_arm, .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_rc_verbs_iface_t), .iface_query = uct_rc_verbs_iface_query, - .iface_get_address = uct_rc_iface_get_address, + .iface_get_address = ucs_empty_function_return_success, .iface_get_device_address = uct_ib_iface_get_device_address, - .iface_is_reachable = uct_rc_iface_is_reachable, + .iface_is_reachable = uct_ib_iface_is_reachable, }, .create_cq = uct_ib_verbs_create_cq, .arm_cq = uct_ib_iface_arm_cq, - .event_cq = (void*)ucs_empty_function, + .event_cq = (uct_ib_iface_event_cq_func_t)ucs_empty_function, .handle_failure = uct_rc_verbs_handle_failure, .set_ep_failed = uct_rc_verbs_ep_set_failed, - .create_qp = uct_ib_iface_create_qp, - .init_res_domain = (void*)ucs_empty_function_return_success, - .cleanup_res_domain = (void*)ucs_empty_function, }, - .init_rx = uct_rc_iface_init_rx, + .init_rx = uct_rc_iface_verbs_init_rx, + .cleanup_rx = uct_rc_iface_verbs_cleanup_rx, .fc_ctrl = uct_rc_verbs_ep_fc_ctrl, .fc_handler = uct_rc_iface_fc_handler }; -static ucs_status_t uct_rc_verbs_query_resources(uct_md_h md, - uct_tl_resource_desc_t **resources_p, - unsigned *num_resources_p) +static ucs_status_t +uct_rc_verbs_query_tl_devices(uct_md_h md, + uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) { uct_ib_md_t *ib_md = ucs_derived_of(md, uct_ib_md_t); + int flags; - return uct_ib_device_query_tl_resources(&ib_md->dev, "rc", - (ib_md->config.eth_pause ? 0 : UCT_IB_DEVICE_FLAG_LINK_IB), - resources_p, num_resources_p); + flags = ib_md->config.eth_pause ? 
0 : UCT_IB_DEVICE_FLAG_LINK_IB; + return uct_ib_device_query_ports(&ib_md->dev, flags, tl_devices_p, + num_tl_devices_p); } -UCT_TL_COMPONENT_DEFINE(uct_rc_verbs_tl, - uct_rc_verbs_query_resources, - uct_rc_verbs_iface_t, - "rc", - "RC_VERBS_", - uct_rc_verbs_iface_config_table, - uct_rc_verbs_iface_config_t); -UCT_MD_REGISTER_TL(&uct_ib_mdc, &uct_rc_verbs_tl); +UCT_TL_DEFINE(&uct_ib_component, rc_verbs, uct_rc_verbs_query_tl_devices, + uct_rc_verbs_iface_t, "RC_VERBS_", uct_rc_verbs_iface_config_table, + uct_rc_verbs_iface_config_t); diff --git a/src/uct/ib/rc/verbs/rc_verbs_impl.h b/src/uct/ib/rc/verbs/rc_verbs_impl.h index 195e196a871..70902d57120 100644 --- a/src/uct/ib/rc/verbs/rc_verbs_impl.h +++ b/src/uct/ib/rc/verbs/rc_verbs_impl.h @@ -29,24 +29,24 @@ uct_rc_verbs_txqp_completed(uct_rc_txqp_t *txqp, uct_rc_verbs_txcnt_t *txcnt, ui uct_rc_txqp_available_add(txqp, count); } -ucs_status_t uct_rc_verbs_iface_common_prepost_recvs(uct_rc_iface_t *iface, +ucs_status_t uct_rc_verbs_iface_common_prepost_recvs(uct_rc_verbs_iface_t *iface, unsigned max); void uct_rc_verbs_iface_common_progress_enable(uct_iface_h tl_iface, unsigned flags); -unsigned uct_rc_verbs_iface_post_recv_always(uct_rc_iface_t *iface, unsigned max); +unsigned uct_rc_verbs_iface_post_recv_always(uct_rc_verbs_iface_t *iface, unsigned max); -static inline unsigned uct_rc_verbs_iface_post_recv_common(uct_rc_iface_t *iface, +static inline unsigned uct_rc_verbs_iface_post_recv_common(uct_rc_verbs_iface_t *iface, int fill) { - unsigned batch = iface->super.config.rx_max_batch; + unsigned batch = iface->super.super.config.rx_max_batch; unsigned count; - if (iface->rx.srq.available < batch) { + if (iface->super.rx.srq.available < batch) { if (ucs_likely(fill == 0)) { return 0; } else { - count = iface->rx.srq.available; + count = iface->super.rx.srq.available; } } else { count = batch; @@ -105,28 +105,28 @@ uct_rc_verbs_iface_handle_am(uct_rc_iface_t *iface, uct_rc_hdr_t *hdr, } static UCS_F_ALWAYS_INLINE 
unsigned -uct_rc_verbs_iface_poll_rx_common(uct_rc_iface_t *iface) +uct_rc_verbs_iface_poll_rx_common(uct_rc_verbs_iface_t *iface) { uct_rc_hdr_t *hdr; unsigned i; ucs_status_t status; - unsigned num_wcs = iface->super.config.rx_max_poll; + unsigned num_wcs = iface->super.super.config.rx_max_poll; struct ibv_wc wc[num_wcs]; - status = uct_ib_poll_cq(iface->super.cq[UCT_IB_DIR_RX], &num_wcs, wc); + status = uct_ib_poll_cq(iface->super.super.cq[UCT_IB_DIR_RX], &num_wcs, wc); if (status != UCS_OK) { num_wcs = 0; goto out; } - UCT_IB_IFACE_VERBS_FOREACH_RXWQE(&iface->super, i, hdr, wc, num_wcs) { - uct_ib_log_recv_completion(&iface->super, &wc[i], hdr, wc[i].byte_len, + UCT_IB_IFACE_VERBS_FOREACH_RXWQE(&iface->super.super, i, hdr, wc, num_wcs) { + uct_ib_log_recv_completion(&iface->super.super, &wc[i], hdr, wc[i].byte_len, uct_rc_ep_packet_dump); - uct_rc_verbs_iface_handle_am(iface, hdr, wc[i].wr_id, wc[i].qp_num, + uct_rc_verbs_iface_handle_am(&iface->super, hdr, wc[i].wr_id, wc[i].qp_num, wc[i].byte_len, wc[i].imm_data, wc[i].slid); } - iface->rx.srq.available += num_wcs; - UCS_STATS_UPDATE_COUNTER(iface->stats, UCT_RC_IFACE_STAT_RX_COMPLETION, num_wcs); + iface->super.rx.srq.available += num_wcs; + UCS_STATS_UPDATE_COUNTER(iface->super.stats, UCT_RC_IFACE_STAT_RX_COMPLETION, num_wcs); out: uct_rc_verbs_iface_post_recv_common(iface, 0); @@ -178,13 +178,13 @@ uct_rc_verbs_iface_fill_inl_am_sge(uct_rc_verbs_iface_t *iface, _sge, _length, _raddr, _rkey) \ UCT_RC_VERBS_FILL_SGE(_wr, _sge, _length) \ _wr.wr.rdma.remote_addr = _raddr; \ - _wr.wr.rdma.rkey = uct_ib_md_direct_rkey(_rkey); \ + _wr.wr.rdma.rkey = _rkey; \ _wr_opcode = _opcode; \ #define UCT_RC_VERBS_FILL_RDMA_WR_IOV(_wr, _wr_opcode, _opcode, _sge, _sgelen, \ _raddr, _rkey) \ _wr.wr.rdma.remote_addr = _raddr; \ - _wr.wr.rdma.rkey = uct_ib_md_direct_rkey(_rkey); \ + _wr.wr.rdma.rkey = _rkey; \ _wr.sg_list = _sge; \ _wr.num_sge = _sgelen; \ _wr_opcode = _opcode; @@ -208,59 +208,4 @@ 
uct_rc_verbs_iface_fill_inl_am_sge(uct_rc_verbs_iface_t *iface, _wr.wr.atomic.rkey = _rkey; \ -#if HAVE_IB_EXT_ATOMICS -static inline void -uct_rc_verbs_fill_ext_atomic_wr(struct ibv_exp_send_wr *wr, struct ibv_sge *sge, - int opcode, uint32_t length, uint32_t compare_mask, - uint64_t compare_add, uint64_t swap, uint64_t remote_addr, - uct_rkey_t rkey, size_t atomic_mr_offset) -{ - sge->length = length; - wr->sg_list = sge; - wr->num_sge = 1; - wr->exp_opcode = (enum ibv_exp_wr_opcode)opcode; - wr->comp_mask = 0; - - wr->ext_op.masked_atomics.log_arg_sz = ucs_ilog2(length); - wr->ext_op.masked_atomics.rkey = uct_ib_resolve_atomic_rkey(rkey, - atomic_mr_offset, - &remote_addr); - wr->ext_op.masked_atomics.remote_addr = remote_addr; - - switch (opcode) { - case IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP: - wr->ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap.compare_mask = compare_mask; - wr->ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap.compare_val = compare_add; - wr->ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap.swap_mask = (uint64_t)(-1); - wr->ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap.swap_val = swap; - break; - case IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD: - wr->ext_op.masked_atomics.wr_data.inline_data.op.fetch_add.add_val = compare_add; - wr->ext_op.masked_atomics.wr_data.inline_data.op.fetch_add.field_boundary = 0; - break; - } -} - -static UCS_F_ALWAYS_INLINE -ucs_status_t uct_rc_verbs_ep_atomic32_data(uct_atomic_op_t opcode, uint32_t value, - int *op, uint32_t *add, uint32_t *swap) -{ - switch (opcode) { - case UCT_ATOMIC_OP_ADD: - *op = IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD; - *add = value; - *swap = 0; - return UCS_OK; - case UCT_ATOMIC_OP_SWAP: - *op = IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP; - *add = 0; - *swap = value; - return UCS_OK; - default: - return UCS_ERR_UNSUPPORTED; - } -} -#endif - - #endif diff --git a/src/uct/ib/rdmacm/Makefile.am b/src/uct/ib/rdmacm/Makefile.am index 66bae29f712..0e4ad2403d8 100644 
--- a/src/uct/ib/rdmacm/Makefile.am +++ b/src/uct/ib/rdmacm/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. +# Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. # See file LICENSE for terms. # @@ -11,7 +11,8 @@ module_LTLIBRARIES = libuct_rdmacm.la libuct_rdmacm_la_CPPFLAGS = $(BASE_CPPFLAGS) $(IBVERBS_CPPFLAGS) $(RDMACM_CPPFLAGS) libuct_rdmacm_la_CFLAGS = $(BASE_CFLAGS) libuct_rdmacm_la_LIBADD = $(RDMACM_LIBS) $(top_builddir)/src/ucs/libucs.la \ - $(top_builddir)/src/uct/libuct.la + $(top_builddir)/src/uct/libuct.la \ + $(top_builddir)/src/uct/ib/libuct_ib.la libuct_rdmacm_la_LDFLAGS = $(IBVERBS_LDFLAGS) $(RDMACM_LDFLAGS) -version-info $(SOVERSION) noinst_HEADERS = \ @@ -25,6 +26,18 @@ libuct_rdmacm_la_SOURCES = \ rdmacm_iface.c \ rdmacm_ep.c +if HAVE_RDMACM_QP_LESS +noinst_HEADERS += \ + rdmacm_cm.h \ + rdmacm_listener.h \ + rdmacm_cm_ep.h + +libuct_rdmacm_la_SOURCES += \ + rdmacm_cm.c \ + rdmacm_listener.c \ + rdmacm_cm_ep.c +endif # HAVE_RDMACM_QP_LESS + include $(top_srcdir)/config/module.am -endif +endif # HAVE_RDMACM diff --git a/src/uct/ib/rdmacm/configure.m4 b/src/uct/ib/rdmacm/configure.m4 index a73ec275703..35f078f06e7 100644 --- a/src/uct/ib/rdmacm/configure.m4 +++ b/src/uct/ib/rdmacm/configure.m4 @@ -8,6 +8,7 @@ # Check for RDMACM support # rdmacm_happy="no" +rdmacm_qp_less_happy="no" AC_ARG_WITH([rdmacm], [AS_HELP_STRING([--with-rdmacm=(DIR)], [Enable the use of RDMACM (default is guess).])], [], [with_rdmacm=guess]) @@ -29,14 +30,19 @@ AS_IF([test "x$with_rdmacm" != xno], AC_CHECK_HEADER([$ucx_check_rdmacm_dir/include/rdma/rdma_cma.h], [ AC_CHECK_LIB([rdmacm], [rdma_create_id], - [uct_modules+=":rdmacm" + [uct_modules="${uct_modules}:rdmacm" rdmacm_happy="yes" AS_IF([test "$ucx_check_rdmacm_dir" != /usr], [ AC_SUBST(RDMACM_CPPFLAGS, ["-I$ucx_check_rdmacm_dir/include"]) AC_SUBST(RDMACM_LDFLAGS, ["-L$ucx_check_rdmacm_dir/lib$libsuff"])]) AC_SUBST(RDMACM_LIBS, [-lrdmacm]) - 
], + # QP less support + AC_CHECK_DECLS([rdma_establish, rdma_init_qp_attr], + [rdmacm_qp_less_happy="yes" + AC_DEFINE([HAVE_RDMACM_QP_LESS], 1, [RDMACM QP less support])], + [], [#include <$ucx_check_rdmacm_dir/include/rdma/rdma_cma.h>]) + ], [AC_MSG_WARN([RDMACM requested but librdmacm is not found]) AC_MSG_ERROR([Please install librdmacm and librdmacm-devel or disable rdmacm support]) ]) @@ -53,4 +59,5 @@ AS_IF([test "x$with_rdmacm" != xno], ) AM_CONDITIONAL([HAVE_RDMACM], [test "x$rdmacm_happy" != xno]) +AM_CONDITIONAL([HAVE_RDMACM_QP_LESS], [test "x$rdmacm_qp_less_happy" != xno]) AC_CONFIG_FILES([src/uct/ib/rdmacm/Makefile]) diff --git a/src/uct/ib/rdmacm/rdmacm_cm.c b/src/uct/ib/rdmacm/rdmacm_cm.c new file mode 100644 index 00000000000..cc1b1bf90d4 --- /dev/null +++ b/src/uct/ib/rdmacm/rdmacm_cm.c @@ -0,0 +1,597 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" /* Defines HAVE_RDMACM_QP_LESS */ +#endif + +#include "rdmacm_cm_ep.h" +#include +#include + +#include +#include + + +ucs_status_t uct_rdmacm_cm_destroy_id(struct rdma_cm_id *id) +{ + ucs_trace("destroying cm_id %p", id); + + if (rdma_destroy_id(id)) { + ucs_warn("rdma_destroy_id() failed: %m"); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +ucs_status_t uct_rdmacm_cm_ack_event(struct rdma_cm_event *event) +{ + ucs_trace("ack event %p, cm_id %p", event, event->id); + + if (rdma_ack_cm_event(event)) { + ucs_warn("rdma_ack_cm_event failed on event %s: %m", + rdma_event_str(event->event)); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +ucs_status_t uct_rdmacm_cm_reject(struct rdma_cm_id *id) +{ + ucs_trace("reject on cm_id %p", id); + + if (rdma_reject(id, NULL, 0)) { + ucs_error("rdma_reject (id=%p) failed with error: %m", id); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +size_t uct_rdmacm_cm_get_max_conn_priv() +{ + return UCT_RDMACM_TCP_PRIV_DATA_LEN - 
sizeof(uct_rdmacm_priv_data_hdr_t); +} + +static ucs_status_t uct_rdmacm_cm_query(uct_cm_h cm, uct_cm_attr_t *cm_attr) +{ + if (cm_attr->field_mask & UCT_CM_ATTR_FIELD_MAX_CONN_PRIV) { + cm_attr->max_conn_priv = uct_rdmacm_cm_get_max_conn_priv(); + } + return UCS_OK; +} + +static void uct_rdmacm_cm_handle_event_addr_resolved(struct rdma_cm_event *event) +{ + struct sockaddr *remote_addr = rdma_get_peer_addr(event->id); + uct_rdmacm_cm_ep_t *cep = (uct_rdmacm_cm_ep_t *)event->id->context; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + char ep_str[UCT_RDMACM_EP_STRING_LEN]; + uct_cm_remote_data_t remote_data; + + ucs_assert(event->id == cep->id); + + ucs_trace("%s: rdma_resolve_route on cm_id %p", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + event->id); + + if (rdma_resolve_route(event->id, 1000 /* TODO */)) { + ucs_error("%s: rdma_resolve_route(to addr=%s) failed: %m", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + ucs_sockaddr_str(remote_addr, ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + remote_data.field_mask = 0; + uct_rdmacm_cm_ep_set_failed(cep, &remote_data, UCS_ERR_IO_ERROR); + } +} + +static void uct_rdmacm_cm_handle_event_route_resolved(struct rdma_cm_event *event) +{ + struct sockaddr *remote_addr = rdma_get_peer_addr(event->id); + uct_rdmacm_cm_ep_t *cep = (uct_rdmacm_cm_ep_t *)event->id->context; + uct_cm_remote_data_t remote_data; + ucs_status_t status; + struct rdma_conn_param conn_param; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + char ep_str[UCT_RDMACM_EP_STRING_LEN]; + + ucs_assert(event->id == cep->id); + + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.private_data = ucs_alloca(uct_rdmacm_cm_get_max_conn_priv() + + sizeof(uct_rdmacm_priv_data_hdr_t)); + + status = uct_rdmacm_cm_ep_conn_param_init(cep, &conn_param); + if (status != UCS_OK) { + remote_data.field_mask = 0; + uct_rdmacm_cm_ep_set_failed(cep, &remote_data, status); + return; + } + + ucs_trace("%s: rdma_connect, cm_id %p", + 
uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), cep->id); + + if (rdma_connect(cep->id, &conn_param)) { + ucs_error("%s: rdma_connect(to addr=%s) failed: %m", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + ucs_sockaddr_str(remote_addr, ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + remote_data.field_mask = 0; + uct_rdmacm_cm_ep_set_failed(cep, &remote_data, UCS_ERR_IO_ERROR); + } +} + +static ucs_status_t uct_rdmacm_cm_id_to_dev_addr(struct rdma_cm_id *cm_id, + uct_device_addr_t **dev_addr_p, + size_t *dev_addr_len_p) +{ + uct_ib_address_pack_params_t params; + struct ibv_port_attr port_attr; + uct_ib_address_t *dev_addr; + struct ibv_qp_attr qp_attr; + size_t addr_length; + int qp_attr_mask; + char dev_name[UCT_DEVICE_NAME_MAX]; + char ah_attr_str[128]; + uct_ib_roce_version_info_t roce_info; + + params.flags = 0; + + /* get the qp attributes in order to modify the qp state. + * the ah_attr fields from them are required to extract the device address + * of the remote peer. 
+ */ + qp_attr.qp_state = IBV_QPS_RTR; + if (rdma_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask)) { + ucs_error("rdma_init_qp_attr (id=%p, qp_state=%d) failed: %m", + cm_id, qp_attr.qp_state); + return UCS_ERR_IO_ERROR; + } + + if (ibv_query_port(cm_id->verbs, cm_id->port_num, &port_attr)) { + uct_rdmacm_cm_id_to_dev_name(cm_id, dev_name); + ucs_error("ibv_query_port (%s) failed: %m", dev_name); + return UCS_ERR_IO_ERROR; + } + + if (qp_attr.ah_attr.is_global) { + ucs_assert(!memcmp(&cm_id->route.addr.addr.ibaddr.dgid, + &qp_attr.ah_attr.grh.dgid, + sizeof(qp_attr.ah_attr.grh.dgid))); + params.flags |= UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX; + params.gid_index = qp_attr.ah_attr.grh.sgid_index; + } + + ucs_debug("cm_id %p: ah_attr %s", cm_id, + uct_ib_ah_attr_str(ah_attr_str, sizeof(ah_attr_str), + &qp_attr.ah_attr)); + ucs_assert_always(qp_attr.path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU); + params.flags |= UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU; + params.path_mtu = qp_attr.path_mtu; + + if (IBV_PORT_IS_LINK_LAYER_ETHERNET(&port_attr)) { + /* Ethernet address */ + ucs_assert(qp_attr.ah_attr.is_global); + + /* pack the remote RoCE version as ANY assuming that rdmacm guarantees + * that the remote peer is reachable to the local one */ + roce_info.ver = UCT_IB_DEVICE_ROCE_ANY; + roce_info.addr_family = 0; + params.roce_info = roce_info; + params.flags |= UCT_IB_ADDRESS_PACK_FLAG_ETH; + } else { + params.flags |= UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID | + UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX; + } + + params.gid = cm_id->route.addr.addr.ibaddr.dgid; + params.lid = qp_attr.ah_attr.dlid; + addr_length = uct_ib_address_size(¶ms); + dev_addr = ucs_malloc(addr_length, "IB device address"); + if (dev_addr == NULL) { + ucs_error("failed to allocate IB device address"); + return UCS_ERR_NO_MEMORY; + } + + uct_ib_address_pack(¶ms, dev_addr); + + *dev_addr_p = (uct_device_addr_t *)dev_addr; + *dev_addr_len_p = addr_length; + return UCS_OK; +} + +static void 
uct_rdmacm_cm_handle_event_connect_request(struct rdma_cm_event *event) +{ + uct_rdmacm_priv_data_hdr_t *hdr = (uct_rdmacm_priv_data_hdr_t *) + event->param.conn.private_data; + uct_rdmacm_listener_t *listener = event->listen_id->context; + char dev_name[UCT_DEVICE_NAME_MAX]; + uct_device_addr_t *dev_addr; + size_t addr_length; + uct_cm_remote_data_t remote_data; + ucs_status_t status; + uct_cm_listener_conn_request_args_t conn_req_args; + ucs_sock_addr_t client_saddr; + size_t size; + + ucs_assert(hdr->status == UCS_OK); + + uct_rdmacm_cm_id_to_dev_name(event->id, dev_name); + + status = uct_rdmacm_cm_id_to_dev_addr(event->id, &dev_addr, &addr_length); + if (status != UCS_OK) { + goto err; + } + + remote_data.field_mask = UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR | + UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR_LENGTH | + UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA | + UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA_LENGTH; + remote_data.dev_addr = dev_addr; + remote_data.dev_addr_length = addr_length; + remote_data.conn_priv_data = hdr + 1; + remote_data.conn_priv_data_length = hdr->length; + + client_saddr.addr = rdma_get_peer_addr(event->id); + + status = ucs_sockaddr_sizeof(client_saddr.addr, &size); + if (status != UCS_OK) { + goto err_free_dev_addr; + } + + client_saddr.addrlen = size; + + conn_req_args.field_mask = UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_DEV_NAME | + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CONN_REQUEST | + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_REMOTE_DATA | + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CLIENT_ADDR; + conn_req_args.conn_request = event; + conn_req_args.remote_data = &remote_data; + conn_req_args.client_address = client_saddr; + ucs_strncpy_safe(conn_req_args.dev_name, dev_name, UCT_DEVICE_NAME_MAX); + + listener->conn_request_cb(&listener->super, listener->user_data, + &conn_req_args); + ucs_free(dev_addr); + + return; + +err_free_dev_addr: + ucs_free(dev_addr); +err: + uct_rdmacm_cm_reject(event->id); + uct_rdmacm_cm_destroy_id(event->id); + 
uct_rdmacm_cm_ack_event(event); +} + +static void uct_rdmacm_cm_handle_event_connect_response(struct rdma_cm_event *event) +{ + struct sockaddr *remote_addr = rdma_get_peer_addr(event->id); + uct_rdmacm_priv_data_hdr_t *hdr = (uct_rdmacm_priv_data_hdr_t *) + event->param.conn.private_data; + uct_rdmacm_cm_ep_t *cep = event->id->context; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + uct_device_addr_t *dev_addr; + size_t addr_length; + uct_cm_remote_data_t remote_data; + ucs_status_t status; + + ucs_assert(event->id == cep->id); + + /* Do not notify user on disconnected EP, RDMACM out of order case */ + if (cep->flags & UCT_RDMACM_CM_EP_GOT_DISCONNECT) { + return; + } + + remote_data.field_mask = UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA | + UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA_LENGTH; + remote_data.conn_priv_data = hdr + 1; + remote_data.conn_priv_data_length = hdr->length; + + status = uct_rdmacm_cm_id_to_dev_addr(event->id, &dev_addr, &addr_length); + if (status != UCS_OK) { + ucs_error("client (ep=%p id=%p) failed to process a connect response " + "from server %s.", cep, event->id, + ucs_sockaddr_str(remote_addr, ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + uct_rdmacm_cm_ep_set_failed(cep, &remote_data, status); + return; + } + + remote_data.field_mask |= UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR | + UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR_LENGTH; + remote_data.dev_addr = dev_addr; + remote_data.dev_addr_length = addr_length; + + uct_rdmacm_cm_ep_client_connect_cb(cep, &remote_data, + (ucs_status_t)hdr->status); + ucs_free(dev_addr); +} + +static void uct_rdmacm_cm_handle_event_established(struct rdma_cm_event *event) +{ + uct_rdmacm_cm_ep_t *cep = event->id->context; + + ucs_assert(event->id == cep->id); + /* do not call connect callback again, RDMACM out of order case */ + if (cep->flags & UCT_RDMACM_CM_EP_GOT_DISCONNECT) { + return; + } + + uct_rdmacm_cm_ep_server_conn_notify_cb(cep, UCS_OK); +} + +static void uct_rdmacm_cm_handle_event_disconnected(struct rdma_cm_event 
*event) +{ + uct_rdmacm_cm_ep_t *cep = event->id->context; + struct sockaddr UCS_V_UNUSED *remote_addr = rdma_get_peer_addr(event->id); + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + char ep_str[UCT_RDMACM_EP_STRING_LEN]; + uct_cm_remote_data_t remote_data; + + ucs_debug("%s: got disconnect event, status %d peer %s", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + event->status, ucs_sockaddr_str(remote_addr, ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + + cep->flags |= UCT_RDMACM_CM_EP_GOT_DISCONNECT; + /* calling error_cb instead of disconnect CB directly handles out-of-order + * disconnect event prior connect_response/connect_established event */ + remote_data.field_mask = 0; + uct_rdmacm_cm_ep_error_cb(cep, &remote_data, UCS_ERR_CONNECTION_RESET); +} + +static void uct_rdmacm_cm_handle_error_event(struct rdma_cm_event *event) +{ + uct_rdmacm_cm_ep_t *cep = event->id->context; + struct sockaddr *remote_addr = rdma_get_peer_addr(event->id); + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + char ep_str[UCT_RDMACM_EP_STRING_LEN]; + uct_cm_remote_data_t remote_data; + ucs_log_level_t log_level; + ucs_status_t status; + + if (event->event == RDMA_CM_EVENT_REJECTED) { + if (cep->flags & UCT_RDMACM_CM_EP_ON_SERVER) { + /* response was rejected by the client in the middle of + * connection establishment, so report connection reset */ + status = UCS_ERR_CONNECTION_RESET; + } else { + ucs_assert(cep->flags & UCT_RDMACM_CM_EP_ON_CLIENT); + status = UCS_ERR_REJECTED; + } + + log_level = UCS_LOG_LEVEL_DEBUG; + } else { + status = UCS_ERR_IO_ERROR; + log_level = UCS_LOG_LEVEL_ERROR; + } + + ucs_log(log_level, "%s: got error event %s, status %d peer %s", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + rdma_event_str(event->event), event->status, + ucs_sockaddr_str(remote_addr, ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + + if (uct_rdmacm_ep_is_connected(cep) && + !(cep->flags & UCT_RDMACM_CM_EP_FAILED)) { + /* first failure on connected EP has to be 
reported as disconnect event + * to allow user to call disconnect due to UCT API limitation - + * disconnect callback does not have status arg */ + uct_rdmacm_cm_handle_event_disconnected(event); + } else { + remote_data.field_mask = 0; + uct_rdmacm_cm_ep_set_failed(cep, &remote_data, status); + } +} + +static void +uct_rdmacm_cm_process_event(uct_rdmacm_cm_t *cm, struct rdma_cm_event *event) +{ + struct sockaddr UCS_V_UNUSED *remote_addr = rdma_get_peer_addr(event->id); + uint8_t ack_event = 1; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + + ucs_trace("rdmacm event (fd=%d cm_id %p cm %p event_channel %p status %s): %s. Peer: %s.", + cm->ev_ch->fd, event->id, cm, cm->ev_ch, strerror(event->status), + rdma_event_str(event->event), + ucs_sockaddr_str(remote_addr, ip_port_str, UCS_SOCKADDR_STRING_LEN)); + + /* The following applies for rdma_cm_id of type RDMA_PS_TCP only */ + ucs_assert(event->id->ps == RDMA_PS_TCP); + + /* Using https://linux.die.net/man/3/rdma_get_cm_event to distinguish + * between client and server events */ + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + /* Client side event */ + uct_rdmacm_cm_handle_event_addr_resolved(event); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + /* Client side event */ + uct_rdmacm_cm_handle_event_route_resolved(event); + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + /* Server side event */ + uct_rdmacm_cm_handle_event_connect_request(event); + /* The server will ack the event after accepting/rejecting the request + * (in ep_create). 
*/ + ack_event = 0; + break; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + /* Client side event */ + uct_rdmacm_cm_handle_event_connect_response(event); + break; + case RDMA_CM_EVENT_ESTABLISHED: + /* Server side event */ + uct_rdmacm_cm_handle_event_established(event); + break; + case RDMA_CM_EVENT_DISCONNECTED: + /* Client and Server side event */ + uct_rdmacm_cm_handle_event_disconnected(event); + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + /* This event is generated when the QP associated with the connection + * has exited its timewait state and is now ready to be re-used. + * After a QP has been disconnected, it is maintained in a timewait + * state to allow any in flight packets to exit the network. + * After the timewait state has completed, the rdma_cm will report this event.*/ + break; + /* client error events */ + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_ADDR_CHANGE: + /* client and server error events */ + case RDMA_CM_EVENT_REJECTED: + case RDMA_CM_EVENT_CONNECT_ERROR: + uct_rdmacm_cm_handle_error_event(event); + break; + default: + ucs_warn("unexpected RDMACM event: %s", rdma_event_str(event->event)); + break; + } + + if (ack_event) { + uct_rdmacm_cm_ack_event(event); + } +} + +static void uct_rdmacm_cm_event_handler(int fd, int events, void *arg) +{ + uct_rdmacm_cm_t *cm = (uct_rdmacm_cm_t *)arg; + struct rdma_cm_event *event; + int ret; + + for (;;) { + /* Fetch an event */ + ret = rdma_get_cm_event(cm->ev_ch, &event); + if (ret) { + /* EAGAIN (in a non-blocking rdma_get_cm_event) means that + * there are no more events */ + if ((errno != EAGAIN) && (errno != EINTR)) { + ucs_warn("rdma_get_cm_event() failed: %m"); + } + + return; + } + + UCS_ASYNC_BLOCK(uct_rdmacm_cm_get_async(cm)); + uct_rdmacm_cm_process_event(cm, event); + UCS_ASYNC_UNBLOCK(uct_rdmacm_cm_get_async(cm)); + } +} + +static uct_cm_ops_t uct_rdmacm_cm_ops = { + .close = 
UCS_CLASS_DELETE_FUNC_NAME(uct_rdmacm_cm_t), + .cm_query = uct_rdmacm_cm_query, + .listener_create = UCS_CLASS_NEW_FUNC_NAME(uct_rdmacm_listener_t), + .listener_reject = uct_rdmacm_listener_reject, + .listener_query = uct_rdmacm_listener_query, + .listener_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_rdmacm_listener_t), + .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_rdmacm_cm_ep_t) +}; + +static uct_iface_ops_t uct_rdmacm_cm_iface_ops = { + .ep_pending_purge = ucs_empty_function, + .ep_disconnect = uct_rdmacm_cm_ep_disconnect, + .cm_ep_conn_notify = uct_rdmacm_cm_ep_conn_notify, + .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_rdmacm_cm_ep_t), + .ep_put_short = (uct_ep_put_short_func_t)ucs_empty_function_return_unsupported, + .ep_put_bcopy = (uct_ep_put_bcopy_func_t)ucs_empty_function_return_unsupported, + .ep_get_bcopy = (uct_ep_get_bcopy_func_t)ucs_empty_function_return_unsupported, + .ep_am_short = (uct_ep_am_short_func_t)ucs_empty_function_return_unsupported, + .ep_am_bcopy = (uct_ep_am_bcopy_func_t)ucs_empty_function_return_unsupported, + .ep_atomic_cswap64 = (uct_ep_atomic_cswap64_func_t)ucs_empty_function_return_unsupported, + .ep_atomic64_post = (uct_ep_atomic64_post_func_t)ucs_empty_function_return_unsupported, + .ep_atomic64_fetch = (uct_ep_atomic64_fetch_func_t)ucs_empty_function_return_unsupported, + .ep_atomic_cswap32 = (uct_ep_atomic_cswap32_func_t)ucs_empty_function_return_unsupported, + .ep_atomic32_post = (uct_ep_atomic32_post_func_t)ucs_empty_function_return_unsupported, + .ep_atomic32_fetch = (uct_ep_atomic32_fetch_func_t)ucs_empty_function_return_unsupported, + .ep_pending_add = (uct_ep_pending_add_func_t)ucs_empty_function_return_unsupported, + .ep_flush = (uct_ep_flush_func_t)ucs_empty_function_return_success, + .ep_fence = (uct_ep_fence_func_t)ucs_empty_function_return_unsupported, + .ep_check = (uct_ep_check_func_t)ucs_empty_function_return_unsupported, + .ep_create = (uct_ep_create_func_t)ucs_empty_function_return_unsupported, + .iface_flush = 
(uct_iface_flush_func_t)ucs_empty_function_return_unsupported, + .iface_fence = (uct_iface_fence_func_t)ucs_empty_function_return_unsupported, + .iface_progress_enable = ucs_empty_function, + .iface_progress_disable = ucs_empty_function, + .iface_progress = (uct_iface_progress_func_t)ucs_empty_function_return_zero, + .iface_event_fd_get = (uct_iface_event_fd_get_func_t)ucs_empty_function_return_unsupported, + .iface_event_arm = (uct_iface_event_arm_func_t)ucs_empty_function_return_unsupported, + .iface_close = ucs_empty_function, + .iface_query = (uct_iface_query_func_t)ucs_empty_function_return_unsupported, + .iface_get_device_address = (uct_iface_get_device_address_func_t)ucs_empty_function_return_unsupported, + .iface_get_address = (uct_iface_get_address_func_t)ucs_empty_function_return_unsupported, + .iface_is_reachable = (uct_iface_is_reachable_func_t)ucs_empty_function_return_zero +}; + +UCS_CLASS_INIT_FUNC(uct_rdmacm_cm_t, uct_component_h component, + uct_worker_h worker, const uct_cm_config_t *config) +{ + uct_priv_worker_t *worker_priv; + ucs_status_t status; + + UCS_CLASS_CALL_SUPER_INIT(uct_cm_t, &uct_rdmacm_cm_ops, + &uct_rdmacm_cm_iface_ops, worker, component); + + self->ev_ch = rdma_create_event_channel(); + if (self->ev_ch == NULL) { + ucs_error("rdma_create_event_channel failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err; + } + + /* Set the event_channel fd to non-blocking mode + * (so that rdma_get_cm_event won't be blocking) */ + status = ucs_sys_fcntl_modfl(self->ev_ch->fd, O_NONBLOCK, 0); + if (status != UCS_OK) { + status = UCS_ERR_IO_ERROR; + goto err_destroy_ev_ch; + } + + worker_priv = ucs_derived_of(worker, uct_priv_worker_t); + status = ucs_async_set_event_handler(worker_priv->async->mode, + self->ev_ch->fd, UCS_EVENT_SET_EVREAD, + uct_rdmacm_cm_event_handler, self, + worker_priv->async); + if (status != UCS_OK) { + goto err_destroy_ev_ch; + } + + ucs_debug("created rdmacm_cm %p with event_channel %p (fd=%d)", + self, self->ev_ch, 
self->ev_ch->fd); + + return UCS_OK; + +err_destroy_ev_ch: + rdma_destroy_event_channel(self->ev_ch); +err: + return status; +} + +UCS_CLASS_CLEANUP_FUNC(uct_rdmacm_cm_t) +{ + ucs_status_t status; + + status = ucs_async_remove_handler(self->ev_ch->fd, 1); + if (status != UCS_OK) { + ucs_warn("failed to remove event handler for fd %d: %s", + self->ev_ch->fd, ucs_status_string(status)); + } + + ucs_trace("destroying event_channel %p on cm %p", self->ev_ch, self); + rdma_destroy_event_channel(self->ev_ch); +} + +UCS_CLASS_DEFINE(uct_rdmacm_cm_t, uct_cm_t); +UCS_CLASS_DEFINE_NEW_FUNC(uct_rdmacm_cm_t, uct_cm_t, uct_component_h, + uct_worker_h, const uct_cm_config_t *); +UCS_CLASS_DEFINE_DELETE_FUNC(uct_rdmacm_cm_t, uct_cm_t); diff --git a/src/uct/ib/rdmacm/rdmacm_cm.h b/src/uct/ib/rdmacm/rdmacm_cm.h new file mode 100644 index 00000000000..6a236719c79 --- /dev/null +++ b/src/uct/ib/rdmacm/rdmacm_cm.h @@ -0,0 +1,41 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifndef UCT_RDMACM_CM_H +#define UCT_RDMACM_CM_H + +#include +#include "rdmacm_def.h" + + +/** + * An rdmacm connection manager + */ +typedef struct uct_rdmacm_cm { + uct_cm_t super; + struct rdma_event_channel *ev_ch; +} uct_rdmacm_cm_t; + +UCS_CLASS_DECLARE_NEW_FUNC(uct_rdmacm_cm_t, uct_cm_t, uct_component_h, + uct_worker_h, const uct_cm_config_t *); +UCS_CLASS_DECLARE_DELETE_FUNC(uct_rdmacm_cm_t, uct_cm_t); + +static UCS_F_ALWAYS_INLINE ucs_async_context_t * +uct_rdmacm_cm_get_async(uct_rdmacm_cm_t *cm) +{ + uct_priv_worker_t *wpriv = ucs_derived_of(cm->super.iface.worker, + uct_priv_worker_t); + + return wpriv->async; +} + +ucs_status_t uct_rdmacm_cm_destroy_id(struct rdma_cm_id *id); + +ucs_status_t uct_rdmacm_cm_ack_event(struct rdma_cm_event *event); + +ucs_status_t uct_rdmacm_cm_reject(struct rdma_cm_id *id); + +#endif diff --git a/src/uct/ib/rdmacm/rdmacm_cm_ep.c b/src/uct/ib/rdmacm/rdmacm_cm_ep.c new file mode 100644 index 00000000000..e1f37f47a23 --- /dev/null +++ b/src/uct/ib/rdmacm/rdmacm_cm_ep.c @@ -0,0 +1,532 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "rdmacm_cm_ep.h" +#include "rdmacm_cm.h" +#include + + +static UCS_F_ALWAYS_INLINE +uct_rdmacm_cm_t *uct_rdmacm_cm_ep_get_cm(uct_rdmacm_cm_ep_t *cep) +{ + /* return the rdmacm connection manager this ep is using */ + return ucs_container_of(cep->super.super.super.iface, uct_rdmacm_cm_t, + super.iface); +} + +static UCS_F_ALWAYS_INLINE +ucs_async_context_t *uct_rdmacm_cm_ep_get_async(uct_rdmacm_cm_ep_t *cep) +{ + return uct_rdmacm_cm_get_async(uct_rdmacm_cm_ep_get_cm(cep)); +} + +const char* uct_rdmacm_cm_ep_str(uct_rdmacm_cm_ep_t *cep, char *str, + size_t max_len) +{ + char flags_buf[UCT_RDMACM_EP_FLAGS_STRING_LEN]; + + static const char *ep_flag_to_str[] = { + [ucs_ilog2(UCT_RDMACM_CM_EP_ON_CLIENT)] = "client", + [ucs_ilog2(UCT_RDMACM_CM_EP_ON_SERVER)] = "server", + [ucs_ilog2(UCT_RDMACM_CM_EP_CLIENT_CONN_CB_INVOKED)] = "connect_cb_invoked", + [ucs_ilog2(UCT_RDMACM_CM_EP_SERVER_NOTIFY_CB_INVOKED)] = "notify_cb_invoked", + [ucs_ilog2(UCT_RDMACM_CM_EP_GOT_DISCONNECT)] = "got_disconnect", + [ucs_ilog2(UCT_RDMACM_CM_EP_DISCONNECTING)] = "disconnecting", + [ucs_ilog2(UCT_RDMACM_CM_EP_FAILED)] = "failed", + NULL + }; + + ucs_flags_str(flags_buf, sizeof(flags_buf), cep->flags, ep_flag_to_str); + ucs_snprintf_safe(str, max_len, "rdmacm_ep %p, status %s, flags %s", + cep, ucs_status_string(cep->status), flags_buf); + return str; +} + +int uct_rdmacm_ep_is_connected(uct_rdmacm_cm_ep_t *cep) +{ + return cep->flags & (UCT_RDMACM_CM_EP_CLIENT_CONN_CB_INVOKED | + UCT_RDMACM_CM_EP_SERVER_NOTIFY_CB_INVOKED); +} + +void uct_rdmacm_cm_ep_client_connect_cb(uct_rdmacm_cm_ep_t *cep, + uct_cm_remote_data_t *remote_data, + ucs_status_t status) +{ + cep->flags |= UCT_RDMACM_CM_EP_CLIENT_CONN_CB_INVOKED; + uct_cm_ep_client_connect_cb(&cep->super, remote_data, status); +} + +void uct_rdmacm_cm_ep_server_conn_notify_cb(uct_rdmacm_cm_ep_t *cep, + ucs_status_t status) +{ + cep->flags |= 
UCT_RDMACM_CM_EP_SERVER_NOTIFY_CB_INVOKED; + uct_cm_ep_server_conn_notify_cb(&cep->super, status); +} + +void uct_rdmacm_cm_ep_error_cb(uct_rdmacm_cm_ep_t *cep, + uct_cm_remote_data_t *remote_data, + ucs_status_t status) +{ + if (cep->flags & UCT_RDMACM_CM_EP_FAILED) { + return; + } + + ucs_assert(status != UCS_OK); + cep->status = status; + + if (uct_rdmacm_ep_is_connected(cep)) { + /* already connected, so call disconnect callback */ + uct_cm_ep_disconnect_cb(&cep->super); + } else if (cep->flags & UCT_RDMACM_CM_EP_ON_CLIENT) { + /* not connected yet, so call client side connect callback with err + * status */ + uct_rdmacm_cm_ep_client_connect_cb(cep, remote_data, status); + } else { + ucs_assert(cep->flags & UCT_RDMACM_CM_EP_ON_SERVER); + /* not connected yet, so call server side notify callback with err + * status */ + uct_rdmacm_cm_ep_server_conn_notify_cb(cep, status); + } +} + +void uct_rdmacm_cm_ep_set_failed(uct_rdmacm_cm_ep_t *cep, + uct_cm_remote_data_t *remote_data, + ucs_status_t status) +{ + UCS_ASYNC_BLOCK(uct_rdmacm_cm_ep_get_async(cep)); + uct_rdmacm_cm_ep_error_cb(cep, remote_data, status); + cep->flags |= UCT_RDMACM_CM_EP_FAILED; + UCS_ASYNC_UNBLOCK(uct_rdmacm_cm_ep_get_async(cep)); +} + +ucs_status_t uct_rdmacm_cm_ep_conn_notify(uct_ep_h ep) +{ + uct_rdmacm_cm_ep_t *cep = ucs_derived_of(ep, uct_rdmacm_cm_ep_t); + struct sockaddr *remote_addr = rdma_get_peer_addr(cep->id); + uct_rdmacm_cm_t UCS_V_UNUSED *rdmacm_cm = uct_rdmacm_cm_ep_get_cm(cep); + char ep_str[UCT_RDMACM_EP_STRING_LEN]; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + + ucs_trace("%s: rdma_establish on client (cm_id %p, rdmacm %p, event_channel=%p)", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + cep->id, rdmacm_cm, rdmacm_cm->ev_ch); + + UCS_ASYNC_BLOCK(uct_rdmacm_cm_ep_get_async(cep)); + if (cep->flags & (UCT_RDMACM_CM_EP_FAILED | + UCT_RDMACM_CM_EP_GOT_DISCONNECT)) { + goto ep_failed; + } + + UCS_ASYNC_UNBLOCK(uct_rdmacm_cm_ep_get_async(cep)); + + if 
(rdma_establish(cep->id)) { + ucs_error("rdma_establish on ep %p (to server addr=%s) failed: %m", + cep, ucs_sockaddr_str(remote_addr, ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + UCS_ASYNC_BLOCK(uct_rdmacm_cm_ep_get_async(cep)); + cep->status = UCS_ERR_IO_ERROR; + cep->flags |= UCT_RDMACM_CM_EP_FAILED; + goto ep_failed; + } + + return UCS_OK; + +ep_failed: + UCS_ASYNC_UNBLOCK(uct_rdmacm_cm_ep_get_async(cep)); + return cep->status; +} + +static void uct_rdmacm_cm_ep_destroy_dummy_cq_qp(uct_rdmacm_cm_ep_t *cep) +{ + int ret; + + if (cep->qp != NULL) { + ret = ibv_destroy_qp(cep->qp); + if (ret != 0) { + ucs_warn("ibv_destroy_qp() returned %d: %m", ret); + } + } + + if (cep->cq != NULL) { + ret = ibv_destroy_cq(cep->cq); + if (ret != 0) { + ucs_warn("ibv_destroy_cq() returned %d: %m", ret); + } + } + + cep->qp = NULL; + cep->cq = NULL; +} + +static ucs_status_t uct_rdmacm_cm_create_dummy_cq_qp(struct rdma_cm_id *id, + struct ibv_cq **cq_p, + struct ibv_qp **qp_p) +{ + struct ibv_qp_init_attr qp_init_attr; + ucs_status_t status; + struct ibv_cq *cq; + struct ibv_qp *qp; + + /* Create a dummy completion queue */ + cq = ibv_create_cq(id->verbs, 1, NULL, NULL, 0); + if (cq == NULL) { + ucs_error("ibv_create_cq() failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err; + } + + /* Create a dummy UD qp */ + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + qp_init_attr.send_cq = cq; + qp_init_attr.recv_cq = cq; + qp_init_attr.qp_type = IBV_QPT_UD; + qp_init_attr.cap.max_send_wr = 2; + qp_init_attr.cap.max_recv_wr = 2; + qp_init_attr.cap.max_send_sge = 1; + qp_init_attr.cap.max_recv_sge = 1; + + qp = ibv_create_qp(id->pd, &qp_init_attr); + if (qp == NULL) { + ucs_error("failed to create a dummy ud qp. 
%m"); + status = UCS_ERR_IO_ERROR; + goto err_destroy_cq; + } + + ucs_debug("created ud QP %p with qp_num: 0x%x and cq %p on rdmacm_id %p", + qp, qp->qp_num, cq, id); + + *cq_p = cq; + *qp_p = qp; + + return UCS_OK; + +err_destroy_cq: + ibv_destroy_cq(cq); +err: + return status; +} + +ucs_status_t +uct_rdamcm_cm_ep_set_qp_num(struct rdma_conn_param *conn_param, + uct_rdmacm_cm_ep_t *cep) +{ + ucs_status_t status; + struct ibv_qp *qp; + struct ibv_cq *cq; + + /* create a dummy qp in order to get a unique qp_num to provide to librdmacm */ + status = uct_rdmacm_cm_create_dummy_cq_qp(cep->id, &cq, &qp); + if (status != UCS_OK) { + return status; + } + + cep->cq = cq; + cep->qp = qp; + conn_param->qp_num = qp->qp_num; + return UCS_OK; +} + +ucs_status_t uct_rdmacm_cm_ep_conn_param_init(uct_rdmacm_cm_ep_t *cep, + struct rdma_conn_param *conn_param) +{ + uct_rdmacm_priv_data_hdr_t *hdr; + ucs_status_t status; + char dev_name[UCT_DEVICE_NAME_MAX]; + size_t priv_data_ret; + uct_cm_ep_priv_data_pack_args_t pack_args; + + uct_rdmacm_cm_id_to_dev_name(cep->id, dev_name); + + /* Pack data to send inside rdmacm's conn_param to the remote peer */ + hdr = (uct_rdmacm_priv_data_hdr_t*)conn_param->private_data; + pack_args.field_mask = UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME; + ucs_strncpy_safe(pack_args.dev_name, dev_name, UCT_DEVICE_NAME_MAX); + + status = uct_cm_ep_pack_cb(&cep->super, cep->super.user_data, &pack_args, + hdr + 1, uct_rdmacm_cm_get_max_conn_priv(), + &priv_data_ret); + + if (status != UCS_OK) { + goto err; + } + + ucs_assert_always(priv_data_ret <= UINT8_MAX); + hdr->length = (uint8_t)priv_data_ret; + hdr->status = UCS_OK; + + status = uct_rdamcm_cm_ep_set_qp_num(conn_param, cep); + if (status != UCS_OK) { + goto err; + } + + conn_param->private_data_len = sizeof(*hdr) + hdr->length; + + return UCS_OK; + +err: + return status; +} + +static ucs_status_t uct_rdamcm_cm_ep_client_init(uct_rdmacm_cm_ep_t *cep, + const uct_ep_params_t *params) +{ + 
uct_rdmacm_cm_t *rdmacm_cm = uct_rdmacm_cm_ep_get_cm(cep); + uct_cm_base_ep_t *cm_ep = &cep->super; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + char ep_str[UCT_RDMACM_EP_STRING_LEN]; + ucs_status_t status; + + cep->flags |= UCT_RDMACM_CM_EP_ON_CLIENT; + + status = UCT_CM_SET_CB(params, UCT_EP_PARAM_FIELD_SOCKADDR_CONNECT_CB_CLIENT, + cm_ep->client.connect_cb, params->sockaddr_cb_client, + uct_cm_ep_client_connect_callback_t, + ucs_empty_function); + if (status != UCS_OK) { + goto err; + } + + ucs_trace("%s: rdma_create_id on client (rdmacm %p, event_channel=%p)", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + rdmacm_cm, rdmacm_cm->ev_ch); + + if (rdma_create_id(rdmacm_cm->ev_ch, &cep->id, cep, RDMA_PS_TCP)) { + ucs_error("rdma_create_id() failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err; + } + + /* rdma_resolve_addr needs to be called last in the ep_create flow to + * prevent a race where there are uninitialized fields used when the + * RDMA_CM_EVENT_ROUTE_RESOLVED event is already received in the async + * thread. Therefore, all ep fields have to be initialized before this + * function is called.
*/ + ucs_trace("%s: rdma_resolve_addr on cm_id %p", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), cep->id); + if (rdma_resolve_addr(cep->id, NULL, (struct sockaddr *)params->sockaddr->addr, + 1000/* TODO */)) { + ucs_error("rdma_resolve_addr() to dst addr %s failed: %m", + ucs_sockaddr_str((struct sockaddr *)params->sockaddr->addr, + ip_port_str, UCS_SOCKADDR_STRING_LEN)); + status = UCS_ERR_IO_ERROR; + goto err_destroy_id; + } + + return UCS_OK; + +err_destroy_id: + uct_rdmacm_cm_destroy_id(cep->id); +err: + return status; +} + +static ucs_status_t uct_rdamcm_cm_ep_server_init(uct_rdmacm_cm_ep_t *cep, + const uct_ep_params_t *params) +{ + struct rdma_cm_event *event = (struct rdma_cm_event *)params->conn_request; + uct_rdmacm_cm_t *cm = uct_rdmacm_cm_ep_get_cm(cep); + uct_cm_base_ep_t *cm_ep = &cep->super; + struct rdma_conn_param conn_param; + ucs_status_t status; + char ep_str[UCT_RDMACM_EP_STRING_LEN]; + + cep->flags |= UCT_RDMACM_CM_EP_ON_SERVER; + + if (event->listen_id->channel != cm->ev_ch) { + /* the server will open the ep to the client on a different CM. 
+ * not the one on which its listener is listening on */ + if (rdma_migrate_id(event->id, cm->ev_ch)) { + ucs_error("failed to migrate id %p to event_channel %p (cm=%p)", + event->id, cm->ev_ch, cm); + uct_rdmacm_cm_reject(event->id); + status = UCS_ERR_IO_ERROR; + goto err; + } + + ucs_debug("%s: migrated id %p from event_channel=%p to " + "new cm %p (event_channel=%p)", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + event->id, event->listen_id->channel, cm, cm->ev_ch); + } + + status = UCT_CM_SET_CB(params, UCT_EP_PARAM_FIELD_SOCKADDR_NOTIFY_CB_SERVER, + cm_ep->server.notify_cb, params->sockaddr_cb_server, + uct_cm_ep_server_conn_notify_callback_t, + ucs_empty_function); + if (status != UCS_OK) { + goto err; + } + + cep->id = event->id; + cep->id->context = cep; + + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.private_data = ucs_alloca(uct_rdmacm_cm_get_max_conn_priv() + + sizeof(uct_rdmacm_priv_data_hdr_t)); + + status = uct_rdmacm_cm_ep_conn_param_init(cep, &conn_param); + if (status != UCS_OK) { + uct_rdmacm_cm_reject(event->id); + goto err; + } + + ucs_trace("%s: rdma_accept on cm_id %p", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + event->id); + + if (rdma_accept(event->id, &conn_param)) { + ucs_error("rdma_accept(on id=%p) failed: %m", event->id); + uct_rdmacm_cm_ep_destroy_dummy_cq_qp(cep); + status = UCS_ERR_IO_ERROR; + goto err; + } + + uct_rdmacm_cm_ack_event(event); + return UCS_OK; + +err: + UCS_ASYNC_BLOCK(uct_rdmacm_cm_ep_get_async(cep)); + cep->status = status; + cep->flags |= UCT_RDMACM_CM_EP_FAILED; + UCS_ASYNC_UNBLOCK(uct_rdmacm_cm_ep_get_async(cep)); + uct_rdmacm_cm_destroy_id(event->id); + uct_rdmacm_cm_ack_event(event); + return status; +} + +ucs_status_t uct_rdmacm_cm_ep_disconnect(uct_ep_h ep, unsigned flags) +{ + uct_rdmacm_cm_ep_t *cep = ucs_derived_of(ep, uct_rdmacm_cm_ep_t); + char ep_str[UCT_RDMACM_EP_STRING_LEN]; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + ucs_status_t status; + + 
UCS_ASYNC_BLOCK(uct_rdmacm_cm_ep_get_async(cep)); + if (ucs_unlikely(cep->flags & UCT_RDMACM_CM_EP_FAILED)) { + ucs_error("%s: id=%p to peer %s", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + cep->id, ucs_sockaddr_str(rdma_get_peer_addr(cep->id), + ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + status = cep->status; + goto out; + } + + if (ucs_unlikely(cep->flags & UCT_RDMACM_CM_EP_DISCONNECTING)) { + if (cep->flags & UCT_RDMACM_CM_EP_GOT_DISCONNECT) { + ucs_error("%s: duplicate call of uct_ep_disconnect on a " + "disconnected ep (id=%p to peer %s)", + uct_rdmacm_cm_ep_str(cep, ep_str, + UCT_RDMACM_EP_STRING_LEN), + cep->id, ucs_sockaddr_str(rdma_get_peer_addr(cep->id), + ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + status = UCS_ERR_NOT_CONNECTED; + goto out; + } + + ucs_debug("%s: duplicate call of uct_ep_disconnect on an ep " + "that was not disconnected yet (id=%p to peer %s).", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + cep->id, ucs_sockaddr_str(rdma_get_peer_addr(cep->id), + ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + status = UCS_INPROGRESS; + goto out; + } + + if (!uct_rdmacm_ep_is_connected(cep)) { + ucs_debug("%s: calling uct_ep_disconnect on an ep that is not " + "connected yet (id=%p to peer %s)", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + cep->id, ucs_sockaddr_str(rdma_get_peer_addr(cep->id), + ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + status = UCS_ERR_BUSY; + goto out; + } + + cep->flags |= UCT_RDMACM_CM_EP_DISCONNECTING; + if (rdma_disconnect(cep->id)) { + ucs_error("%s: (id=%p) failed to disconnect from peer %p", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + cep->id, ucs_sockaddr_str(rdma_get_peer_addr(cep->id), ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + status = UCS_ERR_IO_ERROR; + goto out; + } + + ucs_debug("%s: (id=%p) disconnecting from peer :%s", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + cep->id, ucs_sockaddr_str(rdma_get_peer_addr(cep->id), 
ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + status = UCS_OK; + +out: + UCS_ASYNC_UNBLOCK(uct_rdmacm_cm_ep_get_async(cep)); + return status; +} + +UCS_CLASS_INIT_FUNC(uct_rdmacm_cm_ep_t, const uct_ep_params_t *params) +{ + ucs_status_t status; + char ep_str[UCT_RDMACM_EP_STRING_LEN]; + + UCS_CLASS_CALL_SUPER_INIT(uct_cm_base_ep_t, params); + + self->cq = NULL; + self->qp = NULL; + self->flags = 0; + self->status = UCS_OK; + + if (params->field_mask & UCT_EP_PARAM_FIELD_SOCKADDR) { + status = uct_rdamcm_cm_ep_client_init(self, params); + } else if (params->field_mask & UCT_EP_PARAM_FIELD_CONN_REQUEST) { + status = uct_rdamcm_cm_ep_server_init(self, params); + } else { + ucs_error("either UCT_EP_PARAM_FIELD_SOCKADDR or UCT_EP_PARAM_FIELD_CONN_REQUEST " + "has to be provided"); + status = UCS_ERR_INVALID_PARAM; + } + + if (status == UCS_OK) { + ucs_debug("%s: created an endpoint on rdmacm %p id: %p", + uct_rdmacm_cm_ep_str(self, ep_str, UCT_RDMACM_EP_STRING_LEN), + uct_rdmacm_cm_ep_get_cm(self), self->id); + } + + return status; +} + +UCS_CLASS_CLEANUP_FUNC(uct_rdmacm_cm_ep_t) +{ + uct_rdmacm_cm_t *rdmacm_cm = uct_rdmacm_cm_ep_get_cm(self); + uct_priv_worker_t *worker_priv = ucs_derived_of(rdmacm_cm->super.iface.worker, + uct_priv_worker_t); + char ep_str[UCT_RDMACM_EP_STRING_LEN]; + + ucs_trace("%s: destroy ep on cm %p (worker_priv=%p)", + uct_rdmacm_cm_ep_str(self, ep_str, UCT_RDMACM_EP_STRING_LEN), + rdmacm_cm, worker_priv); + + UCS_ASYNC_BLOCK(worker_priv->async); + + uct_rdmacm_cm_ep_destroy_dummy_cq_qp(self); + + /* rdma_destroy_id() cleans all events not yet reported on progress thread, + * so no events would be reported to the user after destroying the id */ + uct_rdmacm_cm_destroy_id(self->id); + + UCS_ASYNC_UNBLOCK(worker_priv->async); +} + +UCS_CLASS_DEFINE(uct_rdmacm_cm_ep_t, uct_base_ep_t); +UCS_CLASS_DEFINE_NEW_FUNC(uct_rdmacm_cm_ep_t, uct_ep_t, const uct_ep_params_t *); +UCS_CLASS_DEFINE_DELETE_FUNC(uct_rdmacm_cm_ep_t, uct_ep_t); diff --git 
a/src/uct/ib/rdmacm/rdmacm_cm_ep.h b/src/uct/ib/rdmacm/rdmacm_cm_ep.h new file mode 100644 index 00000000000..dd708a138ed --- /dev/null +++ b/src/uct/ib/rdmacm/rdmacm_cm_ep.h @@ -0,0 +1,69 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#include "rdmacm_listener.h" + + +/** + * RDMACM endpoint that is opened on a connection manager + */ +typedef struct uct_rdmacm_cm_ep { + uct_cm_base_ep_t super; + struct rdma_cm_id *id; /* The rdmacm id that is created per this ep */ + struct ibv_cq *cq; /* Dummy cq used for creating a dummy qp */ + struct ibv_qp *qp; /* Dummy qp used for generating a unique qp_num */ + uint8_t flags; + ucs_status_t status; +} uct_rdmacm_cm_ep_t; + +enum { + UCT_RDMACM_CM_EP_ON_CLIENT = UCS_BIT(0), + UCT_RDMACM_CM_EP_ON_SERVER = UCS_BIT(1), + UCT_RDMACM_CM_EP_CLIENT_CONN_CB_INVOKED = UCS_BIT(2), /* Connect callback was + invoked on the client. */ + UCT_RDMACM_CM_EP_SERVER_NOTIFY_CB_INVOKED = UCS_BIT(3), /* Notify callback was + invoked on the server. */ + UCT_RDMACM_CM_EP_GOT_DISCONNECT = UCS_BIT(4), /* Got disconnect event. */ + UCT_RDMACM_CM_EP_DISCONNECTING = UCS_BIT(5), /* @ref uct_ep_disconnect was + called on the ep. 
*/ + UCT_RDMACM_CM_EP_FAILED = UCS_BIT(6) /* The EP is in error state, + see @ref + uct_rdmacm_cm_ep_t::status.*/ +}; + +UCS_CLASS_DECLARE_NEW_FUNC(uct_rdmacm_cm_ep_t, uct_ep_t, const uct_ep_params_t *); +UCS_CLASS_DECLARE_DELETE_FUNC(uct_rdmacm_cm_ep_t, uct_ep_t); + +ucs_status_t uct_rdmacm_cm_ep_disconnect(uct_ep_h ep, unsigned flags); + +ucs_status_t uct_rdmacm_cm_ep_conn_notify(uct_ep_h ep); + +ucs_status_t +uct_rdamcm_cm_ep_set_qp_num(struct rdma_conn_param *conn_param, + uct_rdmacm_cm_ep_t *cep); + +ucs_status_t uct_rdmacm_cm_ep_conn_param_init(uct_rdmacm_cm_ep_t *cep, + struct rdma_conn_param *conn_param); + +void uct_rdmacm_cm_ep_error_cb(uct_rdmacm_cm_ep_t *cep, + uct_cm_remote_data_t *remote_data, + ucs_status_t status); + +void uct_rdmacm_cm_ep_set_failed(uct_rdmacm_cm_ep_t *cep, + uct_cm_remote_data_t *remote_data, + ucs_status_t status); + +const char* uct_rdmacm_cm_ep_str(uct_rdmacm_cm_ep_t *cep, char *str, + size_t max_len); + +int uct_rdmacm_ep_is_connected(uct_rdmacm_cm_ep_t *cep); + +void uct_rdmacm_cm_ep_client_connect_cb(uct_rdmacm_cm_ep_t *cep, + uct_cm_remote_data_t *remote_data, + ucs_status_t status); + +void uct_rdmacm_cm_ep_server_conn_notify_cb(uct_rdmacm_cm_ep_t *cep, + ucs_status_t status); diff --git a/src/uct/ib/rdmacm/rdmacm_def.h b/src/uct/ib/rdmacm/rdmacm_def.h index fbcd7d0e6a9..6220be24665 100644 --- a/src/uct/ib/rdmacm/rdmacm_def.h +++ b/src/uct/ib/rdmacm/rdmacm_def.h @@ -13,18 +13,24 @@ #include #include #include +#include +#include #include #include #define UCT_RDMACM_TL_NAME "rdmacm" #define UCT_RDMACM_UDP_PRIV_DATA_LEN 136 /** See rdma_accept(3) */ +#define UCT_RDMACM_TCP_PRIV_DATA_LEN 56 /** See rdma_connect(3) */ +#define UCT_RDMACM_EP_FLAGS_STRING_LEN 128 /** A string to hold the + representation of the ep flags */ +#define UCT_RDMACM_EP_STRING_LEN 192 /** A string to hold the ep info */ typedef struct uct_rdmacm_iface uct_rdmacm_iface_t; typedef struct uct_rdmacm_ep uct_rdmacm_ep_t; typedef struct 
uct_rdmacm_priv_data_hdr { uint8_t length; /* length of the private data */ - int8_t status; + uint8_t status; } uct_rdmacm_priv_data_hdr_t; typedef struct uct_rdmacm_ctx { @@ -33,6 +39,8 @@ typedef struct uct_rdmacm_ctx { ucs_list_link_t list; /* for list of used cm_ids */ } uct_rdmacm_ctx_t; +size_t uct_rdmacm_cm_get_max_conn_priv(); + ucs_status_t uct_rdmacm_resolve_addr(struct rdma_cm_id *cm_id, struct sockaddr *addr, int timeout_ms, ucs_log_level_t log_level); @@ -41,4 +49,10 @@ ucs_status_t uct_rdmacm_ep_resolve_addr(uct_rdmacm_ep_t *ep); ucs_status_t uct_rdmacm_ep_set_cm_id(uct_rdmacm_iface_t *iface, uct_rdmacm_ep_t *ep); +static inline void uct_rdmacm_cm_id_to_dev_name(struct rdma_cm_id *cm_id, char *dev_name) +{ + ucs_snprintf_zero(dev_name, UCT_DEVICE_NAME_MAX, "%s:%d", + ibv_get_device_name(cm_id->verbs->device), cm_id->port_num); +} + #endif /* UCT_RDMACM_H */ diff --git a/src/uct/ib/rdmacm/rdmacm_ep.c b/src/uct/ib/rdmacm/rdmacm_ep.c index b993b5e5476..fd170e15034 100644 --- a/src/uct/ib/rdmacm/rdmacm_ep.c +++ b/src/uct/ib/rdmacm/rdmacm_ep.c @@ -3,6 +3,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rdmacm_ep.h" @@ -260,4 +264,5 @@ void uct_rdmacm_ep_invoke_completions(uct_rdmacm_ep_t *ep, ucs_status_t status) ucs_free(op); pthread_mutex_lock(&ep->ops_mutex); } + /* coverity[missing_unlock] */ } diff --git a/src/uct/ib/rdmacm/rdmacm_ep.h b/src/uct/ib/rdmacm/rdmacm_ep.h index b2fb02e53b2..3eb323c288e 100644 --- a/src/uct/ib/rdmacm/rdmacm_ep.h +++ b/src/uct/ib/rdmacm/rdmacm_ep.h @@ -18,20 +18,20 @@ struct uct_rdmacm_ep_op { struct uct_rdmacm_ep { - uct_base_ep_t super; - uct_sockaddr_priv_pack_callback_t pack_cb; - void *pack_cb_arg; - uint32_t pack_cb_flags; - int is_on_pending; - - pthread_mutex_t ops_mutex; /* guards ops and status */ - ucs_queue_head_t ops; - ucs_status_t status; /* client EP status */ - - ucs_list_link_t list_elem; /* for the pending_eps_list */ - struct sockaddr_storage remote_addr; - uct_worker_cb_id_t slow_prog_id; - uct_rdmacm_ctx_t *cm_id_ctx; + uct_base_ep_t super; + uct_cm_ep_priv_data_pack_callback_t pack_cb; + void *pack_cb_arg; + uint32_t pack_cb_flags; + int is_on_pending; + + pthread_mutex_t ops_mutex; /* guards ops and status */ + ucs_queue_head_t ops; + ucs_status_t status; /* client EP status */ + + ucs_list_link_t list_elem; /* for the pending_eps_list */ + struct sockaddr_storage remote_addr; + uct_worker_cb_id_t slow_prog_id; + uct_rdmacm_ctx_t *cm_id_ctx; }; UCS_CLASS_DECLARE_NEW_FUNC(uct_rdmacm_ep_t, uct_ep_t, const uct_ep_params_t *); diff --git a/src/uct/ib/rdmacm/rdmacm_iface.c b/src/uct/ib/rdmacm/rdmacm_iface.c index 276970c4c7a..4f3dbcc9c79 100644 --- a/src/uct/ib/rdmacm/rdmacm_iface.c +++ b/src/uct/ib/rdmacm/rdmacm_iface.c @@ -3,6 +3,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rdmacm_iface.h" #include "rdmacm_ep.h" #include @@ -15,6 +19,10 @@ enum uct_rdmacm_process_event_flags { }; static ucs_config_field_t uct_rdmacm_iface_config_table[] = { + {"", "", NULL, + ucs_offsetof(uct_rdmacm_iface_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, + {"BACKLOG", "1024", "Maximum number of pending connections for an rdma_cm_id.", ucs_offsetof(uct_rdmacm_iface_config_t, backlog), UCS_CONFIG_TYPE_UINT}, @@ -31,7 +39,11 @@ static UCS_CLASS_DECLARE_DELETE_FUNC(uct_rdmacm_iface_t, uct_iface_t); static ucs_status_t uct_rdmacm_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { - memset(iface_attr, 0, sizeof(uct_iface_attr_t)); + uct_rdmacm_iface_t *rdmacm_iface = ucs_derived_of(tl_iface, uct_rdmacm_iface_t); + struct sockaddr *addr; + ucs_status_t status; + + uct_base_iface_query(&rdmacm_iface->super, iface_attr); iface_attr->iface_addr_len = sizeof(ucs_sock_addr_t); iface_attr->device_addr_len = 0; @@ -42,15 +54,16 @@ static ucs_status_t uct_rdmacm_iface_query(uct_iface_h tl_iface, * the private_data header (to hold the length of the data) */ iface_attr->max_conn_priv = UCT_RDMACM_MAX_CONN_PRIV; - return UCS_OK; -} + if (rdmacm_iface->is_server) { + addr = rdma_get_local_addr(rdmacm_iface->cm_id); + status = ucs_sockaddr_copy((struct sockaddr *)&iface_attr->listen_sockaddr, + addr); + if (status != UCS_OK) { + return status; + } + } -static int uct_rdmacm_iface_is_reachable(const uct_iface_h tl_iface, - const uct_device_addr_t *dev_addr, - const uct_iface_addr_t *iface_addr) -{ - /* Reachability can be checked with the uct_md_is_sockaddr_accessible API call */ - return 1; + return UCS_OK; } static ucs_status_t uct_rdmacm_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t *iface_addr) @@ -98,7 +111,7 @@ static ucs_status_t uct_rdmacm_iface_reject(uct_iface_h tl_iface, ucs_status_t status = UCS_OK; uct_rdmacm_priv_data_hdr_t hdr = { .length = 0, - 
.status = UCS_ERR_REJECTED + .status = (uint8_t)UCS_ERR_REJECTED }; ucs_trace("rejecting event %p with id %p", event, event->id); @@ -143,15 +156,15 @@ static uct_iface_ops_t uct_rdmacm_iface_ops = { .ep_pending_purge = ucs_empty_function, .iface_accept = uct_rdmacm_iface_accept, .iface_reject = uct_rdmacm_iface_reject, - .iface_progress_enable = (void*)ucs_empty_function_return_success, - .iface_progress_disable = (void*)ucs_empty_function_return_success, + .iface_progress_enable = (uct_iface_progress_enable_func_t)ucs_empty_function_return_success, + .iface_progress_disable = (uct_iface_progress_disable_func_t)ucs_empty_function_return_success, .iface_progress = ucs_empty_function_return_zero, .iface_flush = uct_base_iface_flush, .iface_fence = uct_base_iface_fence, .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_rdmacm_iface_t), .iface_query = uct_rdmacm_iface_query, - .iface_is_reachable = uct_rdmacm_iface_is_reachable, - .iface_get_device_address = (void*)ucs_empty_function_return_success, + .iface_is_reachable = (uct_iface_is_reachable_func_t)ucs_empty_function_return_zero, + .iface_get_device_address = (uct_iface_get_device_address_func_t)ucs_empty_function_return_success, .iface_get_address = uct_rdmacm_iface_get_address }; @@ -236,8 +249,10 @@ static void uct_rdmacm_iface_process_conn_req(uct_rdmacm_iface_t *iface, * is locked. 
*/ static void uct_rdmacm_iface_release_cm_id(uct_rdmacm_iface_t *iface, - uct_rdmacm_ctx_t *cm_id_ctx) + uct_rdmacm_ctx_t **cm_id_ctx_p) { + uct_rdmacm_ctx_t *cm_id_ctx = *cm_id_ctx_p; + ucs_trace("destroying cm_id %p", cm_id_ctx->cm_id); ucs_list_del(&cm_id_ctx->list); @@ -247,13 +262,8 @@ static void uct_rdmacm_iface_release_cm_id(uct_rdmacm_iface_t *iface, rdma_destroy_id(cm_id_ctx->cm_id); ucs_free(cm_id_ctx); iface->cm_id_quota++; -} -static void uct_rdmacm_iface_cm_id_to_dev_name(struct rdma_cm_id *cm_id, - char *dev_name) -{ - ucs_snprintf_zero(dev_name, UCT_DEVICE_NAME_MAX, "%s:%d", - ibv_get_device_name(cm_id->verbs->device), cm_id->port_num); + *cm_id_ctx_p = NULL; } static unsigned @@ -264,9 +274,10 @@ uct_rdmacm_iface_process_event(uct_rdmacm_iface_t *iface, uct_rdmacm_md_t *rdmacm_md = (uct_rdmacm_md_t *)iface->super.md; unsigned ret_flags = UCT_RDMACM_PROCESS_EVENT_ACK_EVENT_FLAG; uct_rdmacm_ep_t *ep = NULL; + uct_cm_ep_priv_data_pack_args_t pack_args; char ip_port_str[UCS_SOCKADDR_STRING_LEN]; char dev_name[UCT_DEVICE_NAME_MAX]; - uct_rdmacm_priv_data_hdr_t hdr; + uct_rdmacm_priv_data_hdr_t *hdr; struct rdma_conn_param conn_param; uct_rdmacm_ctx_t *cm_id_ctx; ssize_t priv_data_ret; @@ -314,30 +325,29 @@ uct_rdmacm_iface_process_event(uct_rdmacm_iface_t *iface, conn_param.private_data = ucs_alloca(UCT_RDMACM_MAX_CONN_PRIV + sizeof(uct_rdmacm_priv_data_hdr_t)); - uct_rdmacm_iface_cm_id_to_dev_name(ep->cm_id_ctx->cm_id, dev_name); + uct_rdmacm_cm_id_to_dev_name(ep->cm_id_ctx->cm_id, dev_name); + + hdr = (uct_rdmacm_priv_data_hdr_t*)conn_param.private_data; + pack_args.field_mask = UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME; + ucs_strncpy_safe(pack_args.dev_name, dev_name, UCT_DEVICE_NAME_MAX); /* TODO check the ep's cb_flags to determine when to invoke this callback. 
* currently only UCT_CB_FLAG_ASYNC is supported so the cb is invoked from here */ - priv_data_ret = ep->pack_cb(ep->pack_cb_arg, dev_name, - (void*)(conn_param.private_data + - sizeof(uct_rdmacm_priv_data_hdr_t))); + priv_data_ret = ep->pack_cb(ep->pack_cb_arg, &pack_args, hdr + 1); if (priv_data_ret < 0) { ucs_trace("rdmacm client (iface=%p cm_id=%p fd=%d) failed to fill " "private data. status: %s", iface, event->id, iface->event_ch->fd, - ucs_status_string(priv_data_ret)); + ucs_status_string((ucs_status_t)priv_data_ret)); ret_flags |= UCT_RDMACM_PROCESS_EVENT_DESTROY_CM_ID_FLAG; - uct_rdmacm_client_handle_failure(iface, ep, priv_data_ret); + uct_rdmacm_client_handle_failure(iface, ep, (ucs_status_t)priv_data_ret); break; } - hdr.length = (uint8_t)priv_data_ret; - hdr.status = UCS_OK; - UCS_STATIC_ASSERT(sizeof(hdr) == sizeof(uct_rdmacm_priv_data_hdr_t)); + hdr->length = (uint8_t)priv_data_ret; + hdr->status = UCS_OK; /* The private_data starts with the header of the user's private data * and then the private data itself */ - memcpy((void*)conn_param.private_data, &hdr, sizeof(uct_rdmacm_priv_data_hdr_t)); - conn_param.private_data_len = sizeof(uct_rdmacm_priv_data_hdr_t) + - hdr.length; + conn_param.private_data_len = sizeof(*hdr) + hdr->length; if (rdma_connect(event->id, &conn_param)) { ucs_error("rdma_connect(to addr=%s) failed: %m", @@ -381,11 +391,11 @@ uct_rdmacm_iface_process_event(uct_rdmacm_iface_t *iface, /* client error events */ case RDMA_CM_EVENT_UNREACHABLE: - hdr = *(uct_rdmacm_priv_data_hdr_t *)event->param.conn.private_data; - if ((event->param.conn.private_data_len > 0) && - (hdr.status == UCS_ERR_REJECTED)) { - ucs_assert(hdr.length == 0); - ucs_assert(event->param.conn.private_data_len >= sizeof(hdr)); + hdr = (uct_rdmacm_priv_data_hdr_t *)event->param.ud.private_data; + if ((hdr != NULL) && (event->param.ud.private_data_len > 0) && + ((ucs_status_t)hdr->status == UCS_ERR_REJECTED)) { + ucs_assert(hdr->length == 0); + 
ucs_assert(event->param.ud.private_data_len >= sizeof(*hdr)); ucs_assert(!iface->is_server); status = UCS_ERR_REJECTED; } @@ -417,7 +427,7 @@ uct_rdmacm_iface_process_event(uct_rdmacm_iface_t *iface, return ret_flags; } -static void uct_rdmacm_iface_event_handler(int fd, void *arg) +static void uct_rdmacm_iface_event_handler(int fd, int events, void *arg) { uct_rdmacm_iface_t *iface = arg; uct_rdmacm_ctx_t *cm_id_ctx = NULL; @@ -451,7 +461,7 @@ static void uct_rdmacm_iface_event_handler(int fd, void *arg) if ((proc_event_flags & UCT_RDMACM_PROCESS_EVENT_DESTROY_CM_ID_FLAG) && (cm_id_ctx != NULL)) { - uct_rdmacm_iface_release_cm_id(iface, cm_id_ctx); + uct_rdmacm_iface_release_cm_id(iface, &cm_id_ctx); uct_rdmacm_iface_client_start_next_ep(iface); } } @@ -513,6 +523,8 @@ static UCS_CLASS_INIT_FUNC(uct_rdmacm_iface_t, uct_md_h md, uct_worker_h worker, } if (params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER) { + self->is_server = 1; + /* Create an id for this interface. Events associated with this id will be * reported on the event_channel that was previously created. 
*/ if (rdma_create_id(self->event_ch, &self->cm_id, NULL, RDMA_PS_UDP)) { @@ -552,7 +564,6 @@ static UCS_CLASS_INIT_FUNC(uct_rdmacm_iface_t, uct_md_h md, uct_worker_h worker, self->cb_flags = params->mode.sockaddr.cb_flags; self->conn_request_cb = params->mode.sockaddr.conn_request_cb; self->conn_request_arg = params->mode.sockaddr.conn_request_arg; - self->is_server = 1; } else { self->cm_id = NULL; self->is_server = 0; @@ -564,7 +575,7 @@ static UCS_CLASS_INIT_FUNC(uct_rdmacm_iface_t, uct_md_h md, uct_worker_h worker, /* Server and client register an event handler for incoming messages */ status = ucs_async_set_event_handler(self->super.worker->async->mode, - self->event_ch->fd, POLLIN, + self->event_ch->fd, UCS_EVENT_SET_EVREAD, uct_rdmacm_iface_event_handler, self, self->super.worker->async); if (status != UCS_OK) { @@ -578,7 +589,9 @@ static UCS_CLASS_INIT_FUNC(uct_rdmacm_iface_t, uct_md_h md, uct_worker_h worker, return UCS_OK; err_destroy_id: - rdma_destroy_id(self->cm_id); + if (self->is_server) { + rdma_destroy_id(self->cm_id); + } err_destroy_event_channel: rdma_destroy_event_channel(self->event_ch); err: @@ -587,7 +600,7 @@ static UCS_CLASS_INIT_FUNC(uct_rdmacm_iface_t, uct_md_h md, uct_worker_h worker, static UCS_CLASS_CLEANUP_FUNC(uct_rdmacm_iface_t) { - uct_rdmacm_ctx_t *cm_id_ctx; + uct_rdmacm_ctx_t *cm_id_ctx, *tmp_cm_id_ctx; ucs_async_remove_handler(self->event_ch->fd, 1); if (self->is_server) { @@ -596,12 +609,9 @@ static UCS_CLASS_CLEANUP_FUNC(uct_rdmacm_iface_t) UCS_ASYNC_BLOCK(self->super.worker->async); - while (!ucs_list_is_empty(&self->used_cm_ids_list)) { - cm_id_ctx = ucs_list_extract_head(&self->used_cm_ids_list, - uct_rdmacm_ctx_t, list); - rdma_destroy_id(cm_id_ctx->cm_id); - ucs_free(cm_id_ctx); - self->cm_id_quota++; + ucs_list_for_each_safe(cm_id_ctx, tmp_cm_id_ctx, + &self->used_cm_ids_list, list) { + uct_rdmacm_iface_release_cm_id(self, &cm_id_ctx); } UCS_ASYNC_UNBLOCK(self->super.worker->async); @@ -615,20 +625,15 @@ static 
UCS_CLASS_DEFINE_NEW_FUNC(uct_rdmacm_iface_t, uct_iface_t, uct_md_h, const uct_iface_config_t *); static UCS_CLASS_DEFINE_DELETE_FUNC(uct_rdmacm_iface_t, uct_iface_t); -static ucs_status_t uct_rdmacm_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) +static ucs_status_t +uct_rdmacm_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) { - *num_resources_p = 0; - *resource_p = NULL; + *num_tl_devices_p = 0; + *tl_devices_p = NULL; return UCS_OK; } -UCT_TL_COMPONENT_DEFINE(uct_rdmacm_tl, - uct_rdmacm_query_tl_resources, - uct_rdmacm_iface_t, - UCT_RDMACM_TL_NAME, - "RDMACM_", - uct_rdmacm_iface_config_table, - uct_rdmacm_iface_config_t); -UCT_MD_REGISTER_TL(&uct_rdmacm_mdc, &uct_rdmacm_tl); +UCT_TL_DEFINE(&uct_rdmacm_component, rdmacm, uct_rdmacm_query_tl_devices, + uct_rdmacm_iface_t, "RDMACM_", uct_rdmacm_iface_config_table, + uct_rdmacm_iface_config_t); diff --git a/src/uct/ib/rdmacm/rdmacm_iface.h b/src/uct/ib/rdmacm/rdmacm_iface.h index f83b52eb66a..a10297f3c36 100644 --- a/src/uct/ib/rdmacm/rdmacm_iface.h +++ b/src/uct/ib/rdmacm/rdmacm_iface.h @@ -43,6 +43,6 @@ struct uct_rdmacm_iface { void uct_rdmacm_iface_client_start_next_ep(uct_rdmacm_iface_t *iface); -extern uct_md_component_t uct_rdmacm_mdc; +extern uct_component_t uct_rdmacm_component; #endif diff --git a/src/uct/ib/rdmacm/rdmacm_listener.c b/src/uct/ib/rdmacm/rdmacm_listener.c new file mode 100644 index 00000000000..f785e0cf369 --- /dev/null +++ b/src/uct/ib/rdmacm/rdmacm_listener.c @@ -0,0 +1,112 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "rdmacm_listener.h" + + +UCS_CLASS_INIT_FUNC(uct_rdmacm_listener_t, uct_cm_h cm, + const struct sockaddr *saddr, socklen_t socklen, + const uct_listener_params_t *params) +{ + uct_rdmacm_cm_t *rdmacm_cm = ucs_derived_of(cm, uct_rdmacm_cm_t); + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + ucs_status_t status; + int backlog; + + UCS_CLASS_CALL_SUPER_INIT(uct_listener_t, cm); + + self->conn_request_cb = params->conn_request_cb; + self->user_data = (params->field_mask & UCT_LISTENER_PARAM_FIELD_USER_DATA) ? + params->user_data : NULL; + + if (rdma_create_id(rdmacm_cm->ev_ch, &self->id, self, RDMA_PS_TCP)) { + ucs_error("rdma_create_id() failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err; + } + + if (rdma_bind_addr(self->id, (struct sockaddr *)saddr)) { + status = ((errno == EADDRINUSE) || (errno == EADDRNOTAVAIL)) ? + UCS_ERR_BUSY : UCS_ERR_IO_ERROR; + ucs_error("rdma_bind_addr(addr=%s) failed: %m", + ucs_sockaddr_str(saddr, ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + goto err_destroy_id; + } + + backlog = (params->field_mask & UCT_LISTENER_PARAM_FIELD_BACKLOG) ? + params->backlog : SOMAXCONN; + if (rdma_listen(self->id, backlog)) { + ucs_error("rdma_listen(id:=%p addr=%s backlog=%d) failed: %m", + self->id, ucs_sockaddr_str(saddr, ip_port_str, + UCS_SOCKADDR_STRING_LEN), + backlog); + status = UCS_ERR_IO_ERROR; + goto err_destroy_id; + } + + ucs_debug("created an RDMACM listener %p on cm %p with cm_id: %p. 
" + "listening on %s:%d", self, cm, self->id, + ucs_sockaddr_str(saddr, ip_port_str, UCS_SOCKADDR_STRING_LEN), + ntohs(rdma_get_src_port(self->id))); + + return UCS_OK; + +err_destroy_id: + uct_rdmacm_cm_destroy_id(self->id); +err: + return status; +} + +ucs_status_t uct_rdmacm_listener_reject(uct_listener_h listener, + uct_conn_request_h conn_request) +{ + uct_rdmacm_listener_t *rdmacm_listener = ucs_derived_of(listener, uct_rdmacm_listener_t); + struct rdma_cm_event *event = (struct rdma_cm_event *)conn_request; + + ucs_assert_always(rdmacm_listener->id == event->listen_id); + + uct_rdmacm_cm_reject(event->id); + + uct_rdmacm_cm_destroy_id(event->id); + + return uct_rdmacm_cm_ack_event(event); +} + +UCS_CLASS_CLEANUP_FUNC(uct_rdmacm_listener_t) +{ + uct_rdmacm_cm_destroy_id(self->id); +} + +ucs_status_t uct_rdmacm_listener_query(uct_listener_h listener, + uct_listener_attr_t *listener_attr) +{ + uct_rdmacm_listener_t *rdmacm_listener = ucs_derived_of(listener, + uct_rdmacm_listener_t); + struct sockaddr *addr; + ucs_status_t status; + + if (listener_attr->field_mask & UCT_LISTENER_ATTR_FIELD_SOCKADDR) { + addr = rdma_get_local_addr(rdmacm_listener->id); + status = ucs_sockaddr_copy((struct sockaddr *)&listener_attr->sockaddr, + addr); + if (status != UCS_OK) { + return status; + } + } + + return UCS_OK; +} + +UCS_CLASS_DEFINE(uct_rdmacm_listener_t, uct_listener_t); +UCS_CLASS_DEFINE_NEW_FUNC(uct_rdmacm_listener_t, uct_listener_t, + uct_cm_h , const struct sockaddr *, socklen_t , + const uct_listener_params_t *); +UCS_CLASS_DEFINE_DELETE_FUNC(uct_rdmacm_listener_t, uct_listener_t); diff --git a/src/uct/ib/rdmacm/rdmacm_listener.h b/src/uct/ib/rdmacm/rdmacm_listener.h new file mode 100644 index 00000000000..16ecbbdc8a7 --- /dev/null +++ b/src/uct/ib/rdmacm/rdmacm_listener.h @@ -0,0 +1,35 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#include "rdmacm_cm.h" + +/** + * An rdmacm listener for incoming connection requests on the server side. + */ +typedef struct uct_rdmacm_listener { + uct_listener_t super; + + /** The rdmacm id associated with the listener */ + struct rdma_cm_id *id; + + /** Callback to invoke upon receiving a connection request from a client */ + uct_cm_listener_conn_request_callback_t conn_request_cb; + + /** User's data to be passed as argument to the conn_request_cb */ + void *user_data; +} uct_rdmacm_listener_t; + + +UCS_CLASS_DECLARE_NEW_FUNC(uct_rdmacm_listener_t, uct_listener_t, + uct_cm_h , const struct sockaddr *, socklen_t , + const uct_listener_params_t *); +UCS_CLASS_DECLARE_DELETE_FUNC(uct_rdmacm_listener_t, uct_listener_t); + +ucs_status_t uct_rdmacm_listener_query(uct_listener_h listener, + uct_listener_attr_t *listener_attr); + +ucs_status_t uct_rdmacm_listener_reject(uct_listener_h listener, + uct_conn_request_h conn_request); diff --git a/src/uct/ib/rdmacm/rdmacm_md.c b/src/uct/ib/rdmacm/rdmacm_md.c index 30eb7de215a..85b54823a6b 100644 --- a/src/uct/ib/rdmacm/rdmacm_md.c +++ b/src/uct/ib/rdmacm/rdmacm_md.c @@ -1,11 +1,16 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2017. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2017-2019. ALL RIGHTS RESERVED. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rdmacm_md.h" +#include "rdmacm_cm.h" -#define UCT_RDMACM_MD_PREFIX "rdmacm" static ucs_config_field_t uct_rdmacm_md_config_table[] = { {"", "", NULL, @@ -21,10 +26,10 @@ static ucs_config_field_t uct_rdmacm_md_config_table[] = { static void uct_rdmacm_md_close(uct_md_h md); static uct_md_ops_t uct_rdmacm_md_ops = { - .close = uct_rdmacm_md_close, - .query = uct_rdmacm_md_query, - .is_sockaddr_accessible = uct_rdmacm_is_sockaddr_accessible, - .is_mem_type_owned = (void *)ucs_empty_function_return_zero, + .close = uct_rdmacm_md_close, + .query = uct_rdmacm_md_query, + .is_sockaddr_accessible = uct_rdmacm_is_sockaddr_accessible, + .detect_memory_type = ucs_empty_function_return_unsupported, }; static void uct_rdmacm_md_close(uct_md_h md) @@ -35,28 +40,30 @@ static void uct_rdmacm_md_close(uct_md_h md) ucs_status_t uct_rdmacm_md_query(uct_md_h md, uct_md_attr_t *md_attr) { - md_attr->cap.flags = UCT_MD_FLAG_SOCKADDR; - md_attr->cap.reg_mem_types = 0; - md_attr->cap.mem_type = UCT_MD_MEM_TYPE_HOST; - md_attr->cap.max_alloc = 0; - md_attr->cap.max_reg = 0; - md_attr->rkey_packed_size = 0; - md_attr->reg_cost.overhead = 0; - md_attr->reg_cost.growth = 0; + md_attr->cap.flags = UCT_MD_FLAG_SOCKADDR; + md_attr->cap.reg_mem_types = 0; + md_attr->cap.access_mem_type = UCS_MEMORY_TYPE_HOST; + md_attr->cap.detect_mem_types = 0; + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = 0; + md_attr->rkey_packed_size = 0; + md_attr->reg_cost = ucs_linear_func_make(0, 0); memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); return UCS_OK; } -static int uct_rdmacm_get_event_type(struct rdma_event_channel *event_ch) +static enum rdma_cm_event_type +uct_rdmacm_get_event_type(struct rdma_event_channel *event_ch) { + enum rdma_cm_event_type event_type; struct rdma_cm_event *event; - int ret, event_type; + int ret; /* Fetch an event */ ret = rdma_get_cm_event(event_ch, &event); if (ret) { 
ucs_warn("rdma_get_cm_event() failed: %m"); - return 0; + return RDMA_CM_EVENT_ADDR_RESOLVED; } event_type = event->event; @@ -73,8 +80,8 @@ static int uct_rdmacm_is_addr_route_resolved(struct rdma_cm_id *cm_id, int timeout_ms) { char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + enum rdma_cm_event_type event_type; ucs_status_t status; - int event_type; status = uct_rdmacm_resolve_addr(cm_id, addr, timeout_ms, UCS_LOG_LEVEL_DEBUG); if (status != UCS_OK) { @@ -112,25 +119,6 @@ static int uct_rdmacm_is_addr_route_resolved(struct rdma_cm_id *cm_id, return 1; } -static int uct_rdmacm_is_sockaddr_inaddr_any(struct sockaddr *addr) -{ - struct sockaddr_in6 *addr_in6; - struct sockaddr_in *addr_in; - - switch (addr->sa_family) { - case AF_INET: - addr_in = (struct sockaddr_in *)addr; - return addr_in->sin_addr.s_addr == INADDR_ANY; - case AF_INET6: - addr_in6 = (struct sockaddr_in6 *)addr; - return !memcmp(&addr_in6->sin6_addr, &in6addr_any, sizeof(addr_in6->sin6_addr)); - default: - ucs_debug("Invalid address family: %d", addr->sa_family); - } - - return 0; -} - int uct_rdmacm_is_sockaddr_accessible(uct_md_h md, const ucs_sock_addr_t *sockaddr, uct_sockaddr_accessibility_t mode) { @@ -165,7 +153,7 @@ int uct_rdmacm_is_sockaddr_accessible(uct_md_h md, const ucs_sock_addr_t *sockad goto out_destroy_id; } - if (uct_rdmacm_is_sockaddr_inaddr_any((struct sockaddr *)sockaddr->addr)) { + if (ucs_sockaddr_is_inaddr_any((struct sockaddr *)sockaddr->addr)) { is_accessible = 1; goto out_print; } @@ -194,8 +182,10 @@ int uct_rdmacm_is_sockaddr_accessible(uct_md_h md, const ucs_sock_addr_t *sockad return is_accessible; } -static ucs_status_t uct_rdmacm_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) +static ucs_status_t +uct_rdmacm_query_md_resources(uct_component_t *component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) { struct rdma_event_channel *event_ch = NULL; @@ -204,21 +194,22 @@ static ucs_status_t 
uct_rdmacm_query_md_resources(uct_md_resource_desc_t **resou if (event_ch == NULL) { ucs_debug("could not create an RDMACM event channel. %m. " "Disabling the RDMACM resource"); - *resources_p = NULL; - *num_resources_p = 0; - return UCS_OK; + return uct_md_query_empty_md_resource(resources_p, num_resources_p); + } rdma_destroy_event_channel(event_ch); - return uct_single_md_resource(&uct_rdmacm_mdc, resources_p, num_resources_p); + return uct_md_query_single_md_resource(component, resources_p, + num_resources_p); } static ucs_status_t -uct_rdmacm_md_open(const char *md_name, const uct_md_config_t *uct_md_config, - uct_md_h *md_p) +uct_rdmacm_md_open(uct_component_t *component, const char *md_name, + const uct_md_config_t *uct_md_config, uct_md_h *md_p) { - uct_rdmacm_md_config_t *md_config = ucs_derived_of(uct_md_config, uct_rdmacm_md_config_t); + uct_rdmacm_md_config_t *md_config = ucs_derived_of(uct_md_config, + uct_rdmacm_md_config_t); uct_rdmacm_md_t *md; ucs_status_t status; @@ -229,9 +220,10 @@ uct_rdmacm_md_open(const char *md_name, const uct_md_config_t *uct_md_config, } md->super.ops = &uct_rdmacm_md_ops; - md->super.component = &uct_rdmacm_mdc; + md->super.component = &uct_rdmacm_component; md->addr_resolve_timeout = md_config->addr_resolve_timeout; + /* cppcheck-suppress autoVariables */ *md_p = &md->super; status = UCS_OK; @@ -239,8 +231,35 @@ uct_rdmacm_md_open(const char *md_name, const uct_md_config_t *uct_md_config, return status; } -UCT_MD_COMPONENT_DEFINE(uct_rdmacm_mdc, UCT_RDMACM_MD_PREFIX, - uct_rdmacm_query_md_resources, uct_rdmacm_md_open, NULL, - ucs_empty_function_return_unsupported, - (void*)ucs_empty_function_return_success, - "RDMACM_", uct_rdmacm_md_config_table, uct_rdmacm_md_config_t); +uct_component_t uct_rdmacm_component = { + .query_md_resources = uct_rdmacm_query_md_resources, + .md_open = uct_rdmacm_md_open, +#if HAVE_RDMACM_QP_LESS + .cm_open = UCS_CLASS_NEW_FUNC_NAME(uct_rdmacm_cm_t), +#else + .cm_open = 
ucs_empty_function_return_unsupported, +#endif + .rkey_unpack = ucs_empty_function_return_unsupported, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = ucs_empty_function_return_success, + .name = "rdmacm", + .md_config = { + .name = "RDMA-CM memory domain", + .prefix = "RDMACM_", + .table = uct_rdmacm_md_config_table, + .size = sizeof(uct_rdmacm_md_config_t), + }, + .cm_config = { + .name = "RDMA-CM connection manager", + .prefix = "RDMACM_", + .table = uct_cm_config_table, + .size = sizeof(uct_cm_config_t), + }, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_rdmacm_component), +#if HAVE_RDMACM_QP_LESS + .flags = UCT_COMPONENT_FLAG_CM +#else + .flags = 0 +#endif +}; +UCT_COMPONENT_REGISTER(&uct_rdmacm_component) diff --git a/src/uct/ib/rdmacm/rdmacm_md.h b/src/uct/ib/rdmacm/rdmacm_md.h index 04fae393799..cd93010aae2 100644 --- a/src/uct/ib/rdmacm/rdmacm_md.h +++ b/src/uct/ib/rdmacm/rdmacm_md.h @@ -28,7 +28,7 @@ typedef struct uct_rdmacm_md_config { double addr_resolve_timeout; } uct_rdmacm_md_config_t; -extern uct_md_component_t uct_rdmacm_mdc; +extern uct_component_t uct_rdmacm_component; ucs_status_t uct_rdmacm_md_query(uct_md_h md, uct_md_attr_t *md_attr); diff --git a/src/uct/ib/ud/accel/ud_mlx5.c b/src/uct/ib/ud/accel/ud_mlx5.c index 049ae5dd3f7..1e4da5fea2b 100644 --- a/src/uct/ib/ud/accel/ud_mlx5.c +++ b/src/uct/ib/ud/accel/ud_mlx5.c @@ -5,6 +5,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ud_mlx5.h" #include @@ -19,7 +23,7 @@ #include #include -#include +#include #include #include @@ -32,11 +36,11 @@ static ucs_config_field_t uct_ud_mlx5_iface_config_table[] = { ucs_offsetof(uct_ud_mlx5_iface_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_ud_iface_config_table)}, - {"", "", NULL, + {UCT_IB_CONFIG_PREFIX, "", NULL, ucs_offsetof(uct_ud_mlx5_iface_config_t, mlx5_common), UCS_CONFIG_TYPE_TABLE(uct_ib_mlx5_iface_config_table)}, - {"", "", NULL, + {"UD_", "", NULL, ucs_offsetof(uct_ud_mlx5_iface_config_t, ud_mlx5_common), UCS_CONFIG_TYPE_TABLE(uct_ud_mlx5_iface_common_config_table)}, @@ -49,16 +53,30 @@ uct_ud_mlx5_ep_ctrl_av_size(uct_ud_mlx5_ep_t *ep) return sizeof(struct mlx5_wqe_ctrl_seg) + uct_ib_mlx5_wqe_av_size(&ep->av); } +static UCS_F_ALWAYS_INLINE size_t uct_ud_mlx5_max_am_iov() +{ + return ucs_min(UCT_IB_MLX5_AM_ZCOPY_MAX_IOV, UCT_IB_MAX_IOV); +} + +static UCS_F_ALWAYS_INLINE size_t uct_ud_mlx5_max_inline() +{ + return UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE); +} + static UCS_F_ALWAYS_INLINE void uct_ud_mlx5_post_send(uct_ud_mlx5_iface_t *iface, uct_ud_mlx5_ep_t *ep, - uint8_t se, struct mlx5_wqe_ctrl_seg *ctrl, size_t wqe_size, - int max_log_sge) + uint8_t ce_se, struct mlx5_wqe_ctrl_seg *ctrl, + size_t wqe_size, uct_ud_neth_t *neth, int max_log_sge) { struct mlx5_wqe_datagram_seg *dgram = (void*)(ctrl + 1); + ucs_assert(wqe_size <= UCT_IB_MLX5_MAX_SEND_WQE_SIZE); + + UCT_UD_EP_HOOK_CALL_TX(&ep->super, neth); + uct_ib_mlx5_set_ctrl_seg(ctrl, iface->tx.wq.sw_pi, MLX5_OPCODE_SEND, 0, iface->super.qp->qp_num, - uct_ud_mlx5_tx_moderation(iface) | se, wqe_size); + uct_ud_mlx5_tx_moderation(iface, ce_se), wqe_size); uct_ib_mlx5_set_dgram_seg(dgram, &ep->av, ep->is_global ? 
&ep->grh_av : NULL, IBV_QPT_UD); @@ -69,53 +87,83 @@ uct_ud_mlx5_post_send(uct_ud_mlx5_iface_t *iface, uct_ud_mlx5_ep_t *ep, ucs_assert((int16_t)iface->tx.wq.bb_max >= iface->super.tx.available); } -static UCS_F_ALWAYS_INLINE void -uct_ud_mlx5_ep_tx_skb(uct_ud_mlx5_iface_t *iface, uct_ud_mlx5_ep_t *ep, - uct_ud_send_skb_t *skb, uint8_t se, int max_log_sge) +static UCS_F_ALWAYS_INLINE struct mlx5_wqe_ctrl_seg * +uct_ud_mlx5_ep_get_next_wqe(uct_ud_mlx5_iface_t *iface, uct_ud_mlx5_ep_t *ep, + size_t *wqe_size_p, void **next_seg_p) { size_t ctrl_av_size = uct_ud_mlx5_ep_ctrl_av_size(ep); struct mlx5_wqe_ctrl_seg *ctrl; - struct mlx5_wqe_data_seg *dptr; + void *ptr; - ctrl = iface->tx.wq.curr; - dptr = uct_ib_mlx5_txwq_wrap_exact(&iface->tx.wq, (void*)ctrl + ctrl_av_size); - uct_ib_mlx5_set_data_seg(dptr, skb->neth, skb->len, skb->lkey); - UCT_UD_EP_HOOK_CALL_TX(&ep->super, skb->neth); - uct_ud_mlx5_post_send(iface, ep, se, ctrl, ctrl_av_size + sizeof(*dptr), max_log_sge); -} + ucs_assert((ctrl_av_size % UCT_IB_MLX5_WQE_SEG_SIZE) == 0); -static inline void -uct_ud_mlx5_ep_tx_inl(uct_ud_mlx5_iface_t *iface, uct_ud_mlx5_ep_t *ep, - const void *buf, unsigned length, uint8_t se) -{ - size_t ctrl_av_size = uct_ud_mlx5_ep_ctrl_av_size(ep); - struct mlx5_wqe_ctrl_seg *ctrl; - struct mlx5_wqe_inl_data_seg *inl; + ctrl = iface->tx.wq.curr; + ptr = UCS_PTR_BYTE_OFFSET(ctrl, ctrl_av_size); - ctrl = iface->tx.wq.curr; - inl = uct_ib_mlx5_txwq_wrap_exact(&iface->tx.wq, (void*)ctrl + ctrl_av_size); - inl->byte_count = htonl(length | MLX5_INLINE_SEG); - uct_ib_mlx5_inline_copy(inl + 1, buf, length, &iface->tx.wq); - UCT_UD_EP_HOOK_CALL_TX(&ep->super, (uct_ud_neth_t *)buf); - uct_ud_mlx5_post_send(iface, ep, se, ctrl, - ctrl_av_size + sizeof(*inl) + length, INT_MAX); -} + *wqe_size_p = ctrl_av_size; + *next_seg_p = uct_ib_mlx5_txwq_wrap_exact(&iface->tx.wq, ptr); + return ctrl; +} -static void uct_ud_mlx5_ep_tx_ctl_skb(uct_ud_ep_t *ud_ep, uct_ud_send_skb_t *skb, - int solicited) 
+static uint16_t uct_ud_mlx5_ep_send_ctl(uct_ud_ep_t *ud_ep, uct_ud_send_skb_t *skb, + const uct_ud_iov_t *iov, uint16_t iovcnt, + int flags, int max_log_sge) { uct_ud_mlx5_iface_t *iface = ucs_derived_of(ud_ep->super.super.iface, uct_ud_mlx5_iface_t); - uct_ud_mlx5_ep_t *ep = ucs_derived_of(ud_ep, uct_ud_mlx5_ep_t); - uint8_t se; + uct_ud_mlx5_ep_t *ep = ucs_derived_of(ud_ep, uct_ud_mlx5_ep_t); + struct mlx5_wqe_inl_data_seg *inl; + struct mlx5_wqe_ctrl_seg *ctrl; + struct mlx5_wqe_data_seg *dptr; + uint16_t iov_index; + size_t wqe_size; + void *next_seg; + uint8_t ce_se; + uint16_t sn; + + /* set WQE flags */ + sn = iface->tx.wq.sw_pi; + ce_se = 0; + if (flags & UCT_UD_IFACE_SEND_CTL_FLAG_SOLICITED) { + ce_se |= MLX5_WQE_CTRL_SOLICITED; + } + if (flags & UCT_UD_IFACE_SEND_CTL_FLAG_SIGNALED) { + ce_se |= MLX5_WQE_CTRL_CQ_UPDATE; + } - se = solicited ? MLX5_WQE_CTRL_SOLICITED : 0; - if (skb->len >= iface->super.config.max_inline) { - uct_ud_mlx5_ep_tx_skb(iface, ep, skb, se, INT_MAX); + /* set skb header as inline (if fits the length) or as data pointer */ + ctrl = uct_ud_mlx5_ep_get_next_wqe(iface, ep, &wqe_size, &next_seg); + if (skb->len <= uct_ud_mlx5_max_inline()) { + inl = next_seg; + inl->byte_count = htonl(skb->len | MLX5_INLINE_SEG); + wqe_size += ucs_align_up_pow2(sizeof(*inl) + skb->len, + UCT_IB_MLX5_WQE_SEG_SIZE); + uct_ib_mlx5_inline_copy(inl + 1, skb->neth, skb->len, &iface->tx.wq); } else { - uct_ud_mlx5_ep_tx_inl(iface, ep, skb->neth, skb->len, se); + ucs_assert(!(flags & UCT_UD_IFACE_SEND_CTL_FLAG_INLINE)); + dptr = next_seg; + wqe_size += sizeof(*dptr); + uct_ib_mlx5_set_data_seg(dptr, skb->neth, skb->len, skb->lkey); } + + /* copy IOV from descriptor to WQE */ + dptr = UCS_PTR_BYTE_OFFSET(ctrl, wqe_size); + for (iov_index = 0; iov_index < iovcnt; ++iov_index) { + if (iov[iov_index].length == 0) { + continue; + } + + dptr = uct_ib_mlx5_txwq_wrap_any(&iface->tx.wq, dptr); + uct_ib_mlx5_set_data_seg(dptr, iov[iov_index].buffer, + 
iov[iov_index].length, iov[iov_index].lkey); + wqe_size += sizeof(*dptr); + ++dptr; + } + + uct_ud_mlx5_post_send(iface, ep, ce_se, ctrl, wqe_size, skb->neth, + max_log_sge); + return sn; } static UCS_F_NOINLINE void @@ -149,11 +197,12 @@ uct_ud_mlx5_iface_post_recv(uct_ud_mlx5_iface_t *iface) *iface->rx.wq.dbrec = htonl(pi); } -static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_ep_t, uct_iface_h tl_iface) +static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_ep_t, uct_iface_h tl_iface, + const uct_ep_params_t *params) { uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_mlx5_iface_t); ucs_trace_func(""); - UCS_CLASS_CALL_SUPER_INIT(uct_ud_ep_t, &iface->super); + UCS_CLASS_CALL_SUPER_INIT(uct_ud_ep_t, &iface->super, params); return UCS_OK; } @@ -163,83 +212,161 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ud_mlx5_ep_t) } UCS_CLASS_DEFINE(uct_ud_mlx5_ep_t, uct_ud_ep_t); -static UCS_CLASS_DEFINE_NEW_FUNC(uct_ud_mlx5_ep_t, uct_ep_t, uct_iface_h); +static UCS_CLASS_DEFINE_NEW_FUNC(uct_ud_mlx5_ep_t, uct_ep_t, uct_iface_h, + const uct_ep_params_t*); UCS_CLASS_DEFINE_DELETE_FUNC(uct_ud_mlx5_ep_t, uct_ep_t); -static ucs_status_t -uct_ud_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, - const void *buffer, unsigned length) +/* + * Generic inline+iov post-send function + * The caller should check that header size + sg list would not exceed WQE size. + */ +static UCS_F_ALWAYS_INLINE ucs_status_t +uct_ud_mlx5_ep_inline_iov_post(uct_ep_h tl_ep, uint8_t am_id, + /* inl. header */ const void *header, size_t header_size, + /* inl. 
data */ const void *data, size_t data_size, + /* iov data */ const uct_iov_t *iov, size_t iovcnt, + uint32_t packet_flags, uct_completion_t *comp, + unsigned stat_ops_counter, unsigned stat_bytes_counter, + const char *func_name) { - uct_ud_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t); uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ud_mlx5_iface_t); - size_t ctrl_av_size = uct_ud_mlx5_ep_ctrl_av_size(ep); - struct mlx5_wqe_ctrl_seg *ctrl; + uct_ud_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t); struct mlx5_wqe_inl_data_seg *inl; - uct_ud_am_short_hdr_t *am; - uct_ud_neth_t *neth; + struct mlx5_wqe_ctrl_seg *ctrl; + size_t inline_size, wqe_size; + void *next_seg, *wqe_data; uct_ud_send_skb_t *skb; - size_t wqe_size; + ucs_status_t status; + uct_ud_neth_t *neth; - /* data a written directly into tx wqe, so it is impossible to use - * common ud am code - */ - UCT_CHECK_AM_ID(id); - UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + sizeof(hdr) + length, - 0, iface->super.config.max_inline, "am_short"); + UCT_CHECK_AM_ID(am_id); + UCT_UD_CHECK_ZCOPY_LENGTH(&iface->super, header_size + data_size, + uct_iov_total_length(iov, iovcnt)); + UCT_CHECK_IOV_SIZE(iovcnt, uct_ud_mlx5_max_am_iov(), func_name); uct_ud_enter(&iface->super); skb = uct_ud_ep_get_tx_skb(&iface->super, &ep->super); if (!skb) { - uct_ud_leave(&iface->super); - return UCS_ERR_NO_RESOURCE; + status = UCS_ERR_NO_RESOURCE; + goto out; + } + + ctrl = uct_ud_mlx5_ep_get_next_wqe(iface, ep, &wqe_size, + &next_seg); + inl = next_seg; + inline_size = sizeof(*neth) + header_size + data_size; + inl->byte_count = htonl(inline_size | MLX5_INLINE_SEG); + wqe_size += sizeof(*inl) + inline_size; + skb->len = inline_size; + + /* set network header */ + neth = (void*)(inl + 1); + neth->packet_type = (am_id << UCT_UD_PACKET_AM_ID_SHIFT) | + ep->super.dest_ep_id | + packet_flags; + uct_ud_neth_init_data(&ep->super, neth); + if (!(packet_flags & UCT_UD_PACKET_FLAG_ACK_REQ)) { + /* check for ACK_REQ, 
if not already enabled by packet_flags */ + neth->packet_type |= uct_ud_ep_req_ack(&ep->super) << UCT_UD_PACKET_ACK_REQ_SHIFT; } - ctrl = iface->tx.wq.curr; - /* Set inline segment which has AM id, AM header, and AM payload */ - inl = uct_ib_mlx5_txwq_wrap_exact(&iface->tx.wq, (void*)ctrl + ctrl_av_size); - wqe_size = length + sizeof(*am) + sizeof(*neth); - inl->byte_count = htonl(wqe_size | MLX5_INLINE_SEG); + /* copy inline "header", assume it fits to one BB so we won't have to check + * for QP wrap-around. This is either the "put" header or the 64-bit + * am_short header, not the am_zcopy header. + */ + wqe_data = UCS_PTR_BYTE_OFFSET(neth + 1, header_size); + ucs_assert(wqe_data <= iface->tx.wq.qend); + memcpy(neth + 1, header, header_size); + + /* copy inline "data" */ + uct_ib_mlx5_inline_copy(wqe_data, data, data_size, &iface->tx.wq); + + /* set iov to dptr */ + if (iovcnt > 0) { + wqe_size = ucs_align_up_pow2(wqe_size, UCT_IB_MLX5_WQE_SEG_SIZE); + wqe_size += uct_ib_mlx5_set_data_seg_iov(&iface->tx.wq, + UCS_PTR_BYTE_OFFSET(ctrl, wqe_size), + iov, iovcnt); + } - /* assume that neth and am header fit into one bb */ - ucs_assert(sizeof(*am) + sizeof(*neth) < MLX5_SEND_WQE_BB); - neth = (void*)(inl + 1); - uct_ud_am_set_neth(neth, &ep->super, id); + uct_ud_mlx5_post_send(iface, ep, 0, ctrl, wqe_size, neth, + UCT_IB_MAX_ZCOPY_LOG_SGE(&iface->super.super)); - am = (void*)(neth + 1); - am->hdr = hdr; - uct_ib_mlx5_inline_copy(am + 1, buffer, length, &iface->tx.wq); + memcpy(skb->neth, neth, sizeof(*neth) + header_size); + memcpy(UCS_PTR_BYTE_OFFSET(skb->neth + 1, header_size), data, data_size); - wqe_size += ctrl_av_size + sizeof(*inl); - UCT_CHECK_LENGTH(wqe_size, 0, UCT_IB_MLX5_MAX_SEND_WQE_SIZE, "am_short"); - UCT_UD_EP_HOOK_CALL_TX(&ep->super, neth); - uct_ud_mlx5_post_send(iface, ep, 0, ctrl, wqe_size, INT_MAX); + if (iovcnt > 0) { + uct_ud_skb_set_zcopy_desc(skb, iov, iovcnt, comp); + status = UCS_INPROGRESS; + } else { + status = UCS_OK; + } - skb->len = 
sizeof(*neth) + sizeof(*am); - memcpy(skb->neth, neth, skb->len); - uct_ud_iface_complete_tx_inl(&iface->super, &ep->super, skb, - (char *)skb->neth + skb->len, buffer, length); - UCT_TL_EP_STAT_OP(&ep->super.super, AM, SHORT, sizeof(hdr) + length); + uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb); + uct_ud_ep_ctl_op_del(&ep->super, UCT_UD_EP_OP_ACK|UCT_UD_EP_OP_ACK_REQ); + + UCS_STATS_UPDATE_COUNTER(ep->super.super.stats, stat_ops_counter, 1); + UCS_STATS_UPDATE_COUNTER(ep->super.super.stats, stat_bytes_counter, + header_size + data_size + + uct_iov_total_length(iov, iovcnt)); +out: uct_ud_leave(&iface->super); - return UCS_OK; + return status; +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +uct_ud_mlx5_ep_short_common(uct_ep_h tl_ep, uint8_t am_id, + /* inline header */ const void *header, size_t header_size, + /* inline data */ const void *data, size_t data_size, + uint32_t packet_flags, unsigned stat_ops_counter, + const char *func_name) +{ + UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + header_size + data_size, 0, + uct_ud_mlx5_max_inline(), func_name); + return uct_ud_mlx5_ep_inline_iov_post(tl_ep, am_id, + header, header_size, + data, data_size, + /* iov */ NULL, 0, + packet_flags, + /* completion */ NULL, + stat_ops_counter, + UCT_EP_STAT_BYTES_SHORT, + func_name); +} + +static ucs_status_t +uct_ud_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, + const void *buffer, unsigned length) +{ + return uct_ud_mlx5_ep_short_common(tl_ep, id, + /* inline header */ &hdr, sizeof(hdr), + /* inline data */ buffer, length, + /* packet flags */ UCT_UD_PACKET_FLAG_AM, + UCT_EP_STAT_AM, + "uct_ud_mlx5_ep_am_short"); } static ssize_t uct_ud_mlx5_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_cb, void *arg, unsigned flags) { - uct_ud_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t); + uct_ud_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t); uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ud_mlx5_iface_t); + 
struct mlx5_wqe_ctrl_seg *ctrl; + struct mlx5_wqe_data_seg *dptr; uct_ud_send_skb_t *skb; ucs_status_t status; + size_t wqe_size; + void *next_seg; size_t length; uct_ud_enter(&iface->super); - status = uct_ud_am_common(&iface->super, &ep->super, id, &skb); + status = uct_ud_am_skb_common(&iface->super, &ep->super, id, &skb); if (status != UCS_OK) { uct_ud_leave(&iface->super); return status; @@ -248,7 +375,12 @@ static ssize_t uct_ud_mlx5_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, length = uct_ud_skb_bcopy(skb, pack_cb, arg); UCT_UD_CHECK_BCOPY_LENGTH(&iface->super, length); - uct_ud_mlx5_ep_tx_skb(iface, ep, skb, 0, INT_MAX); + ctrl = uct_ud_mlx5_ep_get_next_wqe(iface, ep, &wqe_size, &next_seg); + dptr = next_seg; + uct_ib_mlx5_set_data_seg(dptr, skb->neth, skb->len, skb->lkey); + uct_ud_mlx5_post_send(iface, ep, 0, ctrl, wqe_size + sizeof(*dptr), + skb->neth, INT_MAX); + uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb); UCT_TL_EP_STAT_OP(&ep->super.super, AM, BCOPY, length); uct_ud_leave(&iface->super); @@ -260,120 +392,34 @@ uct_ud_mlx5_ep_am_zcopy(uct_ep_h tl_ep, uint8_t id, const void *header, unsigned header_length, const uct_iov_t *iov, size_t iovcnt, unsigned flags, uct_completion_t *comp) { - uct_ud_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t); - uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, - uct_ud_mlx5_iface_t); - size_t ctrl_av_size = uct_ud_mlx5_ep_ctrl_av_size(ep); - uct_ud_send_skb_t *skb; - struct mlx5_wqe_ctrl_seg *ctrl; - struct mlx5_wqe_inl_data_seg *inl; - uct_ud_neth_t *neth; - size_t inl_size, wqe_size; + char dummy = 0 ; /* pass dummy pointer to 0-length header to avoid compiler + warnings */ - UCT_CHECK_AM_ID(id); - UCT_CHECK_IOV_SIZE(iovcnt, uct_ib_iface_get_max_iov(&iface->super.super), - "uct_ud_mlx5_ep_am_zcopy"); UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + header_length, 0, UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(UCT_IB_MLX5_AV_FULL_SIZE), "am_zcopy header"); - UCT_UD_CHECK_ZCOPY_LENGTH(&iface->super, header_length, 
- uct_iov_total_length(iov, iovcnt)); - - uct_ud_enter(&iface->super); - - skb = uct_ud_ep_get_tx_skb(&iface->super, &ep->super); - if (!skb) { - uct_ud_leave(&iface->super); - return UCS_ERR_NO_RESOURCE; - } - - ctrl = iface->tx.wq.curr; - inl = uct_ib_mlx5_txwq_wrap_exact(&iface->tx.wq, (void*)ctrl + ctrl_av_size); - inl_size = header_length + sizeof(*neth); - inl->byte_count = htonl(inl_size | MLX5_INLINE_SEG); - - neth = (void*)(inl + 1); - uct_ud_am_set_neth(neth, &ep->super, id); - /* force ACK_REQ because we want to call user completion ASAP */ - neth->packet_type |= UCT_UD_PACKET_FLAG_ACK_REQ; - - uct_ib_mlx5_inline_copy(neth + 1, header, header_length, &iface->tx.wq); - - wqe_size = ucs_align_up_pow2(ctrl_av_size + inl_size + sizeof(*inl), - UCT_IB_MLX5_WQE_SEG_SIZE); - wqe_size += uct_ib_mlx5_set_data_seg_iov(&iface->tx.wq, (void *)ctrl + wqe_size, - iov, iovcnt); - ucs_assert(wqe_size <= UCT_IB_MLX5_MAX_SEND_WQE_SIZE); - - UCT_UD_EP_HOOK_CALL_TX(&ep->super, neth); - uct_ud_mlx5_post_send(iface, ep, 0, ctrl, wqe_size, - UCT_IB_MAX_ZCOPY_LOG_SGE(&iface->super.super)); - - skb->len = sizeof(*neth) + header_length; - memcpy(skb->neth, neth, sizeof(*neth)); - memcpy(skb->neth + 1, header, header_length); - uct_ud_am_set_zcopy_desc(skb, iov, iovcnt, comp); - - uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb); - UCT_TL_EP_STAT_OP(&ep->super.super, AM, ZCOPY, header_length + - uct_iov_total_length(iov, iovcnt)); - uct_ud_leave(&iface->super); - return UCS_INPROGRESS; + return uct_ud_mlx5_ep_inline_iov_post(tl_ep, id, + /* inl. header */ &dummy, 0, + /* inl. 
data */ header, header_length, + /* iov */ iov, iovcnt, + /* packet flags */ UCT_UD_PACKET_FLAG_AM | + UCT_UD_PACKET_FLAG_ACK_REQ, + /* completion */ comp, + UCT_EP_STAT_AM, UCT_EP_STAT_BYTES_ZCOPY, + "uct_ud_mlx5_ep_am_zcopy"); } static ucs_status_t uct_ud_mlx5_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length, uint64_t remote_addr, uct_rkey_t rkey) { - uct_ud_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t); - uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, - uct_ud_mlx5_iface_t); - size_t ctrl_av_size = uct_ud_mlx5_ep_ctrl_av_size(ep); - struct mlx5_wqe_ctrl_seg *ctrl; - struct mlx5_wqe_inl_data_seg *inl; - uct_ud_put_hdr_t *put_hdr; - uct_ud_neth_t *neth; - uct_ud_send_skb_t *skb; - size_t wqe_size; - - uct_ud_enter(&iface->super); - - skb = uct_ud_ep_get_tx_skb(&iface->super, &ep->super); - if (!skb) { - uct_ud_leave(&iface->super); - return UCS_ERR_NO_RESOURCE; - } - - ctrl = iface->tx.wq.curr; - /* Set inline segment which has AM id, AM header, and AM payload */ - inl = uct_ib_mlx5_txwq_wrap_exact(&iface->tx.wq, (void*)ctrl + ctrl_av_size); - wqe_size = length + sizeof(*put_hdr) + sizeof(*neth); - inl->byte_count = htonl(wqe_size | MLX5_INLINE_SEG); - - /* assume that neth and am header fit into one bb */ - ucs_assert(sizeof(*put_hdr) + sizeof(*neth) < MLX5_SEND_WQE_BB); - neth = (void*)(inl + 1); - uct_ud_neth_init_data(&ep->super, neth); - uct_ud_neth_set_type_put(&ep->super, neth); - uct_ud_neth_ack_req(&ep->super, neth); - - put_hdr = (uct_ud_put_hdr_t *)(neth+1); - put_hdr->rva = remote_addr; - - uct_ib_mlx5_inline_copy(put_hdr + 1, buffer, length, &iface->tx.wq); - - wqe_size += ctrl_av_size + sizeof(*inl); - UCT_CHECK_LENGTH(wqe_size, 0, UCT_IB_MLX5_MAX_SEND_WQE_SIZE, "put_short"); - UCT_UD_EP_HOOK_CALL_TX(&ep->super, neth); - uct_ud_mlx5_post_send(iface, ep, 0, ctrl, wqe_size, INT_MAX); - - skb->len = sizeof(*neth) + sizeof(*put_hdr); - memcpy(skb->neth, neth, skb->len); - uct_ud_iface_complete_tx_inl(&iface->super, 
&ep->super, skb, - (char *)skb->neth + skb->len, buffer, length); - UCT_TL_EP_STAT_OP(&ep->super.super, PUT, SHORT, length); - uct_ud_leave(&iface->super); - return UCS_OK; + uct_ud_put_hdr_t puth = { .rva = remote_addr }; + return uct_ud_mlx5_ep_short_common(tl_ep, 0, + /* inl. header */ &puth, sizeof(puth), + /* inl. data */ buffer, length, + /* packet flags */ UCT_UD_PACKET_FLAG_PUT, + UCT_EP_STAT_PUT, + "uct_ud_mlx5_ep_put_short"); } static UCS_F_ALWAYS_INLINE unsigned @@ -385,11 +431,13 @@ uct_ud_mlx5_iface_poll_rx(uct_ud_mlx5_iface_t *iface, int is_async) uint32_t len; void *packet; unsigned count; + ptrdiff_t rx_hdr_offset; - ci = iface->rx.wq.cq_wqe_counter & iface->rx.wq.mask; - packet = (void *)be64toh(iface->rx.wq.wqes[ci].addr); - ucs_prefetch(packet + UCT_IB_GRH_LEN); - desc = (uct_ib_iface_recv_desc_t *)(packet - iface->super.super.config.rx_hdr_offset); + ci = iface->rx.wq.cq_wqe_counter & iface->rx.wq.mask; + packet = (void *)be64toh(iface->rx.wq.wqes[ci].addr); + ucs_prefetch(UCS_PTR_BYTE_OFFSET(packet, UCT_IB_GRH_LEN)); + rx_hdr_offset = iface->super.super.config.rx_hdr_offset; + desc = UCS_PTR_BYTE_OFFSET(packet, -rx_hdr_offset); cqe = uct_ib_mlx5_poll_cq(&iface->super.super, &iface->cq[UCT_IB_DIR_RX]); if (cqe == NULL) { @@ -409,15 +457,16 @@ uct_ud_mlx5_iface_poll_rx(uct_ud_mlx5_iface_t *iface, int is_async) len = ntohl(cqe->byte_cnt); VALGRIND_MAKE_MEM_DEFINED(packet, len); - if (!uct_ud_iface_check_grh(&iface->super, packet + UCT_IB_GRH_LEN, - (ntohl(cqe->flags_rqpn) >> 28) & 3)) { + if (!uct_ud_iface_check_grh(&iface->super, packet, + uct_ib_mlx5_cqe_is_grh_present(cqe))) { ucs_mpool_put_inline(desc); goto out; } uct_ib_mlx5_log_rx(&iface->super.super, cqe, packet, uct_ud_dump_packet); + /* coverity[tainted_data] */ uct_ud_ep_process_rx(&iface->super, - (uct_ud_neth_t *)(packet + UCT_IB_GRH_LEN), + (uct_ud_neth_t *)UCS_PTR_BYTE_OFFSET(packet, UCT_IB_GRH_LEN), len - UCT_IB_GRH_LEN, (uct_ud_recv_skb_t *)ucs_unaligned_ptr(desc), is_async); out: 
@@ -432,9 +481,10 @@ uct_ud_mlx5_iface_poll_rx(uct_ud_mlx5_iface_t *iface, int is_async) } static UCS_F_ALWAYS_INLINE unsigned -uct_ud_mlx5_iface_poll_tx(uct_ud_mlx5_iface_t *iface) +uct_ud_mlx5_iface_poll_tx(uct_ud_mlx5_iface_t *iface, int is_async) { struct mlx5_cqe64 *cqe; + uint16_t hw_ci; cqe = uct_ib_mlx5_poll_cq(&iface->super.super, &iface->cq[UCT_IB_DIR_TX]); if (cqe == NULL) { @@ -444,8 +494,11 @@ uct_ud_mlx5_iface_poll_tx(uct_ud_mlx5_iface_t *iface) ucs_memory_cpu_load_fence(); uct_ib_mlx5_log_cqe(cqe); - iface->super.tx.available = uct_ib_mlx5_txwq_update_bb(&iface->tx.wq, - ntohs(cqe->wqe_counter)); + hw_ci = ntohs(cqe->wqe_counter); + iface->super.tx.available = uct_ib_mlx5_txwq_update_bb(&iface->tx.wq, hw_ci); + + uct_ud_iface_send_completion(&iface->super, hw_ci, is_async); + return 1; } @@ -456,7 +509,7 @@ static unsigned uct_ud_mlx5_iface_progress(uct_iface_h tl_iface) unsigned n, count = 0; uct_ud_enter(&iface->super); - uct_ud_iface_dispatch_zcopy_comps(&iface->super); + uct_ud_iface_dispatch_async_comps(&iface->super); status = uct_ud_iface_dispatch_pending_rx(&iface->super); if (ucs_likely(status == UCS_OK)) { @@ -466,7 +519,7 @@ static unsigned uct_ud_mlx5_iface_progress(uct_iface_h tl_iface) } while ((n > 0) && (count < iface->super.super.config.rx_max_poll)); } - count += uct_ud_mlx5_iface_poll_tx(iface); + count += uct_ud_mlx5_iface_poll_tx(iface, 0); uct_ud_iface_progress_pending(&iface->super, 0); uct_ud_leave(&iface->super); return count; @@ -481,9 +534,9 @@ static unsigned uct_ud_mlx5_iface_async_progress(uct_ud_iface_t *ud_iface) do { n = uct_ud_mlx5_iface_poll_rx(iface, 1); count += n; - } while (n > 0); + } while ((n > 0) && (count < iface->super.rx.async_max_poll)); - count += uct_ud_mlx5_iface_poll_tx(iface); + count += uct_ud_mlx5_iface_poll_tx(iface, 1); uct_ud_iface_progress_pending(&iface->super, 1); @@ -497,23 +550,22 @@ uct_ud_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) ucs_status_t status; 
ucs_trace_func(""); - status = uct_ud_iface_query(iface, iface_attr); + + status = uct_ud_iface_query(iface, iface_attr, uct_ud_mlx5_max_am_iov(), + UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(UCT_IB_MLX5_AV_FULL_SIZE) + - sizeof(uct_ud_neth_t)); if (status != UCS_OK) { return status; } - iface_attr->overhead = 80e-9; /* Software overhead */ - iface_attr->cap.am.max_iov = uct_ib_iface_get_max_iov(&iface->super); - - iface_attr->cap.am.max_hdr = UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(UCT_IB_MLX5_AV_FULL_SIZE) - - sizeof(uct_ud_neth_t); + iface_attr->overhead = 80e-9; /* Software overhead */ return UCS_OK; } static ucs_status_t uct_ud_mlx5_ep_create_ah(uct_ud_mlx5_iface_t *iface, uct_ud_mlx5_ep_t *ep, - const uct_ib_address_t *ib_addr, + const uct_ib_address_t *ib_addr, unsigned path_index, const uct_ud_iface_addr_t *if_addr) { ucs_status_t status; @@ -521,8 +573,8 @@ uct_ud_mlx5_ep_create_ah(uct_ud_mlx5_iface_t *iface, uct_ud_mlx5_ep_t *ep, int is_global; status = uct_ud_mlx5_iface_get_av(&iface->super.super, &iface->ud_mlx5_common, - ib_addr, ep->super.path_bits, &ep->av, - &ep->grh_av, &is_global); + ib_addr, path_index, &ep->av, &ep->grh_av, + &is_global); if (status != UCS_OK) { return status; } @@ -530,8 +582,6 @@ uct_ud_mlx5_ep_create_ah(uct_ud_mlx5_iface_t *iface, uct_ud_mlx5_ep_t *ep, remote_qpn = uct_ib_unpack_uint24(if_addr->qp_num); ep->is_global = is_global; ep->av.dqp_dct |= htonl(remote_qpn); - uct_ib_mlx5_iface_set_av_sport(&iface->super.super, &ep->av, - remote_qpn ^ iface->super.qp->qp_num); return UCS_OK; } @@ -539,7 +589,7 @@ static ucs_status_t uct_ud_mlx5_ep_create_connected(uct_iface_h iface_h, const uct_device_addr_t *dev_addr, const uct_iface_addr_t *iface_addr, - uct_ep_h *new_ep_p) + unsigned path_index, uct_ep_h *new_ep_p) { uct_ud_mlx5_iface_t *iface = ucs_derived_of(iface_h, uct_ud_mlx5_iface_t); uct_ud_mlx5_ep_t *ep; @@ -551,7 +601,7 @@ uct_ud_mlx5_ep_create_connected(uct_iface_h iface_h, uct_ud_enter(&iface->super); status = 
uct_ud_ep_create_connected_common(&iface->super, ib_addr, if_addr, - &new_ud_ep, &skb); + path_index, &new_ud_ep, &skb); if (status != UCS_OK && status != UCS_ERR_NO_RESOURCE && status != UCS_ERR_ALREADY_EXISTS) { @@ -560,13 +610,15 @@ uct_ud_mlx5_ep_create_connected(uct_iface_h iface_h, } ep = ucs_derived_of(new_ud_ep, uct_ud_mlx5_ep_t); + /* cppcheck-suppress autoVariables */ *new_ep_p = &ep->super.super.super; if (status == UCS_ERR_ALREADY_EXISTS) { uct_ud_leave(&iface->super); return UCS_OK; } - status_ah = uct_ud_mlx5_ep_create_ah(iface, ep, ib_addr, if_addr); + status_ah = uct_ud_mlx5_ep_create_ah(iface, ep, ib_addr, + ep->super.path_index, if_addr); if (status_ah != UCS_OK) { uct_ud_ep_destroy_connected(&ep->super, ib_addr, if_addr); *new_ep_p = NULL; @@ -575,7 +627,7 @@ uct_ud_mlx5_ep_create_connected(uct_iface_h iface_h, } if (status == UCS_OK) { - uct_ud_mlx5_ep_tx_ctl_skb(&ep->super, skb, 1); + uct_ud_mlx5_ep_send_ctl(&ep->super, skb, NULL, 0, 1, 1); uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb); ep->super.flags |= UCT_UD_EP_FLAG_CREQ_SENT; } @@ -590,10 +642,12 @@ uct_ud_mlx5_ep_create(const uct_ep_params_t* params, uct_ep_h *ep_p) if (ucs_test_all_flags(params->field_mask, UCT_EP_PARAM_FIELD_DEV_ADDR | UCT_EP_PARAM_FIELD_IFACE_ADDR)) { return uct_ud_mlx5_ep_create_connected(params->iface, params->dev_addr, - params->iface_addr, ep_p); + params->iface_addr, + UCT_EP_PARAMS_GET_PATH_INDEX(params), + ep_p); } - return uct_ud_mlx5_ep_t_new(params->iface, ep_p); + return uct_ud_mlx5_ep_t_new(params->iface, params, ep_p); } @@ -615,7 +669,8 @@ uct_ud_mlx5_ep_connect_to_ep(uct_ep_h tl_ep, return status; } - status = uct_ud_mlx5_ep_create_ah(iface, ep, ib_addr, (const uct_ud_iface_addr_t *)ep_addr); + status = uct_ud_mlx5_ep_create_ah(iface, ep, ib_addr, ep->super.path_index, + (const uct_ud_iface_addr_t *)ep_addr); if (status != UCS_OK) { return status; } @@ -653,26 +708,24 @@ static void uct_ud_mlx5_iface_event_cq(uct_ib_iface_t *ib_iface, } 
static ucs_status_t uct_ud_mlx5_iface_create_qp(uct_ib_iface_t *ib_iface, - uct_ib_qp_attr_t *attr, + uct_ib_qp_attr_t *ib_attr, struct ibv_qp **qp_p) { uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t); + uct_ib_mlx5_qp_t *qp = &iface->tx.wq.super; + uct_ib_mlx5_qp_attr_t attr = {}; + ucs_status_t status; - return uct_ib_mlx5_iface_create_qp(ib_iface, &iface->mlx5_common, attr, qp_p); -} - -static ucs_status_t uct_ud_mlx5_init_res_domain(uct_ib_iface_t *ib_iface) -{ - uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t); - - return uct_ib_mlx5_iface_init_res_domain(ib_iface, &iface->mlx5_common); -} + attr.super = *ib_attr; + attr.mmio_mode = UCT_IB_MLX5_MMIO_MODE_LAST; -static void uct_ud_mlx5_cleanup_res_domain(uct_ib_iface_t *ib_iface) -{ - uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t); + status = uct_ib_mlx5_iface_create_qp(ib_iface, qp, &attr); + if (status != UCS_OK) { + return status; + } - uct_ib_mlx5_iface_cleanup_res_domain(&iface->mlx5_common); + *qp_p = qp->verbs.qp; + return status; } static void UCS_CLASS_DELETE_FUNC_NAME(uct_ud_mlx5_iface_t)(uct_iface_t*); @@ -682,13 +735,9 @@ static void uct_ud_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface, void *arg { uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t); - if (status == UCS_ERR_ENDPOINT_TIMEOUT) { - uct_ud_iface_handle_failure(ib_iface, arg, status); - } else { - /* Local side failure - treat as fatal */ - uct_ib_mlx5_completion_with_err(ib_iface, arg, &iface->tx.wq, - UCS_LOG_LEVEL_FATAL); - } + /* Local side failure - treat as fatal */ + uct_ib_mlx5_completion_with_err(ib_iface, arg, &iface->tx.wq, + UCS_LOG_LEVEL_FATAL); } static uct_ud_iface_ops_t uct_ud_mlx5_iface_ops = { @@ -709,9 +758,10 @@ static uct_ud_iface_ops_t uct_ud_mlx5_iface_ops = { .iface_flush = uct_ud_iface_flush, .iface_fence = uct_base_iface_fence, .iface_progress_enable = uct_ud_iface_progress_enable, - 
.iface_progress_disable = uct_base_iface_progress_disable, + .iface_progress_disable = uct_ud_iface_progress_disable, .iface_progress = uct_ud_mlx5_iface_progress, - .iface_event_fd_get = uct_ib_iface_event_fd_get, + .iface_event_fd_get = (uct_iface_event_fd_get_func_t) + ucs_empty_function_return_unsupported, .iface_event_arm = uct_ud_iface_event_arm, .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_ud_mlx5_iface_t), .iface_query = uct_ud_mlx5_iface_query, @@ -724,13 +774,11 @@ static uct_ud_iface_ops_t uct_ud_mlx5_iface_ops = { .event_cq = uct_ud_mlx5_iface_event_cq, .handle_failure = uct_ud_mlx5_iface_handle_failure, .set_ep_failed = uct_ud_mlx5_ep_set_failed, - .create_qp = uct_ud_mlx5_iface_create_qp, - .init_res_domain = uct_ud_mlx5_init_res_domain, - .cleanup_res_domain = uct_ud_mlx5_cleanup_res_domain, }, .async_progress = uct_ud_mlx5_iface_async_progress, - .tx_skb = uct_ud_mlx5_ep_tx_ctl_skb, - .ep_free = UCS_CLASS_DELETE_FUNC_NAME(uct_ud_mlx5_ep_t) + .send_ctl = uct_ud_mlx5_ep_send_ctl, + .ep_free = UCS_CLASS_DELETE_FUNC_NAME(uct_ud_mlx5_ep_t), + .create_qp = uct_ud_mlx5_iface_create_qp, }; static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_iface_t, @@ -746,13 +794,16 @@ static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_iface_t, ucs_trace_func(""); - init_attr.flags = UCT_IB_CQ_IGNORE_OVERRUN; + init_attr.flags = UCT_IB_CQ_IGNORE_OVERRUN; + init_attr.cq_len[UCT_IB_DIR_TX] = config->super.super.tx.queue_len * UCT_IB_MLX5_MAX_BB; + init_attr.cq_len[UCT_IB_DIR_RX] = config->super.super.rx.queue_len; + + self->tx.wq.super.type = UCT_IB_MLX5_OBJ_TYPE_LAST; UCS_CLASS_CALL_SUPER_INIT(uct_ud_iface_t, &uct_ud_mlx5_iface_ops, md, worker, params, &config->super, &init_attr); - uct_ib_iface_set_max_iov(&self->super.super, UCT_IB_MLX5_AM_ZCOPY_MAX_IOV); - self->super.config.max_inline = UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE); + self->super.config.max_inline = uct_ud_mlx5_max_inline(); status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_TX], &self->cq[UCT_IB_DIR_TX]); if 
(status != UCS_OK) { @@ -770,13 +821,17 @@ static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_iface_t, if (status != UCS_OK) { return status; } + self->super.tx.available = self->tx.wq.bb_max; + ucs_assert(init_attr.cq_len[UCT_IB_DIR_TX] >= self->tx.wq.bb_max); status = uct_ib_mlx5_get_rxwq(self->super.qp, &self->rx.wq); if (status != UCS_OK) { return status; } + ucs_assert(init_attr.cq_len[UCT_IB_DIR_RX] > self->rx.wq.mask); + status = uct_ud_mlx5_iface_common_init(&self->super.super, &self->ud_mlx5_common, &config->ud_mlx5_common); @@ -808,7 +863,6 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ud_mlx5_iface_t) uct_ud_iface_remove_async_handlers(&self->super); uct_ud_enter(&self->super); UCT_UD_IFACE_DELETE_EPS(&self->super, uct_ud_mlx5_ep_t); - ucs_twheel_cleanup(&self->super.async.slow_timer); uct_ib_mlx5_txwq_cleanup(&self->tx.wq); uct_ud_leave(&self->super); } @@ -822,22 +876,15 @@ static UCS_CLASS_DEFINE_NEW_FUNC(uct_ud_mlx5_iface_t, uct_iface_t, uct_md_h, static UCS_CLASS_DEFINE_DELETE_FUNC(uct_ud_mlx5_iface_t, uct_iface_t); static ucs_status_t -uct_ud_mlx5_query_resources(uct_md_h md, - uct_tl_resource_desc_t **resources_p, - unsigned *num_resources_p) +uct_ud_mlx5_query_tl_devices(uct_md_h md, + uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) { - ucs_trace_func(""); - /* TODO take transport overhead into account */ - return uct_ib_device_query_tl_resources(&ucs_derived_of(md, uct_ib_md_t)->dev, - "ud_mlx5", UCT_IB_DEVICE_FLAG_MLX5_PRM, - resources_p, num_resources_p); -} - -UCT_TL_COMPONENT_DEFINE(uct_ud_mlx5_tl, - uct_ud_mlx5_query_resources, - uct_ud_mlx5_iface_t, - "ud_mlx5", - "UD_MLX5_", - uct_ud_mlx5_iface_config_table, - uct_ud_mlx5_iface_config_t); -UCT_MD_REGISTER_TL(&uct_ib_mdc, &uct_ud_mlx5_tl); + uct_ib_md_t *ib_md = ucs_derived_of(md, uct_ib_md_t); + return uct_ib_device_query_ports(&ib_md->dev, UCT_IB_DEVICE_FLAG_MLX5_PRM, + tl_devices_p, num_tl_devices_p); +} + +UCT_TL_DEFINE(&uct_ib_component, ud_mlx5, uct_ud_mlx5_query_tl_devices, + 
uct_ud_mlx5_iface_t, "UD_MLX5_", uct_ud_mlx5_iface_config_table, + uct_ud_mlx5_iface_config_t); diff --git a/src/uct/ib/ud/accel/ud_mlx5.h b/src/uct/ib/ud/accel/ud_mlx5.h index a55c788a6e5..bf1d44ec212 100644 --- a/src/uct/ib/ud/accel/ud_mlx5.h +++ b/src/uct/ib/ud/accel/ud_mlx5.h @@ -29,7 +29,6 @@ typedef struct { typedef struct { uct_ud_iface_t super; - uct_ib_mlx5_iface_common_t mlx5_common; struct { uct_ib_mlx5_txwq_t wq; } tx; @@ -41,14 +40,17 @@ typedef struct { } uct_ud_mlx5_iface_t; -static inline unsigned uct_ud_mlx5_tx_moderation(uct_ud_mlx5_iface_t *iface) +static UCS_F_ALWAYS_INLINE unsigned +uct_ud_mlx5_tx_moderation(uct_ud_mlx5_iface_t *iface, uint8_t ce_se) { - if (iface->super.tx.unsignaled >= UCT_UD_TX_MODERATION) { + if ((ce_se & MLX5_WQE_CTRL_CQ_UPDATE) || + (iface->super.tx.unsignaled >= (UCT_UD_TX_MODERATION - 1))) { iface->super.tx.unsignaled = 0; - return MLX5_WQE_CTRL_CQ_UPDATE; + return ce_se | MLX5_WQE_CTRL_CQ_UPDATE; } + iface->super.tx.unsignaled++; - return 0; + return ce_se; } #endif diff --git a/src/uct/ib/ud/accel/ud_mlx5_common.c b/src/uct/ib/ud/accel/ud_mlx5_common.c index 23b56dc6fdc..9d2064d9d72 100644 --- a/src/uct/ib/ud/accel/ud_mlx5_common.c +++ b/src/uct/ib/ud/accel/ud_mlx5_common.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ud_mlx5_common.h" @@ -31,7 +35,7 @@ ucs_status_t uct_ud_mlx5_iface_common_init(uct_ib_iface_t *ib_iface, ucs_status_t uct_ud_mlx5_iface_get_av(uct_ib_iface_t *iface, uct_ud_mlx5_iface_common_t *ud_common_iface, const uct_ib_address_t *ib_addr, - uint8_t path_bits, + unsigned path_index, uct_ib_mlx5_base_av_t *base_av, struct mlx5_grh_av *grh_av, int *is_global) @@ -40,8 +44,10 @@ ucs_status_t uct_ud_mlx5_iface_get_av(uct_ib_iface_t *iface, struct ibv_ah *ah; struct mlx5_wqe_av mlx5_av; struct ibv_ah_attr ah_attr; + enum ibv_mtu path_mtu; - uct_ib_iface_fill_ah_attr_from_addr(iface, ib_addr, path_bits, &ah_attr); + uct_ib_iface_fill_ah_attr_from_addr(iface, ib_addr, path_index, &ah_attr, + &path_mtu); status = uct_ib_iface_create_ah(iface, &ah_attr, &ah); if (status != UCS_OK) { return status; diff --git a/src/uct/ib/ud/accel/ud_mlx5_common.h b/src/uct/ib/ud/accel/ud_mlx5_common.h index 70f6f628ebe..5736e6e6ffc 100644 --- a/src/uct/ib/ud/accel/ud_mlx5_common.h +++ b/src/uct/ib/ud/accel/ud_mlx5_common.h @@ -42,7 +42,7 @@ ucs_status_t uct_ud_mlx5_iface_common_init(uct_ib_iface_t *ib_iface, ucs_status_t uct_ud_mlx5_iface_get_av(uct_ib_iface_t *iface, uct_ud_mlx5_iface_common_t *ud_common_iface, const uct_ib_address_t *ib_addr, - uint8_t path_bits, + unsigned path_index, uct_ib_mlx5_base_av_t *base_av, struct mlx5_grh_av *grh_av, int *is_global); diff --git a/src/uct/ib/ud/base/ud_def.h b/src/uct/ib/ud/base/ud_def.h index b4f2e8312b0..313ad741f37 100644 --- a/src/uct/ib/ud/base/ud_def.h +++ b/src/uct/ib/ud/base/ud_def.h @@ -16,7 +16,6 @@ #define UCT_UD_QP_HASH_SIZE 256 #define UCT_UD_TX_MODERATION 64 -#define UCT_UD_MIN_INLINE 48 #define UCT_UD_HASH_SIZE 997 #define UCT_UD_RX_BATCH_MIN 8 @@ -27,6 +26,7 @@ #define UCT_UD_CA_DUP_ACK_CNT 2 /* TODO: not implemented yet */ #define UCT_UD_RESENDS_PER_ACK 4 /* request per every N resends */ #define UCT_UD_SKB_ALIGN UCS_SYS_CACHE_LINE_SIZE +#define 
UCT_UD_SKIP_SWEEP 8 /* note that the ud tx window is [acked_psn+1, max_psn) * and max_psn = acked_psn + cwnd @@ -126,11 +126,21 @@ typedef struct uct_ud_neth { enum { - UCT_UD_SEND_SKB_FLAG_ACK_REQ = UCS_BIT(1), /* ACK was requested for this skb */ - UCT_UD_SEND_SKB_FLAG_COMP = UCS_BIT(2), /* This skb contains a completion */ - UCT_UD_SEND_SKB_FLAG_ZCOPY = UCS_BIT(3), /* This skb contains a zero-copy segment */ - UCT_UD_SEND_SKB_FLAG_ERR = UCS_BIT(4), /* This skb contains a status after failure */ - UCT_UD_SEND_SKB_FLAG_CANCEL = UCS_BIT(5) /* This skb contains a UCS_ERR_CANCEL status */ + UCT_UD_SEND_SKB_FLAG_ACK_REQ = UCS_BIT(0), /* ACK was requested for this skb */ + UCT_UD_SEND_SKB_FLAG_COMP = UCS_BIT(1), /* This skb contains a completion */ + UCT_UD_SEND_SKB_FLAG_ZCOPY = UCS_BIT(2), /* This skb contains a zero-copy segment */ + UCT_UD_SEND_SKB_FLAG_RESENDING = UCS_BIT(3), /* An active control skb refers to this skb */ + +#if UCS_ENABLE_ASSERT + UCT_UD_SEND_SKB_FLAG_CTL_ACK = UCS_BIT(5), /* This is a control-ack skb */ + UCT_UD_SEND_SKB_FLAG_CTL_RESEND = UCS_BIT(6), /* This is a control-resend rsb */ + UCT_UD_SEND_SKB_FLAG_INVALID = UCS_BIT(7) /* skb is released */ + +#else + UCT_UD_SEND_SKB_FLAG_CTL_ACK = 0, + UCT_UD_SEND_SKB_FLAG_CTL_RESEND = 0, + UCT_UD_SEND_SKB_FLAG_INVALID = 0 +#endif }; @@ -144,15 +154,17 @@ typedef struct uct_ud_send_skb { ucs_queue_elem_t queue; /* in send window */ uint32_t lkey; uint16_t len; /* data size */ - uint8_t flags; - int8_t status; /* used in case of failure */ + uint16_t flags; uct_ud_neth_t neth[0]; } UCS_S_PACKED UCS_V_ALIGNED(UCT_UD_SKB_ALIGN) uct_ud_send_skb_t; +/* + * Call user completion handler + */ typedef struct uct_ud_comp_desc { uct_completion_t *comp; - uct_ud_ep_t *ep; + ucs_status_t status; /* used in case of failure */ } uct_ud_comp_desc_t; @@ -161,10 +173,22 @@ typedef struct uct_ud_comp_desc { */ typedef struct uct_ud_iov { void *buffer; /**< Data buffer */ - uint16_t length; /**< Length of the buffer in 
bytes */ + uint32_t lkey; /**< Lkey for memory region */ + uint16_t length; /**< Length of the buffer in bytes */ } UCS_S_PACKED uct_ud_iov_t; +typedef struct uct_ud_ctl_desc { + ucs_queue_elem_t queue; /* Queue element in outstanding queue */ + uint16_t sn; /* Sequence number in outstanding queue */ + uct_ud_send_skb_t *self_skb; /* Back-pointer to owner skb */ + uct_ud_send_skb_t *resent_skb; /* For resend skb: points to a re-sent + skb in the window, can be NULL */ + uct_ud_ep_t *ep; /* For resend skb: points to the endpoint + on which the resend was made */ +} uct_ud_ctl_desc_t; + + typedef struct uct_ud_zcopy_desc { uct_ud_comp_desc_t super; uct_ud_iov_t iov[UCT_IB_MAX_IOV]; @@ -172,12 +196,6 @@ typedef struct uct_ud_zcopy_desc { } uct_ud_zcopy_desc_t; -typedef struct uct_ud_send_skb_inl { - uct_ud_send_skb_t super; - char data[sizeof(uct_ud_neth_t)]; /* placeholder for super.neth */ -} uct_ud_send_skb_inl_t; - - typedef struct uct_ud_recv_skb { uct_ib_iface_recv_desc_t super; union { @@ -233,18 +251,26 @@ static inline void uct_ud_neth_set_am_id(uct_ud_neth_t *neth, uint8_t id) neth->packet_type |= (id << UCT_UD_PACKET_AM_ID_SHIFT); } +static inline uct_ud_ctl_desc_t *uct_ud_ctl_desc(uct_ud_send_skb_t *skb) +{ + ucs_assert(skb->flags & (UCT_UD_SEND_SKB_FLAG_CTL_ACK | + UCT_UD_SEND_SKB_FLAG_CTL_RESEND)); + ucs_assert(!(skb->flags & UCT_UD_SEND_SKB_FLAG_INVALID)); + return (uct_ud_ctl_desc_t*)((char*)skb->neth + skb->len); +} + static inline uct_ud_comp_desc_t *uct_ud_comp_desc(uct_ud_send_skb_t *skb) { - ucs_assert(skb->flags & (UCT_UD_SEND_SKB_FLAG_COMP | - UCT_UD_SEND_SKB_FLAG_ERR | - UCT_UD_SEND_SKB_FLAG_CANCEL)); - return (uct_ud_comp_desc_t*)((char *)skb->neth + skb->len); + ucs_assert(skb->flags & UCT_UD_SEND_SKB_FLAG_COMP); + ucs_assert(!(skb->flags & UCT_UD_SEND_SKB_FLAG_INVALID)); + return (uct_ud_comp_desc_t*)((char*)skb->neth + skb->len); } static inline uct_ud_zcopy_desc_t *uct_ud_zcopy_desc(uct_ud_send_skb_t *skb) { ucs_assert(skb->flags & 
UCT_UD_SEND_SKB_FLAG_ZCOPY); - return (uct_ud_zcopy_desc_t*)((char *)skb->neth + skb->len); + ucs_assert(!(skb->flags & UCT_UD_SEND_SKB_FLAG_INVALID)); + return (uct_ud_zcopy_desc_t*)((char*)skb->neth + skb->len); } diff --git a/src/uct/ib/ud/base/ud_ep.c b/src/uct/ib/ud/base/ud_ep.c index acfcc2ef9bb..c83e4874868 100644 --- a/src/uct/ib/ud/base/ud_ep.c +++ b/src/uct/ib/ud/base/ud_ep.c @@ -1,9 +1,14 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ud_ep.h" #include "ud_iface.h" #include "ud_inl.h" @@ -25,7 +30,7 @@ static void uct_ud_ep_do_pending_ctl(uct_ud_ep_t *ep, uct_ud_iface_t *iface); static void uct_ud_peer_name(uct_ud_peer_name_t *peer) { - gethostname(peer->name, sizeof(peer->name)); + ucs_strncpy_zero(peer->name, ucs_get_host_name(), sizeof(peer->name)); peer->pid = getpid(); } @@ -53,9 +58,19 @@ static void uct_ud_ep_resend_start(uct_ud_iface_t *iface, uct_ud_ep_t *ep) uct_ud_ep_ctl_op_add(iface, ep, UCT_UD_EP_OP_RESEND); } +static void uct_ud_ep_resend_end(uct_ud_ep_t *ep) +{ + uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_RESEND); + ep->flags &= ~UCT_UD_EP_FLAG_TX_NACKED; +} -static void uct_ud_ep_resend_ack(uct_ud_iface_t *iface, uct_ud_ep_t *ep) +static UCS_F_ALWAYS_INLINE void +uct_ud_ep_resend_ack(uct_ud_iface_t *iface, uct_ud_ep_t *ep) { + if (ucs_likely(UCT_UD_PSN_COMPARE(ep->resend.psn, >, ep->resend.max_psn))) { + return; + } + if (UCT_UD_PSN_COMPARE(ep->tx.acked_psn, <, ep->resend.max_psn)) { /* new ack arrived that acked something in our resend window. 
*/ if (UCT_UD_PSN_COMPARE(ep->resend.psn, <=, ep->tx.acked_psn)) { @@ -68,11 +83,10 @@ static void uct_ud_ep_resend_ack(uct_ud_iface_t *iface, uct_ud_ep_t *ep) } else { /* everything in resend window was acked - no need to resend anymore */ ep->resend.psn = ep->resend.max_psn + 1; - uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_RESEND); + uct_ud_ep_resend_end(ep); } } - static void uct_ud_ep_ca_drop(uct_ud_ep_t *ep) { ucs_debug("ep: %p ca drop@cwnd = %d in flight: %d", @@ -90,7 +104,7 @@ static void uct_ud_ep_ca_drop(uct_ud_ep_t *ep) static UCS_F_ALWAYS_INLINE void uct_ud_ep_ca_ack(uct_ud_ep_t *ep) { - if (ep->ca.cwnd < UCT_UD_CA_MAX_WINDOW) { + if (ep->ca.cwnd < ep->ca.wmax) { ep->ca.cwnd += UCT_UD_CA_AI_VALUE; } ep->tx.max_psn = ep->tx.acked_psn + ep->ca.cwnd; @@ -101,6 +115,8 @@ static void uct_ud_ep_reset(uct_ud_ep_t *ep) { ep->tx.psn = UCT_UD_INITIAL_PSN; ep->ca.cwnd = UCT_UD_CA_MIN_WINDOW; + ep->ca.wmax = ucs_derived_of(ep->super.super.iface, + uct_ud_iface_t)->config.max_window; ep->tx.max_psn = ep->tx.psn + ep->ca.cwnd; ep->tx.acked_psn = UCT_UD_INITIAL_PSN - 1; ep->tx.pending.ops = UCT_UD_EP_OP_NONE; @@ -109,6 +125,7 @@ static void uct_ud_ep_reset(uct_ud_ep_t *ep) ep->resend.pos = ucs_queue_iter_begin(&ep->tx.window); ep->resend.psn = ep->tx.psn; ep->resend.max_psn = ep->tx.acked_psn; + ep->tx.resend_count = 0; ep->rx_creq_count = 0; ep->rx.acked_psn = UCT_UD_INITIAL_PSN - 1; @@ -122,7 +139,7 @@ static ucs_status_t uct_ud_ep_free_by_timeout(uct_ud_ep_t *ep, uct_ud_iface_ops_t *ops; ucs_time_t diff; - diff = ucs_twheel_get_time(&iface->async.slow_timer) - ep->close_time; + diff = ucs_twheel_get_time(&iface->tx.timer) - ep->close_time; if (diff > iface->config.peer_timeout) { ucs_debug("ud_ep %p is destroyed after %fs with timeout %fs\n", ep, ucs_time_to_sec(diff), @@ -134,56 +151,190 @@ static ucs_status_t uct_ud_ep_free_by_timeout(uct_ud_ep_t *ep, return UCS_INPROGRESS; } -static void uct_ud_ep_slow_timer(ucs_wtimer_t *self) +static UCS_F_ALWAYS_INLINE int 
+uct_ud_skb_is_completed(uct_ud_send_skb_t *skb, uct_ud_psn_t ack_psn) { - uct_ud_ep_t *ep = ucs_container_of(self, uct_ud_ep_t, slow_timer); - uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, - uct_ud_iface_t); - ucs_time_t now; - ucs_time_t diff; - ucs_status_t status; + ucs_assert(!(skb->flags & UCT_UD_SEND_SKB_FLAG_INVALID)); + return UCT_UD_PSN_COMPARE(skb->neth->psn, <=, ack_psn) && + !(skb->flags & UCT_UD_SEND_SKB_FLAG_RESENDING); +} + +static UCS_F_ALWAYS_INLINE void +uct_ud_ep_window_release_inline(uct_ud_iface_t *iface, uct_ud_ep_t *ep, + uct_ud_psn_t ack_psn, ucs_status_t status, + int is_async, int invalidate_resend, int dummy_ack) +{ + uct_ud_send_skb_t *skb; + + ucs_queue_for_each_extract(skb, &ep->tx.window, queue, + uct_ud_skb_is_completed(skb, ack_psn)) { + if (invalidate_resend && (ep->resend.pos == &skb->queue.next)) { + ep->resend.pos = ucs_queue_iter_begin(&ep->tx.window); + ep->resend.psn = ep->tx.acked_psn + 1; + } + if (ucs_likely(!(skb->flags & UCT_UD_SEND_SKB_FLAG_COMP))) { + /* fast path case: skb without completion callback */ + uct_ud_skb_release(skb, 1, dummy_ack, ep); + } else if (ucs_likely(!is_async)) { + /* dispatch user completion immediately */ + uct_ud_iface_dispatch_comp(iface, uct_ud_comp_desc(skb)->comp, + status); + uct_ud_skb_release(skb, 1, dummy_ack, ep); + } else { + /* Don't call user completion from async context. Instead, put + * it on a queue which will be progressed from main thread. 
+ */ + uct_ud_iface_add_async_comp(iface, skb, status); + } + } +} + +static UCS_F_NOINLINE void +uct_ud_ep_window_release(uct_ud_ep_t *ep, ucs_status_t status, int is_async) +{ + uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t); + + uct_ud_ep_window_release_inline(iface, ep, ep->tx.acked_psn, status, is_async, 0, 0); +} + +void uct_ud_ep_window_release_completed(uct_ud_ep_t *ep, int is_async) +{ + uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t); + + uct_ud_ep_window_release_inline(iface, ep, ep->tx.acked_psn, UCS_OK, is_async, 1, 0); +} + +static void uct_ud_ep_purge_outstanding(uct_ud_ep_t *ep) +{ + uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t); + uct_ud_ctl_desc_t *cdesc; + ucs_queue_iter_t iter; + + ucs_queue_for_each_safe(cdesc, iter, &iface->tx.outstanding_q, queue) { + if (cdesc->ep == ep) { + ucs_queue_del_iter(&iface->tx.outstanding_q, iter); + uct_ud_iface_ctl_skb_complete(iface, cdesc, 0); + } + } + + ucs_assert_always(ep->tx.resend_count == 0); +} + +static void uct_ud_ep_purge(uct_ud_ep_t *ep, ucs_status_t status) +{ + uct_ud_ep_tx_stop(ep); + uct_ud_ep_purge_outstanding(ep); + ep->tx.acked_psn = (uct_ud_psn_t)(ep->tx.psn - 1); + uct_ud_ep_window_release(ep, status, 0); + ucs_assert(ucs_queue_is_empty(&ep->tx.window)); +} + +static unsigned uct_ud_ep_deferred_timeout_handler(void *arg) +{ + uct_ud_ep_t *ep = arg; + uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t); + ucs_status_t status; + + if (ep->flags & UCT_UD_EP_FLAG_DISCONNECTED) { + uct_ud_ep_purge(ep, UCS_ERR_ENDPOINT_TIMEOUT); + return 0; + } + + if (ep->flags & UCT_UD_EP_FLAG_PRIVATE) { + ucs_assert(ucs_queue_is_empty(&ep->tx.window)); + uct_ep_destroy(&ep->super.super); + return 0; + } + + uct_ud_ep_purge(ep, UCS_ERR_ENDPOINT_TIMEOUT); + + status = iface->super.ops->set_ep_failed(&iface->super, &ep->super.super, + UCS_ERR_ENDPOINT_TIMEOUT); + if (status != UCS_OK) { + 
ucs_fatal("UD endpoint %p to "UCT_UD_EP_PEER_NAME_FMT": " + "unhandled timeout error", + ep, UCT_UD_EP_PEER_NAME_ARG(ep)); + } + + return 1; +} + +static void uct_ud_ep_timer_backoff(uct_ud_ep_t *ep) +{ + uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t); + + ep->tx.tick = ucs_min(ep->tx.tick * iface->tx.timer_backoff, + UCT_UD_SLOW_TIMER_MAX_TICK(iface)); + ucs_wtimer_add(&iface->tx.timer, &ep->timer, ep->tx.tick); +} + +static UCS_F_ALWAYS_INLINE int uct_ud_ep_is_last_ack_received(uct_ud_ep_t *ep) +{ + return UCT_UD_PSN_COMPARE(ep->tx.acked_psn, ==, ep->tx.psn - 1); +} + +static void uct_ud_ep_timer(ucs_wtimer_t *self) +{ + uct_ud_ep_t *ep = ucs_container_of(self, uct_ud_ep_t, timer); + uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t); + ucs_time_t now, last_send, diff; + ucs_status_t status; UCT_UD_EP_HOOK_CALL_TIMER(ep); - if (ucs_queue_is_empty(&ep->tx.window)) { + if (uct_ud_ep_is_last_ack_received(ep)) { /* Do not free the EP until all scheduled communications are done. 
*/ if (ep->flags & UCT_UD_EP_FLAG_DISCONNECTED) { status = uct_ud_ep_free_by_timeout(ep, iface); if (status == UCS_INPROGRESS) { - goto again; + uct_ud_ep_timer_backoff(ep); } } return; } - now = ucs_twheel_get_time(&iface->async.slow_timer); + ucs_assert(!ucs_queue_is_empty(&ep->tx.window)); + + now = ucs_twheel_get_time(&iface->tx.timer); diff = now - ep->tx.send_time; if (diff > iface->config.peer_timeout) { - ucs_debug("ep %p: timeout of %.2f sec", ep, ucs_time_to_sec(diff)); - iface->super.ops->handle_failure(&iface->super, ep, - UCS_ERR_ENDPOINT_TIMEOUT); + ucs_debug("ep %p: timeout of %.2f sec, config::peer_timeout - %.2f sec", + ep, ucs_time_to_sec(diff), + ucs_time_to_sec(iface->config.peer_timeout)); + ucs_callbackq_add_safe(&iface->super.super.worker->super.progress_q, + uct_ud_ep_deferred_timeout_handler, ep, + UCS_CALLBACKQ_FLAG_ONESHOT); + return; + } + + /* If we are already resending, do not consider this timeout as packet drop. + * It just means the sender is slow. + */ + if (uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_ACK_REQ|UCT_UD_EP_OP_RESEND) || + (ep->tx.resend_count > 0)) { + ucs_trace("ep %p: resend still in progress, ops 0x%x tx_count %d", + ep, ep->tx.pending.ops, ep->tx.resend_count); + uct_ud_ep_timer_backoff(ep); return; - } else if (diff > 3*iface->async.slow_tick) { - ucs_trace("scheduling resend now: %lu send_time: %lu diff: %lu tick: %lu", - now, ep->tx.send_time, now - ep->tx.send_time, - ep->tx.slow_tick); - uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_ACK_REQ); - uct_ud_ep_ca_drop(ep); - uct_ud_ep_resend_start(iface, ep); - } else if ((diff > iface->async.slow_tick) && uct_ud_ep_is_connected(ep)) { - /* It is possible that the sender is slow. - * Try to flush the window twice before going into - * full resend mode. 
- */ - uct_ud_ep_ctl_op_add(iface, ep, UCT_UD_EP_OP_ACK_REQ); } -again: - /* Cool down the timer on rescheduling/resending */ - ep->tx.slow_tick *= iface->config.slow_timer_backoff; - ep->tx.slow_tick = ucs_min(ep->tx.slow_tick, - UCT_UD_SLOW_TIMER_MAX_TICK(iface)); - ucs_wtimer_add(&iface->async.slow_timer, &ep->slow_timer, ep->tx.slow_tick); + last_send = ucs_max(ep->tx.send_time, ep->tx.resend_time); + diff = now - last_send; + if (diff > iface->tx.tick) { + if (diff > 3 * iface->tx.tick) { + ucs_trace("scheduling resend now: %lu last_send: %lu diff: %lu tick: %lu", + now, last_send, diff, ep->tx.tick); + uct_ud_ep_ca_drop(ep); + uct_ud_ep_resend_start(iface, ep); + } + + if (uct_ud_ep_is_connected(ep)) { + /* Try to request ACK/NACK twice before going into full resend mode */ + uct_ud_ep_ctl_op_add(iface, ep, UCT_UD_EP_OP_ACK_REQ); + } + } + + uct_ud_ep_timer_backoff(ep); } #if HAVE_HNS_ROCE @@ -199,39 +350,55 @@ static void uct_ud_ep_pskb_free(uct_ud_ep_t *ep) #define uct_ud_ep_pskb_free(ep) #endif -UCS_CLASS_INIT_FUNC(uct_ud_ep_t, uct_ud_iface_t *iface) +UCS_CLASS_INIT_FUNC(uct_ud_ep_t, uct_ud_iface_t *iface, + const uct_ep_params_t* params) { ucs_trace_func(""); memset(self, 0, sizeof(*self)); UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super.super); + uct_ud_enter(iface); + self->dest_ep_id = UCT_UD_EP_NULL_ID; + self->path_index = UCT_EP_PARAMS_GET_PATH_INDEX(params); uct_ud_ep_reset(self); ucs_list_head_init(&self->cep_list); uct_ud_iface_add_ep(iface, self); - self->tx.slow_tick = iface->async.slow_tick; - ucs_wtimer_init(&self->slow_timer, uct_ud_ep_slow_timer); + self->tx.tick = iface->tx.tick; + ucs_wtimer_init(&self->timer, uct_ud_ep_timer); #if HAVE_HNS_ROCE ucs_queue_head_init(&self->pending_skb); #endif ucs_arbiter_group_init(&self->tx.pending.group); ucs_arbiter_elem_init(&self->tx.pending.elem); - self->path_bits = iface->super.path_bits[0]; /* TODO multi-rail */ - UCT_UD_EP_HOOK_INIT(self); - ucs_debug("created ep ep=%p iface=%p id=%d 
src_path_bits=%d", - self, iface, self->ep_id, self->path_bits); + ucs_debug("created ep ep=%p iface=%p id=%d", self, iface, self->ep_id); + + uct_ud_leave(iface); + return UCS_OK; } +static UCS_F_ALWAYS_INLINE int +uct_ud_ep_is_last_pending_elem(uct_ud_ep_t *ep, ucs_arbiter_elem_t *elem) +{ + return (/* this is the only one pending element in the group */ + (ucs_arbiter_elem_is_only(elem)) || + (/* the next element in the group is control operation */ + (elem->next == &ep->tx.pending.elem) && + /* only two elements are in the group (the 1st element is the + * current one, the 2nd (or the last) element is the control one) */ + (ucs_arbiter_group_tail(&ep->tx.pending.group) == &ep->tx.pending.elem))); + +} + static ucs_arbiter_cb_result_t -uct_ud_ep_pending_cancel_cb(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *elem, - void *arg) +uct_ud_ep_pending_cancel_cb(ucs_arbiter_t *arbiter, ucs_arbiter_group_t *group, + ucs_arbiter_elem_t *elem, void *arg) { - uct_ud_ep_t *ep = ucs_container_of(ucs_arbiter_elem_group(elem), - uct_ud_ep_t, tx.pending.group); + uct_ud_ep_t *ep = ucs_container_of(group, uct_ud_ep_t, tx.pending.group); uct_pending_req_t *req; /* we may have pending op on ep */ @@ -244,10 +411,20 @@ uct_ud_ep_pending_cancel_cb(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *elem, req = ucs_container_of(elem, uct_pending_req_t, priv); ucs_warn("ep=%p removing user pending req=%p", ep, req); + if (uct_ud_ep_is_last_pending_elem(ep, elem)) { + uct_ud_ep_remove_has_pending_flag(ep); + } + /* return ignored by arbiter */ return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; } +static int uct_ud_ep_remove_timeout_filter(const ucs_callbackq_elem_t *elem, + void *arg) +{ + return (elem->cb == uct_ud_ep_deferred_timeout_handler) && (elem->arg == arg); +} + static UCS_CLASS_CLEANUP_FUNC(uct_ud_ep_t) { uct_ud_iface_t *iface = ucs_derived_of(self->super.super.iface, uct_ud_iface_t); @@ -255,7 +432,13 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ud_ep_t) ucs_trace_func("ep=%p id=%d conn_id=%d", self, 
self->ep_id, self->conn_id); uct_ud_ep_pskb_free(self); - ucs_wtimer_remove(&self->slow_timer); + uct_ud_enter(iface); + + ucs_callbackq_remove_if(&iface->super.super.worker->super.progress_q, + uct_ud_ep_remove_timeout_filter, self); + uct_ud_ep_purge(self, UCS_ERR_CANCELED); + + ucs_wtimer_remove(&iface->tx.timer, &self->timer); uct_ud_iface_remove_ep(iface, self); uct_ud_iface_cep_remove(self); ucs_frag_list_cleanup(&self->rx.ooo_pkts); @@ -269,6 +452,7 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ud_ep_t) (int)ucs_queue_length(&self->tx.window)); } ucs_arbiter_group_cleanup(&self->tx.pending.group); + uct_ud_leave(iface); } UCS_CLASS_DEFINE(uct_ud_ep_t, uct_base_ep_t); @@ -298,7 +482,7 @@ static ucs_status_t uct_ud_ep_connect_to_iface(uct_ud_ep_t *ep, const uct_ud_iface_addr_t *if_addr) { uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t); - uct_ib_device_t *dev = uct_ib_iface_device(&iface->super); + uct_ib_device_t UCS_V_UNUSED *dev = uct_ib_iface_device(&iface->super); char buf[128]; ucs_frag_list_cleanup(&ep->rx.ooo_pkts); @@ -320,7 +504,9 @@ static ucs_status_t uct_ud_ep_disconnect_from_iface(uct_ep_h tl_ep) ucs_frag_list_cleanup(&ep->rx.ooo_pkts); uct_ud_ep_reset(ep); + ep->dest_ep_id = UCT_UD_EP_NULL_ID; + ep->flags &= ~UCT_UD_EP_FLAG_CONNECTED; return UCS_OK; } @@ -328,6 +514,7 @@ static ucs_status_t uct_ud_ep_disconnect_from_iface(uct_ep_h tl_ep) ucs_status_t uct_ud_ep_create_connected_common(uct_ud_iface_t *iface, const uct_ib_address_t *ib_addr, const uct_ud_iface_addr_t *if_addr, + unsigned path_index, uct_ud_ep_t **new_ep_p, uct_ud_send_skb_t **skb_p) { @@ -336,7 +523,8 @@ ucs_status_t uct_ud_ep_create_connected_common(uct_ud_iface_t *iface, uct_ud_ep_t *ep; uct_ep_h new_ep_h; - ep = uct_ud_iface_cep_lookup(iface, ib_addr, if_addr, UCT_UD_EP_CONN_ID_MAX); + ep = uct_ud_iface_cep_lookup(iface, ib_addr, if_addr, UCT_UD_EP_CONN_ID_MAX, + path_index); if (ep) { uct_ud_ep_set_state(ep, UCT_UD_EP_FLAG_CREQ_NOTSENT); ep->flags &= 
~UCT_UD_EP_FLAG_PRIVATE; @@ -345,8 +533,11 @@ ucs_status_t uct_ud_ep_create_connected_common(uct_ud_iface_t *iface, return UCS_ERR_ALREADY_EXISTS; } - params.field_mask = UCT_EP_PARAM_FIELD_IFACE; + params.field_mask = UCT_EP_PARAM_FIELD_IFACE | + UCT_EP_PARAM_FIELD_PATH_INDEX; params.iface = &iface->super.super.super; + params.path_index = path_index; + status = uct_ep_create(¶ms, &new_ep_h); if (status != UCS_OK) { return status; @@ -358,7 +549,8 @@ ucs_status_t uct_ud_ep_create_connected_common(uct_ud_iface_t *iface, return status; } - status = uct_ud_iface_cep_insert(iface, ib_addr, if_addr, ep, UCT_UD_EP_CONN_ID_MAX); + status = uct_ud_iface_cep_insert(iface, ib_addr, if_addr, ep, + UCT_UD_EP_CONN_ID_MAX, path_index); if (status != UCS_OK) { goto err_cep_insert; } @@ -391,13 +583,13 @@ ucs_status_t uct_ud_ep_connect_to_ep(uct_ud_ep_t *ep, const uct_ud_ep_addr_t *ep_addr) { uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t); - uct_ib_device_t *dev = uct_ib_iface_device(&iface->super); + uct_ib_device_t UCS_V_UNUSED *dev = uct_ib_iface_device(&iface->super); char buf[128]; ucs_assert_always(ep->dest_ep_id == UCT_UD_EP_NULL_ID); ucs_trace_func(""); - ep->dest_ep_id = uct_ib_unpack_uint24(ep_addr->ep_id); + uct_ud_ep_set_dest_ep_id(ep, uct_ib_unpack_uint24(ep_addr->ep_id)); ucs_frag_list_cleanup(&ep->rx.ooo_pkts); uct_ud_ep_reset(ep); @@ -407,82 +599,30 @@ ucs_status_t uct_ud_ep_connect_to_ep(uct_ud_ep_t *ep, dev->port_attr[iface->super.config.port_num - dev->first_port].lid, iface->qp->qp_num, ep->ep_id, uct_ib_address_str(ib_addr, buf, sizeof(buf)), - uct_ib_unpack_uint24(ep_addr->iface_addr.qp_num), ep->dest_ep_id); + uct_ib_unpack_uint24(ep_addr->iface_addr.qp_num), + ep->dest_ep_id); return UCS_OK; } -static UCS_F_ALWAYS_INLINE void -uct_ud_iface_add_async_comp(uct_ud_iface_t *iface, uct_ud_ep_t *ep, - uct_ud_send_skb_t *skb, ucs_status_t status) -{ - uct_ud_comp_desc_t *cdesc; - - skb->status = status; - if (status != UCS_OK) { - 
if (!(skb->flags & UCT_UD_SEND_SKB_FLAG_COMP)) { - skb->len = 0; - } - - if (status == UCS_ERR_ENDPOINT_TIMEOUT) { - skb->flags |= UCT_UD_SEND_SKB_FLAG_ERR; - ++ep->tx.err_skb_count; - } else if (status == UCS_ERR_CANCELED) { - skb->flags |= UCT_UD_SEND_SKB_FLAG_CANCEL; - } - } - - cdesc = uct_ud_comp_desc(skb); - - /* don't call user completion from async context. instead, put - * it on a queue which will be progressed from main thread. - */ - ucs_queue_push(&iface->tx.async_comp_q, &skb->queue); - cdesc->ep = ep; - ep->flags |= UCT_UD_EP_FLAG_ASYNC_COMPS; -} - static UCS_F_ALWAYS_INLINE void uct_ud_ep_process_ack(uct_ud_iface_t *iface, uct_ud_ep_t *ep, uct_ud_psn_t ack_psn, int is_async, int dummy_ack) { - uct_ud_send_skb_t *skb; + /* Ignore duplicate ACK */ if (ucs_unlikely(UCT_UD_PSN_COMPARE(ack_psn, <=, ep->tx.acked_psn))) { return; } ep->tx.acked_psn = ack_psn; - /* Release acknowledged skb's */ - ucs_queue_for_each_extract(skb, &ep->tx.window, queue, - UCT_UD_PSN_COMPARE(skb->neth->psn, <=, ack_psn)) { - if (ucs_unlikely(skb->flags & UCT_UD_SEND_SKB_FLAG_COMP)) { - if (ucs_unlikely(is_async)) { - uct_ud_iface_add_async_comp(iface, ep, skb, UCS_OK); - continue; - } - - uct_invoke_completion(uct_ud_comp_desc(skb)->comp, UCS_OK); - } - - skb->flags = 0; /* reset also ACK_REQ flag */ -#if HAVE_HNS_ROCE - if (dummy_ack) - ucs_queue_push(&ep->pending_skb, &skb->queue); - else -#endif - ucs_mpool_put(skb); - } - + uct_ud_ep_window_release_inline(iface, ep, ack_psn, UCS_OK, is_async, 0, dummy_ack); uct_ud_ep_ca_ack(ep); - - if (ucs_unlikely(UCT_UD_PSN_COMPARE(ep->resend.psn, <=, ep->resend.max_psn))) { - uct_ud_ep_resend_ack(iface, ep); - } + uct_ud_ep_resend_ack(iface, ep); ucs_arbiter_group_schedule(&iface->tx.pending_q, &ep->tx.pending.group); - ep->tx.slow_tick = iface->async.slow_tick; - ep->tx.send_time = uct_ud_iface_get_async_time(iface); + ep->tx.tick = iface->tx.tick; + ep->tx.send_time = uct_ud_iface_get_time(iface); } static inline void 
uct_ud_ep_rx_put(uct_ud_neth_t *neth, unsigned byte_len) @@ -513,9 +653,11 @@ static uct_ud_ep_t *uct_ud_ep_create_passive(uct_ud_iface_t *iface, uct_ud_ctl_h (void*)&ctl->conn_req.ep_addr); ucs_assert_always(status == UCS_OK); + ep->path_index = ctl->conn_req.path_index; + status = uct_ud_iface_cep_insert(iface, uct_ud_creq_ib_addr(ctl), &ctl->conn_req.ep_addr.iface_addr, - ep, ctl->conn_req.conn_id); + ep, ctl->conn_req.conn_id, ep->path_index); ucs_assert_always(status == UCS_OK); return ep; } @@ -529,7 +671,8 @@ static void uct_ud_ep_rx_creq(uct_ud_iface_t *iface, uct_ud_neth_t *neth) ep = uct_ud_iface_cep_lookup(iface, uct_ud_creq_ib_addr(ctl), &ctl->conn_req.ep_addr.iface_addr, - ctl->conn_req.conn_id); + ctl->conn_req.conn_id, + ctl->conn_req.path_index); if (!ep) { ep = uct_ud_ep_create_passive(iface, ctl); ucs_assert_always(ep != NULL); @@ -540,7 +683,7 @@ static void uct_ud_ep_rx_creq(uct_ud_iface_t *iface, uct_ud_neth_t *neth) } else { if (ep->dest_ep_id == UCT_UD_EP_NULL_ID) { /* simultanuous CREQ */ - ep->dest_ep_id = uct_ib_unpack_uint24(ctl->conn_req.ep_addr.ep_id); + uct_ud_ep_set_dest_ep_id(ep, uct_ib_unpack_uint24(ctl->conn_req.ep_addr.ep_id)); ep->rx.ooo_pkts.head_sn = neth->psn; uct_ud_peer_copy(&ep->peer, ucs_unaligned_ptr(&ctl->peer)); ucs_debug("simultanuous CREQ ep=%p" @@ -559,8 +702,20 @@ static void uct_ud_ep_rx_creq(uct_ud_iface_t *iface, uct_ud_neth_t *neth) ++ep->rx_creq_count; - ucs_assert_always(ctl->conn_req.conn_id == ep->conn_id); - ucs_assert_always(uct_ib_unpack_uint24(ctl->conn_req.ep_addr.ep_id) == ep->dest_ep_id); + ucs_assertv_always(ctl->conn_req.conn_id == ep->conn_id, + "creq->conn_id=%d ep->conn_id=%d", + ctl->conn_req.conn_id, ep->conn_id); + + ucs_assertv_always(ctl->conn_req.path_index == ep->path_index, + "creq->path_index=%d ep->path_index=%d", + ctl->conn_req.path_index, ep->path_index); + + ucs_assertv_always(uct_ib_unpack_uint24(ctl->conn_req.ep_addr.ep_id) == + ep->dest_ep_id, + "creq->ep_addr.ep_id=%d 
ep->dest_ep_id=%d", + uct_ib_unpack_uint24(ctl->conn_req.ep_addr.ep_id), + ep->dest_ep_id); + /* creq must always have same psn */ ucs_assertv_always(ep->rx.ooo_pkts.head_sn == neth->psn, "iface=%p ep=%p conn_id=%d ep_id=%d, dest_ep_id=%d rx_psn=%u " @@ -585,8 +740,14 @@ static void uct_ud_ep_rx_ctl(uct_ud_iface_t *iface, uct_ud_ep_t *ep, ucs_trace_func(""); ucs_assert_always(ctl->type == UCT_UD_PACKET_CREP); - ucs_assert_always(ep->dest_ep_id == UCT_UD_EP_NULL_ID || - ep->dest_ep_id == ctl->conn_rep.src_ep_id); + + if (uct_ud_ep_is_connected(ep)) { + ucs_assertv_always(ep->dest_ep_id == ctl->conn_rep.src_ep_id, + "ep [id=%d dest_ep_id=%d flags=0x%x] " + "crep [neth->dest=%d dst_ep_id=%d src_ep_id=%d]", + ep->ep_id, ep->dest_ep_id, ep->path_index, ep->flags, + uct_ud_neth_get_dest_id(neth), ctl->conn_rep.src_ep_id); + } /* Discard duplicate CREP */ if (UCT_UD_PSN_COMPARE(neth->psn, <, ep->rx.ooo_pkts.head_sn)) { @@ -596,7 +757,7 @@ static void uct_ud_ep_rx_ctl(uct_ud_iface_t *iface, uct_ud_ep_t *ep, uct_ud_ep_pskb_free(ep); ep->rx.ooo_pkts.head_sn = neth->psn; - ep->dest_ep_id = ctl->conn_rep.src_ep_id; + uct_ud_ep_set_dest_ep_id(ep, ctl->conn_rep.src_ep_id); ucs_arbiter_group_schedule(&iface->tx.pending_q, &ep->tx.pending.group); uct_ud_peer_copy(&ep->peer, ucs_unaligned_ptr(&ctl->peer)); uct_ud_ep_set_state(ep, UCT_UD_EP_FLAG_CREP_RCVD); @@ -635,8 +796,9 @@ uct_ud_send_skb_t *uct_ud_ep_prepare_creq(uct_ud_ep_t *ep) creq = (uct_ud_ctl_hdr_t *)(neth + 1); - creq->type = UCT_UD_PACKET_CREQ; - creq->conn_req.conn_id = ep->conn_id; + creq->type = UCT_UD_PACKET_CREQ; + creq->conn_req.conn_id = ep->conn_id; + creq->conn_req.path_index = ep->path_index; status = uct_ud_ep_get_address(&ep->super.super, (void*)&creq->conn_req.ep_addr); @@ -675,7 +837,7 @@ void uct_ud_ep_process_rx(uct_ud_iface_t *iface, uct_ud_neth_t *neth, unsigned b uct_ud_ep_rx_creq(iface, neth); goto out; } else if (ucs_unlikely(!ucs_ptr_array_lookup(&iface->eps, dest_id, ep) || - ep->ep_id != 
dest_id)) + (ep->ep_id != dest_id))) { /* Drop the packet because it is * allowed to do disconnect without flush/barrier. So it @@ -696,7 +858,16 @@ void uct_ud_ep_process_rx(uct_ud_iface_t *iface, uct_ud_neth_t *neth, unsigned b ep->rx.ooo_pkts.head_sn, neth->psn); } + if (ucs_unlikely(UCT_UD_PSN_COMPARE(neth->psn, >, ep->rx.ooo_pkts.head_sn + 1))) { + uct_ud_ep_ctl_op_add(iface, ep, UCT_UD_EP_OP_NACK); + } + if (ucs_unlikely(!is_am)) { + if (neth->packet_type & UCT_UD_PACKET_FLAG_NAK) { + uct_ud_ep_set_state(ep, UCT_UD_EP_FLAG_TX_NACKED); + goto out; + } + if ((size_t)byte_len == sizeof(*neth)) { goto out; } @@ -708,13 +879,12 @@ void uct_ud_ep_process_rx(uct_ud_iface_t *iface, uct_ud_neth_t *neth, unsigned b ooo_type = ucs_frag_list_insert(&ep->rx.ooo_pkts, &skb->u.ooo.elem, neth->psn); if (ucs_unlikely(ooo_type != UCS_FRAG_LIST_INSERT_FAST)) { - if (ooo_type != UCS_FRAG_LIST_INSERT_DUP && - ooo_type != UCS_FRAG_LIST_INSERT_FAIL) { + if ((ooo_type != UCS_FRAG_LIST_INSERT_DUP) && + (ooo_type != UCS_FRAG_LIST_INSERT_FAIL)) { ucs_fatal("Out of order is not implemented: got %d", ooo_type); } ucs_trace_data("DUP/OOB - schedule ack, head_sn=%d sn=%d", ep->rx.ooo_pkts.head_sn, neth->psn); - uct_ud_ep_ctl_op_add(iface, ep, UCT_UD_EP_OP_ACK); goto out; } @@ -748,7 +918,6 @@ ucs_status_t uct_ud_ep_flush_nolock(uct_ud_iface_t *iface, uct_ud_ep_t *ep, uct_completion_t *comp) { uct_ud_send_skb_t *skb; - uct_ud_psn_t psn; if (ucs_unlikely(!uct_ud_ep_is_connected(ep))) { /* check for CREQ either being scheduled or sent and waiting for CREP ack */ @@ -770,39 +939,23 @@ ucs_status_t uct_ud_ep_flush_nolock(uct_ud_iface_t *iface, uct_ud_ep_t *ep, return UCS_ERR_NO_RESOURCE; } - if (ucs_queue_is_empty(&ep->tx.window)) { - uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_ACK_REQ); - - /* Check if have pending async completions for this ep, - * if not - all was acknowledged, nothing is pending - return OK - * if yes - continue to add - * */ - if (!(ep->flags & UCT_UD_EP_FLAG_ASYNC_COMPS)) { - 
return UCS_OK; - } - - /* - * If we have pending async completion, and the user requested a callback, - * add a new async completion in the queue. - */ - if (comp != NULL) { - skb = ucs_mpool_get(&iface->tx.mp); - if (skb == NULL) { - return UCS_ERR_NO_RESOURCE; - } + if (ucs_queue_is_empty(&ep->tx.window) && + ucs_queue_is_empty(&iface->tx.async_comp_q)) { + /* No outstanding operations */ + ucs_assert(ep->tx.resend_count == 0); + return UCS_OK; + } - skb->flags = UCT_UD_SEND_SKB_FLAG_COMP; - skb->len = 0; - uct_ud_comp_desc(skb)->comp = comp; - uct_ud_comp_desc(skb)->ep = ep; - ucs_queue_push(&iface->tx.async_comp_q, &skb->queue); - } + /* Expedite acknowledgment on the last skb in the window */ + if (uct_ud_ep_is_last_ack_received(ep)) { + uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_ACK_REQ); } else { - skb = ucs_queue_tail_elem_non_empty(&ep->tx.window, uct_ud_send_skb_t, queue); - psn = skb->neth->psn; + ucs_assert(!ucs_queue_is_empty(&ep->tx.window)); + skb = ucs_queue_tail_elem_non_empty(&ep->tx.window, uct_ud_send_skb_t, + queue); if (!(skb->flags & UCT_UD_SEND_SKB_FLAG_ACK_REQ)) { /* If we didn't ask for ACK on last skb, send an ACK_REQ message. - * It will speed up the flush because we will not have to wait untill + * It will speed up the flush because we will not have to wait until * retransmit is triggered. * Also, prevent from sending more control messages like this after * first time by turning on the flag on the last skb. @@ -821,64 +974,63 @@ ucs_status_t uct_ud_ep_flush_nolock(uct_ud_iface_t *iface, uct_ud_ep_t *ep, skb->flags |= UCT_UD_SEND_SKB_FLAG_ACK_REQ; } + } - /* If the user requested a callback, add a dummy skb to the window which - * will be released when the current sequence number is acknowledged. 
- */ - if (comp != NULL) { - skb = ucs_mpool_get(&iface->tx.mp); - if (skb == NULL) { - return UCS_ERR_NO_RESOURCE; - } + /* If the user requested a callback, allocate a dummy skb which will be + * released when the current sequence number is completed. + */ + if (comp != NULL) { + ucs_assert(comp->count > 0); - /* Add dummy skb to the window, which would call user completion - * callback when getting ACK. - */ - skb->flags = UCT_UD_SEND_SKB_FLAG_COMP; - skb->len = sizeof(skb->neth[0]); - skb->neth->packet_type = 0; - skb->neth->psn = psn; - uct_ud_comp_desc(skb)->comp = comp; - ucs_assert(psn == (uct_ud_psn_t)(ep->tx.psn - 1)); - - uct_ud_neth_set_dest_id(skb->neth, UCT_UD_EP_NULL_ID); + skb = ucs_mpool_get(&iface->tx.mp); + if (skb == NULL) { + return UCS_ERR_NO_RESOURCE; + } + + /* Add dummy skb to the window, which would call user completion + * callback when getting ACK. + */ + skb->flags = UCT_UD_SEND_SKB_FLAG_COMP; + skb->len = sizeof(skb->neth[0]); + skb->neth->packet_type = 0; + skb->neth->psn = (uct_ud_psn_t)(ep->tx.psn - 1); + uct_ud_neth_set_dest_id(skb->neth, UCT_UD_EP_NULL_ID); + uct_ud_comp_desc(skb)->comp = comp; + + if (!ucs_queue_is_empty(&ep->tx.window)) { + /* If window non-empty: add to window */ ucs_queue_push(&ep->tx.window, &skb->queue); - ucs_trace_data("added dummy flush skb %p psn %d user_comp %p", skb, - skb->neth->psn, comp); + } else { + /* Otherwise, add the skb after async completions */ + ucs_assert(ep->tx.resend_count == 0); + uct_ud_iface_add_async_comp(iface, skb, UCS_OK); } + + ucs_trace_data("added dummy flush skb %p psn %d user_comp %p", skb, + skb->neth->psn, comp); } return UCS_INPROGRESS; } -void uct_ud_tx_wnd_purge_outstanding(uct_ud_iface_t *iface, uct_ud_ep_t *ud_ep, - ucs_status_t status) -{ - uct_ud_send_skb_t *skb; - - uct_ud_ep_tx_stop(ud_ep); - - ucs_queue_for_each_extract(skb, &ud_ep->tx.window, queue, 1) { - uct_ud_iface_add_async_comp(iface, ud_ep, skb, status); - } -} - ucs_status_t uct_ud_ep_flush(uct_ep_h 
ep_h, unsigned flags, uct_completion_t *comp) { - ucs_status_t status; uct_ud_ep_t *ep = ucs_derived_of(ep_h, uct_ud_ep_t); uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t); + ucs_status_t status; uct_ud_enter(iface); if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) { - uct_ud_tx_wnd_purge_outstanding(iface, ep, UCS_ERR_CANCELED); - uct_ud_iface_dispatch_zcopy_comps(iface); uct_ep_pending_purge(ep_h, NULL, 0); - /* Open window after cancellation for next sending */ - uct_ud_ep_ca_ack(ep); + uct_ud_iface_dispatch_async_comps(iface); + uct_ud_ep_purge(ep, UCS_ERR_CANCELED); + /* FIXME make flush(CANCEL) operation truly non-blocking and wait until + * all of the outstanding sends are completed. Without this, zero-copy + * sends which are still on the QP could be reported as completed which + * can lead to sending corrupt data, or local access error. */ status = UCS_OK; goto out; } @@ -912,7 +1064,7 @@ static uct_ud_send_skb_t *uct_ud_ep_prepare_crep(uct_ud_ep_t *ep) /* Check that CREQ is neither sheduled nor waiting for CREP ack */ ucs_assertv_always(!uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_CREQ) && - ucs_queue_is_empty(&ep->tx.window), + uct_ud_ep_is_last_ack_received(ep), "iface=%p ep=%p conn_id=%d ep_id=%d, dest_ep_id=%d rx_psn=%u " "ep_flags=0x%x ctl_ops=0x%x rx_creq_count=%d", iface, ep, ep->conn_id, ep->ep_id, ep->dest_ep_id, @@ -942,62 +1094,129 @@ static uct_ud_send_skb_t *uct_ud_ep_prepare_crep(uct_ud_ep_t *ep) return skb; } -static uct_ud_send_skb_t *uct_ud_ep_resend(uct_ud_ep_t *ep) +static void uct_ud_ep_send_creq_crep(uct_ud_iface_t *iface, uct_ud_ep_t *ep, + uct_ud_send_skb_t *skb) { - uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t); + uct_ud_iface_send_ctl(iface, ep, skb, NULL, 0, + UCT_UD_IFACE_SEND_CTL_FLAG_SOLICITED, 1); + uct_ud_iface_complete_tx_skb(iface, ep, skb); +} + +static void uct_ud_ep_resend(uct_ud_ep_t *ep) +{ + uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, + 
uct_ud_iface_t); + size_t max_len_without_nack = sizeof(uct_ud_neth_t) + + sizeof(uct_ud_ctl_hdr_t) + + iface->super.addr_size; uct_ud_send_skb_t *skb, *sent_skb; ucs_queue_iter_t resend_pos; uct_ud_zcopy_desc_t *zdesc; - size_t iov_it; + uct_ud_iov_t skb_iov, *iov; + uct_ud_ctl_desc_t *cdesc; + int max_log_sge; + uint16_t iovcnt; + + /* check if the resend window was acknowledged */ + if (UCT_UD_PSN_COMPARE(ep->resend.max_psn, <=, ep->tx.acked_psn)) { + uct_ud_ep_resend_end(ep); + return; + } /* check window */ - resend_pos = (void*)ep->resend.pos; - sent_skb = ucs_queue_iter_elem(sent_skb, resend_pos, queue); - if (sent_skb == NULL) { - uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_RESEND); - return NULL; + resend_pos = ep->resend.pos; + if (ucs_queue_iter_end(&ep->tx.window, resend_pos)) { + uct_ud_ep_resend_end(ep); + return; } + sent_skb = ucs_queue_iter_elem(sent_skb, resend_pos, queue); + ucs_assert(((uintptr_t)sent_skb % UCT_UD_SKB_ALIGN) == 0); if (UCT_UD_PSN_COMPARE(sent_skb->neth->psn, >=, ep->tx.max_psn)) { ucs_debug("ep(%p): out of window(psn=%d/max_psn=%d) - can not resend more", ep, sent_skb ? 
sent_skb->neth->psn : -1, ep->tx.max_psn); - uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_RESEND); - return NULL; + uct_ud_ep_resend_end(ep); + return; + } + + /* stop resend if packet is larger than CREQ and there wasn't NACK from + * other side */ + if (!(ep->flags & UCT_UD_EP_FLAG_TX_NACKED) && + (sent_skb->len > max_len_without_nack)) { + uct_ud_ep_resend_end(ep); + return; + } + + /* Update resend position */ + ep->resend.pos = ucs_queue_iter_next(resend_pos); + + /* skip skb which was already resent but didn't get send completion yet */ + if (sent_skb->flags & UCT_UD_SEND_SKB_FLAG_RESENDING) { + ucs_debug("ep(%p): skb %p already being resent", ep, sent_skb); + return; } /* skip dummy skb created for non-blocking flush */ if ((uct_ud_neth_get_dest_id(sent_skb->neth) == UCT_UD_EP_NULL_ID) && - !(sent_skb->neth->packet_type & UCT_UD_PACKET_FLAG_CTL)) - { - ep->resend.pos = ucs_queue_iter_next(resend_pos); - return NULL; + !(sent_skb->neth->packet_type & UCT_UD_PACKET_FLAG_CTL)) { + return; } /* creq/crep must remove creq packet from window */ ucs_assertv_always(!(uct_ud_ep_is_connected(ep) && - (uct_ud_neth_get_dest_id(sent_skb->neth) == UCT_UD_EP_NULL_ID) && - !(sent_skb->neth->packet_type & UCT_UD_PACKET_FLAG_AM)), + (uct_ud_neth_get_dest_id(sent_skb->neth) == UCT_UD_EP_NULL_ID) && + !(sent_skb->neth->packet_type & UCT_UD_PACKET_FLAG_AM)), "ep(%p): CREQ resend on endpoint which is already connected", ep); - skb = uct_ud_iface_resend_skb_get(iface); - ucs_assert_always(skb != NULL); + /* Allocate a control skb which would refer to the original skb. + * + * If we didn't resend an skb, it would be released after remote ACK: we can + * assume that if it was received by remote side, it has been fully sent by + * local side. However, if we started resend, all bets are off: we can get + * an ACK while there is still a resend-skb in the QP. In this case, we must + * wait for send completion on that resend-skb before signaling completion + * to the user. 
If the resend-skb got send completion, assume the original + * skb was sent as well. + */ + skb = uct_ud_iface_ctl_skb_get(iface); + skb->flags = UCT_UD_SEND_SKB_FLAG_CTL_RESEND; + sent_skb->flags |= UCT_UD_SEND_SKB_FLAG_RESENDING; + ep->resend.psn = sent_skb->neth->psn; + ep->tx.resend_time = uct_ud_iface_get_time(iface); - ep->resend.pos = ucs_queue_iter_next(resend_pos); - ep->resend.psn = sent_skb->neth->psn; - memcpy(skb->neth, sent_skb->neth, sent_skb->len); - skb->neth->ack_psn = ep->rx.acked_psn; - skb->len = sent_skb->len; if (sent_skb->flags & UCT_UD_SEND_SKB_FLAG_ZCOPY) { - zdesc = uct_ud_zcopy_desc(sent_skb); - for (iov_it = 0; iov_it < zdesc->iovcnt; ++iov_it) { - if (zdesc->iov[iov_it].length > 0) { - memcpy((char *)skb->neth + skb->len, zdesc->iov[iov_it].buffer, - zdesc->iov[iov_it].length); - skb->len += zdesc->iov[iov_it].length; - } - } + /* copy neth + am header part */ + skb->len = sent_skb->len; + + /* set iov pointer to payload */ + zdesc = uct_ud_zcopy_desc(sent_skb); + iov = zdesc->iov; + iovcnt = zdesc->iovcnt; + max_log_sge = UCT_IB_MAX_ZCOPY_LOG_SGE(&iface->super); + } else { + /* copy neth part only, since we may not have enough room in the control + * skb for the whole payload + ctl desc, and we also prefer to avoid + * memcpy() overhead. 
*/ + ucs_assert(sent_skb->len >= sizeof(uct_ud_neth_t)); + skb->len = sizeof(uct_ud_neth_t); + + /* set iov to skb payload */ + skb_iov.buffer = UCS_PTR_BYTE_OFFSET(sent_skb->neth, sizeof(uct_ud_neth_t)); + skb_iov.length = sent_skb->len - sizeof(uct_ud_neth_t); + skb_iov.lkey = sent_skb->lkey; + iov = &skb_iov; + iovcnt = 1; + max_log_sge = 2; } + + memcpy(skb->neth, sent_skb->neth, skb->len); + skb->neth->ack_psn = ep->rx.acked_psn; + cdesc = uct_ud_ctl_desc(skb); + cdesc->self_skb = skb; + cdesc->resent_skb = sent_skb; + cdesc->ep = ep; + /* force ack request on every Nth packet or on first packet in resend window */ if ((skb->neth->psn % UCT_UD_RESENDS_PER_ACK) == 0 || UCT_UD_PSN_COMPARE(skb->neth->psn, ==, ep->tx.acked_psn+1)) { @@ -1014,78 +1233,98 @@ static uct_ud_send_skb_t *uct_ud_ep_resend(uct_ud_ep_t *ep) if (UCT_UD_PSN_COMPARE(ep->resend.psn, ==, ep->resend.max_psn)) { ucs_debug("ep(%p): resending completed", ep); ep->resend.psn = ep->resend.max_psn + 1; - uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_RESEND); + uct_ud_ep_resend_end(ep); } - return skb; + /* Send control message and save operation on queue. Use signaled-send to + * make sure user completion will not be delayed indefinitely */ + cdesc->sn = uct_ud_iface_send_ctl(iface, ep, skb, iov, iovcnt, + UCT_UD_IFACE_SEND_CTL_FLAG_SIGNALED | + UCT_UD_IFACE_SEND_CTL_FLAG_SOLICITED, + max_log_sge); + uct_ud_iface_add_ctl_desc(iface, cdesc); + ++ep->tx.resend_count; +} + +static void uct_ud_ep_send_ack(uct_ud_iface_t *iface, uct_ud_ep_t *ep) +{ + int ctl_flags = 0; + uct_ud_ctl_desc_t *cdesc; + uct_ud_send_skb_t *skb; + + /* Do not send ACKs if not connected yet. It may happen if CREQ and CREP + * from peer are lost. Need to wait for CREP resend from peer. 
+ */ + if (!uct_ud_ep_is_connected(ep)) { + goto out; + } + + if (sizeof(uct_ud_neth_t) <= iface->config.max_inline) { + skb = ucs_alloca(sizeof(*skb) + sizeof(uct_ud_neth_t)); + skb->flags = 0; +#if UCS_ENABLE_ASSERT + skb->lkey = 0; +#endif + ctl_flags |= UCT_UD_IFACE_SEND_CTL_FLAG_INLINE; + } else { + skb = uct_ud_iface_ctl_skb_get(iface); + } + + uct_ud_neth_init_data(ep, skb->neth); + skb->flags = UCT_UD_SEND_SKB_FLAG_CTL_ACK; + skb->len = sizeof(uct_ud_neth_t); + skb->neth->packet_type = ep->dest_ep_id; + if (uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_ACK_REQ)) { + skb->neth->packet_type |= UCT_UD_PACKET_FLAG_ACK_REQ; + ctl_flags |= UCT_UD_IFACE_SEND_CTL_FLAG_SOLICITED; + } + + if (uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_NACK)) { + skb->neth->packet_type |= UCT_UD_PACKET_FLAG_NAK; + } + + if (ctl_flags & UCT_UD_IFACE_SEND_CTL_FLAG_INLINE) { + uct_ud_iface_send_ctl(iface, ep, skb, NULL, 0, ctl_flags, 1); + } else { + /* if skb is taken from memory pool, release it in send completion */ + cdesc = uct_ud_ctl_desc(skb); + cdesc->sn = uct_ud_iface_send_ctl(iface, ep, skb, NULL, 0, + ctl_flags, 1); + cdesc->self_skb = skb; + cdesc->resent_skb = NULL; + cdesc->ep = NULL; + uct_ud_iface_add_ctl_desc(iface, cdesc); + } + +out: + uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_CTL_ACK); } static void uct_ud_ep_do_pending_ctl(uct_ud_ep_t *ep, uct_ud_iface_t *iface) { uct_ud_send_skb_t *skb; - int flag = 0; if (uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_CREQ)) { skb = uct_ud_ep_prepare_creq(ep); if (skb) { - flag = 1; uct_ud_ep_set_state(ep, UCT_UD_EP_FLAG_CREQ_SENT); uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_CREQ); + uct_ud_ep_send_creq_crep(iface, ep, skb); } } else if (uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_CREP)) { skb = uct_ud_ep_prepare_crep(ep); if (skb) { - flag = 1; uct_ud_ep_set_state(ep, UCT_UD_EP_FLAG_CREP_SENT); uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_CREP); + uct_ud_ep_send_creq_crep(iface, ep, skb); } } else if (uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_RESEND)) { - skb = 
uct_ud_ep_resend(ep); - } else if (uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_ACK)) { - if (uct_ud_ep_is_connected(ep)) { - if (iface->config.max_inline >= sizeof(uct_ud_neth_t)) { - skb = ucs_unaligned_ptr(&iface->tx.skb_inl.super); - } else { - skb = uct_ud_iface_resend_skb_get(iface); - skb->len = sizeof(uct_ud_neth_t); - } - uct_ud_neth_ctl_ack(ep, skb->neth); - } else { - /* Do not send ACKs if not connected yet. It may happen if - * CREQ and CREP from peer are lost. Need to wait for CREP - * resending by peer. */ - skb = NULL; - } - uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_ACK); - } else if (uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_ACK_REQ)) { - if (iface->config.max_inline >= sizeof(uct_ud_neth_t)) { - skb = ucs_unaligned_ptr(&iface->tx.skb_inl.super); - } else { - skb = uct_ud_iface_resend_skb_get(iface); - skb->len = sizeof(uct_ud_neth_t); - } - uct_ud_neth_ctl_ack_req(ep, skb->neth); - uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_ACK_REQ); - } else if (uct_ud_ep_ctl_op_isany(ep)) { - ucs_fatal("unsupported pending op mask: %x", ep->tx.pending.ops); + uct_ud_ep_resend(ep); + } else if (uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_CTL_ACK)) { + uct_ud_ep_send_ack(iface, ep); } else { - skb = 0; - } - - if (!skb) { - /* no pending - nothing to do */ - return; - } - - VALGRIND_MAKE_MEM_DEFINED(skb, sizeof *skb); - ucs_derived_of(iface->super.ops, uct_ud_iface_ops_t)->tx_skb(ep, skb, flag); - if (flag) { - /* creq and crep allocate real skb, it must be put on window like - * a regular packet to ensure a retransmission. - */ - uct_ud_iface_complete_tx_skb(iface, ep, skb); - } else { - uct_ud_iface_resend_skb_put(iface, skb); + ucs_assertv(!uct_ud_ep_ctl_op_isany(ep), + "unsupported pending op mask: %x", ep->tx.pending.ops); } } @@ -1116,20 +1355,20 @@ uct_ud_ep_ctl_op_next(uct_ud_ep_t *ep) * However we can not let pending uct req block control forever. 
*/ ucs_arbiter_cb_result_t -uct_ud_ep_do_pending(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *elem, +uct_ud_ep_do_pending(ucs_arbiter_t *arbiter, ucs_arbiter_group_t *group, + ucs_arbiter_elem_t *elem, void *arg) { - uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, - priv); - uct_ud_ep_t *ep = ucs_container_of(ucs_arbiter_elem_group(elem), - uct_ud_ep_t, + uct_ud_ep_t *ep = ucs_container_of(group, uct_ud_ep_t, tx.pending.group); uct_ud_iface_t *iface = ucs_container_of(arbiter, uct_ud_iface_t, tx.pending_q); uintptr_t in_async_progress = (uintptr_t)arg; + uct_pending_req_t *req; int allow_callback; int async_before_pending; ucs_status_t status; + int is_last_pending_elem; /* check if we have global resources * - tx_wqe @@ -1154,10 +1393,9 @@ uct_ud_ep_do_pending(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *elem, /* we can desched group: iff * - no control * - no ep resources (connect or window) - **/ - + */ if (!uct_ud_ep_ctl_op_isany(ep) && - (!uct_ud_ep_is_connected(ep) || + (!uct_ud_ep_is_connected(ep) || uct_ud_ep_no_window(ep))) { return UCS_ARBITER_CB_RESULT_DESCHED_GROUP; } @@ -1177,6 +1415,7 @@ uct_ud_ep_do_pending(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *elem, * - not in async progress * - there are no high priority pending control messages */ + req = ucs_container_of(elem, uct_pending_req_t, priv); allow_callback = !in_async_progress || (uct_ud_pending_req_priv(req)->flags & UCT_CB_FLAG_ASYNC); if (allow_callback && !uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_CTL_HI_PRIO)) { @@ -1187,7 +1426,22 @@ uct_ud_ep_do_pending(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *elem, /* temporary reset the flag to unblock sends from async context */ iface->tx.async_before_pending = 0; } + /* temporary reset `UCT_UD_EP_HAS_PENDING` flag to unblock sends */ + uct_ud_ep_remove_has_pending_flag(ep); + + is_last_pending_elem = uct_ud_ep_is_last_pending_elem(ep, elem); + status = req->func(req); +#if UCS_ENABLE_ASSERT + /* do not touch the request (or the 
arbiter element) after + * calling the callback if UCS_OK is returned from the callback */ + if (status == UCS_OK) { + req = NULL; + elem = NULL; + } +#endif + + uct_ud_ep_set_has_pending_flag(ep); iface->tx.async_before_pending = async_before_pending; ep->flags &= ~UCT_UD_EP_FLAG_IN_PENDING; @@ -1201,6 +1455,11 @@ uct_ud_ep_do_pending(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *elem, uct_ud_ep_do_pending_ctl(ep, iface); return uct_ud_ep_ctl_op_next(ep); } + + if (is_last_pending_elem) { + uct_ud_ep_remove_has_pending_flag(ep); + } + return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; } @@ -1221,7 +1480,7 @@ uct_ud_ep_do_pending(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *elem, ucs_status_t uct_ud_ep_pending_add(uct_ep_h ep_h, uct_pending_req_t *req, unsigned flags) { - uct_ud_ep_t *ep = ucs_derived_of(ep_h, uct_ud_ep_t); + uct_ud_ep_t *ep = ucs_derived_of(ep_h, uct_ud_ep_t); uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t); @@ -1239,7 +1498,7 @@ ucs_status_t uct_ud_ep_pending_add(uct_ep_h ep_h, uct_pending_req_t *req, if (uct_ud_iface_can_tx(iface) && uct_ud_iface_has_skbs(iface) && - uct_ud_ep_is_connected(ep) && + uct_ud_ep_is_connected_and_no_pending(ep) && !uct_ud_ep_no_window(ep)) { uct_ud_leave(iface); @@ -1250,6 +1509,7 @@ ucs_status_t uct_ud_ep_pending_add(uct_ep_h ep_h, uct_pending_req_t *req, UCS_STATIC_ASSERT(sizeof(uct_ud_pending_req_priv_t) <= UCT_PENDING_REQ_PRIV_LEN); uct_ud_pending_req_priv(req)->flags = flags; + uct_ud_ep_set_has_pending_flag(ep); uct_pending_req_arb_group_push(&ep->tx.pending.group, req); ucs_arbiter_group_schedule(&iface->tx.pending_q, &ep->tx.pending.group); ucs_trace_data("ud ep %p: added pending req %p tx_psn %d acked_psn %d cwnd %d", @@ -1261,19 +1521,23 @@ ucs_status_t uct_ud_ep_pending_add(uct_ep_h ep_h, uct_pending_req_t *req, } static ucs_arbiter_cb_result_t -uct_ud_ep_pending_purge_cb(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *elem, - void *arg) +uct_ud_ep_pending_purge_cb(ucs_arbiter_t *arbiter, 
ucs_arbiter_group_t *group, + ucs_arbiter_elem_t *elem, void *arg) { - uct_ud_ep_t *ep = ucs_container_of(ucs_arbiter_elem_group(elem), - uct_ud_ep_t, tx.pending.group); + uct_ud_ep_t *ep = ucs_container_of(group, uct_ud_ep_t, + tx.pending.group); uct_purge_cb_args_t *cb_args = arg; uct_pending_purge_callback_t cb = cb_args->cb; uct_pending_req_t *req; + int is_last_pending_elem; if (&ep->tx.pending.elem == elem) { /* return ignored by arbiter */ return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; } + + is_last_pending_elem = uct_ud_ep_is_last_pending_elem(ep, elem); + req = ucs_container_of(elem, uct_pending_req_t, priv); if (cb) { cb(req, cb_args->arg); @@ -1281,6 +1545,10 @@ uct_ud_ep_pending_purge_cb(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *elem, ucs_debug("ep=%p cancelling user pending request %p", ep, req); } + if (is_last_pending_elem) { + uct_ud_ep_remove_has_pending_flag(ep); + } + /* return ignored by arbiter */ return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; } @@ -1289,29 +1557,29 @@ uct_ud_ep_pending_purge_cb(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *elem, void uct_ud_ep_pending_purge(uct_ep_h ep_h, uct_pending_purge_callback_t cb, void *arg) { - uct_ud_ep_t *ep = ucs_derived_of(ep_h, uct_ud_ep_t); - uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, - uct_ud_iface_t); + uct_ud_ep_t *ep = ucs_derived_of(ep_h, uct_ud_ep_t); + uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_ud_iface_t); uct_purge_cb_args_t args = {cb, arg}; uct_ud_enter(iface); ucs_arbiter_group_purge(&iface->tx.pending_q, &ep->tx.pending.group, uct_ud_ep_pending_purge_cb, &args); if (uct_ud_ep_ctl_op_isany(ep)) { - ucs_arbiter_group_push_elem(&ep->tx.pending.group, - &ep->tx.pending.elem); - ucs_arbiter_group_schedule(&iface->tx.pending_q, &ep->tx.pending.group); + uct_ud_ep_ctl_op_schedule(iface, ep); } uct_ud_leave(iface); } -void uct_ud_ep_disconnect(uct_ep_h tl_ep) +void uct_ud_ep_disconnect(uct_ep_h tl_ep) { uct_ud_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_ep_t); 
uct_ud_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ud_iface_t); ucs_debug("ep %p: disconnect", ep); + uct_ud_enter(iface); + /* cancel user pending */ uct_ud_ep_pending_purge(tl_ep, NULL, NULL); @@ -1319,10 +1587,12 @@ void uct_ud_ep_disconnect(uct_ep_h tl_ep) uct_ud_ep_flush(tl_ep, 0, NULL); /* the EP will be destroyed by interface destroy or timeout in - * uct_ud_ep_slow_timer + * uct_ud_ep_timer */ - ep->close_time = ucs_twheel_get_time(&iface->async.slow_timer); + ep->close_time = ucs_twheel_get_time(&iface->tx.timer); ep->flags |= UCT_UD_EP_FLAG_DISCONNECTED; - ucs_wtimer_add(&iface->async.slow_timer, &ep->slow_timer, + ucs_wtimer_add(&iface->tx.timer, &ep->timer, UCT_UD_SLOW_TIMER_MAX_TICK(iface)); + + uct_ud_leave(iface); } diff --git a/src/uct/ib/ud/base/ud_ep.h b/src/uct/ib/ud/base/ud_ep.h index 206e8100486..143185d2888 100644 --- a/src/uct/ib/ud/base/ud_ep.h +++ b/src/uct/ib/ud/base/ud_ep.h @@ -1,5 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -44,7 +45,7 @@ typedef ucs_status_t (*uct_ud_ep_hook_t)(uct_ud_ep_t *ep, uct_ud_neth_t *neth); -#define UCT_UD_EP_HOOK_DECLARE(name) uct_ud_ep_hook_t name +#define UCT_UD_EP_HOOK_DECLARE(name) uct_ud_ep_hook_t name; #define UCT_UD_EP_HOOK_CALL_RX(ep, neth, len) \ if ((ep)->rx.rx_hook(ep, neth) != UCS_OK) { \ @@ -167,11 +168,13 @@ enum { UCT_UD_EP_OP_ACK_REQ = UCS_BIT(1), /* request ack of sent packets */ UCT_UD_EP_OP_RESEND = UCS_BIT(2), /* resend un acked packets */ UCT_UD_EP_OP_CREP = UCS_BIT(3), /* send connection reply */ - UCT_UD_EP_OP_CREQ = UCS_BIT(4) /* send connection request */ + UCT_UD_EP_OP_CREQ = UCS_BIT(4), /* send connection request */ + UCT_UD_EP_OP_NACK = UCS_BIT(5), /* send NACK */ }; #define UCT_UD_EP_OP_CTL_LOW_PRIO (UCT_UD_EP_OP_ACK_REQ|UCT_UD_EP_OP_ACK) #define UCT_UD_EP_OP_CTL_HI_PRIO (UCT_UD_EP_OP_CREQ|UCT_UD_EP_OP_CREP|UCT_UD_EP_OP_RESEND) +#define UCT_UD_EP_OP_CTL_ACK (UCT_UD_EP_OP_ACK|UCT_UD_EP_OP_ACK_REQ|UCT_UD_EP_OP_NACK) typedef struct uct_ud_ep_pending_op { ucs_arbiter_group_t group; @@ -185,23 +188,24 @@ enum { /* TODO: optimize endpoint memory footprint */ enum { - UCT_UD_EP_FLAG_ASYNC_COMPS = UCS_BIT(0), /* set if there are completions that - * were picked by async thread and queued */ - UCT_UD_EP_FLAG_DISCONNECTED = UCS_BIT(1), /* set if the endpoint was disconnected */ - UCT_UD_EP_FLAG_PRIVATE = UCS_BIT(2), /* EP is was created as internal */ + UCT_UD_EP_FLAG_DISCONNECTED = UCS_BIT(0), /* EP was disconnected */ + UCT_UD_EP_FLAG_PRIVATE = UCS_BIT(1), /* EP was created as internal */ + UCT_UD_EP_FLAG_HAS_PENDING = UCS_BIT(2), /* EP has some pending requests */ + UCT_UD_EP_FLAG_CONNECTED = UCS_BIT(3), /* EP was connected to the peer */ /* debug flags */ - UCT_UD_EP_FLAG_CREQ_RCVD = UCS_BIT(3), /* CREQ message was received */ - UCT_UD_EP_FLAG_CREP_RCVD = UCS_BIT(4), /* CREP message was received */ - UCT_UD_EP_FLAG_CREQ_SENT = UCS_BIT(5), /* CREQ message was sent */ - UCT_UD_EP_FLAG_CREP_SENT = UCS_BIT(6), /* CREP message was 
sent */ - UCT_UD_EP_FLAG_CREQ_NOTSENT = UCS_BIT(7), /* CREQ message is NOT sent, because + UCT_UD_EP_FLAG_CREQ_RCVD = UCS_BIT(4), /* CREQ message was received */ + UCT_UD_EP_FLAG_CREP_RCVD = UCS_BIT(5), /* CREP message was received */ + UCT_UD_EP_FLAG_CREQ_SENT = UCS_BIT(6), /* CREQ message was sent */ + UCT_UD_EP_FLAG_CREP_SENT = UCS_BIT(7), /* CREP message was sent */ + UCT_UD_EP_FLAG_CREQ_NOTSENT = UCS_BIT(8), /* CREQ message is NOT sent, because connection establishment process is driven by remote side. */ + UCT_UD_EP_FLAG_TX_NACKED = UCS_BIT(9), /* Last psn was acked with NAK */ /* Endpoint is currently executing the pending queue */ -#if ENABLE_ASSERT - UCT_UD_EP_FLAG_IN_PENDING = UCS_BIT(8) +#if UCS_ENABLE_ASSERT + UCT_UD_EP_FLAG_IN_PENDING = UCS_BIT(10) #else UCT_UD_EP_FLAG_IN_PENDING = 0 #endif @@ -217,23 +221,24 @@ struct uct_ud_ep { uint32_t ep_id; uint32_t dest_ep_id; struct { - uct_ud_psn_t psn; /* Next PSN to send */ - uct_ud_psn_t max_psn; /* Largest PSN that can be sent */ - uct_ud_psn_t acked_psn; /* last psn that was acked by remote side */ - uint16_t err_skb_count;/* number of failed SKBs on the ep */ - ucs_queue_head_t window; /* send window: [acked_psn+1, psn-1] */ - uct_ud_ep_pending_op_t pending; /* pending ops */ - ucs_time_t send_time; /* tx time of last packet */ - ucs_time_t slow_tick; /* timeout to trigger slow timer */ - UCS_STATS_NODE_DECLARE(stats); - UCT_UD_EP_HOOK_DECLARE(tx_hook); + uct_ud_psn_t psn; /* Next PSN to send */ + uct_ud_psn_t max_psn; /* Largest PSN that can be sent */ + uct_ud_psn_t acked_psn; /* last psn that was acked by remote side */ + uint16_t resend_count; /* number of in-flight resends on the ep */ + ucs_queue_head_t window; /* send window: [acked_psn+1, psn-1] */ + uct_ud_ep_pending_op_t pending; /* pending ops */ + ucs_time_t send_time; /* tx time of last packet */ + ucs_time_t resend_time; /* tx time of last resent packet */ + ucs_time_t tick; /* timeout to trigger timer */ + UCS_STATS_NODE_DECLARE(stats) + 
UCT_UD_EP_HOOK_DECLARE(tx_hook) } tx; struct { uct_ud_psn_t acked_psn; /* Last psn we acked */ ucs_frag_list_t ooo_pkts; /* Out of order packets that can not be processed yet, also keeps last psn we successfully received and processed */ - UCS_STATS_NODE_DECLARE(stats); - UCT_UD_EP_HOOK_DECLARE(rx_hook); + UCS_STATS_NODE_DECLARE(stats) + UCT_UD_EP_HOOK_DECLARE(rx_hook) } rx; struct { uct_ud_psn_t wmax; @@ -247,21 +252,30 @@ struct uct_ud_ep { ucs_list_link_t cep_list; uint32_t conn_id; /* connection id. assigned in connect_to_iface() */ uint16_t flags; - uint8_t path_bits; uint8_t rx_creq_count; /* TODO: remove when reason for DUP/OOO CREQ is found */ - ucs_wtimer_t slow_timer; + uint8_t path_index; + ucs_wtimer_t timer; ucs_time_t close_time; /* timestamp of closure */ #if HAVE_HNS_ROCE ucs_queue_head_t pending_skb; #endif - UCS_STATS_NODE_DECLARE(stats); - UCT_UD_EP_HOOK_DECLARE(timer_hook); + UCS_STATS_NODE_DECLARE(stats) + UCT_UD_EP_HOOK_DECLARE(timer_hook) #if ENABLE_DEBUG_DATA uct_ud_peer_name_t peer; #endif }; -UCS_CLASS_DECLARE(uct_ud_ep_t, uct_ud_iface_t*) +#if ENABLE_DEBUG_DATA +# define UCT_UD_EP_PEER_NAME_FMT "%s:%d" +# define UCT_UD_EP_PEER_NAME_ARG(_ep) (_ep)->peer.name, (_ep)->peer.pid +#else +# define UCT_UD_EP_PEER_NAME_FMT "%s" +# define UCT_UD_EP_PEER_NAME_ARG(_ep) "" +#endif + + +UCS_CLASS_DECLARE(uct_ud_ep_t, uct_ud_iface_t*, const uct_ep_params_t*) /** * UD pending request private data @@ -280,7 +294,7 @@ uct_ud_pending_req_priv(uct_pending_req_t *req) void uct_ud_tx_wnd_purge_outstanding(uct_ud_iface_t *iface, uct_ud_ep_t *ud_ep, - ucs_status_t status); + ucs_status_t status, int is_async); ucs_status_t uct_ud_ep_flush(uct_ep_h ep, unsigned flags, uct_completion_t *comp); @@ -302,11 +316,14 @@ void uct_ud_ep_pending_purge(uct_ep_h ep, uct_pending_purge_callback_t cb, void uct_ud_ep_disconnect(uct_ep_h ep); +void uct_ud_ep_window_release_completed(uct_ud_ep_t *ep, int is_async); + /* helper function to create/destroy new connected ep */ 
ucs_status_t uct_ud_ep_create_connected_common(uct_ud_iface_t *iface, const uct_ib_address_t *ib_addr, const uct_ud_iface_addr_t *if_addr, + unsigned path_index, uct_ud_ep_t **new_ep_p, uct_ud_send_skb_t **skb_p); @@ -317,8 +334,8 @@ void uct_ud_ep_destroy_connected(uct_ud_ep_t *ep, uct_ud_send_skb_t *uct_ud_ep_prepare_creq(uct_ud_ep_t *ep); ucs_arbiter_cb_result_t -uct_ud_ep_do_pending(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *elem, - void *arg); +uct_ud_ep_do_pending(ucs_arbiter_t *arbiter, ucs_arbiter_group_t *group, + ucs_arbiter_elem_t *elem, void *arg); void uct_ud_ep_clone(uct_ud_ep_t *old_ep, uct_ud_ep_t *new_ep); @@ -341,22 +358,6 @@ void uct_ud_ep_process_rx(uct_ud_iface_t *iface, uct_ud_recv_skb_t *skb, int is_async); -static UCS_F_ALWAYS_INLINE void -uct_ud_neth_ctl_ack(uct_ud_ep_t *ep, uct_ud_neth_t *neth) -{ - neth->psn = ep->tx.psn; - neth->ack_psn = ep->rx.acked_psn = ucs_frag_list_sn(&ep->rx.ooo_pkts); - neth->packet_type = ep->dest_ep_id; -} - -static UCS_F_ALWAYS_INLINE void -uct_ud_neth_ctl_ack_req(uct_ud_ep_t *ep, uct_ud_neth_t *neth) -{ - neth->psn = ep->tx.psn; - neth->ack_psn = ep->rx.acked_psn = ucs_frag_list_sn(&ep->rx.ooo_pkts); - neth->packet_type = ep->dest_ep_id|UCT_UD_PACKET_FLAG_ACK_REQ; -} - static UCS_F_ALWAYS_INLINE void uct_ud_neth_init_data(uct_ud_ep_t *ep, uct_ud_neth_t *neth) { @@ -364,8 +365,6 @@ uct_ud_neth_init_data(uct_ud_ep_t *ep, uct_ud_neth_t *neth) neth->ack_psn = ep->rx.acked_psn = ucs_frag_list_sn(&ep->rx.ooo_pkts); } - - static inline int uct_ud_ep_compare(uct_ud_ep_t *a, uct_ud_ep_t *b) { return a->conn_id - b->conn_id; @@ -407,11 +406,20 @@ uct_ud_ep_ctl_op_check_ex(uct_ud_ep_t *ep, uint32_t ops) ((ep->tx.pending.ops & ~ops) == 0); } - -/* TODO: relay on window check instead. max_psn = psn */ +/* TODO: rely on window check instead. 
max_psn = psn */ static UCS_F_ALWAYS_INLINE int uct_ud_ep_is_connected(uct_ud_ep_t *ep) { - return ep->dest_ep_id != UCT_UD_EP_NULL_ID; + ucs_assert((ep->dest_ep_id == UCT_UD_EP_NULL_ID) == + !(ep->flags & UCT_UD_EP_FLAG_CONNECTED)); + return ep->flags & UCT_UD_EP_FLAG_CONNECTED; +} + +static UCS_F_ALWAYS_INLINE int +uct_ud_ep_is_connected_and_no_pending(uct_ud_ep_t *ep) +{ + return (ep->flags & (UCT_UD_EP_FLAG_CONNECTED | + UCT_UD_EP_FLAG_HAS_PENDING)) + == UCT_UD_EP_FLAG_CONNECTED; } static UCS_F_ALWAYS_INLINE int uct_ud_ep_no_window(uct_ud_ep_t *ep) diff --git a/src/uct/ib/ud/base/ud_iface.c b/src/uct/ib/ud/base/ud_iface.c index d3b0bcad103..e1ab377e283 100644 --- a/src/uct/ib/ud/base/ud_iface.c +++ b/src/uct/ib/ud/base/ud_iface.c @@ -1,10 +1,14 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ud_iface.h" #include "ud_ep.h" #include "ud_inl.h" @@ -15,13 +19,9 @@ #include #include #include -#include -#define UCT_UD_IPV4_ADDR_LEN sizeof(struct in_addr) -#define UCT_UD_IPV6_ADDR_LEN sizeof(struct in6_addr) - -#if ENABLE_STATS +#ifdef ENABLE_STATS static ucs_stats_class_t uct_ud_iface_stats_class = { .name = "ud_iface", .num_counters = UCT_UD_IFACE_STAT_LAST, @@ -31,14 +31,12 @@ static ucs_stats_class_t uct_ud_iface_stats_class = { }; #endif +/* cppcheck-suppress ctunullpointer */ SGLIB_DEFINE_LIST_FUNCTIONS(uct_ud_iface_peer_t, uct_ud_iface_peer_cmp, next) SGLIB_DEFINE_HASHED_CONTAINER_FUNCTIONS(uct_ud_iface_peer_t, UCT_UD_HASH_SIZE, uct_ud_iface_peer_hash) -static void uct_ud_iface_free_resend_skbs(uct_ud_iface_t *iface); -static void uct_ud_iface_timer(int timer_id, void *arg); - static void uct_ud_iface_free_pending_rx(uct_ud_iface_t *iface); static void uct_ud_iface_free_async_comps(uct_ud_iface_t *iface); @@ -84,26 +82,29 @@ void uct_ud_iface_cep_cleanup(uct_ud_iface_t *iface) static uct_ud_iface_peer_t * uct_ud_iface_cep_lookup_addr(uct_ud_iface_t *iface, uint16_t dlid, - const union ibv_gid *dgid, uint32_t dest_qpn) + const union ibv_gid *dgid, uint32_t dest_qpn, + int path_index) { uct_ud_iface_peer_t key; - key.dlid = dlid; - key.dgid = *dgid; - key.dst_qpn = dest_qpn; + key.dlid = dlid; + key.dgid = *dgid; + key.dst_qpn = dest_qpn; + key.path_index = path_index; return sglib_hashed_uct_ud_iface_peer_t_find_member(iface->peers, &key); } static uct_ud_iface_peer_t * uct_ud_iface_cep_lookup_peer(uct_ud_iface_t *iface, const uct_ib_address_t *src_ib_addr, - const uct_ud_iface_addr_t *src_if_addr) + const uct_ud_iface_addr_t *src_if_addr, + int path_index) { uint32_t dest_qpn = uct_ib_unpack_uint24(src_if_addr->qp_num); - union ibv_gid dgid; - uint16_t dlid; + uct_ib_address_pack_params_t params; - uct_ib_address_unpack(src_ib_addr, &dlid, &dgid); - return 
uct_ud_iface_cep_lookup_addr(iface, dlid, &dgid, dest_qpn); + uct_ib_address_unpack(src_ib_addr, ¶ms); + return uct_ud_iface_cep_lookup_addr(iface, params.lid, ¶ms.gid, + dest_qpn, path_index); } static uct_ud_ep_t * @@ -145,25 +146,27 @@ uct_ud_iface_cep_getid(uct_ud_iface_peer_t *peer, uint32_t conn_id) ucs_status_t uct_ud_iface_cep_insert(uct_ud_iface_t *iface, const uct_ib_address_t *src_ib_addr, const uct_ud_iface_addr_t *src_if_addr, - uct_ud_ep_t *ep, uint32_t conn_id) + uct_ud_ep_t *ep, uint32_t conn_id, + int path_index) { uint32_t dest_qpn = uct_ib_unpack_uint24(src_if_addr->qp_num); + uct_ib_address_pack_params_t params; uct_ud_iface_peer_t *peer; - union ibv_gid dgid; uct_ud_ep_t *cep; - uint16_t dlid; - uct_ib_address_unpack(src_ib_addr, &dlid, &dgid); - peer = uct_ud_iface_cep_lookup_addr(iface, dlid, &dgid, dest_qpn); + uct_ib_address_unpack(src_ib_addr, ¶ms); + peer = uct_ud_iface_cep_lookup_addr(iface, params.lid, ¶ms.gid, + dest_qpn, path_index); if (peer == NULL) { peer = malloc(sizeof *peer); if (peer == NULL) { return UCS_ERR_NO_MEMORY; } - peer->dlid = dlid; - peer->dgid = dgid; - peer->dst_qpn = dest_qpn; + peer->dlid = params.lid; + peer->dgid = params.gid; + peer->dst_qpn = dest_qpn; + peer->path_index = path_index; sglib_hashed_uct_ud_iface_peer_t_add(iface->peers, peer); ucs_list_head_init(&peer->ep_list); peer->conn_id_last = 0; @@ -175,8 +178,8 @@ ucs_status_t uct_ud_iface_cep_insert(uct_ud_iface_t *iface, } if (ucs_list_is_empty(&peer->ep_list)) { - ucs_list_add_head(&peer->ep_list, &ep->cep_list); - return UCS_OK; + ucs_list_add_head(&peer->ep_list, &ep->cep_list); + return UCS_OK; } ucs_list_for_each(cep, &peer->ep_list, cep_list) { ucs_assert_always(cep->conn_id != ep->conn_id); @@ -201,12 +204,13 @@ void uct_ud_iface_cep_remove(uct_ud_ep_t *ep) uct_ud_ep_t *uct_ud_iface_cep_lookup(uct_ud_iface_t *iface, const uct_ib_address_t *src_ib_addr, const uct_ud_iface_addr_t *src_if_addr, - uint32_t conn_id) + uint32_t conn_id, int 
path_index) { uct_ud_iface_peer_t *peer; uct_ud_ep_t *ep; - peer = uct_ud_iface_cep_lookup_peer(iface, src_ib_addr, src_if_addr); + peer = uct_ud_iface_cep_lookup_peer(iface, src_ib_addr, src_if_addr, + path_index); if (peer == NULL) { return NULL; } @@ -225,7 +229,8 @@ void uct_ud_iface_cep_rollback(uct_ud_iface_t *iface, { uct_ud_iface_peer_t *peer; - peer = uct_ud_iface_cep_lookup_peer(iface, src_ib_addr, src_if_addr); + peer = uct_ud_iface_cep_lookup_peer(iface, src_ib_addr, src_if_addr, + ep->path_index); ucs_assert_always(peer != NULL); ucs_assert_always(peer->conn_id_last > 0); ucs_assert_always(ep->conn_id + 1 == peer->conn_id_last); @@ -240,16 +245,16 @@ static void uct_ud_iface_send_skb_init(uct_iface_h tl_iface, void *obj, uct_mem_h memh) { uct_ud_send_skb_t *skb = obj; - uct_ib_mem_t *ib_memh = memh; - skb->lkey = ib_memh->lkey; - skb->flags = 0; + skb->lkey = uct_ib_memh_get_lkey(memh); + skb->flags = UCT_UD_SEND_SKB_FLAG_INVALID; } static ucs_status_t uct_ud_iface_create_qp(uct_ud_iface_t *self, const uct_ud_iface_config_t *config) { - uct_ib_qp_attr_t qp_init_attr = {}; + uct_ud_iface_ops_t *ops = ucs_derived_of(self->super.ops, uct_ud_iface_ops_t); + uct_ib_qp_attr_t qp_init_attr = {}; struct ibv_qp_attr qp_attr; static ucs_status_t status; int ret; @@ -260,10 +265,9 @@ uct_ud_iface_create_qp(uct_ud_iface_t *self, const uct_ud_iface_config_t *config qp_init_attr.cap.max_recv_wr = config->super.rx.queue_len; qp_init_attr.cap.max_send_sge = 2; qp_init_attr.cap.max_recv_sge = 1; - qp_init_attr.cap.max_inline_data = ucs_max(config->super.tx.min_inline, - UCT_UD_MIN_INLINE); + qp_init_attr.cap.max_inline_data = config->super.tx.min_inline; - status = self->super.ops->create_qp(&self->super, &qp_init_attr, &self->qp); + status = ops->create_qp(&self->super, &qp_init_attr, &self->qp); if (status != UCS_OK) { return status; } @@ -273,7 +277,6 @@ uct_ud_iface_create_qp(uct_ud_iface_t *self, const uct_ud_iface_config_t *config #else self->config.max_inline = 
qp_init_attr.cap.max_inline_data; #endif - uct_ib_iface_set_max_iov(&self->super, qp_init_attr.cap.max_send_sge); memset(&qp_attr, 0, sizeof(qp_attr)); /* Modify QP to INIT state */ @@ -307,80 +310,156 @@ uct_ud_iface_create_qp(uct_ud_iface_t *self, const uct_ud_iface_config_t *config return UCS_OK; err_destroy_qp: - ibv_destroy_qp(self->qp); + uct_ib_destroy_qp(self->qp); return UCS_ERR_INVALID_PARAM; } +static inline void uct_ud_iface_async_progress(uct_ud_iface_t *iface) +{ + uct_ud_iface_ops_t *ops = + ucs_derived_of(iface->super.ops, uct_ud_iface_ops_t); + unsigned ev_count; + + if (ucs_unlikely(iface->async.disable)) { + return; + } + + ev_count = ops->async_progress(iface); + if (ev_count > 0) { + uct_ud_iface_raise_pending_async_ev(iface); + } +} + +static void uct_ud_iface_async_handler(int fd, int events, void *arg) +{ + uct_ud_iface_t *iface = arg; + + uct_ud_iface_async_progress(iface); + + /* arm for new solicited events + * if user asks to provide notifications for all completion + * events by calling uct_iface_event_arm(), RX CQ will be + * armed again with solicited flag = 0 */ + uct_ib_iface_pre_arm(&iface->super); + iface->super.ops->arm_cq(&iface->super, UCT_IB_DIR_RX, 1); + + ucs_assert(iface->async.event_cb != NULL); + /* notify user */ + iface->async.event_cb(iface->async.event_arg, 0); +} + +static void uct_ud_iface_timer(int timer_id, int events, void *arg) +{ + uct_ud_iface_t *iface = arg; + + uct_ud_iface_async_progress(iface); +} + ucs_status_t uct_ud_iface_complete_init(uct_ud_iface_t *iface) { ucs_async_context_t *async = iface->super.super.worker->async; ucs_async_mode_t async_mode = async->mode; ucs_status_t status; + int event_fd; - iface->tx.resend_skbs_quota = iface->tx.available; - - /* TODO: make tick configurable */ - iface->async.slow_tick = ucs_time_from_msec(100); - status = ucs_twheel_init(&iface->async.slow_timer, - iface->async.slow_tick / 4, - uct_ud_iface_get_async_time(iface)); + status = 
ucs_twheel_init(&iface->tx.timer, iface->tx.tick / 4, + uct_ud_iface_get_time(iface)); if (status != UCS_OK) { goto err; } - status = ucs_async_add_timer(async_mode, iface->async.slow_tick, - uct_ud_iface_timer, iface, async, - &iface->async.timer_id); + status = uct_ib_iface_event_fd_get(&iface->super.super.super, &event_fd); if (status != UCS_OK) { goto err_twheel_cleanup; } + if (iface->async.event_cb != NULL) { + status = ucs_async_set_event_handler(async_mode, event_fd, + UCS_EVENT_SET_EVREAD | + UCS_EVENT_SET_EVERR, + uct_ud_iface_async_handler, + iface, async); + if (status != UCS_OK) { + goto err_twheel_cleanup; + } + + status = iface->super.ops->arm_cq(&iface->super, UCT_IB_DIR_RX, 1); + if (status != UCS_OK) { + goto err_twheel_cleanup; + } + } + return UCS_OK; err_twheel_cleanup: - ucs_twheel_cleanup(&iface->async.slow_timer); + ucs_twheel_cleanup(&iface->tx.timer); err: return status; } void uct_ud_iface_remove_async_handlers(uct_ud_iface_t *iface) { - uct_base_iface_progress_disable(&iface->super.super.super, - UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); - ucs_async_remove_handler(iface->async.timer_id, 1); + ucs_status_t status; + int event_fd; + + uct_ud_iface_progress_disable(&iface->super.super.super, + UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); + if (iface->async.event_cb != NULL) { + status = uct_ib_iface_event_fd_get(&iface->super.super.super, + &event_fd); + if (status == UCS_OK) { + ucs_async_remove_handler(event_fd, 1); + } + } } -/* Calculate real GIDs len. Can be either 16 (RoCEv1 or RoCEv2/IPv6) - * or 4 (RoCEv2/IPv4). This len is used for packets filtering by DGIDs. - * - * According to Annex17_RoCEv2 (A17.4.5.2): - * "The first 40 bytes of user posted UD Receive Buffers are reserved for the L3 - * header of the incoming packet (as per the InfiniBand Spec Section 11.4.1.2). - * In RoCEv2, this area is filled up with the IP header. IPv6 header uses the - * entire 40 bytes. 
IPv4 headers use the 20 bytes in the second half of the - * reserved 40 bytes area (i.e. offset 20 from the beginning of the receive - * buffer). In this case, the content of the first 20 bytes is undefined." */ -static void uct_ud_iface_calc_gid_len(uct_ud_iface_t *iface) +static ucs_status_t uct_ud_iface_gid_hash_init(uct_ud_iface_t *iface, + uct_md_h md) { - uint16_t *local_gid_u16 = (uint16_t*)iface->super.gid.raw; + static const union ibv_gid zero_gid = { .raw = {0} }; + uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev; + int port = iface->super.config.port_num; + uct_ib_device_gid_info_t gid_info; + int gid_idx, gid_tbl_len, kh_ret; + ucs_status_t status; + char gid_str[128]; - /* Make sure that daddr in IPv4 resides in the last 4 bytes in GRH */ - UCS_STATIC_ASSERT((UCT_IB_GRH_LEN - (20 + offsetof(struct iphdr, daddr))) == - UCT_UD_IPV4_ADDR_LEN); + kh_init_inplace(uct_ud_iface_gid, &iface->gid_table.hash); - /* Make sure that dgid resides in the last 16 bytes in GRH */ - UCS_STATIC_ASSERT((UCT_IB_GRH_LEN - offsetof(struct ibv_grh, dgid)) == - UCT_UD_IPV6_ADDR_LEN); - - /* IPv4 mapped to IPv6 looks like: 0000:0000:0000:0000:0000:ffff:????:????, - * so check for leading zeroes and verify that 11-12 bytes are 0xff. - * Otherwise either RoCEv1 or RoCEv2/IPv6 are used. 
*/ - if (local_gid_u16[0] == 0x0000) { - ucs_assert_always(local_gid_u16[5] == 0xffff); - iface->config.gid_len = UCT_UD_IPV4_ADDR_LEN; - } else { - iface->config.gid_len = UCT_UD_IPV6_ADDR_LEN; + gid_tbl_len = uct_ib_device_port_attr(dev, port)->gid_tbl_len; + for (gid_idx = 0; gid_idx < gid_tbl_len; ++gid_idx) { + status = uct_ib_device_query_gid_info(dev->ibv_context, + uct_ib_device_name(dev), + port, gid_idx, &gid_info); + if (status != UCS_OK) { + goto err; + } + + if (!memcmp(&gid_info.gid, &zero_gid, sizeof(zero_gid))) { + continue; + } + + ucs_debug("iface %p: adding gid %s to hash on device %s port %d index " + "%d)", iface, uct_ib_gid_str(&gid_info.gid, gid_str, + sizeof(gid_str)), + uct_ib_device_name(dev), port, gid_idx); + kh_put(uct_ud_iface_gid, &iface->gid_table.hash, gid_info.gid, + &kh_ret); + if (kh_ret < 0) { + ucs_error("failed to add gid to hash on device %s port %d index %d", + uct_ib_device_name(dev), port, gid_idx); + status = UCS_ERR_NO_MEMORY; + goto err; + } } + + iface->gid_table.last = zero_gid; + iface->gid_table.last_len = sizeof(zero_gid); + return UCS_OK; + +err: + kh_destroy_inplace(uct_ud_iface_gid, &iface->gid_table.hash); + return status; } UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, @@ -419,9 +498,7 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, init_attr->rx_priv_len = sizeof(uct_ud_recv_skb_t) - sizeof(uct_ib_iface_recv_desc_t); init_attr->rx_hdr_len = UCT_IB_GRH_LEN + sizeof(uct_ud_neth_t); - init_attr->tx_cq_len = config->super.tx.queue_len; - init_attr->rx_cq_len = config->super.rx.queue_len; - init_attr->seg_size = ucs_min(mtu, config->super.super.max_bcopy); + init_attr->seg_size = ucs_min(mtu, config->super.seg_size); init_attr->qp_type = IBV_QPT_UD; UCS_CLASS_CALL_SUPER_INIT(uct_ib_iface_t, &ops->super, md, worker, @@ -434,6 +511,8 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, self->tx.unsignaled = 0; self->tx.available = 
config->super.tx.queue_len; + self->tx.timer_sweep_count = 0; + self->async.disable = 0; self->rx.available = config->super.rx.queue_len; self->rx.quota = 0; @@ -442,24 +521,66 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, self->config.check_grh_dgid = config->dgid_check && uct_ib_iface_is_roce(&self->super); - if (config->slow_timer_backoff <= 0.) { - ucs_error("The slow timer back off should be > 0 (%lf)", - config->slow_timer_backoff); + if ((config->max_window < UCT_UD_CA_MIN_WINDOW) || + (config->max_window > UCT_UD_CA_MAX_WINDOW)) { + ucs_error("Max congestion avoidance window should be >= %d and <= %d (%d)", + UCT_UD_CA_MIN_WINDOW, UCT_UD_CA_MAX_WINDOW, config->max_window); return UCS_ERR_INVALID_PARAM; + } + + self->config.max_window = config->max_window; + + self->rx.async_max_poll = config->rx_async_max_poll; + + if (config->timer_tick <= 0.) { + ucs_error("The timer tick should be > 0 (%lf)", + config->timer_tick); + return UCS_ERR_INVALID_PARAM; + } else { + self->tx.tick = ucs_time_from_sec(config->timer_tick); + } + + if (config->timer_backoff < UCT_UD_MIN_TIMER_TIMER_BACKOFF) { + ucs_error("The timer back off must be >= %lf (%lf)", + UCT_UD_MIN_TIMER_TIMER_BACKOFF, config->timer_backoff); + return UCS_ERR_INVALID_PARAM; + } else { + self->tx.timer_backoff = config->timer_backoff; + } + + if (config->event_timer_tick <= 0.) 
{ + ucs_error("The event timer tick should be > 0 (%lf)", + config->event_timer_tick); + return UCS_ERR_INVALID_PARAM; + } else { + self->async.tick = ucs_time_from_sec(config->event_timer_tick); + } + + if (params->field_mask & UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_CB) { + self->async.event_cb = params->async_event_cb; + } else { + self->async.event_cb = NULL; + } + + if (params->field_mask & UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_ARG) { + self->async.event_arg = params->async_event_arg; } else { - self->config.slow_timer_backoff = config->slow_timer_backoff; + self->async.event_arg = NULL; } + self->async.timer_id = 0; + /* Redefine receive desc release callback */ - self->super.release_desc.cb = uct_ud_iface_release_desc; + self->super.release_desc.cb = uct_ud_iface_release_desc; UCT_UD_IFACE_HOOK_INIT(self); - if (uct_ud_iface_create_qp(self, config) != UCS_OK) { + status = uct_ud_iface_create_qp(self, config); + if (status != UCS_OK) { return UCS_ERR_INVALID_PARAM; } - ucs_ptr_array_init(&self->eps, 0, "ud_eps"); + ucs_ptr_array_init(&self->eps, "ud_eps"); uct_ud_iface_cep_init(self); status = uct_ib_iface_recv_mpool_init(&self->super, &config->super, @@ -475,8 +596,10 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, data_size = sizeof(uct_ud_ctl_hdr_t) + self->super.addr_size; data_size = ucs_max(data_size, self->super.config.seg_size); - data_size = ucs_max(data_size, sizeof(uct_ud_zcopy_desc_t) + self->config.max_inline); - + data_size = ucs_max(data_size, + sizeof(uct_ud_zcopy_desc_t) + self->config.max_inline); + data_size = ucs_max(data_size, + sizeof(uct_ud_ctl_desc_t) + sizeof(uct_ud_neth_t)); status = uct_iface_mpool_init(&self->super.super, &self->tx.mp, sizeof(uct_ud_send_skb_t) + data_size, sizeof(uct_ud_send_skb_t), @@ -487,38 +610,35 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, goto err_rx_mpool; } - ucs_assert_always(data_size >= UCT_UD_MIN_INLINE); - - self->tx.skb = NULL; - 
self->tx.skb_inl.super.len = sizeof(uct_ud_neth_t); - - ucs_queue_head_init(&self->tx.resend_skbs); - self->tx.resend_skbs_quota = 0; + self->tx.skb = NULL; + self->tx.async_before_pending = 0; ucs_arbiter_init(&self->tx.pending_q); - + ucs_queue_head_init(&self->tx.outstanding_q); ucs_queue_head_init(&self->tx.async_comp_q); - ucs_queue_head_init(&self->rx.pending_q); - self->tx.async_before_pending = 0; - - uct_ud_iface_calc_gid_len(self); - status = UCS_STATS_NODE_ALLOC(&self->stats, &uct_ud_iface_stats_class, self->super.super.stats); if (status != UCS_OK) { goto err_tx_mpool; } + status = uct_ud_iface_gid_hash_init(self, md); + if (status != UCS_OK) { + goto err_release_stats; + } + return UCS_OK; +err_release_stats: + UCS_STATS_NODE_FREE(self->stats); err_tx_mpool: ucs_mpool_cleanup(&self->tx.mp, 1); err_rx_mpool: ucs_mpool_cleanup(&self->rx.mp, 1); err_qp: - ibv_destroy_qp(self->qp); + uct_ib_destroy_qp(self->qp); ucs_ptr_array_cleanup(&self->eps); return status; } @@ -529,46 +649,66 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ud_iface_t) /* TODO: proper flush and connection termination */ uct_ud_enter(self); + ucs_twheel_cleanup(&self->tx.timer); ucs_debug("iface(%p): cep cleanup", self); uct_ud_iface_cep_cleanup(self); - uct_ud_iface_free_resend_skbs(self); uct_ud_iface_free_async_comps(self); ucs_mpool_cleanup(&self->tx.mp, 0); /* TODO: qp to error state and cleanup all wqes */ uct_ud_iface_free_pending_rx(self); ucs_mpool_cleanup(&self->rx.mp, 0); - ibv_destroy_qp(self->qp); + uct_ib_destroy_qp(self->qp); ucs_debug("iface(%p): ptr_array cleanup", self); ucs_ptr_array_cleanup(&self->eps); ucs_arbiter_cleanup(&self->tx.pending_q); UCS_STATS_NODE_FREE(self->stats); + kh_destroy_inplace(uct_ud_iface_gid, &self->gid_table.hash); uct_ud_leave(self); } UCS_CLASS_DEFINE(uct_ud_iface_t, uct_ib_iface_t); ucs_config_field_t uct_ud_iface_config_table[] = { - {"IB_", "", NULL, + {UCT_IB_CONFIG_PREFIX, "", NULL, ucs_offsetof(uct_ud_iface_config_t, super), 
UCS_CONFIG_TYPE_TABLE(uct_ib_iface_config_table)}, - {"", "", NULL, + {"UD_", "", NULL, ucs_offsetof(uct_ud_iface_config_t, ud_common), UCS_CONFIG_TYPE_TABLE(uct_ud_iface_common_config_table)}, {"TIMEOUT", "5.0m", "Transport timeout", ucs_offsetof(uct_ud_iface_config_t, peer_timeout), UCS_CONFIG_TYPE_TIME}, - {"SLOW_TIMER_BACKOFF", "2.0", "Timeout multiplier for resending trigger", - ucs_offsetof(uct_ud_iface_config_t, slow_timer_backoff), + {"TIMER_TICK", "10ms", "Initial timeout for retransmissions", + ucs_offsetof(uct_ud_iface_config_t, timer_tick), UCS_CONFIG_TYPE_TIME}, + {"TIMER_BACKOFF", "2.0", + "Timeout multiplier for resending trigger (must be >= " + UCS_PP_MAKE_STRING(UCT_UD_MIN_TIMER_TIMER_BACKOFF) ")", + ucs_offsetof(uct_ud_iface_config_t, timer_backoff), UCS_CONFIG_TYPE_DOUBLE}, + {"ASYNC_TIMER_TICK", "100ms", "Resolution for async timer", + ucs_offsetof(uct_ud_iface_config_t, event_timer_tick), UCS_CONFIG_TYPE_TIME}, {"ETH_DGID_CHECK", "y", - "Enable checking destination GID for incoming packets of Ethernet network\n" + "Enable checking destination GID for incoming packets of Ethernet network.\n" "Mismatched packets are silently dropped.", ucs_offsetof(uct_ud_iface_config_t, dgid_check), UCS_CONFIG_TYPE_BOOL}, + + {"MAX_WINDOW", UCS_PP_MAKE_STRING(UCT_UD_CA_MAX_WINDOW), + "Max congestion avoidance window. 
Should be >= " + UCS_PP_MAKE_STRING(UCT_UD_CA_MIN_WINDOW) " and <= " + UCS_PP_MAKE_STRING(UCT_UD_CA_MAX_WINDOW), + ucs_offsetof(uct_ud_iface_config_t, max_window), UCS_CONFIG_TYPE_UINT}, + + {"RX_ASYNC_MAX_POLL", "64", + "Max number of receive completions to pick during asynchronous TX poll", + ucs_offsetof(uct_ud_iface_config_t, rx_async_max_poll), UCS_CONFIG_TYPE_UINT}, + {NULL} }; -ucs_status_t uct_ud_iface_query(uct_ud_iface_t *iface, uct_iface_attr_t *iface_attr) +ucs_status_t uct_ud_iface_query(uct_ud_iface_t *iface, + uct_iface_attr_t *iface_attr, + size_t am_max_iov, size_t am_max_hdr) { ucs_status_t status; @@ -586,9 +726,10 @@ ucs_status_t uct_ud_iface_query(uct_ud_iface_t *iface, uct_iface_attr_t *iface_a UCT_IFACE_FLAG_PENDING | UCT_IFACE_FLAG_CB_SYNC | UCT_IFACE_FLAG_CB_ASYNC | - UCT_IFACE_FLAG_EVENT_SEND_COMP | - UCT_IFACE_FLAG_EVENT_RECV | UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE; + iface_attr->cap.event_flags = UCT_IFACE_FLAG_EVENT_SEND_COMP | + UCT_IFACE_FLAG_EVENT_RECV | + UCT_IFACE_FLAG_EVENT_ASYNC_CB; iface_attr->cap.am.max_short = uct_ib_iface_hdr_size(iface->config.max_inline, sizeof(uct_ud_neth_t)); @@ -597,8 +738,8 @@ ucs_status_t uct_ud_iface_query(uct_ud_iface_t *iface, uct_iface_attr_t *iface_a iface_attr->cap.am.max_zcopy = iface->super.config.seg_size - sizeof(uct_ud_neth_t); iface_attr->cap.am.align_mtu = uct_ib_mtu_value(uct_ib_iface_port_attr(&iface->super)->active_mtu); iface_attr->cap.am.opt_zcopy_align = UCS_SYS_PCI_MAX_PAYLOAD; - /* The first iov is reserved for the header */ - iface_attr->cap.am.max_iov = uct_ib_iface_get_max_iov(&iface->super) - 1; + iface_attr->cap.am.max_iov = am_max_iov; + iface_attr->cap.am.max_hdr = am_max_hdr; iface_attr->cap.put.max_short = uct_ib_iface_hdr_size(iface->config.max_inline, sizeof(uct_ud_neth_t) + @@ -609,7 +750,7 @@ ucs_status_t uct_ud_iface_query(uct_ud_iface_t *iface, uct_iface_attr_t *iface_a iface_attr->max_conn_priv = 0; /* UD lacks of scatter to CQE support */ - 
iface_attr->latency.overhead += 10e-9; + iface_attr->latency.c += 30e-9; if (iface_attr->cap.am.max_short) { iface_attr->cap.flags |= UCT_IFACE_FLAG_AM_SHORT; @@ -645,7 +786,8 @@ ucs_status_t uct_ud_iface_flush(uct_iface_h tl_iface, unsigned flags, uct_ud_enter(iface); - if (ucs_unlikely(uct_ud_iface_has_pending_async_ev(iface))) { + if (ucs_unlikely(uct_ud_iface_has_pending_async_ev(iface) || + !ucs_queue_is_empty(&iface->tx.outstanding_q))) { UCT_TL_IFACE_STAT_FLUSH_WAIT(&iface->super.super); uct_ud_leave(iface); return UCS_INPROGRESS; @@ -672,15 +814,14 @@ ucs_status_t uct_ud_iface_flush(uct_iface_h tl_iface, unsigned flags, void uct_ud_iface_add_ep(uct_ud_iface_t *iface, uct_ud_ep_t *ep) { - uint32_t prev_gen; - ep->ep_id = ucs_ptr_array_insert(&iface->eps, ep, &prev_gen); + ep->ep_id = ucs_ptr_array_insert(&iface->eps, ep); } void uct_ud_iface_remove_ep(uct_ud_iface_t *iface, uct_ud_ep_t *ep) { if (ep->ep_id != UCT_UD_EP_NULL_ID) { ucs_trace("iface(%p) remove ep: %p id %d", iface, ep, ep->ep_id); - ucs_ptr_array_remove(&iface->eps, ep->ep_id, 0); + ucs_ptr_array_remove(&iface->eps, ep->ep_id); } } @@ -693,100 +834,43 @@ void uct_ud_iface_replace_ep(uct_ud_iface_t *iface, p = ucs_ptr_array_replace(&iface->eps, old_ep->ep_id, new_ep); ucs_assert_always(p == (void *)old_ep); ucs_trace("replace_ep: old(%p) id=%d new(%p) id=%d", old_ep, old_ep->ep_id, new_ep, new_ep->ep_id); - ucs_ptr_array_remove(&iface->eps, new_ep->ep_id, 0); + ucs_ptr_array_remove(&iface->eps, new_ep->ep_id); } - -uct_ud_send_skb_t *uct_ud_iface_resend_skb_get(uct_ud_iface_t *iface) +uct_ud_send_skb_t *uct_ud_iface_ctl_skb_get(uct_ud_iface_t *iface) { - ucs_queue_elem_t *elem; uct_ud_send_skb_t *skb; /* grow reserved skb's queue on-demand */ - if (iface->tx.resend_skbs_quota > 0) { - skb = ucs_mpool_get(&iface->tx.mp); - if (skb == NULL) { - ucs_fatal("failed to allocate control skb"); - } - --iface->tx.resend_skbs_quota; - return skb; - } else { - elem = ucs_queue_pull(&iface->tx.resend_skbs); 
- ucs_assert(elem != NULL); - return ucs_container_of(elem, uct_ud_send_skb_t, queue); + skb = ucs_mpool_get(&iface->tx.mp); + if (skb == NULL) { + ucs_fatal("failed to allocate control skb"); } -} - - -static void uct_ud_iface_free_resend_skbs(uct_ud_iface_t *iface) -{ - uct_ud_send_skb_t *skb; - iface->tx.resend_skbs_quota = 0; - ucs_queue_for_each_extract(skb, &iface->tx.resend_skbs, queue, 1) { - ucs_mpool_put(skb); - } -} - -static void uct_ud_ep_dispatch_err_comp(uct_ud_ep_t *ep, uct_ud_send_skb_t *skb) -{ - uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t); - ucs_status_t status; - - ucs_assert(ep->tx.err_skb_count > 0); - --ep->tx.err_skb_count; - - if ((ep->tx.err_skb_count > 0) || (ep->flags & UCT_UD_EP_FLAG_DISCONNECTED)) { - return; - } - - if (ep->flags & UCT_UD_EP_FLAG_PRIVATE) { - uct_ep_destroy(&ep->super.super); - return; - } - - status = iface->super.ops->set_ep_failed(&iface->super, &ep->super.super, - skb->status); - if (status != UCS_OK) { - ucs_fatal("transport error: %s", ucs_status_string(status)); - } + VALGRIND_MAKE_MEM_DEFINED(&skb->lkey, sizeof(skb->lkey)); + skb->flags = 0; + return skb; } void uct_ud_iface_dispatch_async_comps_do(uct_ud_iface_t *iface) { uct_ud_comp_desc_t *cdesc; - uct_ud_send_skb_t *skb; - uct_ud_ep_t *ep; + uct_ud_send_skb_t *skb; - do { - skb = ucs_queue_pull_elem_non_empty(&iface->tx.async_comp_q, - uct_ud_send_skb_t, queue); + ucs_queue_for_each_extract(skb, &iface->tx.async_comp_q, queue, 1) { + ucs_assert(!(skb->flags & UCT_UD_SEND_SKB_FLAG_RESENDING)); cdesc = uct_ud_comp_desc(skb); - ep = cdesc->ep; - - if (skb->flags & UCT_UD_SEND_SKB_FLAG_COMP) { - ucs_assert(!(ep->flags & UCT_UD_EP_FLAG_DISCONNECTED)); - uct_invoke_completion(cdesc->comp, skb->status); - } - - if (ucs_unlikely(skb->flags & UCT_UD_SEND_SKB_FLAG_ERR)) { - uct_ud_ep_dispatch_err_comp(ep, skb); - } - - ep->flags &= ~UCT_UD_EP_FLAG_ASYNC_COMPS; - skb->flags = 0; - ucs_mpool_put(skb); - } while 
(!ucs_queue_is_empty(&iface->tx.async_comp_q)); + uct_ud_iface_dispatch_comp(iface, cdesc->comp, cdesc->status); + uct_ud_skb_release(skb, 0, 0, NULL); + } } static void uct_ud_iface_free_async_comps(uct_ud_iface_t *iface) { uct_ud_send_skb_t *skb; - while (!ucs_queue_is_empty(&iface->tx.async_comp_q)) { - skb = ucs_queue_pull_elem_non_empty(&iface->tx.async_comp_q, - uct_ud_send_skb_t, queue); - ucs_mpool_put(skb); + ucs_queue_for_each_extract(skb, &iface->tx.async_comp_q, queue, 1) { + uct_ud_skb_release(skb, 0, 0, NULL); } } @@ -827,31 +911,6 @@ static void uct_ud_iface_free_pending_rx(uct_ud_iface_t *iface) } } -static inline void uct_ud_iface_async_progress(uct_ud_iface_t *iface) -{ - unsigned ev_count; - uct_ud_iface_ops_t *ops; - - ops = ucs_derived_of(iface->super.ops, uct_ud_iface_ops_t); - ev_count = ops->async_progress(iface); - if (ev_count > 0) { - uct_ud_iface_raise_pending_async_ev(iface); - } -} - -static void uct_ud_iface_timer(int timer_id, void *arg) -{ - uct_ud_iface_t *iface = arg; - ucs_time_t now; - - uct_ud_enter(iface); - now = uct_ud_iface_get_async_time(iface); - ucs_trace_async("iface(%p) slow_timer_sweep: now %lu", iface, now); - ucs_twheel_sweep(&iface->async.slow_timer, now); - uct_ud_iface_async_progress(iface); - uct_ud_leave(iface); -} - void uct_ud_iface_release_desc(uct_recv_desc_t *self, void *desc) { uct_ud_iface_t *iface = ucs_container_of(self, @@ -862,13 +921,6 @@ void uct_ud_iface_release_desc(uct_recv_desc_t *self, void *desc) uct_ud_leave(iface); } -void uct_ud_iface_handle_failure(uct_ib_iface_t *iface, void *arg, - ucs_status_t status) -{ - uct_ud_tx_wnd_purge_outstanding(ucs_derived_of(iface, uct_ud_iface_t), - (uct_ud_ep_t *)arg, status); -} - ucs_status_t uct_ud_iface_event_arm(uct_iface_h tl_iface, unsigned events) { uct_ud_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_iface_t); @@ -920,15 +972,128 @@ ucs_status_t uct_ud_iface_event_arm(uct_iface_h tl_iface, unsigned events) void 
uct_ud_iface_progress_enable(uct_iface_h tl_iface, unsigned flags) { - uct_ud_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_iface_t); + uct_ud_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_iface_t); + ucs_async_context_t *async = iface->super.super.worker->async; + ucs_async_mode_t async_mode = async->mode; + ucs_status_t status; + + uct_ud_enter(iface); if (flags & UCT_PROGRESS_RECV) { - uct_ud_enter(iface); iface->rx.available += iface->rx.quota; iface->rx.quota = 0; /* let progress (possibly async) post the missing receives */ - uct_ud_leave(iface); } + if (iface->async.timer_id == 0) { + status = ucs_async_add_timer(async_mode, iface->async.tick, + uct_ud_iface_timer, iface, async, + &iface->async.timer_id); + if (status != UCS_OK) { + ucs_fatal("iface(%p): unable to add iface timer handler - %s", + iface, ucs_status_string(status)); + } + ucs_assert(iface->async.timer_id != 0); + } + + uct_ud_leave(iface); + uct_base_iface_progress_enable(tl_iface, flags); } + +void uct_ud_iface_progress_disable(uct_iface_h tl_iface, unsigned flags) +{ + uct_ud_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_iface_t); + ucs_status_t status; + + uct_ud_enter(iface); + + if (iface->async.timer_id != 0) { + status = ucs_async_remove_handler(iface->async.timer_id, 1); + if (status != UCS_OK) { + ucs_fatal("iface(%p): unable to remove iface timer handler (%d) - %s", + iface, iface->async.timer_id, ucs_status_string(status)); + } + iface->async.timer_id = 0; + } + + uct_ud_leave(iface); + + uct_base_iface_progress_disable(tl_iface, flags); +} + +void uct_ud_iface_ctl_skb_complete(uct_ud_iface_t *iface, + uct_ud_ctl_desc_t *cdesc, int is_async) +{ + uct_ud_send_skb_t *resent_skb, *skb; + + skb = cdesc->self_skb; + ucs_assert(!(skb->flags & UCT_UD_SEND_SKB_FLAG_INVALID)); + + resent_skb = cdesc->resent_skb; + ucs_assert(uct_ud_ctl_desc(skb) == cdesc); + + if (resent_skb != NULL) { + ucs_assert(skb->flags & UCT_UD_SEND_SKB_FLAG_CTL_RESEND); + ucs_assert(resent_skb->flags & 
UCT_UD_SEND_SKB_FLAG_RESENDING); + + resent_skb->flags &= ~UCT_UD_SEND_SKB_FLAG_RESENDING; + --cdesc->ep->tx.resend_count; + + uct_ud_ep_window_release_completed(cdesc->ep, is_async); + } else { + ucs_assert(skb->flags & UCT_UD_SEND_SKB_FLAG_CTL_ACK); + } + + uct_ud_skb_release(skb, 0, 0, NULL); + +} + +void uct_ud_iface_send_completion(uct_ud_iface_t *iface, uint16_t sn, + int is_async) +{ + uct_ud_ctl_desc_t *cdesc; + + ucs_queue_for_each_extract(cdesc, &iface->tx.outstanding_q, queue, + UCS_CIRCULAR_COMPARE16(cdesc->sn, <=, sn)) { + uct_ud_iface_ctl_skb_complete(iface, cdesc, is_async); + } +} + +union ibv_gid* uct_ud_grh_get_dgid(struct ibv_grh *grh, size_t dgid_len) +{ + size_t i; + + /* Make sure that daddr in IPv4 resides in the last 4 bytes in GRH */ + UCS_STATIC_ASSERT((UCT_IB_GRH_LEN - (20 + offsetof(struct iphdr, daddr))) == + UCS_IPV4_ADDR_LEN); + + /* Make sure that dgid resides in the last 16 bytes in GRH */ + UCS_STATIC_ASSERT((UCT_IB_GRH_LEN - offsetof(struct ibv_grh, dgid)) == + UCS_IPV6_ADDR_LEN); + + ucs_assert((dgid_len == UCS_IPV4_ADDR_LEN) || + (dgid_len == UCS_IPV6_ADDR_LEN)); + + /* + * According to Annex17_RoCEv2 (A17.4.5.2): + * "The first 40 bytes of user posted UD Receive Buffers are reserved for the L3 + * header of the incoming packet (as per the InfiniBand Spec Section 11.4.1.2). + * In RoCEv2, this area is filled up with the IP header. IPv6 header uses the + * entire 40 bytes. IPv4 headers use the 20 bytes in the second half of the + * reserved 40 bytes area (i.e. offset 20 from the beginning of the receive + * buffer). In this case, the content of the first 20 bytes is undefined. " + */ + if (dgid_len == UCS_IPV4_ADDR_LEN) { + /* IPv4 mapped to IPv6 looks like: 0000:0000:0000:0000:0000:ffff:????:???? 
+ reset begin to make hash function working */ + for (i = 0; i < (sizeof(union ibv_gid) - UCS_IPV4_ADDR_LEN - 2);) { + grh->dgid.raw[i++] = 0x00; + } + + grh->dgid.raw[i++] = 0xff; + grh->dgid.raw[i++] = 0xff; + } + + return &grh->dgid; +} diff --git a/src/uct/ib/ud/base/ud_iface.h b/src/uct/ib/ud/base/ud_iface.h index 40aaf4aebe3..a66d8b143f0 100644 --- a/src/uct/ib/ud/base/ud_iface.h +++ b/src/uct/ib/ud/base/ud_iface.h @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -9,6 +9,7 @@ #define UCT_UD_IFACE_H #include +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include "ud_def.h" #include "ud_ep.h" @@ -25,23 +27,38 @@ BEGIN_C_DECLS -/** @file ud_iface.h */ -#define UCT_UD_MIN_INLINE 48 +#define UCT_UD_MIN_TIMER_TIMER_BACKOFF 1.0 + + +/** @file ud_iface.h */ enum { UCT_UD_IFACE_STAT_RX_DROP, UCT_UD_IFACE_STAT_LAST }; + +/* flags for uct_ud_iface_send_ctl() */ +enum { + UCT_UD_IFACE_SEND_CTL_FLAG_INLINE = UCS_BIT(0), + UCT_UD_IFACE_SEND_CTL_FLAG_SOLICITED = UCS_BIT(1), + UCT_UD_IFACE_SEND_CTL_FLAG_SIGNALED = UCS_BIT(2) +}; + + /* TODO: maybe tx_moderation can be defined at compile-time since tx completions are used only to know how much space is there in tx qp */ typedef struct uct_ud_iface_config { uct_ib_iface_config_t super; uct_ud_iface_common_config_t ud_common; double peer_timeout; - double slow_timer_backoff; + double timer_tick; + double timer_backoff; + double event_timer_tick; int dgid_check; + unsigned max_window; + unsigned rx_async_max_poll; } uct_ud_iface_config_t; @@ -50,34 +67,43 @@ struct uct_ud_iface_peer { union ibv_gid dgid; uint16_t dlid; uint32_t dst_qpn; + uint8_t path_index; uint32_t conn_id_last; ucs_list_link_t ep_list; /* ep list ordered by connection id */ }; -static inline int uct_ud_iface_peer_cmp(uct_ud_iface_peer_t *a, 
uct_ud_iface_peer_t *b) { +static inline int +uct_ud_iface_peer_cmp(uct_ud_iface_peer_t *a, uct_ud_iface_peer_t *b) +{ return (int)a->dst_qpn - (int)b->dst_qpn || memcmp(a->dgid.raw, b->dgid.raw, sizeof(union ibv_gid)) || - (int)a->dlid - (int)b->dlid; + ((int)a->dlid - (int)b->dlid) || + ((int)a->path_index - (int)b->path_index); } -static inline int uct_ud_iface_peer_hash(uct_ud_iface_peer_t *a) { - return (a->dlid + a->dgid.global.interface_id + a->dgid.global.subnet_prefix) - % UCT_UD_HASH_SIZE; + +static inline int uct_ud_iface_peer_hash(uct_ud_iface_peer_t *a) +{ + return (a->dlid + a->dgid.global.interface_id + + a->dgid.global.subnet_prefix + (a->path_index * 137)) % + UCT_UD_HASH_SIZE; } + SGLIB_DEFINE_LIST_PROTOTYPES(uct_ud_iface_peer_t, uct_ud_iface_peer_cmp, next) SGLIB_DEFINE_HASHED_CONTAINER_PROTOTYPES(uct_ud_iface_peer_t, UCT_UD_HASH_SIZE, uct_ud_iface_peer_hash) - #if UCT_UD_EP_DEBUG_HOOKS typedef ucs_status_t (*uct_ud_iface_hook_t)(uct_ud_iface_t *iface, uct_ud_neth_t *neth); + #define UCT_UD_IFACE_HOOK_DECLARE(_name) \ - uct_ud_iface_hook_t _name + uct_ud_iface_hook_t _name; + #define UCT_UD_IFACE_HOOK_CALL_RX(_iface, _neth, _len) \ if ((_iface)->rx.hook(_iface, _neth) != UCS_OK) { \ @@ -85,10 +111,12 @@ typedef ucs_status_t (*uct_ud_iface_hook_t)(uct_ud_iface_t *iface, uct_ud_neth_t return; \ } + #define UCT_UD_IFACE_HOOK_INIT(_iface) { \ (_iface)->rx.hook = uct_ud_iface_null_hook; \ } + static inline ucs_status_t uct_ud_iface_null_hook(uct_ud_iface_t *iface, uct_ud_neth_t *neth) { @@ -103,14 +131,51 @@ static inline ucs_status_t uct_ud_iface_null_hook(uct_ud_iface_t *iface, #endif + typedef struct uct_ud_iface_ops { uct_ib_iface_ops_t super; unsigned (*async_progress)(uct_ud_iface_t *iface); - void (*tx_skb)(uct_ud_ep_t *ep, uct_ud_send_skb_t *skb, - int solicited); + uint16_t (*send_ctl)(uct_ud_ep_t *ud_ep, uct_ud_send_skb_t *skb, + const uct_ud_iov_t *iov, uint16_t iovcnt, + int flags, int max_log_sge); void (*ep_free)(uct_ep_h ep); + 
ucs_status_t (*create_qp)(uct_ib_iface_t *iface, uct_ib_qp_attr_t *attr, + struct ibv_qp **qp_p); } uct_ud_iface_ops_t; + +/* device GIDs set */ +KHASH_TYPE(uct_ud_iface_gid, union ibv_gid, char); + + +static UCS_F_ALWAYS_INLINE +khint32_t uct_ud_iface_kh_gid_hash_func(union ibv_gid gid) +{ + return kh_int64_hash_func(gid.global.subnet_prefix ^ + gid.global.interface_id); +} + + +static UCS_F_ALWAYS_INLINE int +uct_ud_gid_equal(const union ibv_gid *a, const union ibv_gid *b, size_t length) +{ + ucs_assert(length <= sizeof(union ibv_gid)); + return !memcmp(UCS_PTR_BYTE_OFFSET(a, sizeof(*a) - length), + UCS_PTR_BYTE_OFFSET(b, sizeof(*b) - length), length); +} + + +static UCS_F_ALWAYS_INLINE int +uct_ud_iface_kh_gid_hash_equal(union ibv_gid a, union ibv_gid b) +{ + return uct_ud_gid_equal(&a, &b, sizeof(a)); +} + + +KHASH_IMPL(uct_ud_iface_gid, union ibv_gid, char, 0, + uct_ud_iface_kh_gid_hash_func, uct_ud_iface_kh_gid_hash_equal) + + struct uct_ud_iface { uct_ib_iface_t super; struct ibv_qp *qp; @@ -118,48 +183,60 @@ struct uct_ud_iface { ucs_mpool_t mp; unsigned available; unsigned quota; + unsigned async_max_poll; ucs_queue_head_t pending_q; - UCT_UD_IFACE_HOOK_DECLARE(hook); + UCT_UD_IFACE_HOOK_DECLARE(hook) } rx; struct { uct_ud_send_skb_t *skb; /* ready to use skb */ - uct_ud_send_skb_inl_t skb_inl; ucs_mpool_t mp; /* got async events but pending queue was not dispatched */ uint8_t async_before_pending; int16_t available; unsigned unsignaled; - /* pool of skbs that are reserved for retransmissions */ - ucs_queue_head_t resend_skbs; - unsigned resend_skbs_quota; + ucs_queue_head_t outstanding_q; ucs_arbiter_t pending_q; ucs_queue_head_t async_comp_q; + ucs_twheel_t timer; + ucs_time_t tick; + double timer_backoff; + unsigned timer_sweep_count; } tx; struct { ucs_time_t peer_timeout; - double slow_timer_backoff; unsigned tx_qp_len; unsigned max_inline; int check_grh_dgid; - unsigned gid_len; + unsigned max_window; } config; - UCS_STATS_NODE_DECLARE(stats); + 
UCS_STATS_NODE_DECLARE(stats) ucs_ptr_array_t eps; uct_ud_iface_peer_t *peers[UCT_UD_HASH_SIZE]; struct { - ucs_twheel_t slow_timer; - ucs_time_t slow_tick; + ucs_time_t tick; int timer_id; + void *event_arg; + uct_async_event_cb_t event_cb; + unsigned disable; } async; + + /* used for GRH GID filter */ + struct { + union ibv_gid last; + unsigned last_len; + khash_t(uct_ud_iface_gid) hash; + } gid_table; }; + UCS_CLASS_DECLARE(uct_ud_iface_t, uct_ud_iface_ops_t*, uct_md_h, uct_worker_h, const uct_iface_params_t*, const uct_ud_iface_config_t*, uct_ib_iface_init_attr_t*) + struct uct_ud_ctl_hdr { uint8_t type; uint8_t reserved[3]; @@ -167,6 +244,7 @@ struct uct_ud_ctl_hdr { struct { uct_ud_ep_addr_t ep_addr; uint32_t conn_id; + uint8_t path_index; } conn_req; struct { uint32_t src_ep_id; @@ -180,13 +258,19 @@ struct uct_ud_ctl_hdr { extern ucs_config_field_t uct_ud_iface_config_table[]; -ucs_status_t uct_ud_iface_query(uct_ud_iface_t *iface, uct_iface_attr_t *iface_attr); + +ucs_status_t uct_ud_iface_query(uct_ud_iface_t *iface, + uct_iface_attr_t *iface_attr, + size_t am_max_iov, size_t am_max_hdr); + void uct_ud_iface_release_desc(uct_recv_desc_t *self, void *desc); ucs_status_t uct_ud_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t *addr); void uct_ud_iface_add_ep(uct_ud_iface_t *iface, uct_ud_ep_t *ep); + void uct_ud_iface_remove_ep(uct_ud_iface_t *iface, uct_ud_ep_t *ep); + void uct_ud_iface_replace_ep(uct_ud_iface_t *iface, uct_ud_ep_t *old_ep, uct_ud_ep_t *new_ep); ucs_status_t uct_ud_iface_flush(uct_iface_h tl_iface, unsigned flags, @@ -200,69 +284,9 @@ void uct_ud_dump_packet(uct_base_iface_t *iface, uct_am_trace_type_t type, void *data, size_t length, size_t valid_length, char *buffer, size_t max); +union ibv_gid* uct_ud_grh_get_dgid(struct ibv_grh *grh, size_t dgid_len); -static UCS_F_ALWAYS_INLINE int uct_ud_iface_can_tx(uct_ud_iface_t *iface) -{ - return iface->tx.available > 0; -} - -static UCS_F_ALWAYS_INLINE int 
uct_ud_iface_has_skbs(uct_ud_iface_t *iface) -{ - return iface->tx.skb || !ucs_mpool_is_empty(&iface->tx.mp); -} - - -uct_ud_send_skb_t *uct_ud_iface_resend_skb_get(uct_ud_iface_t *iface); - -static inline void -uct_ud_iface_resend_skb_put(uct_ud_iface_t *iface, uct_ud_send_skb_t *skb) -{ - if (skb != ucs_unaligned_ptr(&iface->tx.skb_inl.super)) { - ucs_queue_push(&iface->tx.resend_skbs, &skb->queue); - } -} - -static inline uct_ib_address_t* uct_ud_creq_ib_addr(uct_ud_ctl_hdr_t *conn_req) -{ - ucs_assert(conn_req->type == UCT_UD_PACKET_CREQ); - return (uct_ib_address_t*)(conn_req + 1); -} - -static UCS_F_ALWAYS_INLINE void uct_ud_enter(uct_ud_iface_t *iface) -{ - UCS_ASYNC_BLOCK(iface->super.super.worker->async); -} - -static UCS_F_ALWAYS_INLINE void uct_ud_leave(uct_ud_iface_t *iface) -{ - UCS_ASYNC_UNBLOCK(iface->super.super.worker->async); -} - -static UCS_F_ALWAYS_INLINE int -uct_ud_iface_check_grh(uct_ud_iface_t *iface, void *grh_end, int is_grh_present) -{ - void *dest_gid, *local_gid; - - if (!iface->config.check_grh_dgid) { - return 1; - } - - if (ucs_unlikely(!is_grh_present)) { - ucs_warn("RoCE packet does not contain GRH"); - return 1; - } - - local_gid = (char*)iface->super.gid.raw + (16 - iface->config.gid_len); - dest_gid = (char*)grh_end - iface->config.gid_len; - - if (memcmp(local_gid, dest_gid, iface->config.gid_len)) { - UCS_STATS_UPDATE_COUNTER(iface->stats, UCT_UD_IFACE_STAT_RX_DROP, 1); - ucs_trace_data("Drop packet with wrong dgid"); - return 0; - } - - return 1; -} +uct_ud_send_skb_t *uct_ud_iface_ctl_skb_get(uct_ud_iface_t *iface); /* management of connecting endpoints (cep) @@ -349,7 +373,7 @@ void uct_ud_iface_cep_init(uct_ud_iface_t *iface); uct_ud_ep_t *uct_ud_iface_cep_lookup(uct_ud_iface_t *iface, const uct_ib_address_t *src_ib_addr, const uct_ud_iface_addr_t *src_if_addr, - uint32_t conn_id); + uint32_t conn_id, int path_index); /* remove ep */ void uct_ud_iface_cep_remove(uct_ud_ep_t *ep); @@ -366,10 +390,109 @@ void 
uct_ud_iface_cep_rollback(uct_ud_iface_t *iface, ucs_status_t uct_ud_iface_cep_insert(uct_ud_iface_t *iface, const uct_ib_address_t *src_ib_addr, const uct_ud_iface_addr_t *src_if_addr, - uct_ud_ep_t *ep, uint32_t conn_id); + uct_ud_ep_t *ep, uint32_t conn_id, + int path_index); void uct_ud_iface_cep_cleanup(uct_ud_iface_t *iface); +ucs_status_t uct_ud_iface_dispatch_pending_rx_do(uct_ud_iface_t *iface); + +ucs_status_t uct_ud_iface_event_arm(uct_iface_h tl_iface, unsigned events); + +void uct_ud_iface_progress_enable(uct_iface_h tl_iface, unsigned flags); + +void uct_ud_iface_progress_disable(uct_iface_h tl_iface, unsigned flags); + +void uct_ud_iface_ctl_skb_complete(uct_ud_iface_t *iface, + uct_ud_ctl_desc_t *cdesc, int is_async); + +void uct_ud_iface_send_completion(uct_ud_iface_t *iface, uint16_t sn, + int is_async); + +void uct_ud_iface_dispatch_async_comps_do(uct_ud_iface_t *iface); + + +static UCS_F_ALWAYS_INLINE int uct_ud_iface_can_tx(uct_ud_iface_t *iface) +{ + return iface->tx.available > 0; +} + + +static UCS_F_ALWAYS_INLINE int uct_ud_iface_has_skbs(uct_ud_iface_t *iface) +{ + return iface->tx.skb || !ucs_mpool_is_empty(&iface->tx.mp); +} + + +static inline uct_ib_address_t* uct_ud_creq_ib_addr(uct_ud_ctl_hdr_t *conn_req) +{ + ucs_assert(conn_req->type == UCT_UD_PACKET_CREQ); + return (uct_ib_address_t*)(conn_req + 1); +} + + +static UCS_F_ALWAYS_INLINE void uct_ud_enter(uct_ud_iface_t *iface) +{ + UCS_ASYNC_BLOCK(iface->super.super.worker->async); +} + + +static UCS_F_ALWAYS_INLINE void uct_ud_leave(uct_ud_iface_t *iface) +{ + UCS_ASYNC_UNBLOCK(iface->super.super.worker->async); +} + + +static UCS_F_ALWAYS_INLINE unsigned +uct_ud_grh_get_dgid_len(struct ibv_grh *grh) +{ + static const uint8_t ipmask = 0xf0; + uint8_t ipver = ((*(uint8_t*)grh) & ipmask); + + return (ipver == (6 << 4)) ? 
UCS_IPV6_ADDR_LEN : UCS_IPV4_ADDR_LEN; +} + + +static UCS_F_ALWAYS_INLINE int +uct_ud_iface_check_grh(uct_ud_iface_t *iface, void *packet, int is_grh_present) +{ + struct ibv_grh *grh = (struct ibv_grh *)packet; + size_t gid_len; + union ibv_gid *gid; + khiter_t khiter; + char gid_str[128] UCS_V_UNUSED; + + if (!iface->config.check_grh_dgid) { + return 1; + } + + if (ucs_unlikely(!is_grh_present)) { + ucs_warn("RoCE packet does not contain GRH"); + return 1; + } + + gid_len = uct_ud_grh_get_dgid_len(grh); + if (ucs_likely((gid_len == iface->gid_table.last_len) && + uct_ud_gid_equal(&grh->dgid, &iface->gid_table.last, + gid_len))) { + return 1; + } + + gid = uct_ud_grh_get_dgid(grh, gid_len); + khiter = kh_get(uct_ud_iface_gid, &iface->gid_table.hash, *gid); + if (ucs_likely(khiter != kh_end(&iface->gid_table.hash))) { + iface->gid_table.last = *gid; + iface->gid_table.last_len = gid_len; + return 1; + } + + UCS_STATS_UPDATE_COUNTER(iface->stats, UCT_UD_IFACE_STAT_RX_DROP, 1); + ucs_trace_data("iface %p: drop packet with wrong dgid %s", iface, + uct_ib_gid_str(gid, gid_str, sizeof(gid_str))); + return 0; +} + + /* get time of the last async wakeup */ static UCS_F_ALWAYS_INLINE ucs_time_t uct_ud_iface_get_async_time(uct_ud_iface_t *iface) @@ -377,9 +500,34 @@ uct_ud_iface_get_async_time(uct_ud_iface_t *iface) return iface->super.super.worker->async->last_wakeup; } + +static UCS_F_ALWAYS_INLINE ucs_time_t +uct_ud_iface_get_time(uct_ud_iface_t *iface) +{ + return ucs_get_time(); +} + + +static UCS_F_ALWAYS_INLINE void +uct_ud_iface_twheel_sweep(uct_ud_iface_t *iface) +{ + if (iface->tx.timer_sweep_count++ % UCT_UD_SKIP_SWEEP) { + return; + } + + if (ucs_twheel_is_empty(&iface->tx.timer)) { + return; + } + + ucs_twheel_sweep(&iface->tx.timer, uct_ud_iface_get_time(iface)); +} + + static UCS_F_ALWAYS_INLINE void uct_ud_iface_progress_pending(uct_ud_iface_t *iface, const uintptr_t is_async) { + uct_ud_iface_twheel_sweep(iface); + if (!is_async) { 
iface->tx.async_before_pending = 0; } @@ -392,12 +540,14 @@ uct_ud_iface_progress_pending(uct_ud_iface_t *iface, const uintptr_t is_async) (void *)is_async); } + static UCS_F_ALWAYS_INLINE int uct_ud_iface_has_pending_async_ev(uct_ud_iface_t *iface) { return iface->tx.async_before_pending; } + static UCS_F_ALWAYS_INLINE void uct_ud_iface_raise_pending_async_ev(uct_ud_iface_t *iface) { @@ -406,6 +556,25 @@ uct_ud_iface_raise_pending_async_ev(uct_ud_iface_t *iface) } } + +static UCS_F_ALWAYS_INLINE uint16_t +uct_ud_iface_send_ctl(uct_ud_iface_t *iface, uct_ud_ep_t *ep, uct_ud_send_skb_t *skb, + const uct_ud_iov_t *iov, uint16_t iovcnt, int flags, + int max_log_sge) +{ + uct_ud_iface_ops_t *ud_ops = ucs_derived_of(iface->super.ops, + uct_ud_iface_ops_t); + return ud_ops->send_ctl(ep, skb, iov, iovcnt, flags, max_log_sge); +} + + +static UCS_F_ALWAYS_INLINE void +uct_ud_iface_add_ctl_desc(uct_ud_iface_t *iface, uct_ud_ctl_desc_t *cdesc) +{ + ucs_queue_push(&iface->tx.outstanding_q, &cdesc->queue); +} + + /* Go over all active eps and remove them. 
Do it this way because class destructors are not * virtual */ @@ -418,14 +587,6 @@ uct_ud_iface_raise_pending_async_ev(uct_ud_iface_t *iface) } \ } -ucs_status_t uct_ud_iface_dispatch_pending_rx_do(uct_ud_iface_t *iface); - -void uct_ud_iface_handle_failure(uct_ib_iface_t *iface, void *arg, - ucs_status_t status); - -ucs_status_t uct_ud_iface_event_arm(uct_iface_h tl_iface, unsigned events); - -void uct_ud_iface_progress_enable(uct_iface_h tl_iface, unsigned flags); static UCS_F_ALWAYS_INLINE ucs_status_t uct_ud_iface_dispatch_pending_rx(uct_ud_iface_t *iface) @@ -436,10 +597,9 @@ uct_ud_iface_dispatch_pending_rx(uct_ud_iface_t *iface) return uct_ud_iface_dispatch_pending_rx_do(iface); } -void uct_ud_iface_dispatch_async_comps_do(uct_ud_iface_t *iface); static UCS_F_ALWAYS_INLINE void -uct_ud_iface_dispatch_zcopy_comps(uct_ud_iface_t *iface) +uct_ud_iface_dispatch_async_comps(uct_ud_iface_t *iface) { if (ucs_likely(ucs_queue_is_empty(&iface->tx.async_comp_q))) { return; diff --git a/src/uct/ib/ud/base/ud_iface_common.c b/src/uct/ib/ud/base/ud_iface_common.c index 598ec1e45ca..940dbf9113d 100644 --- a/src/uct/ib/ud/base/ud_iface_common.c +++ b/src/uct/ib/ud/base/ud_iface_common.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ud_iface_common.h" #include diff --git a/src/uct/ib/ud/base/ud_inl.h b/src/uct/ib/ud/base/ud_inl.h index 05971d303b2..6be3ec9d5cf 100644 --- a/src/uct/ib/ud/base/ud_inl.h +++ b/src/uct/ib/ud/base/ud_inl.h @@ -1,5 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -61,16 +62,62 @@ uct_ud_send_skb_t *uct_ud_iface_get_tx_skb(uct_ud_iface_t *iface, } iface->tx.skb = skb; } - VALGRIND_MAKE_MEM_DEFINED(skb, sizeof *skb); + VALGRIND_MAKE_MEM_DEFINED(&skb->lkey, sizeof(skb->lkey)); + skb->flags = 0; ucs_prefetch(skb->neth); return skb; } +static UCS_F_ALWAYS_INLINE void +uct_ud_skb_release(uct_ud_send_skb_t *skb, int is_inline, int dummy_ack, uct_ud_ep_t *ep) +{ + ucs_assert(!(skb->flags & UCT_UD_SEND_SKB_FLAG_INVALID)); + skb->flags = UCT_UD_SEND_SKB_FLAG_INVALID; + +#if HAVE_HNS_ROCE + if (dummy_ack) { + ucs_queue_push(&ep->pending_skb, &skb->queue); + } else +#endif + if (is_inline) { + ucs_mpool_put_inline(skb); + } else { + ucs_mpool_put(skb); + } +} + +#if UCS_ENABLE_ASSERT +static UCS_F_ALWAYS_INLINE int uct_ud_ep_has_pending(uct_ud_ep_t *ep) +{ + return !ucs_arbiter_group_is_empty(&ep->tx.pending.group) && + !ucs_arbiter_elem_is_only(&ep->tx.pending.elem); +} +#endif + +static UCS_F_ALWAYS_INLINE void uct_ud_ep_set_has_pending_flag(uct_ud_ep_t *ep) +{ + ep->flags |= UCT_UD_EP_FLAG_HAS_PENDING; +} + +static UCS_F_ALWAYS_INLINE void uct_ud_ep_remove_has_pending_flag(uct_ud_ep_t *ep) +{ + ucs_assert(ep->flags & UCT_UD_EP_FLAG_HAS_PENDING); + ep->flags &= ~UCT_UD_EP_FLAG_HAS_PENDING; +} + +static UCS_F_ALWAYS_INLINE void uct_ud_ep_set_dest_ep_id(uct_ud_ep_t *ep, + uint32_t dest_id) +{ + ucs_assert(dest_id != UCT_UD_EP_NULL_ID); + ep->dest_ep_id = dest_id; + ep->flags |= UCT_UD_EP_FLAG_CONNECTED; +} + /* same as above but also check ep resources: window&connection state */ static UCS_F_ALWAYS_INLINE uct_ud_send_skb_t * uct_ud_ep_get_tx_skb(uct_ud_iface_t *iface, uct_ud_ep_t *ep) { - if (ucs_unlikely(!uct_ud_ep_is_connected(ep) || + if (ucs_unlikely(!uct_ud_ep_is_connected_and_no_pending(ep) || uct_ud_ep_no_window(ep) || uct_ud_iface_has_pending_async_ev(iface))) { ucs_trace_poll("iface=%p ep=%p (%d->%d) no ep resources (psn=%u max_psn=%u)", @@ -85,21 +132,28 @@ uct_ud_ep_get_tx_skb(uct_ud_iface_t *iface, uct_ud_ep_t *ep) } 
static UCS_F_ALWAYS_INLINE void -uct_ud_am_set_zcopy_desc(uct_ud_send_skb_t *skb, const uct_iov_t *iov, size_t iovcnt, - uct_completion_t *comp) +uct_ud_skb_set_zcopy_desc(uct_ud_send_skb_t *skb, const uct_iov_t *iov, + size_t iovcnt, uct_completion_t *comp) { uct_ud_zcopy_desc_t *zdesc; size_t iov_it_length; + uct_ud_iov_t *ud_iov; size_t iov_it; skb->flags |= UCT_UD_SEND_SKB_FLAG_ZCOPY; zdesc = uct_ud_zcopy_desc(skb); - zdesc->iovcnt = iovcnt; + zdesc->iovcnt = 0; for (iov_it = 0; iov_it < iovcnt; ++iov_it) { iov_it_length = uct_iov_get_length(iov + iov_it); + if (iov_it_length == 0) { + continue; + } + ucs_assert(iov_it_length <= UINT16_MAX); - zdesc->iov[iov_it].buffer = iov[iov_it].buffer; - zdesc->iov[iov_it].length = iov_it_length; + ud_iov = &zdesc->iov[zdesc->iovcnt++]; + ud_iov->buffer = iov[iov_it].buffer; + ud_iov->lkey = uct_ib_memh_get_lkey(iov[iov_it].memh); + ud_iov->length = iov_it_length; } if (comp != NULL) { skb->flags |= UCT_UD_SEND_SKB_FLAG_COMP; @@ -108,51 +162,51 @@ uct_ud_am_set_zcopy_desc(uct_ud_send_skb_t *skb, const uct_iov_t *iov, size_t io } static UCS_F_ALWAYS_INLINE void -uct_ud_iface_complete_tx_inl(uct_ud_iface_t *iface, uct_ud_ep_t *ep, - uct_ud_send_skb_t *skb, void *data, - const void *buffer, unsigned length) +uct_ud_iface_complete_tx(uct_ud_iface_t *iface, uct_ud_ep_t *ep, + uct_ud_send_skb_t *skb, int has_data, void *data, + const void *buffer, unsigned length) { - iface->tx.skb = ucs_mpool_get(&iface->tx.mp); + ucs_time_t now = uct_ud_iface_get_time(iface); + iface->tx.skb = ucs_mpool_get(&iface->tx.mp); ep->tx.psn++; - skb->len += length; - memcpy(data, buffer, length); + + if (has_data) { + skb->len += length; + memcpy(data, buffer, length); + } + ucs_queue_push(&ep->tx.window, &skb->queue); - ep->tx.slow_tick = iface->async.slow_tick; - ucs_wtimer_add(&iface->async.slow_timer, &ep->slow_timer, - uct_ud_iface_get_async_time(iface) - - ucs_twheel_get_time(&iface->async.slow_timer) + - ep->tx.slow_tick); - ep->tx.send_time = 
uct_ud_iface_get_async_time(iface); + ep->tx.tick = iface->tx.tick; + + if (!iface->async.disable) { + ucs_wtimer_add(&iface->tx.timer, &ep->timer, + now - ucs_twheel_get_time(&iface->tx.timer) + ep->tx.tick); + } + + ep->tx.send_time = now; } static UCS_F_ALWAYS_INLINE void -uct_ud_iface_complete_tx_skb(uct_ud_iface_t *iface, uct_ud_ep_t *ep, - uct_ud_send_skb_t *skb) +uct_ud_iface_complete_tx_inl(uct_ud_iface_t *iface, uct_ud_ep_t *ep, + uct_ud_send_skb_t *skb, void *data, + const void *buffer, unsigned length) { - iface->tx.skb = ucs_mpool_get(&iface->tx.mp); - ep->tx.psn++; - ucs_queue_push(&ep->tx.window, &skb->queue); - ep->tx.slow_tick = iface->async.slow_tick; - ucs_wtimer_add(&iface->async.slow_timer, &ep->slow_timer, - uct_ud_iface_get_async_time(iface) - - ucs_twheel_get_time(&iface->async.slow_timer) + - ep->tx.slow_tick); - ep->tx.send_time = uct_ud_iface_get_async_time(iface); + uct_ud_iface_complete_tx(iface, ep, skb, 1, data, buffer, length); } static UCS_F_ALWAYS_INLINE void -uct_ud_am_set_neth(uct_ud_neth_t *neth, uct_ud_ep_t *ep, uint8_t id) +uct_ud_iface_complete_tx_skb(uct_ud_iface_t *iface, uct_ud_ep_t *ep, + uct_ud_send_skb_t *skb) { - uct_ud_neth_init_data(ep, neth); - uct_ud_neth_set_type_am(ep, neth, id); - uct_ud_neth_ack_req(ep, neth); + uct_ud_iface_complete_tx(iface, ep, skb, 0, NULL, NULL, 0); } static UCS_F_ALWAYS_INLINE ucs_status_t -uct_ud_am_common(uct_ud_iface_t *iface, uct_ud_ep_t *ep, uint8_t id, - uct_ud_send_skb_t **skb_p) +uct_ud_am_skb_common(uct_ud_iface_t *iface, uct_ud_ep_t *ep, uint8_t id, + uct_ud_send_skb_t **skb_p) { uct_ud_send_skb_t *skb; + uct_ud_neth_t *neth; UCT_CHECK_AM_ID(id); @@ -166,13 +220,15 @@ uct_ud_am_common(uct_ud_iface_t *iface, uct_ud_ep_t *ep, uint8_t id, * (we don't care about reordering with respect to control messages) */ ucs_assertv((ep->flags & UCT_UD_EP_FLAG_IN_PENDING) || - ucs_arbiter_group_is_empty(&ep->tx.pending.group) || - ucs_arbiter_elem_is_only(&ep->tx.pending.group, 
&ep->tx.pending.elem), - "out-of-order send detected for ep %p am %d ep_pending %d arbtail %p arbelem %p", + !uct_ud_ep_has_pending(ep), + "out-of-order send detected for ep %p am %d ep_pending %d arbelem %p", ep, id, (ep->flags & UCT_UD_EP_FLAG_IN_PENDING), - ep->tx.pending.group.tail, &ep->tx.pending.elem); - uct_ud_am_set_neth(skb->neth, ep, id); + + neth = skb->neth; + uct_ud_neth_init_data(ep, neth); + uct_ud_neth_set_type_am(ep, neth, id); + uct_ud_neth_ack_req(ep, neth); *skb_p = skb; return UCS_OK; @@ -187,3 +243,25 @@ uct_ud_skb_bcopy(uct_ud_send_skb_t *skb, uct_pack_callback_t pack_cb, void *arg) skb->len = sizeof(skb->neth[0]) + payload_len; return payload_len; } + +static UCS_F_ALWAYS_INLINE void +uct_ud_iface_dispatch_comp(uct_ud_iface_t *iface, uct_completion_t *comp, + ucs_status_t status) +{ + /* Avoid reordering with pending queue - if we have any pending requests, + * prevent send operations from the completion callback + */ + uct_ud_iface_raise_pending_async_ev(iface); + uct_invoke_completion(comp, status); +} + +static UCS_F_ALWAYS_INLINE void +uct_ud_iface_add_async_comp(uct_ud_iface_t *iface, uct_ud_send_skb_t *skb, + ucs_status_t status) +{ + uct_ud_comp_desc_t *cdesc = uct_ud_comp_desc(skb); + + cdesc->status = status; + ucs_queue_push(&iface->tx.async_comp_q, &skb->queue); +} + diff --git a/src/uct/ib/ud/base/ud_log.c b/src/uct/ib/ud/base/ud_log.c index 56e26368f28..33f15a0ec1f 100644 --- a/src/uct/ib/ud/base/ud_log.c +++ b/src/uct/ib/ud/base/ud_log.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ud_iface.h" #include "ud_ep.h" @@ -38,17 +42,19 @@ void uct_ud_dump_packet(uct_base_iface_t *iface, uct_am_trace_type_t type, snprintf(p, endp - p, " NAK"); } else if (neth->packet_type & UCT_UD_PACKET_FLAG_PUT) { puth = (uct_ud_put_hdr_t *)(neth + 1); - snprintf(p, endp - p, " PUT: 0x%0lx", puth->rva); + snprintf(p, endp - p, " PUT: 0x%0lx len %zu", puth->rva, + length - sizeof(*puth) - sizeof(*neth)); } else if (neth->packet_type & UCT_UD_PACKET_FLAG_CTL) { ctlh = (uct_ud_ctl_hdr_t *)(neth + 1); switch (ctlh->type) { case UCT_UD_PACKET_CREQ: - snprintf(p, endp - p, " CREQ from %s:%d qpn 0x%x %s epid %d cid %d", + snprintf(p, endp - p, + " CREQ from %s:%d qpn 0x%x %s epid %d cid %d path %d", ctlh->peer.name, ctlh->peer.pid, uct_ib_unpack_uint24(ctlh->conn_req.ep_addr.iface_addr.qp_num), uct_ib_address_str(uct_ud_creq_ib_addr(ctlh), buf, sizeof(buf)), uct_ib_unpack_uint24(ctlh->conn_req.ep_addr.ep_id), - ctlh->conn_req.conn_id); + ctlh->conn_req.conn_id, ctlh->conn_req.path_index); break; case UCT_UD_PACKET_CREP: snprintf(p, endp - p, " CREP from %s:%d src_ep_id %d", diff --git a/src/uct/ib/ud/verbs/ud_verbs.c b/src/uct/ib/ud/verbs/ud_verbs.c index f3abda85911..427f259fe9e 100644 --- a/src/uct/ib/ud/verbs/ud_verbs.c +++ b/src/uct/ib/ud/verbs/ud_verbs.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include #include @@ -45,7 +49,7 @@ UCS_CLASS_INIT_FUNC(uct_ud_verbs_ep_t, const uct_ep_params_t *params) uct_ud_verbs_iface_t); ucs_trace_func(""); - UCS_CLASS_CALL_SUPER_INIT(uct_ud_ep_t, &iface->super); + UCS_CLASS_CALL_SUPER_INIT(uct_ud_ep_t, &iface->super, params); self->ah = NULL; return UCS_OK; } @@ -61,73 +65,97 @@ static UCS_CLASS_DEFINE_NEW_FUNC(uct_ud_verbs_ep_t, uct_ep_t, UCS_CLASS_DEFINE_DELETE_FUNC(uct_ud_verbs_ep_t, uct_ep_t); static inline void -uct_ud_verbs_iface_fill_tx_wr(uct_ud_verbs_iface_t *iface, - uct_ud_verbs_ep_t *ep, - struct ibv_send_wr *wr, unsigned flags) +uct_ud_verbs_post_send(uct_ud_verbs_iface_t *iface, uct_ud_verbs_ep_t *ep, + struct ibv_send_wr *wr, unsigned send_flags, + unsigned max_log_sge) { - if (iface->super.tx.unsignaled >= UCT_UD_TX_MODERATION) { - wr->send_flags = (flags|IBV_SEND_SIGNALED); + struct ibv_send_wr *bad_wr; + int UCS_V_UNUSED ret; + + if ((send_flags & IBV_SEND_SIGNALED) || + (iface->super.tx.unsignaled >= (UCT_UD_TX_MODERATION - 1))) { + wr->send_flags = send_flags | IBV_SEND_SIGNALED; + wr->wr_id = iface->super.tx.unsignaled; iface->super.tx.unsignaled = 0; } else { - wr->send_flags = flags; + wr->send_flags = send_flags; +#if UCS_ENABLE_ASSERT + wr->wr_id = UINT64_MAX; +#endif ++iface->super.tx.unsignaled; } + wr->wr.ud.remote_qpn = ep->dest_qpn; wr->wr.ud.ah = ep->ah; + + UCT_UD_EP_HOOK_CALL_TX(&ep->super, (uct_ud_neth_t*)iface->tx.sge[0].addr); + ret = ibv_post_send(iface->super.qp, wr, &bad_wr); + ucs_assertv(ret == 0, "ibv_post_send() returned %d (%m)", ret); + + uct_ib_log_post_send(&iface->super.super, iface->super.qp, wr, max_log_sge, + uct_ud_dump_packet); + --iface->super.tx.available; + ++iface->tx.send_sn; } static inline void uct_ud_verbs_ep_tx_inlv(uct_ud_verbs_iface_t *iface, uct_ud_verbs_ep_t *ep, const void *buffer, unsigned length) { - int UCS_V_UNUSED ret; - struct ibv_send_wr *bad_wr; - iface->tx.sge[1].addr = 
(uintptr_t)buffer; iface->tx.sge[1].length = length; - uct_ud_verbs_iface_fill_tx_wr(iface, ep, &iface->tx.wr_inl, IBV_SEND_INLINE); - UCT_UD_EP_HOOK_CALL_TX(&ep->super, (uct_ud_neth_t *)iface->tx.sge[0].addr); - ret = ibv_post_send(iface->super.qp, &iface->tx.wr_inl, &bad_wr); - ucs_assertv(ret == 0, "ibv_post_send() returned %d (%m)", ret); - uct_ib_log_post_send(&iface->super.super, iface->super.qp, &iface->tx.wr_inl, - INT_MAX, uct_ud_dump_packet); - --iface->super.tx.available; + ucs_assert(iface->tx.wr_inl.num_sge == 2); + uct_ud_verbs_post_send(iface, ep, &iface->tx.wr_inl, IBV_SEND_INLINE, 2); } static inline void -uct_ud_verbs_ep_tx_skb(uct_ud_verbs_iface_t *iface, - uct_ud_verbs_ep_t *ep, uct_ud_send_skb_t *skb, unsigned flags) +uct_ud_verbs_ep_tx_skb(uct_ud_verbs_iface_t *iface, uct_ud_verbs_ep_t *ep, + uct_ud_send_skb_t *skb, unsigned send_flags, + unsigned max_log_sge) { - int UCS_V_UNUSED ret; - struct ibv_send_wr *bad_wr; - iface->tx.sge[0].lkey = skb->lkey; iface->tx.sge[0].length = skb->len; iface->tx.sge[0].addr = (uintptr_t)skb->neth; - uct_ud_verbs_iface_fill_tx_wr(iface, ep, &iface->tx.wr_skb, flags); - UCT_UD_EP_HOOK_CALL_TX(&ep->super, (uct_ud_neth_t *)iface->tx.sge[0].addr); - ret = ibv_post_send(iface->super.qp, &iface->tx.wr_skb, &bad_wr); - ucs_assertv(ret == 0, "ibv_post_send() returned %d (%m)", ret); - uct_ib_log_post_send(&iface->super.super, iface->super.qp, &iface->tx.wr_skb, - INT_MAX, uct_ud_dump_packet); - --iface->super.tx.available; + uct_ud_verbs_post_send(iface, ep, &iface->tx.wr_skb, send_flags, max_log_sge); } -static void uct_ud_verbs_ep_tx_ctl_skb(uct_ud_ep_t *ud_ep, uct_ud_send_skb_t *skb, - int solicited) +static uint16_t +uct_ud_verbs_ep_send_ctl(uct_ud_ep_t *ud_ep, uct_ud_send_skb_t *skb, + const uct_ud_iov_t *iov, uint16_t iovcnt, int flags, + int max_log_sge) { uct_ud_verbs_iface_t *iface = ucs_derived_of(ud_ep->super.super.iface, uct_ud_verbs_iface_t); uct_ud_verbs_ep_t *ep = ucs_derived_of(ud_ep, 
uct_ud_verbs_ep_t); - unsigned flags = 0; + unsigned send_flags; + uint16_t iov_index; - if (skb->len < iface->super.config.max_inline) { - flags = IBV_SEND_INLINE; + /* set send flags */ + send_flags = 0; + if ((skb->len <= iface->super.config.max_inline) && (iovcnt == 0)) { + send_flags |= IBV_SEND_INLINE; + } else { + ucs_assert(!(flags & UCT_UD_IFACE_SEND_CTL_FLAG_INLINE)); + } + if (flags & UCT_UD_IFACE_SEND_CTL_FLAG_SOLICITED) { + send_flags |= IBV_SEND_SOLICITED; } - if (solicited) { - flags |= IBV_SEND_SOLICITED; + if (flags & UCT_UD_IFACE_SEND_CTL_FLAG_SIGNALED) { + send_flags |= IBV_SEND_SIGNALED; + } + + /* copy iov array */ + for (iov_index = 0; iov_index < iovcnt; ++iov_index) { + iface->tx.sge[iov_index + 1].addr = (uintptr_t)iov[iov_index].buffer; + iface->tx.sge[iov_index + 1].length = iov[iov_index].length; + iface->tx.sge[iov_index + 1].lkey = iov[iov_index].lkey; } - uct_ud_verbs_ep_tx_skb(iface, ep, skb, flags); + iface->tx.wr_skb.num_sge = iovcnt + 1; + + uct_ud_verbs_ep_tx_skb(iface, ep, skb, send_flags, max_log_sge); + iface->tx.wr_skb.num_sge = 1; + + return iface->tx.send_sn; } static @@ -146,7 +174,7 @@ ucs_status_t uct_ud_verbs_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, uct_ud_enter(&iface->super); - status = uct_ud_am_common(&iface->super, &ep->super, id, &skb); + status = uct_ud_am_skb_common(&iface->super, &ep->super, id, &skb); if (status != UCS_OK) { uct_ud_leave(&iface->super); return status; @@ -181,7 +209,7 @@ static ssize_t uct_ud_verbs_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_ud_enter(&iface->super); - status = uct_ud_am_common(&iface->super, &ep->super, id, &skb); + status = uct_ud_am_skb_common(&iface->super, &ep->super, id, &skb); if (status != UCS_OK) { uct_ud_leave(&iface->super); return status; @@ -190,7 +218,8 @@ static ssize_t uct_ud_verbs_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, length = uct_ud_skb_bcopy(skb, pack_cb, arg); UCT_UD_CHECK_BCOPY_LENGTH(&iface->super, length); - uct_ud_verbs_ep_tx_skb(iface, 
ep, skb, 0); + ucs_assert(iface->tx.wr_skb.num_sge == 1); + uct_ud_verbs_ep_tx_skb(iface, ep, skb, 0, INT_MAX); uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb); UCT_TL_EP_STAT_OP(&ep->super.super, AM, BCOPY, length); uct_ud_leave(&iface->super); @@ -208,7 +237,7 @@ uct_ud_verbs_ep_am_zcopy(uct_ep_h tl_ep, uint8_t id, const void *header, uct_ud_send_skb_t *skb; ucs_status_t status; - UCT_CHECK_IOV_SIZE(iovcnt, uct_ib_iface_get_max_iov(&iface->super.super) - 1, + UCT_CHECK_IOV_SIZE(iovcnt, (size_t)iface->config.max_send_sge, "uct_ud_verbs_ep_am_zcopy"); UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + sizeof(uct_ud_zcopy_desc_t) + header_length, @@ -219,7 +248,7 @@ uct_ud_verbs_ep_am_zcopy(uct_ep_h tl_ep, uint8_t id, const void *header, uct_ud_enter(&iface->super); - status = uct_ud_am_common(&iface->super, &ep->super, id, &skb); + status = uct_ud_am_skb_common(&iface->super, &ep->super, id, &skb); if (status != UCS_OK) { uct_ud_leave(&iface->super); return status; @@ -231,11 +260,11 @@ uct_ud_verbs_ep_am_zcopy(uct_ep_h tl_ep, uint8_t id, const void *header, iface->tx.wr_skb.num_sge = uct_ib_verbs_sge_fill_iov(iface->tx.sge + 1, iov, iovcnt) + 1; - - uct_ud_verbs_ep_tx_skb(iface, ep, skb, 0); + uct_ud_verbs_ep_tx_skb(iface, ep, skb, 0, + UCT_IB_MAX_ZCOPY_LOG_SGE(&iface->super.super)); iface->tx.wr_skb.num_sge = 1; - uct_ud_am_set_zcopy_desc(skb, iov, iovcnt, comp); + uct_ud_skb_set_zcopy_desc(skb, iov, iovcnt, comp); uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb); UCT_TL_EP_STAT_OP(&ep->super.super, AM, ZCOPY, header_length + uct_iov_total_length(iov, iovcnt)); @@ -255,7 +284,9 @@ ucs_status_t uct_ud_verbs_ep_put_short(uct_ep_h tl_ep, uct_ud_put_hdr_t *put_hdr; uct_ud_neth_t *neth; - /* TODO: UCT_CHECK_LENGTH(length <= iface->config.max_inline, "put_short"); */ + UCT_CHECK_LENGTH(sizeof(*neth) + sizeof(*put_hdr) + length, + 0, iface->super.config.max_inline, "put_short"); + uct_ud_enter(&iface->super); skb = uct_ud_ep_get_tx_skb(&iface->super, 
&ep->super); @@ -286,8 +317,9 @@ ucs_status_t uct_ud_verbs_ep_put_short(uct_ep_h tl_ep, static UCS_F_ALWAYS_INLINE unsigned -uct_ud_verbs_iface_poll_tx(uct_ud_verbs_iface_t *iface) +uct_ud_verbs_iface_poll_tx(uct_ud_verbs_iface_t *iface, int is_async) { + unsigned num_completed; struct ibv_wc wc; int ret; @@ -307,7 +339,14 @@ uct_ud_verbs_iface_poll_tx(uct_ud_verbs_iface_t *iface) return 0; } - iface->super.tx.available += UCT_UD_TX_MODERATION + 1; + num_completed = wc.wr_id + 1; + ucs_assertv(num_completed <= UCT_UD_TX_MODERATION, "num_compeleted=%u", + num_completed); + + iface->super.tx.available += num_completed; + iface->tx.comp_sn += num_completed; + + uct_ud_iface_send_completion(&iface->super, iface->tx.comp_sn, is_async); return 1; } @@ -327,7 +366,7 @@ uct_ud_verbs_iface_poll_rx(uct_ud_verbs_iface_t *iface, int is_async) } UCT_IB_IFACE_VERBS_FOREACH_RXWQE(&iface->super.super, i, packet, wc, num_wcs) { - if (!uct_ud_iface_check_grh(&iface->super, packet + UCT_IB_GRH_LEN, + if (!uct_ud_iface_check_grh(&iface->super, packet, wc[i].wc_flags & IBV_WC_GRH)) { ucs_mpool_put_inline((void*)wc[i].wr_id); continue; @@ -335,7 +374,7 @@ uct_ud_verbs_iface_poll_rx(uct_ud_verbs_iface_t *iface, int is_async) uct_ib_log_recv_completion(&iface->super.super, &wc[i], packet, wc[i].byte_len, uct_ud_dump_packet); uct_ud_ep_process_rx(&iface->super, - (uct_ud_neth_t *)(packet + UCT_IB_GRH_LEN), + (uct_ud_neth_t *)UCS_PTR_BYTE_OFFSET(packet, UCT_IB_GRH_LEN), wc[i].byte_len - UCT_IB_GRH_LEN, (uct_ud_recv_skb_t *)wc[i].wr_id, is_async); @@ -363,9 +402,9 @@ static unsigned uct_ud_verbs_iface_async_progress(uct_ud_iface_t *ud_iface) do { n = uct_ud_verbs_iface_poll_rx(iface, 1); count += n; - } while (n > 0); + } while ((n > 0) && (count < iface->super.rx.async_max_poll)); - count += uct_ud_verbs_iface_poll_tx(iface); + count += uct_ud_verbs_iface_poll_tx(iface, 1); uct_ud_iface_progress_pending(&iface->super, 1); return count; @@ -378,12 +417,12 @@ static unsigned 
uct_ud_verbs_iface_progress(uct_iface_h tl_iface) unsigned count; uct_ud_enter(&iface->super); - uct_ud_iface_dispatch_zcopy_comps(&iface->super); + uct_ud_iface_dispatch_async_comps(&iface->super); status = uct_ud_iface_dispatch_pending_rx(&iface->super); if (status == UCS_OK) { count = uct_ud_verbs_iface_poll_rx(iface, 0); if (count == 0) { - count = uct_ud_verbs_iface_poll_tx(iface); + count = uct_ud_verbs_iface_poll_tx(iface, 0); } } else { count = 0; @@ -398,26 +437,30 @@ static unsigned uct_ud_verbs_iface_progress(uct_iface_h tl_iface) static ucs_status_t uct_ud_verbs_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { - uct_ud_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_iface_t); + uct_ud_verbs_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_verbs_iface_t); + size_t am_max_hdr; ucs_status_t status; ucs_trace_func(""); - status = uct_ud_iface_query(iface, iface_attr); + + am_max_hdr = uct_ib_iface_hdr_size(iface->super.super.config.seg_size, + sizeof(uct_ud_neth_t) + + sizeof(uct_ud_zcopy_desc_t)); + status = uct_ud_iface_query(&iface->super, iface_attr, + iface->config.max_send_sge, am_max_hdr); if (status != UCS_OK) { return status; } - iface_attr->overhead = 105e-9; /* Software overhead */ - iface_attr->cap.am.max_hdr = uct_ib_iface_hdr_size(iface->super.config.seg_size, - sizeof(uct_ud_neth_t) + - sizeof(uct_ud_zcopy_desc_t)); + iface_attr->overhead = 105e-9; /* Software overhead */ return UCS_OK; } static ucs_status_t uct_ud_verbs_ep_create_connected(uct_iface_h iface_h, const uct_device_addr_t *dev_addr, - const uct_iface_addr_t *iface_addr, uct_ep_h *new_ep_p) + const uct_iface_addr_t *iface_addr, + unsigned path_index, uct_ep_h *new_ep_p) { uct_ud_verbs_iface_t *iface = ucs_derived_of(iface_h, uct_ud_verbs_iface_t); uct_ib_iface_t *ib_iface = &iface->super.super; @@ -428,10 +471,11 @@ uct_ud_verbs_ep_create_connected(uct_iface_h iface_h, const uct_device_addr_t *d uct_ud_send_skb_t *skb; ucs_status_t status, status_ah; struct 
ibv_ah_attr ah_attr; + enum ibv_mtu path_mtu; uct_ud_enter(&iface->super); status = uct_ud_ep_create_connected_common(&iface->super, ib_addr, if_addr, - &new_ud_ep, &skb); + path_index, &new_ud_ep, &skb); if (status != UCS_OK && status != UCS_ERR_NO_RESOURCE && status != UCS_ERR_ALREADY_EXISTS) { @@ -440,6 +484,7 @@ uct_ud_verbs_ep_create_connected(uct_iface_h iface_h, const uct_device_addr_t *d } ep = ucs_derived_of(new_ud_ep, uct_ud_verbs_ep_t); + /* cppcheck-suppress autoVariables */ *new_ep_p = &ep->super.super.super; if (status == UCS_ERR_ALREADY_EXISTS) { uct_ud_leave(&iface->super); @@ -448,7 +493,8 @@ uct_ud_verbs_ep_create_connected(uct_iface_h iface_h, const uct_device_addr_t *d ucs_assert_always(ep->ah == NULL); - uct_ib_iface_fill_ah_attr_from_addr(ib_iface, ib_addr, ep->super.path_bits, &ah_attr); + uct_ib_iface_fill_ah_attr_from_addr(ib_iface, ib_addr, ep->super.path_index, + &ah_attr, &path_mtu); status_ah = uct_ib_iface_create_ah(ib_iface, &ah_attr, &ep->ah); if (status_ah != UCS_OK) { uct_ud_ep_destroy_connected(&ep->super, ib_addr, if_addr); @@ -460,7 +506,8 @@ uct_ud_verbs_ep_create_connected(uct_iface_h iface_h, const uct_device_addr_t *d ep->dest_qpn = uct_ib_unpack_uint24(if_addr->qp_num); if (status == UCS_OK) { - uct_ud_verbs_ep_tx_skb(iface, ep, skb, IBV_SEND_INLINE|IBV_SEND_SOLICITED); + uct_ud_verbs_ep_send_ctl(&ep->super, skb, NULL, 0, + UCT_UD_IFACE_SEND_CTL_FLAG_SOLICITED, 1); uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb); ep->super.flags |= UCT_UD_EP_FLAG_CREQ_SENT; } @@ -480,6 +527,7 @@ uct_ud_verbs_ep_connect_to_ep(uct_ep_h tl_ep, const uct_ud_ep_addr_t *ud_ep_addr = (const uct_ud_ep_addr_t *)ep_addr; ucs_status_t status; struct ibv_ah_attr ah_attr; + enum ibv_mtu path_mtu; status = uct_ud_ep_connect_to_ep(&ep->super, ib_addr, ud_ep_addr); if (status != UCS_OK) { @@ -488,7 +536,8 @@ uct_ud_verbs_ep_connect_to_ep(uct_ep_h tl_ep, ucs_assert_always(ep->ah == NULL); ep->dest_qpn = 
uct_ib_unpack_uint24(ud_ep_addr->iface_addr.qp_num); - uct_ib_iface_fill_ah_attr_from_addr(iface, ib_addr, ep->super.path_bits, &ah_attr); + uct_ib_iface_fill_ah_attr_from_addr(iface, ib_addr, ep->super.path_index, + &ah_attr, &path_mtu); return uct_ib_iface_create_ah(iface, &ah_attr, &ep->ah); } @@ -498,7 +547,9 @@ uct_ud_verbs_ep_create(const uct_ep_params_t *params, uct_ep_h *ep_p) if (ucs_test_all_flags(params->field_mask, UCT_EP_PARAM_FIELD_DEV_ADDR | UCT_EP_PARAM_FIELD_IFACE_ADDR)) { return uct_ud_verbs_ep_create_connected(params->iface, params->dev_addr, - params->iface_addr, ep_p); + params->iface_addr, + UCT_EP_PARAMS_GET_PATH_INDEX(params), + ep_p); } return uct_ud_verbs_ep_t_new(params, ep_p); @@ -524,9 +575,10 @@ static uct_ud_iface_ops_t uct_ud_verbs_iface_ops = { .iface_flush = uct_ud_iface_flush, .iface_fence = uct_base_iface_fence, .iface_progress_enable = uct_ud_iface_progress_enable, - .iface_progress_disable = uct_base_iface_progress_disable, + .iface_progress_disable = uct_ud_iface_progress_disable, .iface_progress = uct_ud_verbs_iface_progress, - .iface_event_fd_get = uct_ib_iface_event_fd_get, + .iface_event_fd_get = (uct_iface_event_fd_get_func_t) + ucs_empty_function_return_unsupported, .iface_event_arm = uct_ud_iface_event_arm, .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_ud_verbs_iface_t), .iface_query = uct_ud_verbs_iface_query, @@ -536,16 +588,14 @@ static uct_ud_iface_ops_t uct_ud_verbs_iface_ops = { }, .create_cq = uct_ib_verbs_create_cq, .arm_cq = uct_ib_iface_arm_cq, - .event_cq = (void*)ucs_empty_function, - .handle_failure = uct_ud_iface_handle_failure, + .event_cq = (uct_ib_iface_event_cq_func_t)ucs_empty_function, + .handle_failure = (uct_ib_iface_handle_failure_func_t)ucs_empty_function_do_assert, .set_ep_failed = uct_ud_verbs_ep_set_failed, - .create_qp = uct_ib_iface_create_qp, - .init_res_domain = (void*)ucs_empty_function_return_success, - .cleanup_res_domain = (void*)ucs_empty_function, }, .async_progress = 
uct_ud_verbs_iface_async_progress, - .tx_skb = uct_ud_verbs_ep_tx_ctl_skb, - .ep_free = UCS_CLASS_DELETE_FUNC_NAME(uct_ud_verbs_ep_t) + .send_ctl = uct_ud_verbs_ep_send_ctl, + .ep_free = UCS_CLASS_DELETE_FUNC_NAME(uct_ud_verbs_ep_t), + .create_qp = uct_ib_iface_create_qp, }; static UCS_F_NOINLINE void @@ -582,6 +632,26 @@ uct_ud_verbs_iface_post_recv(uct_ud_verbs_iface_t *iface) uct_ud_verbs_iface_post_recv_always(iface, batch); } +/* Used for am zcopy only */ +ucs_status_t uct_ud_verbs_qp_max_send_sge(uct_ud_verbs_iface_t *iface, + size_t *max_send_sge) +{ + uint32_t max_sge; + ucs_status_t status; + + status = uct_ib_qp_max_send_sge(iface->super.qp, &max_sge); + if (status != UCS_OK) { + return status; + } + + /* need to reserve 1 iov for am zcopy header */ + ucs_assert_always(max_sge > 1); + + *max_send_sge = ucs_min(max_sge - 1, UCT_IB_MAX_IOV); + + return UCS_OK; +} + static UCS_CLASS_INIT_FUNC(uct_ud_verbs_iface_t, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *tl_config) @@ -593,6 +663,9 @@ static UCS_CLASS_INIT_FUNC(uct_ud_verbs_iface_t, uct_md_h md, uct_worker_h worke ucs_trace_func(""); + init_attr.cq_len[UCT_IB_DIR_TX] = config->super.tx.queue_len; + init_attr.cq_len[UCT_IB_DIR_RX] = config->super.rx.queue_len; + UCS_CLASS_CALL_SUPER_INIT(uct_ud_iface_t, &uct_ud_verbs_iface_ops, md, worker, params, config, &init_attr); @@ -614,22 +687,25 @@ static UCS_CLASS_INIT_FUNC(uct_ud_verbs_iface_t, uct_md_h md, uct_worker_h worke self->tx.wr_skb.sg_list = self->tx.sge; self->tx.wr_skb.num_sge = 1; + self->tx.send_sn = 0; + self->tx.comp_sn = 0; + if (self->super.super.config.rx_max_batch < UCT_UD_RX_BATCH_MIN) { ucs_warn("rx max batch is too low (%d < %d), performance may be impacted", self->super.super.config.rx_max_batch, UCT_UD_RX_BATCH_MIN); } - while (self->super.rx.available >= self->super.super.config.rx_max_batch) { - uct_ud_verbs_iface_post_recv(self); - } - - status = 
uct_ud_iface_complete_init(&self->super); + status = uct_ud_verbs_qp_max_send_sge(self, &self->config.max_send_sge); if (status != UCS_OK) { return status; } - return UCS_OK; + while (self->super.rx.available >= self->super.super.config.rx_max_batch) { + uct_ud_verbs_iface_post_recv(self); + } + + return uct_ud_iface_complete_init(&self->super); } static UCS_CLASS_CLEANUP_FUNC(uct_ud_verbs_iface_t) @@ -638,7 +714,6 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ud_verbs_iface_t) uct_ud_iface_remove_async_handlers(&self->super); uct_ud_enter(&self->super); UCT_UD_IFACE_DELETE_EPS(&self->super, uct_ud_verbs_ep_t); - ucs_twheel_cleanup(&self->super.async.slow_timer); uct_ud_leave(&self->super); } @@ -650,21 +725,16 @@ static UCS_CLASS_DEFINE_NEW_FUNC(uct_ud_verbs_iface_t, uct_iface_t, uct_md_h, static UCS_CLASS_DEFINE_DELETE_FUNC(uct_ud_verbs_iface_t, uct_iface_t); -static -ucs_status_t uct_ud_verbs_query_resources(uct_md_h md, - uct_tl_resource_desc_t **resources_p, - unsigned *num_resources_p) +static ucs_status_t +uct_ud_verbs_query_tl_devices(uct_md_h md, + uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) { - return uct_ib_device_query_tl_resources(&ucs_derived_of(md, uct_ib_md_t)->dev, - "ud", 0, - resources_p, num_resources_p); + uct_ib_md_t *ib_md = ucs_derived_of(md, uct_ib_md_t); + return uct_ib_device_query_ports(&ib_md->dev, 0, tl_devices_p, + num_tl_devices_p); } -UCT_TL_COMPONENT_DEFINE(uct_ud_verbs_tl, - uct_ud_verbs_query_resources, - uct_ud_verbs_iface_t, - "ud", - "UD_VERBS_", - uct_ud_verbs_iface_config_table, - uct_ud_iface_config_t); -UCT_MD_REGISTER_TL(&uct_ib_mdc, &uct_ud_verbs_tl); +UCT_TL_DEFINE(&uct_ib_component, ud_verbs, uct_ud_verbs_query_tl_devices, + uct_ud_verbs_iface_t, "UD_VERBS_", + uct_ud_verbs_iface_config_table, uct_ud_iface_config_t); diff --git a/src/uct/ib/ud/verbs/ud_verbs.h b/src/uct/ib/ud/verbs/ud_verbs.h index 502def8db54..4768bacf2bf 100644 --- a/src/uct/ib/ud/verbs/ud_verbs.h +++ 
b/src/uct/ib/ud/verbs/ud_verbs.h @@ -20,15 +20,26 @@ typedef struct { struct ibv_ah *ah; } uct_ud_verbs_ep_t; + typedef struct { uct_ud_iface_t super; struct { struct ibv_sge sge[UCT_IB_MAX_IOV]; struct ibv_send_wr wr_inl; struct ibv_send_wr wr_skb; + uint16_t send_sn; + uint16_t comp_sn; } tx; + struct { + size_t max_send_sge; + } config; } uct_ud_verbs_iface_t; + UCS_CLASS_DECLARE(uct_ud_verbs_ep_t, const uct_ep_params_t *) + +ucs_status_t uct_ud_verbs_qp_max_send_sge(uct_ud_verbs_iface_t *iface, + size_t *max_send_sge); + #endif diff --git a/src/uct/rocm/Makefile.am b/src/uct/rocm/Makefile.am index bd6bf65c367..8e4e3f769be 100644 --- a/src/uct/rocm/Makefile.am +++ b/src/uct/rocm/Makefile.am @@ -12,7 +12,10 @@ libuct_rocm_la_CPPFLAGS = $(BASE_CPPFLAGS) $(ROCM_CPPFLAGS) libuct_rocm_la_CFLAGS = $(BASE_CFLAGS) libuct_rocm_la_LIBADD = $(top_builddir)/src/ucs/libucs.la \ $(top_builddir)/src/uct/libuct.la -libuct_rocm_la_LDFLAGS = $(ROCM_LDFLAGS) $(ROCM_LIBS) -version-info $(SOVERSION) +libuct_rocm_la_LDFLAGS = $(ROCM_LDFLAGS) $(ROCM_LIBS) -version-info $(SOVERSION) \ + $(patsubst %, -Xlinker %, -L$(ROCM_ROOT)/lib -rpath $(ROCM_ROOT)/hip/lib -rpath $(ROCM_ROOT)/lib) \ + $(patsubst %, -Xlinker %, --enable-new-dtags) \ + $(patsubst %, -Xlinker %, -rpath $(ROCM_ROOT)/lib64) noinst_HEADERS = \ base/rocm_base.h diff --git a/src/uct/rocm/base/rocm_base.c b/src/uct/rocm/base/rocm_base.c index a2acf7148bc..718d80e6d43 100644 --- a/src/uct/rocm/base/rocm_base.c +++ b/src/uct/rocm/base/rocm_base.c @@ -3,14 +3,18 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rocm_base.h" #include #include - #include + #define MAX_AGENTS 16 static struct agents { hsa_agent_t agents[MAX_AGENTS]; @@ -29,7 +33,7 @@ static hsa_status_t uct_rocm_hsa_agent_callback(hsa_agent_t agent, void* data) { hsa_device_type_t device_type; - assert(uct_rocm_base_agents.num < MAX_AGENTS); + ucs_assert(uct_rocm_base_agents.num < MAX_AGENTS); hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); if (device_type == HSA_DEVICE_TYPE_CPU) { @@ -87,9 +91,32 @@ hsa_status_t uct_rocm_base_init(void) return status; } +ucs_status_t +uct_rocm_base_query_md_resources(uct_component_h component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) +{ + if (uct_rocm_base_init() != HSA_STATUS_SUCCESS) { + ucs_debug("could not initialize ROCm support"); + return uct_md_query_empty_md_resource(resources_p, num_resources_p); + } + + return uct_md_query_single_md_resource(component, resources_p, + num_resources_p); +} + +ucs_status_t uct_rocm_base_query_devices(uct_md_h md, + uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) +{ + return uct_single_device_resource(md, md->component->name, + UCT_DEVICE_TYPE_ACC, tl_devices_p, + num_tl_devices_p); +} + hsa_agent_t uct_rocm_base_get_dev_agent(int dev_num) { - assert(dev_num < uct_rocm_base_agents.num); + ucs_assert(dev_num < uct_rocm_base_agents.num); return uct_rocm_base_agents.agents[dev_num]; } @@ -101,7 +128,7 @@ int uct_rocm_base_get_dev_num(hsa_agent_t agent) if (uct_rocm_base_agents.agents[i].handle == agent.handle) return i; } - assert(0); + ucs_assert(0); return -1; } @@ -143,28 +170,33 @@ hsa_status_t uct_rocm_base_get_ptr_info(void *ptr, size_t size, return HSA_STATUS_SUCCESS; } -int uct_rocm_base_is_mem_type_owned(uct_md_h md, void *addr, size_t length) +ucs_status_t uct_rocm_base_detect_memory_type(uct_md_h md, const void *addr, + size_t length, + ucs_memory_type_t *mem_type_p) { hsa_status_t status; 
hsa_amd_pointer_info_t info; if (addr == NULL) { - return 0; + *mem_type_p = UCS_MEMORY_TYPE_HOST; + return UCS_OK; } info.size = sizeof(hsa_amd_pointer_info_t); - status = hsa_amd_pointer_info(addr, &info, NULL, NULL, NULL); + status = hsa_amd_pointer_info((void*)addr, &info, NULL, NULL, NULL); if ((status == HSA_STATUS_SUCCESS) && (info.type == HSA_EXT_POINTER_TYPE_HSA)) { hsa_device_type_t dev_type; status = hsa_agent_get_info(info.agentOwner, HSA_AGENT_INFO_DEVICE, &dev_type); if ((status == HSA_STATUS_SUCCESS) && - (dev_type == HSA_DEVICE_TYPE_GPU)) - return 1; + (dev_type == HSA_DEVICE_TYPE_GPU)) { + *mem_type_p = UCS_MEMORY_TYPE_ROCM; + return UCS_OK; + } } - return 0; + return UCS_ERR_INVALID_ADDR; } UCS_MODULE_INIT() { diff --git a/src/uct/rocm/base/rocm_base.h b/src/uct/rocm/base/rocm_base.h index b3da598e381..d818b73c005 100644 --- a/src/uct/rocm/base/rocm_base.h +++ b/src/uct/rocm/base/rocm_base.h @@ -7,10 +7,18 @@ #ifndef ROCM_BASE_H #define ROCM_BASE_H +#include #include #include + hsa_status_t uct_rocm_base_init(void); +ucs_status_t uct_rocm_base_query_md_resources(uct_component_h component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p); +ucs_status_t uct_rocm_base_query_devices(uct_md_h md, + uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p); hsa_agent_t uct_rocm_base_get_dev_agent(int dev_num); int uct_rocm_base_is_gpu_agent(hsa_agent_t agent); int uct_rocm_base_get_gpu_agents(hsa_agent_t **agents); @@ -18,6 +26,8 @@ int uct_rocm_base_get_dev_num(hsa_agent_t agent); hsa_status_t uct_rocm_base_get_ptr_info(void *ptr, size_t size, void **base_ptr, size_t *base_size, hsa_agent_t *agent); -int uct_rocm_base_is_mem_type_owned(uct_md_h md, void *addr, size_t length); +ucs_status_t uct_rocm_base_detect_memory_type(uct_md_h md, const void *addr, + size_t length, + ucs_memory_type_t *mem_type_p); #endif diff --git a/src/uct/rocm/configure.m4 b/src/uct/rocm/configure.m4 index c8488f3865d..f5179a54ff1 100644 --- 
a/src/uct/rocm/configure.m4 +++ b/src/uct/rocm/configure.m4 @@ -6,7 +6,7 @@ UCX_CHECK_ROCM -AS_IF([test "x$rocm_happy" = "xyes"], [uct_modules+=":rocm"]) +AS_IF([test "x$rocm_happy" = "xyes"], [uct_modules="${uct_modules}:rocm"]) uct_rocm_modules="" m4_include([src/uct/rocm/gdr/configure.m4]) AC_DEFINE_UNQUOTED([uct_rocm_MODULES], ["${uct_rocm_modules}"], [ROCM loadable modules]) diff --git a/src/uct/rocm/copy/rocm_copy_ep.c b/src/uct/rocm/copy/rocm_copy_ep.c index 35bca63142e..04095ad2515 100644 --- a/src/uct/rocm/copy/rocm_copy_ep.c +++ b/src/uct/rocm/copy/rocm_copy_ep.c @@ -3,12 +3,21 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rocm_copy_ep.h" #include "rocm_copy_iface.h" #include +#include #include #include +#include + +#define uct_rocm_memcpy_h2d(_d,_s,_l) memcpy((_d),(_s),(_l)) +#define uct_rocm_memcpy_d2h(_d,_s,_l) ucs_memcpy_nontemporal((_d),(_s),(_l)) static UCS_CLASS_INIT_FUNC(uct_rocm_copy_ep_t, const uct_ep_params_t *params) { @@ -44,9 +53,9 @@ uct_rocm_copy_ep_zcopy(uct_ep_h tl_ep, } if (is_put) - memcpy((void *)remote_addr, iov->buffer, size); + uct_rocm_memcpy_h2d((void *)remote_addr, iov->buffer, size); else - memcpy(iov->buffer, (void *)remote_addr, size); + uct_rocm_memcpy_d2h(iov->buffer, (void *)remote_addr, size); return UCS_OK; } @@ -87,7 +96,7 @@ ucs_status_t uct_rocm_copy_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length, uint64_t remote_addr, uct_rkey_t rkey) { - memcpy((void *)remote_addr, buffer, length); + uct_rocm_memcpy_h2d((void *)remote_addr, buffer, length); UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), PUT, SHORT, length); ucs_trace_data("PUT_SHORT size %d from %p to %p", @@ -99,8 +108,7 @@ ucs_status_t uct_rocm_copy_ep_get_short(uct_ep_h tl_ep, void *buffer, unsigned length, uint64_t remote_addr, uct_rkey_t rkey) { - /* device to host */ - memcpy(buffer, (void *)remote_addr, length); + uct_rocm_memcpy_d2h(buffer, (void *)remote_addr, length); 
UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, SHORT, length); ucs_trace_data("GET_SHORT size %d from %p to %p", diff --git a/src/uct/rocm/copy/rocm_copy_iface.c b/src/uct/rocm/copy/rocm_copy_iface.c index 8c7b09b063a..1d6b1a0bb7c 100644 --- a/src/uct/rocm/copy/rocm_copy_iface.c +++ b/src/uct/rocm/copy/rocm_copy_iface.c @@ -3,13 +3,19 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rocm_copy_iface.h" #include "rocm_copy_md.h" #include "rocm_copy_ep.h" +#include #include #include + static ucs_config_field_t uct_rocm_copy_iface_config_table[] = { {"", "", NULL, @@ -42,10 +48,12 @@ static int uct_rocm_copy_iface_is_reachable(const uct_iface_h tl_iface, return (addr != NULL) && (iface->id == *addr); } -static ucs_status_t uct_rocm_copy_iface_query(uct_iface_h iface, +static ucs_status_t uct_rocm_copy_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { - memset(iface_attr, 0, sizeof(uct_iface_attr_t)); + uct_rocm_copy_iface_t *iface = ucs_derived_of(tl_iface, uct_rocm_copy_iface_t); + + uct_base_iface_query(&iface->super, iface_attr); iface_attr->iface_addr_len = sizeof(uct_rocm_copy_iface_addr_t); iface_attr->device_addr_len = 0; @@ -82,9 +90,9 @@ static ucs_status_t uct_rocm_copy_iface_query(uct_iface_h iface, iface_attr->cap.am.max_hdr = 0; iface_attr->cap.am.max_iov = 1; - iface_attr->latency.overhead = 10e-6; /* 10 us */ - iface_attr->latency.growth = 0; - iface_attr->bandwidth = 6911 * 1024.0 * 1024.0; + iface_attr->latency = ucs_linear_func_make(10e-6, 0); + iface_attr->bandwidth.dedicated = 6911.0 * UCS_MBYTE; + iface_attr->bandwidth.shared = 0; iface_attr->overhead = 0; iface_attr->priority = 0; @@ -109,7 +117,7 @@ static uct_iface_ops_t uct_rocm_copy_iface_ops = { .iface_progress = ucs_empty_function_return_zero, .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_rocm_copy_iface_t), .iface_query = uct_rocm_copy_iface_query, - .iface_get_device_address = 
(void*)ucs_empty_function_return_success, + .iface_get_device_address = (uct_iface_get_device_address_func_t)ucs_empty_function_return_success, .iface_get_address = uct_rocm_copy_iface_get_address, .iface_is_reachable = uct_rocm_copy_iface_is_reachable, }; @@ -137,37 +145,7 @@ UCS_CLASS_DEFINE_NEW_FUNC(uct_rocm_copy_iface_t, uct_iface_t, uct_md_h, uct_work const uct_iface_params_t*, const uct_iface_config_t*); static UCS_CLASS_DEFINE_DELETE_FUNC(uct_rocm_copy_iface_t, uct_iface_t); - -static ucs_status_t uct_rocm_copy_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) -{ - uct_tl_resource_desc_t *resource; - - resource = ucs_calloc(1, sizeof(uct_tl_resource_desc_t), "ROCm copy resource desc"); - if (NULL == resource) { - ucs_error("Failed to allocate memory"); - return UCS_ERR_NO_MEMORY; - } - - ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", - UCT_ROCM_COPY_TL_NAME); - ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", - md->component->name); - - resource->dev_type = UCT_DEVICE_TYPE_ACC; - - *num_resources_p = 1; - *resource_p = resource; - return UCS_OK; -} - -UCT_TL_COMPONENT_DEFINE(uct_rocm_copy_tl, - uct_rocm_copy_query_tl_resources, - uct_rocm_copy_iface_t, - UCT_ROCM_COPY_TL_NAME, - "ROCM_COPY_", - uct_rocm_copy_iface_config_table, - uct_rocm_copy_iface_config_t); - -UCT_MD_REGISTER_TL(&uct_rocm_copy_md_component, &uct_rocm_copy_tl); +UCT_TL_DEFINE(&uct_rocm_copy_component, rocm_copy, + uct_rocm_base_query_devices, uct_rocm_copy_iface_t, + "ROCM_COPY_", uct_rocm_copy_iface_config_table, + uct_rocm_copy_iface_config_t); diff --git a/src/uct/rocm/copy/rocm_copy_md.c b/src/uct/rocm/copy/rocm_copy_md.c index 659cfa50935..21ebad69482 100644 --- a/src/uct/rocm/copy/rocm_copy_md.c +++ b/src/uct/rocm/copy/rocm_copy_md.c @@ -3,6 +3,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rocm_copy_md.h" #include @@ -26,14 +30,15 @@ static ucs_config_field_t uct_rocm_copy_md_config_table[] = { static ucs_status_t uct_rocm_copy_md_query(uct_md_h md, uct_md_attr_t *md_attr) { - md_attr->cap.flags = UCT_MD_FLAG_REG; - md_attr->cap.reg_mem_types = UCS_BIT(UCT_MD_MEM_TYPE_HOST); - md_attr->cap.mem_type = UCT_MD_MEM_TYPE_ROCM; - md_attr->cap.max_alloc = 0; - md_attr->cap.max_reg = ULONG_MAX; - md_attr->rkey_packed_size = 0; - md_attr->reg_cost.overhead = 0; - md_attr->reg_cost.growth = 0; + md_attr->cap.flags = UCT_MD_FLAG_REG; + md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); + md_attr->cap.access_mem_type = UCS_MEMORY_TYPE_ROCM; + md_attr->cap.detect_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM) | + UCS_BIT(UCS_MEMORY_TYPE_ROCM_MANAGED); + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = ULONG_MAX; + md_attr->rkey_packed_size = 0; + md_attr->reg_cost = ucs_linear_func_make(0, 0); memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); return UCS_OK; } @@ -44,17 +49,17 @@ static ucs_status_t uct_rocm_copy_mkey_pack(uct_md_h md, uct_mem_h memh, return UCS_OK; } -static ucs_status_t uct_rocm_copy_rkey_unpack(uct_md_component_t *mdc, - const void *rkey_buffer, uct_rkey_t *rkey_p, - void **handle_p) +static ucs_status_t uct_rocm_copy_rkey_unpack(uct_component_t *component, + const void *rkey_buffer, + uct_rkey_t *rkey_p, void **handle_p) { *rkey_p = 0xdeadbeef; *handle_p = NULL; return UCS_OK; } -static ucs_status_t uct_rocm_copy_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, - void *handle) +static ucs_status_t uct_rocm_copy_rkey_release(uct_component_t *component, + uct_rkey_t rkey, void *handle) { return UCS_OK; } @@ -96,20 +101,6 @@ static ucs_status_t uct_rocm_copy_mem_dereg(uct_md_h md, uct_mem_h memh) return UCS_OK; } -static ucs_status_t uct_rocm_copy_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) -{ - if 
(uct_rocm_base_init() != HSA_STATUS_SUCCESS) { - ucs_debug("Could not initialize ROCm support"); - *resources_p = NULL; - *num_resources_p = 0; - return UCS_OK; - } - - return uct_single_md_resource(&uct_rocm_copy_md_component, resources_p, - num_resources_p); -} - static void uct_rocm_copy_md_close(uct_md_h uct_md) { uct_rocm_copy_md_t *md = ucs_derived_of(uct_md, uct_rocm_copy_md_t); @@ -117,16 +108,17 @@ static void uct_rocm_copy_md_close(uct_md_h uct_md) { } static uct_md_ops_t md_ops = { - .close = uct_rocm_copy_md_close, - .query = uct_rocm_copy_md_query, - .mkey_pack = uct_rocm_copy_mkey_pack, - .mem_reg = uct_rocm_copy_mem_reg, - .mem_dereg = uct_rocm_copy_mem_dereg, - .is_mem_type_owned = uct_rocm_base_is_mem_type_owned, + .close = uct_rocm_copy_md_close, + .query = uct_rocm_copy_md_query, + .mkey_pack = uct_rocm_copy_mkey_pack, + .mem_reg = uct_rocm_copy_mem_reg, + .mem_dereg = uct_rocm_copy_mem_dereg, + .detect_memory_type = uct_rocm_base_detect_memory_type }; -static ucs_status_t uct_rocm_copy_md_open(const char *md_name, const uct_md_config_t *md_config, - uct_md_h *md_p) +static ucs_status_t +uct_rocm_copy_md_open(uct_component_h component, const char *md_name, + const uct_md_config_t *md_config, uct_md_h *md_p) { uct_rocm_copy_md_t *md; @@ -136,14 +128,30 @@ static ucs_status_t uct_rocm_copy_md_open(const char *md_name, const uct_md_conf return UCS_ERR_NO_MEMORY; } - md->super.ops = &md_ops; - md->super.component = &uct_rocm_copy_md_component; + md->super.ops = &md_ops; + md->super.component = &uct_rocm_copy_component; *md_p = (uct_md_h) md; return UCS_OK; } -UCT_MD_COMPONENT_DEFINE(uct_rocm_copy_md_component, UCT_ROCM_COPY_MD_NAME, - uct_rocm_copy_query_md_resources, uct_rocm_copy_md_open, NULL, - uct_rocm_copy_rkey_unpack, uct_rocm_copy_rkey_release, "ROCM_COPY_", - uct_rocm_copy_md_config_table, uct_rocm_copy_md_config_t); +uct_component_t uct_rocm_copy_component = { + .query_md_resources = uct_rocm_base_query_md_resources, + .md_open = 
uct_rocm_copy_md_open, + .cm_open = ucs_empty_function_return_unsupported, + .rkey_unpack = uct_rocm_copy_rkey_unpack, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = uct_rocm_copy_rkey_release, + .name = "rocm_cpy", + .md_config = { + .name = "ROCm-copy memory domain", + .prefix = "ROCM_COPY_", + .table = uct_rocm_copy_md_config_table, + .size = sizeof(uct_rocm_copy_md_config_t), + }, + .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_rocm_copy_component), + .flags = 0 +}; +UCT_COMPONENT_REGISTER(&uct_rocm_copy_component); + diff --git a/src/uct/rocm/copy/rocm_copy_md.h b/src/uct/rocm/copy/rocm_copy_md.h index 275787e2424..642d20275c4 100644 --- a/src/uct/rocm/copy/rocm_copy_md.h +++ b/src/uct/rocm/copy/rocm_copy_md.h @@ -8,9 +8,8 @@ #include -#define UCT_ROCM_COPY_MD_NAME "rocm_cpy" -extern uct_md_component_t uct_rocm_copy_md_component; +extern uct_component_t uct_rocm_copy_component; typedef struct uct_rocm_copy_md { struct uct_md super; diff --git a/src/uct/rocm/gdr/configure.m4 b/src/uct/rocm/gdr/configure.m4 index 067c5424a87..23c32b4181e 100644 --- a/src/uct/rocm/gdr/configure.m4 +++ b/src/uct/rocm/gdr/configure.m4 @@ -6,5 +6,5 @@ UCX_CHECK_GDRCOPY AS_IF([test "x$gdrcopy_happy" = "xyes" && test "x$rocm_happy" = "xyes"], - [uct_rocm_modules+=":gdr"]) + [uct_rocm_modules="${uct_rocm_modules}:gdr"]) AC_CONFIG_FILES([src/uct/rocm/gdr/Makefile]) diff --git a/src/uct/rocm/gdr/rocm_gdr_ep.c b/src/uct/rocm/gdr/rocm_gdr_ep.c index dbf2b1c4ee6..0ed879dd355 100644 --- a/src/uct/rocm/gdr/rocm_gdr_ep.c +++ b/src/uct/rocm/gdr/rocm_gdr_ep.c @@ -3,6 +3,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rocm_gdr_ep.h" #include "rocm_gdr_iface.h" diff --git a/src/uct/rocm/gdr/rocm_gdr_iface.c b/src/uct/rocm/gdr/rocm_gdr_iface.c index 0dd00f52f1b..4fb20073625 100644 --- a/src/uct/rocm/gdr/rocm_gdr_iface.c +++ b/src/uct/rocm/gdr/rocm_gdr_iface.c @@ -3,13 +3,19 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rocm_gdr_iface.h" #include "rocm_gdr_md.h" #include "rocm_gdr_ep.h" +#include #include #include + static ucs_config_field_t uct_rocm_gdr_iface_config_table[] = { {"", "", NULL, @@ -42,10 +48,12 @@ static int uct_rocm_gdr_iface_is_reachable(const uct_iface_h tl_iface, return (addr != NULL) && (iface->id == *addr); } -static ucs_status_t uct_rocm_gdr_iface_query(uct_iface_h iface, +static ucs_status_t uct_rocm_gdr_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { - memset(iface_attr, 0, sizeof(uct_iface_attr_t)); + uct_rocm_gdr_iface_t *iface = ucs_derived_of(tl_iface, uct_rocm_gdr_iface_t); + + uct_base_iface_query(&iface->super, iface_attr); iface_attr->iface_addr_len = sizeof(uct_rocm_gdr_iface_addr_t); iface_attr->device_addr_len = 0; @@ -79,9 +87,9 @@ static ucs_status_t uct_rocm_gdr_iface_query(uct_iface_h iface, iface_attr->cap.am.max_hdr = 0; iface_attr->cap.am.max_iov = 1; - iface_attr->latency.overhead = 1e-6; /* 1 us */ - iface_attr->latency.growth = 0; - iface_attr->bandwidth = 6911 * 1024.0 * 1024.0; + iface_attr->latency = ucs_linear_func_make(1e-6, 0); + iface_attr->bandwidth.dedicated = 0; + iface_attr->bandwidth.shared = 6911.0 * UCS_MBYTE; iface_attr->overhead = 0; iface_attr->priority = 0; @@ -104,7 +112,7 @@ static uct_iface_ops_t uct_rocm_gdr_iface_ops = { .iface_progress = ucs_empty_function_return_zero, .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_rocm_gdr_iface_t), .iface_query = uct_rocm_gdr_iface_query, - .iface_get_device_address = (void*)ucs_empty_function_return_success, + .iface_get_device_address = 
(uct_iface_get_device_address_func_t)ucs_empty_function_return_success, .iface_get_address = uct_rocm_gdr_iface_get_address, .iface_is_reachable = uct_rocm_gdr_iface_is_reachable, }; @@ -132,37 +140,6 @@ UCS_CLASS_DEFINE_NEW_FUNC(uct_rocm_gdr_iface_t, uct_iface_t, uct_md_h, uct_worke const uct_iface_params_t*, const uct_iface_config_t*); static UCS_CLASS_DEFINE_DELETE_FUNC(uct_rocm_gdr_iface_t, uct_iface_t); - -static ucs_status_t uct_rocm_gdr_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) -{ - uct_tl_resource_desc_t *resource; - - resource = ucs_calloc(1, sizeof(uct_tl_resource_desc_t), "ROCm copy resource desc"); - if (NULL == resource) { - ucs_error("Failed to allocate memory"); - return UCS_ERR_NO_MEMORY; - } - - ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", - UCT_ROCM_GDR_TL_NAME); - ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", - md->component->name); - - resource->dev_type = UCT_DEVICE_TYPE_ACC; - - *num_resources_p = 1; - *resource_p = resource; - return UCS_OK; -} - -UCT_TL_COMPONENT_DEFINE(uct_rocm_gdr_tl, - uct_rocm_gdr_query_tl_resources, - uct_rocm_gdr_iface_t, - UCT_ROCM_GDR_TL_NAME, - "ROCM_GDR_", - uct_rocm_gdr_iface_config_table, - uct_rocm_gdr_iface_config_t); - -UCT_MD_REGISTER_TL(&uct_rocm_gdr_md_component, &uct_rocm_gdr_tl); +UCT_TL_DEFINE(&uct_rocm_gdr_component, rocm_gdr, uct_rocm_base_query_devices, + uct_rocm_gdr_iface_t, "ROCM_GDR_", + uct_rocm_gdr_iface_config_table, uct_rocm_gdr_iface_config_t); diff --git a/src/uct/rocm/gdr/rocm_gdr_md.c b/src/uct/rocm/gdr/rocm_gdr_md.c index 4f0ef5c1a3c..1e72ea26407 100644 --- a/src/uct/rocm/gdr/rocm_gdr_md.c +++ b/src/uct/rocm/gdr/rocm_gdr_md.c @@ -3,6 +3,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rocm_gdr_md.h" #include @@ -26,15 +30,15 @@ static ucs_config_field_t uct_rocm_gdr_md_config_table[] = { static ucs_status_t uct_rocm_gdr_md_query(uct_md_h md, uct_md_attr_t *md_attr) { - md_attr->cap.flags = UCT_MD_FLAG_REG | - UCT_MD_FLAG_NEED_RKEY; - md_attr->cap.reg_mem_types = UCS_BIT(UCT_MD_MEM_TYPE_ROCM); - md_attr->cap.mem_type = UCT_MD_MEM_TYPE_ROCM; - md_attr->cap.max_alloc = 0; - md_attr->cap.max_reg = ULONG_MAX; - md_attr->rkey_packed_size = sizeof(uct_rocm_gdr_key_t); - md_attr->reg_cost.overhead = 0; - md_attr->reg_cost.growth = 0; + md_attr->cap.flags = UCT_MD_FLAG_REG | + UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); + md_attr->cap.access_mem_type = UCS_MEMORY_TYPE_ROCM; + md_attr->cap.detect_mem_types = 0; + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = ULONG_MAX; + md_attr->rkey_packed_size = sizeof(uct_rocm_gdr_key_t); + md_attr->reg_cost = ucs_linear_func_make(0, 0); memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); return UCS_OK; } @@ -48,9 +52,9 @@ static ucs_status_t uct_rocm_gdr_mkey_pack(uct_md_h md, uct_mem_h memh, return UCS_OK; } -static ucs_status_t uct_rocm_gdr_rkey_unpack(uct_md_component_t *mdc, - const void *rkey_buffer, uct_rkey_t *rkey_p, - void **handle_p) +static ucs_status_t uct_rocm_gdr_rkey_unpack(uct_component_t *component, + const void *rkey_buffer, + uct_rkey_t *rkey_p, void **handle_p) { //uct_rocm_gdr_key_t *packed = (uct_rocm_gdr_key_t *)rkey_buffer; uct_rocm_gdr_key_t *key; @@ -69,8 +73,8 @@ static ucs_status_t uct_rocm_gdr_rkey_unpack(uct_md_component_t *mdc, return UCS_OK; } -static ucs_status_t uct_rocm_gdr_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, - void *handle) +static ucs_status_t uct_rocm_gdr_rkey_release(uct_component_t *component, + uct_rkey_t rkey, void *handle) { ucs_assert(NULL == handle); ucs_free((void *)rkey); @@ -100,13 +104,6 @@ static ucs_status_t 
uct_rocm_gdr_mem_dereg(uct_md_h md, uct_mem_h memh) return UCS_OK; } -static ucs_status_t uct_rocm_gdr_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) -{ - return uct_single_md_resource(&uct_rocm_gdr_md_component, resources_p, - num_resources_p); -} - static void uct_rocm_gdr_md_close(uct_md_h uct_md) { uct_rocm_gdr_md_t *md = ucs_derived_of(uct_md, uct_rocm_gdr_md_t); @@ -114,16 +111,17 @@ static void uct_rocm_gdr_md_close(uct_md_h uct_md) { } static uct_md_ops_t md_ops = { - .close = uct_rocm_gdr_md_close, - .query = uct_rocm_gdr_md_query, - .mkey_pack = uct_rocm_gdr_mkey_pack, - .mem_reg = uct_rocm_gdr_mem_reg, - .mem_dereg = uct_rocm_gdr_mem_dereg, - .is_mem_type_owned = uct_rocm_base_is_mem_type_owned, + .close = uct_rocm_gdr_md_close, + .query = uct_rocm_gdr_md_query, + .mkey_pack = uct_rocm_gdr_mkey_pack, + .mem_reg = uct_rocm_gdr_mem_reg, + .mem_dereg = uct_rocm_gdr_mem_dereg, + .detect_memory_type = ucs_empty_function_return_unsupported, }; -static ucs_status_t uct_rocm_gdr_md_open(const char *md_name, const uct_md_config_t *md_config, - uct_md_h *md_p) +static ucs_status_t +uct_rocm_gdr_md_open(uct_component_h component, const char *md_name, + const uct_md_config_t *md_config, uct_md_h *md_p) { uct_rocm_gdr_md_t *md; @@ -133,14 +131,30 @@ static ucs_status_t uct_rocm_gdr_md_open(const char *md_name, const uct_md_confi return UCS_ERR_NO_MEMORY; } - md->super.ops = &md_ops; - md->super.component = &uct_rocm_gdr_md_component; + md->super.ops = &md_ops; + md->super.component = &uct_rocm_gdr_component; *md_p = (uct_md_h) md; return UCS_OK; } -UCT_MD_COMPONENT_DEFINE(uct_rocm_gdr_md_component, UCT_ROCM_GDR_MD_NAME, - uct_rocm_gdr_query_md_resources, uct_rocm_gdr_md_open, NULL, - uct_rocm_gdr_rkey_unpack, uct_rocm_gdr_rkey_release, "ROCM_GDR_", - uct_rocm_gdr_md_config_table, uct_rocm_gdr_md_config_t); +uct_component_t uct_rocm_gdr_component = { + .query_md_resources = uct_md_query_single_md_resource, + .md_open = 
uct_rocm_gdr_md_open, + .cm_open = ucs_empty_function_return_unsupported, + .rkey_unpack = uct_rocm_gdr_rkey_unpack, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = uct_rocm_gdr_rkey_release, + .name = "rocm_gdr", + .md_config = { + .name = "ROCm-gdr memory domain", + .prefix = "ROCM_GDR_", + .table = uct_rocm_gdr_md_config_table, + .size = sizeof(uct_rocm_gdr_md_config_t), + }, + .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_rocm_gdr_component), + .flags = 0 +}; +UCT_COMPONENT_REGISTER(&uct_rocm_gdr_component); + diff --git a/src/uct/rocm/gdr/rocm_gdr_md.h b/src/uct/rocm/gdr/rocm_gdr_md.h index f87786ea62d..f91facac6f3 100644 --- a/src/uct/rocm/gdr/rocm_gdr_md.h +++ b/src/uct/rocm/gdr/rocm_gdr_md.h @@ -8,9 +8,8 @@ #include -#define UCT_ROCM_GDR_MD_NAME "rocm_gdr" -extern uct_md_component_t uct_rocm_gdr_md_component; +extern uct_component_t uct_rocm_gdr_component; typedef struct uct_rocm_gdr_md { struct uct_md super; diff --git a/src/uct/rocm/ipc/rocm_ipc_cache.c b/src/uct/rocm/ipc/rocm_ipc_cache.c index e63647874d7..6575225b9d7 100644 --- a/src/uct/rocm/ipc/rocm_ipc_cache.c +++ b/src/uct/rocm/ipc/rocm_ipc_cache.c @@ -5,6 +5,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rocm_ipc_cache.h" #include @@ -15,8 +19,13 @@ static ucs_pgt_dir_t *uct_rocm_ipc_cache_pgt_dir_alloc(const ucs_pgtable_t *pgtable) { - return ucs_memalign(UCS_PGT_ENTRY_MIN_ALIGN, sizeof(ucs_pgt_dir_t), - "rocm_ipc_cache_pgdir"); + void *ptr; + int ret; + + ret = ucs_posix_memalign(&ptr, + ucs_max(sizeof(void *), UCS_PGT_ENTRY_MIN_ALIGN), + sizeof(ucs_pgt_dir_t), "rocm_ipc_cache_pgdir"); + return (ret == 0) ? 
ptr : NULL; } static void uct_rocm_ipc_cache_pgt_dir_release(const ucs_pgtable_t *pgtable, @@ -93,6 +102,7 @@ ucs_status_t uct_rocm_ipc_cache_map_memhandle(void *arg, uct_rocm_ipc_key_t *key ucs_pgt_region_t *pgt_region; uct_rocm_ipc_cache_region_t *region; hsa_status_t hsa_status; + int ret; pthread_rwlock_rdlock(&cache->lock); pgt_region = UCS_PROFILE_CALL(ucs_pgtable_lookup, @@ -136,10 +146,11 @@ ucs_status_t uct_rocm_ipc_cache_map_memhandle(void *arg, uct_rocm_ipc_key_t *key } /*create new cache entry */ - region = ucs_memalign(UCS_PGT_ENTRY_MIN_ALIGN, - sizeof(uct_rocm_ipc_cache_region_t), - "uct_rocm_ipc_cache_region"); - if (region == NULL) { + ret = ucs_posix_memalign((void **)®ion, + ucs_max(sizeof(void *), UCS_PGT_ENTRY_MIN_ALIGN), + sizeof(uct_rocm_ipc_cache_region_t), + "uct_rocm_ipc_cache_region"); + if (ret != 0) { ucs_warn("failed to allocate uct_rocm_ipc_cache region"); status = UCS_ERR_NO_MEMORY; goto err; diff --git a/src/uct/rocm/ipc/rocm_ipc_ep.c b/src/uct/rocm/ipc/rocm_ipc_ep.c index 1d133ca4727..56f432a535d 100644 --- a/src/uct/rocm/ipc/rocm_ipc_ep.c +++ b/src/uct/rocm/ipc/rocm_ipc_ep.c @@ -1,13 +1,19 @@ /* * Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rocm_ipc_ep.h" #include "rocm_ipc_iface.h" #include "rocm_ipc_md.h" #include +#include static UCS_CLASS_INIT_FUNC(uct_rocm_ipc_ep_t, const uct_ep_params_t *params) { @@ -52,17 +58,14 @@ ucs_status_t uct_rocm_ipc_ep_zcopy(uct_ep_h tl_ep, { uct_rocm_ipc_ep_t *ep = ucs_derived_of(tl_ep, uct_rocm_ipc_ep_t); hsa_status_t status; - hsa_agent_t local_agent, remote_agent; + hsa_agent_t local_agent; size_t size = uct_iov_get_length(iov); ucs_status_t ret = UCS_OK; void *base_addr, *local_addr = iov->buffer; uct_rocm_ipc_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rocm_ipc_iface_t); void *remote_base_addr, *remote_copy_addr; void *dst_addr, *src_addr; - hsa_agent_t dst_agent, src_agent; uct_rocm_ipc_signal_desc_t *rocm_ipc_signal; - hsa_agent_t *gpu_agents; - int num_gpu; /* no data to deliver */ if (!size) @@ -89,37 +92,22 @@ ucs_status_t uct_rocm_ipc_ep_zcopy(uct_ep_h tl_ep, return ret; } - num_gpu = uct_rocm_base_get_gpu_agents(&gpu_agents); - status = hsa_amd_agents_allow_access(num_gpu, gpu_agents, NULL, base_addr); - if (status != HSA_STATUS_SUCCESS) { - ucs_error("fail to map local mem %p %p %d\n", - local_addr, base_addr, status); - return UCS_ERR_INVALID_ADDR; - } - - remote_copy_addr = remote_base_addr + (remote_addr - key->address); - remote_agent = uct_rocm_base_get_dev_agent(key->dev_num); - + remote_copy_addr = UCS_PTR_BYTE_OFFSET(remote_base_addr, + remote_addr - key->address); if (is_put) { dst_addr = remote_copy_addr; - dst_agent = remote_agent; - src_addr = local_addr; - src_agent = local_agent; } else { dst_addr = local_addr; - dst_agent = local_agent; - src_addr = remote_copy_addr; - src_agent = remote_agent; } rocm_ipc_signal = ucs_mpool_get(&iface->signal_pool); hsa_signal_store_screlease(rocm_ipc_signal->signal, 1); - status = hsa_amd_memory_async_copy(dst_addr, dst_agent, - src_addr, src_agent, + status = hsa_amd_memory_async_copy(dst_addr, local_agent, + src_addr, local_agent, 
size, 0, NULL, rocm_ipc_signal->signal); diff --git a/src/uct/rocm/ipc/rocm_ipc_iface.c b/src/uct/rocm/ipc/rocm_ipc_iface.c index 1a84872faec..cdffd0642af 100644 --- a/src/uct/rocm/ipc/rocm_ipc_iface.c +++ b/src/uct/rocm/ipc/rocm_ipc_iface.c @@ -1,16 +1,23 @@ /* * Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rocm_ipc_iface.h" #include "rocm_ipc_md.h" #include "rocm_ipc_ep.h" +#include #include #include #include + static ucs_config_field_t uct_rocm_ipc_iface_config_table[] = { {"", "", NULL, @@ -55,7 +62,9 @@ static int uct_rocm_ipc_iface_is_reachable(const uct_iface_h tl_iface, static ucs_status_t uct_rocm_ipc_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { - memset(iface_attr, 0, sizeof(uct_iface_attr_t)); + uct_rocm_ipc_iface_t *iface = ucs_derived_of(tl_iface, uct_rocm_ipc_iface_t); + + uct_base_iface_query(&iface->super, iface_attr); iface_attr->cap.put.min_zcopy = 0; iface_attr->cap.put.max_zcopy = SIZE_MAX; @@ -79,9 +88,9 @@ static ucs_status_t uct_rocm_ipc_iface_query(uct_iface_h tl_iface, UCT_IFACE_FLAG_CONNECT_TO_IFACE; /* TODO: get accurate info */ - iface_attr->latency.overhead = 80e-9; /* 80 ns */ - iface_attr->latency.growth = 0; - iface_attr->bandwidth = 10240 * 1024.0 * 1024.0; /* 10240 MB*/ + iface_attr->latency = ucs_linear_func_make(80e-9, 0); + iface_attr->bandwidth.dedicated = 10.0 * UCS_GBYTE; /* 10 GB */ + iface_attr->bandwidth.shared = 0; iface_attr->overhead = 0.4e-6; /* 0.4 us */ return UCS_OK; @@ -233,36 +242,6 @@ static UCS_CLASS_DEFINE_NEW_FUNC(uct_rocm_ipc_iface_t, uct_iface_t, uct_md_h, const uct_iface_config_t *); static UCS_CLASS_DEFINE_DELETE_FUNC(uct_rocm_ipc_iface_t, uct_iface_t); -static ucs_status_t uct_rocm_ipc_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) -{ - 
uct_tl_resource_desc_t *resource; - - resource = ucs_calloc(1, sizeof(uct_tl_resource_desc_t), "ROCm IPC resource desc"); - if (NULL == resource) { - ucs_error("Failed to allocate memory"); - return UCS_ERR_NO_MEMORY; - } - - ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", - UCT_ROCM_IPC_TL_NAME); - ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", - md->component->name); - - resource->dev_type = UCT_DEVICE_TYPE_ACC; - - *num_resources_p = 1; - *resource_p = resource; - return UCS_OK; -} - -UCT_TL_COMPONENT_DEFINE(uct_rocm_ipc_tl, - uct_rocm_ipc_query_tl_resources, - uct_rocm_ipc_iface_t, - UCT_ROCM_IPC_TL_NAME, - "ROCM_IPC_", - uct_rocm_ipc_iface_config_table, - uct_rocm_ipc_iface_config_t); - -UCT_MD_REGISTER_TL(&uct_rocm_ipc_md_component, &uct_rocm_ipc_tl); +UCT_TL_DEFINE(&uct_rocm_ipc_component, rocm_ipc, uct_rocm_base_query_devices, + uct_rocm_ipc_iface_t, "ROCM_IPC_", + uct_rocm_ipc_iface_config_table, uct_rocm_ipc_iface_config_t); diff --git a/src/uct/rocm/ipc/rocm_ipc_md.c b/src/uct/rocm/ipc/rocm_ipc_md.c index 33746d1ac65..e8b2b606004 100644 --- a/src/uct/rocm/ipc/rocm_ipc_md.c +++ b/src/uct/rocm/ipc/rocm_ipc_md.c @@ -1,12 +1,18 @@ /* * Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "rocm_ipc_md.h" #include + static ucs_config_field_t uct_rocm_ipc_md_config_table[] = { {"", "", NULL, ucs_offsetof(uct_rocm_ipc_md_config_t, super), @@ -15,33 +21,19 @@ static ucs_config_field_t uct_rocm_ipc_md_config_table[] = { {NULL} }; -static ucs_status_t uct_rocm_ipc_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) -{ - if (uct_rocm_base_init() != HSA_STATUS_SUCCESS) { - ucs_debug("Could not initialize ROCm support"); - *resources_p = NULL; - *num_resources_p = 0; - return UCS_OK; - } - - return uct_single_md_resource(&uct_rocm_ipc_md_component, resources_p, - num_resources_p); -} - static ucs_status_t uct_rocm_ipc_md_query(uct_md_h md, uct_md_attr_t *md_attr) { - md_attr->rkey_packed_size = sizeof(uct_rocm_ipc_key_t); - md_attr->cap.flags = UCT_MD_FLAG_REG | - UCT_MD_FLAG_NEED_RKEY; - md_attr->cap.reg_mem_types = UCS_BIT(UCT_MD_MEM_TYPE_ROCM); - md_attr->cap.mem_type = UCT_MD_MEM_TYPE_ROCM; - md_attr->cap.max_alloc = 0; - md_attr->cap.max_reg = ULONG_MAX; + md_attr->rkey_packed_size = sizeof(uct_rocm_ipc_key_t); + md_attr->cap.flags = UCT_MD_FLAG_REG | + UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); + md_attr->cap.access_mem_type = UCS_MEMORY_TYPE_ROCM; + md_attr->cap.detect_mem_types = 0; + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = ULONG_MAX; /* TODO: get accurate number */ - md_attr->reg_cost.overhead = 9e-9; - md_attr->reg_cost.growth = 0; + md_attr->reg_cost = ucs_linear_func_make(9e-9, 0); memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); return UCS_OK; @@ -116,30 +108,30 @@ static ucs_status_t uct_rocm_ipc_mem_dereg(uct_md_h md, uct_mem_h memh) return UCS_OK; } -static ucs_status_t uct_rocm_ipc_md_open(const char *md_name, - const uct_md_config_t *uct_md_config, - uct_md_h *md_p) +static ucs_status_t +uct_rocm_ipc_md_open(uct_component_h component, const char *md_name, + const 
uct_md_config_t *uct_md_config, uct_md_h *md_p) { static uct_md_ops_t md_ops = { - .close = (void*)ucs_empty_function, - .query = uct_rocm_ipc_md_query, - .mkey_pack = uct_rocm_ipc_mkey_pack, - .mem_reg = uct_rocm_ipc_mem_reg, - .mem_dereg = uct_rocm_ipc_mem_dereg, - .is_mem_type_owned = uct_rocm_base_is_mem_type_owned, + .close = (uct_md_close_func_t)ucs_empty_function, + .query = uct_rocm_ipc_md_query, + .mkey_pack = uct_rocm_ipc_mkey_pack, + .mem_reg = uct_rocm_ipc_mem_reg, + .mem_dereg = uct_rocm_ipc_mem_dereg, + .detect_memory_type = ucs_empty_function_return_unsupported, }; static uct_md_t md = { .ops = &md_ops, - .component = &uct_rocm_ipc_md_component, + .component = &uct_rocm_ipc_component, }; *md_p = &md; return UCS_OK; } -static ucs_status_t uct_rocm_ipc_rkey_unpack(uct_md_component_t *mdc, - const void *rkey_buffer, uct_rkey_t *rkey_p, - void **handle_p) +static ucs_status_t uct_rocm_ipc_rkey_unpack(uct_component_t *component, + const void *rkey_buffer, + uct_rkey_t *rkey_p, void **handle_p) { uct_rocm_ipc_key_t *packed = (uct_rocm_ipc_key_t *)rkey_buffer; uct_rocm_ipc_key_t *key; @@ -157,7 +149,7 @@ static ucs_status_t uct_rocm_ipc_rkey_unpack(uct_md_component_t *mdc, return UCS_OK; } -static ucs_status_t uct_rocm_ipc_rkey_release(uct_md_component_t *mdc, +static ucs_status_t uct_rocm_ipc_rkey_release(uct_component_t *component, uct_rkey_t rkey, void *handle) { ucs_assert(NULL == handle); @@ -165,12 +157,23 @@ static ucs_status_t uct_rocm_ipc_rkey_release(uct_md_component_t *mdc, return UCS_OK; } -UCT_MD_COMPONENT_DEFINE(uct_rocm_ipc_md_component, - UCT_ROCM_IPC_MD_NAME, - uct_rocm_ipc_query_md_resources, - uct_rocm_ipc_md_open, 0, - uct_rocm_ipc_rkey_unpack, - uct_rocm_ipc_rkey_release, - "ROCM_IPC_MD_", - uct_rocm_ipc_md_config_table, - uct_rocm_ipc_md_config_t); +uct_component_t uct_rocm_ipc_component = { + .query_md_resources = uct_rocm_base_query_md_resources, + .md_open = uct_rocm_ipc_md_open, + .cm_open = ucs_empty_function_return_unsupported, + 
.rkey_unpack = uct_rocm_ipc_rkey_unpack, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = uct_rocm_ipc_rkey_release, + .name = "rocm_ipc", + .md_config = { + .name = "ROCm-IPC memory domain", + .prefix = "ROCM_IPC_MD_", + .table = uct_rocm_ipc_md_config_table, + .size = sizeof(uct_rocm_ipc_md_config_t), + }, + .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_rocm_ipc_component), + .flags = 0 +}; +UCT_COMPONENT_REGISTER(&uct_rocm_ipc_component); + diff --git a/src/uct/rocm/ipc/rocm_ipc_md.h b/src/uct/rocm/ipc/rocm_ipc_md.h index b971961e9f3..ebe46985493 100644 --- a/src/uct/rocm/ipc/rocm_ipc_md.h +++ b/src/uct/rocm/ipc/rocm_ipc_md.h @@ -9,9 +9,8 @@ #include #include -#define UCT_ROCM_IPC_MD_NAME "rocm_ipc" -extern uct_md_component_t uct_rocm_ipc_md_component; +extern uct_component_t uct_rocm_ipc_component; typedef struct uct_rocm_ipc_md { struct uct_md super; diff --git a/src/uct/sm/Makefile.am b/src/uct/sm/Makefile.am index 189f97cc0c3..48a2d618dec 100644 --- a/src/uct/sm/Makefile.am +++ b/src/uct/sm/Makefile.am @@ -3,4 +3,4 @@ # See file LICENSE for terms. # -SUBDIRS = cma knem mm +SUBDIRS = scopy mm diff --git a/src/uct/sm/base/sm_ep.c b/src/uct/sm/base/sm_ep.c index e6910f3ec42..1fecff32545 100644 --- a/src/uct/sm/base/sm_ep.c +++ b/src/uct/sm/base/sm_ep.c @@ -3,6 +3,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "sm_ep.h" #include diff --git a/src/uct/sm/base/sm_iface.c b/src/uct/sm/base/sm_iface.c index 93f9a16617d..feb5a01af9c 100644 --- a/src/uct/sm/base/sm_iface.c +++ b/src/uct/sm/base/sm_iface.c @@ -5,37 +5,114 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "sm_iface.h" -#include #include #include #include #include +#include + + +#define UCS_SM_IFACE_ADDR_FLAG_EXT UCS_BIT(63) + + +typedef struct { + uint64_t id; +} ucs_sm_iface_base_device_addr_t; +typedef struct { + ucs_sm_iface_base_device_addr_t super; + ucs_sys_ns_t ipc_ns; +} ucs_sm_iface_ext_device_addr_t; -static uint64_t uct_sm_iface_node_guid(uct_base_iface_t *iface) + +ucs_config_field_t uct_sm_iface_config_table[] = { + {"", "", NULL, + ucs_offsetof(uct_sm_iface_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, + + {"BW", "12179MBs", + "Effective memory bandwidth", + ucs_offsetof(uct_sm_iface_config_t, bandwidth), UCS_CONFIG_TYPE_BW}, + + {NULL} +}; + +ucs_status_t +uct_sm_base_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) { - /* The address should be different for different mm 'devices' so that - * they won't seem reachable one to another. Their 'name' will create the - * uniqueness in the address */ - return ucs_machine_guid() * - ucs_string_to_id(iface->md->component->name); + return uct_single_device_resource(md, UCT_SM_DEVICE_NAME, + UCT_DEVICE_TYPE_SHM, tl_devices_p, + num_tl_devices_p); } -ucs_status_t uct_sm_iface_get_device_address(uct_iface_t *tl_iface, - uct_device_addr_t *addr) + +/* read boot_id GUID or use machine_guid */ +static uint64_t uct_sm_iface_get_system_id() { - uct_base_iface_t *iface = ucs_derived_of(tl_iface, uct_base_iface_t); - *(uint64_t*)addr = uct_sm_iface_node_guid(iface); + uint64_t high; + uint64_t low; + ucs_status_t status; + + status = ucs_sys_get_boot_id(&high, &low); + if (status == UCS_OK) { + return high ^ low; + } + + return ucs_machine_guid(); +} + +ucs_status_t UCS_F_NOOPTIMIZE /* GCC failed to compile it in release mode */ +uct_sm_iface_get_device_address(uct_iface_t *tl_iface, uct_device_addr_t *addr) +{ + ucs_sm_iface_ext_device_addr_t *ext_addr = (void*)addr; + + 
ext_addr->super.id = uct_sm_iface_get_system_id() & ~UCS_SM_IFACE_ADDR_FLAG_EXT; + + if (!ucs_sys_ns_is_default(UCS_SYS_NS_TYPE_IPC)) { + ext_addr->super.id |= UCS_SM_IFACE_ADDR_FLAG_EXT; + ext_addr->ipc_ns = ucs_sys_get_ns(UCS_SYS_NS_TYPE_IPC); + } + return UCS_OK; } -int uct_sm_iface_is_reachable(const uct_iface_h tl_iface, const uct_device_addr_t *dev_addr, +int uct_sm_iface_is_reachable(const uct_iface_h tl_iface, + const uct_device_addr_t *dev_addr, const uct_iface_addr_t *iface_addr) { - uct_base_iface_t *iface = ucs_derived_of(tl_iface, uct_base_iface_t); - return uct_sm_iface_node_guid(iface) == *(const uint64_t*)dev_addr; + ucs_sm_iface_ext_device_addr_t *ext_addr = (void*)dev_addr; + ucs_sm_iface_ext_device_addr_t my_addr = {}; + ucs_status_t status; + + status = uct_sm_iface_get_device_address(tl_iface, + (uct_device_addr_t*)&my_addr); + if (status != UCS_OK) { + ucs_error("failed to get device address"); + return 0; + } + + /* do not merge these evaluations into single 'if' due + * to clags compilation warning */ + /* check if both processes are on same host and + * both of them are in root (or non-root) pid namespace */ + if (ext_addr->super.id != my_addr.super.id) { + return 0; + } + + if (!(ext_addr->super.id & UCS_SM_IFACE_ADDR_FLAG_EXT)) { + return 1; /* both processes are in root namespace */ + } + + /* ok, we are in non-root PID namespace - return 1 if ID of + * namespaces are same */ + return ext_addr->ipc_ns == my_addr.ipc_ns; } ucs_status_t uct_sm_iface_fence(uct_iface_t *tl_iface, unsigned flags) @@ -51,3 +128,42 @@ ucs_status_t uct_sm_ep_fence(uct_ep_t *tl_ep, unsigned flags) UCT_TL_EP_STAT_FENCE(ucs_derived_of(tl_ep, uct_base_ep_t)); return UCS_OK; } + +size_t uct_sm_iface_get_device_addr_len() +{ + return ucs_sys_ns_is_default(UCS_SYS_NS_TYPE_IPC) ? 
+ sizeof(ucs_sm_iface_base_device_addr_t) : + sizeof(ucs_sm_iface_ext_device_addr_t); +} + +UCS_CLASS_INIT_FUNC(uct_sm_iface_t, uct_iface_ops_t *ops, uct_md_h md, + uct_worker_h worker, const uct_iface_params_t *params, + const uct_iface_config_t *tl_config) +{ + uct_sm_iface_config_t *sm_config = ucs_derived_of(tl_config, + uct_sm_iface_config_t); + + UCT_CHECK_PARAM(params->field_mask & UCT_IFACE_PARAM_FIELD_OPEN_MODE, + "UCT_IFACE_PARAM_FIELD_OPEN_MODE is not defined"); + if (!(params->open_mode & UCT_IFACE_OPEN_MODE_DEVICE)) { + ucs_error("only UCT_IFACE_OPEN_MODE_DEVICE is supported"); + return UCS_ERR_UNSUPPORTED; + } + + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, ops, md, worker, params, + tl_config + UCS_STATS_ARG((params->field_mask & + UCT_IFACE_PARAM_FIELD_STATS_ROOT) ? + params->stats_root : NULL) + UCS_STATS_ARG(params->mode.device.dev_name)); + + self->config.bandwidth = sm_config->bandwidth; + + return UCS_OK; +} + +static UCS_CLASS_CLEANUP_FUNC(uct_sm_iface_t) +{ +} + +UCS_CLASS_DEFINE(uct_sm_iface_t, uct_base_iface_t); diff --git a/src/uct/sm/base/sm_iface.h b/src/uct/sm/base/sm_iface.h index b4e4112aed0..d4c48879228 100644 --- a/src/uct/sm/base/sm_iface.h +++ b/src/uct/sm/base/sm_iface.h @@ -8,11 +8,33 @@ #define SM_IFACE_H_ #include +#include #include -#include +#include + -#define UCT_SM_IFACE_DEVICE_ADDR_LEN sizeof(uint64_t) #define UCT_SM_MAX_IOV 16 +#define UCT_SM_DEVICE_NAME "memory" + + +extern ucs_config_field_t uct_sm_iface_config_table[]; + +typedef struct uct_sm_iface_common_config { + uct_iface_config_t super; + double bandwidth; /* Memory bandwidth in bytes per second */ +} uct_sm_iface_config_t; + +typedef struct uct_sm_iface { + uct_base_iface_t super; + struct { + double bandwidth; /* Memory bandwidth in bytes per second */ + } config; +} uct_sm_iface_t; + + +ucs_status_t +uct_sm_base_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p); ucs_status_t 
uct_sm_iface_get_device_address(uct_iface_t *tl_iface, uct_device_addr_t *addr); @@ -22,11 +44,11 @@ int uct_sm_iface_is_reachable(const uct_iface_h tl_iface, const uct_device_addr_ ucs_status_t uct_sm_iface_fence(uct_iface_t *tl_iface, unsigned flags); -ucs_status_t uct_sm_ep_fence(uct_ep_t *tl_ep, unsigned flags); +size_t uct_sm_iface_get_device_addr_len(); -static UCS_F_ALWAYS_INLINE size_t uct_sm_get_max_iov() { - return ucs_min(UCT_SM_MAX_IOV, ucs_get_max_iov()); -} +ucs_status_t uct_sm_ep_fence(uct_ep_t *tl_ep, unsigned flags); +UCS_CLASS_DECLARE(uct_sm_iface_t, uct_iface_ops_t*, uct_md_h, uct_worker_h, + const uct_iface_params_t*, const uct_iface_config_t*); #endif diff --git a/src/uct/sm/cma/cma_ep.c b/src/uct/sm/cma/cma_ep.c deleted file mode 100644 index 2bd03882ae5..00000000000 --- a/src/uct/sm/cma/cma_ep.c +++ /dev/null @@ -1,163 +0,0 @@ -/** -* Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. -* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. -* Copyright (C) ARM Ltd. 2016. ALL RIGHTS RESERVED. -* See file LICENSE for terms. -*/ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif -#include - -#include "cma_ep.h" -#include -#include - - -static UCS_CLASS_INIT_FUNC(uct_cma_ep_t, const uct_ep_params_t *params) -{ - uct_cma_iface_t *iface = ucs_derived_of(params->iface, uct_cma_iface_t); - - UCT_CHECK_PARAM(params->field_mask & UCT_EP_PARAM_FIELD_IFACE_ADDR, - "UCT_EP_PARAM_FIELD_IFACE_ADDR and UCT_EP_PARAM_FIELD_DEV_ADDR are not defined"); - - UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super); - self->remote_pid = *(const pid_t*)params->iface_addr; - return UCS_OK; -} - -static UCS_CLASS_CLEANUP_FUNC(uct_cma_ep_t) -{ - /* No op */ -} - -UCS_CLASS_DEFINE(uct_cma_ep_t, uct_base_ep_t) -UCS_CLASS_DEFINE_NEW_FUNC(uct_cma_ep_t, uct_ep_t, const uct_ep_params_t *); -UCS_CLASS_DEFINE_DELETE_FUNC(uct_cma_ep_t, uct_ep_t); - - -#define uct_cma_trace_data(_remote_addr, _rkey, _fmt, ...) 
\ - ucs_trace_data(_fmt " to %"PRIx64"(%+ld)", ## __VA_ARGS__, (_remote_addr), \ - (_rkey)) - -static UCS_F_ALWAYS_INLINE -ucs_status_t uct_cma_ep_common_zcopy(uct_ep_h tl_ep, - const uct_iov_t *iov, - size_t iovcnt, - uint64_t remote_addr, - uct_completion_t *comp, - ssize_t (*fn_p)(pid_t, - const struct iovec *, - unsigned long, - const struct iovec *, - unsigned long, - unsigned long), - char *fn_name) -{ - ssize_t ret; - ssize_t delivered = 0; - size_t iov_it; - size_t iov_it_length; - size_t iov_slice_length; - size_t iov_slice_delivered; - size_t local_iov_it; - size_t length = 0; - struct iovec local_iov[UCT_SM_MAX_IOV]; - struct iovec remote_iov; - uct_cma_ep_t *ep = ucs_derived_of(tl_ep, uct_cma_ep_t); - - do { - iov_it_length = 0; - local_iov_it = 0; - for (iov_it = 0; iov_it < ucs_min(UCT_SM_MAX_IOV, iovcnt); ++iov_it) { - iov_slice_delivered = 0; - - /* Get length of the particular iov element */ - iov_slice_length = uct_iov_get_length(iov + iov_it); - - /* Skip the iov element if no data */ - if (!iov_slice_length) { - continue; - } - iov_it_length += iov_slice_length; - - if (iov_it_length <= delivered) { - continue; /* Skip the iov element if transferred already */ - } else { - /* Let's assume the iov element buffer can be delivered partially */ - if ((iov_it_length - delivered) < iov_slice_length) { - iov_slice_delivered = iov_slice_length - (iov_it_length - delivered); - } - } - - local_iov[local_iov_it].iov_base = (void *)((char *)iov[iov_it].buffer + - iov_slice_delivered); - local_iov[local_iov_it].iov_len = iov_slice_length - iov_slice_delivered; - ++local_iov_it; - } - if (!delivered) { - length = iov_it_length; /* Keep total length of the iov buffers */ - } - - if(!length) { - return UCS_OK; /* Nothing to deliver */ - } - - remote_iov.iov_base = (void *)(remote_addr + delivered); - remote_iov.iov_len = length - delivered; - - ret = fn_p(ep->remote_pid, local_iov, local_iov_it, &remote_iov, 1, 0); - if (ret < 0) { - ucs_error("%s delivered %zu 
instead of %zu, error message %s", - fn_name, delivered, length, strerror(errno)); - return UCS_ERR_IO_ERROR; - } - - delivered += ret; - } while (delivered < length); - - return UCS_OK; -} - -ucs_status_t uct_cma_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp) -{ - UCT_CHECK_IOV_SIZE(iovcnt, uct_sm_get_max_iov(), "uct_cma_ep_put_zcopy"); - - int ret = uct_cma_ep_common_zcopy(tl_ep, - iov, - iovcnt, - remote_addr, - comp, - process_vm_writev, - "process_vm_writev"); - - UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), PUT, ZCOPY, - uct_iov_total_length(iov, iovcnt)); - uct_cma_trace_data(remote_addr, rkey, "PUT_ZCOPY [length %zu]", - uct_iov_total_length(iov, iovcnt)); - return ret; -} - -ucs_status_t uct_cma_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp) -{ - UCT_CHECK_IOV_SIZE(iovcnt, uct_sm_get_max_iov(), "uct_cma_ep_get_zcopy"); - - int ret = uct_cma_ep_common_zcopy(tl_ep, - iov, - iovcnt, - remote_addr, - comp, - process_vm_readv, - "process_vm_readv"); - - UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, ZCOPY, - uct_iov_total_length(iov, iovcnt)); - uct_cma_trace_data(remote_addr, rkey, "GET_ZCOPY [length %zu]", - uct_iov_total_length(iov, iovcnt)); - return ret; -} diff --git a/src/uct/sm/cma/cma_ep.h b/src/uct/sm/cma/cma_ep.h deleted file mode 100644 index 14d479b1202..00000000000 --- a/src/uct/sm/cma/cma_ep.h +++ /dev/null @@ -1,28 +0,0 @@ -/** -* Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. -* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. -* See file LICENSE for terms. 
-*/ - -#ifndef UCT_CMA_EP_H -#define UCT_CMA_EP_H - -#include "cma_iface.h" - -#include - - -typedef struct uct_cma_ep { - uct_base_ep_t super; - pid_t remote_pid; -} uct_cma_ep_t; - -UCS_CLASS_DECLARE_NEW_FUNC(uct_cma_ep_t, uct_ep_t, const uct_ep_params_t *); -UCS_CLASS_DECLARE_DELETE_FUNC(uct_cma_ep_t, uct_ep_t); -ucs_status_t uct_cma_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp); -ucs_status_t uct_cma_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp); -#endif diff --git a/src/uct/sm/cma/cma_iface.c b/src/uct/sm/cma/cma_iface.c deleted file mode 100644 index 209120c27a2..00000000000 --- a/src/uct/sm/cma/cma_iface.c +++ /dev/null @@ -1,154 +0,0 @@ -/** - * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. - * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. 
- */ - -#include "cma_md.h" -#include "cma_iface.h" -#include "cma_ep.h" - -#include -#include -#include - - -UCT_MD_REGISTER_TL(&uct_cma_md_component, &uct_cma_tl); - -static ucs_config_field_t uct_cma_iface_config_table[] = { - {"", "ALLOC=huge,thp,mmap,heap", NULL, - ucs_offsetof(uct_cma_iface_config_t, super), - UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, - {NULL} -}; - -static ucs_status_t uct_cma_iface_get_address(uct_iface_t *tl_iface, - uct_iface_addr_t *addr) -{ - *(pid_t*)addr = getpid(); - return UCS_OK; -} - -static ucs_status_t uct_cma_iface_query(uct_iface_h tl_iface, - uct_iface_attr_t *iface_attr) -{ - memset(iface_attr, 0, sizeof(uct_iface_attr_t)); - - /* default values for all shared memory transports */ - iface_attr->cap.put.min_zcopy = 0; - iface_attr->cap.put.max_zcopy = SIZE_MAX; - iface_attr->cap.put.opt_zcopy_align = 1; - iface_attr->cap.put.align_mtu = iface_attr->cap.put.opt_zcopy_align; - iface_attr->cap.put.max_iov = uct_sm_get_max_iov(); - - iface_attr->cap.get.min_zcopy = 0; - iface_attr->cap.get.max_zcopy = SIZE_MAX; - iface_attr->cap.get.opt_zcopy_align = 1; - iface_attr->cap.get.align_mtu = iface_attr->cap.get.opt_zcopy_align; - iface_attr->cap.get.max_iov = uct_sm_get_max_iov(); - - iface_attr->cap.am.max_iov = 1; - iface_attr->cap.am.opt_zcopy_align = 1; - iface_attr->cap.am.align_mtu = iface_attr->cap.am.opt_zcopy_align; - - iface_attr->iface_addr_len = sizeof(pid_t); - iface_attr->device_addr_len = UCT_SM_IFACE_DEVICE_ADDR_LEN; - iface_attr->ep_addr_len = 0; - iface_attr->max_conn_priv = 0; - iface_attr->cap.flags = UCT_IFACE_FLAG_GET_ZCOPY | - UCT_IFACE_FLAG_PUT_ZCOPY | - UCT_IFACE_FLAG_PENDING | - UCT_IFACE_FLAG_CONNECT_TO_IFACE; - iface_attr->latency.overhead = 80e-9; /* 80 ns */ - iface_attr->latency.growth = 0; - iface_attr->bandwidth = 11145 * 1024.0 * 1024.0; - iface_attr->overhead = 0.4e-6; /* 0.4 us */ - return UCS_OK; -} - -static UCS_CLASS_DECLARE_DELETE_FUNC(uct_cma_iface_t, uct_iface_t); - -static 
uct_iface_ops_t uct_cma_iface_ops = { - .ep_put_zcopy = uct_cma_ep_put_zcopy, - .ep_get_zcopy = uct_cma_ep_get_zcopy, - .ep_pending_add = ucs_empty_function_return_busy, - .ep_pending_purge = ucs_empty_function, - .ep_flush = uct_base_ep_flush, - .ep_fence = uct_sm_ep_fence, - .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_cma_ep_t), - .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_cma_ep_t), - .iface_flush = uct_base_iface_flush, - .iface_fence = uct_sm_iface_fence, - .iface_progress_enable = ucs_empty_function, - .iface_progress_disable = ucs_empty_function, - .iface_progress = ucs_empty_function_return_zero, - .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_cma_iface_t), - .iface_query = uct_cma_iface_query, - .iface_get_address = uct_cma_iface_get_address, - .iface_get_device_address = uct_sm_iface_get_device_address, - .iface_is_reachable = uct_sm_iface_is_reachable -}; - -static UCS_CLASS_INIT_FUNC(uct_cma_iface_t, uct_md_h md, uct_worker_h worker, - const uct_iface_params_t *params, - const uct_iface_config_t *tl_config) -{ - UCT_CHECK_PARAM(params->field_mask & UCT_IFACE_PARAM_FIELD_OPEN_MODE, - "UCT_IFACE_PARAM_FIELD_OPEN_MODE is not defined"); - if (!(params->open_mode & UCT_IFACE_OPEN_MODE_DEVICE)) { - ucs_error("only UCT_IFACE_OPEN_MODE_DEVICE is supported"); - return UCS_ERR_UNSUPPORTED; - } - - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_cma_iface_ops, md, worker, - params, tl_config - UCS_STATS_ARG((params->field_mask & - UCT_IFACE_PARAM_FIELD_STATS_ROOT) ? 
- params->stats_root : NULL) - UCS_STATS_ARG(UCT_CMA_TL_NAME)); - uct_sm_get_max_iov(); /* to initialize ucs_get_max_iov static variable */ - - return UCS_OK; -} - -static UCS_CLASS_CLEANUP_FUNC(uct_cma_iface_t) -{ -} - -UCS_CLASS_DEFINE(uct_cma_iface_t, uct_base_iface_t); - -static UCS_CLASS_DEFINE_NEW_FUNC(uct_cma_iface_t, uct_iface_t, uct_md_h, - uct_worker_h, const uct_iface_params_t*, - const uct_iface_config_t *); -static UCS_CLASS_DEFINE_DELETE_FUNC(uct_cma_iface_t, uct_iface_t); - -static ucs_status_t uct_cma_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) -{ - uct_tl_resource_desc_t *resource; - - resource = ucs_calloc(1, sizeof(uct_tl_resource_desc_t), "resource desc"); - if (NULL == resource) { - ucs_error("Failed to allocate memory"); - return UCS_ERR_NO_MEMORY; - } - - ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", - UCT_CMA_TL_NAME); - ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", - md->component->name); - resource->dev_type = UCT_DEVICE_TYPE_SHM; - - *num_resources_p = 1; - *resource_p = resource; - return UCS_OK; -} - -UCT_TL_COMPONENT_DEFINE(uct_cma_tl, - uct_cma_query_tl_resources, - uct_cma_iface_t, - UCT_CMA_TL_NAME, - "", - uct_cma_iface_config_table, - uct_cma_iface_config_t); diff --git a/src/uct/sm/configure.m4 b/src/uct/sm/configure.m4 index 4dfe1ca56ee..5def12f6570 100644 --- a/src/uct/sm/configure.m4 +++ b/src/uct/sm/configure.m4 @@ -1,10 +1,9 @@ # -# Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. +# Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. # See file LICENSE for terms. 
# -m4_include([src/uct/sm/cma/configure.m4]) -m4_include([src/uct/sm/knem/configure.m4]) +m4_include([src/uct/sm/scopy/configure.m4]) m4_include([src/uct/sm/mm/configure.m4]) AC_CONFIG_FILES([src/uct/sm/Makefile]) diff --git a/src/uct/sm/knem/knem_ep.c b/src/uct/sm/knem/knem_ep.c deleted file mode 100644 index 76419ead0d3..00000000000 --- a/src/uct/sm/knem/knem_ep.c +++ /dev/null @@ -1,120 +0,0 @@ -/** - * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. - * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#include - -#include "knem_ep.h" -#include "knem_md.h" -#include -#include - -static UCS_CLASS_INIT_FUNC(uct_knem_ep_t, const uct_ep_params_t *params) -{ - uct_knem_iface_t *iface = ucs_derived_of(params->iface, uct_knem_iface_t); - - UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super); - return UCS_OK; -} - -static UCS_CLASS_CLEANUP_FUNC(uct_knem_ep_t) -{ - /* No op */ -} - -UCS_CLASS_DEFINE(uct_knem_ep_t, uct_base_ep_t) -UCS_CLASS_DEFINE_NEW_FUNC(uct_knem_ep_t, uct_ep_t, const uct_ep_params_t *); -UCS_CLASS_DEFINE_DELETE_FUNC(uct_knem_ep_t, uct_ep_t); - - -#define uct_knem_trace_data(_remote_addr, _rkey, _fmt, ...) 
\ - ucs_trace_data(_fmt " to %"PRIx64"(%+ld)", ## __VA_ARGS__, (_remote_addr), \ - (_rkey)) - -#define UCT_KNEM_ZERO_LENGTH_POST(len) \ - if (0 == len) { \ - ucs_trace_data("Zero length request: skip it"); \ - return UCS_OK; \ - } - -static inline ucs_status_t uct_knem_rma(uct_ep_h tl_ep, const uct_iov_t *iov, - size_t iovcnt, uint64_t remote_addr, - uct_knem_key_t *key, int write) -{ - struct knem_cmd_inline_copy icopy; - struct knem_cmd_param_iovec knem_iov[UCT_SM_MAX_IOV]; - uct_knem_iface_t *knem_iface = ucs_derived_of(tl_ep->iface, uct_knem_iface_t); - int knem_fd = knem_iface->knem_md->knem_fd; - int rc; - size_t iov_it; - size_t knem_iov_it = 0; - - for (iov_it = 0; iov_it < ucs_min(UCT_SM_MAX_IOV, iovcnt); ++iov_it) { - knem_iov[knem_iov_it].base = (uintptr_t)iov[iov_it].buffer; - knem_iov[knem_iov_it].len = uct_iov_get_length(iov + iov_it); - if (knem_iov[knem_iov_it].len) { - ++knem_iov_it; - } else { - continue; /* Skip zero length buffers */ - } - } - - UCT_KNEM_ZERO_LENGTH_POST(knem_iov_it); - - icopy.local_iovec_array = (uintptr_t) knem_iov; - icopy.local_iovec_nr = knem_iov_it; - icopy.remote_cookie = key->cookie; - ucs_assert(remote_addr >= key->address); - icopy.remote_offset = remote_addr - key->address; - - icopy.write = write; /* if 0 then, READ from the remote region into my local segments - * if 1 then, WRITE to the remote region from my local segment */ - icopy.flags = 0; /* TBD: add check and support for KNEM_FLAG_DMA */ - icopy.current_status = 0; - icopy.async_status_index = 0; - icopy.pad = 0; - - ucs_assert(knem_fd > -1); - rc = ioctl(knem_fd, KNEM_CMD_INLINE_COPY, &icopy); - if (rc < 0) { - ucs_error("KNEM inline copy failed, err = %d %m", rc); - return UCS_ERR_IO_ERROR; - } - - uct_knem_trace_data(remote_addr, (uintptr_t)key, "%s [length %zu]", - write?"PUT_ZCOPY":"GET_ZCOPY", - uct_iov_total_length(iov, iovcnt)); - return UCS_OK; -} - -ucs_status_t uct_knem_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t 
remote_addr, uct_rkey_t rkey, - uct_completion_t *comp) -{ - uct_knem_key_t *key = (uct_knem_key_t *)rkey; - ucs_status_t status; - - UCT_CHECK_IOV_SIZE(iovcnt, uct_sm_get_max_iov(), "uct_knem_ep_put_zcopy"); - - status = uct_knem_rma(tl_ep, iov, iovcnt, remote_addr, key, 1); - UCT_TL_EP_STAT_OP_IF_SUCCESS(status, ucs_derived_of(tl_ep, uct_base_ep_t), - PUT, ZCOPY, uct_iov_total_length(iov, iovcnt)); - return status; -} - -ucs_status_t uct_knem_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp) -{ - uct_knem_key_t *key = (uct_knem_key_t *)rkey; - ucs_status_t status; - - UCT_CHECK_IOV_SIZE(iovcnt, uct_sm_get_max_iov(), "uct_knem_ep_get_zcopy"); - - status = uct_knem_rma(tl_ep, iov, iovcnt, remote_addr, key, 0); - UCT_TL_EP_STAT_OP_IF_SUCCESS(status, ucs_derived_of(tl_ep, uct_base_ep_t), - GET, ZCOPY, uct_iov_total_length(iov, iovcnt)); - return status; -} diff --git a/src/uct/sm/knem/knem_ep.h b/src/uct/sm/knem/knem_ep.h deleted file mode 100644 index d105cfb9c69..00000000000 --- a/src/uct/sm/knem/knem_ep.h +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. - * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. 
- */ - -#ifndef UCT_KNEM_EP_H -#define UCT_KNEM_EP_H - -#include "knem_iface.h" - - -typedef struct uct_knem_ep { - uct_base_ep_t super; -} uct_knem_ep_t; - -UCS_CLASS_DECLARE_NEW_FUNC(uct_knem_ep_t, uct_ep_t, const uct_ep_params_t *); -UCS_CLASS_DECLARE_DELETE_FUNC(uct_knem_ep_t, uct_ep_t); -ucs_status_t uct_knem_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp); -ucs_status_t uct_knem_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, - uint64_t remote_addr, uct_rkey_t rkey, - uct_completion_t *comp); -#endif diff --git a/src/uct/sm/knem/knem_iface.c b/src/uct/sm/knem/knem_iface.c deleted file mode 100644 index 190ea290dda..00000000000 --- a/src/uct/sm/knem/knem_iface.c +++ /dev/null @@ -1,142 +0,0 @@ -/** - * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. - * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#include "knem_md.h" -#include "knem_iface.h" -#include "knem_ep.h" - -#include -#include -#include - - -UCT_MD_REGISTER_TL(&uct_knem_md_component, &uct_knem_tl); - -static ucs_status_t uct_knem_iface_query(uct_iface_h tl_iface, - uct_iface_attr_t *iface_attr) -{ - memset(iface_attr, 0, sizeof(uct_iface_attr_t)); - - /* default values for all shared memory transports */ - iface_attr->cap.put.min_zcopy = 0; - iface_attr->cap.put.max_zcopy = SIZE_MAX; - iface_attr->cap.put.opt_zcopy_align = 1; - iface_attr->cap.put.align_mtu = iface_attr->cap.put.opt_zcopy_align; - iface_attr->cap.put.max_iov = uct_sm_get_max_iov(); - - iface_attr->cap.get.min_zcopy = 0; - iface_attr->cap.get.max_zcopy = SIZE_MAX; - iface_attr->cap.get.opt_zcopy_align = 1; - iface_attr->cap.get.align_mtu = iface_attr->cap.get.opt_zcopy_align; - iface_attr->cap.get.max_iov = uct_sm_get_max_iov(); - - iface_attr->cap.am.max_iov = 1; - iface_attr->cap.am.opt_zcopy_align = 1; - iface_attr->cap.am.align_mtu = 
iface_attr->cap.am.opt_zcopy_align; - - iface_attr->iface_addr_len = 0; - iface_attr->device_addr_len = UCT_SM_IFACE_DEVICE_ADDR_LEN; - iface_attr->ep_addr_len = 0; - iface_attr->max_conn_priv = 0; - iface_attr->cap.flags = UCT_IFACE_FLAG_GET_ZCOPY | - UCT_IFACE_FLAG_PUT_ZCOPY | - UCT_IFACE_FLAG_PENDING | - UCT_IFACE_FLAG_CONNECT_TO_IFACE; - iface_attr->latency.overhead = 80e-9; /* 80 ns */ - iface_attr->latency.growth = 0; - iface_attr->bandwidth = 13862 * 1024.0 * 1024.0; - iface_attr->overhead = 0.25e-6; /* 0.25 us */ - return UCS_OK; -} - -static UCS_CLASS_DECLARE_DELETE_FUNC(uct_knem_iface_t, uct_iface_t); - -static uct_iface_ops_t uct_knem_iface_ops = { - .ep_put_zcopy = uct_knem_ep_put_zcopy, - .ep_get_zcopy = uct_knem_ep_get_zcopy, - .ep_pending_add = (void*)ucs_empty_function_return_busy, - .ep_pending_purge = (void*)ucs_empty_function, - .ep_flush = uct_base_ep_flush, - .ep_fence = uct_sm_ep_fence, - .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_knem_ep_t), - .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_knem_ep_t), - .iface_fence = uct_sm_iface_fence, - .iface_progress_enable = ucs_empty_function, - .iface_progress_disable = ucs_empty_function, - .iface_progress = ucs_empty_function_return_zero, - .iface_flush = uct_base_iface_flush, - .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_knem_iface_t), - .iface_query = uct_knem_iface_query, - .iface_get_device_address = uct_sm_iface_get_device_address, - .iface_get_address = (void*)ucs_empty_function_return_success, - .iface_is_reachable = uct_sm_iface_is_reachable -}; - -static UCS_CLASS_INIT_FUNC(uct_knem_iface_t, uct_md_h md, uct_worker_h worker, - const uct_iface_params_t *params, - const uct_iface_config_t *tl_config) -{ - UCT_CHECK_PARAM(params->field_mask & UCT_IFACE_PARAM_FIELD_OPEN_MODE, - "UCT_IFACE_PARAM_FIELD_OPEN_MODE is not defined"); - if (!(params->open_mode & UCT_IFACE_OPEN_MODE_DEVICE)) { - ucs_error("only UCT_IFACE_OPEN_MODE_DEVICE is supported"); - return UCS_ERR_UNSUPPORTED; - } - - 
UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_knem_iface_ops, md, worker, - params, tl_config - UCS_STATS_ARG((params->field_mask & - UCT_IFACE_PARAM_FIELD_STATS_ROOT) ? - params->stats_root : NULL) - UCS_STATS_ARG(UCT_KNEM_TL_NAME)); - self->knem_md = (uct_knem_md_t *)md; - uct_sm_get_max_iov(); /* to initialize ucs_get_max_iov static variable */ - - return UCS_OK; -} - -static UCS_CLASS_CLEANUP_FUNC(uct_knem_iface_t) -{ - /* No OP */ -} - -UCS_CLASS_DEFINE(uct_knem_iface_t, uct_base_iface_t); - -static UCS_CLASS_DEFINE_NEW_FUNC(uct_knem_iface_t, uct_iface_t, uct_md_h, - uct_worker_h, const uct_iface_params_t*, - const uct_iface_config_t *); -static UCS_CLASS_DEFINE_DELETE_FUNC(uct_knem_iface_t, uct_iface_t); - -static ucs_status_t uct_knem_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) -{ - uct_tl_resource_desc_t *resource; - - resource = ucs_calloc(1, sizeof(uct_tl_resource_desc_t), "resource desc"); - if (NULL == resource) { - ucs_error("Failed to allocate memory"); - return UCS_ERR_NO_MEMORY; - } - - ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", - UCT_KNEM_TL_NAME); - ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", - md->component->name); - resource->dev_type = UCT_DEVICE_TYPE_SHM; - - *num_resources_p = 1; - *resource_p = resource; - return UCS_OK; -} - -UCT_TL_COMPONENT_DEFINE(uct_knem_tl, - uct_knem_query_tl_resources, - uct_knem_iface_t, - UCT_KNEM_TL_NAME, - "", - uct_iface_config_table, - uct_iface_config_t); diff --git a/src/uct/sm/mm/base/mm_def.h b/src/uct/sm/mm/base/mm_def.h deleted file mode 100644 index a9d29e25dc7..00000000000 --- a/src/uct/sm/mm/base/mm_def.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. - * See file LICENSE for terms. 
- */ - -#ifndef UCT_MM_H -#define UCT_MM_H - -#include -#include -#include -#include - - -typedef struct uct_mm_ep uct_mm_ep_t; -typedef struct uct_mm_iface uct_mm_iface_t; -typedef struct uct_mm_fifo_ctl uct_mm_fifo_ctl_t; -typedef struct uct_mm_fifo_element uct_mm_fifo_element_t; -typedef struct uct_mm_recv_desc uct_mm_recv_desc_t; -typedef struct uct_mm_remote_seg uct_mm_remote_seg_t; - -#define UCT_MM_BASE_ADDRESS_HASH_SIZE 64 - -enum { - UCT_MM_FIFO_ELEM_FLAG_OWNER = UCS_BIT(0), /* new/old info */ - UCT_MM_FIFO_ELEM_FLAG_INLINE = UCS_BIT(1), /* if inline or not */ -}; - -enum { - UCT_MM_AM_BCOPY, - UCT_MM_AM_SHORT, -}; - -#define UCT_MM_IFACE_GET_FIFO_ELEM(_iface, _fifo , _index) \ - (uct_mm_fifo_element_t*) ((char*)(_fifo) + ((_index) * \ - (_iface)->config.fifo_elem_size)); - -#define UCT_MM_IFACE_GET_DESC_START(_iface, _fifo_elem_p) \ - (uct_mm_recv_desc_t *) ((_fifo_elem_p)->desc_chunk_base_addr + \ - (_fifo_elem_p)->desc_offset - (_iface)->rx_headroom) - 1; - - -/* Check if the resources on the remote peer are available for sending to it. - * i.e. check if the remote receive FIFO has room in it. - * return 1 if can send. - * return 0 if can't send. - */ -#define UCT_MM_EP_IS_ABLE_TO_SEND(_head, _tail, _fifo_size) \ - ucs_likely(((_head) - (_tail)) < (_fifo_size)) - -typedef struct uct_mm_md_config { - uct_md_config_t super; - ucs_ternary_value_t hugetlb_mode; /* Enable using huge pages */ -} uct_mm_md_config_t; - - -typedef struct uct_mm_iface_addr { - uint64_t id; - uintptr_t vaddr; -} UCS_S_PACKED uct_mm_iface_addr_t; - - -#endif /* UCT_MM_H */ diff --git a/src/uct/sm/mm/base/mm_ep.c b/src/uct/sm/mm/base/mm_ep.c index 1fc0124ea10..353b8585421 100644 --- a/src/uct/sm/mm/base/mm_ep.c +++ b/src/uct/sm/mm/base/mm_ep.c @@ -2,18 +2,86 @@ * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016. ALL RIGHTS RESERVED. 
-* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "mm_ep.h" #include -SGLIB_DEFINE_LIST_FUNCTIONS(uct_mm_remote_seg_t, uct_mm_remote_seg_compare, next) -SGLIB_DEFINE_HASHED_CONTAINER_FUNCTIONS(uct_mm_remote_seg_t, - UCT_MM_BASE_ADDRESS_HASH_SIZE, - uct_mm_remote_seg_hash) + +/* send modes */ +typedef enum { + UCT_MM_SEND_AM_BCOPY, + UCT_MM_SEND_AM_SHORT, +} uct_mm_send_op_t; + + +/* Check if the resources on the remote peer are available for sending to it. + * i.e. check if the remote receive FIFO has room in it. + * return 1 if can send. + * return 0 if can't send. + */ +#define UCT_MM_EP_IS_ABLE_TO_SEND(_head, _tail, _fifo_size) \ + ucs_likely(((_head) - (_tail)) < (_fifo_size)) + + +static UCS_F_NOINLINE ucs_status_t +uct_mm_ep_attach_remote_seg(uct_mm_ep_t *ep, uct_mm_seg_id_t seg_id, + size_t length, void **address_p) +{ + uct_mm_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_mm_iface_t); + uct_mm_remote_seg_t *remote_seg; + ucs_status_t status; + khiter_t khiter; + int khret; + + khiter = kh_put(uct_mm_remote_seg, &ep->remote_segs, seg_id, &khret); + if (khret == -1) { + ucs_error("failed to add remote segment to mm ep hash"); + return UCS_ERR_NO_MEMORY; + } + + /* we expect the key would either be never used (=1) or deleted (=2) */ + ucs_assert_always((khret == 1) || (khret == 2)); + + remote_seg = &kh_val(&ep->remote_segs, khiter); + + status = uct_mm_iface_mapper_call(iface, mem_attach, seg_id, length, + ep->remote_iface_addr, remote_seg); + if (status != UCS_OK) { + kh_del(uct_mm_remote_seg, &ep->remote_segs, khiter); + return status; + } + + *address_p = remote_seg->address; + ucs_debug("mm_ep %p: attached remote segment id 0x%"PRIx64" at %p cookie %p", + ep, seg_id, remote_seg->address, remote_seg->cookie); + return UCS_OK; +} + +static 
UCS_F_ALWAYS_INLINE ucs_status_t +uct_mm_ep_get_remote_seg(uct_mm_ep_t *ep, uct_mm_seg_id_t seg_id, size_t length, + void **address_p) +{ + khiter_t khiter; + + /* fast path - segment is already present */ + khiter = kh_get(uct_mm_remote_seg, &ep->remote_segs, seg_id); + if (ucs_likely(khiter != kh_end(&ep->remote_segs))) { + *address_p = kh_val(&ep->remote_segs, khiter).address; + return UCS_OK; + } + + /* slow path - attach new segment */ + return uct_mm_ep_attach_remote_seg(ep, seg_id, length, address_p); +} /* send a signal to remote interface using Unix-domain socket */ @@ -53,131 +121,76 @@ static void uct_mm_ep_signal_remote(uct_mm_ep_t *ep) static UCS_CLASS_INIT_FUNC(uct_mm_ep_t, const uct_ep_params_t *params) { - uct_mm_iface_t *iface = ucs_derived_of(params->iface, uct_mm_iface_t); - const uct_mm_iface_addr_t *addr = (const void *)params->iface_addr; - + uct_mm_iface_t *iface = ucs_derived_of(params->iface, uct_mm_iface_t); + uct_mm_md_t *md = ucs_derived_of(iface->super.super.md, uct_mm_md_t); + const uct_mm_iface_addr_t *addr = (const void *)params->iface_addr; ucs_status_t status; - size_t size_to_attach; + void *fifo_ptr; UCT_EP_PARAMS_CHECK_DEV_IFACE_ADDRS(params); - UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super); - - /* Connect to the remote address (remote FIFO) */ - /* Attach the address's memory */ - size_to_attach = UCT_MM_GET_FIFO_SIZE(iface); - status = - uct_mm_md_mapper_ops(iface->super.md)->attach(addr->id, - size_to_attach, - (void *)addr->vaddr, - &self->mapped_desc.address, - &self->mapped_desc.cookie, - iface->path); - if (status != UCS_OK) { - ucs_error("failed to connect to remote peer with mm. 
remote mm_id: %zu", - addr->id); - return status; + UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super.super); + + kh_init_inplace(uct_mm_remote_seg, &self->remote_segs); + ucs_arbiter_group_init(&self->arb_group); + + /* save remote md address */ + if (md->iface_addr_len > 0) { + self->remote_iface_addr = ucs_malloc(md->iface_addr_len, "mm_md_addr"); + if (self->remote_iface_addr == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err; + } + + memcpy(self->remote_iface_addr, addr + 1, md->iface_addr_len); + } else { + self->remote_iface_addr = NULL; } - self->mapped_desc.length = size_to_attach; - self->mapped_desc.mmid = addr->id; + /* Attach the remote FIFO, use the same method as bcopy descriptors */ + status = uct_mm_ep_get_remote_seg(self, addr->fifo_seg_id, + UCT_MM_GET_FIFO_SIZE(iface), &fifo_ptr); + if (status != UCS_OK) { + ucs_error("mm ep failed to connect to remote FIFO id 0x%lx: %s", + addr->fifo_seg_id, ucs_status_string(status)); + goto err_free_md_addr; + } - /* point the ep->fifo_ctl to the remote fifo. - * it's an aligned pointer to the beginning of the ctl struct in the remote FIFO */ - self->fifo_ctl = uct_mm_set_fifo_ctl(self->mapped_desc.address); + /* Initialize remote FIFO control structure */ + uct_mm_iface_set_fifo_ptrs(fifo_ptr, &self->fifo_ctl, &self->fifo_elems); self->cached_tail = self->fifo_ctl->tail; self->signal.addrlen = self->fifo_ctl->signal_addrlen; self->signal.sockaddr = self->fifo_ctl->signal_sockaddr; - /* Make sure the fifo ctrl is aligned */ - ucs_assert_always(((uintptr_t)self->fifo_ctl % UCS_SYS_CACHE_LINE_SIZE) == 0); - - /* set the ep->fifo ptr to point to the beginning of the fifo elements at - * the remote peer */ - uct_mm_set_fifo_elems_ptr(self->mapped_desc.address, &self->fifo); - - /* Initiate the hash which will keep the base_adresses of remote memory - * chunks that hold the descriptors for bcopy. 
*/ - sglib_hashed_uct_mm_remote_seg_t_init(self->remote_segments_hash); - - ucs_arbiter_group_init(&self->arb_group); - - ucs_debug("mm: ep connected: %p, to remote_shmid: %zu", self, addr->id); + ucs_debug("created mm ep %p, connected to remote FIFO id 0x%lx", + self, addr->fifo_seg_id); return UCS_OK; + +err_free_md_addr: + ucs_free(self->remote_iface_addr); +err: + return status; } static UCS_CLASS_CLEANUP_FUNC(uct_mm_ep_t) { - uct_mm_iface_t *iface = ucs_derived_of(self->super.super.iface, uct_mm_iface_t); - ucs_status_t status; - uct_mm_remote_seg_t *remote_seg; - struct sglib_hashed_uct_mm_remote_seg_t_iterator iter; - - for (remote_seg = sglib_hashed_uct_mm_remote_seg_t_it_init(&iter, self->remote_segments_hash); - remote_seg != NULL; remote_seg = sglib_hashed_uct_mm_remote_seg_t_it_next(&iter)) { - sglib_hashed_uct_mm_remote_seg_t_delete(self->remote_segments_hash, remote_seg); - /* detach the remote proceess's descriptors segment */ - status = uct_mm_md_mapper_ops(iface->super.md)->detach(remote_seg); - if (status != UCS_OK) { - ucs_warn("Unable to detach shared memory segment of descriptors: %s", - ucs_status_string(status)); - } - ucs_free(remote_seg); - } - - /* detach the remote proceess's shared memory segment (remote recv FIFO) */ - status = uct_mm_md_mapper_ops(iface->super.md)->detach(&self->mapped_desc); - if (status != UCS_OK) { - ucs_error("error detaching from remote FIFO"); - } + uct_mm_iface_t *iface = ucs_derived_of(self->super.super.iface, uct_mm_iface_t); + uct_mm_remote_seg_t remote_seg; uct_mm_ep_pending_purge(&self->super.super, NULL, NULL); + + kh_foreach_value(&self->remote_segs, remote_seg, { + uct_mm_iface_mapper_call(iface, mem_detach, &remote_seg); + }) + + ucs_free(self->remote_iface_addr); + kh_destroy_inplace(uct_mm_remote_seg, &self->remote_segs); } UCS_CLASS_DEFINE(uct_mm_ep_t, uct_base_ep_t) UCS_CLASS_DEFINE_NEW_FUNC(uct_mm_ep_t, uct_ep_t, const uct_ep_params_t *); UCS_CLASS_DEFINE_DELETE_FUNC(uct_mm_ep_t, uct_ep_t); -void 
*uct_mm_ep_attach_remote_seg(uct_mm_ep_t *ep, uct_mm_iface_t *iface, uct_mm_fifo_element_t *elem) -{ - uct_mm_remote_seg_t *remote_seg, search; - ucs_status_t status; - - /* take the mmid of the chunk that the desc belongs to, (the desc that the fifo_elem - * is 'assigned' to), and check if the ep has already attached to it. - */ - search.mmid = elem->desc_mmid; - remote_seg = sglib_hashed_uct_mm_remote_seg_t_find_member(ep->remote_segments_hash, &search); - if (remote_seg == NULL) { - /* not in the hash. attach to the memory the mmid refers to. the attach call - * will return the base address of the mmid's chunk - - * save this base address in a hash table (which maps mmid to base address). */ - remote_seg = ucs_malloc(sizeof(*remote_seg), "mm_desc"); - if (remote_seg == NULL) { - ucs_fatal("Failed to allocated memory for a remote segment identifier. %m"); - } - - status = uct_mm_md_mapper_ops(iface->super.md)->attach(elem->desc_mmid, - elem->desc_mpool_size, - elem->desc_chunk_base_addr, - &remote_seg->address, - &remote_seg->cookie, - iface->path); - if (status != UCS_OK) { - ucs_fatal("Failed to attach to remote mmid:%zu. 
%s ", - elem->desc_mmid, ucs_status_string(status)); - } - - remote_seg->mmid = elem->desc_mmid; - remote_seg->length = elem->desc_mpool_size; - - /* put the base address into the ep's hash table */ - sglib_hashed_uct_mm_remote_seg_t_add(ep->remote_segments_hash, remote_seg); - } - - return remote_seg->address; - -} static inline ucs_status_t uct_mm_ep_get_remote_elem(uct_mm_ep_t *ep, uint64_t head, uct_mm_fifo_element_t **elem) @@ -188,7 +201,7 @@ static inline ucs_status_t uct_mm_ep_get_remote_elem(uct_mm_ep_t *ep, uint64_t h uint64_t returned_val; elem_index = ep->fifo_ctl->head & iface->fifo_mask; - *elem = UCT_MM_IFACE_GET_FIFO_ELEM(iface, ep->fifo, elem_index); + *elem = UCT_MM_IFACE_GET_FIFO_ELEM(iface, ep->fifo_elems, elem_index); /* try to get ownership of the head element */ returned_val = ucs_atomic_cswap64(ucs_unaligned_ptr(&ep->fifo_ctl->head), head, head+1); @@ -213,14 +226,16 @@ static inline void uct_mm_ep_update_cached_tail(uct_mm_ep_t *ep) * is_short = 0 - perform AM bcopy sending */ static UCS_F_ALWAYS_INLINE ssize_t -uct_mm_ep_am_common_send(unsigned is_short, uct_mm_ep_t *ep, uct_mm_iface_t *iface, - uint8_t am_id, size_t length, uint64_t header, - const void *payload, uct_pack_callback_t pack_cb, void *arg, +uct_mm_ep_am_common_send(uct_mm_send_op_t send_op, uct_mm_ep_t *ep, + uct_mm_iface_t *iface, uint8_t am_id, size_t length, + uint64_t header, const void *payload, + uct_pack_callback_t pack_cb, void *arg, unsigned flags) { uct_mm_fifo_element_t *elem; ucs_status_t status; void *base_address; + uint8_t elem_flags; uint64_t head; UCT_CHECK_AM_ID(am_id); @@ -251,32 +266,38 @@ uct_mm_ep_am_common_send(unsigned is_short, uct_mm_ep_t *ep, uct_mm_iface_t *ifa goto retry; } - if (is_short) { - /* AM_SHORT */ + switch (send_op) { + case UCT_MM_SEND_AM_SHORT: /* write to the remote FIFO */ - *(uint64_t*) (elem + 1) = header; - memcpy((void*) (elem + 1) + sizeof(header), payload, length); + uct_am_short_fill_data(elem + 1, header, payload, length); - 
elem->flags |= UCT_MM_FIFO_ELEM_FLAG_INLINE; + elem_flags = UCT_MM_FIFO_ELEM_FLAG_INLINE; elem->length = length + sizeof(header); - uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, am_id, + uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_SEND, am_id, elem + 1, length + sizeof(header), "TX: AM_SHORT"); UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, sizeof(header) + length); - } else { - /* AM_BCOPY */ + break; + case UCT_MM_SEND_AM_BCOPY: /* write to the remote descriptor */ /* get the base_address: local ptr to remote memory chunk after attaching to it */ - base_address = uct_mm_ep_attach_remote_seg(ep, iface, elem); - length = pack_cb(base_address + elem->desc_offset, arg); + status = uct_mm_ep_get_remote_seg(ep, elem->desc.seg_id, + elem->desc.seg_size, &base_address); + if (ucs_unlikely(status != UCS_OK)) { + return status; + } - elem->flags &= ~UCT_MM_FIFO_ELEM_FLAG_INLINE; + length = pack_cb(UCS_PTR_BYTE_OFFSET(base_address, + elem->desc.offset), + arg); + elem_flags = 0; elem->length = length; - uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, am_id, - base_address + elem->desc_offset, length, "TX: AM_BCOPY"); - + uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_SEND, am_id, + UCS_PTR_BYTE_OFFSET(base_address, elem->desc.offset), + length, "TX: AM_BCOPY"); UCT_TL_EP_STAT_OP(&ep->super, AM, BCOPY, length); + break; } elem->am_id = am_id; @@ -285,22 +306,24 @@ uct_mm_ep_am_common_send(unsigned is_short, uct_mm_ep_t *ep, uct_mm_iface_t *ifa * 'writing is complete' flag which the reader checks */ ucs_memory_cpu_store_fence(); - /* change the owner bit to indicate that the writing is complete. + /* set the owner bit to indicate that the writing is complete. 
* the owner bit flips after every FIFO wraparound */ if (head & iface->config.fifo_size) { - elem->flags |= UCT_MM_FIFO_ELEM_FLAG_OWNER; - } else { - elem->flags &= ~UCT_MM_FIFO_ELEM_FLAG_OWNER; + elem_flags |= UCT_MM_FIFO_ELEM_FLAG_OWNER; } + elem->flags = elem_flags; if (ucs_unlikely(flags & UCT_SEND_FLAG_SIGNALED)) { uct_mm_ep_signal_remote(ep); } - if (is_short) { + switch (send_op) { + case UCT_MM_SEND_AM_SHORT: return UCS_OK; - } else { + case UCT_MM_SEND_AM_BCOPY: return length; + default: + return UCS_ERR_INVALID_PARAM; } } @@ -314,8 +337,9 @@ ucs_status_t uct_mm_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t header, iface->config.fifo_elem_size - sizeof(uct_mm_fifo_element_t), "am_short"); - return uct_mm_ep_am_common_send(UCT_MM_AM_SHORT, ep, iface, id, length, - header, payload, NULL, NULL, 0); + return (ucs_status_t)uct_mm_ep_am_common_send(UCT_MM_SEND_AM_SHORT, ep, + iface, id, length, header, + payload, NULL, NULL, 0); } ssize_t uct_mm_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_cb, @@ -324,8 +348,8 @@ ssize_t uct_mm_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_ uct_mm_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_mm_iface_t); uct_mm_ep_t *ep = ucs_derived_of(tl_ep, uct_mm_ep_t); - return uct_mm_ep_am_common_send(UCT_MM_AM_BCOPY, ep, iface, id, 0, 0, NULL, - pack_cb, arg, flags); + return uct_mm_ep_am_common_send(UCT_MM_SEND_AM_BCOPY, ep, iface, id, 0, 0, + NULL, pack_cb, arg, flags); } static inline int uct_mm_ep_has_tx_resources(uct_mm_ep_t *ep) @@ -358,12 +382,14 @@ ucs_status_t uct_mm_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *n, } ucs_arbiter_cb_result_t uct_mm_ep_process_pending(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, priv); + uct_mm_ep_t *ep = ucs_container_of(group, uct_mm_ep_t, arb_group); + unsigned *count = (unsigned*)arg; ucs_status_t status; - uct_mm_ep_t *ep = 
ucs_container_of(ucs_arbiter_elem_group(elem), uct_mm_ep_t, arb_group); /* update the local tail with its actual value from the remote peer * making sure that the pending sends would use the real tail value */ @@ -373,14 +399,17 @@ ucs_arbiter_cb_result_t uct_mm_ep_process_pending(ucs_arbiter_t *arbiter, return UCS_ARBITER_CB_RESULT_RESCHED_GROUP; } + ucs_trace_data("progressing pending request %p", req); status = req->func(req); - ucs_trace_data("progress pending request %p returned %s", req, + ucs_trace_data("status returned from progress pending: %s", ucs_status_string(status)); if (status == UCS_OK) { + (*count)++; /* sent successfully. remove from the arbiter */ return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; } else if (status == UCS_INPROGRESS) { + (*count)++; /* sent but not completed, keep in the arbiter */ return UCS_ARBITER_CB_RESULT_NEXT_GROUP; } else { @@ -391,14 +420,17 @@ ucs_arbiter_cb_result_t uct_mm_ep_process_pending(ucs_arbiter_t *arbiter, } static ucs_arbiter_cb_result_t uct_mm_ep_abriter_purge_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { - uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, priv); + uct_mm_ep_t *ep = ucs_container_of(group, uct_mm_ep_t, + arb_group); + uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, + priv); uct_purge_cb_args_t *cb_args = arg; uct_pending_purge_callback_t cb = cb_args->cb; - uct_mm_ep_t *ep = ucs_container_of(ucs_arbiter_elem_group(elem), - uct_mm_ep_t, arb_group); + if (cb != NULL) { cb(req, cb_args->arg); } else { diff --git a/src/uct/sm/mm/base/mm_ep.h b/src/uct/sm/mm/base/mm_ep.h index 5ec729645ad..7076ed7ed80 100644 --- a/src/uct/sm/mm/base/mm_ep.h +++ b/src/uct/sm/mm/base/mm_ep.h @@ -10,35 +10,41 @@ #include "mm_iface.h" -#include -#include +#include -struct uct_mm_ep { - uct_base_ep_t super; +KHASH_INIT(uct_mm_remote_seg, uintptr_t, uct_mm_remote_seg_t, 1, + kh_int64_hash_func, kh_int64_hash_equal) + + +/** + * MM transport 
endpoint + */ +typedef struct uct_mm_ep { + uct_base_ep_t super; /* Remote peer */ - uct_mm_fifo_ctl_t *fifo_ctl; /* pointer to the destination's ctl struct in the receive fifo */ - void *fifo; /* fifo elements (destination's receive fifo) */ + uct_mm_fifo_ctl_t *fifo_ctl; /* pointer to the destination's ctl struct in the receive fifo */ + void *fifo_elems; /* fifo elements (destination's receive fifo) */ - uint64_t cached_tail; /* the sender's own copy of the remote FIFO's tail. - it is not always updated with the actual remote tail value */ + uint64_t cached_tail; /* the sender's own copy of the remote FIFO's tail. + it is not always updated with the actual remote tail value */ /* mapped remote memory chunks to which remote descriptors belong to. * (after attaching to them) */ - uct_mm_remote_seg_t *remote_segments_hash[UCT_MM_BASE_ADDRESS_HASH_SIZE]; + khash_t(uct_mm_remote_seg) remote_segs; + + void *remote_iface_addr; /* remote md-specific address, can be NULL */ - ucs_arbiter_group_t arb_group; /* the group that holds this ep's pending operations */ + ucs_arbiter_group_t arb_group; /* the group that holds this ep's pending operations */ /* Used for signaling remote side wakeup */ struct { - struct sockaddr_un sockaddr; /* address of signaling socket */ - socklen_t addrlen; /* address length of signaling socket */ + struct sockaddr_un sockaddr; /* address of signaling socket */ + socklen_t addrlen; /* address length of signaling socket */ } signal; +} uct_mm_ep_t; - /* Remote peer */ - uct_mm_remote_seg_t mapped_desc; /* pointer to the descriptor of the destination's shared_mem (FIFO) */ -}; UCS_CLASS_DECLARE_NEW_FUNC(uct_mm_ep_t, uct_ep_t,const uct_ep_params_t *); UCS_CLASS_DECLARE_DELETE_FUNC(uct_mm_ep_t, uct_ep_t); @@ -58,20 +64,8 @@ void uct_mm_ep_pending_purge(uct_ep_h ep, uct_pending_purge_callback_t cb, void *arg); ucs_arbiter_cb_result_t uct_mm_ep_process_pending(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void 
*arg); -static inline uint64_t uct_mm_remote_seg_hash(uct_mm_remote_seg_t *seg) -{ - return seg->mmid % UCT_MM_BASE_ADDRESS_HASH_SIZE; -} - -static inline int64_t uct_mm_remote_seg_compare(uct_mm_remote_seg_t *seg1, uct_mm_remote_seg_t *seg2) -{ - return seg1->mmid - seg2->mmid; -} - -SGLIB_DEFINE_LIST_PROTOTYPES(uct_mm_remote_seg_t, uct_mm_remote_seg_compare, next) -SGLIB_DEFINE_HASHED_CONTAINER_PROTOTYPES(uct_mm_remote_seg_t, UCT_MM_BASE_ADDRESS_HASH_SIZE, uct_mm_remote_seg_hash) - #endif diff --git a/src/uct/sm/mm/base/mm_iface.c b/src/uct/sm/mm/base/mm_iface.c index 5f96dcb2d05..ba6a4dbc2a0 100644 --- a/src/uct/sm/mm/base/mm_iface.c +++ b/src/uct/sm/mm/base/mm_iface.c @@ -4,12 +4,15 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "mm_iface.h" #include "mm_ep.h" #include #include -#include #include #include #include @@ -21,18 +24,22 @@ #define UCT_MM_IFACE_MAX_SIG_EVENTS 32 -static ucs_config_field_t uct_mm_iface_config_table[] = { - {"", "ALLOC=md", NULL, +ucs_config_field_t uct_mm_iface_config_table[] = { + {"SM_", "ALLOC=md,mmap,heap", NULL, ucs_offsetof(uct_mm_iface_config_t, super), - UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, + UCS_CONFIG_TYPE_TABLE(uct_sm_iface_config_table)}, {"FIFO_SIZE", "64", "Size of the receive FIFO in the memory-map UCTs.", ucs_offsetof(uct_mm_iface_config_t, fifo_size), UCS_CONFIG_TYPE_UINT}, + {"SEG_SIZE", "8256", + "Size of send/receive buffers for copy-out sends.", + ucs_offsetof(uct_mm_iface_config_t, seg_size), UCS_CONFIG_TYPE_MEMUNITS}, + {"FIFO_RELEASE_FACTOR", "0.5", "Frequency of resource releasing on the receiver's side in the MM UCT.\n" - "This value refers to the percentage of the FIFO size. (must be >= 0 and < 1)", + "This value refers to the percentage of the FIFO size. 
(must be >= 0 and < 1).", ucs_offsetof(uct_mm_iface_config_t, release_fifo_factor), UCS_CONFIG_TYPE_DOUBLE}, UCT_IFACE_MPOOL_CONFIG_FIELDS("RX_", -1, 512, "receive", @@ -43,28 +50,56 @@ static ucs_config_field_t uct_mm_iface_config_table[] = { "Possible values are:\n" " y - Allocate memory using huge pages only.\n" " n - Allocate memory using regular pages only.\n" - " try - Try to allocate memory using huge pages and if it fails, allocate regular pages.\n", + " try - Try to allocate memory using huge pages and if it fails, allocate regular pages.", ucs_offsetof(uct_mm_iface_config_t, hugetlb_mode), UCS_CONFIG_TYPE_TERNARY}, + {"FIFO_ELEM_SIZE", "128", + "Size of the FIFO element size (data + header) in the MM UCTs.", + ucs_offsetof(uct_mm_iface_config_t, fifo_elem_size), UCS_CONFIG_TYPE_UINT}, + + {"FIFO_MAX_POLL", UCS_PP_MAKE_STRING(UCT_MM_IFACE_FIFO_MAX_POLL), + "Maximal number of receive completions to pick during RX poll", + ucs_offsetof(uct_mm_iface_config_t, fifo_max_poll), UCS_CONFIG_TYPE_ULUNITS}, + {NULL} }; static ucs_status_t uct_mm_iface_get_address(uct_iface_t *tl_iface, uct_iface_addr_t *addr) { - uct_mm_iface_t *iface = ucs_derived_of(tl_iface, uct_mm_iface_t); + uct_mm_iface_t *iface = ucs_derived_of(tl_iface, uct_mm_iface_t); + uct_mm_md_t *md = ucs_derived_of(iface->super.super.md, + uct_mm_md_t); uct_mm_iface_addr_t *iface_addr = (void*)addr; + uct_mm_seg_t *seg = iface->recv_fifo_mem.memh; - iface_addr->id = iface->fifo_mm_id; - iface_addr->vaddr = (uintptr_t)iface->shared_mem; - return UCS_OK; + iface_addr->fifo_seg_id = seg->seg_id; + return uct_mm_md_mapper_ops(md)->iface_addr_pack(md, iface_addr + 1); +} + +static int +uct_mm_iface_is_reachable(const uct_iface_h tl_iface, + const uct_device_addr_t *dev_addr, + const uct_iface_addr_t *tl_iface_addr) +{ + uct_mm_iface_t *iface = ucs_derived_of(tl_iface, uct_mm_iface_t); + uct_mm_md_t *md = ucs_derived_of(iface->super.super.md, + uct_mm_md_t); + uct_mm_iface_addr_t *iface_addr = 
(void*)tl_iface_addr; + + if (!uct_sm_iface_is_reachable(tl_iface, dev_addr, tl_iface_addr)) { + return 0; + } + + return uct_mm_md_mapper_ops(md)->is_reachable(md, iface_addr->fifo_seg_id, + iface_addr + 1); } void uct_mm_iface_release_desc(uct_recv_desc_t *self, void *desc) { void *mm_desc; - mm_desc = desc - sizeof(uct_mm_recv_desc_t); + mm_desc = UCS_PTR_BYTE_OFFSET(desc, -sizeof(uct_mm_recv_desc_t)); ucs_mpool_put(mm_desc); } @@ -84,7 +119,9 @@ static ucs_status_t uct_mm_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { uct_mm_iface_t *iface = ucs_derived_of(tl_iface, uct_mm_iface_t); - memset(iface_attr, 0, sizeof(uct_iface_attr_t)); + uct_mm_md_t *md = ucs_derived_of(iface->super.super.md, uct_mm_md_t); + + uct_base_iface_query(&iface->super.super, iface_attr); /* default values for all shared memory transports */ iface_attr->cap.put.max_short = UINT_MAX; @@ -111,8 +148,9 @@ static ucs_status_t uct_mm_iface_query(uct_iface_h tl_iface, iface_attr->cap.am.align_mtu = iface_attr->cap.am.opt_zcopy_align; iface_attr->cap.am.max_iov = 1; - iface_attr->iface_addr_len = sizeof(uct_mm_iface_addr_t); - iface_attr->device_addr_len = UCT_SM_IFACE_DEVICE_ADDR_LEN; + iface_attr->iface_addr_len = sizeof(uct_mm_iface_addr_t) + + md->iface_addr_len; + iface_attr->device_addr_len = uct_sm_iface_get_device_addr_len(); iface_attr->ep_addr_len = 0; iface_attr->max_conn_priv = 0; iface_attr->cap.flags = UCT_IFACE_FLAG_PUT_SHORT | @@ -123,9 +161,10 @@ static ucs_status_t uct_mm_iface_query(uct_iface_h tl_iface, UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_PENDING | UCT_IFACE_FLAG_CB_SYNC | - UCT_IFACE_FLAG_EVENT_SEND_COMP | - UCT_IFACE_FLAG_EVENT_RECV_SIG | UCT_IFACE_FLAG_CONNECT_TO_IFACE; + iface_attr->cap.event_flags = UCT_IFACE_FLAG_EVENT_SEND_COMP | + UCT_IFACE_FLAG_EVENT_RECV_SIG | + UCT_IFACE_FLAG_EVENT_FD; iface_attr->cap.atomic32.op_flags = iface_attr->cap.atomic64.op_flags = UCS_BIT(UCT_ATOMIC_OP_ADD) | @@ -140,15 +179,17 @@ static ucs_status_t 
uct_mm_iface_query(uct_iface_h tl_iface, UCS_BIT(UCT_ATOMIC_OP_SWAP) | UCS_BIT(UCT_ATOMIC_OP_CSWAP); - iface_attr->latency.overhead = 80e-9; /* 80 ns */ - iface_attr->latency.growth = 0; - iface_attr->bandwidth = 12179 * 1024.0 * 1024.0; + iface_attr->latency = ucs_linear_func_make(80e-9, 0); /* 80 ns */ + iface_attr->bandwidth.dedicated = iface->super.config.bandwidth; + iface_attr->bandwidth.shared = 0; iface_attr->overhead = 10e-9; /* 10 ns */ - iface_attr->priority = uct_mm_md_mapper_ops(iface->super.md)->get_priority(); + iface_attr->priority = 0; + return UCS_OK; } -static inline void uct_mm_progress_fifo_tail(uct_mm_iface_t *iface) +static UCS_F_ALWAYS_INLINE void +uct_mm_progress_fifo_tail(uct_mm_iface_t *iface) { /* don't progress the tail every time - release in batches. improves performance */ if (iface->read_index & iface->fifo_release_factor_mask) { @@ -158,114 +199,150 @@ static inline void uct_mm_progress_fifo_tail(uct_mm_iface_t *iface) iface->recv_fifo_ctl->tail = iface->read_index; } -ucs_status_t uct_mm_assign_desc_to_fifo_elem(uct_mm_iface_t *iface, - uct_mm_fifo_element_t *fifo_elem_p, - unsigned need_new_desc) +static UCS_F_ALWAYS_INLINE ucs_status_t +uct_mm_assign_desc_to_fifo_elem(uct_mm_iface_t *iface, + uct_mm_fifo_element_t *elem, + unsigned need_new_desc) { uct_mm_recv_desc_t *desc; if (!need_new_desc) { desc = iface->last_recv_desc; } else { - UCT_TL_IFACE_GET_RX_DESC(&iface->super, &iface->recv_desc_mp, desc, + UCT_TL_IFACE_GET_RX_DESC(&iface->super.super, &iface->recv_desc_mp, desc, return UCS_ERR_NO_RESOURCE); } - fifo_elem_p->desc_mmid = desc->key; - fifo_elem_p->desc_offset = iface->rx_headroom + - (ptrdiff_t) ((void*) (desc + 1) - desc->base_address); - fifo_elem_p->desc_chunk_base_addr = desc->base_address; - fifo_elem_p->desc_mpool_size = desc->mpool_length; - + elem->desc = desc->info; + elem->desc_data = UCS_PTR_BYTE_OFFSET(desc + 1, iface->rx_headroom); return UCS_OK; } -static inline ucs_status_t 
uct_mm_iface_process_recv(uct_mm_iface_t *iface, - uct_mm_fifo_element_t* elem) +static UCS_F_ALWAYS_INLINE void +uct_mm_iface_process_recv(uct_mm_iface_t *iface, + uct_mm_fifo_element_t* elem) { ucs_status_t status; void *data; if (ucs_likely(elem->flags & UCT_MM_FIFO_ELEM_FLAG_INLINE)) { /* read short (inline) messages from the FIFO elements */ - uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_RECV, elem->am_id, - elem + 1, elem->length, "RX: AM_SHORT"); - status = uct_mm_iface_invoke_am(iface, elem->am_id, elem + 1, - elem->length, 0); - } else { - /* read bcopy messages from the receive descriptors */ - VALGRIND_MAKE_MEM_DEFINED(elem->desc_chunk_base_addr + elem->desc_offset, - elem->length); + uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_RECV, + elem->am_id, elem + 1, elem->length, "RX: AM_SHORT"); + uct_mm_iface_invoke_am(iface, elem->am_id, elem + 1, elem->length, 0); + return; + } + + /* check the memory pool to make sure that there is a new descriptor available */ + if (ucs_unlikely(iface->last_recv_desc == NULL)) { + UCT_TL_IFACE_GET_RX_DESC(&iface->super.super, &iface->recv_desc_mp, + iface->last_recv_desc, return); + } - data = elem->desc_chunk_base_addr + elem->desc_offset; + /* read bcopy messages from the receive descriptors */ + data = elem->desc_data; + VALGRIND_MAKE_MEM_DEFINED(data, elem->length); - uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_RECV, elem->am_id, - data, elem->length, "RX: AM_BCOPY"); + uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_RECV, + elem->am_id, data, elem->length, "RX: AM_BCOPY"); - status = uct_mm_iface_invoke_am(iface, elem->am_id, data, elem->length, - UCT_CB_PARAM_FLAG_DESC); - if (status != UCS_OK) { - /* assign a new receive descriptor to this FIFO element.*/ - uct_mm_assign_desc_to_fifo_elem(iface, elem, 0); - } + status = uct_mm_iface_invoke_am(iface, elem->am_id, data, elem->length, + UCT_CB_PARAM_FLAG_DESC); + if (status != UCS_OK) { + /* assign a new receive descriptor to this 
FIFO element.*/ + uct_mm_assign_desc_to_fifo_elem(iface, elem, 0); + /* the last_recv_desc is in use. get a new descriptor for it */ + UCT_TL_IFACE_GET_RX_DESC(&iface->super.super, &iface->recv_desc_mp, + iface->last_recv_desc, ucs_debug("recv mpool is empty")); } - return status; } -static inline unsigned uct_mm_iface_poll_fifo(uct_mm_iface_t *iface) +static UCS_F_ALWAYS_INLINE int +uct_mm_iface_fifo_has_new_data(uct_mm_iface_t *iface) { - uint64_t read_index_loc, read_index; - uct_mm_fifo_element_t* read_index_elem; - ucs_status_t status; + /* check the read_index to see if there is a new item to read + * (checking the owner bit) */ + return (((iface->read_index >> iface->fifo_shift) & 1) == + (iface->read_index_elem->flags & 1)); +} - /* check the memory pool to make sure that there is a new descriptor available */ - if (ucs_unlikely(iface->last_recv_desc == NULL)) { - UCT_TL_IFACE_GET_RX_DESC(&iface->super, &iface->recv_desc_mp, - iface->last_recv_desc, return 0); +static UCS_F_ALWAYS_INLINE unsigned +uct_mm_iface_poll_fifo(uct_mm_iface_t *iface) +{ + if (!uct_mm_iface_fifo_has_new_data(iface)) { + return 0; } - read_index = iface->read_index; - read_index_loc = (read_index & iface->fifo_mask); - /* the fifo_element which the read_index points to */ - read_index_elem = UCT_MM_IFACE_GET_FIFO_ELEM(iface, iface->recv_fifo_elements ,read_index_loc); + /* read from read_index_elem */ + ucs_memory_cpu_load_fence(); + ucs_assert(iface->read_index <= iface->recv_fifo_ctl->head); - /* check the read_index to see if there is a new item to read (checking the owner bit) */ - if (((read_index >> iface->fifo_shift) & 1) == ((read_index_elem->flags) & 1)) { + uct_mm_iface_process_recv(iface, iface->read_index_elem); - /* read from read_index_elem */ - ucs_memory_cpu_load_fence(); - ucs_assert(iface->read_index <= iface->recv_fifo_ctl->head); + /* raise the read_index */ + iface->read_index++; - status = uct_mm_iface_process_recv(iface, read_index_elem); - if (status != 
UCS_OK) { - /* the last_recv_desc is in use. get a new descriptor for it */ - UCT_TL_IFACE_GET_RX_DESC(&iface->super, &iface->recv_desc_mp, - iface->last_recv_desc, ucs_debug("recv mpool is empty")); - } + /* the next fifo_element which the read_index points to */ + iface->read_index_elem = + UCT_MM_IFACE_GET_FIFO_ELEM(iface, iface->recv_fifo_elems, + (iface->read_index & iface->fifo_mask)); - /* raise the read_index. */ - iface->read_index++; + uct_mm_progress_fifo_tail(iface); - uct_mm_progress_fifo_tail(iface); + return 1; +} - return 1; +static UCS_F_ALWAYS_INLINE void +uct_mm_iface_fifo_window_adjust(uct_mm_iface_t *iface, + unsigned fifo_poll_count) +{ + if (fifo_poll_count < iface->fifo_poll_count) { + iface->fifo_poll_count = ucs_max(iface->fifo_poll_count / + UCT_MM_IFACE_FIFO_MD_FACTOR, + UCT_MM_IFACE_FIFO_MIN_POLL); + iface->fifo_prev_wnd_cons = 0; + return; + } + + ucs_assert(fifo_poll_count == iface->fifo_poll_count); + + if (iface->fifo_prev_wnd_cons) { + /* Increase FIFO window size if it was fully consumed + * during the previous iface progress call in order + * to prevent the situation when the window will be + * adjusted to [MIN, MIN + 1, MIN, MIN + 1, ...] 
that + * is harmful to latency */ + iface->fifo_poll_count = ucs_min(iface->fifo_poll_count + + UCT_MM_IFACE_FIFO_AI_VALUE, + iface->config.fifo_max_poll); } else { - return 0; + iface->fifo_prev_wnd_cons = 1; } } -unsigned uct_mm_iface_progress(void *arg) +static unsigned uct_mm_iface_progress(uct_iface_h tl_iface) { - uct_mm_iface_t *iface = arg; + uct_mm_iface_t *iface = ucs_derived_of(tl_iface, uct_mm_iface_t); + unsigned total_count = 0; unsigned count; + ucs_assert(iface->fifo_poll_count >= UCT_MM_IFACE_FIFO_MIN_POLL); + /* progress receive */ - count = uct_mm_iface_poll_fifo(iface); + do { + count = uct_mm_iface_poll_fifo(iface); + ucs_assert(count < 2); + total_count += count; + ucs_assert(total_count < UINT_MAX); + } while ((count != 0) && (total_count < iface->fifo_poll_count)); + + uct_mm_iface_fifo_window_adjust(iface, total_count); /* progress the pending sends (if there are any) */ - ucs_arbiter_dispatch(&iface->arbiter, 1, uct_mm_ep_process_pending, NULL); + ucs_arbiter_dispatch(&iface->arbiter, 1, uct_mm_ep_process_pending, + &total_count); - return count; + return total_count; } static ucs_status_t uct_mm_iface_event_fd_get(uct_iface_h tl_iface, int *fd_p) @@ -323,74 +400,76 @@ static uct_iface_ops_t uct_mm_iface_ops = { .iface_fence = uct_sm_iface_fence, .iface_progress_enable = uct_base_iface_progress_enable, .iface_progress_disable = uct_base_iface_progress_disable, - .iface_progress = (void*)uct_mm_iface_progress, + .iface_progress = uct_mm_iface_progress, .iface_event_fd_get = uct_mm_iface_event_fd_get, .iface_event_arm = uct_mm_iface_event_fd_arm, .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_mm_iface_t), .iface_query = uct_mm_iface_query, .iface_get_device_address = uct_sm_iface_get_device_address, .iface_get_address = uct_mm_iface_get_address, - .iface_is_reachable = uct_sm_iface_is_reachable + .iface_is_reachable = uct_mm_iface_is_reachable }; -void uct_mm_iface_recv_desc_init(uct_iface_h tl_iface, void *obj, uct_mem_h memh) +static void 
uct_mm_iface_recv_desc_init(uct_iface_h tl_iface, void *obj, + uct_mem_h memh) { - uct_mm_recv_desc_t *desc = obj; - uct_mm_seg_t *seg = memh; - - /* every desc in the memory pool, holds the mm_id(key) and address of the - * mem pool it belongs to */ - desc->key = seg->mmid; - desc->base_address = seg->address; - desc->mpool_length = seg->length; + uct_mm_iface_t *iface = ucs_derived_of(tl_iface, uct_mm_iface_t); + uct_mm_recv_desc_t *desc = obj; + uct_mm_seg_t *seg = memh; + size_t offset; + + if (seg->length > UINT_MAX) { + ucs_error("mm: shared memory segment length cannot exceed %u", UINT_MAX); + desc->info.seg_id = UINT64_MAX; + desc->info.seg_size = 0; + desc->info.offset = 0; + return; + } + + offset = UCS_PTR_BYTE_DIFF(seg->address, desc + 1) + iface->rx_headroom; + ucs_assert(offset <= UINT_MAX); + + desc->info.seg_id = seg->seg_id; + desc->info.seg_size = seg->length; + desc->info.offset = offset; } static void uct_mm_iface_free_rx_descs(uct_mm_iface_t *iface, unsigned num_elems) { - uct_mm_fifo_element_t* fifo_elem_p; + uct_mm_fifo_element_t *elem; uct_mm_recv_desc_t *desc; unsigned i; for (i = 0; i < num_elems; i++) { - fifo_elem_p = UCT_MM_IFACE_GET_FIFO_ELEM(iface, iface->recv_fifo_elements, i); - desc = UCT_MM_IFACE_GET_DESC_START(iface, fifo_elem_p); + elem = UCT_MM_IFACE_GET_FIFO_ELEM(iface, iface->recv_fifo_elems, i); + desc = (uct_mm_recv_desc_t*)UCS_PTR_BYTE_OFFSET(elem->desc_data, + -iface->rx_headroom) - 1; ucs_mpool_put(desc); } } -ucs_status_t uct_mm_allocate_fifo_mem(uct_mm_iface_t *iface, - uct_mm_iface_config_t *config, uct_md_h md) +void uct_mm_iface_set_fifo_ptrs(void *fifo_mem, uct_mm_fifo_ctl_t **fifo_ctl_p, + void **fifo_elems_p) { - uct_mm_fifo_ctl_t *ctl; - size_t size_to_alloc; - ucs_status_t status; - - /* allocate the receive FIFO */ - size_to_alloc = UCT_MM_GET_FIFO_SIZE(iface); - - status = uct_mm_md_mapper_ops(md)->alloc(md, &size_to_alloc, config->hugetlb_mode, - 0, "mm fifo", &iface->shared_mem, - &iface->fifo_mm_id, 
&iface->path); - if (status != UCS_OK) { - ucs_error("Failed to allocate memory for the receive FIFO in mm. size: %zu : %m", - size_to_alloc); - return status; - } + uct_mm_fifo_ctl_t *fifo_ctl; - ctl = uct_mm_set_fifo_ctl(iface->shared_mem); - uct_mm_set_fifo_elems_ptr(iface->shared_mem, &iface->recv_fifo_elements); + /* initiate the the uct_mm_fifo_ctl struct, holding the head and the tail */ + fifo_ctl = (uct_mm_fifo_ctl_t*)ucs_align_up_pow2 + ((uintptr_t)fifo_mem, UCS_SYS_CACHE_LINE_SIZE); /* Make sure head and tail are cache-aligned, and not on same cacheline, to * avoid false-sharing. */ - ucs_assert_always((((uintptr_t)&ctl->head) % UCS_SYS_CACHE_LINE_SIZE) == 0); - ucs_assert_always((((uintptr_t)&ctl->tail) % UCS_SYS_CACHE_LINE_SIZE) == 0); - ucs_assert_always(((uintptr_t)&ctl->tail - (uintptr_t)&ctl->head) >= UCS_SYS_CACHE_LINE_SIZE); - - iface->recv_fifo_ctl = ctl; - - ucs_assert(iface->shared_mem != NULL); - return UCS_OK; + ucs_assert_always( + (((uintptr_t)&fifo_ctl->head) % UCS_SYS_CACHE_LINE_SIZE) == 0); + ucs_assert_always( + (((uintptr_t)&fifo_ctl->tail) % UCS_SYS_CACHE_LINE_SIZE) == 0); + ucs_assert_always( + ((uintptr_t)&fifo_ctl->tail - (uintptr_t)&fifo_ctl->head) >= UCS_SYS_CACHE_LINE_SIZE); + + /* initiate the pointer to the beginning of the first FIFO element */ + *fifo_ctl_p = fifo_ctl; + *fifo_elems_p = UCS_PTR_BYTE_OFFSET(fifo_ctl, UCT_MM_FIFO_CTL_SIZE); } static ucs_status_t uct_mm_iface_create_signal_fd(uct_mm_iface_t *iface) @@ -447,30 +526,27 @@ static ucs_status_t uct_mm_iface_create_signal_fd(uct_mm_iface_t *iface) return status; } +static void uct_mm_iface_log_created(uct_mm_iface_t *iface) +{ + uct_mm_seg_t UCS_V_UNUSED *seg = iface->recv_fifo_mem.memh; + + ucs_debug("created mm iface %p FIFO id 0x%lx va %p size %zu (%u x %u elems)", + iface, seg->seg_id, seg->address, seg->length, + iface->config.fifo_elem_size, iface->config.fifo_size); +} + static UCS_CLASS_INIT_FUNC(uct_mm_iface_t, uct_md_h md, uct_worker_h worker, const 
uct_iface_params_t *params, const uct_iface_config_t *tl_config) { - uct_mm_iface_config_t *mm_config = ucs_derived_of(tl_config, uct_mm_iface_config_t); + uct_mm_iface_config_t *mm_config = + ucs_derived_of(tl_config, uct_mm_iface_config_t); uct_mm_fifo_element_t* fifo_elem_p; ucs_status_t status; unsigned i; - UCT_CHECK_PARAM(params->field_mask & UCT_IFACE_PARAM_FIELD_OPEN_MODE, - "UCT_IFACE_PARAM_FIELD_OPEN_MODE is not defined"); - if (!(params->open_mode & UCT_IFACE_OPEN_MODE_DEVICE)) { - ucs_error("only UCT_IFACE_OPEN_MODE_DEVICE is supported"); - return UCS_ERR_UNSUPPORTED; - } - - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_mm_iface_ops, md, worker, - params, tl_config - UCS_STATS_ARG((params->field_mask & - UCT_IFACE_PARAM_FIELD_STATS_ROOT) ? - params->stats_root : NULL) - UCS_STATS_ARG(UCT_MM_TL_NAME)); - - ucs_trace_func("Creating an MM iface=%p worker=%p", self, worker); + UCS_CLASS_CALL_SUPER_INIT(uct_sm_iface_t, &uct_mm_iface_ops, md, + worker, params, tl_config); if (ucs_derived_of(worker, uct_priv_worker_t)->thread_mode == UCS_THREAD_MODE_MULTI) { ucs_error("Shared memory transport does not support multi-threaded worker"); @@ -492,67 +568,80 @@ static UCS_CLASS_INIT_FUNC(uct_mm_iface_t, uct_md_h md, uct_worker_h worker, } /* check the value defining the size of the FIFO element */ - if (mm_config->super.max_short <= sizeof(uct_mm_fifo_element_t)) { - ucs_error("The UCT_MM_MAX_SHORT parameter must be larger than the FIFO " - "element header size. 
( > %ld bytes).", - sizeof(uct_mm_fifo_element_t)); + if (mm_config->fifo_elem_size <= sizeof(uct_mm_fifo_element_t)) { + ucs_error("The UCX_MM_FIFO_ELEM_SIZE parameter (%u) must be larger " + "than the FIFO element header size (%ld bytes).", + mm_config->fifo_elem_size, sizeof(uct_mm_fifo_element_t)); status = UCS_ERR_INVALID_PARAM; goto err; } self->config.fifo_size = mm_config->fifo_size; - self->config.fifo_elem_size = mm_config->super.max_short; - self->config.seg_size = mm_config->super.max_bcopy; + self->config.fifo_elem_size = mm_config->fifo_elem_size; + self->config.seg_size = mm_config->seg_size; + self->config.fifo_max_poll = ((mm_config->fifo_max_poll == UCS_ULUNITS_AUTO) ? + UCT_MM_IFACE_FIFO_MAX_POLL : + /* trim by the maximum unsigned integer value */ + ucs_min(mm_config->fifo_max_poll, UINT_MAX)); + self->fifo_prev_wnd_cons = 0; + self->fifo_poll_count = self->config.fifo_max_poll; + /* cppcheck-suppress internalAstError */ self->fifo_release_factor_mask = UCS_MASK(ucs_ilog2(ucs_max((int) (mm_config->fifo_size * mm_config->release_fifo_factor), 1))); - self->fifo_mask = mm_config->fifo_size - 1; + self->fifo_mask = self->config.fifo_size - 1; self->fifo_shift = ucs_count_trailing_zero_bits(mm_config->fifo_size); self->rx_headroom = (params->field_mask & UCT_IFACE_PARAM_FIELD_RX_HEADROOM) ? 
params->rx_headroom : 0; self->release_desc.cb = uct_mm_iface_release_desc; - /* create the receive FIFO */ - /* use specific allocator to allocate and attach memory and check the - * requested hugetlb allocation mode */ - status = uct_mm_allocate_fifo_mem(self, mm_config, md); + /* Allocate the receive FIFO */ + status = uct_iface_mem_alloc(&self->super.super.super, + UCT_MM_GET_FIFO_SIZE(self), + UCT_MD_MEM_ACCESS_ALL, "mm_recv_fifo", + &self->recv_fifo_mem); if (status != UCS_OK) { - goto err; + ucs_error("mm_iface failed to allocate receive FIFO"); + return status; } - self->recv_fifo_ctl->head = 0; - self->recv_fifo_ctl->tail = 0; - self->read_index = 0; + uct_mm_iface_set_fifo_ptrs(self->recv_fifo_mem.address, + &self->recv_fifo_ctl, &self->recv_fifo_elems); + self->recv_fifo_ctl->head = 0; + self->recv_fifo_ctl->tail = 0; + self->read_index = 0; + self->read_index_elem = UCT_MM_IFACE_GET_FIFO_ELEM(self, + self->recv_fifo_elems, + self->read_index); + /* create a unix file descriptor to receive event notifications */ status = uct_mm_iface_create_signal_fd(self); if (status != UCS_OK) { goto err_free_fifo; } /* create a memory pool for receive descriptors */ - status = uct_iface_mpool_init(&self->super, + status = uct_iface_mpool_init(&self->super.super, &self->recv_desc_mp, sizeof(uct_mm_recv_desc_t) + self->rx_headroom + self->config.seg_size, sizeof(uct_mm_recv_desc_t), UCS_SYS_CACHE_LINE_SIZE, &mm_config->mp, - 512, + mm_config->mp.bufs_grow, uct_mm_iface_recv_desc_init, "mm_recv_desc"); if (status != UCS_OK) { - ucs_error("Failed to create a receive descriptor memory pool for the MM transport"); + ucs_error("failed to create a receive descriptor memory pool for the MM transport"); goto err_close_signal_fd; } - ucs_mpool_grow(&self->recv_desc_mp, mm_config->fifo_size * 2); - /* set the first receive descriptor */ self->last_recv_desc = ucs_mpool_get(&self->recv_desc_mp); VALGRIND_MAKE_MEM_DEFINED(self->last_recv_desc, sizeof(*(self->last_recv_desc))); if 
(self->last_recv_desc == NULL) { - ucs_error("Failed to get the first receive descriptor"); + ucs_error("failed to get the first receive descriptor"); status = UCS_ERR_NO_RESOURCE; goto destroy_recv_mpool; } @@ -560,19 +649,19 @@ static UCS_CLASS_INIT_FUNC(uct_mm_iface_t, uct_md_h md, uct_worker_h worker, /* initiate the owner bit in all the FIFO elements and assign a receive descriptor * per every FIFO element */ for (i = 0; i < mm_config->fifo_size; i++) { - fifo_elem_p = UCT_MM_IFACE_GET_FIFO_ELEM(self, self->recv_fifo_elements, i); + fifo_elem_p = UCT_MM_IFACE_GET_FIFO_ELEM(self, self->recv_fifo_elems, i); fifo_elem_p->flags = UCT_MM_FIFO_ELEM_FLAG_OWNER; status = uct_mm_assign_desc_to_fifo_elem(self, fifo_elem_p, 1); if (status != UCS_OK) { - ucs_error("Failed to allocate a descriptor for MM"); + ucs_error("failed to allocate a descriptor for MM"); goto destroy_descs; } } ucs_arbiter_init(&self->arbiter); + uct_mm_iface_log_created(self); - ucs_debug("Created an MM iface. FIFO mm id: %zu", self->fifo_mm_id); return UCS_OK; destroy_descs: @@ -583,19 +672,15 @@ static UCS_CLASS_INIT_FUNC(uct_mm_iface_t, uct_md_h md, uct_worker_h worker, err_close_signal_fd: close(self->signal_fd); err_free_fifo: - uct_mm_md_mapper_ops(md)->free(self->shared_mem, self->fifo_mm_id, - UCT_MM_GET_FIFO_SIZE(self), self->path); + uct_iface_mem_free(&self->recv_fifo_mem); err: return status; } static UCS_CLASS_CLEANUP_FUNC(uct_mm_iface_t) { - ucs_status_t status; - size_t size_to_free; - - uct_base_iface_progress_disable(&self->super.super, - UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); + uct_base_iface_progress_disable(&self->super.super.super, + UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); /* return all the descriptors that are now 'assigned' to the FIFO, * to their mpool */ @@ -604,54 +689,12 @@ static UCS_CLASS_CLEANUP_FUNC(uct_mm_iface_t) ucs_mpool_put(self->last_recv_desc); ucs_mpool_cleanup(&self->recv_desc_mp, 1); close(self->signal_fd); - - size_to_free = UCT_MM_GET_FIFO_SIZE(self); - - 
/* release the memory allocated for the FIFO */ - status = uct_mm_md_mapper_ops(self->super.md)->free(self->shared_mem, - self->fifo_mm_id, - size_to_free, self->path); - if (status != UCS_OK) { - ucs_warn("Unable to release shared memory segment: %m"); - } - + uct_iface_mem_free(&self->recv_fifo_mem); ucs_arbiter_cleanup(&self->arbiter); } UCS_CLASS_DEFINE(uct_mm_iface_t, uct_base_iface_t); -static UCS_CLASS_DEFINE_NEW_FUNC(uct_mm_iface_t, uct_iface_t, uct_md_h, - uct_worker_h, const uct_iface_params_t *, - const uct_iface_config_t *); +UCS_CLASS_DEFINE_NEW_FUNC(uct_mm_iface_t, uct_iface_t, uct_md_h, uct_worker_h, + const uct_iface_params_t*, const uct_iface_config_t*); static UCS_CLASS_DEFINE_DELETE_FUNC(uct_mm_iface_t, uct_iface_t); - -static ucs_status_t uct_mm_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) -{ - uct_tl_resource_desc_t *resource; - - resource = ucs_calloc(1, sizeof(uct_tl_resource_desc_t), "resource desc"); - if (NULL == resource) { - ucs_error("Failed to allocate memory"); - return UCS_ERR_NO_MEMORY; - } - - ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", - UCT_MM_TL_NAME); - ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", - md->component->name); - resource->dev_type = UCT_DEVICE_TYPE_SHM; - - *num_resources_p = 1; - *resource_p = resource; - return UCS_OK; -} - -UCT_TL_COMPONENT_DEFINE(uct_mm_tl, - uct_mm_query_tl_resources, - uct_mm_iface_t, - UCT_MM_TL_NAME, - "MM_", - uct_mm_iface_config_table, - uct_mm_iface_config_t); diff --git a/src/uct/sm/mm/base/mm_iface.h b/src/uct/sm/mm/base/mm_iface.h index 90b2d4fdaf8..f686127fad0 100644 --- a/src/uct/sm/mm/base/mm_iface.h +++ b/src/uct/sm/mm/base/mm_iface.h @@ -8,10 +8,10 @@ #ifndef UCT_MM_IFACE_H #define UCT_MM_IFACE_H -#include "mm_def.h" #include "mm_md.h" #include +#include #include #include #include @@ -21,44 +21,147 @@ #include -#define UCT_MM_TL_NAME "mm" -#define UCT_MM_FIFO_CTL_SIZE_ALIGNED 
ucs_align_up(sizeof(uct_mm_fifo_ctl_t),UCS_SYS_CACHE_LINE_SIZE) +enum { + UCT_MM_FIFO_ELEM_FLAG_OWNER = UCS_BIT(0), /* new/old info */ + UCT_MM_FIFO_ELEM_FLAG_INLINE = UCS_BIT(1), /* if inline or not */ +}; + + +#define UCT_MM_FIFO_CTL_SIZE \ + ucs_align_up(sizeof(uct_mm_fifo_ctl_t), UCS_SYS_CACHE_LINE_SIZE) + + +#define UCT_MM_GET_FIFO_SIZE(_iface) \ + (UCT_MM_FIFO_CTL_SIZE + \ + ((_iface)->config.fifo_size * (_iface)->config.fifo_elem_size) + \ + (UCS_SYS_CACHE_LINE_SIZE - 1)) + + +#define UCT_MM_IFACE_GET_FIFO_ELEM(_iface, _fifo, _index) \ + ((uct_mm_fifo_element_t*) \ + UCS_PTR_BYTE_OFFSET(_fifo, (_index) * (_iface)->config.fifo_elem_size)) -#define UCT_MM_GET_FIFO_SIZE(iface) (UCS_SYS_CACHE_LINE_SIZE - 1 + \ - UCT_MM_FIFO_CTL_SIZE_ALIGNED + \ - ((iface)->config.fifo_size * \ - (iface)->config.fifo_elem_size)) +#define uct_mm_iface_mapper_call(_iface, _func, ...) \ + ({ \ + uct_mm_md_t *md = ucs_derived_of((_iface)->super.super.md, uct_mm_md_t); \ + uct_mm_md_mapper_call(md, _func, ## __VA_ARGS__); \ + }) +/* AIMD (additive increase/multiplicative decrease) algorithm adopted for FIFO + * polling mechanism to adjust FIFO polling window. + * - FIFO window is increased if the number of completed RX operations during + * the current iface progress call reaches FIFO window size and previous iface + * progress call was able to fully consume FIFO window (protection against + * impacting ping-pong pattern where handling of > 1 RX operation should not + * be expected). + * - FIFO window is decreased if the number of completed RX operations during + * the current iface progress call does not reach FIFO window size. + * See https://en.wikipedia.org/wiki/Additive_increase/multiplicative_decrease + * for more information about original AIMD algorithm used for congestion + * avoidance. 
*/ +#define UCT_MM_IFACE_FIFO_MIN_POLL 1 /* Minimal FIFO window size */ +#define UCT_MM_IFACE_FIFO_MAX_POLL 16 /* Default value for FIFO maximal + * window size */ +#define UCT_MM_IFACE_FIFO_AI_VALUE 1 /* FIFO window += AI value */ +#define UCT_MM_IFACE_FIFO_MD_FACTOR 2 /* FIFO window /= MD factor */ + + +/** + * MM interface configuration + */ typedef struct uct_mm_iface_config { - uct_iface_config_t super; - unsigned fifo_size; /* Size of the receive FIFO */ - double release_fifo_factor; - ucs_ternary_value_t hugetlb_mode; /* Enable using huge pages for */ - /* shared memory buffers */ + uct_sm_iface_config_t super; + size_t seg_size; /* Size of the receive + * descriptor (for payload) */ + unsigned fifo_size; /* Size of the receive FIFO */ + size_t fifo_max_poll; /* Maximal RX completions to pick + * during RX poll */ + double release_fifo_factor; /* Tail index update frequency */ + ucs_ternary_value_t hugetlb_mode; /* Enable using huge pages for + * shared memory buffers */ + unsigned fifo_elem_size; /* Size of the FIFO element size */ uct_iface_mpool_config_t mp; } uct_mm_iface_config_t; -struct uct_mm_fifo_ctl { +/** + * MM interface address + */ +typedef struct uct_mm_iface_addr { + uct_mm_seg_id_t fifo_seg_id; /* Shared memory identifier of FIFO */ + /* mapper-specific iface address follows */ +} UCS_S_PACKED uct_mm_iface_addr_t; + + +/** + * MM FIFO control segment + */ +typedef struct uct_mm_fifo_ctl { /* 1st cacheline */ - volatile uint64_t head; /* where to write next */ - socklen_t signal_addrlen; /* address length of signaling socket */ - struct sockaddr_un signal_sockaddr; /* address of signaling socket */ - UCS_CACHELINE_PADDING(uint64_t, socklen_t, struct sockaddr_un); + volatile uint64_t head; /* Where to write next */ + socklen_t signal_addrlen; /* Address length of signaling socket */ + struct sockaddr_un signal_sockaddr;/* Address of signaling socket */ + UCS_CACHELINE_PADDING(uint64_t, + socklen_t, + struct sockaddr_un); /* 2nd cacheline */ - 
volatile uint64_t tail; /* how much was read */ -} UCS_S_PACKED UCS_V_ALIGNED(UCS_SYS_CACHE_LINE_SIZE); + volatile uint64_t tail; /* How much was consumed */ +} UCS_S_PACKED UCS_V_ALIGNED(UCS_SYS_CACHE_LINE_SIZE) uct_mm_fifo_ctl_t; + + +/** + * MM receive descriptor info in the shared FIFO + */ +typedef struct uct_mm_desc_info { + uct_mm_seg_id_t seg_id; /* shared memory segment id */ + unsigned seg_size; /* size of the shared memory segment */ + unsigned offset; /* offset inside the shared memory + segment */ +} UCS_S_PACKED uct_mm_desc_info_t; + + +/** + * MM FIFO element + */ +typedef struct uct_mm_fifo_element { + uint8_t flags; /* UCT_MM_FIFO_ELEM_FLAG_xx */ + uint8_t am_id; /* active message id */ + uint16_t length; /* length of actual data written + by producer */ + uct_mm_desc_info_t desc; /* remote receive descriptor + parameters for am_bcopy */ + void *desc_data; /* pointer to receive descriptor, + valid only on receiver */ + + /* the data follows here (in case of inline messaging) */ +} UCS_S_PACKED uct_mm_fifo_element_t; -struct uct_mm_iface { - uct_base_iface_t super; +/* + * MM receive descriptor: + * + * +--------------------+---------------+-----------+ + * | uct_mm_recv_desc_t | user-defined | data | + * | (info + rdesc) | rx headroom | (payload) | + * +--------------------+---------------+-----------+ + */ +typedef struct uct_mm_recv_desc { + uct_mm_desc_info_t info; /* descriptor information for the + remote side which writes to it */ + uct_recv_desc_t recv; /* has to be in the end */ +} uct_mm_recv_desc_t; + + +/** + * MM trandport interface + */ +typedef struct uct_mm_iface { + uct_sm_iface_t super; /* Receive FIFO */ - uct_mm_id_t fifo_mm_id; /* memory id which will be received */ - /* after allocating the fifo */ - void *shared_mem; /* the beginning of the receive fifo */ + uct_allocated_memory_t recv_fifo_mem; uct_mm_fifo_ctl_t *recv_fifo_ctl; /* pointer to the struct at the */ /* beginning of the receive fifo */ @@ -66,55 +169,63 @@ struct 
uct_mm_iface { /* this struct is cache line aligned and */ /* doesn't necessarily start where */ /* shared_mem starts */ - void *recv_fifo_elements; /* pointer to the first fifo element */ - /* in the receive fifo */ - uint64_t read_index; /* actual reading location */ + void *recv_fifo_elems; /* pointer to the first fifo element + in the receive fifo */ + uct_mm_fifo_element_t *read_index_elem; + uint64_t read_index; /* actual reading location */ - uint8_t fifo_shift; /* = log2(fifo_size) */ - unsigned fifo_mask; /* = 2^fifo_shift - 1 */ + uint8_t fifo_shift; /* = log2(fifo_size) */ + unsigned fifo_mask; /* = 2^fifo_shift - 1 */ uint64_t fifo_release_factor_mask; + unsigned fifo_poll_count; /* How much RX operations can be polled + * during an iface progress call */ + int fifo_prev_wnd_cons; /* Was FIFO window size fully consumed by + * the previous call to iface progress */ + ucs_mpool_t recv_desc_mp; - uct_mm_recv_desc_t *last_recv_desc; /* next receive descriptor to use */ + uct_mm_recv_desc_t *last_recv_desc; /* next receive descriptor to use */ int signal_fd; /* Unix socket for receiving remote signal */ size_t rx_headroom; ucs_arbiter_t arbiter; - const char *path; /* path to the backing file (for 'posix') */ uct_recv_desc_t release_desc; struct { - unsigned fifo_size; - unsigned fifo_elem_size; - unsigned seg_size; /* size of the receive descriptor (for payload)*/ + unsigned fifo_size; + unsigned fifo_elem_size; + unsigned seg_size; /* size of the receive descriptor (for payload)*/ + unsigned fifo_max_poll; } config; -}; +} uct_mm_iface_t; -struct uct_mm_fifo_element { - uint8_t flags; - uint8_t am_id; /* active message id */ - uint16_t length; /* length of actual data */ - - /* bcopy parameters */ - size_t desc_mpool_size; - uct_mm_id_t desc_mmid; /* the mmid of the the memory chunk that - * the desc (that this fifo_elem points to) - * belongs to */ - size_t desc_offset; /* the offset of the desc (its data location for bcopy) - * within the memory chunk it 
belongs to */ - void *desc_chunk_base_addr; - /* the data follows here (in case of inline messaging) */ -} UCS_S_PACKED; +/* + * Define a memory-mapper transport for MM. + * + * @param _name Component name token + * @param _md_ops Memory domain operations, of type uct_mm_md_ops_t. + * @param _rkey_unpack Remote key unpack function + * @param _rkey_release Remote key release function + * @param _cfg_prefix Prefix for configuration variables. + */ +#define UCT_MM_TL_DEFINE(_name, _md_ops, _rkey_unpack, _rkey_release, \ + _cfg_prefix) \ + \ + UCT_MM_COMPONENT_DEFINE(uct_##_name##_component, _name, _md_ops, \ + _rkey_unpack, _rkey_release, _cfg_prefix) \ + \ + UCT_TL_DEFINE(&(uct_##_name##_component).super, \ + _name, \ + uct_sm_base_query_tl_devices, \ + uct_mm_iface_t, \ + "MM_", \ + uct_mm_iface_config_table, \ + uct_mm_iface_config_t); -struct uct_mm_recv_desc { - uct_mm_id_t key; - void *base_address; - size_t mpool_length; - uct_recv_desc_t recv; /* has to be in the end */ -}; +extern ucs_config_field_t uct_mm_iface_config_table[]; static UCS_F_ALWAYS_INLINE ucs_status_t @@ -124,7 +235,8 @@ uct_mm_iface_invoke_am(uct_mm_iface_t *iface, uint8_t am_id, void *data, ucs_status_t status; void *desc; - status = uct_iface_invoke_am(&iface->super, am_id, data, length, flags); + status = uct_iface_invoke_am(&iface->super.super, am_id, data, length, + flags); if (status == UCS_INPROGRESS) { desc = (void *)((uintptr_t)data - iface->rx_headroom); @@ -136,35 +248,25 @@ uct_mm_iface_invoke_am(uct_mm_iface_t *iface, uint8_t am_id, void *data, } -static uct_mm_fifo_ctl_t* uct_mm_set_fifo_ctl(void *mem_region) -{ - return (uct_mm_fifo_ctl_t*) ucs_align_up_pow2 - ((uintptr_t) mem_region , UCS_SYS_CACHE_LINE_SIZE); -} - /** * Set aligned pointers of the FIFO according to the beginning of the allocated * memory. - * - * @param [in] mem_region pointer to the beginning of the allocated memory. - * @param [out] fifo_elems an aligned pointer to the first FIFO element. 
+ * @param [in] fifo_mem Pointer to the beginning of the allocated memory. + * @param [out] fifo_ctl_p Pointer to the FIFO control structure. + * @param [out] fifo_elems Pointer to the array of FIFO elements. */ -static inline void uct_mm_set_fifo_elems_ptr(void *mem_region, void **fifo_elems) -{ - uct_mm_fifo_ctl_t *fifo_ctl; +void uct_mm_iface_set_fifo_ptrs(void *fifo_mem, uct_mm_fifo_ctl_t **fifo_ctl_p, + void **fifo_elems_p); - /* initiate the the uct_mm_fifo_ctl struct, holding the head and the tail */ - fifo_ctl = uct_mm_set_fifo_ctl(mem_region); - /* initiate the pointer to the beginning of the first FIFO element */ - *fifo_elems = (void*) fifo_ctl + UCT_MM_FIFO_CTL_SIZE_ALIGNED; -} +UCS_CLASS_DECLARE_NEW_FUNC(uct_mm_iface_t, uct_iface_t, uct_md_h, uct_worker_h, + const uct_iface_params_t*, const uct_iface_config_t*); + void uct_mm_iface_release_desc(uct_recv_desc_t *self, void *desc); -ucs_status_t uct_mm_flush(); -unsigned uct_mm_iface_progress(void *arg); -extern uct_tl_component_t uct_mm_tl; +ucs_status_t uct_mm_flush(); + #endif diff --git a/src/uct/sm/mm/base/mm_md.c b/src/uct/sm/mm/base/mm_md.c index e46ff50dfae..0461805f53b 100644 --- a/src/uct/sm/mm/base/mm_md.c +++ b/src/uct/sm/mm/base/mm_md.c @@ -5,13 +5,22 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "mm_md.h" +#include +#include +#include + + ucs_config_field_t uct_mm_md_config_table[] = { {"", "", NULL, ucs_offsetof(uct_mm_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)}, - {"HUGETLB_MODE", "yes", + {"HUGETLB_MODE", "try", "Enable using huge pages for internal buffers. 
" "Possible values are:\n" " y - Allocate memory using huge pages only.\n" @@ -22,257 +31,118 @@ ucs_config_field_t uct_mm_md_config_table[] = { {NULL} }; -ucs_status_t uct_mm_mem_alloc(uct_md_h md, size_t *length_p, void **address_p, - unsigned flags, const char *alloc_name, - uct_mem_h *memh_p) +ucs_status_t uct_mm_query_md_resources(uct_component_t *component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) { ucs_status_t status; - uct_mm_seg_t *seg; - - seg = ucs_calloc(1, sizeof(*seg), "mm_seg"); - if (NULL == seg) { - ucs_error("Failed to allocate memory for mm segment"); - return UCS_ERR_NO_MEMORY; - } - - status = uct_mm_md_mapper_ops(md)->alloc(md, length_p, UCS_TRY, flags, - alloc_name, address_p, &seg->mmid, - &seg->path); - if (status != UCS_OK) { - ucs_free(seg); + status = uct_mm_mdc_mapper_ops(component)->query(); + switch (status) { + case UCS_OK: + return uct_md_query_single_md_resource(component, resources_p, + num_resources_p); + case UCS_ERR_UNSUPPORTED: + return uct_md_query_empty_md_resource(resources_p, num_resources_p); + default: return status; } - - seg->length = *length_p; - seg->address = *address_p; - *memh_p = seg; - - ucs_debug("mm allocated address %p length %zu mmid %"PRIu64, - seg->address, seg->length, seg->mmid); - return UCS_OK; } -ucs_status_t uct_mm_mem_free(uct_md_h md, uct_mem_h memh) +ucs_status_t uct_mm_seg_new(void *address, size_t length, uct_mm_seg_t **seg_p) { - uct_mm_seg_t *seg = memh; - ucs_status_t status; - - status = uct_mm_md_mapper_ops(md)->free(seg->address, seg->mmid, seg->length, - seg->path); - if (status != UCS_OK) { - return status; - } - - ucs_free(seg); - return UCS_OK; -} - -ucs_status_t uct_mm_mem_reg(uct_md_h md, void *address, size_t length, - unsigned flags, uct_mem_h *memh_p) -{ - ucs_status_t status; uct_mm_seg_t *seg; - seg = ucs_calloc(1, sizeof(*seg), "mm_seg"); - if (NULL == seg) { - ucs_error("Failed to allocate memory for mm segment"); + seg = ucs_malloc(sizeof(*seg), 
"mm_seg"); + if (seg == NULL) { + ucs_error("failed to allocate mm segment"); return UCS_ERR_NO_MEMORY; } - status = uct_mm_md_mapper_ops(md)->reg(address, length, - &seg->mmid); - if (status != UCS_OK) { - ucs_free(seg); - return status; - } - - seg->length = length; seg->address = address; - *memh_p = seg; - - ucs_debug("mm registered address %p length %zu mmid %"PRIu64, - address, length, seg->mmid); - return UCS_OK; -} - -ucs_status_t uct_mm_mem_dereg(uct_md_h md, uct_mem_h memh) -{ - uct_mm_seg_t *seg = memh; - ucs_status_t status; - - status = uct_mm_md_mapper_ops(md)->dereg(seg->mmid); - if (status != UCS_OK) { - return status; - } - - ucs_free(seg); - return UCS_OK; -} - -ucs_status_t uct_mm_md_query(uct_md_h md, uct_md_attr_t *md_attr) -{ - md_attr->cap.flags = 0; - if (uct_mm_md_mapper_ops(md)->alloc != NULL) { - md_attr->cap.flags |= UCT_MD_FLAG_ALLOC; - } - if (uct_mm_md_mapper_ops(md)->attach != NULL) { - md_attr->cap.flags |= UCT_MD_FLAG_RKEY_PTR; - } - if (uct_mm_md_mapper_ops(md)->reg != NULL) { - md_attr->cap.flags |= UCT_MD_FLAG_REG; - md_attr->reg_cost.overhead = 1000.0e-9; - md_attr->reg_cost.growth = 0.007e-9; - } - md_attr->cap.flags |= UCT_MD_FLAG_NEED_RKEY; - md_attr->cap.reg_mem_types = UCS_BIT(UCT_MD_MEM_TYPE_HOST); - md_attr->cap.mem_type = UCT_MD_MEM_TYPE_HOST; - /* all mm md(s) support fixed memory alloc */ - md_attr->cap.flags |= UCT_MD_FLAG_FIXED; - md_attr->cap.max_alloc = ULONG_MAX; - md_attr->cap.max_reg = 0; - md_attr->rkey_packed_size = sizeof(uct_mm_packed_rkey_t) + - uct_mm_md_mapper_ops(md)->get_path_size(md); - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); - return UCS_OK; -} - -ucs_status_t uct_mm_mkey_pack(uct_md_h md, uct_mem_h memh, void *rkey_buffer) -{ - uct_mm_packed_rkey_t *rkey = rkey_buffer; - uct_mm_seg_t *seg = memh; - - rkey->mmid = seg->mmid; - rkey->owner_ptr = (uintptr_t)seg->address; - rkey->length = seg->length; - - if (seg->path != NULL) { - strcpy(rkey->path, seg->path); - } - - 
ucs_trace("packed rkey: mmid %"PRIu64" owner_ptr %"PRIxPTR, - rkey->mmid, rkey->owner_ptr); + seg->length = length; + seg->seg_id = 0; + *seg_p = seg; return UCS_OK; } -ucs_status_t uct_mm_rkey_unpack(uct_md_component_t *mdc, const void *rkey_buffer, - uct_rkey_t *rkey_p, void **handle_p) +void uct_mm_md_query(uct_md_h md, uct_md_attr_t *md_attr, int support_alloc) { - /* user is responsible to free rkey_buffer */ - const uct_mm_packed_rkey_t *rkey = rkey_buffer; - uct_mm_remote_seg_t *mm_desc; - ucs_status_t status; + memset(md_attr, 0, sizeof(*md_attr)); - ucs_trace("unpacking rkey: mmid %"PRIu64" owner_ptr %"PRIxPTR, - rkey->mmid, rkey->owner_ptr); + md_attr->cap.flags = UCT_MD_FLAG_RKEY_PTR | + UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.max_reg = 0; + md_attr->cap.max_alloc = 0; + md_attr->cap.access_mem_type = UCS_MEMORY_TYPE_HOST; + md_attr->cap.detect_mem_types = 0; - mm_desc = ucs_malloc(sizeof(*mm_desc), "mm_desc"); - if (mm_desc == NULL) { - return UCS_ERR_NO_RESOURCE; + if (support_alloc) { + md_attr->cap.flags |= UCT_MD_FLAG_ALLOC | UCT_MD_FLAG_FIXED; + md_attr->cap.max_alloc = ULONG_MAX; } - status = uct_mm_mdc_mapper_ops(mdc)->attach(rkey->mmid, rkey->length, - (void *)rkey->owner_ptr, - &mm_desc->address, - &mm_desc->cookie, - rkey->path); - if (status != UCS_OK) { - ucs_free(mm_desc); - return status; - } - - mm_desc->length = rkey->length; - mm_desc->mmid = rkey->mmid; - /* store the offset of the addresses, this can be used directly to translate - * the remote VA to local VA of the attached segment */ - *handle_p = mm_desc; - *rkey_p = (uintptr_t)mm_desc->address - rkey->owner_ptr; - return UCS_OK; + memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); } -ucs_status_t uct_mm_rkey_ptr(uct_md_component_t *mdc, uct_rkey_t rkey, +ucs_status_t uct_mm_rkey_ptr(uct_component_t *component, uct_rkey_t rkey, void *handle, uint64_t raddr, void **laddr_p) { - uct_mm_remote_seg_t *mm_desc = handle; - /* rkey stores offset from the remote va */ - 
*laddr_p = (void *)(raddr + (uint64_t)rkey); - if ((*laddr_p < mm_desc->address) || - (*laddr_p >= mm_desc->address + mm_desc->length)) { - return UCS_ERR_INVALID_ADDR; - } + *laddr_p = UCS_PTR_BYTE_OFFSET(raddr, (ptrdiff_t)rkey); return UCS_OK; } -ucs_status_t uct_mm_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, void *handle) +ucs_status_t uct_mm_md_open(uct_component_t *component, const char *md_name, + const uct_md_config_t *config, uct_md_h *md_p) { + uct_mm_component_t *mmc = ucs_derived_of(component, uct_mm_component_t); ucs_status_t status; - uct_mm_remote_seg_t *mm_desc = handle; - - status = uct_mm_mdc_mapper_ops(mdc)->detach(mm_desc); - ucs_free(mm_desc); - return status; -} + uct_mm_md_t *md; -static void uct_mm_md_close(uct_md_h md) -{ - uct_mm_md_t *mm_md = ucs_derived_of(md, uct_mm_md_t); - - ucs_config_parser_release_opts(mm_md->config, md->component->md_config_table); - ucs_free(mm_md->config); - ucs_free(mm_md); -} - -uct_md_ops_t uct_mm_md_ops = { - .close = uct_mm_md_close, - .query = uct_mm_md_query, - .mem_alloc = uct_mm_mem_alloc, - .mem_free = uct_mm_mem_free, - .mem_reg = uct_mm_mem_reg, - .mem_dereg = uct_mm_mem_dereg, - .mkey_pack = uct_mm_mkey_pack, - .is_mem_type_owned = (void *)ucs_empty_function_return_zero, -}; - -ucs_status_t uct_mm_md_open(const char *md_name, const uct_md_config_t *md_config, - uct_md_h *md_p, uct_md_component_t *mdc) -{ - uct_mm_md_t *mm_md; - ucs_status_t status; - - mm_md = ucs_malloc(sizeof(*mm_md), "uct_mm_md_t"); - if (mm_md == NULL) { + md = ucs_malloc(sizeof(*md), "uct_mm_md_t"); + if (md == NULL) { ucs_error("Failed to allocate memory for uct_mm_md_t"); status = UCS_ERR_NO_MEMORY; goto err; } - mm_md->config = ucs_malloc(mdc->md_config_size, "mm_md config"); - if (mm_md->config == NULL) { + md->config = ucs_malloc(mmc->super.md_config.size, "mm_md config"); + if (md->config == NULL) { ucs_error("Failed to allocate memory for mm_md config"); status = UCS_ERR_NO_MEMORY; goto err_free_mm_md; } - 
status = ucs_config_parser_clone_opts(md_config, mm_md->config, - mdc->md_config_table); + status = ucs_config_parser_clone_opts(config, md->config, + mmc->super.md_config.table); if (status != UCS_OK) { ucs_error("Failed to clone opts"); goto err_free_mm_md_config; } - mdc->rkey_ptr = uct_mm_rkey_ptr; - - mm_md->super.ops = &uct_mm_md_ops; - mm_md->super.component = mdc; + md->super.ops = &mmc->md_ops->super; + md->super.component = &mmc->super; + md->iface_addr_len = mmc->md_ops->iface_addr_length(md); - *md_p = &mm_md->super; + /* cppcheck-suppress autoVariables */ + *md_p = &md->super; return UCS_OK; err_free_mm_md_config: - ucs_free(mm_md->config); + ucs_free(md->config); err_free_mm_md: - ucs_free(mm_md); + ucs_free(md); err: return status; } + +void uct_mm_md_close(uct_md_h md) +{ + uct_mm_md_t *mm_md = ucs_derived_of(md, uct_mm_md_t); + + ucs_config_parser_release_opts(mm_md->config, + md->component->md_config.table); + ucs_free(mm_md->config); + ucs_free(mm_md); +} diff --git a/src/uct/sm/mm/base/mm_md.h b/src/uct/sm/mm/base/mm_md.h index 02b8b40a01e..a74f57364a2 100644 --- a/src/uct/sm/mm/base/mm_md.h +++ b/src/uct/sm/mm/base/mm_md.h @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. * * See file LICENSE for terms. @@ -8,160 +8,199 @@ #ifndef UCT_MM_MD_H_ #define UCT_MM_MD_H_ -#include "mm_def.h" - #include #include #include #include -/* Shared memory ID */ -typedef uint64_t uct_mm_id_t; +/* Memory mapper segment unique id, used for both FIFO and bcopy descriptors. + * The exact structure depends on specific mapper */ +typedef uint64_t uct_mm_seg_id_t; -extern ucs_config_field_t uct_mm_md_config_table[]; -/* - * Descriptor of the mapped memory +/** + * Local memory segment structure. 
+ * The mappers must implement memory allocation functions so that they will + * return this structure as uct_memh. */ -struct uct_mm_remote_seg { - uct_mm_remote_seg_t *next; - uct_mm_id_t mmid; /**< mmid of the remote memory chunk */ - void *address; /**< local memory address */ - uint64_t cookie; /**< cookie for mmap, xpmem, etc. */ - size_t length; /**< size of the memory */ -}; +typedef struct uct_mm_seg { + uct_mm_seg_id_t seg_id; /* Shared memory ID */ + void *address; /* Virtual address */ + size_t length; /* Size of the memory */ +} uct_mm_seg_t; + /* - * Memory mapper operations - MM uses them to implement MD and TL functionality. + * Descriptor of remote attached memory */ -typedef struct uct_mm_mapper_ops { +typedef struct uct_mm_remote_seg { + void *address; /* Local address of attached memory */ + void *cookie; /* Mapper-specific data */ +} uct_mm_remote_seg_t; + + +/** + * MM memory domain configuration + */ +typedef struct uct_mm_md_config { + uct_md_config_t super; + ucs_ternary_value_t hugetlb_mode; /* Enable using huge pages */ +} uct_mm_md_config_t; - ucs_status_t (*query)(); - size_t (*get_path_size)(uct_md_h md); +/** + * MM memory domain + */ +typedef struct uct_mm_md { + uct_md_t super; + uct_mm_md_config_t *config; /* Clone of MD configuration */ + size_t iface_addr_len; /* As returned from + uct_mm_md_mapper_ops_t::iface_addr_length */ +} uct_mm_md_t; - uint8_t (*get_priority)(); - ucs_status_t (*reg)(void *address, size_t size, - uct_mm_id_t *mmid_p); +/* Check if available on current machine */ +typedef ucs_status_t (*uct_mm_mapper_query_func_t)(); - ucs_status_t (*dereg)(uct_mm_id_t mm_id); - ucs_status_t (*alloc)(uct_md_h md, size_t *length_p, ucs_ternary_value_t hugetlb, - unsigned flags, const char *alloc_name, void **address_p, - uct_mm_id_t *mmid_p, const char **path_p); +/* Return the size of memory-domain specific iface address (e.g mmap path) */ +typedef size_t (*uct_mm_mapper_iface_addr_length_func_t)(uct_mm_md_t *md); - 
ucs_status_t (*attach)(uct_mm_id_t mmid, size_t length, - void *remote_address, void **address, uint64_t *cookie, - const char *path); - ucs_status_t (*detach)(uct_mm_remote_seg_t *mm_desc); +/* Pack interface address. Holds common information for all memory segments + * allocated on the same interface. 'buffer' must be at least the size returned + * from iface_addr_length() + */ +typedef ucs_status_t +(*uct_mm_mapper_iface_addr_pack_func_t)(uct_mm_md_t *md, void *buffer); - ucs_status_t (*free)(void *address, uct_mm_id_t mm_id, size_t length, - const char *path); -} uct_mm_mapper_ops_t; +/* Attach memory allocated by mem_alloc(). seg_id is from 'uct_mm_seg_t' + * structure, and iface_addr is from iface_addr_pack() on the remote process + * + * This function is used only for active messages memory (FIFO and receive + * descriptors). + */ +typedef ucs_status_t +(*uct_mm_mapper_mem_attach_func_t)(uct_mm_md_t *md, uct_mm_seg_id_t seg_id, + size_t length, const void *iface_addr, + uct_mm_remote_seg_t *rseg); -/* Extract mapper ops from MD component */ -#define uct_mm_mdc_mapper_ops(_mdc) \ - ((uct_mm_mapper_ops_t*)(_mdc)->priv) +/* Check if memory may be attached using mem_attach. seg_id is from + * 'uct_mm_seg_t' structure, and iface_addr is from iface_addr_pack() on the + * remote process + */ +typedef int +(*uct_mm_mapper_is_reachable_func_t)(uct_mm_md_t *md, uct_mm_seg_id_t seg_id, + const void *iface_addr); -/* Extract mapped ops from MD */ -#define uct_mm_md_mapper_ops(_md) \ - uct_mm_mdc_mapper_ops((_md)->component) + +/* Clean up the remote segment handle created by mem_attach() */ +typedef void +(*uct_mm_mapper_mem_detach_func_t)(uct_mm_md_t *md, + const uct_mm_remote_seg_t *rseg); /* - * Define a memory-mapper component for MM. - * - * @param _var Variable for MD component. - * @param _name String which is the component name. - * @param _ops Mapper operations, of type uct_mm_mapper_ops_t. 
- * @param _prefix Prefix for defining the vars config table and config struct. - * @param _cfg_prefix Prefix for configuration environment vars. + * Memory mapper operations - used to implement MD and TL functionality */ -#define UCT_MM_COMPONENT_DEFINE(_var, _name, _ops, _prefix, _cfg_prefix) \ - \ - uct_md_component_t _var; \ - \ - static ucs_status_t _var##_query_md_resources(uct_md_resource_desc_t **resources_p, \ - unsigned *num_resources_p) { \ - if ((_ops)->query() == UCS_OK) { \ - return uct_single_md_resource(&_var, resources_p, num_resources_p); \ - } else { \ - *resources_p = NULL; \ - *num_resources_p = 0; \ - return UCS_OK; \ - } \ - } \ - \ - static ucs_status_t _var##_md_open(const char *md_name, const uct_md_config_t *md_config, \ - uct_md_h *md_p) \ - { \ - return uct_mm_md_open(md_name, md_config, md_p, &_var); \ - } \ - \ - UCT_MD_COMPONENT_DEFINE(_var, _name, \ - _var##_query_md_resources, _var##_md_open, _ops, \ - uct_mm_rkey_unpack, \ - uct_mm_rkey_release, _cfg_prefix, _prefix##_md_config_table, \ - _prefix##_md_config_t) +typedef struct uct_mm_mapper_ops { + uct_md_ops_t super; + uct_mm_mapper_query_func_t query; + uct_mm_mapper_iface_addr_length_func_t iface_addr_length; + uct_mm_mapper_iface_addr_pack_func_t iface_addr_pack; + uct_mm_mapper_mem_attach_func_t mem_attach; + uct_mm_mapper_mem_detach_func_t mem_detach; + uct_mm_mapper_is_reachable_func_t is_reachable; +} uct_mm_md_mapper_ops_t; /** - * Local memory segment structure. 
+ * Memory mapper component */ -typedef struct uct_mm_seg { - uct_mm_id_t mmid; /* Shared memory ID */ - void *address; /* Virtual address */ - size_t length; /* Size of the memory */ - const char *path; /* path to the backing file when using posix */ -} uct_mm_seg_t; +typedef struct uct_mm_component { + uct_component_t super; + uct_mm_md_mapper_ops_t *md_ops; +} uct_mm_component_t; -/** - * Packed remote key - */ -typedef struct uct_mm_packed_rkey { - uct_mm_id_t mmid; /* Shared memory ID */ - uintptr_t owner_ptr; /* VA of in allocating process */ - size_t length; /* Size of the memory */ - char path[0]; /* path to the backing file when using posix */ -} uct_mm_packed_rkey_t; +/* Extract mapper ops from MM component */ +#define uct_mm_mdc_mapper_ops(_component) \ + (ucs_derived_of(_component, uct_mm_component_t)->md_ops) -/** - * MM MD +/* Extract mapper ops from MM memory domain */ +#define uct_mm_md_mapper_ops(_md) \ + ucs_derived_of((_md)->super.ops, uct_mm_md_mapper_ops_t) + + +/* Call mapper operation */ +#define uct_mm_md_mapper_call(_md, _func, ...) \ + uct_mm_md_mapper_ops(_md)->_func(_md, ## __VA_ARGS__) + + +/* + * Define a memory-mapper component for MM. + * + * @param _var Variable for MM component. + * @param _name String which is the component name. + * @param _md_ops Mapper operations, of type uct_mm_mapper_ops_t. + * @param _cfg_prefix Prefix for configuration environment vars. 
*/ -typedef struct uct_mm_md { - uct_md_t super; - uct_mm_md_config_t *config; -} uct_mm_md_t; +#define UCT_MM_COMPONENT_DEFINE(_var, _name, _md_ops, _rkey_unpack, \ + _rkey_release, _cfg_prefix) \ + \ + static uct_mm_component_t _var = { \ + .super = { \ + .query_md_resources = uct_mm_query_md_resources, \ + .md_open = uct_mm_md_open, \ + .cm_open = ucs_empty_function_return_unsupported, \ + .rkey_unpack = _rkey_unpack, \ + .rkey_ptr = uct_mm_rkey_ptr, \ + .rkey_release = _rkey_release, \ + .name = #_name, \ + .md_config = { \ + .name = #_name " memory domain", \ + .prefix = _cfg_prefix, \ + .table = uct_##_name##_md_config_table, \ + .size = sizeof(uct_##_name##_md_config_t), \ + }, \ + .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, \ + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER( \ + &(_var).super), \ + .flags = 0, \ + }, \ + .md_ops = (_md_ops) \ + }; \ + UCT_COMPONENT_REGISTER(&(_var).super); \ -ucs_status_t uct_mm_mem_alloc(uct_md_h md, size_t *length_p, void **address_p, - unsigned flags, const char *alloc_name, - uct_mem_h *memh_p); +extern ucs_config_field_t uct_mm_md_config_table[]; -ucs_status_t uct_mm_mem_free(uct_md_h md, uct_mem_h memh); -ucs_status_t uct_mm_mem_reg(uct_md_h md, void *address, size_t length, - unsigned flags, uct_mem_h *memh_p); +ucs_status_t uct_mm_query_md_resources(uct_component_t *component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p); -ucs_status_t uct_mm_mem_dereg(uct_md_h md, uct_mem_h memh); +ucs_status_t uct_mm_seg_new(void *address, size_t length, uct_mm_seg_t **seg_p); -ucs_status_t uct_mm_md_query(uct_md_h md, uct_md_attr_t *md_attr); +void uct_mm_md_query(uct_md_h md, uct_md_attr_t *md_attr, int support_alloc); -ucs_status_t uct_mm_mkey_pack(uct_md_h md, uct_mem_h memh, void *rkey_buffer); +ucs_status_t uct_mm_rkey_ptr(uct_component_t *component, uct_rkey_t rkey, + void *handle, uint64_t raddr, void **laddr_p); -ucs_status_t uct_mm_rkey_unpack(uct_md_component_t *mdc, const void *rkey_buffer, - 
uct_rkey_t *rkey_p, void **handle_p); +ucs_status_t uct_mm_md_open(uct_component_t *component, const char *md_name, + const uct_md_config_t *config, uct_md_h *md_p); -ucs_status_t uct_mm_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, void *handle); +void uct_mm_md_close(uct_md_h md); -ucs_status_t uct_mm_md_open(const char *md_name, const uct_md_config_t *md_config, - uct_md_h *md_p, uct_md_component_t *_var); +static inline void +uct_mm_md_make_rkey(void *local_address, uintptr_t remote_address, + uct_rkey_t *rkey_p) +{ + *rkey_p = (uintptr_t)local_address - remote_address; +} #endif diff --git a/src/uct/sm/mm/posix/mm_posix.c b/src/uct/sm/mm/posix/mm_posix.c index af5eeb76393..76357ad5191 100644 --- a/src/uct/sm/mm/posix/mm_posix.c +++ b/src/uct/sm/mm/posix/mm_posix.c @@ -5,45 +5,69 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include #include #include +#include #include #include -#define UCT_MM_POSIX_SHM_OPEN_MODE (0666) -#define UCT_MM_POSIX_MMAP_PROT (PROT_READ | PROT_WRITE) -#define UCT_MM_POSIX_HUGETLB UCS_BIT(0) -#define UCT_MM_POSIX_SHM_OPEN UCS_BIT(1) -#define UCT_MM_POSIX_PROC_LINK UCS_BIT(2) -#define UCT_MM_POSIX_CTRL_BITS 3 -#define UCT_MM_POSIX_FD_BITS 29 -#define UCT_MM_POSIX_PID_BITS 32 +/* File open flags */ +#define UCT_POSIX_SHM_CREATE_FLAGS (O_CREAT | O_EXCL | O_RDWR) /* shm create flags */ +#define UCT_POSIX_SHM_OPEN_MODE 0600 /* shm open/create mode */ + +/* Memory mapping parameters */ +#define UCT_POSIX_MMAP_PROT (PROT_READ | PROT_WRITE) + +/* Shared memory segment flags */ +#define UCT_POSIX_SEG_FLAG_PROCFS UCS_BIT(63) /* use procfs mode: mmid encodes an + open fd symlink from procfs */ +#define UCT_POSIX_SEG_FLAG_SHM_OPEN UCS_BIT(62) /* use shm_open() rather than open() */ +#define UCT_POSIX_SEG_FLAG_HUGETLB UCS_BIT(61) /* use MAP_HUGETLB */ +#define UCT_POSIX_SEG_FLAG_PID_NS UCS_BIT(60) /* use PID NS in address */ +#define UCT_POSIX_SEG_FLAGS_MASK (UCT_POSIX_SEG_FLAG_PROCFS | 
\ + UCT_POSIX_SEG_FLAG_SHM_OPEN | \ + UCT_POSIX_SEG_FLAG_PID_NS | \ + UCT_POSIX_SEG_FLAG_HUGETLB) +#define UCT_POSIX_SEG_MMID_MASK (~UCT_POSIX_SEG_FLAGS_MASK) + +/* Packing mmid for procfs mode */ +#define UCT_POSIX_PROCFS_MMID_FD_BITS 30 /* how many bits for file descriptor */ +#define UCT_POSIX_PROCFS_MMID_PID_BITS 30 /* how many bits for pid */ + +/* Filesystem paths */ +#define UCT_POSIX_SHM_OPEN_DIR "/dev/shm" /* directory path for shm_open() */ +#define UCT_POSIX_FILE_FMT "/ucx_shm_posix_%"PRIx64 +#define UCT_POSIX_PROCFS_FILE_FMT "/proc/%d/fd/%d" /* file pattern for procfs mode */ + typedef struct uct_posix_md_config { - uct_mm_md_config_t super; - char *path; - ucs_ternary_value_t use_shm_open; - int use_proc_link; + uct_mm_md_config_t super; + char *dir; + int use_proc_link; } uct_posix_md_config_t; +typedef struct uct_posix_packed_rkey { + uint64_t seg_id; /* flags + mmid */ + uintptr_t address; + size_t length; +} UCS_S_PACKED uct_posix_packed_rkey_t; + + static ucs_config_field_t uct_posix_md_config_table[] = { {"MM_", "", NULL, ucs_offsetof(uct_posix_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_mm_md_config_table)}, - {"USE_SHM_OPEN", "try", "Use shm_open() for opening a file for memory mapping. " - "Possible values are:\n" - " y - Use only shm_open() to open a backing file.\n" - " n - Use only open() to open a backing file.\n" - " try - Try to use shm_open() and if it fails, use open().\n" - "If shm_open() is used, the path to the file defaults to /dev/shm.\n" - "If open() is used, the path to the file is specified in the parameter bellow (DIR).", - ucs_offsetof(uct_posix_md_config_t, use_shm_open), UCS_CONFIG_TYPE_TERNARY}, - - {"DIR", "/tmp", "The path to the backing file in case open() is used.", - ucs_offsetof(uct_posix_md_config_t, path), UCS_CONFIG_TYPE_STRING}, + {"DIR", UCT_POSIX_SHM_OPEN_DIR, + "The path to the backing file. If it's equal to " UCT_POSIX_SHM_OPEN_DIR " then \n" + "shm_open() is used. 
Otherwise, open() is used.", + ucs_offsetof(uct_posix_md_config_t, dir), UCS_CONFIG_TYPE_STRING}, {"USE_PROC_LINK", "y", "Use /proc//fd/ to share posix file.\n" " y - Use /proc//fd/ to share posix file.\n" @@ -53,13 +77,65 @@ static ucs_config_field_t uct_posix_md_config_table[] = { {NULL} }; -static ucs_status_t uct_posix_test_mem(size_t length, int shm_fd) +static int uct_posix_use_shm_open(const uct_posix_md_config_t *posix_config) { - int *buf; - int chunk_size = 256 * 1024; - ucs_status_t status = UCS_OK; + return !strcmp(posix_config->dir, UCT_POSIX_SHM_OPEN_DIR); +} + +static size_t uct_posix_iface_addr_length(uct_mm_md_t *md) +{ + const uct_posix_md_config_t *posix_config = + ucs_derived_of(md->config, uct_posix_md_config_t); + + /* if shm_open is requested, the path to the backing file is /dev/shm + * by default. however, if shm_open isn't used, the size of the path to the + * requested backing file is needed so that the user would know how much + * space to allocate for the rkey. + */ + if (posix_config->use_proc_link) { + return ucs_sys_ns_is_default(UCS_SYS_NS_TYPE_PID) ? 0 : sizeof(ucs_sys_ns_t); + } + + return uct_posix_use_shm_open(posix_config) ? 
+ 0 : (strlen(posix_config->dir) + 1); +} + +static ucs_status_t uct_posix_md_query(uct_md_h tl_md, uct_md_attr_t *md_attr) +{ + uct_mm_md_t *md = ucs_derived_of(tl_md, uct_mm_md_t); + + uct_mm_md_query(&md->super, md_attr, 1); + md_attr->rkey_packed_size = sizeof(uct_posix_packed_rkey_t) + + uct_posix_iface_addr_length(md); + return UCS_OK; +} + +static uint64_t uct_posix_mmid_procfs_pack(int fd) +{ + pid_t pid = getpid(); + + UCS_STATIC_ASSERT(UCS_MASK(UCT_POSIX_PROCFS_MMID_PID_BITS + + UCT_POSIX_PROCFS_MMID_FD_BITS) == + UCT_POSIX_SEG_MMID_MASK); + + ucs_assert(pid <= UCS_MASK(UCT_POSIX_PROCFS_MMID_PID_BITS)); + ucs_assert(fd <= UCS_MASK(UCT_POSIX_PROCFS_MMID_FD_BITS)); + return pid | ((uint64_t)fd << UCT_POSIX_PROCFS_MMID_PID_BITS); +} + +static void uct_posix_mmid_procfs_unpack(uint64_t mmid, int *pid_p, int *fd_p) +{ + *fd_p = mmid >> UCT_POSIX_PROCFS_MMID_PID_BITS; + *pid_p = mmid & UCS_MASK(UCT_POSIX_PROCFS_MMID_PID_BITS); +} + +static ucs_status_t uct_posix_test_mem(int shm_fd, size_t length) +{ + const size_t chunk_size = 64 * UCS_KBYTE; size_t size_to_write, remaining; ssize_t single_write; + ucs_status_t status; + int *buf; buf = ucs_malloc(chunk_size, "write buffer"); if (buf == NULL) { @@ -98,464 +174,502 @@ static ucs_status_t uct_posix_test_mem(size_t length, int shm_fd) remaining -= single_write; } + status = UCS_OK; + out_free_buf: ucs_free(buf); - out: return status; } -static size_t uct_posix_get_path_size(uct_md_h md) +ucs_status_t uct_posix_open_check_result(const char *func, const char *file_name, + int open_flags, int ret, int *fd_p) { - uct_mm_md_t *mm_md = ucs_derived_of(md, uct_mm_md_t); - uct_posix_md_config_t *posix_config = ucs_derived_of(mm_md->config, - uct_posix_md_config_t); - - /* if shm_open is requested, the path to the backing file is /dev/shm - * by default. 
however, if shm_open isn't used, in case UCS_NO was set for - * use_shm_open or if UCS_TRY was set but using shm_open() was unsuccessful, - * the size of the path to the requested backing file is needed so that the - * user would know how much space to allocated for the rkey. */ - if (posix_config->use_shm_open == UCS_YES) { - return 0; + if (ret >= 0) { + *fd_p = ret; + return UCS_OK; + } else if (errno == EEXIST) { + return UCS_ERR_ALREADY_EXISTS; } else { - return 1 + strlen(posix_config->path); + ucs_error("%s(file_name=%s flags=0x%x) failed: %m", func, file_name, + open_flags); + return UCS_ERR_SHMEM_SEGMENT; } } -static uint8_t uct_posix_get_priority() +static ucs_status_t uct_posix_shm_open(uint64_t mmid, int open_flags, int *fd_p) { - return 0; + char file_name[NAME_MAX]; + int ret; + + ucs_snprintf_safe(file_name, sizeof(file_name), UCT_POSIX_FILE_FMT, mmid); + ret = shm_open(file_name, open_flags | O_RDWR, UCT_POSIX_SHM_OPEN_MODE); + return uct_posix_open_check_result("shm_open", file_name, open_flags, ret, + fd_p); } -static ucs_status_t uct_posix_set_path(char *file_name, int use_shm_open, - const char *path, uint64_t uuid) +static ucs_status_t uct_posix_file_open(const char *dir, uint64_t mmid, + int open_flags, int* fd_p) { - ucs_status_t status; - int ret, len; + char file_path[PATH_MAX]; + int ret; - if (!use_shm_open) { - strncpy(file_name, path, NAME_MAX); - } + ucs_snprintf_safe(file_path, sizeof(file_path), "%s" UCT_POSIX_FILE_FMT, + dir, mmid); + ret = open(file_path, open_flags | O_RDWR, UCT_POSIX_SHM_OPEN_MODE); + return uct_posix_open_check_result("open", file_path, open_flags, ret, fd_p); +} - len = strlen(file_name); - ret = snprintf(file_name + len, NAME_MAX - len, - "/ucx_posix_mm_%s_%s_%016lx", ucs_get_user_name(), - ucs_get_host_name(), uuid); - if ((ret >= (NAME_MAX - len)) || (ret < 1)) { - status = UCS_ERR_INVALID_PARAM; - return status; - } +static ucs_status_t uct_posix_procfs_open(int pid, int peer_fd, int* fd_p) +{ + char 
file_path[PATH_MAX]; + int ret; - return UCS_OK; + ucs_snprintf_safe(file_path, sizeof(file_path), UCT_POSIX_PROCFS_FILE_FMT, + pid, peer_fd); + ret = open(file_path, O_RDWR, UCT_POSIX_SHM_OPEN_MODE); + return uct_posix_open_check_result("open", file_path, 0, ret, fd_p); } -static ucs_status_t uct_posix_shm_open(const char *file_name, size_t length, int *shm_fd) +static ucs_status_t uct_posix_unlink(uct_mm_md_t *md, uint64_t seg_id) { - ucs_status_t status; + uct_posix_md_config_t *posix_config = ucs_derived_of(md->config, + uct_posix_md_config_t); + char file_path[PATH_MAX]; + int ret; - /* Create shared memory object and set its size */ - *shm_fd = shm_open(file_name, O_CREAT | O_RDWR | O_EXCL, - UCT_MM_POSIX_SHM_OPEN_MODE); - if (*shm_fd == -1) { - ucs_error("Error returned from shm_open %s. File name is: %s", - strerror(errno), file_name); - status = UCS_ERR_SHMEM_SEGMENT; - goto err; - } - if (ftruncate(*shm_fd, length) == -1) { - ucs_error("Error returned from ftruncate %m"); - status = UCS_ERR_SHMEM_SEGMENT; - goto err_shm_unlink; + if (seg_id & UCT_POSIX_SEG_FLAG_SHM_OPEN) { + ucs_snprintf_safe(file_path, sizeof(file_path), UCT_POSIX_FILE_FMT, + seg_id & UCT_POSIX_SEG_MMID_MASK); + ret = shm_unlink(file_path); + if (ret < 0) { + ucs_error("shm_unlink(%s) failed: %m", file_path); + return UCS_ERR_SHMEM_SEGMENT; + } + } else { + ucs_snprintf_safe(file_path, sizeof(file_path), "%s" UCT_POSIX_FILE_FMT, + posix_config->dir, seg_id & UCT_POSIX_SEG_MMID_MASK); + ret = unlink(file_path); + if (ret < 0) { + ucs_error("unlink(%s) failed: %m", file_path); + return UCS_ERR_SHMEM_SEGMENT; + } } return UCS_OK; - -err_shm_unlink: - close(*shm_fd); - if (shm_unlink(file_name) != 0) { - ucs_warn("unable to shm_unlink the shared memory segment"); - } -err: - return status; } -static ucs_status_t uct_posix_open(const char *file_name, size_t length, int *shm_fd) +static ucs_status_t +uct_posix_mmap(void **address_p, size_t *length_p, int flags, int fd, + const char 
*alloc_name, ucs_log_level_t err_level) { - ucs_status_t status; + size_t aligned_length; + void *result; - /* use open with the given path */ - *shm_fd = open(file_name, O_CREAT | O_RDWR | O_EXCL, UCT_MM_POSIX_SHM_OPEN_MODE); - if (*shm_fd == -1) { - ucs_error("Error returned from open %s . File name is: %s", - strerror(errno), file_name); - status = UCS_ERR_SHMEM_SEGMENT; - goto err; + aligned_length = ucs_align_up_pow2(*length_p, ucs_get_page_size()); + +#ifdef MAP_HUGETLB + if (flags & MAP_HUGETLB) { + ssize_t huge_page_size = ucs_get_huge_page_size(); + size_t huge_aligned_length; + + if (huge_page_size <= 0) { + ucs_debug("huge pages are not supported on the system"); + return UCS_ERR_NO_MEMORY; /* Huge pages not supported */ + } + + huge_aligned_length = ucs_align_up_pow2(aligned_length, huge_page_size); + if (huge_aligned_length > (2 * aligned_length)) { + return UCS_ERR_EXCEEDS_LIMIT; /* Do not align up by more than 2x */ + } + + aligned_length = huge_aligned_length; } +#endif - if (ftruncate(*shm_fd, length) == -1) { - ucs_error("Error returned from ftruncate %m"); - status = UCS_ERR_SHMEM_SEGMENT; - goto err_close; + result = ucs_mmap(*address_p, aligned_length, UCT_POSIX_MMAP_PROT, + MAP_SHARED | flags, fd, 0 UCS_MEMTRACK_VAL); + if (result == MAP_FAILED) { + ucs_log(err_level, + "shared memory mmap(addr=%p, length=%zu, flags=%s%s, fd=%d) failed: %m", + *address_p, aligned_length, + (flags & MAP_FIXED) ? " FIXED" : "", +#ifdef MAP_HUGETLB + (flags & MAP_HUGETLB) ? 
" HUGETLB" : "", +#else + "", +#endif + fd); + return UCS_ERR_SHMEM_SEGMENT; } + *address_p = result; + *length_p = aligned_length; + return UCS_OK; +} -err_close: - close(*shm_fd); - if (unlink(file_name) != 0) { - ucs_warn("unable to unlink the shared memory segment"); +static ucs_status_t uct_posix_munmap(void *address, size_t length) +{ + int ret; + + ret = ucs_munmap(address, length); + if (ret != 0) { + ucs_warn("shared memory munmap(address=%p, length=%zu) failed: %m", + address, length); + return UCS_ERR_SHMEM_SEGMENT; } -err: - return status; + + return UCS_OK; } static ucs_status_t -uct_posix_open_backing_file(char *file_name, uint64_t *uuid, uct_posix_md_config_t *config, - size_t length, int *shm_fd, const char **path_p) +uct_posix_mem_attach_common(uct_mm_seg_id_t seg_id, size_t length, + const char *dir, uct_mm_remote_seg_t *rseg) { + uint64_t mmid = seg_id & UCT_POSIX_SEG_MMID_MASK; + int pid, peer_fd, fd; ucs_status_t status; + int mmap_flags; - if (config->use_shm_open != UCS_NO) { - status = uct_posix_set_path(file_name, 1, NULL, *uuid >> UCT_MM_POSIX_CTRL_BITS); - if (status != UCS_OK) { - goto out; - } + ucs_assert(length > 0); + rseg->cookie = (void*)length; - status = uct_posix_shm_open(file_name, length, shm_fd); - if ((config->use_shm_open == UCS_TRY) && (status != UCS_OK)) { - goto use_open; - } else { - *uuid |= UCT_MM_POSIX_SHM_OPEN; - goto out; - } - } - -use_open: - status = uct_posix_set_path(file_name, 0, config->path, *uuid >> UCT_MM_POSIX_CTRL_BITS); - if (status != UCS_OK) { - goto out; + if (seg_id & UCT_POSIX_SEG_FLAG_PROCFS) { + uct_posix_mmid_procfs_unpack(mmid, &pid, &peer_fd); + status = uct_posix_procfs_open(pid, peer_fd, &fd); + } else if (seg_id & UCT_POSIX_SEG_FLAG_SHM_OPEN) { + status = uct_posix_shm_open(mmid, 0, &fd); + } else { + ucs_assert(dir != NULL); /* for coverity */ + status = uct_posix_file_open(dir, mmid, 0, &fd); } - - status = uct_posix_open(file_name, length, shm_fd); if (status != UCS_OK) { return status; 
} - *uuid &= ~UCT_MM_POSIX_SHM_OPEN; - *path_p = config->path; -out: +#ifdef MAP_HUGETLB + mmap_flags = (seg_id & UCT_POSIX_SEG_FLAG_HUGETLB) ? MAP_HUGETLB : 0; +#else + mmap_flags = 0; +#endif + rseg->address = NULL; + status = uct_posix_mmap(&rseg->address, &length, mmap_flags, fd, + "posix_attach", UCS_LOG_LEVEL_ERROR); + close(fd); return status; } +static int +uct_posix_is_reachable(uct_mm_md_t *md, uct_mm_seg_id_t seg_id, + const void *iface_addr) +{ + if (seg_id & UCT_POSIX_SEG_FLAG_PID_NS) { + return ucs_sys_get_ns(UCS_SYS_NS_TYPE_PID) == *(const ucs_sys_ns_t*)iface_addr; + } + + return ucs_sys_ns_is_default(UCS_SYS_NS_TYPE_PID); +} + +static ucs_status_t uct_posix_mem_detach_common(const uct_mm_remote_seg_t *rseg) +{ + return uct_posix_munmap(rseg->address, (size_t)rseg->cookie); +} + static ucs_status_t -uct_posix_alloc(uct_md_h md, size_t *length_p, ucs_ternary_value_t hugetlb, - unsigned md_map_flags, const char *alloc_name, void **address_p, - uct_mm_id_t *mmid_p, const char **path_p) +uct_posix_segment_open(uct_mm_md_t *md, uct_mm_seg_id_t *seg_id_p, int *fd_p) { + uct_posix_md_config_t *posix_config = ucs_derived_of(md->config, + uct_posix_md_config_t); + uint64_t mmid, flags; ucs_status_t status; - int shm_fd = -1; - uint64_t uuid; - char *file_name; - int mmap_flags; - void *addr_wanted; - uct_mm_md_t *mm_md = ucs_derived_of(md, uct_mm_md_t); - uct_posix_md_config_t *posix_config = ucs_derived_of(mm_md->config, + unsigned rand_seed; + + /* Generate random 32-bit shared memory id and make sure it's not used + * already by opening the file with O_CREAT|O_EXCL */ + rand_seed = ucs_generate_uuid((uintptr_t)md); + for (;;) { + mmid = rand_r(&rand_seed); + ucs_assert(mmid <= UCT_POSIX_SEG_MMID_MASK); + if (uct_posix_use_shm_open(posix_config)) { + flags = UCT_POSIX_SEG_FLAG_SHM_OPEN; + status = uct_posix_shm_open(mmid, UCT_POSIX_SHM_CREATE_FLAGS, fd_p); + } else { + flags = 0; + status = uct_posix_file_open(posix_config->dir, mmid, + 
UCT_POSIX_SHM_CREATE_FLAGS, fd_p); + } + if (status == UCS_OK) { + *seg_id_p = mmid | flags; + return UCS_OK; /* found unique file name */ + } else if (status != UCS_ERR_ALREADY_EXISTS) { + return status; /* unexpected error (e.g permission denied) */ + } + /* file exists, retry */ + } +} + +static ucs_status_t +uct_posix_mem_alloc(uct_md_h tl_md, size_t *length_p, void **address_p, + unsigned flags, const char *alloc_name, uct_mem_h *memh_p) +{ + uct_mm_md_t *md = ucs_derived_of(tl_md, uct_mm_md_t); + uct_posix_md_config_t *posix_config = ucs_derived_of(md->config, uct_posix_md_config_t); + ucs_status_t status; + uct_mm_seg_t *seg; + int force_hugetlb; + int mmap_flags; + void *address; + int fd; - if (0 == *length_p) { - ucs_error("Unexpected length %zu", *length_p); - status = UCS_ERR_INVALID_PARAM; + status = uct_mm_seg_new(*address_p, *length_p, &seg); + if (status != UCS_OK) { goto err; } - file_name = ucs_calloc(1, NAME_MAX, "shared mr posix"); - if (file_name == NULL) { - status = UCS_ERR_NO_MEMORY; - ucs_error("Failed to allocate memory for the shm_open file name. %m"); - goto err; + status = uct_posix_segment_open(md, &seg->seg_id, &fd); + if (status != UCS_OK) { + goto err_free_seg; } - /* Generate a 64 bit uuid. - * use 61 bits of it for creating the file_name of the backing file. - * other 2 bits: - * 1 bit is for indicating whether or not hugepages were used. - * 1 bit is for indicating whether or not shm_open() was used. - * 1 bit is for indicating whether or not /proc//fd/ was used. 
*/ - uuid = ucs_generate_uuid(0); - - status = uct_posix_open_backing_file(file_name, &uuid, posix_config, - *length_p, &shm_fd, path_p); + /* Check if the location of the backing file has enough memory for the + * needed size by trying to write there before calling mmap */ + status = uct_posix_test_mem(fd, seg->length); if (status != UCS_OK) { - goto err_free_file; + goto err_close; } - /* immediately unlink the file */ + /* If using procfs link instead of mmid, remove the original file and update + * seg->seg_id */ if (posix_config->use_proc_link) { - int ret = (uuid & UCT_MM_POSIX_SHM_OPEN) ? shm_unlink(file_name) : unlink(file_name); - if (ret != 0) { - ucs_warn("unable to unlink the shared memory segment. File name is: %s", - file_name); - status = UCS_ERR_SHMEM_SEGMENT; - goto err_free_file; + status = uct_posix_unlink(md, seg->seg_id); + if (status != UCS_OK) { + goto err_close; } - uuid |= UCT_MM_POSIX_PROC_LINK; - } else { - uuid &= ~UCT_MM_POSIX_PROC_LINK; - } - - /* check is the location of the backing file has enough memory for the needed size - * by trying to write there before calling mmap */ - status = uct_posix_test_mem(*length_p, shm_fd); - if (status != UCS_OK) { - goto err_shm_unlink; - } - - status = UCS_ERR_NO_MEMORY; - - if (posix_config->use_proc_link) { - /* encode fd and pid into uuid */ - uuid &= UCS_MASK_SAFE(UCT_MM_POSIX_CTRL_BITS); - uuid |= (shm_fd << UCT_MM_POSIX_CTRL_BITS); - uuid |= ((uint64_t)getpid()) << (UCT_MM_POSIX_CTRL_BITS + UCT_MM_POSIX_FD_BITS); - - /* Here we encoded fd into uuid using 29 bits, which - * is less than 32 bits (one integer), so there are - * 3 bits lost. We make sure here the encoded fd equals - * to the original fd. If they are not equal, which means - * 29 bits is not enough for fd, we need proper solutions - * to deal with it. */ - ucs_assert(shm_fd == ((uuid >> UCT_MM_POSIX_CTRL_BITS) & UCS_MASK_SAFE(UCT_MM_POSIX_FD_BITS))); + /* Replace mmid by pid+fd. 
Keep previous SHM_OPEN flag for mkey_pack() */ + seg->seg_id = uct_posix_mmid_procfs_pack(fd) | + (seg->seg_id & UCT_POSIX_SEG_FLAG_SHM_OPEN) | + UCT_POSIX_SEG_FLAG_PROCFS | + (ucs_sys_ns_is_default(UCS_SYS_NS_TYPE_PID) ? 0 : + UCT_POSIX_SEG_FLAG_PID_NS); } /* mmap the shared memory segment that was created by shm_open */ - - if (md_map_flags & UCT_MD_MEM_FLAG_FIXED) { - mmap_flags = MAP_FIXED|MAP_SHARED; - addr_wanted = *address_p; + if (flags & UCT_MD_MEM_FLAG_FIXED) { + mmap_flags = MAP_FIXED; } else { - mmap_flags = MAP_SHARED; - addr_wanted = NULL; + seg->address = NULL; + mmap_flags = 0; } + /* try HUGETLB mmap */ + address = MAP_FAILED; + if (posix_config->super.hugetlb_mode != UCS_NO) { + force_hugetlb = (posix_config->super.hugetlb_mode == UCS_YES); #ifdef MAP_HUGETLB - if (hugetlb != UCS_NO) { - (*address_p) = ucs_mmap(addr_wanted, *length_p, UCT_MM_POSIX_MMAP_PROT, - mmap_flags | MAP_HUGETLB, - shm_fd, 0 UCS_MEMTRACK_VAL); - if ((*address_p) != MAP_FAILED) { - /* indicate that the memory was mapped with hugepages */ - uuid |= UCT_MM_POSIX_HUGETLB; - goto out_ok; - } - - ucs_debug("mm failed to allocate %zu bytes with hugetlb %m", *length_p); - } - + status = uct_posix_mmap(&seg->address, &seg->length, + mmap_flags | MAP_HUGETLB, fd, alloc_name, + force_hugetlb ? 
UCS_LOG_LEVEL_ERROR : + UCS_LOG_LEVEL_DEBUG); #else - if (hugetlb == UCS_YES) { - ucs_error("Hugepages were requested but they cannot be used with posix mmap."); status = UCS_ERR_SHMEM_SEGMENT; - goto err_shm_unlink; - } + if (force_hugetlb) { + ucs_error("shared memory allocation failed: " + "MAP_HUGETLB is not supported on the system"); + } #endif - - if (hugetlb != UCS_YES) { - (*address_p) = ucs_mmap(addr_wanted, *length_p, UCT_MM_POSIX_MMAP_PROT, - mmap_flags, shm_fd, 0 UCS_MEMTRACK_VAL); - if ((*address_p) != MAP_FAILED) { - /* indicate that the memory was mapped without hugepages */ - uuid &= ~UCT_MM_POSIX_HUGETLB; - goto out_ok; + if ((status != UCS_OK) && force_hugetlb) { + goto err_close; + } else if (status == UCS_OK) { + seg->seg_id |= UCT_POSIX_SEG_FLAG_HUGETLB; } - - ucs_debug("mm failed to allocate %zu bytes without hugetlb for %s: %m", - *length_p, alloc_name); } -err_shm_unlink: - close(shm_fd); - if (!posix_config->use_proc_link) { - if (shm_unlink(file_name) != 0) { - ucs_warn("unable to unlink the shared memory segment"); + /* fallback to regular mmap */ + if (address == MAP_FAILED) { + ucs_assert(posix_config->super.hugetlb_mode != UCS_YES); + status = uct_posix_mmap(&seg->address, &seg->length, mmap_flags, fd, + alloc_name, UCS_LOG_LEVEL_ERROR); + if (status != UCS_OK) { + goto err_close; } } -err_free_file: - ucs_free(file_name); -err: - return status; -out_ok: - ucs_free(file_name); + /* create new memory segment */ + ucs_debug("allocated posix shared memory at %p length %zu", seg->address, + seg->length); + if (!posix_config->use_proc_link) { - /* closing the shm_fd here won't unmap the mem region*/ - close(shm_fd); + /* closing the file here since the peers will open it by file system path */ + close(fd); } - *mmid_p = uuid; + + *address_p = seg->address; + *length_p = seg->length; + *memh_p = seg; return UCS_OK; -} -static ucs_status_t uct_posix_attach(uct_mm_id_t mmid, size_t length, - void *remote_address, - void **local_address, - 
uint64_t *cookie, const char *path) -{ - void *ptr; - char *file_name; - int shm_fd; - ucs_status_t status = UCS_OK; - - file_name = ucs_calloc(1, NAME_MAX, "shared mr posix"); - if (file_name == NULL) { - ucs_error("Failed to allocate memory for file_name to attach. %m"); - status = UCS_ERR_NO_MEMORY; - goto err; +err_close: + close(fd); + if (!(seg->seg_id & UCT_POSIX_SEG_FLAG_PROCFS)) { + uct_posix_unlink(md, seg->seg_id); } +err_free_seg: + ucs_free(seg); +err: + return status; +} - if (mmid & UCT_MM_POSIX_PROC_LINK) { - int orig_fd, pid; - uct_mm_id_t temp_mmid; - - temp_mmid = mmid >> UCT_MM_POSIX_CTRL_BITS; - orig_fd = temp_mmid & UCS_MASK_SAFE(UCT_MM_POSIX_FD_BITS); - temp_mmid >>= UCT_MM_POSIX_FD_BITS; - pid = temp_mmid & UCS_MASK_SAFE(UCT_MM_POSIX_PID_BITS); +static ucs_status_t uct_posix_mem_free(uct_md_h tl_md, uct_mem_h memh) +{ + uct_mm_md_t *md = ucs_derived_of(tl_md, uct_mm_md_t); + uct_mm_seg_t *seg = memh; + ucs_status_t status; + int fd, dummy_pid; - /* get internal path /proc/pid/fd/ */ - snprintf(file_name, NAME_MAX, "/proc/%d/fd/%d", pid, orig_fd); + status = uct_posix_munmap(seg->address, seg->length); + if (status != UCS_OK) { + return status; + } - shm_fd = open(file_name, O_RDWR, UCT_MM_POSIX_SHM_OPEN_MODE); + if (seg->seg_id & UCT_POSIX_SEG_FLAG_PROCFS) { + uct_posix_mmid_procfs_unpack(seg->seg_id & UCT_POSIX_SEG_MMID_MASK, + &dummy_pid, &fd); + ucs_assert(dummy_pid == getpid()); + close(fd); } else { - status = uct_posix_set_path(file_name, mmid & UCT_MM_POSIX_SHM_OPEN, path, - mmid >> UCT_MM_POSIX_CTRL_BITS); + status = uct_posix_unlink(md, seg->seg_id); if (status != UCS_OK) { - goto err_free_file; - } - - /* use the mmid (62 bits) to recreate the file_name for opening */ - if (mmid & UCT_MM_POSIX_SHM_OPEN) { - shm_fd = shm_open(file_name, O_RDWR | O_EXCL, UCT_MM_POSIX_SHM_OPEN_MODE); - } else { - shm_fd = open(file_name, O_CREAT | O_RDWR, UCT_MM_POSIX_SHM_OPEN_MODE); + return status; } } - if (shm_fd == -1) { - ucs_error("Error 
returned from open in attach. %s. File name is: %s%s", - strerror(errno), - (mmid & UCT_MM_POSIX_PROC_LINK) ? "" : - (mmid & UCT_MM_POSIX_SHM_OPEN) ? "/dev/shm" : "", - file_name); + ucs_free(seg); + return UCS_OK; +} - status = UCS_ERR_SHMEM_SEGMENT; - goto err_free_file; - } +static void uct_posix_copy_dir(uct_mm_md_t *md, void *buffer) +{ + const uct_posix_md_config_t *posix_config = + ucs_derived_of(md->config, uct_posix_md_config_t); -#ifdef MAP_HUGETLB - if (mmid & UCT_MM_POSIX_HUGETLB) { - ptr = ucs_mmap(NULL ,length, UCT_MM_POSIX_MMAP_PROT, - MAP_SHARED | MAP_HUGETLB, - shm_fd, 0 UCS_MEMTRACK_NAME("posix mmap attach")); - } else -#endif - { - ptr = ucs_mmap(NULL ,length, UCT_MM_POSIX_MMAP_PROT, MAP_SHARED, - shm_fd, 0 UCS_MEMTRACK_NAME("posix mmap attach")); - } - if (ptr == MAP_FAILED) { - ucs_error("ucs_mmap(shm_fd=%d) failed: %m", (int)shm_fd); - status = UCS_ERR_SHMEM_SEGMENT; - goto err_close_fd; - } + memcpy(buffer, posix_config->dir, strlen(posix_config->dir) + 1); +} - ucs_trace("attached remote segment '%s' remote_address %p at address %p", - file_name, remote_address, ptr); +static ucs_status_t uct_posix_iface_addr_pack(uct_mm_md_t *md, void *buffer) +{ + const uct_posix_md_config_t *posix_config = + ucs_derived_of(md->config, uct_posix_md_config_t); + + if (posix_config->use_proc_link) { + if (!ucs_sys_ns_is_default(UCS_SYS_NS_TYPE_PID)) { + *(ucs_sys_ns_t*)buffer = ucs_sys_get_ns(UCS_SYS_NS_TYPE_PID); + } + return UCS_OK; + } - *local_address = ptr; - *cookie = 0xdeadbeef; + if (!uct_posix_use_shm_open(posix_config)) { + uct_posix_copy_dir(md, buffer); + } -err_close_fd: - /* closing the fd here won't unmap the mem region (if ucs_mmap was successful) */ - close(shm_fd); -err_free_file: - ucs_free(file_name); -err: - return status; + return UCS_OK; } -static ucs_status_t uct_posix_detach(uct_mm_remote_seg_t *mm_desc) +static ucs_status_t +uct_posix_md_mkey_pack(uct_md_h tl_md, uct_mem_h memh, void *rkey_buffer) { - int ret; + uct_mm_md_t *md = 
ucs_derived_of(tl_md, uct_mm_md_t); + uct_mm_seg_t *seg = memh; + uct_posix_packed_rkey_t *packed_rkey = rkey_buffer; - ret = ucs_munmap(mm_desc->address, mm_desc->length); - if (ret != 0) { - ucs_warn("Unable to unmap shared memory segment at %p: %m", mm_desc->address); - return UCS_ERR_SHMEM_SEGMENT; + packed_rkey->seg_id = seg->seg_id; + packed_rkey->address = (uintptr_t)seg->address; + packed_rkey->length = seg->length; + if (!(seg->seg_id & UCT_POSIX_SEG_FLAG_SHM_OPEN) && + !(seg->seg_id & UCT_POSIX_SEG_FLAG_PROCFS)) { + uct_posix_copy_dir(md, packed_rkey + 1); } return UCS_OK; } -static ucs_status_t uct_posix_free(void *address, uct_mm_id_t mm_id, size_t length, - const char *path) +static ucs_status_t uct_posix_mem_attach(uct_mm_md_t *md, uct_mm_seg_id_t seg_id, + size_t length, const void *iface_addr, + uct_mm_remote_seg_t *remote_seg) { - int ret; - ucs_status_t status = UCS_OK; + return uct_posix_mem_attach_common(seg_id, length, iface_addr, remote_seg); +} - ret = ucs_munmap(address, length); - if (ret != 0) { - ucs_error("Unable to unmap shared memory segment at %p: %m", address); - status = UCS_ERR_SHMEM_SEGMENT; - goto err; +static void uct_posix_mem_detach(uct_mm_md_t *md, const uct_mm_remote_seg_t *rseg) +{ + uct_posix_mem_detach_common(rseg); +} + +static ucs_status_t +uct_posix_rkey_unpack(uct_component_t *component, const void *rkey_buffer, + uct_rkey_t *rkey_p, void **handle_p) +{ + const uct_posix_packed_rkey_t *packed_rkey = rkey_buffer; + uct_mm_remote_seg_t *rseg; + ucs_status_t status; + + rseg = ucs_malloc(sizeof(*rseg), "posix_remote_seg"); + if (rseg == NULL) { + ucs_error("failed to allocate posix remote segment descriptor"); + return UCS_ERR_NO_MEMORY; } - if (mm_id & UCT_MM_POSIX_PROC_LINK) { - int orig_fd; - mm_id >>= UCT_MM_POSIX_CTRL_BITS; - orig_fd = (int)(mm_id & UCS_MASK_SAFE(UCT_MM_POSIX_FD_BITS)); - close(orig_fd); - } else { - char *file_name = ucs_calloc(1, NAME_MAX, "shared mr posix mmap"); - if (file_name == NULL) { - 
ucs_error("Failed to allocate memory for the shm_unlink file name. %m"); - status = UCS_ERR_NO_MEMORY; - goto err; - } + status = uct_posix_mem_attach_common(packed_rkey->seg_id, + packed_rkey->length, + (const char*)(packed_rkey + 1), rseg); + if (status != UCS_OK) { + ucs_free(rseg); + return status; + } - status = uct_posix_set_path(file_name, mm_id & UCT_MM_POSIX_SHM_OPEN, path, - mm_id >> UCT_MM_POSIX_CTRL_BITS); - if (status != UCS_OK) { - goto out_free_file; - } + uct_mm_md_make_rkey(rseg->address, packed_rkey->address, rkey_p); + *handle_p = rseg; + return UCS_OK; +} - /* use the mmid (62 bits uuid) to recreate the file_name for unlink */ - ret = (mm_id & UCT_MM_POSIX_SHM_OPEN) ? shm_unlink(file_name) : unlink(file_name); - if (ret != 0) { - ucs_warn("unable to unlink the shared memory segment. File name is: %s", - file_name); - status = UCS_ERR_SHMEM_SEGMENT; - } +static ucs_status_t +uct_posix_rkey_release(uct_component_t *component, uct_rkey_t rkey, void *handle) +{ + uct_mm_remote_seg_t *rseg = handle; + ucs_status_t status; -out_free_file: - ucs_free(file_name); + status = uct_posix_mem_detach_common(rseg); + if (status != UCS_OK) { + return status; } -err: - return status; + ucs_free(rseg); + return UCS_OK; } -static uct_mm_mapper_ops_t uct_posix_mapper_ops = { - .query = ucs_empty_function_return_success, - .get_path_size = uct_posix_get_path_size, - .get_priority = uct_posix_get_priority, - .reg = NULL, - .dereg = NULL, - .alloc = uct_posix_alloc, - .attach = uct_posix_attach, - .detach = uct_posix_detach, - .free = uct_posix_free +static uct_mm_md_mapper_ops_t uct_posix_md_ops = { + .super = { + .close = uct_mm_md_close, + .query = uct_posix_md_query, + .mem_alloc = uct_posix_mem_alloc, + .mem_free = uct_posix_mem_free, + .mem_advise = (uct_md_mem_advise_func_t)ucs_empty_function_return_unsupported, + .mem_reg = (uct_md_mem_reg_func_t)ucs_empty_function_return_unsupported, + .mem_dereg = 
(uct_md_mem_dereg_func_t)ucs_empty_function_return_unsupported, + .mkey_pack = uct_posix_md_mkey_pack, + .is_sockaddr_accessible = (uct_md_is_sockaddr_accessible_func_t)ucs_empty_function_return_zero, + .detect_memory_type = (uct_md_detect_memory_type_func_t)ucs_empty_function_return_unsupported + }, + .query = (uct_mm_mapper_query_func_t) + ucs_empty_function_return_success, + .iface_addr_length = uct_posix_iface_addr_length, + .iface_addr_pack = uct_posix_iface_addr_pack, + .mem_attach = uct_posix_mem_attach, + .mem_detach = uct_posix_mem_detach, + .is_reachable = uct_posix_is_reachable }; -UCT_MM_COMPONENT_DEFINE(uct_posix_md, "posix", &uct_posix_mapper_ops, uct_posix, "POSIX_") -UCT_MD_REGISTER_TL(&uct_posix_md, &uct_mm_tl); +UCT_MM_TL_DEFINE(posix, &uct_posix_md_ops, uct_posix_rkey_unpack, + uct_posix_rkey_release, "POSIX_") diff --git a/src/uct/sm/mm/sysv/mm_sysv.c b/src/uct/sm/mm/sysv/mm_sysv.c index ebbfa22d377..cd84ab47c29 100644 --- a/src/uct/sm/mm/sysv/mm_sysv.c +++ b/src/uct/sm/mm/sysv/mm_sysv.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include #include @@ -14,6 +18,11 @@ #define UCT_MM_SYSV_PERM (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP) #define UCT_MM_SYSV_MSTR (UCT_MM_SYSV_PERM | IPC_CREAT | IPC_EXCL) +typedef struct uct_sysv_packed_rkey { + uint32_t shmid; + uintptr_t owner_ptr; +} UCS_S_PACKED uct_sysv_packed_rkey_t; + typedef struct uct_sysv_md_config { uct_mm_md_config_t super; } uct_sysv_md_config_t; @@ -25,113 +34,165 @@ static ucs_config_field_t uct_sysv_md_config_table[] = { {NULL} }; -static ucs_status_t -uct_sysv_alloc(uct_md_h md, size_t *length_p, ucs_ternary_value_t hugetlb, - unsigned md_map_flags, const char *alloc_name, void **address_p, - uct_mm_id_t *mmid_p, const char **path_p) +static ucs_status_t uct_sysv_md_query(uct_md_h md, uct_md_attr_t *md_attr) { - ucs_status_t status = UCS_ERR_NO_MEMORY; - int flags, shmid = 0; + uct_mm_md_query(md, md_attr, 1); + md_attr->rkey_packed_size = sizeof(uct_sysv_packed_rkey_t); + return UCS_OK; +} - flags = UCT_MM_SYSV_MSTR; +static ucs_status_t uct_sysv_mem_attach_common(int shmid, void **address_p) +{ + void *address; - if (0 == *length_p) { - ucs_error("Unexpected length %zu", *length_p); - status = UCS_ERR_INVALID_PARAM; - goto err; + address = shmat(shmid, NULL, 0); + if (address == MAP_FAILED) { + ucs_error("shmat(shmid=%d) failed: %m", shmid); + *address_p = NULL; /* GCC 8.3.1 reports error without it */ + return UCS_ERR_SHMEM_SEGMENT; } - if (!(md_map_flags & UCT_MD_MEM_FLAG_FIXED)) { - *address_p = NULL; + *address_p = address; + ucs_trace("attached remote segment %d at address %p", (int)shmid, address); + return UCS_OK; +} + +static ucs_status_t +uct_sysv_mem_alloc(uct_md_h tl_md, size_t *length_p, void **address_p, + unsigned flags, const char *alloc_name, uct_mem_h *memh_p) +{ + uct_mm_md_t *md = ucs_derived_of(tl_md, uct_mm_md_t); + ucs_status_t status; + uct_mm_seg_t *seg; + int shmid; + + status = uct_mm_seg_new(*address_p, *length_p, &seg); + if (status != 
UCS_OK) { + return status; } - if (hugetlb != UCS_NO) { - status = ucs_sysv_alloc(length_p, (*length_p) * 2, address_p, - flags | SHM_HUGETLB, alloc_name, &shmid); +#ifdef SHM_HUGETLB + if (md->config->hugetlb_mode != UCS_NO) { + status = ucs_sysv_alloc(&seg->length, seg->length * 2, &seg->address, + UCT_MM_SYSV_MSTR | SHM_HUGETLB, alloc_name, + &shmid); if (status == UCS_OK) { goto out_ok; } - ucs_debug("mm failed to allocate %zu bytes with hugetlb", *length_p); + ucs_debug("mm failed to allocate %zu bytes with hugetlb", seg->length); } +#else + status = UCS_ERR_UNSUPPORTED; +#endif - if (hugetlb != UCS_YES) { - status = ucs_sysv_alloc(length_p, SIZE_MAX, address_p, flags, alloc_name, - &shmid); + if (md->config->hugetlb_mode != UCS_YES) { + status = ucs_sysv_alloc(&seg->length, SIZE_MAX, &seg->address, + UCT_MM_SYSV_MSTR, alloc_name, &shmid); if (status == UCS_OK) { goto out_ok; } - ucs_debug("mm failed to allocate %zu bytes without hugetlb", *length_p); + ucs_debug("mm failed to allocate %zu bytes without hugetlb", seg->length); } -err: - ucs_error("failed to allocate %zu bytes with mm for %s", *length_p, alloc_name); + ucs_error("failed to allocate %zu bytes with mm for %s", seg->length, + alloc_name); + ucs_free(seg); return status; out_ok: - *mmid_p = shmid; + seg->seg_id = shmid; + *address_p = seg->address; + *length_p = seg->length; + *memh_p = seg; return UCS_OK; } -static ucs_status_t uct_sysv_attach(uct_mm_id_t mmid, size_t length, - void *remote_address, - void **local_address, - uint64_t *cookie, const char *path) +static ucs_status_t uct_sysv_mem_free(uct_md_h tl_md, uct_mem_h memh) { - void *ptr; + uct_mm_seg_t *seg = memh; + ucs_status_t status; - ptr = shmat(mmid, NULL, 0); - if (ptr == MAP_FAILED) { - ucs_error("shmat(shmid=%d) failed: %m", (int)mmid); - return UCS_ERR_SHMEM_SEGMENT; + status = ucs_sysv_free(seg->address); + if (status != UCS_OK) { + return status; } - ucs_trace("attached remote segment %d remote_address %p at address %p", - 
(int)mmid, remote_address, ptr); - *local_address = ptr; - *cookie = 0xdeadbeef; - + ucs_free(seg); return UCS_OK; } -static ucs_status_t uct_sysv_detach(uct_mm_remote_seg_t *mm_desc) +static ucs_status_t +uct_sysv_md_mkey_pack(uct_md_h md, uct_mem_h memh, void *rkey_buffer) { - ucs_status_t status = ucs_sysv_free(mm_desc->address); - if (UCS_OK != status) { - return status; - } + uct_sysv_packed_rkey_t *packed_rkey = rkey_buffer; + const uct_mm_seg_t *seg = memh; + packed_rkey->shmid = seg->seg_id; + packed_rkey->owner_ptr = (uintptr_t)seg->address; return UCS_OK; } -static ucs_status_t uct_sysv_free(void *address, uct_mm_id_t mm_id, size_t length, - const char *path) +static ucs_status_t uct_sysv_mem_attach(uct_mm_md_t *md, uct_mm_seg_id_t seg_id, + size_t length, const void *iface_addr, + uct_mm_remote_seg_t *rseg) +{ + return uct_sysv_mem_attach_common(seg_id, &rseg->address); +} + +static void uct_sysv_mem_detach(uct_mm_md_t *md, const uct_mm_remote_seg_t *rseg) { - return ucs_sysv_free(address); + ucs_sysv_free(rseg->address); } -static size_t uct_sysv_get_path_size(uct_md_h md) +static ucs_status_t +uct_sysv_rkey_unpack(uct_component_t *component, const void *rkey_buffer, + uct_rkey_t *rkey_p, void **handle_p) { - return 0; + const uct_sysv_packed_rkey_t *packed_rkey = rkey_buffer; + ucs_status_t status; + void *address; + + status = uct_sysv_mem_attach_common(packed_rkey->shmid, &address); + if (status != UCS_OK) { + return status; + } + + *handle_p = address; + uct_mm_md_make_rkey(address, packed_rkey->owner_ptr, rkey_p); + return UCS_OK; } -static uint8_t uct_sysv_get_priority() +static ucs_status_t +uct_sysv_rkey_release(uct_component_t *component, uct_rkey_t rkey, void *handle) { - return 0; + return ucs_sysv_free(handle); } -static uct_mm_mapper_ops_t uct_sysv_mapper_ops = { - .query = ucs_empty_function_return_success, - .get_path_size = uct_sysv_get_path_size, - .get_priority = uct_sysv_get_priority, - .reg = NULL, - .dereg = NULL, - .alloc = 
uct_sysv_alloc, - .attach = uct_sysv_attach, - .detach = uct_sysv_detach, - .free = uct_sysv_free +static uct_mm_md_mapper_ops_t uct_sysv_md_ops = { + .super = { + .close = uct_mm_md_close, + .query = uct_sysv_md_query, + .mem_alloc = uct_sysv_mem_alloc, + .mem_free = uct_sysv_mem_free, + .mem_advise = (uct_md_mem_advise_func_t)ucs_empty_function_return_unsupported, + .mem_reg = (uct_md_mem_reg_func_t)ucs_empty_function_return_unsupported, + .mem_dereg = (uct_md_mem_dereg_func_t)ucs_empty_function_return_unsupported, + .mkey_pack = uct_sysv_md_mkey_pack, + .is_sockaddr_accessible = (uct_md_is_sockaddr_accessible_func_t)ucs_empty_function_return_zero, + .detect_memory_type = (uct_md_detect_memory_type_func_t)ucs_empty_function_return_unsupported + }, + .query = (uct_mm_mapper_query_func_t) + ucs_empty_function_return_success, + .iface_addr_length = (uct_mm_mapper_iface_addr_length_func_t) + ucs_empty_function_return_zero_int64, + .iface_addr_pack = (uct_mm_mapper_iface_addr_pack_func_t) + ucs_empty_function_return_success, + .mem_attach = uct_sysv_mem_attach, + .mem_detach = uct_sysv_mem_detach, + .is_reachable = (uct_mm_mapper_is_reachable_func_t)ucs_empty_function_return_one }; -UCT_MM_COMPONENT_DEFINE(uct_sysv_md, "sysv", &uct_sysv_mapper_ops, uct_sysv, "SYSV_") -UCT_MD_REGISTER_TL(&uct_sysv_md, &uct_mm_tl); +UCT_MM_TL_DEFINE(sysv, &uct_sysv_md_ops, uct_sysv_rkey_unpack, + uct_sysv_rkey_release, "SYSV_") diff --git a/src/uct/sm/mm/xpmem/Makefile.am b/src/uct/sm/mm/xpmem/Makefile.am index 1cb65a618a0..cb1fa8ee23a 100644 --- a/src/uct/sm/mm/xpmem/Makefile.am +++ b/src/uct/sm/mm/xpmem/Makefile.am @@ -1,16 +1,17 @@ # # Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. +# Copyright (C) UChicago Argonne, LLC. 2019. ALL RIGHTS RESERVED. # See file LICENSE for terms. 
# if HAVE_XPMEM module_LTLIBRARIES = libuct_xpmem.la -libuct_xpmem_la_CFLAGS = $(BASE_CFLAGS) -libuct_xpmem_la_CPPFLAGS = $(BASE_CPPFLAGS) $(XPMEM_CPPFLAGS) +libuct_xpmem_la_CFLAGS = $(BASE_CFLAGS) $(XPMEM_CFLAGS) +libuct_xpmem_la_CPPFLAGS = $(BASE_CPPFLAGS) libuct_xpmem_la_LIBADD = $(top_builddir)/src/ucs/libucs.la \ $(top_builddir)/src/uct/libuct.la -libuct_xpmem_la_LDFLAGS = $(XPMEM_LDFLAGS) -version-info $(SOVERSION) +libuct_xpmem_la_LDFLAGS = $(XPMEM_LIBS) -version-info $(SOVERSION) libuct_xpmem_la_SOURCES = mm_xpmem.c include $(top_srcdir)/config/module.am diff --git a/src/uct/sm/mm/xpmem/configure.m4 b/src/uct/sm/mm/xpmem/configure.m4 index 48e63723610..b5d8f3b52c7 100644 --- a/src/uct/sm/mm/xpmem/configure.m4 +++ b/src/uct/sm/mm/xpmem/configure.m4 @@ -13,12 +13,11 @@ AS_IF([test "x$with_xpmem" != "xno"], [AS_IF([test ! -d "$with_xpmem"], [ AC_MSG_NOTICE([XPMEM - failed to open the requested location ($with_xpmem), guessing ...]) - PKG_CHECK_MODULES( - [CRAY_XPMEM], [cray-xpmem], + AS_IF([$PKG_CONFIG --exists cray-xpmem], [ xpmem_happy=yes - AC_SUBST(XPMEM_CPPFLAGS, "$CRAY_XPMEM_CFLAGS") - AC_SUBST(XPMEM_LDFLAGS, "$CRAY_XPMEM_LIBS") + AC_SUBST(XPMEM_CFLAGS, [`$PKG_CONFIG --cflags cray-xpmem`]) + AC_SUBST(XPMEM_LIBS, [`$PKG_CONFIG --libs cray-xpmem`]) ], [ # If cray-xpmem module not found in pkg-config, try to search @@ -29,15 +28,29 @@ AS_IF([test "x$with_xpmem" != "xno"], ]) ]) -# Verify XPMEM header file +# Verify XPMEM lib and header files AS_IF([test "x$xpmem_happy" = "xno" -a -d "$with_xpmem"], - [AC_CHECK_HEADER([$with_xpmem/include/xpmem.h], - [AC_SUBST(XPMEM_CPPFLAGS, "-I$with_xpmem/include") - AC_SUBST(XPMEM_LDFLAGS, "-L$with_xpmem/lib -lxpmem") + [ + save_LDFLAGS="$LDFLAGS" + save_CPPFLAGS="$CPPFLAGS" + LDFLAGS="$LDFLAGS -L$with_xpmem/lib -lxpmem" + CPPFLAGS="$CPPFLAGS -I$with_xpmem/include" + AC_CHECK_LIB([xpmem], + [xpmem_init], + [AC_CHECK_HEADER([xpmem.h], + [AC_SUBST(XPMEM_CFLAGS, "-I$with_xpmem/include") + AC_SUBST(XPMEM_LIBS, 
"-L$with_xpmem/lib -lxpmem") xpmem_happy="yes"], - [AC_MSG_WARN([cray-xpmem header was not found in $with_xpmem])]) - ]) + [AC_MSG_WARN([cray-xpmem header was not found in $with_xpmem])] + ) + ], + [AC_MSG_WARN([cray-xpmem lib was not found in $with_xpmem])] + ) + LDFLAGS="$save_LDFLAGS" + CPPFLAGS="$save_CPPFLAGS" + ] + ) -AS_IF([test "x$xpmem_happy" = "xyes"], [uct_modules+=":xpmem"]) +AS_IF([test "x$xpmem_happy" = "xyes"], [uct_modules="${uct_modules}:xpmem"]) AM_CONDITIONAL([HAVE_XPMEM], [test "x$xpmem_happy" != "xno"]) AC_CONFIG_FILES([src/uct/sm/mm/xpmem/Makefile]) diff --git a/src/uct/sm/mm/xpmem/mm_xpmem.c b/src/uct/sm/mm/xpmem/mm_xpmem.c index 72422e623a5..de229339b34 100644 --- a/src/uct/sm/mm/xpmem/mm_xpmem.c +++ b/src/uct/sm/mm/xpmem/mm_xpmem.c @@ -5,216 +5,551 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "xpmem.h" #include #include +#include #include +#include +#include +#include #include +/* XPMEM memory domain configuration */ +typedef struct uct_xpmem_md_config { + uct_mm_md_config_t super; +} uct_xpmem_md_config_t; + +/* Remote process memory */ +typedef struct uct_xpmem_remote_mem { + xpmem_apid_t apid; + xpmem_segid_t xsegid; + ucs_rcache_t *rcache; + int refcount; +} uct_xpmem_remote_mem_t; + +/* Cache entry for remote memory region */ +typedef struct uct_xpmem_remote_region { + ucs_rcache_region_t super; + void *attach_address; + uct_xpmem_remote_mem_t *rmem; +} uct_xpmem_remote_region_t; + +typedef struct uct_xpmem_iface_addr { + xpmem_segid_t xsegid; +} UCS_S_PACKED uct_xpmem_iface_addr_t; + +typedef struct uct_xpmem_packed_rkey { + xpmem_segid_t xsegid; + uintptr_t address; + size_t length; +} UCS_S_PACKED uct_xpmem_packed_rkey_t; + +KHASH_INIT(xpmem_remote_mem, xpmem_segid_t, uct_xpmem_remote_mem_t*, 1, + kh_int64_hash_func, kh_int64_hash_equal) + +/* Global XPMEM segment which maps the entire process virtual address space */ +static ucs_init_once_t uct_xpmem_global_seg_init_once = 
UCS_INIT_ONCE_INITIALIZER; +static xpmem_segid_t uct_xpmem_global_xsegid = -1; + +/* Hash of remote regions */ +static khash_t(xpmem_remote_mem) uct_xpmem_remote_mem_hash; +static ucs_recursive_spinlock_t uct_xpmem_remote_mem_lock; + +static ucs_config_field_t uct_xpmem_md_config_table[] = { + {"MM_", "", NULL, + ucs_offsetof(uct_xpmem_md_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_mm_md_config_table)}, + + {NULL} +}; + +UCS_STATIC_INIT { + ucs_recursive_spinlock_init(&uct_xpmem_remote_mem_lock, 0); + kh_init_inplace(xpmem_remote_mem, &uct_xpmem_remote_mem_hash); +} + +UCS_STATIC_CLEANUP { + uct_xpmem_remote_mem_t *rmem; + ucs_status_t status; + + kh_foreach_value(&uct_xpmem_remote_mem_hash, rmem, { + ucs_warn("remote segment id %lx apid %lx is not released, refcount %d", + (unsigned long)rmem->xsegid, (unsigned long)rmem->apid, + rmem->refcount); + }) + kh_destroy_inplace(xpmem_remote_mem, &uct_xpmem_remote_mem_hash); + + status = ucs_recursive_spinlock_destroy(&uct_xpmem_remote_mem_lock); + if (status != UCS_OK) { + ucs_warn("ucs_recursive_spinlock_destroy() failed: %s", + ucs_status_string(status)); + } +} + static ucs_status_t uct_xpmem_query() { int version; version = xpmem_version(); if (version < 0) { - ucs_debug("Failed to query XPMEM version %d, %m", version); + ucs_debug("xpmem_version() returned %d (%m), xpmem is unavailable", + version); return UCS_ERR_UNSUPPORTED; } + + ucs_debug("xpmem version: %d", version); return UCS_OK; } -static size_t uct_xpmem_get_path_size(uct_md_h md) +static ucs_status_t uct_xpmem_md_query(uct_md_h md, uct_md_attr_t *md_attr) { - return 0; + uct_mm_md_query(md, md_attr, 0); + + md_attr->cap.flags |= UCT_MD_FLAG_REG; + md_attr->reg_cost = ucs_linear_func_make(60.0e-9, 0); + md_attr->cap.max_reg = ULONG_MAX; + md_attr->cap.reg_mem_types = UCS_MEMORY_TYPES_CPU_ACCESSIBLE; + md_attr->rkey_packed_size = sizeof(uct_xpmem_packed_rkey_t); + + return UCS_OK; } -static uint8_t uct_xpmem_get_priority() +static UCS_F_ALWAYS_INLINE 
size_t +uct_xpmem_rcache_region_length(uct_xpmem_remote_region_t *xpmem_region) { - return 0; + return xpmem_region->super.super.end - xpmem_region->super.super.start; } -static ucs_status_t uct_xmpem_reg(void *address, size_t size, uct_mm_id_t *mmid_p) +static ucs_status_t +uct_xpmem_rcache_mem_reg(void *context, ucs_rcache_t *rcache, void *arg, + ucs_rcache_region_t *region, uint16_t flags) { - xpmem_segid_t segid; - void *start, *end; - - start = ucs_align_down_pow2_ptr(address, ucs_get_page_size()); - end = ucs_align_up_pow2_ptr(address + size, ucs_get_page_size()); - ucs_assert_always(start <= end); - - segid = xpmem_make(start, end - start, XPMEM_PERMIT_MODE, (void*)0666); - VALGRIND_MAKE_MEM_DEFINED(&segid, sizeof(segid)); - if (segid < 0) { - ucs_error("Failed to register %p..%p with xpmem: %m", - start, end); + uct_xpmem_remote_mem_t *rmem = context; + uct_xpmem_remote_region_t *xpmem_region = + ucs_derived_of(region, uct_xpmem_remote_region_t); + struct xpmem_addr addr; + size_t length; + + addr.apid = rmem->apid; + addr.offset = xpmem_region->super.super.start; + length = uct_xpmem_rcache_region_length(xpmem_region); + + xpmem_region->attach_address = xpmem_attach(addr, length, NULL); + VALGRIND_MAKE_MEM_DEFINED(&xpmem_region->attach_address, + sizeof(xpmem_region->attach_address)); + if (xpmem_region->attach_address == MAP_FAILED) { + ucs_error("failed to attach xpmem apid 0x%lx offset 0x%lx length %zu: %m", + (unsigned long)addr.apid, addr.offset, length); return UCS_ERR_IO_ERROR; } - ucs_trace("xpmem registered %p..%p segment 0x%llx", start, end, segid); - *mmid_p = segid; + xpmem_region->rmem = rmem; + + ucs_trace("xpmem attached apid 0x%lx offset 0x%lx length %zu at %p", + (unsigned long)addr.apid, addr.offset, length, + xpmem_region->attach_address); + + VALGRIND_MAKE_MEM_DEFINED(xpmem_region->attach_address, length); return UCS_OK; } -static ucs_status_t uct_xpmem_dereg(uct_mm_id_t mmid) +static void uct_xpmem_rcache_mem_dereg(void *context, 
ucs_rcache_t *rcache, + ucs_rcache_region_t *region) { + uct_xpmem_remote_region_t *xpmem_region = + ucs_derived_of(region, uct_xpmem_remote_region_t); int ret; - ret = xpmem_remove(mmid); + ucs_trace("xpmem detaching address %p", xpmem_region->attach_address); + + ret = xpmem_detach(xpmem_region->attach_address); if (ret < 0) { - /* No error since there a chance that it already was released - * or deregistered */ - ucs_debug("Failed to remove xpmem segment 0x%"PRIx64": %m", mmid); + ucs_warn("Failed to xpmem_detach: %m"); + } + + xpmem_region->attach_address = NULL; + xpmem_region->rmem = NULL; +} + +static void uct_xpmem_rcache_dump_region(void *context, ucs_rcache_t *rcache, + ucs_rcache_region_t *region, char *buf, + size_t max) +{ + uct_xpmem_remote_mem_t *rmem = context; + uct_xpmem_remote_region_t *xpmem_region = + ucs_derived_of(region, uct_xpmem_remote_region_t); + + snprintf(buf, max, "apid 0x%lx attach_addr %p rmem %p", + (unsigned long)rmem->apid, xpmem_region->attach_address, rmem); +} + +static ucs_rcache_ops_t uct_xpmem_rcache_ops = { + .mem_reg = uct_xpmem_rcache_mem_reg, + .mem_dereg = uct_xpmem_rcache_mem_dereg, + .dump_region = uct_xpmem_rcache_dump_region +}; + +static UCS_F_NOINLINE ucs_status_t +uct_xpmem_make_global_xsegid(xpmem_segid_t *xsegid_p) +{ + /* double-checked locking */ + UCS_INIT_ONCE(&uct_xpmem_global_seg_init_once) { + if (uct_xpmem_global_xsegid < 0) { + uct_xpmem_global_xsegid = xpmem_make(0, XPMEM_MAXADDR_SIZE, + XPMEM_PERMIT_MODE, (void*)0600); + VALGRIND_MAKE_MEM_DEFINED(&uct_xpmem_global_xsegid, + sizeof(uct_xpmem_global_xsegid)); + } } - ucs_trace("xpmem removed segment 0x%"PRIx64, mmid); + if (uct_xpmem_global_xsegid < 0) { + ucs_error("xpmem failed to register process address space: %m"); + return UCS_ERR_IO_ERROR; + } + + ucs_debug("xpmem registered global segment id 0x%lx", + (unsigned long)uct_xpmem_global_xsegid); + *xsegid_p = uct_xpmem_global_xsegid; return UCS_OK; } -static ucs_status_t 
uct_xpmem_attach(uct_mm_id_t mmid, size_t length, - void *remote_address, void **local_address, - uint64_t *cookie, const char *path) +static UCS_F_ALWAYS_INLINE ucs_status_t +uct_xpmem_get_global_xsegid(xpmem_segid_t *xsegid_p) { - struct xpmem_addr addr; - ucs_status_t status; - ptrdiff_t offset; - void *address; + if (ucs_unlikely(uct_xpmem_global_xsegid < 0)) { + return uct_xpmem_make_global_xsegid(xsegid_p); + } - addr.offset = 0; - addr.apid = xpmem_get(mmid, XPMEM_RDWR, XPMEM_PERMIT_MODE, NULL); - VALGRIND_MAKE_MEM_DEFINED(&addr.apid, sizeof(addr.apid)); - if (addr.apid < 0) { - ucs_error("Failed to acquire xpmem segment 0x%"PRIx64": %m", mmid); - status = UCS_ERR_IO_ERROR; - goto err_xget; + *xsegid_p = uct_xpmem_global_xsegid; + return UCS_OK; +} + +/* lock must be held */ +static UCS_F_NOINLINE ucs_status_t +uct_xpmem_rmem_add(xpmem_segid_t xsegid, uct_xpmem_remote_mem_t **rmem_p) +{ + ucs_rcache_params_t rcache_params; + uct_xpmem_remote_mem_t *rmem; + ucs_status_t status; + khiter_t khiter; + int khret; + + rmem = ucs_malloc(sizeof(*rmem), "xpmem_rmem"); + if (rmem == NULL) { + ucs_error("failed to allocate xpmem rmem"); + status = UCS_ERR_NO_MEMORY; + goto err; } - ucs_trace("xpmem acquired segment 0x%"PRIx64" apid 0x%llx remote_address %p", - mmid, addr.apid, remote_address); + rmem->refcount = 0; + rmem->xsegid = xsegid; + + rmem->apid = xpmem_get(xsegid, XPMEM_RDWR, XPMEM_PERMIT_MODE, NULL); + VALGRIND_MAKE_MEM_DEFINED(&rmem->apid, sizeof(rmem->apid)); + if (rmem->apid < 0) { + ucs_error("xpmem_get(segid=0x%lx) failed: %m", (unsigned long)xsegid); + status = UCS_ERR_SHMEM_SEGMENT; + goto err_free; + } - offset = ((uintptr_t)remote_address) % ucs_get_page_size(); - address = xpmem_attach(addr, length + offset, NULL); - VALGRIND_MAKE_MEM_DEFINED(&address, sizeof(address)); - if (address == MAP_FAILED) { - ucs_error("Failed to attach xpmem segment 0x%"PRIx64" apid 0x%llx " - "with length %zu: %m", mmid, addr.apid, length); - status = UCS_ERR_IO_ERROR; 
- goto err_xattach; + rcache_params.region_struct_size = sizeof(uct_xpmem_remote_region_t); + rcache_params.alignment = ucs_get_page_size(); + rcache_params.max_alignment = ucs_get_page_size(); + rcache_params.ucm_events = 0; + rcache_params.ucm_event_priority = 0; + rcache_params.ops = &uct_xpmem_rcache_ops; + rcache_params.context = rmem; + rcache_params.flags = UCS_RCACHE_FLAG_NO_PFN_CHECK; + + status = ucs_rcache_create(&rcache_params, "xpmem_remote_mem", + ucs_stats_get_root(), &rmem->rcache); + if (status != UCS_OK) { + ucs_error("failed to create xpmem remote cache: %s", + ucs_status_string(status)); + goto err_release_seg; } - VALGRIND_MAKE_MEM_DEFINED(address + offset, length); + khiter = kh_put(xpmem_remote_mem, &uct_xpmem_remote_mem_hash, xsegid, + &khret); + ucs_assertv_always((khret == 1) || (khret == 2), "khret=%d", khret); + ucs_assert_always (khiter != kh_end(&uct_xpmem_remote_mem_hash)); + kh_val(&uct_xpmem_remote_mem_hash, khiter) = rmem; - *local_address = address + offset; - *cookie = addr.apid; + ucs_trace("xpmem attached to remote segment id 0x%lx apid 0x%lx rcache %p", + (unsigned long)xsegid, (unsigned long)rmem->apid, rmem->rcache); - ucs_trace("xpmem attached segment 0x%"PRIx64" apid 0x%llx %p..%p at %p (+%zd)", - mmid, addr.apid, remote_address, remote_address + length, address, offset); + *rmem_p = rmem; return UCS_OK; -err_xattach: - xpmem_release(addr.apid); -err_xget: +err_release_seg: + xpmem_release(rmem->apid); +err_free: + ucs_free(rmem); +err: return status; } -static ucs_status_t uct_xpmem_detach(uct_mm_remote_seg_t *mm_desc) +/* lock must be held */ +static UCS_F_NOINLINE void +uct_xpmem_rmem_del(uct_xpmem_remote_mem_t *rmem) { - xpmem_apid_t apid = mm_desc->cookie; - void *address; + khiter_t khiter; int ret; - address = ucs_align_down_pow2_ptr(mm_desc->address, ucs_get_page_size()); + ucs_assert(rmem->refcount == 0); - ucs_trace("xpmem detaching address %p", address); - ret = xpmem_detach(address); - if (ret < 0) { - 
ucs_error("Failed to xpmem_detach: %m"); - return UCS_ERR_IO_ERROR; + ucs_trace("detaching remote segment rmem %p apid %lx", rmem, + (unsigned long)rmem->apid); + + khiter = kh_get(xpmem_remote_mem, &uct_xpmem_remote_mem_hash, rmem->xsegid); + ucs_assert(kh_val(&uct_xpmem_remote_mem_hash, khiter) == rmem); + kh_del(xpmem_remote_mem, &uct_xpmem_remote_mem_hash, khiter); + + ucs_rcache_destroy(rmem->rcache); + + ret = xpmem_release(rmem->apid); + if (ret) { + ucs_warn("xpmem_release(apid=0x%lx) failed: %m", + (unsigned long)rmem->apid); } - VALGRIND_MAKE_MEM_UNDEFINED(mm_desc->address, mm_desc->length); + ucs_free(rmem); +} - ucs_trace("xpmem releasing segment apid 0x%llx", apid); - ret = xpmem_release(apid); - if (ret < 0) { - ucs_error("Failed to release xpmem segment apid 0x%llx", apid); - return UCS_ERR_IO_ERROR; +static ucs_status_t +uct_xpmem_rmem_get(xpmem_segid_t xsegid, uct_xpmem_remote_mem_t **rmem_p) +{ + uct_xpmem_remote_mem_t *rmem; + ucs_status_t status; + khiter_t khiter; + + ucs_recursive_spin_lock(&uct_xpmem_remote_mem_lock); + + khiter = kh_get(xpmem_remote_mem, &uct_xpmem_remote_mem_hash, xsegid); + if (ucs_likely(khiter != kh_end(&uct_xpmem_remote_mem_hash))) { + rmem = kh_val(&uct_xpmem_remote_mem_hash, khiter); + } else { + status = uct_xpmem_rmem_add(xsegid, &rmem); + if (status != UCS_OK) { + *rmem_p = NULL; + goto out_unlock; + } } + ++rmem->refcount; + *rmem_p = rmem; + status = UCS_OK; + +out_unlock: + ucs_recursive_spin_unlock(&uct_xpmem_remote_mem_lock); + return status; +} + +static void uct_xpmem_rmem_put(uct_xpmem_remote_mem_t *rmem) +{ + ucs_recursive_spin_lock(&uct_xpmem_remote_mem_lock); + if (--rmem->refcount == 0) { + uct_xpmem_rmem_del(rmem); + } + ucs_recursive_spin_unlock(&uct_xpmem_remote_mem_lock); +} + +static ucs_status_t +uct_xpmem_mem_attach_common(xpmem_segid_t xsegid, uintptr_t remote_address, + size_t length, uct_xpmem_remote_region_t **region_p) +{ + ucs_rcache_region_t *rcache_region; + uct_xpmem_remote_mem_t *rmem; 
+ uintptr_t start, end; + ucs_status_t status; + + status = uct_xpmem_rmem_get(xsegid, &rmem); + if (status != UCS_OK) { + goto err; + } + + start = ucs_align_down_pow2(remote_address, ucs_get_page_size()); + end = ucs_align_up_pow2 (remote_address + length, ucs_get_page_size()); + + status = ucs_rcache_get(rmem->rcache, (void*)start, end - start, + PROT_READ|PROT_WRITE, NULL, &rcache_region); + if (status != UCS_OK) { + goto err_rmem_put; + } + + *region_p = ucs_derived_of(rcache_region, uct_xpmem_remote_region_t); return UCS_OK; + +err_rmem_put: + uct_xpmem_rmem_put(rmem); +err: + return status; } -static ucs_status_t uct_xpmem_alloc(uct_md_h md, size_t *length_p, - ucs_ternary_value_t hugetlb, - unsigned md_map_flags, const char *alloc_name, - void **address_p, uct_mm_id_t *mmid_p, - const char **path_p) +static void uct_xpmem_mem_detach_common(uct_xpmem_remote_region_t *xpmem_region) +{ + uct_xpmem_remote_mem_t *rmem = xpmem_region->rmem; + + ucs_rcache_region_put(rmem->rcache, &xpmem_region->super); + uct_xpmem_rmem_put(rmem); +} + +static ucs_status_t uct_xmpem_mem_reg(uct_md_h md, void *address, size_t length, + unsigned flags, uct_mem_h *memh_p) { ucs_status_t status; - int mmap_flags; + uct_mm_seg_t *seg; - if (0 == *length_p) { - ucs_error("Unexpected length %zu", *length_p); - status = UCS_ERR_INVALID_PARAM; - goto out; + status = uct_mm_seg_new(address, length, &seg); + if (status != UCS_OK) { + return status; } - if (md_map_flags & UCT_MD_MEM_FLAG_FIXED) { - mmap_flags = MAP_FIXED; - } else { - *address_p = NULL; - mmap_flags = 0; + seg->seg_id = (uintptr_t)address; /* to be used by mem_attach */ + *memh_p = seg; + return UCS_OK; +} + +static ucs_status_t uct_xmpem_mem_dereg(uct_md_h md, uct_mem_h memh) +{ + uct_mm_seg_t *seg = memh; + ucs_free(seg); + return UCS_OK; +} + +static ucs_status_t +uct_xpmem_mkey_pack(uct_md_h md, uct_mem_h memh, void *rkey_buffer) +{ + uct_mm_seg_t *seg = memh; + uct_xpmem_packed_rkey_t *packed_rkey = rkey_buffer; + 
xpmem_segid_t xsegid; + ucs_status_t status; + + ucs_assert((uintptr_t)seg->address == seg->seg_id); /* sanity */ + + status = uct_xpmem_get_global_xsegid(&xsegid); + if (status != UCS_OK) { + return status; } - /* TBD: any ideas for better allocation */ - status = ucs_mmap_alloc(length_p, address_p, mmap_flags UCS_MEMTRACK_VAL); + packed_rkey->xsegid = xsegid; + packed_rkey->address = (uintptr_t)seg->address; + packed_rkey->length = seg->length; + return UCS_OK; +} + +static size_t uct_xpmem_iface_addr_length(uct_mm_md_t *md) +{ + return sizeof(uct_xpmem_iface_addr_t); +} + +static ucs_status_t uct_xpmem_iface_addr_pack(uct_mm_md_t *md, void *buffer) +{ + uct_xpmem_iface_addr_t *xpmem_iface_addr = buffer; + xpmem_segid_t xsegid; + ucs_status_t status; + + status = uct_xpmem_get_global_xsegid(&xsegid); if (status != UCS_OK) { - ucs_error("Failed to allocate %zu bytes of memory for %s", *length_p, - alloc_name); - goto out; + return status; } - ucs_trace("xpmem allocated address %p length %zu for %s", *address_p, - *length_p, alloc_name); + xpmem_iface_addr->xsegid = xsegid; + return UCS_OK; +} + +static ucs_status_t uct_xpmem_mem_attach(uct_mm_md_t *md, uct_mm_seg_id_t seg_id, + size_t length, const void *iface_addr, + uct_mm_remote_seg_t *rseg) +{ + const uct_xpmem_iface_addr_t *xpmem_iface_addr = iface_addr; + uintptr_t remote_address = seg_id; + uct_xpmem_remote_region_t *xpmem_region; + ucs_status_t status; + ptrdiff_t offset; - status = uct_xmpem_reg(*address_p, *length_p, mmid_p); - if (UCS_OK != status) { - ucs_free(*address_p); - goto out; + ucs_assert(xpmem_iface_addr != NULL); + status = uct_xpmem_mem_attach_common(xpmem_iface_addr->xsegid, + remote_address, length, &xpmem_region); + if (status != UCS_OK) { + return status; } - VALGRIND_MAKE_MEM_DEFINED(*address_p, *length_p); - status = UCS_OK; + /* In order to obtain the local access address of the remote segment + * (rseg->address), we need to calculate its offset from the beginning of the + * region 
on remote side (offset), and then add it to the local base address + * of the attached region (xpmem_region->attach_address). + */ + offset = remote_address - xpmem_region->super.super.start; + rseg->address = UCS_PTR_BYTE_OFFSET(xpmem_region->attach_address, offset); + rseg->cookie = xpmem_region; -out: - return status; + return UCS_OK; +} + +static void uct_xpmem_mem_detach(uct_mm_md_t *md, + const uct_mm_remote_seg_t *rseg) +{ + uct_xpmem_mem_detach_common(rseg->cookie); } -static ucs_status_t uct_xpmem_free(void *address, uct_mm_id_t mmid, size_t length, - const char *path) +static ucs_status_t +uct_xpmem_rkey_unpack(uct_component_t *component, const void *rkey_buffer, + uct_rkey_t *rkey_p, void **handle_p) { + const uct_xpmem_packed_rkey_t *packed_rkey = rkey_buffer; + uct_xpmem_remote_region_t *xpmem_region; ucs_status_t status; - status = uct_xpmem_dereg(mmid); - if (UCS_OK != status) { + status = uct_xpmem_mem_attach_common(packed_rkey->xsegid, + packed_rkey->address, + packed_rkey->length, + &xpmem_region); + if (status != UCS_OK) { return status; } - return ucs_mmap_free(address, length); + uct_mm_md_make_rkey(xpmem_region->attach_address, + xpmem_region->super.super.start, rkey_p); + *handle_p = xpmem_region; + + return UCS_OK; +} + +static ucs_status_t +uct_xpmem_rkey_release(uct_component_t *component, uct_rkey_t rkey, void *handle) +{ + uct_xpmem_mem_detach_common(handle); + return UCS_OK; } -static uct_mm_mapper_ops_t uct_xpmem_mapper_ops = { - .query = uct_xpmem_query, - .get_path_size = uct_xpmem_get_path_size, - .get_priority = uct_xpmem_get_priority, - .reg = uct_xmpem_reg, - .dereg = uct_xpmem_dereg, - .alloc = uct_xpmem_alloc, - .attach = uct_xpmem_attach, - .detach = uct_xpmem_detach, - .free = uct_xpmem_free +static uct_mm_md_mapper_ops_t uct_xpmem_md_ops = { + .super = { + .close = uct_mm_md_close, + .query = uct_xpmem_md_query, + .mem_alloc = (uct_md_mem_alloc_func_t)ucs_empty_function_return_unsupported, + .mem_free = 
(uct_md_mem_free_func_t)ucs_empty_function_return_unsupported, + .mem_advise = (uct_md_mem_advise_func_t)ucs_empty_function_return_unsupported, + .mem_reg = uct_xmpem_mem_reg, + .mem_dereg = uct_xmpem_mem_dereg, + .mkey_pack = uct_xpmem_mkey_pack, + .is_sockaddr_accessible = (uct_md_is_sockaddr_accessible_func_t)ucs_empty_function_return_zero, + .detect_memory_type = (uct_md_detect_memory_type_func_t)ucs_empty_function_return_unsupported + }, + .query = uct_xpmem_query, + .iface_addr_length = uct_xpmem_iface_addr_length, + .iface_addr_pack = uct_xpmem_iface_addr_pack, + .mem_attach = uct_xpmem_mem_attach, + .mem_detach = uct_xpmem_mem_detach, + .is_reachable = (uct_mm_mapper_is_reachable_func_t)ucs_empty_function_return_one }; -UCT_MM_COMPONENT_DEFINE(uct_xpmem_md, "xpmem", &uct_xpmem_mapper_ops, uct, "XPMEM_") -UCT_MD_REGISTER_TL(&uct_xpmem_md, &uct_mm_tl); +UCT_MM_TL_DEFINE(xpmem, &uct_xpmem_md_ops, uct_xpmem_rkey_unpack, + uct_xpmem_rkey_release, "XPMEM_") diff --git a/src/uct/sm/scopy/Makefile.am b/src/uct/sm/scopy/Makefile.am new file mode 100644 index 00000000000..4cb971e2180 --- /dev/null +++ b/src/uct/sm/scopy/Makefile.am @@ -0,0 +1,6 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. +# See file LICENSE for terms. +# + +SUBDIRS = cma knem diff --git a/src/uct/sm/scopy/base/scopy_ep.c b/src/uct/sm/scopy/base/scopy_ep.c new file mode 100644 index 00000000000..99cf3285af9 --- /dev/null +++ b/src/uct/sm/scopy/base/scopy_ep.c @@ -0,0 +1,205 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "scopy_iface.h" +#include "scopy_ep.h" + +#include + + +const char* uct_scopy_tx_op_str[] = { + [UCT_SCOPY_TX_PUT_ZCOPY] = "uct_scopy_ep_put_zcopy", + [UCT_SCOPY_TX_GET_ZCOPY] = "uct_scopy_ep_get_zcopy" +}; + +UCS_CLASS_INIT_FUNC(uct_scopy_ep_t, const uct_ep_params_t *params) +{ + uct_scopy_iface_t *iface = ucs_derived_of(params->iface, uct_scopy_iface_t); + + UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super.super); + + ucs_arbiter_group_init(&self->arb_group); + + return UCS_OK; +} + +static UCS_CLASS_CLEANUP_FUNC(uct_scopy_ep_t) +{ + ucs_arbiter_group_cleanup(&self->arb_group); +} + +UCS_CLASS_DEFINE(uct_scopy_ep_t, uct_base_ep_t) + +static UCS_F_ALWAYS_INLINE void +uct_scopy_ep_tx_init_common(uct_scopy_tx_t *tx, uct_scopy_tx_op_t tx_op, + uct_completion_t *comp) +{ + tx->comp = comp; + tx->op = tx_op; + ucs_arbiter_elem_init(&tx->arb_elem); +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +uct_scopy_ep_tx_init(uct_ep_h tl_ep, const uct_iov_t *iov, + size_t iov_cnt, uint64_t remote_addr, + uct_rkey_t rkey, uct_completion_t *comp, + uct_scopy_tx_op_t tx_op) +{ + uct_scopy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_scopy_iface_t); + uct_scopy_ep_t *ep = ucs_derived_of(tl_ep, uct_scopy_ep_t); + uct_scopy_tx_t *tx; + size_t iov_it; + + ucs_assert((tx_op == UCT_SCOPY_TX_PUT_ZCOPY) || + (tx_op == UCT_SCOPY_TX_GET_ZCOPY)); + + UCT_CHECK_IOV_SIZE(iov_cnt, iface->config.max_iov, uct_scopy_tx_op_str[tx_op]); + + tx = ucs_mpool_get_inline(&iface->tx_mpool); + if (ucs_unlikely(tx == NULL)) { + return UCS_ERR_NO_MEMORY; + } + + uct_scopy_ep_tx_init_common(tx, tx_op, comp); + tx->rkey = rkey; + tx->remote_addr = remote_addr; + tx->iov_cnt = 0; + ucs_iov_iter_init(&tx->iov_iter); + for (iov_it = 0; iov_it < iov_cnt; iov_it++) { + if (uct_iov_get_length(&iov[iov_it]) == 0) { + /* Avoid zero-length IOV elements */ + continue; + } + + tx->iov[tx->iov_cnt] = iov[iov_it]; + tx->iov_cnt++; + } + + if 
(tx_op == UCT_SCOPY_TX_PUT_ZCOPY) { + UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), PUT, ZCOPY, + uct_iov_total_length(tx->iov, tx->iov_cnt)); + } else { + UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, ZCOPY, + uct_iov_total_length(tx->iov, tx->iov_cnt)); + } + + if (tx->iov_cnt == 0) { + uct_scopy_trace_data(tx); + ucs_mpool_put_inline(tx); + return UCS_OK; + } + + if (ucs_unlikely(ucs_arbiter_is_empty(&iface->arbiter))) { + uct_worker_progress_register_safe(&iface->super.super.worker->super, + (ucs_callback_t) + iface->super.super.super.ops.iface_progress, + iface, UCS_CALLBACKQ_FLAG_FAST, + &iface->super.super.prog.id); + } + + ucs_arbiter_group_push_elem(&ep->arb_group, &tx->arb_elem); + ucs_arbiter_group_schedule(&iface->arbiter, &ep->arb_group); + + return UCS_INPROGRESS; +} + +ucs_status_t uct_scopy_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, + size_t iov_cnt, uint64_t remote_addr, + uct_rkey_t rkey, uct_completion_t *comp) +{ + return uct_scopy_ep_tx_init(tl_ep, iov, iov_cnt, remote_addr, + rkey, comp, UCT_SCOPY_TX_PUT_ZCOPY); +} + +ucs_status_t uct_scopy_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, + size_t iov_cnt, uint64_t remote_addr, + uct_rkey_t rkey, uct_completion_t *comp) +{ + return uct_scopy_ep_tx_init(tl_ep, iov, iov_cnt, remote_addr, + rkey, comp, UCT_SCOPY_TX_GET_ZCOPY); +} + +ucs_arbiter_cb_result_t uct_scopy_ep_progress_tx(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, + ucs_arbiter_elem_t *elem, + void *arg) +{ + uct_scopy_iface_t *iface = ucs_container_of(arbiter, uct_scopy_iface_t, + arbiter); + uct_scopy_ep_t *ep = ucs_container_of(group, uct_scopy_ep_t, + arb_group); + uct_scopy_tx_t *tx = ucs_container_of(elem, uct_scopy_tx_t, + arb_elem); + unsigned *count = (unsigned*)arg; + ucs_status_t status = UCS_OK; + size_t seg_size; + + if (*count == iface->config.tx_quota) { + return UCS_ARBITER_CB_RESULT_STOP; + } + + if (tx->op != UCT_SCOPY_TX_FLUSH_COMP) { + ucs_assert((tx->op == 
UCT_SCOPY_TX_GET_ZCOPY) || + (tx->op == UCT_SCOPY_TX_PUT_ZCOPY)); + seg_size = iface->config.seg_size; + status = iface->tx(&ep->super.super, tx->iov, tx->iov_cnt, + &tx->iov_iter, &seg_size, tx->remote_addr, + tx->rkey, tx->op); + if (!UCS_STATUS_IS_ERR(status)) { + (*count)++; + ucs_assertv(*count <= iface->config.tx_quota, + "count=%u vs quota=%u", + *count, iface->config.tx_quota); + + tx->remote_addr += seg_size; + uct_scopy_trace_data(tx); + + if (tx->iov_iter.iov_index < tx->iov_cnt) { + return UCS_ARBITER_CB_RESULT_RESCHED_GROUP; + } + } + } + + ucs_assert((tx->comp != NULL) || + (tx->op != UCT_SCOPY_TX_FLUSH_COMP)); + if (tx->comp != NULL) { + uct_invoke_completion(tx->comp, status); + } + + ucs_mpool_put_inline(tx); + + return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; +} + +ucs_status_t uct_scopy_ep_flush(uct_ep_h tl_ep, unsigned flags, + uct_completion_t *comp) +{ + uct_scopy_ep_t *ep = ucs_derived_of(tl_ep, uct_scopy_ep_t); + uct_scopy_iface_t *iface = ucs_derived_of(tl_ep->iface, + uct_scopy_iface_t); + uct_scopy_tx_t *flush_comp; + + if (ucs_arbiter_group_is_empty(&ep->arb_group)) { + UCT_TL_EP_STAT_FLUSH(&ep->super); + return UCS_OK; + } + + if (comp != NULL) { + flush_comp = ucs_mpool_get_inline(&iface->tx_mpool); + if (ucs_unlikely(flush_comp == NULL)) { + return UCS_ERR_NO_MEMORY; + } + + uct_scopy_ep_tx_init_common(flush_comp, UCT_SCOPY_TX_FLUSH_COMP, comp); + ucs_arbiter_group_push_elem(&ep->arb_group, &flush_comp->arb_elem); + } + + UCT_TL_EP_STAT_FLUSH_WAIT(&ep->super); + return UCS_INPROGRESS; +} diff --git a/src/uct/sm/scopy/base/scopy_ep.h b/src/uct/sm/scopy/base/scopy_ep.h new file mode 100644 index 00000000000..3d1978f7781 --- /dev/null +++ b/src/uct/sm/scopy/base/scopy_ep.h @@ -0,0 +1,87 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#ifndef UCT_SCOPY_EP_H +#define UCT_SCOPY_EP_H + +#include +#include +#include + + +extern const char* uct_scopy_tx_op_str[]; + + +typedef enum uct_scopy_tx_op { + UCT_SCOPY_TX_GET_ZCOPY, + UCT_SCOPY_TX_PUT_ZCOPY, + UCT_SCOPY_TX_FLUSH_COMP, + UCT_SCOPY_TX_LAST +} uct_scopy_tx_op_t; + + +/** + * TX operation executor + * + * @param [in] tl_ep Transport EP. + * @param [in] iov The pointer to the array of UCT IOVs. + * @param [in] iov_cnt The number of the elements in the array of UCT IOVs. + * @param [in] uct_iov_iter_p The pointer to the UCT IOV iterator. + * @param [in/out] length_p Input: The maximal total length of the data that + * can be transferred in a single call. Output: The + * resulted length of the data that was transferred. + * @param [in] remote_addr The address of the remote data buffer. + * @param [in] rkey The remote memory key. + * @param [in] tx_op TX operation identifier. + * + * @return UCS_OK if the operation was successfully completed, otherwise - error status. 
+ */ +typedef ucs_status_t +(*uct_scopy_ep_tx_func_t)(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iov_cnt, + ucs_iov_iter_t *iov_iter_p, size_t *length_p, + uint64_t remote_addr, uct_rkey_t rkey, + uct_scopy_tx_op_t tx_op); + +typedef struct uct_scopy_tx_arb_elem { +} uct_scopy_tx_arb_elem_t; + + +typedef struct uct_scopy_tx { + ucs_arbiter_elem_t arb_elem; /* TX arbiter group element */ + uct_scopy_tx_op_t op; /* TX operation identifier */ + uint64_t remote_addr; /* The remote address */ + uct_rkey_t rkey; /* User-passed UCT rkey */ + uct_completion_t *comp; /* The pointer to the user's passed completion */ + ucs_iov_iter_t iov_iter; /* UCT IOVs iterator */ + size_t iov_cnt; /* The number of the UCT IOVs */ + uct_iov_t iov[]; /* UCT IOVs */ +} uct_scopy_tx_t; + + +typedef struct uct_scopy_ep { + uct_base_ep_t super; + ucs_arbiter_group_t arb_group; /* TX arbiter group */ +} uct_scopy_ep_t; + + +UCS_CLASS_DECLARE(uct_scopy_ep_t, const uct_ep_params_t *); + +ucs_status_t uct_scopy_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, + size_t iov_cnt, uint64_t remote_addr, + uct_rkey_t rkey, uct_completion_t *comp); + +ucs_status_t uct_scopy_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, + size_t iov_cnt, uint64_t remote_addr, + uct_rkey_t rkey, uct_completion_t *comp); + +ucs_arbiter_cb_result_t uct_scopy_ep_progress_tx(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, + ucs_arbiter_elem_t *elem, + void *arg); + +ucs_status_t uct_scopy_ep_flush(uct_ep_h tl_ep, unsigned flags, + uct_completion_t *comp); + +#endif diff --git a/src/uct/sm/scopy/base/scopy_iface.c b/src/uct/sm/scopy/base/scopy_iface.c new file mode 100644 index 00000000000..5e865f33333 --- /dev/null +++ b/src/uct/sm/scopy/base/scopy_iface.c @@ -0,0 +1,155 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "scopy_iface.h" +#include "scopy_ep.h" + +#include +#include + +#include + + +ucs_config_field_t uct_scopy_iface_config_table[] = { + {"SM_", "", NULL, + ucs_offsetof(uct_scopy_iface_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_sm_iface_config_table)}, + + {"MAX_IOV", "16", + "Maximum IOV count that can contain user-defined payload in a single\n" + "call to GET/PUT Zcopy operation", + ucs_offsetof(uct_scopy_iface_config_t, max_iov), UCS_CONFIG_TYPE_ULONG}, + + {"SEG_SIZE", "512k", + "Segment size that is used to perform data transfer when doing progress\n" + "of GET/PUT Zcopy operations", + ucs_offsetof(uct_scopy_iface_config_t, seg_size), UCS_CONFIG_TYPE_MEMUNITS}, + + /* TX_QUOTA=1 is used by default in order to make iface progress more + * lightweight and not be blocked for a long time (CMA/KNEM write/read + * operations are blocking). The blocking iface progress for a long time + * is harmful for the many-to-one (GET operation) and one-to-many (PUT + * operation) patterns. 
*/ + {"TX_QUOTA", "1", + "How many TX segments can be dispatched during iface progress", + ucs_offsetof(uct_scopy_iface_config_t, tx_quota), UCS_CONFIG_TYPE_UINT}, + + UCT_IFACE_MPOOL_CONFIG_FIELDS("TX_", -1, 8, "send", + ucs_offsetof(uct_scopy_iface_config_t, tx_mpool), ""), + + {NULL} +}; + +static ucs_mpool_ops_t uct_scopy_mpool_ops = { + .chunk_alloc = ucs_mpool_chunk_malloc, + .chunk_release = ucs_mpool_chunk_free, + .obj_init = NULL, + .obj_cleanup = NULL +}; + +void uct_scopy_iface_query(uct_scopy_iface_t *iface, uct_iface_attr_t *iface_attr) +{ + uct_base_iface_query(&iface->super.super, iface_attr); + + /* default values for all shared memory transports */ + iface_attr->cap.put.min_zcopy = 0; + iface_attr->cap.put.max_zcopy = SIZE_MAX; + iface_attr->cap.put.opt_zcopy_align = 1; + iface_attr->cap.put.align_mtu = iface_attr->cap.put.opt_zcopy_align; + iface_attr->cap.put.max_iov = iface->config.max_iov; + + iface_attr->cap.get.min_zcopy = 0; + iface_attr->cap.get.max_zcopy = SIZE_MAX; + iface_attr->cap.get.opt_zcopy_align = 1; + iface_attr->cap.get.align_mtu = iface_attr->cap.get.opt_zcopy_align; + iface_attr->cap.get.max_iov = iface->config.max_iov; + + iface_attr->device_addr_len = uct_sm_iface_get_device_addr_len(); + iface_attr->ep_addr_len = 0; + iface_attr->max_conn_priv = 0; + iface_attr->cap.flags = UCT_IFACE_FLAG_GET_ZCOPY | + UCT_IFACE_FLAG_PUT_ZCOPY | + UCT_IFACE_FLAG_PENDING | + UCT_IFACE_FLAG_CONNECT_TO_IFACE; + iface_attr->latency = ucs_linear_func_make(80e-9, 0); /* 80 ns */ +} + +UCS_CLASS_INIT_FUNC(uct_scopy_iface_t, uct_scopy_iface_ops_t *ops, uct_md_h md, + uct_worker_h worker, const uct_iface_params_t *params, + const uct_iface_config_t *tl_config) +{ + uct_scopy_iface_config_t *config = ucs_derived_of(tl_config, + uct_scopy_iface_config_t); + size_t elem_size; + ucs_status_t status; + + UCS_CLASS_CALL_SUPER_INIT(uct_sm_iface_t, &ops->super, md, worker, params, tl_config); + + self->tx = ops->ep_tx; + self->config.max_iov = 
ucs_min(config->max_iov, ucs_iov_get_max()); + self->config.seg_size = config->seg_size; + self->config.tx_quota = config->tx_quota; + + elem_size = sizeof(uct_scopy_tx_t) + + self->config.max_iov * sizeof(uct_iov_t); + + ucs_arbiter_init(&self->arbiter); + + status = ucs_mpool_init(&self->tx_mpool, 0, elem_size, + 0, UCS_SYS_CACHE_LINE_SIZE, + config->tx_mpool.bufs_grow, + config->tx_mpool.max_bufs, + &uct_scopy_mpool_ops, + "uct_scopy_iface_tx_mp"); + + return status; +} + +static UCS_CLASS_CLEANUP_FUNC(uct_scopy_iface_t) +{ + uct_worker_progress_unregister_safe(&self->super.super.worker->super, + &self->super.super.prog.id); + ucs_mpool_cleanup(&self->tx_mpool, 1); + ucs_arbiter_cleanup(&self->arbiter); +} + +UCS_CLASS_DEFINE(uct_scopy_iface_t, uct_sm_iface_t); + +unsigned uct_scopy_iface_progress(uct_iface_h tl_iface) +{ + uct_scopy_iface_t *iface = ucs_derived_of(tl_iface, uct_scopy_iface_t); + unsigned count = 0; + + ucs_arbiter_dispatch(&iface->arbiter, 1, uct_scopy_ep_progress_tx, &count); + + if (ucs_unlikely(ucs_arbiter_is_empty(&iface->arbiter))) { + uct_worker_progress_unregister_safe(&iface->super.super.worker->super, + &iface->super.super.prog.id); + } + + return count; +} + +ucs_status_t uct_scopy_iface_flush(uct_iface_h tl_iface, unsigned flags, + uct_completion_t *comp) +{ + uct_scopy_iface_t *iface = ucs_derived_of(tl_iface, uct_scopy_iface_t); + + if (ucs_unlikely(comp != NULL)) { + return UCS_ERR_UNSUPPORTED; + } + + if (!ucs_arbiter_is_empty(&iface->arbiter)) { + UCT_TL_IFACE_STAT_FLUSH_WAIT(&iface->super.super); + return UCS_INPROGRESS; + } + + UCT_TL_IFACE_STAT_FLUSH(&iface->super.super); + return UCS_OK; +} diff --git a/src/uct/sm/scopy/base/scopy_iface.h b/src/uct/sm/scopy/base/scopy_iface.h new file mode 100644 index 00000000000..8bd6e524e1c --- /dev/null +++ b/src/uct/sm/scopy/base/scopy_iface.h @@ -0,0 +1,72 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#ifndef UCT_SCOPY_IFACE_H +#define UCT_SCOPY_IFACE_H + +#include "scopy_ep.h" + +#include +#include + +#define uct_scopy_trace_data(_tx) \ + ucs_trace_data("%s [tx %p iov %zu/%zu length %zu/%zu] to %"PRIx64"(%+ld)", \ + uct_scopy_tx_op_str[(_tx)->op], (_tx), \ + (_tx)->iov_iter.iov_index, (_tx)->iov_cnt, \ + uct_iov_iter_flat_offset((_tx)->iov, (_tx)->iov_cnt, \ + &(_tx)->iov_iter), \ + uct_iov_total_length((_tx)->iov, (_tx)->iov_cnt), \ + (_tx)->remote_addr, (_tx)->rkey) + + +extern ucs_config_field_t uct_scopy_iface_config_table[]; + + +typedef struct uct_scopy_iface_config { + uct_sm_iface_config_t super; + size_t max_iov; /* Maximum supported IOVs */ + size_t seg_size; /* Segment size that is used to perform + * data transfer for RMA operations */ + unsigned tx_quota; /* How many TX segments can be dispatched + * during iface progress */ + uct_iface_mpool_config_t tx_mpool; /* TX memory pool configuration */ +} uct_scopy_iface_config_t; + + +typedef struct uct_scopy_iface { + uct_sm_iface_t super; + ucs_arbiter_t arbiter; /* TX arbiter */ + ucs_mpool_t tx_mpool; /* TX memory pool */ + uct_scopy_ep_tx_func_t tx; /* TX function */ + struct { + size_t max_iov; /* Maximum supported IOVs limited by + * user configuration and system + * settings */ + size_t seg_size; /* Maximal size of the segments + * that has to be used in GET/PUT + * Zcopy transfers */ + unsigned tx_quota; /* How many TX segments can be dispatched + * during iface progress */ + } config; +} uct_scopy_iface_t; + + +typedef struct uct_scopy_iface_ops { + uct_iface_ops_t super; + uct_scopy_ep_tx_func_t ep_tx; +} uct_scopy_iface_ops_t; + + +void uct_scopy_iface_query(uct_scopy_iface_t *iface, uct_iface_attr_t *iface_attr); + +UCS_CLASS_DECLARE(uct_scopy_iface_t, uct_scopy_iface_ops_t*, uct_md_h, uct_worker_h, + const uct_iface_params_t*, const uct_iface_config_t*); + +unsigned uct_scopy_iface_progress(uct_iface_h tl_iface); + +ucs_status_t uct_scopy_iface_flush(uct_iface_h tl_iface, unsigned 
flags, + uct_completion_t *comp); + +#endif diff --git a/src/uct/sm/cma/Makefile.am b/src/uct/sm/scopy/cma/Makefile.am similarity index 100% rename from src/uct/sm/cma/Makefile.am rename to src/uct/sm/scopy/cma/Makefile.am diff --git a/src/uct/sm/scopy/cma/cma_ep.c b/src/uct/sm/scopy/cma/cma_ep.c new file mode 100644 index 00000000000..cd2a5364c4c --- /dev/null +++ b/src/uct/sm/scopy/cma/cma_ep.c @@ -0,0 +1,95 @@ +/** +* Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. +* Copyright (C) ARM Ltd. 2016. ALL RIGHTS RESERVED. +* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif +#include + +#include "cma_ep.h" +#include +#include +#include + + +typedef ssize_t (*uct_cma_ep_zcopy_fn_t)(pid_t, const struct iovec *, + unsigned long, const struct iovec *, + unsigned long, unsigned long); + + +const struct { + uct_cma_ep_zcopy_fn_t fn; + char *name; +} uct_cma_ep_fn[] = { + [UCT_SCOPY_TX_GET_ZCOPY] = { + .fn = process_vm_readv, + .name = "process_vm_readv" + }, + [UCT_SCOPY_TX_PUT_ZCOPY] = { + .fn = process_vm_writev, + .name = "process_vm_writev" + } +}; + +static UCS_CLASS_INIT_FUNC(uct_cma_ep_t, const uct_ep_params_t *params) +{ + UCT_CHECK_PARAM(params->field_mask & UCT_EP_PARAM_FIELD_IFACE_ADDR, + "UCT_EP_PARAM_FIELD_IFACE_ADDR and UCT_EP_PARAM_FIELD_DEV_ADDR are not defined"); + + UCS_CLASS_CALL_SUPER_INIT(uct_scopy_ep_t, params); + self->remote_pid = *(const pid_t*)params->iface_addr & + ~UCT_CMA_IFACE_ADDR_FLAG_PID_NS; + return UCS_OK; +} + +static UCS_CLASS_CLEANUP_FUNC(uct_cma_ep_t) +{ + /* No op */ +} + +UCS_CLASS_DEFINE(uct_cma_ep_t, uct_scopy_ep_t) +UCS_CLASS_DEFINE_NEW_FUNC(uct_cma_ep_t, uct_ep_t, const uct_ep_params_t *); +UCS_CLASS_DEFINE_DELETE_FUNC(uct_cma_ep_t, uct_ep_t); + +ucs_status_t uct_cma_ep_tx(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iov_cnt, + ucs_iov_iter_t *iov_iter, size_t *length_p, + uint64_t remote_addr, uct_rkey_t 
rkey, + uct_scopy_tx_op_t tx_op) +{ + uct_cma_ep_t *ep = ucs_derived_of(tl_ep, uct_cma_ep_t); + size_t local_iov_idx = 0; + size_t UCS_V_UNUSED remote_iov_idx = 0; + size_t local_iov_cnt = UCT_SM_MAX_IOV; + size_t total_iov_length; + struct iovec local_iov[UCT_SM_MAX_IOV], remote_iov; + ssize_t ret; + + ucs_assert(*length_p != 0); + + total_iov_length = uct_iov_to_iovec(local_iov, &local_iov_cnt, + iov, iov_cnt, *length_p, iov_iter); + ucs_assert((total_iov_length <= *length_p) && (total_iov_length != 0) && + (local_iov_cnt > 0)); + + remote_iov.iov_base = (void*)(uintptr_t)remote_addr; + remote_iov.iov_len = total_iov_length; + + ret = uct_cma_ep_fn[tx_op].fn(ep->remote_pid, &local_iov[local_iov_idx], + local_iov_cnt - local_iov_idx, &remote_iov, + 1, 0); + if (ucs_unlikely(ret < 0)) { + ucs_error("%s(pid=%d length=%zu) returned %zd: %m", + uct_cma_ep_fn[tx_op].name, ep->remote_pid, + remote_iov.iov_len, ret); + return UCS_ERR_IO_ERROR; + } + + ucs_assert(ret <= remote_iov.iov_len); + + *length_p = ret; + return UCS_OK; +} diff --git a/src/uct/sm/scopy/cma/cma_ep.h b/src/uct/sm/scopy/cma/cma_ep.h new file mode 100644 index 00000000000..82030f6a9cd --- /dev/null +++ b/src/uct/sm/scopy/cma/cma_ep.h @@ -0,0 +1,29 @@ +/** +* Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. +* See file LICENSE for terms. 
+*/ + +#ifndef UCT_CMA_EP_H +#define UCT_CMA_EP_H + +#include "cma_iface.h" + +#include + + +typedef struct uct_cma_ep { + uct_scopy_ep_t super; + pid_t remote_pid; +} uct_cma_ep_t; + + +UCS_CLASS_DECLARE_NEW_FUNC(uct_cma_ep_t, uct_ep_t, const uct_ep_params_t *); +UCS_CLASS_DECLARE_DELETE_FUNC(uct_cma_ep_t, uct_ep_t); + +ucs_status_t uct_cma_ep_tx(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iov_cnt, + ucs_iov_iter_t *iov_iter, size_t *length_p, + uint64_t remote_addr, uct_rkey_t rkey, + uct_scopy_tx_op_t tx_op); + +#endif diff --git a/src/uct/sm/scopy/cma/cma_iface.c b/src/uct/sm/scopy/cma/cma_iface.c new file mode 100644 index 00000000000..f65f29dd5c9 --- /dev/null +++ b/src/uct/sm/scopy/cma/cma_iface.c @@ -0,0 +1,137 @@ +/** + * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "cma_md.h" +#include "cma_iface.h" +#include "cma_ep.h" + +#include +#include + + +typedef struct { + pid_t id; +} ucs_cma_iface_base_device_addr_t; + +typedef struct { + ucs_cma_iface_base_device_addr_t super; + ucs_sys_ns_t pid_ns; +} ucs_cma_iface_ext_device_addr_t; + + +static ucs_config_field_t uct_cma_iface_config_table[] = { + {"SCOPY_", "ALLOC=huge,thp,mmap,heap;SM_BW=11145MBs", NULL, + ucs_offsetof(uct_cma_iface_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_scopy_iface_config_table)}, + + {NULL} +}; + +static ucs_status_t uct_cma_iface_get_address(uct_iface_t *tl_iface, + uct_iface_addr_t *addr) +{ + ucs_cma_iface_ext_device_addr_t *iface_addr = (void*)addr; + + ucs_assert(!(getpid() & UCT_CMA_IFACE_ADDR_FLAG_PID_NS)); + + iface_addr->super.id = getpid(); + if (!ucs_sys_ns_is_default(UCS_SYS_NS_TYPE_PID)) { + iface_addr->super.id |= UCT_CMA_IFACE_ADDR_FLAG_PID_NS; + iface_addr->pid_ns = ucs_sys_get_ns(UCS_SYS_NS_TYPE_PID); + } + return UCS_OK; +} + +static ucs_status_t 
uct_cma_iface_query(uct_iface_h tl_iface, + uct_iface_attr_t *iface_attr) +{ + uct_cma_iface_t *iface = ucs_derived_of(tl_iface, uct_cma_iface_t); + + uct_scopy_iface_query(&iface->super, iface_attr); + + iface_attr->iface_addr_len = ucs_sys_ns_is_default(UCS_SYS_NS_TYPE_PID) ? + sizeof(ucs_cma_iface_base_device_addr_t) : + sizeof(ucs_cma_iface_ext_device_addr_t); + iface_attr->bandwidth.dedicated = iface->super.super.config.bandwidth; + iface_attr->bandwidth.shared = 0; + iface_attr->overhead = 0.4e-6; /* 0.4 us */ + + return UCS_OK; +} + +static int +uct_cma_iface_is_reachable(const uct_iface_h tl_iface, + const uct_device_addr_t *dev_addr, + const uct_iface_addr_t *tl_iface_addr) +{ + ucs_cma_iface_ext_device_addr_t *iface_addr = (void*)tl_iface_addr; + + if (!uct_sm_iface_is_reachable(tl_iface, dev_addr, tl_iface_addr)) { + return 0; + } + + if (iface_addr->super.id & UCT_CMA_IFACE_ADDR_FLAG_PID_NS) { + return ucs_sys_get_ns(UCS_SYS_NS_TYPE_PID) == iface_addr->pid_ns; + } + + return ucs_sys_ns_is_default(UCS_SYS_NS_TYPE_PID); +} + +static UCS_CLASS_DECLARE_DELETE_FUNC(uct_cma_iface_t, uct_iface_t); + +static uct_scopy_iface_ops_t uct_cma_iface_ops = { + .super = { + .ep_put_zcopy = uct_scopy_ep_put_zcopy, + .ep_get_zcopy = uct_scopy_ep_get_zcopy, + .ep_pending_add = (uct_ep_pending_add_func_t)ucs_empty_function_return_busy, + .ep_pending_purge = (uct_ep_pending_purge_func_t)ucs_empty_function, + .ep_flush = uct_scopy_ep_flush, + .ep_fence = uct_sm_ep_fence, + .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_cma_ep_t), + .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_cma_ep_t), + .iface_flush = uct_scopy_iface_flush, + .iface_fence = uct_sm_iface_fence, + .iface_progress_enable = (uct_iface_progress_enable_func_t)ucs_empty_function, + .iface_progress_disable = (uct_iface_progress_disable_func_t)ucs_empty_function, + .iface_progress = uct_scopy_iface_progress, + .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_cma_iface_t), + .iface_query = uct_cma_iface_query, + 
.iface_get_address = uct_cma_iface_get_address, + .iface_get_device_address = uct_sm_iface_get_device_address, + .iface_is_reachable = uct_cma_iface_is_reachable + }, + .ep_tx = uct_cma_ep_tx +}; + +static UCS_CLASS_INIT_FUNC(uct_cma_iface_t, uct_md_h md, uct_worker_h worker, + const uct_iface_params_t *params, + const uct_iface_config_t *tl_config) +{ + UCS_CLASS_CALL_SUPER_INIT(uct_scopy_iface_t, &uct_cma_iface_ops, md, + worker, params, tl_config); + + return UCS_OK; +} + +static UCS_CLASS_CLEANUP_FUNC(uct_cma_iface_t) +{ + /* No op */ +} + +UCS_CLASS_DEFINE(uct_cma_iface_t, uct_scopy_iface_t); + +static UCS_CLASS_DEFINE_NEW_FUNC(uct_cma_iface_t, uct_iface_t, uct_md_h, + uct_worker_h, const uct_iface_params_t*, + const uct_iface_config_t *); +static UCS_CLASS_DEFINE_DELETE_FUNC(uct_cma_iface_t, uct_iface_t); + +UCT_TL_DEFINE(&uct_cma_component, cma, uct_sm_base_query_tl_devices, + uct_cma_iface_t, "CMA_", uct_cma_iface_config_table, + uct_cma_iface_config_t); diff --git a/src/uct/sm/cma/cma_iface.h b/src/uct/sm/scopy/cma/cma_iface.h similarity index 65% rename from src/uct/sm/cma/cma_iface.h rename to src/uct/sm/scopy/cma/cma_iface.h index d7ec7c86056..62fdf917a14 100644 --- a/src/uct/sm/cma/cma_iface.h +++ b/src/uct/sm/scopy/cma/cma_iface.h @@ -8,20 +8,20 @@ #define UCT_CMA_IFACE_H #include +#include -#define UCT_CMA_TL_NAME "cma" + +#define UCT_CMA_IFACE_ADDR_FLAG_PID_NS UCS_BIT(31) /* use PID NS in address */ typedef struct uct_cma_iface_config { - uct_iface_config_t super; + uct_scopy_iface_config_t super; } uct_cma_iface_config_t; typedef struct uct_cma_iface { - uct_base_iface_t super; + uct_scopy_iface_t super; } uct_cma_iface_t; -extern uct_tl_component_t uct_cma_tl; - #endif diff --git a/src/uct/sm/cma/cma_md.c b/src/uct/sm/scopy/cma/cma_md.c similarity index 62% rename from src/uct/sm/cma/cma_md.c rename to src/uct/sm/scopy/cma/cma_md.c index 4577d56b4f4..76919840c72 100644 --- a/src/uct/sm/cma/cma_md.c +++ b/src/uct/sm/scopy/cma/cma_md.c @@ -1,14 
+1,23 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ -#define _GNU_SOURCE +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif + #include "cma_md.h" +#include #include +#include #include #include #include @@ -18,8 +27,6 @@ #endif -uct_md_component_t uct_cma_md_component; - static int uct_cma_test_ptrace_scope() { static const char *ptrace_scope_file = "/proc/sys/kernel/yama/ptrace_scope"; @@ -107,17 +114,16 @@ static int uct_cma_test_writev() return 1; } -static ucs_status_t uct_cma_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) +static ucs_status_t +uct_cma_query_md_resources(uct_component_t *component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) { if (uct_cma_test_writev() && uct_cma_test_ptrace_scope()) { - return uct_single_md_resource(&uct_cma_md_component, - resources_p, - num_resources_p); + return uct_md_query_single_md_resource(component, resources_p, + num_resources_p); } else { - *resources_p = NULL; - *num_resources_p = 0; - return UCS_OK; + return uct_md_query_empty_md_resource(resources_p, num_resources_p); } } @@ -132,45 +138,55 @@ static ucs_status_t uct_cma_mem_reg(uct_md_h md, void *address, size_t length, return UCS_OK; } -static ucs_status_t uct_cma_md_open(const char *md_name, const uct_md_config_t *md_config, - uct_md_h *md_p) +static ucs_status_t +uct_cma_md_open(uct_component_t *component, const char *md_name, + const uct_md_config_t *md_config, uct_md_h *md_p) { static uct_md_ops_t md_ops = { - .close = (void*)ucs_empty_function, - .query = uct_cma_md_query, - .mem_alloc = (void*)ucs_empty_function_return_success, - .mem_free = (void*)ucs_empty_function_return_success, - .mkey_pack = 
(void*)ucs_empty_function_return_success, - .mem_reg = uct_cma_mem_reg, - .mem_dereg = (void*)ucs_empty_function_return_success, - .is_mem_type_owned = (void *)ucs_empty_function_return_zero, + .close = (uct_md_close_func_t)ucs_empty_function, + .query = uct_cma_md_query, + .mem_alloc = (uct_md_mem_alloc_func_t)ucs_empty_function_return_success, + .mem_free = (uct_md_mem_free_func_t)ucs_empty_function_return_success, + .mkey_pack = (uct_md_mkey_pack_func_t)ucs_empty_function_return_success, + .mem_reg = uct_cma_mem_reg, + .mem_dereg = (uct_md_mem_dereg_func_t)ucs_empty_function_return_success, + .detect_memory_type = ucs_empty_function_return_unsupported, }; static uct_md_t md = { .ops = &md_ops, - .component = &uct_cma_md_component + .component = &uct_cma_component }; *md_p = &md; return UCS_OK; } -UCT_MD_COMPONENT_DEFINE(uct_cma_md_component, "cma", - uct_cma_query_md_resources, uct_cma_md_open, NULL, - uct_md_stub_rkey_unpack, - ucs_empty_function_return_success, "CMA_", - uct_md_config_table, uct_md_config_t) - ucs_status_t uct_cma_md_query(uct_md_h md, uct_md_attr_t *md_attr) { - md_attr->rkey_packed_size = 0; - md_attr->cap.flags = UCT_MD_FLAG_REG; - md_attr->cap.reg_mem_types = UCS_BIT(UCT_MD_MEM_TYPE_HOST); - md_attr->cap.mem_type = UCT_MD_MEM_TYPE_HOST; - md_attr->cap.max_alloc = 0; - md_attr->cap.max_reg = ULONG_MAX; - md_attr->reg_cost.overhead = 9e-9; - md_attr->reg_cost.growth = 0; + md_attr->rkey_packed_size = 0; + md_attr->cap.flags = UCT_MD_FLAG_REG; + md_attr->cap.reg_mem_types = UCS_MEMORY_TYPES_CPU_ACCESSIBLE; + md_attr->cap.access_mem_type = UCS_MEMORY_TYPE_HOST; + md_attr->cap.detect_mem_types = 0; + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = ULONG_MAX; + md_attr->reg_cost = ucs_linear_func_make(9e-9, 0); memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); return UCS_OK; } + +uct_component_t uct_cma_component = { + .query_md_resources = uct_cma_query_md_resources, + .md_open = uct_cma_md_open, + .cm_open = 
ucs_empty_function_return_unsupported, + .rkey_unpack = uct_md_stub_rkey_unpack, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = ucs_empty_function_return_success, + .name = "cma", + .md_config = UCT_MD_DEFAULT_CONFIG_INITIALIZER, + .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_cma_component), + .flags = 0 +}; +UCT_COMPONENT_REGISTER(&uct_cma_component); diff --git a/src/uct/sm/cma/cma_md.h b/src/uct/sm/scopy/cma/cma_md.h similarity index 90% rename from src/uct/sm/cma/cma_md.h rename to src/uct/sm/scopy/cma/cma_md.h index 5a65be5e80e..9ab24494b3c 100644 --- a/src/uct/sm/cma/cma_md.h +++ b/src/uct/sm/scopy/cma/cma_md.h @@ -16,7 +16,7 @@ #include #include -extern uct_md_component_t uct_cma_md_component; +extern uct_component_t uct_cma_component; ucs_status_t uct_cma_md_query(uct_md_h md, uct_md_attr_t *md_attr); diff --git a/src/uct/sm/cma/configure.m4 b/src/uct/sm/scopy/cma/configure.m4 similarity index 86% rename from src/uct/sm/cma/configure.m4 rename to src/uct/sm/scopy/cma/configure.m4 index 2094b030446..d8772157979 100644 --- a/src/uct/sm/cma/configure.m4 +++ b/src/uct/sm/scopy/cma/configure.m4 @@ -16,10 +16,10 @@ AS_IF([test "x$enable_cma" != xno], [cma_happy="yes"], [cma_happy="no"]) AS_IF([test "x$cma_happy" = "xyes"], - [uct_modules+=":cma"]) + [uct_modules="${uct_modules}:cma"]) ]) ] ) AM_CONDITIONAL([HAVE_CMA], [test "x$cma_happy" != xno]) -AC_CONFIG_FILES([src/uct/sm/cma/Makefile]) +AC_CONFIG_FILES([src/uct/sm/scopy/cma/Makefile]) diff --git a/src/uct/sm/scopy/configure.m4 b/src/uct/sm/scopy/configure.m4 new file mode 100644 index 00000000000..23fb8358c0d --- /dev/null +++ b/src/uct/sm/scopy/configure.m4 @@ -0,0 +1,9 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. +# See file LICENSE for terms. 
+# + +m4_include([src/uct/sm/scopy/cma/configure.m4]) +m4_include([src/uct/sm/scopy/knem/configure.m4]) + +AC_CONFIG_FILES([src/uct/sm/scopy/Makefile]) diff --git a/src/uct/sm/knem/Makefile.am b/src/uct/sm/scopy/knem/Makefile.am similarity index 90% rename from src/uct/sm/knem/Makefile.am rename to src/uct/sm/scopy/knem/Makefile.am index 20feb62efdc..46029c12596 100644 --- a/src/uct/sm/knem/Makefile.am +++ b/src/uct/sm/scopy/knem/Makefile.am @@ -7,7 +7,7 @@ if HAVE_KNEM module_LTLIBRARIES = libuct_knem.la libuct_knem_la_CFLAGS = $(BASE_CFLAGS) -libuct_knem_la_CPPFLAGS = $(BASE_CPPFLAGS) +libuct_knem_la_CPPFLAGS = $(BASE_CPPFLAGS) $(KNEM_CPPFLAGS) libuct_knem_la_LIBADD = $(top_builddir)/src/ucs/libucs.la \ $(top_builddir)/src/uct/libuct.la libuct_knem_la_LDFLAGS = -version-info $(SOVERSION) $(UCT_MODULE_LDFLAGS) diff --git a/src/uct/sm/knem/configure.m4 b/src/uct/sm/scopy/knem/configure.m4 similarity index 83% rename from src/uct/sm/knem/configure.m4 rename to src/uct/sm/scopy/knem/configure.m4 index 14b90e66ae3..f6290b9692e 100644 --- a/src/uct/sm/knem/configure.m4 +++ b/src/uct/sm/scopy/knem/configure.m4 @@ -19,9 +19,8 @@ AS_IF([test "x$with_knem" != xno], CPPFLAGS="$ucx_check_knem_include_dir $CPPFLAGS" AC_CHECK_DECL([KNEM_CMD_GET_INFO], - [BASE_CFLAGS="$BASE_CFLAGS $ucx_check_knem_include_dir" - BASE_CPPFLAGS="$BASE_CPPFLAGS $ucx_check_knem_include_dir" - uct_modules+=":knem" + [AC_SUBST([KNEM_CPPFLAGS], [$ucx_check_knem_include_dir]) + uct_modules="${uct_modules}:knem" knem_happy="yes"], [AS_IF([test "x$with_knem" != xguess], [AC_MSG_ERROR([KNEM requested but required file (knem_io.h) could not be found])], @@ -35,4 +34,4 @@ AS_IF([test "x$with_knem" != xno], ) AM_CONDITIONAL([HAVE_KNEM], [test "x$knem_happy" != xno]) -AC_CONFIG_FILES([src/uct/sm/knem/Makefile]) +AC_CONFIG_FILES([src/uct/sm/scopy/knem/Makefile]) diff --git a/src/uct/sm/scopy/knem/knem_ep.c b/src/uct/sm/scopy/knem/knem_ep.c new file mode 100644 index 00000000000..ff9f09a8ce3 --- /dev/null +++ 
b/src/uct/sm/scopy/knem/knem_ep.c @@ -0,0 +1,103 @@ +/** + * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include + +#include "knem_ep.h" +#include "knem_md.h" +#include +#include +#include + + +const char *uct_knem_ep_tx_op_str[] = { + [UCT_SCOPY_TX_GET_ZCOPY] = "READ", + [UCT_SCOPY_TX_PUT_ZCOPY] = "WRITE" +}; + + +static UCS_CLASS_INIT_FUNC(uct_knem_ep_t, const uct_ep_params_t *params) +{ + UCS_CLASS_CALL_SUPER_INIT(uct_scopy_ep_t, params); + + return UCS_OK; +} + +static UCS_CLASS_CLEANUP_FUNC(uct_knem_ep_t) +{ + /* No op */ +} + +UCS_CLASS_DEFINE(uct_knem_ep_t, uct_scopy_ep_t) +UCS_CLASS_DEFINE_NEW_FUNC(uct_knem_ep_t, uct_ep_t, const uct_ep_params_t *); +UCS_CLASS_DEFINE_DELETE_FUNC(uct_knem_ep_t, uct_ep_t); + +static UCS_F_ALWAYS_INLINE +void uct_knem_iovec_set_length(struct knem_cmd_param_iovec *iov, size_t length) +{ + iov->len = length; +} + +static UCS_F_ALWAYS_INLINE +void uct_knem_iovec_set_buffer(struct knem_cmd_param_iovec *iov, void *buffer) +{ + iov->base = (uintptr_t)buffer; +} + +ucs_status_t uct_knem_ep_tx(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iov_cnt, + ucs_iov_iter_t *iov_iter, size_t *length_p, + uint64_t remote_addr, uct_rkey_t rkey, + uct_scopy_tx_op_t tx_op) +{ + uct_knem_iface_t *knem_iface = ucs_derived_of(tl_ep->iface, + uct_knem_iface_t); + int knem_fd = knem_iface->knem_md->knem_fd; + uct_knem_key_t *key = (uct_knem_key_t*)rkey; + size_t local_iov_cnt = UCT_SM_MAX_IOV; + struct knem_cmd_param_iovec local_iov[UCT_SM_MAX_IOV]; + size_t UCS_V_UNUSED total_iov_length; + struct knem_cmd_inline_copy icopy; + int ret; + + ucs_assert(*length_p != 0); + + total_iov_length = ucs_iov_converter(local_iov, &local_iov_cnt, + uct_knem_iovec_set_buffer, uct_knem_iovec_set_length, + iov, iov_cnt, + uct_iov_get_buffer, uct_iov_get_length, + *length_p, iov_iter); + 
ucs_assert((total_iov_length <= *length_p) && (total_iov_length != 0) && + (local_iov_cnt > 0)); + + icopy.local_iovec_array = (uintptr_t)local_iov; + icopy.local_iovec_nr = local_iov_cnt; + icopy.remote_cookie = key->cookie; + icopy.current_status = 0; + ucs_assert(remote_addr >= key->address); + icopy.remote_offset = remote_addr - key->address; + /* This value is used to set `knem_cmd_inline_copy::write` field */ + UCS_STATIC_ASSERT(UCT_SCOPY_TX_PUT_ZCOPY == 1); + icopy.write = tx_op; + /* TBD: add check and support for KNEM_FLAG_DMA */ + icopy.flags = 0; + + ucs_assert(knem_fd > -1); + ret = ioctl(knem_fd, KNEM_CMD_INLINE_COPY, &icopy); + if (ucs_unlikely((ret < 0) || + (icopy.current_status != KNEM_STATUS_SUCCESS))) { + ucs_error("KNEM inline copy \"%s\" failed, ioctl() return value - %d, " + "copy status - %d: %m", + uct_knem_ep_tx_op_str[tx_op], ret, icopy.current_status); + return UCS_ERR_IO_ERROR; + } + + *length_p = total_iov_length; + return UCS_OK; +} diff --git a/src/uct/sm/scopy/knem/knem_ep.h b/src/uct/sm/scopy/knem/knem_ep.h new file mode 100644 index 00000000000..d163e7a75e6 --- /dev/null +++ b/src/uct/sm/scopy/knem/knem_ep.h @@ -0,0 +1,28 @@ +/** + * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#ifndef UCT_KNEM_EP_H +#define UCT_KNEM_EP_H + +#include "knem_iface.h" + +#include + + +typedef struct uct_knem_ep { + uct_scopy_ep_t super; +} uct_knem_ep_t; + + +UCS_CLASS_DECLARE_NEW_FUNC(uct_knem_ep_t, uct_ep_t, const uct_ep_params_t *); +UCS_CLASS_DECLARE_DELETE_FUNC(uct_knem_ep_t, uct_ep_t); + +ucs_status_t uct_knem_ep_tx(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iov_cnt, + ucs_iov_iter_t *iov_iter, size_t *length_p, + uint64_t remote_addr, uct_rkey_t rkey, + uct_scopy_tx_op_t tx_op); + +#endif diff --git a/src/uct/sm/scopy/knem/knem_iface.c b/src/uct/sm/scopy/knem/knem_iface.c new file mode 100644 index 00000000000..20bbc3ef897 --- /dev/null +++ b/src/uct/sm/scopy/knem/knem_iface.c @@ -0,0 +1,93 @@ +/** + * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "knem_md.h" +#include "knem_iface.h" +#include "knem_ep.h" + +#include +#include + + +static ucs_config_field_t uct_knem_iface_config_table[] = { + {"SCOPY_", "SM_BW=13862MBs", NULL, + ucs_offsetof(uct_knem_iface_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_scopy_iface_config_table)}, + + {NULL} +}; + +static ucs_status_t uct_knem_iface_query(uct_iface_h tl_iface, + uct_iface_attr_t *iface_attr) +{ + uct_knem_iface_t *iface = ucs_derived_of(tl_iface, uct_knem_iface_t); + + uct_scopy_iface_query(&iface->super, iface_attr); + + iface_attr->iface_addr_len = 0; + iface_attr->bandwidth.shared = iface->super.super.config.bandwidth; + iface_attr->bandwidth.dedicated = 0; + iface_attr->overhead = 0.25e-6; /* 0.25 us */ + + return UCS_OK; +} + +static UCS_CLASS_DECLARE_DELETE_FUNC(uct_knem_iface_t, uct_iface_t); + +static uct_scopy_iface_ops_t uct_knem_iface_ops = { + .super = { + .ep_put_zcopy = uct_scopy_ep_put_zcopy, + .ep_get_zcopy = uct_scopy_ep_get_zcopy, + .ep_pending_add = 
(uct_ep_pending_add_func_t)ucs_empty_function_return_busy, + .ep_pending_purge = (uct_ep_pending_purge_func_t)ucs_empty_function, + .ep_flush = uct_scopy_ep_flush, + .ep_fence = uct_sm_ep_fence, + .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_knem_ep_t), + .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_knem_ep_t), + .iface_flush = uct_scopy_iface_flush, + .iface_fence = uct_sm_iface_fence, + .iface_progress_enable = (uct_iface_progress_enable_func_t)ucs_empty_function, + .iface_progress_disable = (uct_iface_progress_disable_func_t)ucs_empty_function, + .iface_progress = uct_scopy_iface_progress, + .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_knem_iface_t), + .iface_query = uct_knem_iface_query, + .iface_get_device_address = uct_sm_iface_get_device_address, + .iface_get_address = (uct_iface_get_address_func_t)ucs_empty_function_return_success, + .iface_is_reachable = uct_sm_iface_is_reachable + }, + .ep_tx = uct_knem_ep_tx +}; + +static UCS_CLASS_INIT_FUNC(uct_knem_iface_t, uct_md_h md, uct_worker_h worker, + const uct_iface_params_t *params, + const uct_iface_config_t *tl_config) +{ + UCS_CLASS_CALL_SUPER_INIT(uct_scopy_iface_t, &uct_knem_iface_ops, md, + worker, params, tl_config); + self->knem_md = (uct_knem_md_t *)md; + + return UCS_OK; +} + +static UCS_CLASS_CLEANUP_FUNC(uct_knem_iface_t) +{ + /* No OP */ +} + +UCS_CLASS_DEFINE(uct_knem_iface_t, uct_scopy_iface_t); + +static UCS_CLASS_DEFINE_NEW_FUNC(uct_knem_iface_t, uct_iface_t, uct_md_h, + uct_worker_h, const uct_iface_params_t*, + const uct_iface_config_t *); +static UCS_CLASS_DEFINE_DELETE_FUNC(uct_knem_iface_t, uct_iface_t); + +UCT_TL_DEFINE(&uct_knem_component, knem, uct_sm_base_query_tl_devices, + uct_knem_iface_t, "KNEM_", uct_knem_iface_config_table, + uct_knem_iface_config_t); diff --git a/src/uct/sm/knem/knem_iface.h b/src/uct/sm/scopy/knem/knem_iface.h similarity index 70% rename from src/uct/sm/knem/knem_iface.h rename to src/uct/sm/scopy/knem/knem_iface.h index c2755d7b75f..9cab8f885e8 100644 --- 
a/src/uct/sm/knem/knem_iface.h +++ b/src/uct/sm/scopy/knem/knem_iface.h @@ -10,22 +10,18 @@ #include "knem_md.h" #include - - -#define UCT_KNEM_TL_NAME "knem" +#include typedef struct uct_knem_iface_config { - uct_iface_config_t super; + uct_scopy_iface_config_t super; } uct_knem_iface_config_t; typedef struct uct_knem_iface { - uct_base_iface_t super; - uct_knem_md_t *knem_md; + uct_scopy_iface_t super; + uct_knem_md_t *knem_md; } uct_knem_iface_t; -extern uct_tl_component_t uct_knem_tl; - #endif diff --git a/src/uct/sm/knem/knem_md.c b/src/uct/sm/scopy/knem/knem_md.c similarity index 74% rename from src/uct/sm/knem/knem_md.c rename to src/uct/sm/scopy/knem/knem_md.c index ba2865ccd17..682aee02389 100644 --- a/src/uct/sm/knem/knem_md.c +++ b/src/uct/sm/scopy/knem/knem_md.c @@ -1,16 +1,23 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "knem_md.h" #include "knem_io.h" #include +#include +#include #include + static ucs_config_field_t uct_knem_md_config_table[] = { {"", "", NULL, ucs_offsetof(uct_knem_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)}, @@ -29,21 +36,24 @@ ucs_status_t uct_knem_md_query(uct_md_h uct_md, uct_md_attr_t *md_attr) { uct_knem_md_t *md = ucs_derived_of(uct_md, uct_knem_md_t); - md_attr->rkey_packed_size = sizeof(uct_knem_key_t); - md_attr->cap.flags = UCT_MD_FLAG_REG | - UCT_MD_FLAG_NEED_RKEY; - md_attr->cap.reg_mem_types = UCS_BIT(UCT_MD_MEM_TYPE_HOST); - md_attr->cap.mem_type = UCT_MD_MEM_TYPE_HOST; - md_attr->cap.max_alloc = 0; - md_attr->cap.max_reg = ULONG_MAX; - md_attr->reg_cost = md->reg_cost; + md_attr->rkey_packed_size = sizeof(uct_knem_key_t); + md_attr->cap.flags = UCT_MD_FLAG_REG | + UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.reg_mem_types = UCS_MEMORY_TYPES_CPU_ACCESSIBLE; + md_attr->cap.access_mem_type = UCS_MEMORY_TYPE_HOST; + md_attr->cap.detect_mem_types = 0; + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = ULONG_MAX; + md_attr->reg_cost = md->reg_cost; memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); return UCS_OK; } -static ucs_status_t uct_knem_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) +static ucs_status_t +uct_knem_query_md_resources(uct_component_t *component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) { int fd; int rc; @@ -53,35 +63,31 @@ static ucs_status_t uct_knem_query_md_resources(uct_md_resource_desc_t **resourc fd = open("/dev/knem", O_RDWR); if (fd < 0) { - ucs_debug("Could not open the KNEM device file at /dev/knem: %m. Disabling knem resource"); - *resources_p = NULL; - *num_resources_p = 0; - return UCS_OK; + ucs_debug("could not open the KNEM device file at /dev/knem: %m. 
Disabling knem resource"); + goto out_empty; } rc = ioctl(fd, KNEM_CMD_GET_INFO, &info); if (rc < 0) { - *resources_p = NULL; - *num_resources_p = 0; - close(fd); ucs_debug("KNEM get info failed. not using knem, err = %d %m", rc); - return UCS_OK; + goto out_empty_close_fd; } if (KNEM_ABI_VERSION != info.abi) { - *resources_p = NULL; - *num_resources_p = 0; - close(fd); ucs_error("KNEM ABI mismatch: KNEM_ABI_VERSION: %d, Driver binary interface version: %d", KNEM_ABI_VERSION, info.abi); - return UCS_OK; + goto out_empty_close_fd; } /* We have to close it since it is not clear * if it will be selected in future */ close(fd); - return uct_single_md_resource(&uct_knem_md_component, resources_p, - num_resources_p); + return uct_md_query_single_md_resource(component, resources_p, num_resources_p); + +out_empty_close_fd: + close(fd); +out_empty: + return uct_md_query_empty_md_resource(resources_p, num_resources_p); } static void uct_knem_md_close(uct_md_h md) @@ -197,9 +203,9 @@ static ucs_status_t uct_knem_rkey_pack(uct_md_h md, uct_mem_h memh, return UCS_OK; } -static ucs_status_t uct_knem_rkey_unpack(uct_md_component_t *mdc, - const void *rkey_buffer, uct_rkey_t *rkey_p, - void **handle_p) +static ucs_status_t uct_knem_rkey_unpack(uct_component_t *component, + const void *rkey_buffer, + uct_rkey_t *rkey_p, void **handle_p) { uct_knem_key_t *packed = (uct_knem_key_t *)rkey_buffer; uct_knem_key_t *key; @@ -218,8 +224,8 @@ static ucs_status_t uct_knem_rkey_unpack(uct_md_component_t *mdc, return UCS_OK; } -static ucs_status_t uct_knem_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, - void *handle) +static ucs_status_t uct_knem_rkey_release(uct_component_t *component, + uct_rkey_t rkey, void *handle) { ucs_assert(NULL == handle); ucs_free((void *)rkey); @@ -227,12 +233,12 @@ static ucs_status_t uct_knem_rkey_release(uct_md_component_t *mdc, uct_rkey_t rk } static uct_md_ops_t md_ops = { - .close = uct_knem_md_close, - .query = uct_knem_md_query, - .mkey_pack = 
uct_knem_rkey_pack, - .mem_reg = uct_knem_mem_reg, - .mem_dereg = uct_knem_mem_dereg, - .is_mem_type_owned = (void *)ucs_empty_function_return_zero + .close = uct_knem_md_close, + .query = uct_knem_md_query, + .mkey_pack = uct_knem_rkey_pack, + .mem_reg = uct_knem_mem_reg, + .mem_dereg = uct_knem_mem_dereg, + .detect_memory_type = ucs_empty_function_return_unsupported, }; static inline uct_knem_rcache_region_t* uct_knem_rcache_region_from_memh(uct_mem_h memh) @@ -269,12 +275,12 @@ static ucs_status_t uct_knem_mem_rcache_dereg(uct_md_h uct_md, uct_mem_h memh) } static uct_md_ops_t uct_knem_md_rcache_ops = { - .close = uct_knem_md_close, - .query = uct_knem_md_query, - .mkey_pack = uct_knem_rkey_pack, - .mem_reg = uct_knem_mem_rcache_reg, - .mem_dereg = uct_knem_mem_rcache_dereg, - .is_mem_type_owned = (void *)ucs_empty_function_return_zero, + .close = uct_knem_md_close, + .query = uct_knem_md_query, + .mkey_pack = uct_knem_rkey_pack, + .mem_reg = uct_knem_mem_rcache_reg, + .mem_dereg = uct_knem_mem_rcache_dereg, + .detect_memory_type = ucs_empty_function_return_unsupported, }; @@ -318,9 +324,9 @@ static ucs_rcache_ops_t uct_knem_rcache_ops = { .dump_region = uct_knem_rcache_dump_region_cb }; -static ucs_status_t uct_knem_md_open(const char *md_name, - const uct_md_config_t *uct_md_config, - uct_md_h *md_p) +static ucs_status_t +uct_knem_md_open(uct_component_t *component, const char *md_name, + const uct_md_config_t *uct_md_config, uct_md_h *md_p) { const uct_knem_md_config_t *md_config = ucs_derived_of(uct_md_config, uct_knem_md_config_t); uct_knem_md_t *knem_md; @@ -333,11 +339,10 @@ static ucs_status_t uct_knem_md_open(const char *md_name, return UCS_ERR_NO_MEMORY; } - knem_md->super.ops = &md_ops; - knem_md->super.component = &uct_knem_md_component; - knem_md->reg_cost.overhead = 1200.0e-9; - knem_md->reg_cost.growth = 0.007e-9; - knem_md->rcache = NULL; + knem_md->super.ops = &md_ops; + knem_md->super.component = &uct_knem_component; + knem_md->reg_cost = 
ucs_linear_func_make(1200.0e-9, 0.007e-9); + knem_md->rcache = NULL; knem_md->knem_fd = open("/dev/knem", O_RDWR); if (knem_md->knem_fd < 0) { @@ -354,12 +359,13 @@ static ucs_status_t uct_knem_md_open(const char *md_name, rcache_params.ucm_event_priority = md_config->rcache.event_prio; rcache_params.context = knem_md; rcache_params.ops = &uct_knem_rcache_ops; + rcache_params.flags = 0; status = ucs_rcache_create(&rcache_params, "knem rcache device", ucs_stats_get_root(), &knem_md->rcache); if (status == UCS_OK) { - knem_md->super.ops = &uct_knem_md_rcache_ops; - knem_md->reg_cost.overhead = md_config->rcache.overhead; - knem_md->reg_cost.growth = 0; /* It's close enough to 0 */ + knem_md->super.ops = &uct_knem_md_rcache_ops; + knem_md->reg_cost = ucs_linear_func_make(md_config->rcache.overhead, + 0); } else { ucs_assert(knem_md->rcache == NULL); if (md_config->rcache_enable == UCS_YES) { @@ -378,8 +384,22 @@ static ucs_status_t uct_knem_md_open(const char *md_name, return UCS_OK; } -UCT_MD_COMPONENT_DEFINE(uct_knem_md_component, "knem", - uct_knem_query_md_resources, uct_knem_md_open, 0, - uct_knem_rkey_unpack, - uct_knem_rkey_release, "KNEM_", uct_knem_md_config_table, - uct_knem_md_config_t) +uct_component_t uct_knem_component = { + .query_md_resources = uct_knem_query_md_resources, + .md_open = uct_knem_md_open, + .cm_open = ucs_empty_function_return_unsupported, + .rkey_unpack = uct_knem_rkey_unpack, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = uct_knem_rkey_release, + .name = "knem", + .md_config = { + .name = "KNEM memory domain", + .prefix = "KNEM_", + .table = uct_knem_md_config_table, + .size = sizeof(uct_knem_md_config_t), + }, + .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_knem_component), + .flags = 0 +}; +UCT_COMPONENT_REGISTER(&uct_knem_component); diff --git a/src/uct/sm/knem/knem_md.h b/src/uct/sm/scopy/knem/knem_md.h similarity index 92% rename from 
src/uct/sm/knem/knem_md.h rename to src/uct/sm/scopy/knem/knem_md.h index b46db2aada8..f29d9c4f060 100644 --- a/src/uct/sm/knem/knem_md.h +++ b/src/uct/sm/scopy/knem/knem_md.h @@ -14,7 +14,7 @@ #include #include -extern uct_md_component_t uct_knem_md_component; +extern uct_component_t uct_knem_component; ucs_status_t uct_knem_md_query(uct_md_h md, uct_md_attr_t *md_attr); /** @@ -24,7 +24,7 @@ typedef struct uct_knem_md { struct uct_md super; /**< Domain info */ int knem_fd; /**< File descriptor for /dev/knem */ ucs_rcache_t *rcache; /**< Registration cache (can be NULL) */ - uct_linear_growth_t reg_cost; /**< Memory registration cost */ + ucs_linear_func_t reg_cost; /**< Memory registration cost */ } uct_knem_md_t; /** diff --git a/src/uct/sm/self/self.c b/src/uct/sm/self/self.c index 63b3a37e5ab..1431400828a 100644 --- a/src/uct/sm/self/self.c +++ b/src/uct/sm/self/self.c @@ -3,9 +3,14 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "self.h" #include +#include #include #include #include @@ -27,7 +32,20 @@ /* Forward declarations */ static uct_iface_ops_t uct_self_iface_ops; -static uct_md_component_t uct_self_md; +static uct_component_t uct_self_component; + + +static ucs_config_field_t uct_self_iface_config_table[] = { + {"", "", NULL, + ucs_offsetof(uct_self_iface_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, + + {"SEG_SIZE", "8k", + "Size of copy-out buffer", + ucs_offsetof(uct_self_iface_config_t, seg_size), UCS_CONFIG_TYPE_MEMUNITS}, + + {NULL} +}; static ucs_status_t uct_self_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *attr) @@ -35,7 +53,8 @@ static ucs_status_t uct_self_iface_query(uct_iface_h tl_iface, uct_iface_attr_t uct_self_iface_t *iface = ucs_derived_of(tl_iface, uct_self_iface_t); ucs_trace_func("iface=%p", iface); - memset(attr, 0, sizeof(*attr)); + + uct_base_iface_query(&iface->super, attr); attr->iface_addr_len = sizeof(uct_self_iface_addr_t); attr->device_addr_len 
= 0; @@ -89,9 +108,9 @@ static ucs_status_t uct_self_iface_query(uct_iface_h tl_iface, uct_iface_attr_t attr->cap.am.max_hdr = 0; attr->cap.am.max_iov = 1; - attr->latency.overhead = 0; - attr->latency.growth = 0; - attr->bandwidth = 6911 * 1024.0 * 1024.0; + attr->latency = ucs_linear_func_make(0, 0); + attr->bandwidth.dedicated = 6911.0 * UCS_MBYTE; + attr->bandwidth.shared = 0; attr->overhead = 10e-9; attr->priority = 0; @@ -140,12 +159,12 @@ static ucs_mpool_ops_t uct_self_iface_mpool_ops = { .obj_cleanup = NULL }; -static UCS_CLASS_DEFINE_DELETE_FUNC(uct_self_iface_t, uct_iface_t); - static UCS_CLASS_INIT_FUNC(uct_self_iface_t, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *tl_config) { + uct_self_iface_config_t *config = ucs_derived_of(tl_config, + uct_self_iface_config_t); ucs_status_t status; UCT_CHECK_PARAM(params->field_mask & UCT_IFACE_PARAM_FIELD_OPEN_MODE, @@ -160,11 +179,6 @@ static UCS_CLASS_INIT_FUNC(uct_self_iface_t, uct_md_h md, uct_worker_h worker, return UCS_ERR_INVALID_PARAM; } - if (strcmp(params->mode.device.dev_name, UCT_SELF_NAME) != 0) { - ucs_error("No device was found: %s", params->mode.device.dev_name); - return UCS_ERR_NO_DEVICE; - } - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_self_iface_ops, md, worker, params, tl_config UCS_STATS_ARG((params->field_mask & @@ -173,7 +187,7 @@ static UCS_CLASS_INIT_FUNC(uct_self_iface_t, uct_md_h md, uct_worker_h worker, UCS_STATS_ARG(UCT_SELF_NAME)); self->id = ucs_generate_uuid((uintptr_t)self); - self->send_size = tl_config->max_bcopy; + self->send_size = config->seg_size; status = ucs_mpool_init(&self->msg_mp, 0, self->send_size, 0, UCS_SYS_CACHE_LINE_SIZE, @@ -195,33 +209,20 @@ static UCS_CLASS_CLEANUP_FUNC(uct_self_iface_t) } UCS_CLASS_DEFINE(uct_self_iface_t, uct_base_iface_t); + +static UCS_CLASS_DEFINE_DELETE_FUNC(uct_self_iface_t, uct_iface_t); + static UCS_CLASS_DEFINE_NEW_FUNC(uct_self_iface_t, uct_iface_t, uct_md_h, uct_worker_h, const 
uct_iface_params_t*, const uct_iface_config_t*); -static ucs_status_t uct_self_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) +static ucs_status_t +uct_self_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) { - uct_tl_resource_desc_t *resource = 0; - - ucs_trace_func("md=%p", md); - - resource = ucs_calloc(1, sizeof(*resource), "resource desc"); - if (NULL == resource) { - ucs_error("Failed to allocate memory"); - return UCS_ERR_NO_MEMORY; - } - - ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", - UCT_SELF_NAME); - ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", - UCT_SELF_NAME); - resource->dev_type = UCT_DEVICE_TYPE_SELF; - - *num_resources_p = 1; - *resource_p = resource; - return UCS_OK; + return uct_single_device_resource(md, UCT_SM_DEVICE_NAME, + UCT_DEVICE_TYPE_SELF, + tl_devices_p, num_tl_devices_p); } static UCS_CLASS_INIT_FUNC(uct_self_ep_t, const uct_ep_params_t *params) @@ -255,8 +256,7 @@ ucs_status_t uct_self_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t header, UCT_CHECK_LENGTH(total_length, 0, iface->send_size, "am_short"); send_buffer = UCT_SELF_IFACE_SEND_BUFFER_GET(iface); - *(uint64_t*)send_buffer = header; - memcpy(send_buffer + sizeof(uint64_t), payload, length); + uct_am_short_fill_data(send_buffer, header, payload, length); UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, total_length); uct_self_iface_sendrecv_am(iface, id, send_buffer, total_length, "SHORT"); @@ -315,32 +315,25 @@ static uct_iface_ops_t uct_self_iface_ops = { .iface_is_reachable = uct_self_iface_is_reachable }; -UCT_TL_COMPONENT_DEFINE(uct_self_tl, uct_self_query_tl_resources, uct_self_iface_t, - UCT_SELF_NAME, "SELF_", uct_iface_config_table, uct_iface_config_t); -UCT_MD_REGISTER_TL(&uct_self_md, &uct_self_tl); +UCT_TL_DEFINE(&uct_self_component, self, uct_self_query_tl_devices, uct_self_iface_t, + "SELF_", 
uct_self_iface_config_table, uct_self_iface_config_t); static ucs_status_t uct_self_md_query(uct_md_h md, uct_md_attr_t *attr) { /* Dummy memory registration provided. No real memory handling exists */ - attr->cap.flags = UCT_MD_FLAG_REG | - UCT_MD_FLAG_NEED_RKEY; /* TODO ignore rkey in rma/amo ops */ - attr->cap.reg_mem_types = UCS_BIT(UCT_MD_MEM_TYPE_HOST); - attr->cap.mem_type = UCT_MD_MEM_TYPE_HOST; - attr->cap.max_alloc = 0; - attr->cap.max_reg = ULONG_MAX; - attr->rkey_packed_size = 0; /* uct_md_query adds UCT_MD_COMPONENT_NAME_MAX to this */ - attr->reg_cost.overhead = 0; - attr->reg_cost.growth = 0; + attr->cap.flags = UCT_MD_FLAG_REG | + UCT_MD_FLAG_NEED_RKEY; /* TODO ignore rkey in rma/amo ops */ + attr->cap.reg_mem_types = UCS_MEMORY_TYPES_CPU_ACCESSIBLE; + attr->cap.detect_mem_types = 0; + attr->cap.access_mem_type = UCS_MEMORY_TYPE_HOST; + attr->cap.max_alloc = 0; + attr->cap.max_reg = ULONG_MAX; + attr->rkey_packed_size = 0; /* uct_md_query adds UCT_COMPONENT_NAME_MAX to this */ + attr->reg_cost = ucs_linear_func_make(0, 0); memset(&attr->local_cpus, 0xff, sizeof(attr->local_cpus)); return UCS_OK; } -static ucs_status_t uct_self_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) -{ - return uct_single_md_resource(&uct_self_md, resources_p, num_resources_p); -} - static ucs_status_t uct_self_mem_reg(uct_md_h md, void *address, size_t length, unsigned flags, uct_mem_h *memh_p) { @@ -349,27 +342,27 @@ static ucs_status_t uct_self_mem_reg(uct_md_h md, void *address, size_t length, return UCS_OK; } -static ucs_status_t uct_self_md_open(const char *md_name, const uct_md_config_t *md_config, - uct_md_h *md_p) +static ucs_status_t uct_self_md_open(uct_component_t *component, const char *md_name, + const uct_md_config_t *config, uct_md_h *md_p) { static uct_md_ops_t md_ops = { - .close = (void*)ucs_empty_function, - .query = uct_self_md_query, - .mkey_pack = ucs_empty_function_return_success, - .mem_reg = uct_self_mem_reg, - 
.mem_dereg = ucs_empty_function_return_success, - .is_mem_type_owned = (void *)ucs_empty_function_return_zero, + .close = ucs_empty_function, + .query = uct_self_md_query, + .mkey_pack = ucs_empty_function_return_success, + .mem_reg = uct_self_mem_reg, + .mem_dereg = ucs_empty_function_return_success, + .detect_memory_type = ucs_empty_function_return_unsupported }; static uct_md_t md = { .ops = &md_ops, - .component = &uct_self_md + .component = &uct_self_component }; *md_p = &md; return UCS_OK; } -static ucs_status_t uct_self_md_rkey_unpack(uct_md_component_t *mdc, +static ucs_status_t uct_self_md_rkey_unpack(uct_component_t *component, const void *rkey_buffer, uct_rkey_t *rkey_p, void **handle_p) { @@ -382,8 +375,17 @@ static ucs_status_t uct_self_md_rkey_unpack(uct_md_component_t *mdc, return UCS_OK; } -static UCT_MD_COMPONENT_DEFINE(uct_self_md, UCT_SELF_NAME, - uct_self_query_md_resources, uct_self_md_open, NULL, - uct_self_md_rkey_unpack, - ucs_empty_function_return_success, "SELF_", - uct_md_config_table, uct_md_config_t); +static uct_component_t uct_self_component = { + .query_md_resources = uct_md_query_single_md_resource, + .md_open = uct_self_md_open, + .cm_open = ucs_empty_function_return_unsupported, + .rkey_unpack = uct_self_md_rkey_unpack, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = ucs_empty_function_return_success, + .name = UCT_SELF_NAME, + .md_config = UCT_MD_DEFAULT_CONFIG_INITIALIZER, + .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_self_component), + .flags = 0 +}; +UCT_COMPONENT_REGISTER(&uct_self_component); diff --git a/src/uct/sm/self/self.h b/src/uct/sm/self/self.h index 21c4bf9a7d4..f9a4b610cb1 100644 --- a/src/uct/sm/self/self.h +++ b/src/uct/sm/self/self.h @@ -14,6 +14,12 @@ typedef uint64_t uct_self_iface_addr_t; +typedef struct uct_self_iface_config { + uct_iface_config_t super; + size_t seg_size; /* Maximal send size */ +} uct_self_iface_config_t; + + 
typedef struct uct_self_iface { uct_base_iface_t super; uct_self_iface_addr_t id; /* Unique identifier for the instance */ diff --git a/src/uct/tcp/sockcm/sockcm_def.h b/src/uct/tcp/sockcm/sockcm_def.h new file mode 100644 index 00000000000..81195a084e6 --- /dev/null +++ b/src/uct/tcp/sockcm/sockcm_def.h @@ -0,0 +1,44 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2017. ALL RIGHTS RESERVED. + * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#ifndef UCT_SOCKCM_H +#define UCT_SOCKCM_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define UCT_SOCKCM_TL_NAME "sockcm" +#define UCT_SOCKCM_PRIV_DATA_LEN 2048 + +typedef struct uct_sockcm_iface uct_sockcm_iface_t; +typedef struct uct_sockcm_ep uct_sockcm_ep_t; + +typedef struct uct_sockcm_conn_param { + ssize_t length; + int fd; + char private_data[UCT_SOCKCM_PRIV_DATA_LEN]; +} uct_sockcm_conn_param_t; + +typedef struct uct_sockcm_ctx { + int sock_fd; + size_t recv_len; + uct_sockcm_iface_t *iface; + uct_sockcm_conn_param_t conn_param; + ucs_list_link_t list; +} uct_sockcm_ctx_t; + +ucs_status_t uct_sockcm_ep_set_sock_id(uct_sockcm_ep_t *ep); +void uct_sockcm_ep_put_sock_id(uct_sockcm_ctx_t *sock_id_ctx); + +#endif /* UCT_SOCKCM_H */ diff --git a/src/uct/tcp/sockcm/sockcm_ep.c b/src/uct/tcp/sockcm/sockcm_ep.c new file mode 100644 index 00000000000..cbeae27d883 --- /dev/null +++ b/src/uct/tcp/sockcm/sockcm_ep.c @@ -0,0 +1,406 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2017-2019. ALL RIGHTS RESERVED. + * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "sockcm_ep.h" +#include +#include +#include +#include + +#define UCT_SOCKCM_CB_FLAGS_CHECK(_flags) \ + do { \ + UCT_CB_FLAGS_CHECK(_flags); \ + if (!((_flags) & UCT_CB_FLAG_ASYNC)) { \ + return UCS_ERR_UNSUPPORTED; \ + } \ + } while (0) + +ucs_status_t uct_sockcm_ep_set_sock_id(uct_sockcm_ep_t *ep) +{ + ucs_status_t status; + struct sockaddr *dest_addr = NULL; + + ep->sock_id_ctx = ucs_malloc(sizeof(*ep->sock_id_ctx), "client sock_id_ctx"); + if (ep->sock_id_ctx == NULL) { + return UCS_ERR_NO_MEMORY; + } + + dest_addr = (struct sockaddr *) &(ep->remote_addr); + + status = ucs_socket_create(dest_addr->sa_family, SOCK_STREAM, + &ep->sock_id_ctx->sock_fd); + if (status != UCS_OK) { + ucs_debug("unable to create client socket for sockcm"); + ucs_free(ep->sock_id_ctx); + return status; + } + + return UCS_OK; +} + +void uct_sockcm_ep_put_sock_id(uct_sockcm_ctx_t *sock_id_ctx) +{ + close(sock_id_ctx->sock_fd); + ucs_free(sock_id_ctx); +} + +ucs_status_t uct_sockcm_ep_send_client_info(uct_sockcm_ep_t *ep) +{ + uct_sockcm_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_sockcm_iface_t); + uct_cm_ep_priv_data_pack_args_t pack_args; + uct_sockcm_conn_param_t conn_param; + char dev_name[UCT_DEVICE_NAME_MAX]; + ucs_status_t status; + + memset(&conn_param, 0, sizeof(uct_sockcm_conn_param_t)); + + /* get interface name associated with the connected client fd; use that for pack_cb */ + status = ucs_sockaddr_get_ifname(ep->sock_id_ctx->sock_fd, dev_name, + UCT_DEVICE_NAME_MAX); + if (UCS_OK != status) { + goto out; + } + + pack_args.field_mask = UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME; + ucs_strncpy_safe(pack_args.dev_name, dev_name, UCT_DEVICE_NAME_MAX); + + conn_param.length = ep->pack_cb(ep->pack_cb_arg, &pack_args, + (void*)conn_param.private_data); + if (conn_param.length < 0) { + ucs_error("sockcm client (iface=%p, ep = %p) failed to fill " + "private data. 
status: %s", + iface, ep, ucs_status_string((ucs_status_t)conn_param.length)); + status = UCS_ERR_IO_ERROR; + goto out; + } + + ucs_assert(conn_param.length <= UCT_SOCKCM_PRIV_DATA_LEN); + + status = ucs_socket_send(ep->sock_id_ctx->sock_fd, &conn_param, + sizeof(uct_sockcm_conn_param_t), NULL, NULL); + +out: + return status; +} + +static const char* +uct_sockcm_ep_conn_state_str(uct_sockcm_ep_conn_state_t state) +{ + switch (state) { + case UCT_SOCKCM_EP_CONN_STATE_SOCK_CONNECTING: + return "UCT_SOCKCM_EP_CONN_STATE_SOCK_CONNECTING"; + case UCT_SOCKCM_EP_CONN_STATE_INFO_SENT: + return "UCT_SOCKCM_EP_CONN_STATE_INFO_SENT"; + case UCT_SOCKCM_EP_CONN_STATE_CLOSED: + return "UCT_SOCKCM_EP_CONN_STATE_CLOSED"; + case UCT_SOCKCM_EP_CONN_STATE_CONNECTED: + return "UCT_SOCKCM_EP_CONN_STATE_CONNECTED"; + default: + ucs_fatal("invaild sockcm endpoint state %d", state); + } +} + +static void uct_sockcm_change_state(uct_sockcm_ep_t *ep, + uct_sockcm_ep_conn_state_t conn_state, + ucs_status_t status) +{ + uct_sockcm_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_sockcm_iface_t); + + pthread_mutex_lock(&ep->ops_mutex); + ucs_debug("changing ep with status %s from state %s to state %s, status %s", + ucs_status_string(ep->status), + uct_sockcm_ep_conn_state_str(ep->conn_state), + uct_sockcm_ep_conn_state_str(conn_state), + ucs_status_string(status)); + if ((ep->status != UCS_OK) && + (ep->conn_state == UCT_SOCKCM_EP_CONN_STATE_CLOSED)) { + /* Do not handle failure twice for closed EP */ + pthread_mutex_unlock(&ep->ops_mutex); + return; + } + + ep->status = status; + ep->conn_state = conn_state; + + if (conn_state == UCT_SOCKCM_EP_CONN_STATE_CLOSED) { + uct_sockcm_ep_set_failed(&iface->super.super, &ep->super.super, status); + } + + uct_sockcm_ep_invoke_completions(ep, status); + pthread_mutex_unlock(&ep->ops_mutex); +} + +static void uct_sockcm_handle_sock_connect(uct_sockcm_ep_t *ep) +{ + char sockaddr_str[UCS_SOCKADDR_STRING_LEN]; + int fd = 
ep->sock_id_ctx->sock_fd; + ucs_status_t status; + + if (!ucs_socket_is_connected(fd)) { + ucs_error("failed to connect to %s", + ucs_sockaddr_str((struct sockaddr*)&ep->remote_addr, + sockaddr_str, sizeof(sockaddr_str))); + uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CLOSED, + UCS_ERR_UNREACHABLE); + goto err; + } + + status = uct_sockcm_ep_send_client_info(ep); + if (status != UCS_OK) { + ucs_error("failed to send client info: %s", ucs_status_string(status)); + uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CLOSED, status); + goto err; + } + + ep->conn_state = UCT_SOCKCM_EP_CONN_STATE_INFO_SENT; + + /* Call current handler when server responds to sent message */ + if (UCS_OK != ucs_async_modify_handler(fd, UCS_EVENT_SET_EVREAD)) { + ucs_error("failed to modify async handler for fd %d", fd); + uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CLOSED, + UCS_ERR_IO_ERROR); + goto err; + } + + return; + +err: + status = ucs_async_modify_handler(fd, 0); + if (status != UCS_OK) { + ucs_debug("unable to modify handler"); + } +} + +static void uct_sockcm_handle_info_sent(uct_sockcm_ep_t *ep) +{ + ucs_status_t status; + size_t recv_len; + char notif_val; + + recv_len = sizeof(notif_val); + status = ucs_socket_recv_nb(ep->sock_id_ctx->sock_fd, ¬if_val, + &recv_len, NULL, NULL); + if (UCS_ERR_NO_PROGRESS == status) { + /* will call recv again when ready */ + return; + } + + ucs_async_remove_handler(ep->sock_id_ctx->sock_fd, 0); + + if (UCS_OK != status) { + /* receive notif failed, close the connection */ + uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CLOSED, status); + return; + } + + if (notif_val == UCT_SOCKCM_IFACE_NOTIFY_ACCEPT) { + ucs_debug("event_handler OK after accept"); + uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CONNECTED, UCS_OK); + } else { + ucs_debug("event_handler REJECTED after reject"); + uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CLOSED, + UCS_ERR_REJECTED); + } +} + +static void 
uct_sockcm_ep_event_handler(int fd, int events, void *arg) +{ + uct_sockcm_ep_t *ep = (uct_sockcm_ep_t *) arg; + + switch (ep->conn_state) { + case UCT_SOCKCM_EP_CONN_STATE_SOCK_CONNECTING: + uct_sockcm_handle_sock_connect(ep); + break; + case UCT_SOCKCM_EP_CONN_STATE_INFO_SENT: + uct_sockcm_handle_info_sent(ep); + break; + case UCT_SOCKCM_EP_CONN_STATE_CONNECTED: + if (UCS_OK != ucs_async_modify_handler(fd, 0)) { + ucs_warn("unable to turn off event notifications on %d", fd); + } + uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CONNECTED, UCS_OK); + break; + case UCT_SOCKCM_EP_CONN_STATE_CLOSED: + default: + ucs_debug("handling closed/default state, ep %p fd %d", ep, fd); + uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CLOSED, + UCS_ERR_IO_ERROR); + break; + } +} + +static UCS_CLASS_INIT_FUNC(uct_sockcm_ep_t, const uct_ep_params_t *params) +{ + const ucs_sock_addr_t *sockaddr = params->sockaddr; + uct_sockcm_iface_t *iface = NULL; + struct sockaddr *param_sockaddr = NULL; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + ucs_status_t status; + size_t sockaddr_len; + + iface = ucs_derived_of(params->iface, uct_sockcm_iface_t); + UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super); + + if (iface->is_server) { + return UCS_ERR_UNSUPPORTED; + } + + if (!(params->field_mask & UCT_EP_PARAM_FIELD_SOCKADDR)) { + return UCS_ERR_INVALID_PARAM; + } + + UCT_SOCKCM_CB_FLAGS_CHECK((params->field_mask & + UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS) ? + params->sockaddr_cb_flags : 0); + + self->pack_cb = (params->field_mask & + UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB) ? + params->sockaddr_pack_cb : NULL; + self->pack_cb_arg = (params->field_mask & + UCT_EP_PARAM_FIELD_USER_DATA) ? + params->user_data : NULL; + self->pack_cb_flags = (params->field_mask & + UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS) ? 
+ params->sockaddr_cb_flags : 0; + pthread_mutex_init(&self->ops_mutex, NULL); + ucs_queue_head_init(&self->ops); + + param_sockaddr = (struct sockaddr *) sockaddr->addr; + if (UCS_OK != ucs_sockaddr_sizeof(param_sockaddr, &sockaddr_len)) { + ucs_error("sockcm ep: unknown remote sa_family=%d", + sockaddr->addr->sa_family); + status = UCS_ERR_IO_ERROR; + goto err; + } + + memcpy(&self->remote_addr, param_sockaddr, sockaddr_len); + + self->slow_prog_id = UCS_CALLBACKQ_ID_NULL; + + status = uct_sockcm_ep_set_sock_id(self); + if (status != UCS_OK) { + goto err; + } + + status = ucs_sys_fcntl_modfl(self->sock_id_ctx->sock_fd, O_NONBLOCK, 0); + if (status != UCS_OK) { + goto sock_err; + } + + status = ucs_socket_connect(self->sock_id_ctx->sock_fd, param_sockaddr); + if (UCS_STATUS_IS_ERR(status)) { + self->conn_state = UCT_SOCKCM_EP_CONN_STATE_CLOSED; + goto sock_err; + } + + self->conn_state = UCT_SOCKCM_EP_CONN_STATE_SOCK_CONNECTING; + self->status = UCS_INPROGRESS; + + /* set ep->status before event handler call to avoid simultaneous writes to state*/ + status = ucs_async_set_event_handler(iface->super.worker->async->mode, + self->sock_id_ctx->sock_fd, + UCS_EVENT_SET_EVWRITE, + uct_sockcm_ep_event_handler, + self, iface->super.worker->async); + if (status != UCS_OK) { + goto sock_err; + } + + ucs_debug("created an SOCKCM endpoint on iface %p, " + "remote addr: %s", iface, + ucs_sockaddr_str(param_sockaddr, + ip_port_str, UCS_SOCKADDR_STRING_LEN)); + return UCS_OK; + +sock_err: + uct_sockcm_ep_put_sock_id(self->sock_id_ctx); +err: + ucs_debug("error in sock connect"); + pthread_mutex_destroy(&self->ops_mutex); + + return status; +} + +static UCS_CLASS_CLEANUP_FUNC(uct_sockcm_ep_t) +{ + uct_sockcm_iface_t *iface = ucs_derived_of(self->super.super.iface, + uct_sockcm_iface_t); + + ucs_debug("sockcm_ep %p: destroying", self); + + UCS_ASYNC_BLOCK(iface->super.worker->async); + + ucs_async_remove_handler(self->sock_id_ctx->sock_fd, 1); + 
uct_sockcm_ep_put_sock_id(self->sock_id_ctx); + + uct_worker_progress_unregister_safe(&iface->super.worker->super, + &self->slow_prog_id); + + pthread_mutex_destroy(&self->ops_mutex); + if (!ucs_queue_is_empty(&self->ops)) { + ucs_warn("destroying endpoint %p with not completed operations", self); + } + + UCS_ASYNC_UNBLOCK(iface->super.worker->async); +} + +UCS_CLASS_DEFINE(uct_sockcm_ep_t, uct_base_ep_t) +UCS_CLASS_DEFINE_NEW_FUNC(uct_sockcm_ep_t, uct_ep_t, const uct_ep_params_t *); +UCS_CLASS_DEFINE_DELETE_FUNC(uct_sockcm_ep_t, uct_ep_t); + +static unsigned uct_sockcm_client_err_handle_progress(void *arg) +{ + uct_sockcm_ep_t *sockcm_ep = arg; + uct_sockcm_iface_t *iface = ucs_derived_of(sockcm_ep->super.super.iface, + uct_sockcm_iface_t); + + ucs_trace_func("err_handle ep=%p", sockcm_ep); + UCS_ASYNC_BLOCK(iface->super.worker->async); + + sockcm_ep->slow_prog_id = UCS_CALLBACKQ_ID_NULL; + uct_set_ep_failed(&UCS_CLASS_NAME(uct_sockcm_ep_t), &sockcm_ep->super.super, + sockcm_ep->super.super.iface, sockcm_ep->status); + + UCS_ASYNC_UNBLOCK(iface->super.worker->async); + return 0; +} + +void uct_sockcm_ep_set_failed(uct_iface_t *iface, uct_ep_h ep, ucs_status_t status) +{ + uct_sockcm_iface_t *sockcm_iface = ucs_derived_of(iface, uct_sockcm_iface_t); + uct_sockcm_ep_t *sockcm_ep = ucs_derived_of(ep, uct_sockcm_ep_t); + + if (sockcm_iface->super.err_handler_flags & UCT_CB_FLAG_ASYNC) { + uct_set_ep_failed(&UCS_CLASS_NAME(uct_sockcm_ep_t), &sockcm_ep->super.super, + &sockcm_iface->super.super, status); + } else { + sockcm_ep->status = status; + uct_worker_progress_register_safe(&sockcm_iface->super.worker->super, + uct_sockcm_client_err_handle_progress, + sockcm_ep, UCS_CALLBACKQ_FLAG_ONESHOT, + &sockcm_ep->slow_prog_id); + } +} + +void uct_sockcm_ep_invoke_completions(uct_sockcm_ep_t *ep, ucs_status_t status) +{ + uct_sockcm_ep_op_t *op; + + ucs_assert(pthread_mutex_trylock(&ep->ops_mutex) == EBUSY); + + ucs_queue_for_each_extract(op, &ep->ops, queue_elem, 1) { + 
pthread_mutex_unlock(&ep->ops_mutex); + uct_invoke_completion(op->user_comp, status); + ucs_free(op); + pthread_mutex_lock(&ep->ops_mutex); + } +} diff --git a/src/uct/tcp/sockcm/sockcm_ep.h b/src/uct/tcp/sockcm/sockcm_ep.h new file mode 100644 index 00000000000..9f0bd64a833 --- /dev/null +++ b/src/uct/tcp/sockcm/sockcm_ep.h @@ -0,0 +1,49 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2017-2019. ALL RIGHTS RESERVED. + * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#ifndef UCT_SOCKCM_EP_H +#define UCT_SOCKCM_EP_H + +#include "sockcm_iface.h" + +typedef struct uct_sockcm_ep_op uct_sockcm_ep_op_t; + +typedef enum uct_sockcm_ep_conn_state { + UCT_SOCKCM_EP_CONN_STATE_SOCK_CONNECTING, + UCT_SOCKCM_EP_CONN_STATE_INFO_SENT, + UCT_SOCKCM_EP_CONN_STATE_CLOSED, + UCT_SOCKCM_EP_CONN_STATE_CONNECTED +} uct_sockcm_ep_conn_state_t; + +struct uct_sockcm_ep_op { + ucs_queue_elem_t queue_elem; + uct_completion_t *user_comp; +}; + +struct uct_sockcm_ep { + uct_base_ep_t super; + uct_cm_ep_priv_data_pack_callback_t pack_cb; + void *pack_cb_arg; + uint32_t pack_cb_flags; + uct_sockcm_ep_conn_state_t conn_state; + + pthread_mutex_t ops_mutex; /* guards ops and status */ + ucs_queue_head_t ops; + ucs_status_t status; /* client EP status */ + + struct sockaddr_storage remote_addr; + uct_worker_cb_id_t slow_prog_id; + uct_sockcm_ctx_t *sock_id_ctx; +}; + +UCS_CLASS_DECLARE_NEW_FUNC(uct_sockcm_ep_t, uct_ep_t, const uct_ep_params_t *); +UCS_CLASS_DECLARE_DELETE_FUNC(uct_sockcm_ep_t, uct_ep_t); + +void uct_sockcm_ep_set_failed(uct_iface_t *iface, uct_ep_h ep, ucs_status_t status); + +void uct_sockcm_ep_invoke_completions(uct_sockcm_ep_t *ep, ucs_status_t status); + +#endif diff --git a/src/uct/tcp/sockcm/sockcm_iface.c b/src/uct/tcp/sockcm/sockcm_iface.c new file mode 100644 index 00000000000..51c8841f15a --- /dev/null +++ b/src/uct/tcp/sockcm/sockcm_iface.c @@ -0,0 +1,430 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 
2017-2019. ALL RIGHTS RESERVED. + * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "sockcm_iface.h" +#include "sockcm_ep.h" + +#include +#include +#include +#include + + +enum uct_sockcm_process_event_flags { + UCT_SOCKCM_PROCESS_EVENT_DESTROY_SOCK_ID_FLAG = UCS_BIT(0), + UCT_SOCKCM_PROCESS_EVENT_ACK_EVENT_FLAG = UCS_BIT(1) +}; + +static ucs_config_field_t uct_sockcm_iface_config_table[] = { + {"", "", NULL, + ucs_offsetof(uct_sockcm_iface_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, + + {"BACKLOG", "1024", + "Maximum number of pending connections for a listening socket.", + ucs_offsetof(uct_sockcm_iface_config_t, backlog), UCS_CONFIG_TYPE_UINT}, + + {NULL} +}; + +static UCS_CLASS_DECLARE_DELETE_FUNC(uct_sockcm_iface_t, uct_iface_t); + +static ucs_status_t uct_sockcm_iface_query(uct_iface_h tl_iface, + uct_iface_attr_t *iface_attr) +{ + uct_sockcm_iface_t *iface = ucs_derived_of(tl_iface, uct_sockcm_iface_t); + struct sockaddr_storage addr; + ucs_status_t status; + + uct_base_iface_query(&iface->super, iface_attr); + + iface_attr->iface_addr_len = sizeof(ucs_sock_addr_t); + iface_attr->device_addr_len = 0; + iface_attr->cap.flags = UCT_IFACE_FLAG_CONNECT_TO_SOCKADDR | + UCT_IFACE_FLAG_CB_ASYNC | + UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE; + iface_attr->max_conn_priv = UCT_SOCKCM_MAX_CONN_PRIV; + + if (iface->is_server) { + socklen_t len = sizeof(struct sockaddr_storage); + if (getsockname(iface->listen_fd, (struct sockaddr *)&addr, &len)) { + ucs_error("sockcm_iface: getsockname failed %m"); + return UCS_ERR_IO_ERROR; + } + + status = ucs_sockaddr_copy((struct sockaddr *)&iface_attr->listen_sockaddr, + (const struct sockaddr *)&addr); + if (status != UCS_OK) { + return status; + } + } + + return UCS_OK; +} + +static ucs_status_t uct_sockcm_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t *iface_addr) +{ + ucs_sock_addr_t 
*sockcm_addr = (ucs_sock_addr_t *)iface_addr; + + sockcm_addr->addr = NULL; + sockcm_addr->addrlen = 0; + return UCS_OK; +} + +static ucs_status_t uct_sockcm_iface_notify_client(int notif_val, + uct_conn_request_h conn_request) +{ + char notif = notif_val; + int fd; + + fd = ((uct_sockcm_ctx_t *) conn_request)->sock_fd; + + return ucs_socket_send(fd, ¬if, sizeof(notif), NULL, NULL); +} + +static ucs_status_t uct_sockcm_iface_accept(uct_iface_h tl_iface, + uct_conn_request_h conn_request) +{ + return uct_sockcm_iface_notify_client(UCT_SOCKCM_IFACE_NOTIFY_ACCEPT, conn_request); +} + +static ucs_status_t uct_sockcm_iface_reject(uct_iface_h tl_iface, + uct_conn_request_h conn_request) +{ + return uct_sockcm_iface_notify_client(UCT_SOCKCM_IFACE_NOTIFY_REJECT, conn_request); +} + +static ucs_status_t uct_sockcm_ep_flush(uct_ep_h tl_ep, unsigned flags, + uct_completion_t *comp) +{ + uct_sockcm_ep_t *ep = ucs_derived_of(tl_ep, uct_sockcm_ep_t); + ucs_status_t status; + uct_sockcm_ep_op_t *op; + + pthread_mutex_lock(&ep->ops_mutex); + status = ep->status; + if ((status == UCS_INPROGRESS) && (comp != NULL)) { + op = ucs_malloc(sizeof(*op), "uct_sockcm_ep_flush op"); + if (op != NULL) { + op->user_comp = comp; + ucs_queue_push(&ep->ops, &op->queue_elem); + } else { + status = UCS_ERR_NO_MEMORY; + } + } + pthread_mutex_unlock(&ep->ops_mutex); + + return status; +} + + +static uct_iface_ops_t uct_sockcm_iface_ops = { + .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_sockcm_ep_t), + .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_sockcm_ep_t), + .ep_flush = uct_sockcm_ep_flush, + .ep_fence = uct_base_ep_fence, + .ep_pending_purge = ucs_empty_function, + .iface_accept = uct_sockcm_iface_accept, + .iface_reject = uct_sockcm_iface_reject, + .iface_progress_enable = (uct_iface_progress_enable_func_t)ucs_empty_function_return_success, + .iface_progress_disable = (uct_iface_progress_disable_func_t)ucs_empty_function_return_success, + .iface_progress = ucs_empty_function_return_zero, + 
.iface_flush = uct_base_iface_flush, + .iface_fence = uct_base_iface_fence, + .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_sockcm_iface_t), + .iface_query = uct_sockcm_iface_query, + .iface_is_reachable = (uct_iface_is_reachable_func_t)ucs_empty_function_return_zero, + .iface_get_device_address = (uct_iface_get_device_address_func_t)ucs_empty_function_return_success, + .iface_get_address = uct_sockcm_iface_get_address +}; + +static ucs_status_t uct_sockcm_iface_process_conn_req(uct_sockcm_ctx_t *sock_id_ctx) +{ + uct_sockcm_iface_t *iface = sock_id_ctx->iface; + uct_sockcm_conn_param_t *conn_param = &sock_id_ctx->conn_param; + + ucs_debug("process conn req conn_param = %p, conn_param->length = %ld", + conn_param, conn_param->length); + iface->conn_request_cb(&iface->super.super, iface->conn_request_arg, sock_id_ctx, + conn_param->private_data, conn_param->length); + return UCS_OK; +} + +static void uct_sockcm_iface_recv_handler(int fd, int events, void *arg) +{ + uct_sockcm_ctx_t *sock_id_ctx = (uct_sockcm_ctx_t *) arg; + ucs_status_t status; + size_t recv_len; + + /* attempt another receive only if initial receive was not successful */ + recv_len = sizeof(uct_sockcm_conn_param_t) - sock_id_ctx->recv_len; + if (recv_len == 0) { + goto out_remove_handler; + } + + status = ucs_socket_recv_nb(sock_id_ctx->sock_fd, + UCS_PTR_BYTE_OFFSET(&sock_id_ctx->conn_param, + sock_id_ctx->recv_len), + &recv_len, NULL, NULL); + if ((status == UCS_ERR_CANCELED) || (status == UCS_ERR_IO_ERROR)) { + ucs_warn("recv failed in recv handler"); + /* TODO: clean up resources allocated for client endpoint? */ + return; + } + + sock_id_ctx->recv_len += ((UCS_ERR_NO_PROGRESS == status) ? 
0 : recv_len); + if (sock_id_ctx->recv_len != sizeof(uct_sockcm_conn_param_t)) { + /* handler should be notified when remaining pieces show up */ + return; + } + + if (UCS_OK != uct_sockcm_iface_process_conn_req((uct_sockcm_ctx_t*)arg)) { + ucs_error("unable to process connection request"); + } + +out_remove_handler: + status = ucs_async_modify_handler(fd, 0); + if (status != UCS_OK) { + ucs_debug("unable to modify handler"); + } +} + +static void uct_sockcm_iface_event_handler(int fd, int events, void *arg) +{ + size_t recv_len = 0; + uct_sockcm_iface_t *iface = arg; + uct_sockcm_ctx_t *sock_id_ctx = NULL; + struct sockaddr peer_addr; + socklen_t addrlen; + int accept_fd; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + ucs_status_t status; + + addrlen = sizeof(struct sockaddr); + accept_fd = accept(iface->listen_fd, (struct sockaddr*)&peer_addr, &addrlen); + if (accept_fd == -1) { + if ((errno == EAGAIN) || (errno == EINTR)) { + ucs_debug("accept(fd=%d) failed: %m", iface->listen_fd); + } else { + /* accept failed here, let the client try again */ + ucs_warn("accept(fd=%d) failed with non-recoverable error %m", + iface->listen_fd); + } + return; + } + + ucs_debug("sockcm_iface %p: accepted connection from %s at fd %d %m", iface, + ucs_sockaddr_str(&peer_addr, ip_port_str, + UCS_SOCKADDR_STRING_LEN), accept_fd); + + /* Unlike rdmacm, socket connect/accept does not permit exchange of + * connection parameters but we need to use send/recv on top of that + * We simulate that with an explicit receive */ + + sock_id_ctx = ucs_malloc(sizeof(uct_sockcm_ctx_t), "accepted sock_id_ctx"); + if (sock_id_ctx == NULL) { + ucs_error("sockcm_listener: unable to create mem for accepted fd"); + close(accept_fd); + return; + } + + sock_id_ctx->recv_len = 0; + sock_id_ctx->sock_fd = accept_fd; + sock_id_ctx->iface = iface; + + status = ucs_sys_fcntl_modfl(sock_id_ctx->sock_fd, O_NONBLOCK, 0); + if (status != UCS_OK) { + ucs_error("sockcm_listener: unable make accepted fd 
non-blocking"); + goto err; + } + + recv_len = sizeof(sock_id_ctx->conn_param); + + status = ucs_socket_recv_nb(accept_fd, &sock_id_ctx->conn_param, &recv_len, + NULL, NULL); + if (UCS_OK != status) { + sock_id_ctx->recv_len = ((UCS_ERR_NO_PROGRESS == status) ? 0: recv_len); + status = ucs_async_set_event_handler(iface->super.worker->async->mode, + sock_id_ctx->sock_fd, + UCS_EVENT_SET_EVREAD, + uct_sockcm_iface_recv_handler, + sock_id_ctx, + iface->super.worker->async); + if (status != UCS_OK) { + ucs_fatal("sockcm_listener: unable to create handler for new connection"); + goto err; + } + ucs_debug("assigning recv handler for message from client"); + } else { + ucs_debug("not assigning recv handler for message from client"); + if (UCS_OK != uct_sockcm_iface_process_conn_req(sock_id_ctx)) { + ucs_error("Unable to process connection request"); + } + } + + UCS_ASYNC_BLOCK(iface->super.worker->async); + ucs_list_add_tail(&iface->used_sock_ids_list, &sock_id_ctx->list); + UCS_ASYNC_UNBLOCK(iface->super.worker->async); + + return; + +err: + uct_sockcm_ep_put_sock_id(sock_id_ctx); + return; +} + +static UCS_CLASS_INIT_FUNC(uct_sockcm_iface_t, uct_md_h md, uct_worker_h worker, + const uct_iface_params_t *params, + const uct_iface_config_t *tl_config) +{ + uct_sockcm_iface_config_t *config = ucs_derived_of(tl_config, + uct_sockcm_iface_config_t); + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + ucs_status_t status; + struct sockaddr *param_sockaddr; + int param_sockaddr_len; + + UCT_CHECK_PARAM(params->field_mask & UCT_IFACE_PARAM_FIELD_OPEN_MODE, + "UCT_IFACE_PARAM_FIELD_OPEN_MODE is not defined"); + + UCT_CHECK_PARAM((params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER) || + (params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_CLIENT), + "Invalid open mode %zu", params->open_mode); + + UCT_CHECK_PARAM(!(params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER) || + (params->field_mask & UCT_IFACE_PARAM_FIELD_SOCKADDR), + "UCT_IFACE_PARAM_FIELD_SOCKADDR is not defined " + 
"for UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER"); + + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_sockcm_iface_ops, md, worker, + params, tl_config + UCS_STATS_ARG((params->field_mask & + UCT_IFACE_PARAM_FIELD_STATS_ROOT) ? + params->stats_root : NULL) + UCS_STATS_ARG(UCT_SOCKCM_TL_NAME)); + + if (self->super.worker->async == NULL) { + ucs_error("sockcm must have async != NULL"); + return UCS_ERR_INVALID_PARAM; + } + if (self->super.worker->async->mode == UCS_ASYNC_MODE_SIGNAL) { + ucs_warn("sockcm does not support SIGIO"); + } + + self->listen_fd = -1; + + if (params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER) { + + if (!(params->mode.sockaddr.cb_flags & UCT_CB_FLAG_ASYNC)) { + return UCS_ERR_INVALID_PARAM; + } + + param_sockaddr = (struct sockaddr *)params->mode.sockaddr.listen_sockaddr.addr; + param_sockaddr_len = params->mode.sockaddr.listen_sockaddr.addrlen; + + status = ucs_socket_create(param_sockaddr->sa_family, SOCK_STREAM, + &self->listen_fd); + if (status != UCS_OK) { + return status; + } + + status = ucs_sys_fcntl_modfl(self->listen_fd, O_NONBLOCK, 0); + if (status != UCS_OK) { + goto err_close_sock; + } + + if (0 > bind(self->listen_fd, param_sockaddr, param_sockaddr_len)) { + ucs_error("bind(fd=%d) failed: %m", self->listen_fd); + status = (errno == EADDRINUSE) ? 
UCS_ERR_BUSY : UCS_ERR_IO_ERROR; + goto err_close_sock; + } + + if (0 > listen(self->listen_fd, config->backlog)) { + ucs_error("listen(fd=%d; backlog=%d)", self->listen_fd, + config->backlog); + status = UCS_ERR_IO_ERROR; + goto err_close_sock; + } + + status = ucs_async_set_event_handler(self->super.worker->async->mode, + self->listen_fd, + UCS_EVENT_SET_EVREAD | + UCS_EVENT_SET_EVERR, + uct_sockcm_iface_event_handler, + self, self->super.worker->async); + if (status != UCS_OK) { + goto err_close_sock; + } + + ucs_debug("iface (%p) sockcm id %d listening on %s", self, + self->listen_fd, + ucs_sockaddr_str(param_sockaddr, ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + + self->cb_flags = params->mode.sockaddr.cb_flags; + self->conn_request_cb = params->mode.sockaddr.conn_request_cb; + self->conn_request_arg = params->mode.sockaddr.conn_request_arg; + self->is_server = 1; + } else { + self->is_server = 0; + } + + ucs_list_head_init(&self->used_sock_ids_list); + + return UCS_OK; + + err_close_sock: + close(self->listen_fd); + return status; +} + +static UCS_CLASS_CLEANUP_FUNC(uct_sockcm_iface_t) +{ + uct_sockcm_ctx_t *sock_id_ctx; + + if (self->is_server) { + if (-1 != self->listen_fd) { + ucs_debug("cleaning listen_fd = %d", self->listen_fd); + ucs_async_remove_handler(self->listen_fd, 1); + close(self->listen_fd); + } + } + + UCS_ASYNC_BLOCK(self->super.worker->async); + + while (!ucs_list_is_empty(&self->used_sock_ids_list)) { + sock_id_ctx = ucs_list_extract_head(&self->used_sock_ids_list, + uct_sockcm_ctx_t, list); + ucs_debug("cleaning server fd = %d", sock_id_ctx->sock_fd); + ucs_async_remove_handler(sock_id_ctx->sock_fd, 1); + uct_sockcm_ep_put_sock_id(sock_id_ctx); + } + + UCS_ASYNC_UNBLOCK(self->super.worker->async); +} + +UCS_CLASS_DEFINE(uct_sockcm_iface_t, uct_base_iface_t); +static UCS_CLASS_DEFINE_NEW_FUNC(uct_sockcm_iface_t, uct_iface_t, uct_md_h, + uct_worker_h, const uct_iface_params_t *, + const uct_iface_config_t *); +static 
UCS_CLASS_DEFINE_DELETE_FUNC(uct_sockcm_iface_t, uct_iface_t); + +static ucs_status_t +uct_sockcm_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) +{ + *num_tl_devices_p = 0; + *tl_devices_p = NULL; + return UCS_OK; +} + +UCT_TL_DEFINE(&uct_sockcm_component, sockcm, uct_sockcm_query_tl_devices, + uct_sockcm_iface_t, "SOCKCM_", uct_sockcm_iface_config_table, + uct_sockcm_iface_config_t); diff --git a/src/uct/tcp/sockcm/sockcm_iface.h b/src/uct/tcp/sockcm/sockcm_iface.h new file mode 100644 index 00000000000..e39fd0f8f4f --- /dev/null +++ b/src/uct/tcp/sockcm/sockcm_iface.h @@ -0,0 +1,41 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2017. ALL RIGHTS RESERVED. + * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#ifndef UCT_SOCKCM_IFACE_H +#define UCT_SOCKCM_IFACE_H + +#include "sockcm_def.h" +#include "sockcm_md.h" + +#define UCT_SOCKCM_MAX_CONN_PRIV \ + (UCT_SOCKCM_PRIV_DATA_LEN - sizeof(ssize_t)) + + +typedef enum uct_sockcm_iface_notify { + UCT_SOCKCM_IFACE_NOTIFY_ACCEPT = 0, + UCT_SOCKCM_IFACE_NOTIFY_REJECT +} uct_sockcm_iface_notify_t; + +typedef struct uct_sockcm_iface_config { + uct_iface_config_t super; + unsigned backlog; +} uct_sockcm_iface_config_t; + +struct uct_sockcm_iface { + uct_base_iface_t super; + + int listen_fd; + + uint8_t is_server; + /* Fields used only for server side */ + void *conn_request_arg; + uct_sockaddr_conn_request_callback_t conn_request_cb; + uint32_t cb_flags; + + /* Field used only for client side */ + ucs_list_link_t used_sock_ids_list; +}; +#endif diff --git a/src/uct/tcp/sockcm/sockcm_md.c b/src/uct/tcp/sockcm/sockcm_md.c new file mode 100644 index 00000000000..350795d10ad --- /dev/null +++ b/src/uct/tcp/sockcm/sockcm_md.c @@ -0,0 +1,143 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2017. ALL RIGHTS RESERVED. + * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "sockcm_md.h" + +#define UCT_SOCKCM_NAME "sockcm" + +static ucs_config_field_t uct_sockcm_md_config_table[] = { + {"", "", NULL, + ucs_offsetof(uct_sockcm_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)}, + {NULL} +}; + +static void uct_sockcm_md_close(uct_md_h md); + +static uct_md_ops_t uct_sockcm_md_ops = { + .close = uct_sockcm_md_close, + .query = uct_sockcm_md_query, + .is_sockaddr_accessible = uct_sockcm_is_sockaddr_accessible, + .detect_memory_type = ucs_empty_function_return_unsupported, +}; + +static void uct_sockcm_md_close(uct_md_h md) +{ + uct_sockcm_md_t *sockcm_md = ucs_derived_of(md, uct_sockcm_md_t); + ucs_free(sockcm_md); +} + +ucs_status_t uct_sockcm_md_query(uct_md_h md, uct_md_attr_t *md_attr) +{ + md_attr->cap.flags = UCT_MD_FLAG_SOCKADDR; + md_attr->cap.reg_mem_types = 0; + md_attr->cap.access_mem_type = UCS_MEMORY_TYPE_HOST; + md_attr->cap.detect_mem_types = 0; + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = 0; + md_attr->rkey_packed_size = 0; + md_attr->reg_cost = ucs_linear_func_make(0, 0); + memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); + return UCS_OK; +} + +int uct_sockcm_is_sockaddr_accessible(uct_md_h md, const ucs_sock_addr_t *sockaddr, + uct_sockaddr_accessibility_t mode) +{ + struct sockaddr *param_sockaddr = NULL; + int is_accessible = 0; + int sock_fd = -1; + size_t sockaddr_len = 0; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + + param_sockaddr = (struct sockaddr *) sockaddr->addr; + + if ((mode != UCT_SOCKADDR_ACC_LOCAL) && (mode != UCT_SOCKADDR_ACC_REMOTE)) { + ucs_error("Unknown sockaddr accessibility mode %d", mode); + return 0; + } + + sock_fd = socket(param_sockaddr->sa_family, SOCK_STREAM, 0); + if (-1 == sock_fd) { + return 0; + } + + if (UCS_OK != ucs_sockaddr_sizeof(param_sockaddr, &sockaddr_len)) { + ucs_debug("family != AF_INET and != AF_INET6"); + goto out_destroy_id; + } + + if (mode == 
UCT_SOCKADDR_ACC_LOCAL) { + ucs_debug("addr_len = %ld", (long int) sockaddr_len); + + if (bind(sock_fd, param_sockaddr, sockaddr_len)) { + ucs_debug("bind(addr = %s) failed: %m", + ucs_sockaddr_str((struct sockaddr *)sockaddr->addr, + ip_port_str, UCS_SOCKADDR_STRING_LEN)); + goto out_destroy_id; + } + + if (ucs_sockaddr_is_inaddr_any(param_sockaddr)) { + is_accessible = 1; + goto out_print; + } + } + + is_accessible = 1; /* if UCT_SOCKADDR_ACC_REMOTE == mode*/ + + out_print: + ucs_debug("address %s is accessible from sockcm_md %p with mode: %d", + ucs_sockaddr_str(param_sockaddr, ip_port_str, + UCS_SOCKADDR_STRING_LEN), + ucs_derived_of(md, uct_sockcm_md_t), mode); + + out_destroy_id: + close(sock_fd); + + return is_accessible; +} + +static ucs_status_t +uct_sockcm_md_open(uct_component_t *component, const char *md_name, + const uct_md_config_t *config, uct_md_h *md_p) +{ + uct_sockcm_md_t *md; + + md = ucs_malloc(sizeof(*md), "sockcm_md"); + if (md == NULL) { + return UCS_ERR_NO_MEMORY; + } + + md->super.ops = &uct_sockcm_md_ops; + md->super.component = &uct_sockcm_component; + + /* cppcheck-suppress autoVariables */ + *md_p = &md->super; + return UCS_OK; +} + +uct_component_t uct_sockcm_component = { + .query_md_resources = uct_md_query_single_md_resource, + .md_open = uct_sockcm_md_open, + .cm_open = ucs_empty_function_return_unsupported, + .rkey_unpack = ucs_empty_function_return_unsupported, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = ucs_empty_function_return_unsupported, + .name = UCT_SOCKCM_NAME, + .md_config = { + .name = "Sock-CM memory domain", + .prefix = "SOCKCM_", + .table = uct_sockcm_md_config_table, + .size = sizeof(uct_sockcm_md_config_t), + }, + .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_sockcm_component), + .flags = 0 +}; +UCT_COMPONENT_REGISTER(&uct_sockcm_component) diff --git a/src/uct/tcp/sockcm/sockcm_md.h b/src/uct/tcp/sockcm/sockcm_md.h new file mode 
100644 index 00000000000..7b7cfa6d927 --- /dev/null +++ b/src/uct/tcp/sockcm/sockcm_md.h @@ -0,0 +1,37 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2017. ALL RIGHTS RESERVED. + * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#ifndef UCT_SOCKCM_MD_H_ +#define UCT_SOCKCM_MD_H_ + +#include "sockcm_def.h" +#include +#include +#include + +/* + * SOCKCM memory domain. + */ +typedef struct uct_sockcm_md { + uct_md_t super; +} uct_sockcm_md_t; + +/* + * SOCKCM memory domain configuration. + */ +typedef struct uct_sockcm_md_config { + uct_md_config_t super; +} uct_sockcm_md_config_t; + +extern uct_component_t uct_sockcm_component; + +ucs_status_t uct_sockcm_md_query(uct_md_h md, uct_md_attr_t *md_attr); + +int uct_sockcm_is_sockaddr_accessible(uct_md_h md, + const ucs_sock_addr_t *sockaddr, + uct_sockaddr_accessibility_t mode); + +#endif diff --git a/src/uct/tcp/tcp.h b/src/uct/tcp/tcp.h index 763673882d6..65f5017c35a 100644 --- a/src/uct/tcp/tcp.h +++ b/src/uct/tcp/tcp.h @@ -6,55 +6,299 @@ #ifndef UCT_TCP_MD_H #define UCT_TCP_MD_H +#include "tcp_base.h" + #include +#include +#include #include +#include +#include +#include +#include +#include + #include -#define UCT_TCP_NAME "tcp" +#define UCT_TCP_NAME "tcp" + +#define UCT_TCP_CONFIG_PREFIX "TCP_" + +/* Magic number that is used by TCP to identify its peers */ +#define UCT_TCP_MAGIC_NUMBER 0xCAFEBABE12345678lu + +/* Maximum number of events to wait on event set */ +#define UCT_TCP_MAX_EVENTS 16 + +/* How long should be string to keep [%s:%s] string + * where %s value can be -/Tx/Rx */ +#define UCT_TCP_EP_CTX_CAPS_STR_MAX 8 +/* How many IOVs are needed to keep AM/PUT Zcopy service data + * (TCP protocol and user's AM (or PUT) headers) */ +#define UCT_TCP_EP_ZCOPY_SERVICE_IOV_COUNT 2 -/* How many events to wait for in epoll_wait */ -#define UCT_TCP_MAX_EVENTS 16 +/* How many IOVs are needed to do AM Short + * (TCP protocol and user's AM headers, payload) */ 
+#define UCT_TCP_EP_AM_SHORTV_IOV_COUNT 3 +/* Maximum size of a data that can be sent by PUT Zcopy + * operation */ +#define UCT_TCP_EP_PUT_ZCOPY_MAX SIZE_MAX + +/* Length of a data that is used by PUT protocol */ +#define UCT_TCP_EP_PUT_SERVICE_LENGTH (sizeof(uct_tcp_am_hdr_t) + \ + sizeof(uct_tcp_ep_put_req_hdr_t)) + +#define UCT_TCP_CONFIG_MAX_CONN_RETRIES "MAX_CONN_RETRIES" + +/* TX and RX caps */ +#define UCT_TCP_EP_CTX_CAPS (UCS_BIT(UCT_TCP_EP_CTX_TYPE_TX) | \ + UCS_BIT(UCT_TCP_EP_CTX_TYPE_RX)) + + +/** + * TCP context type + */ +typedef enum uct_tcp_ep_ctx_type { + /* EP is connected to a peer to send data. This EP is managed + * by a user and TCP mustn't free this EP even if connection + * is broken. */ + UCT_TCP_EP_CTX_TYPE_TX, + /* EP is connected to a peer to receive data. If only RX is set + * on a given EP, it is hidden from a user (i.e. the user is unable + * to do any operation on that EP) and TCP is responsible to + * free memory allocating for this EP. */ + UCT_TCP_EP_CTX_TYPE_RX, + + /* Additional flags that controls EP behavior: */ + /* - Zcopy TX operation is in progress on a given EP. */ + UCT_TCP_EP_CTX_TYPE_ZCOPY_TX, + /* - PUT RX operation is in progress on a given EP. */ + UCT_TCP_EP_CTX_TYPE_PUT_RX, + /* - PUT TX operation is waiting for an ACK on a given EP */ + UCT_TCP_EP_CTX_TYPE_PUT_TX_WAITING_ACK, + /* - PUT RX operation is waiting for resources to send an ACK + * for received PUT operations on a given EP */ + UCT_TCP_EP_CTX_TYPE_PUT_RX_SENDING_ACK +} uct_tcp_ep_ctx_type_t; + + +/** + * TCP endpoint connection state + */ +typedef enum uct_tcp_ep_conn_state { + /* EP is unable to communicate with a peer's EP - connections establishment + * was unsuccessful or detected hangup during communications. */ + UCT_TCP_EP_CONN_STATE_CLOSED, + /* EP is connecting to a peer's EP, i.e. connect() was called on non-blocking + * socket and returned this call returned that an operation is in progress. 
+ * After it is done, it sends `UCT_TCP_CM_CONN_REQ` to the peer. + * All AM operations return `UCS_ERR_NO_RESOURCE` error to a caller. */ + UCT_TCP_EP_CONN_STATE_CONNECTING, + /* EP is receiving the magic number in order to verify a peer. EP is moved + * to this state after accept() completed. */ + UCT_TCP_EP_CONN_STATE_RECV_MAGIC_NUMBER, + /* EP is accepting connection from a peer, i.e. accept() returns socket fd + * on which a connection was accepted, this EP was created using this socket + * fd and the magic number was received and verified by EP and now it is + * waiting for `UCT_TCP_CM_CONN_REQ` from a peer. */ + UCT_TCP_EP_CONN_STATE_ACCEPTING, + /* EP is waiting for `UCT_TCP_CM_CONN_ACK` message from a peer after sending + * `UCT_TCP_CM_CONN_REQ`. + * All AM operations return `UCS_ERR_NO_RESOURCE` error to a caller. */ + UCT_TCP_EP_CONN_STATE_WAITING_ACK, + /* EP is waiting for a connection and `UCT_TCP_CM_CONN_REQ` message from + * a peer after simultaneous connection resolution between them. This EP + * is a "winner" of the resolution, but no RX capability on this PR (i.e. + * no `UCT_TCP_CM_CONN_REQ` message was received from the peer). EP is moved + * to `UCT_TCP_EP_CONN_STATE_CONNECTED` state upon receiving this message. + * All AM operations return `UCS_ERR_NO_RESOURCE` error to a caller. */ + UCT_TCP_EP_CONN_STATE_WAITING_REQ, + /* EP is connected to a peer and they can communicate with each other. 
*/ + UCT_TCP_EP_CONN_STATE_CONNECTED +} uct_tcp_ep_conn_state_t; /* Forward declaration */ typedef struct uct_tcp_ep uct_tcp_ep_t; typedef unsigned (*uct_tcp_ep_progress_t)(uct_tcp_ep_t *ep); +static inline int uct_tcp_khash_sockaddr_in_equal(struct sockaddr_in sa1, + struct sockaddr_in sa2) +{ + ucs_status_t status; + int cmp; + + cmp = ucs_sockaddr_cmp((const struct sockaddr*)&sa1, + (const struct sockaddr*)&sa2, + &status); + ucs_assert(status == UCS_OK); + return !cmp; +} + +static inline uint32_t uct_tcp_khash_sockaddr_in_hash(struct sockaddr_in sa) +{ + ucs_status_t UCS_V_UNUSED status; + size_t addr_size; + + status = ucs_sockaddr_sizeof((const struct sockaddr*)&sa, + &addr_size); + ucs_assert(status == UCS_OK); + return ucs_crc32(0, (const void *)&sa, addr_size); +} + +KHASH_INIT(uct_tcp_cm_eps, struct sockaddr_in, ucs_list_link_t*, + 1, uct_tcp_khash_sockaddr_in_hash, uct_tcp_khash_sockaddr_in_equal); + + +/** + * TCP Connection Manager state + */ +typedef struct uct_tcp_cm_state { + const char *name; /* CM state name */ + uct_tcp_ep_progress_t tx_progress; /* TX progress function */ + uct_tcp_ep_progress_t rx_progress; /* RX progress function */ +} uct_tcp_cm_state_t; + + +/** + * TCP Connection Manager event + */ +typedef enum uct_tcp_cm_conn_event { + /* Connection request from a EP that has TX capability to a EP that + * has to be able to receive AM data (i.e. has to have RX capability). */ + UCT_TCP_CM_CONN_REQ = UCS_BIT(0), + /* Connection acknowledgment from a EP that accepts a conenction from + * initiator of a connection request. */ + UCT_TCP_CM_CONN_ACK = UCS_BIT(1), + /* Request for waiting of a connection request. + * The mesage is not sent separately (only along with a connection + * acknowledgment.) */ + UCT_TCP_CM_CONN_WAIT_REQ = UCS_BIT(2), + /* Connection acknowledgment + Connection request. The mesasge is sent + * from a EP that accepts remote conenction when it was in + * `UCT_TCP_EP_CONN_STATE_CONNECTING` state (i.e. 
original + * `UCT_TCP_CM_CONN_REQ` wasn't sent yet) and want to have RX capability + * on a peer's EP in order to send AM data. */ + UCT_TCP_CM_CONN_ACK_WITH_REQ = (UCT_TCP_CM_CONN_REQ | + UCT_TCP_CM_CONN_ACK), + /* Connection acknowledgment + Request for waiting of a connection request. + * The message is sent from a EP that accepts remote conenction when it was + * in `UCT_TCP_EP_CONN_STATE_WAITING_ACK` state (i.e. original + * `UCT_TCP_CM_CONN_REQ` was sent) and want to have RX capability on a + * peer's EP in order to send AM data. */ + UCT_TCP_CM_CONN_ACK_WITH_WAIT_REQ = (UCT_TCP_CM_CONN_WAIT_REQ | + UCT_TCP_CM_CONN_ACK) +} uct_tcp_cm_conn_event_t; + + +/** + * TCP connection request packet + */ +typedef struct uct_tcp_cm_conn_req_pkt { + uct_tcp_cm_conn_event_t event; /* Connection event ID */ + struct sockaddr_in iface_addr; /* Socket address of UCT local iface */ +} UCS_S_PACKED uct_tcp_cm_conn_req_pkt_t; + /** * TCP active message header */ typedef struct uct_tcp_am_hdr { - uint8_t am_id; - uint32_t length; + uint8_t am_id; /* UCT AM ID of an AM operation */ + uint32_t length; /* Length of data sent in an AM operation */ } UCS_S_PACKED uct_tcp_am_hdr_t; +/** + * AM IDs reserved for TCP protocols + */ +typedef enum uct_tcp_ep_am_id { + /* AM ID reserved for TCP internal Connection Manager messages */ + UCT_TCP_EP_CM_AM_ID = UCT_AM_ID_MAX, + /* AM ID reserved for TCP internal PUT REQ message */ + UCT_TCP_EP_PUT_REQ_AM_ID = UCT_AM_ID_MAX + 1, + /* AM ID reserved for TCP internal PUT ACK message */ + UCT_TCP_EP_PUT_ACK_AM_ID = UCT_AM_ID_MAX + 2 +} uct_tcp_ep_am_id_t; + + +/** + * TCP PUT request header + */ +typedef struct uct_tcp_ep_put_req_hdr { + uint64_t addr; /* Address of a remote memory buffer */ + size_t length; /* Length of a remote memory buffer */ + uint32_t sn; /* Sequence number of the current PUT operation */ +} UCS_S_PACKED uct_tcp_ep_put_req_hdr_t; + + +/** + * TCP PUT acknowledge header + */ +typedef struct uct_tcp_ep_put_ack_hdr { + uint32_t 
sn; /* Sequence number of the last acked PUT operation */ +} UCS_S_PACKED uct_tcp_ep_put_ack_hdr_t; + + +/** + * TCP PUT completion + */ +typedef struct uct_tcp_ep_put_completion { + uct_completion_t *comp; /* User's completion passed to + * uct_ep_flush */ + uint32_t wait_put_sn; /* Sequence number of the last unacked + * PUT operations that was in-progress + * when uct_ep_flush was called */ + ucs_queue_elem_t elem; /* Element to insert completion into + * TCP EP PUT operation pending queue */ +} uct_tcp_ep_put_completion_t; + + /** * TCP endpoint communication context */ typedef struct uct_tcp_ep_ctx { - void *buf; /* Partial send/recv data */ - size_t length; /* How much data in the buffer */ - size_t offset; /* Next offset to send/recv */ - uct_tcp_ep_progress_t progress; /* Progress engine */ + uint32_t put_sn; /* Sequence number of last sent + * or received PUT operation */ + void *buf; /* Partial send/recv data */ + size_t length; /* How much data in the buffer */ + size_t offset; /* How much data was sent (TX) or was + * handled after receiving (RX) */ } uct_tcp_ep_ctx_t; +/** + * TCP AM/PUT Zcopy communication context mapped to + * buffer from TCP EP context + */ +typedef struct uct_tcp_ep_zcopy_tx { + uct_tcp_am_hdr_t super; /* UCT TCP AM header */ + uct_completion_t *comp; /* Local UCT completion object */ + size_t iov_index; /* Current IOV index */ + size_t iov_cnt; /* Number of IOVs that should be sent */ + struct iovec iov[0]; /* IOVs that should be sent */ +} uct_tcp_ep_zcopy_tx_t; + + /** * TCP endpoint */ struct uct_tcp_ep { uct_base_ep_t super; - int fd; /* Socket file descriptor */ - uint32_t events; /* Current notifications */ - uct_tcp_ep_ctx_t tx; /* TX resources */ - uct_tcp_ep_ctx_t rx; /* RX resources */ - ucs_sock_addr_t peer_addr; /* Remote iface addr */ - ucs_queue_head_t pending_q; /* Pending operations */ - ucs_list_link_t list; + uint8_t ctx_caps; /* Which contexts are supported */ + int fd; /* Socket file descriptor */ + 
uct_tcp_ep_conn_state_t conn_state; /* State of connection with peer */ + unsigned conn_retries; /* Number of connection attempts done */ + int events; /* Current notifications */ + uct_tcp_ep_ctx_t tx; /* TX resources */ + uct_tcp_ep_ctx_t rx; /* RX resources */ + struct sockaddr_in peer_addr; /* Remote iface addr */ + ucs_queue_head_t pending_q; /* Pending operations */ + ucs_queue_head_t put_comp_q; /* Flush completions waiting for + * outstanding PUTs acknowledgment */ + ucs_list_link_t list; /* List element to insert into TCP EP list */ }; @@ -64,27 +308,50 @@ struct uct_tcp_ep { typedef struct uct_tcp_iface { uct_base_iface_t super; /* Parent class */ int listen_fd; /* Server socket */ + khash_t(uct_tcp_cm_eps) ep_cm_map; /* Map of endpoints that don't + * have one of the context cap */ ucs_list_link_t ep_list; /* List of endpoints */ char if_name[IFNAMSIZ]; /* Network interface name */ - int epfd; /* Event poll set of sockets */ - size_t outstanding; /* How much data in the EP send buffers */ + ucs_sys_event_set_t *event_set; /* Event set identifier */ ucs_mpool_t tx_mpool; /* TX memory pool */ ucs_mpool_t rx_mpool; /* RX memory pool */ - size_t am_buf_size; /* AM buffer size */ + size_t outstanding; /* How much data in the EP send buffers + * + how many non-blocking connections + * are in progress + how many EPs are + * waiting for PUT Zcopy operation ACKs + * (0/1 for each EP) */ struct { + size_t tx_seg_size; /* TX AM buffer size */ + size_t rx_seg_size; /* RX AM buffer size */ + size_t sendv_thresh; /* Minimum size of user's payload from which + * non-blocking vector send should be used */ + struct { + size_t max_iov; /* Maximum supported IOVs limited by + * user configuration and service buffers + * (TCP protocol and user's AM headers) */ + size_t max_hdr; /* Maximum supported AM Zcopy header */ + size_t hdr_offset; /* Offset in TX buffer to empty space that + * can be used for AM Zcopy header */ + } zcopy; struct sockaddr_in ifaddr; /* Network address 
*/ struct sockaddr_in netmask; /* Network address mask */ - size_t buf_size; /* Maximal bcopy size */ - size_t short_size; /* Maximal short size */ int prefer_default; /* Prefer default gateway */ + int put_enable; /* Enable PUT Zcopy operation support */ + int conn_nb; /* Use non-blocking connect() */ unsigned max_poll; /* Number of events to poll per socket*/ + unsigned max_conn_retries; /* How many connection establishment attmepts + * should be done if dropped connection was + * detected due to lack of system resources */ + unsigned syn_cnt; /* Number of SYN retransmits that TCP should send + * before aborting the attempt to connect. + * It cannot exceed 255. */ } config; struct { int nodelay; /* TCP_NODELAY */ - int sndbuf; /* SO_SNDBUF */ - int rcvbuf; /* SO_RCVBUF */ + size_t sndbuf; /* SO_SNDBUF */ + size_t rcvbuf; /* SO_RCVBUF */ } sockopt; } uct_tcp_iface_t; @@ -93,19 +360,28 @@ typedef struct uct_tcp_iface { * TCP interface configuration */ typedef struct uct_tcp_iface_config { - uct_iface_config_t super; - int prefer_default; - unsigned max_poll; - int sockopt_nodelay; - int sockopt_sndbuf; - int sockopt_rcvbuf; - uct_iface_mpool_config_t tx_mpool; - uct_iface_mpool_config_t rx_mpool; + uct_iface_config_t super; + size_t tx_seg_size; + size_t rx_seg_size; + size_t max_iov; + size_t sendv_thresh; + int prefer_default; + int put_enable; + int conn_nb; + unsigned max_poll; + unsigned max_conn_retries; + int sockopt_nodelay; + uct_tcp_send_recv_buf_config_t sockopt; + unsigned syn_cnt; + uct_iface_mpool_config_t tx_mpool; + uct_iface_mpool_config_t rx_mpool; } uct_tcp_iface_config_t; -extern uct_md_component_t uct_tcp_md; +extern uct_component_t uct_tcp_component; extern const char *uct_tcp_address_type_names[]; +extern const uct_tcp_cm_state_t uct_tcp_ep_cm_state[]; +extern const uct_tcp_ep_progress_t uct_tcp_ep_progress_rx_cb[]; ucs_status_t uct_tcp_netif_caps(const char *if_name, double *latency_p, double *bandwidth_p); @@ -118,30 +394,54 @@ ucs_status_t 
uct_tcp_netif_is_default(const char *if_name, int *result_p); int uct_tcp_sockaddr_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2); -ucs_status_t uct_tcp_send(int fd, const void *data, size_t *length_p); +ucs_status_t uct_tcp_iface_set_sockopt(uct_tcp_iface_t *iface, int fd); -ucs_status_t uct_tcp_recv(int fd, void *data, size_t *length_p); +size_t uct_tcp_iface_get_max_iov(const uct_tcp_iface_t *iface); -ucs_status_t uct_tcp_send_blocking(int fd, const void *data, size_t length); +size_t uct_tcp_iface_get_max_zcopy_header(const uct_tcp_iface_t *iface); -ucs_status_t uct_tcp_recv_blocking(int fd, void *data, size_t length); +void uct_tcp_iface_add_ep(uct_tcp_ep_t *ep); -ucs_status_t uct_tcp_iface_set_sockopt(uct_tcp_iface_t *iface, int fd); +void uct_tcp_iface_remove_ep(uct_tcp_ep_t *ep); + +ucs_status_t uct_tcp_ep_handle_dropped_connect(uct_tcp_ep_t *ep, + ucs_status_t io_status); + +ucs_status_t uct_tcp_ep_init(uct_tcp_iface_t *iface, int fd, + const struct sockaddr_in *dest_addr, + uct_tcp_ep_t **ep_p); + +ucs_status_t uct_tcp_ep_create(const uct_ep_params_t *params, + uct_ep_h *ep_p); + +const char *uct_tcp_ep_ctx_caps_str(uint8_t ep_ctx_caps, char *str_buffer); + +void uct_tcp_ep_change_ctx_caps(uct_tcp_ep_t *ep, uint8_t new_caps); + +ucs_status_t uct_tcp_ep_add_ctx_cap(uct_tcp_ep_t *ep, + uct_tcp_ep_ctx_type_t cap); + +ucs_status_t uct_tcp_ep_remove_ctx_cap(uct_tcp_ep_t *ep, + uct_tcp_ep_ctx_type_t cap); -ucs_status_t uct_tcp_ep_create(uct_tcp_iface_t *iface, int fd, - const struct sockaddr *dest_addr, - uct_tcp_ep_t **ep_p); +ucs_status_t uct_tcp_ep_move_ctx_cap(uct_tcp_ep_t *from_ep, uct_tcp_ep_t *to_ep, + uct_tcp_ep_ctx_type_t ctx_cap); -ucs_status_t uct_tcp_ep_create_connected(const uct_ep_params_t *params, - uct_ep_h *ep_p); +void uct_tcp_ep_destroy_internal(uct_ep_h tl_ep); void uct_tcp_ep_destroy(uct_ep_h tl_ep); -unsigned uct_tcp_ep_progress_tx(uct_tcp_ep_t *ep); +void uct_tcp_ep_set_failed(uct_tcp_ep_t *ep); -unsigned 
uct_tcp_ep_progress_rx(uct_tcp_ep_t *ep); +unsigned uct_tcp_ep_is_self(const uct_tcp_ep_t *ep); -void uct_tcp_ep_mod_events(uct_tcp_ep_t *ep, uint32_t add, uint32_t remove); +void uct_tcp_ep_remove(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep); + +void uct_tcp_ep_add(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep); + +void uct_tcp_ep_mod_events(uct_tcp_ep_t *ep, int add, int remove); + +void uct_tcp_ep_pending_queue_dispatch(uct_tcp_ep_t *ep); ucs_status_t uct_tcp_ep_am_short(uct_ep_h uct_ep, uint8_t am_id, uint64_t header, const void *payload, unsigned length); @@ -150,6 +450,15 @@ ssize_t uct_tcp_ep_am_bcopy(uct_ep_h uct_ep, uint8_t am_id, uct_pack_callback_t pack_cb, void *arg, unsigned flags); +ucs_status_t uct_tcp_ep_am_zcopy(uct_ep_h uct_ep, uint8_t am_id, const void *header, + unsigned header_length, const uct_iov_t *iov, + size_t iovcnt, unsigned flags, + uct_completion_t *comp); + +ucs_status_t uct_tcp_ep_put_zcopy(uct_ep_h uct_ep, const uct_iov_t *iov, + size_t iovcnt, uint64_t remote_addr, + uct_rkey_t rkey, uct_completion_t *comp); + ucs_status_t uct_tcp_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *req, unsigned flags); @@ -159,4 +468,53 @@ void uct_tcp_ep_pending_purge(uct_ep_h tl_ep, uct_pending_purge_callback_t cb, ucs_status_t uct_tcp_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp); +ucs_status_t uct_tcp_cm_send_event(uct_tcp_ep_t *ep, uct_tcp_cm_conn_event_t event); + +unsigned uct_tcp_cm_handle_conn_pkt(uct_tcp_ep_t **ep_p, void *pkt, uint32_t length); + +unsigned uct_tcp_cm_conn_progress(uct_tcp_ep_t *ep); + +uct_tcp_ep_conn_state_t +uct_tcp_cm_set_conn_state(uct_tcp_ep_t *ep, + uct_tcp_ep_conn_state_t new_conn_state); + +void uct_tcp_cm_change_conn_state(uct_tcp_ep_t *ep, + uct_tcp_ep_conn_state_t new_conn_state); + +ucs_status_t uct_tcp_cm_add_ep(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep); + +void uct_tcp_cm_remove_ep(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep); + +uct_tcp_ep_t *uct_tcp_cm_search_ep(uct_tcp_iface_t *iface, + const 
struct sockaddr_in *peer_addr, + uct_tcp_ep_ctx_type_t with_ctx_type); + +void uct_tcp_cm_purge_ep(uct_tcp_ep_t *ep); + +ucs_status_t uct_tcp_cm_handle_incoming_conn(uct_tcp_iface_t *iface, + const struct sockaddr_in *peer_addr, + int fd); + +ucs_status_t uct_tcp_cm_conn_start(uct_tcp_ep_t *ep); + +static inline void uct_tcp_iface_outstanding_inc(uct_tcp_iface_t *iface) +{ + iface->outstanding++; +} + +static inline void uct_tcp_iface_outstanding_dec(uct_tcp_iface_t *iface) +{ + ucs_assert(iface->outstanding > 0); + iface->outstanding--; +} + +/** + * Query for active network devices under /sys/class/net, as determined by + * ucs_netif_is_active(). 'md' parameter is not used, and is added for + * compatibility with uct_tl_t::query_devices definition. + */ +ucs_status_t uct_tcp_query_devices(uct_md_h md, + uct_tl_device_resource_t **devices_p, + unsigned *num_devices_p); + #endif diff --git a/src/uct/tcp/tcp_base.c b/src/uct/tcp/tcp_base.c new file mode 100644 index 00000000000..9cc7e741473 --- /dev/null +++ b/src/uct/tcp/tcp_base.c @@ -0,0 +1,24 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "tcp_base.h" + +#include + +ucs_status_t ucs_tcp_base_set_syn_cnt(int fd, int tcp_syn_cnt) +{ + if (tcp_syn_cnt != UCS_ULUNITS_AUTO) { + ucs_socket_setopt(fd, IPPROTO_TCP, TCP_SYNCNT, (const void*)&tcp_syn_cnt, + sizeof(int)); + } + + /* return UCS_OK anyway since setting TCP_SYNCNT is done on best effort */ + return UCS_OK; +} diff --git a/src/uct/tcp/tcp_base.h b/src/uct/tcp/tcp_base.h new file mode 100644 index 00000000000..25bbf728512 --- /dev/null +++ b/src/uct/tcp/tcp_base.h @@ -0,0 +1,48 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifndef UCT_TCP_BASE_H +#define UCT_TCP_BASE_H + +#include +#include +#include +#include +#include + + +/** + * TCP socket send and receive buffers configuration. + */ +typedef struct uct_tcp_send_recv_buf_config { + size_t sndbuf; + size_t rcvbuf; +} uct_tcp_send_recv_buf_config_t; + + +/** + * Define configuration fields for tcp socket send and receive buffers. + */ +#define UCT_TCP_SEND_RECV_BUF_FIELDS(_offset) \ + {"SNDBUF", "auto", \ + "Socket send buffer size", \ + (_offset) + ucs_offsetof(uct_tcp_send_recv_buf_config_t, sndbuf), UCS_CONFIG_TYPE_MEMUNITS}, \ + \ + {"RCVBUF", "auto", \ + "Socket receive buffer size", \ + (_offset) + ucs_offsetof(uct_tcp_send_recv_buf_config_t, rcvbuf), UCS_CONFIG_TYPE_MEMUNITS} + + +#define UCT_TCP_SYN_CNT(_offset) \ + {"SYN_CNT", "auto", \ + "Number of SYN retransmits that TCP should send before aborting the attempt\n" \ + "to connect. It cannot exceed 255. auto means to use the system default.", \ + (_offset) , UCS_CONFIG_TYPE_ULUNITS} + + +ucs_status_t ucs_tcp_base_set_syn_cnt(int fd, int tcp_syn_cnt); + +#endif /* UCT_TCP_BASE_H */ diff --git a/src/uct/tcp/tcp_cm.c b/src/uct/tcp/tcp_cm.c new file mode 100644 index 00000000000..a224a9accff --- /dev/null +++ b/src/uct/tcp/tcp_cm.c @@ -0,0 +1,629 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "tcp.h" + +#include + + +void uct_tcp_cm_change_conn_state(uct_tcp_ep_t *ep, + uct_tcp_ep_conn_state_t new_conn_state) +{ + int full_log = 1; + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + char str_local_addr[UCS_SOCKADDR_STRING_LEN]; + char str_remote_addr[UCS_SOCKADDR_STRING_LEN]; + char str_ctx_caps[UCT_TCP_EP_CTX_CAPS_STR_MAX]; + uct_tcp_ep_conn_state_t old_conn_state; + + old_conn_state = ep->conn_state; + ep->conn_state = new_conn_state; + + switch(ep->conn_state) { + case UCT_TCP_EP_CONN_STATE_CONNECTING: + case UCT_TCP_EP_CONN_STATE_WAITING_ACK: + if (old_conn_state == UCT_TCP_EP_CONN_STATE_CLOSED) { + uct_tcp_iface_outstanding_inc(iface); + } else { + ucs_assert((ep->conn_state == UCT_TCP_EP_CONN_STATE_CONNECTING) || + (old_conn_state == UCT_TCP_EP_CONN_STATE_CONNECTING)); + } + break; + case UCT_TCP_EP_CONN_STATE_WAITING_REQ: + ucs_assert(old_conn_state == UCT_TCP_EP_CONN_STATE_WAITING_ACK); + break; + case UCT_TCP_EP_CONN_STATE_CONNECTED: + ucs_assert((old_conn_state == UCT_TCP_EP_CONN_STATE_CONNECTING) || + (old_conn_state == UCT_TCP_EP_CONN_STATE_WAITING_ACK) || + (old_conn_state == UCT_TCP_EP_CONN_STATE_ACCEPTING) || + (old_conn_state == UCT_TCP_EP_CONN_STATE_WAITING_REQ)); + if ((old_conn_state == UCT_TCP_EP_CONN_STATE_WAITING_ACK) || + (old_conn_state == UCT_TCP_EP_CONN_STATE_WAITING_REQ) || + /* It may happen when a peer is going to use this EP with socket + * from accepted connection in case of handling simultaneous + * connection establishment */ + (old_conn_state == UCT_TCP_EP_CONN_STATE_CONNECTING)) { + uct_tcp_iface_outstanding_dec(iface); + } + if (ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_TX)) { + /* Progress possibly pending TX operations */ + uct_tcp_ep_pending_queue_dispatch(ep); + } + break; + case UCT_TCP_EP_CONN_STATE_CLOSED: + ucs_assert(old_conn_state != UCT_TCP_EP_CONN_STATE_CLOSED); + if ((old_conn_state == 
UCT_TCP_EP_CONN_STATE_CONNECTING) || + (old_conn_state == UCT_TCP_EP_CONN_STATE_WAITING_ACK) || + (old_conn_state == UCT_TCP_EP_CONN_STATE_WAITING_REQ)) { + uct_tcp_iface_outstanding_dec(iface); + } else if ((old_conn_state == UCT_TCP_EP_CONN_STATE_ACCEPTING) || + (old_conn_state == UCT_TCP_EP_CONN_STATE_RECV_MAGIC_NUMBER)) { + /* Since ep::peer_addr is 0'ed, we have to print w/o peer's address */ + full_log = 0; + } + break; + default: + ucs_assert((ep->conn_state == UCT_TCP_EP_CONN_STATE_ACCEPTING) || + (ep->conn_state == UCT_TCP_EP_CONN_STATE_RECV_MAGIC_NUMBER)); + /* Since ep::peer_addr is 0'ed and client's + * has already been logged, print w/o peer's address */ + full_log = 0; + break; + } + + if (full_log) { + ucs_debug("tcp_ep %p: %s -> %s for the [%s]<->[%s] connection %s", + ep, uct_tcp_ep_cm_state[old_conn_state].name, + uct_tcp_ep_cm_state[ep->conn_state].name, + ucs_sockaddr_str((const struct sockaddr*)&iface->config.ifaddr, + str_local_addr, UCS_SOCKADDR_STRING_LEN), + ucs_sockaddr_str((const struct sockaddr*)&ep->peer_addr, + str_remote_addr, UCS_SOCKADDR_STRING_LEN), + uct_tcp_ep_ctx_caps_str(ep->ctx_caps, str_ctx_caps)); + } else { + ucs_debug("tcp_ep %p: %s -> %s", + ep, uct_tcp_ep_cm_state[old_conn_state].name, + uct_tcp_ep_cm_state[ep->conn_state].name); + } +} + +static ucs_status_t uct_tcp_cm_io_err_handler_cb(void *arg, + ucs_status_t io_status) +{ + return uct_tcp_ep_handle_dropped_connect((uct_tcp_ep_t*)arg, io_status); +} + +/* `fmt_str` parameter has to contain "%s" to write event type */ +static void uct_tcp_cm_trace_conn_pkt(const uct_tcp_ep_t *ep, + ucs_log_level_t log_level, + const char *fmt_str, + uct_tcp_cm_conn_event_t event) +{ + char event_str[64] = { 0 }; + char str_addr[UCS_SOCKADDR_STRING_LEN], msg[128], *p; + + p = event_str; + if (event & UCT_TCP_CM_CONN_REQ) { + ucs_snprintf_zero(event_str, sizeof(event_str), "%s", + UCS_PP_MAKE_STRING(UCT_TCP_CM_CONN_REQ)); + p += strlen(event_str); + } + + if (event & 
UCT_TCP_CM_CONN_WAIT_REQ) { + ucs_assert(p == event_str); + ucs_snprintf_zero(event_str, sizeof(event_str), "%s", + UCS_PP_MAKE_STRING(UCT_TCP_CM_CONN_WAIT_REQ)); + p += strlen(event_str); + } + + if (event & UCT_TCP_CM_CONN_ACK) { + if (p != event_str) { + ucs_snprintf_zero(p, sizeof(event_str) - (p - event_str), " | "); + p += strlen(p); + } + ucs_snprintf_zero(p, sizeof(event_str) - (p - event_str), "%s", + UCS_PP_MAKE_STRING(UCT_TCP_CM_CONN_ACK)); + p += strlen(event_str); + } + + if (event_str == p) { + ucs_snprintf_zero(event_str, sizeof(event_str), "UNKNOWN (%d)", event); + log_level = UCS_LOG_LEVEL_ERROR; + } + + ucs_snprintf_zero(msg, sizeof(msg), fmt_str, event_str); + + ucs_log(log_level, "tcp_ep %p: %s %s", ep, msg, + ucs_sockaddr_str((const struct sockaddr*)&ep->peer_addr, + str_addr, UCS_SOCKADDR_STRING_LEN)); +} + +ucs_status_t uct_tcp_cm_send_event(uct_tcp_ep_t *ep, uct_tcp_cm_conn_event_t event) +{ + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + size_t magic_number_length = 0; + void *pkt_buf; + size_t pkt_length, cm_pkt_length; + uct_tcp_cm_conn_req_pkt_t *conn_pkt; + uct_tcp_cm_conn_event_t *pkt_event; + uct_tcp_am_hdr_t *pkt_hdr; + ucs_status_t status; + + ucs_assertv(!(event & ~(UCT_TCP_CM_CONN_REQ | + UCT_TCP_CM_CONN_ACK | + UCT_TCP_CM_CONN_WAIT_REQ)), + "ep=%p", ep); + ucs_assertv(!(ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_TX)) || + (ep->conn_state != UCT_TCP_EP_CONN_STATE_CONNECTED), + "ep=%p", ep); + + pkt_length = sizeof(*pkt_hdr); + if (event == UCT_TCP_CM_CONN_REQ) { + cm_pkt_length = sizeof(*conn_pkt); + + if (ep->conn_state == UCT_TCP_EP_CONN_STATE_CONNECTING) { + magic_number_length = sizeof(uint64_t); + } + } else { + cm_pkt_length = sizeof(event); + } + + pkt_length += cm_pkt_length + magic_number_length; + pkt_buf = ucs_alloca(pkt_length); + pkt_hdr = (uct_tcp_am_hdr_t*)(UCS_PTR_BYTE_OFFSET(pkt_buf, + magic_number_length)); + pkt_hdr->am_id = UCT_AM_ID_MAX; + pkt_hdr->length = cm_pkt_length; 
+ + if (event == UCT_TCP_CM_CONN_REQ) { + if (ep->conn_state == UCT_TCP_EP_CONN_STATE_CONNECTING) { + ucs_assert(magic_number_length == sizeof(uint64_t)); + *(uint64_t*)pkt_buf = UCT_TCP_MAGIC_NUMBER; + } + + conn_pkt = (uct_tcp_cm_conn_req_pkt_t*)(pkt_hdr + 1); + conn_pkt->event = UCT_TCP_CM_CONN_REQ; + conn_pkt->iface_addr = iface->config.ifaddr; + } else { + pkt_event = (uct_tcp_cm_conn_event_t*)(pkt_hdr + 1); + *pkt_event = event; + } + + status = ucs_socket_send(ep->fd, pkt_buf, pkt_length, + uct_tcp_cm_io_err_handler_cb, ep); + if (status == UCS_OK) { + uct_tcp_cm_trace_conn_pkt(ep, UCS_LOG_LEVEL_TRACE, + "%s sent to", event); + } else { + uct_tcp_cm_trace_conn_pkt(ep, ((status == UCS_ERR_CANCELED) ? + UCS_LOG_LEVEL_DEBUG : UCS_LOG_LEVEL_ERROR), + "unable to send %s to", event); + } + return status; +} + +ucs_status_t uct_tcp_cm_add_ep(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep) +{ + ucs_list_link_t *ep_list; + khiter_t iter; + int ret; + + iter = kh_get(uct_tcp_cm_eps, &iface->ep_cm_map, ep->peer_addr); + if (iter == kh_end(&iface->ep_cm_map)) { + ep_list = ucs_malloc(sizeof(*ep_list), "tcp_ep_cm_map_entry"); + if (ep_list == NULL) { + return UCS_ERR_NO_MEMORY; + } + + ucs_list_head_init(ep_list); + iter = kh_put(uct_tcp_cm_eps, &iface->ep_cm_map, ep->peer_addr, &ret); + kh_value(&iface->ep_cm_map, iter) = ep_list; + + ucs_debug("tcp_iface %p: %p list added to map", iface, ep_list); + } else { + ep_list = kh_value(&iface->ep_cm_map, iter); + ucs_assertv(!ucs_list_is_empty(ep_list), "iface=%p", iface); + } + + uct_tcp_iface_remove_ep(ep); + + ucs_list_add_tail(ep_list, &ep->list); + ucs_debug("tcp_iface %p: tcp_ep %p added to %p list", + iface, ep, ep_list); + + return UCS_OK; +} + +void uct_tcp_cm_remove_ep(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep) +{ + ucs_list_link_t *ep_list; + khiter_t iter; + + iter = kh_get(uct_tcp_cm_eps, &iface->ep_cm_map, ep->peer_addr); + ucs_assertv(iter != kh_end(&iface->ep_cm_map), "iface=%p", iface); + + ep_list = 
kh_value(&iface->ep_cm_map, iter); + ucs_assertv(!ucs_list_is_empty(ep_list), "iface=%p", iface); + + ucs_list_del(&ep->list); + ucs_debug("tcp_iface %p: tcp_ep %p removed from %p list", + iface, ep, ep_list); + + uct_tcp_iface_add_ep(ep); + + if (ucs_list_is_empty(ep_list)) { + kh_del(uct_tcp_cm_eps, &iface->ep_cm_map, iter); + ucs_debug("tcp_iface %p: %p list removed from map", + iface, ep_list); + ucs_free(ep_list); + } +} + +uct_tcp_ep_t *uct_tcp_cm_search_ep(uct_tcp_iface_t *iface, + const struct sockaddr_in *peer_addr, + uct_tcp_ep_ctx_type_t with_ctx_type) +{ + uct_tcp_ep_t *ep; + ucs_list_link_t *ep_list; + khiter_t iter; + + iter = kh_get(uct_tcp_cm_eps, &iface->ep_cm_map, *peer_addr); + if (iter != kh_end(&iface->ep_cm_map)) { + ep_list = kh_value(&iface->ep_cm_map, iter); + ucs_assertv(!ucs_list_is_empty(ep_list), "iface=%p", iface); + + ucs_list_for_each(ep, ep_list, list) { + if (ep->ctx_caps & UCS_BIT(with_ctx_type)) { + return ep; + } + } + } + + return NULL; +} + +void uct_tcp_cm_purge_ep(uct_tcp_ep_t *ep) +{ + /* Move from a khash's EP list to iface's EP list */ + ucs_list_del(&ep->list); + uct_tcp_ep_change_ctx_caps(ep, 0); + uct_tcp_iface_add_ep(ep); +} + +static unsigned +uct_tcp_cm_simult_conn_accept_remote_conn(uct_tcp_ep_t *accept_ep, + uct_tcp_ep_t *connect_ep) +{ + uct_tcp_cm_conn_event_t event; + ucs_status_t status; + + /* 1. Close the allocated socket `fd` to avoid reading any + * events for this socket and assign the socket `fd` returned + * from `accept()` to the found EP */ + uct_tcp_ep_mod_events(connect_ep, 0, connect_ep->events); + ucs_assertv(connect_ep->events == 0, + "Requested epoll events must be 0-ed for ep=%p", connect_ep); + + close(connect_ep->fd); + connect_ep->fd = accept_ep->fd; + + /* 2. Migrate RX from the EP allocated during accepting connection to + * the found EP */ + status = uct_tcp_ep_move_ctx_cap(accept_ep, connect_ep, + UCT_TCP_EP_CTX_TYPE_RX); + if (status != UCS_OK) { + return 0; + } + + /* 3. 
The EP allocated during accepting connection has to be destroyed + * upon return from this function (set its socket `fd` to -1 prior + * to avoid closing this socket) */ + uct_tcp_ep_mod_events(accept_ep, 0, UCS_EVENT_SET_EVREAD); + accept_ep->fd = -1; + accept_ep = NULL; + + /* 4. Send ACK to the peer */ + event = UCT_TCP_CM_CONN_ACK; + + /* 5. - If found EP is still connecting, tie REQ with ACK and send + * it to the peer using new socket fd to ensure that the peer + * will be able to receive the data from us + * - If found EP is waiting ACK, tie WAIT_REQ with ACK and send + * it to the peer using new socket fd to ensure that the peer + * will wait for REQ and after receiving the REQ, peer will + * be able to receive the data from us */ + if (connect_ep->conn_state == UCT_TCP_EP_CONN_STATE_CONNECTING) { + event |= UCT_TCP_CM_CONN_REQ; + } else if (connect_ep->conn_state == UCT_TCP_EP_CONN_STATE_WAITING_ACK) { + event |= UCT_TCP_CM_CONN_WAIT_REQ; + } + + status = uct_tcp_cm_send_event(connect_ep, event); + if (status != UCS_OK) { + return 0; + } + /* 6. 
Now fully connected to the peer */ + uct_tcp_ep_mod_events(connect_ep, UCS_EVENT_SET_EVREAD, 0); + uct_tcp_cm_change_conn_state(connect_ep, UCT_TCP_EP_CONN_STATE_CONNECTED); + + return 1; +} + +static unsigned uct_tcp_cm_handle_simult_conn(uct_tcp_iface_t *iface, + uct_tcp_ep_t *accept_ep, + uct_tcp_ep_t *connect_ep) +{ + int accept_conn = 0; + unsigned progress_count = 0; + ucs_status_t status; + int cmp; + + if ((connect_ep->conn_state != UCT_TCP_EP_CONN_STATE_CONNECTED) && + (connect_ep->conn_state != UCT_TCP_EP_CONN_STATE_WAITING_REQ)) { + cmp = ucs_sockaddr_cmp((const struct sockaddr*)&connect_ep->peer_addr, + (const struct sockaddr*)&iface->config.ifaddr, + &status); + if (status != UCS_OK) { + return 0; + } + + /* Accept connection from a peer if our iface + * address is greater than peer's one */ + accept_conn = (cmp < 0); + } + + if (!accept_conn) { + /* Migrate RX from the EP allocated during accepting connection to + * the found EP. */ + status = uct_tcp_ep_move_ctx_cap(accept_ep, connect_ep, + UCT_TCP_EP_CTX_TYPE_RX); + if (status != UCS_OK) { + return 0; + } + + if (connect_ep->conn_state == UCT_TCP_EP_CONN_STATE_WAITING_REQ) { + uct_tcp_cm_change_conn_state(connect_ep, UCT_TCP_EP_CONN_STATE_CONNECTED); + } + + uct_tcp_ep_mod_events(connect_ep, UCS_EVENT_SET_EVREAD, 0); + } else /* our iface address less than remote && we are not connected */ { + /* Accept the remote connection and close the current one */ + ucs_assertv(cmp != 0, "peer addresses for accepted tcp_ep %p and " + "found tcp_ep %p mustn't be equal", accept_ep, connect_ep); + progress_count = uct_tcp_cm_simult_conn_accept_remote_conn(accept_ep, + connect_ep); + } + + return progress_count; +} + +static unsigned +uct_tcp_cm_handle_conn_req(uct_tcp_ep_t **ep_p, + const uct_tcp_cm_conn_req_pkt_t *cm_req_pkt) +{ + uct_tcp_ep_t *ep = *ep_p; + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + unsigned progress_count = 0; + ucs_status_t status; + uct_tcp_ep_t 
*peer_ep; + + ep->peer_addr = cm_req_pkt->iface_addr; + uct_tcp_cm_trace_conn_pkt(ep, UCS_LOG_LEVEL_TRACE, + "%s received from", UCT_TCP_CM_CONN_REQ); + + status = uct_tcp_ep_add_ctx_cap(ep, UCT_TCP_EP_CTX_TYPE_RX); + if (status != UCS_OK) { + goto out; + } + + if (ep->conn_state == UCT_TCP_EP_CONN_STATE_CONNECTED) { + return 0; + } + + ucs_assertv(!(ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_TX)), + "ep %p mustn't have TX cap", ep); + + if (!uct_tcp_ep_is_self(ep) && + (peer_ep = uct_tcp_cm_search_ep(iface, &ep->peer_addr, + UCT_TCP_EP_CTX_TYPE_TX))) { + progress_count = uct_tcp_cm_handle_simult_conn(iface, ep, peer_ep); + ucs_assert(!(ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_TX))); + goto out; + } else { + /* Just accept this connection and make it operational for RX events */ + status = uct_tcp_cm_send_event(ep, UCT_TCP_CM_CONN_ACK); + if (status != UCS_OK) { + goto out; + } + + uct_tcp_cm_change_conn_state(ep, UCT_TCP_EP_CONN_STATE_CONNECTED); + + progress_count = 1; + } + + return progress_count; + +out: + if (!(ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_TX))) { + uct_tcp_ep_destroy_internal(&ep->super.super); + *ep_p = NULL; + } + return progress_count; +} + +void uct_tcp_cm_handle_conn_ack(uct_tcp_ep_t *ep, uct_tcp_cm_conn_event_t cm_event, + uct_tcp_ep_conn_state_t new_conn_state) +{ + uct_tcp_cm_trace_conn_pkt(ep, UCS_LOG_LEVEL_TRACE, + "%s received from", cm_event); + if (ep->conn_state != new_conn_state) { + uct_tcp_cm_change_conn_state(ep, new_conn_state); + } +} + +unsigned uct_tcp_cm_handle_conn_pkt(uct_tcp_ep_t **ep_p, void *pkt, uint32_t length) +{ + ucs_status_t status; + uct_tcp_cm_conn_event_t cm_event; + uct_tcp_cm_conn_req_pkt_t *cm_req_pkt; + uct_tcp_ep_conn_state_t new_conn_state; + + ucs_assertv(length >= sizeof(cm_event), "ep=%p", *ep_p); + + cm_event = *((uct_tcp_cm_conn_event_t*)pkt); + + switch (cm_event) { + case UCT_TCP_CM_CONN_REQ: + /* Don't trace received CM packet here, because + * EP doesn't contain the peer address */ + 
ucs_assertv(length == sizeof(*cm_req_pkt), "ep=%p", *ep_p); + cm_req_pkt = (uct_tcp_cm_conn_req_pkt_t*)pkt; + return uct_tcp_cm_handle_conn_req(ep_p, cm_req_pkt); + case UCT_TCP_CM_CONN_ACK_WITH_WAIT_REQ: + if (!((*ep_p)->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_RX))) { + new_conn_state = UCT_TCP_EP_CONN_STATE_WAITING_REQ; + } else { + new_conn_state = UCT_TCP_EP_CONN_STATE_CONNECTED; + } + uct_tcp_cm_handle_conn_ack(*ep_p, cm_event, new_conn_state); + return 0; + case UCT_TCP_CM_CONN_ACK_WITH_REQ: + status = uct_tcp_ep_add_ctx_cap(*ep_p, UCT_TCP_EP_CTX_TYPE_RX); + if (status != UCS_OK) { + return 0; + } + /* fall through */ + case UCT_TCP_CM_CONN_ACK: + uct_tcp_cm_handle_conn_ack(*ep_p, cm_event, + UCT_TCP_EP_CONN_STATE_CONNECTED); + return 0; + case UCT_TCP_CM_CONN_WAIT_REQ: + ucs_error("tcp_ep %p: CM event for waiting REQ (%d) " + "must be sent along with ACK", *ep_p, cm_event); + return 0; + } + + ucs_error("tcp_ep %p: unknown CM event received %d", *ep_p, cm_event); + return 0; +} + +static ucs_status_t uct_tcp_cm_conn_complete(uct_tcp_ep_t *ep, + unsigned *progress_count_p) +{ + ucs_status_t status; + + status = uct_tcp_cm_send_event(ep, UCT_TCP_CM_CONN_REQ); + if (status != UCS_OK) { + goto out; + } + + uct_tcp_cm_change_conn_state(ep, UCT_TCP_EP_CONN_STATE_WAITING_ACK); + uct_tcp_ep_mod_events(ep, UCS_EVENT_SET_EVREAD, 0); + + ucs_assertv((ep->tx.length == 0) && (ep->tx.offset == 0) && + (ep->tx.buf == NULL), "ep=%p", ep); +out: + if (progress_count_p != NULL) { + *progress_count_p = (status == UCS_OK); + } + return status; +} + +unsigned uct_tcp_cm_conn_progress(uct_tcp_ep_t *ep) +{ + unsigned progress_count; + + if (!ucs_socket_is_connected(ep->fd)) { + ucs_error("tcp_ep %p: connection establishment for " + "socket fd %d was unsuccessful", ep, ep->fd); + goto err; + } + + uct_tcp_cm_conn_complete(ep, &progress_count); + return progress_count; + +err: + uct_tcp_ep_set_failed(ep); + return 0; +} + +ucs_status_t uct_tcp_cm_conn_start(uct_tcp_ep_t *ep) +{ + 
uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + ucs_status_t status; + + if (ep->conn_retries++ > iface->config.max_conn_retries) { + ucs_error("tcp_ep %p: reached maximum number of connection retries " + "(%u)", ep, iface->config.max_conn_retries); + return UCS_ERR_TIMED_OUT; + } + + uct_tcp_cm_change_conn_state(ep, UCT_TCP_EP_CONN_STATE_CONNECTING); + + status = ucs_socket_connect(ep->fd, (const struct sockaddr*)&ep->peer_addr); + if (UCS_STATUS_IS_ERR(status)) { + return status; + } else if (status == UCS_INPROGRESS) { + uct_tcp_ep_mod_events(ep, UCS_EVENT_SET_EVWRITE, 0); + return UCS_OK; + } + + ucs_assert(status == UCS_OK); + + if (!iface->config.conn_nb) { + status = ucs_sys_fcntl_modfl(ep->fd, O_NONBLOCK, 0); + if (status != UCS_OK) { + return status; + } + } + + return uct_tcp_cm_conn_complete(ep, NULL); +} + +/* This function is called from async thread */ +ucs_status_t uct_tcp_cm_handle_incoming_conn(uct_tcp_iface_t *iface, + const struct sockaddr_in *peer_addr, + int fd) +{ + char str_local_addr[UCS_SOCKADDR_STRING_LEN]; + char str_remote_addr[UCS_SOCKADDR_STRING_LEN]; + ucs_status_t status; + uct_tcp_ep_t *ep; + + if (!ucs_socket_is_connected(fd)) { + ucs_warn("tcp_iface %p: connection establishment for socket fd %d " + "from %s to %s was unsuccessful", iface, fd, + ucs_sockaddr_str((const struct sockaddr*)&peer_addr, + str_remote_addr, UCS_SOCKADDR_STRING_LEN), + ucs_sockaddr_str((const struct sockaddr*)&iface->config.ifaddr, + str_local_addr, UCS_SOCKADDR_STRING_LEN)); + return UCS_ERR_UNREACHABLE; + } + + status = uct_tcp_ep_init(iface, fd, NULL, &ep); + if (status != UCS_OK) { + return status; + } + + uct_tcp_cm_change_conn_state(ep, UCT_TCP_EP_CONN_STATE_RECV_MAGIC_NUMBER); + uct_tcp_ep_mod_events(ep, UCS_EVENT_SET_EVREAD, 0); + + ucs_debug("tcp_iface %p: accepted connection from " + "%s on %s to tcp_ep %p (fd %d)", iface, + ucs_sockaddr_str((const struct sockaddr*)peer_addr, + str_remote_addr, 
UCS_SOCKADDR_STRING_LEN), + ucs_sockaddr_str((const struct sockaddr*)&iface->config.ifaddr, + str_local_addr, UCS_SOCKADDR_STRING_LEN), + ep, fd); + return UCS_OK; +} diff --git a/src/uct/tcp/tcp_ep.c b/src/uct/tcp/tcp_ep.c index c25c505b1fc..5f8cd903e6a 100644 --- a/src/uct/tcp/tcp_ep.c +++ b/src/uct/tcp/tcp_ep.c @@ -3,27 +3,57 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "tcp.h" #include -static void uct_tcp_ep_epoll_ctl(uct_tcp_ep_t *ep, int op) -{ - uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, - uct_tcp_iface_t); - struct epoll_event epoll_event = { - .data.ptr = ep, - .events = ep->events, - }; - int ret; - - ret = epoll_ctl(iface->epfd, op, ep->fd, &epoll_event); - if (ret < 0) { - ucs_fatal("epoll_ctl(epfd=%d, op=%d, fd=%d) failed: %m", - iface->epfd, op, ep->fd); +/* Forward declarations */ +static unsigned uct_tcp_ep_progress_data_tx(uct_tcp_ep_t *ep); +static unsigned uct_tcp_ep_progress_data_rx(uct_tcp_ep_t *ep); +static unsigned uct_tcp_ep_progress_magic_number_rx(uct_tcp_ep_t *ep); + +const uct_tcp_cm_state_t uct_tcp_ep_cm_state[] = { + [UCT_TCP_EP_CONN_STATE_CLOSED] = { + .name = "CLOSED", + .tx_progress = (uct_tcp_ep_progress_t)ucs_empty_function_return_zero, + .rx_progress = (uct_tcp_ep_progress_t)ucs_empty_function_return_zero + }, + [UCT_TCP_EP_CONN_STATE_CONNECTING] = { + .name = "CONNECTING", + .tx_progress = uct_tcp_cm_conn_progress, + .rx_progress = uct_tcp_ep_progress_data_rx + }, + [UCT_TCP_EP_CONN_STATE_WAITING_ACK] = { + .name = "WAITING_ACK", + .tx_progress = (uct_tcp_ep_progress_t)ucs_empty_function_return_zero, + .rx_progress = uct_tcp_ep_progress_data_rx + }, + [UCT_TCP_EP_CONN_STATE_RECV_MAGIC_NUMBER] = { + .name = "RECV_MAGIC_NUMBER", + .tx_progress = (uct_tcp_ep_progress_t)ucs_empty_function_return_zero, + .rx_progress = uct_tcp_ep_progress_magic_number_rx + }, + [UCT_TCP_EP_CONN_STATE_ACCEPTING] = { + .name = "ACCEPTING", + .tx_progress = 
(uct_tcp_ep_progress_t)ucs_empty_function_return_zero, + .rx_progress = uct_tcp_ep_progress_data_rx + }, + [UCT_TCP_EP_CONN_STATE_WAITING_REQ] = { + .name = "WAITING_REQ", + .tx_progress = (uct_tcp_ep_progress_t)ucs_empty_function_return_zero, + .rx_progress = uct_tcp_ep_progress_data_rx + }, + [UCT_TCP_EP_CONN_STATE_CONNECTED] = { + .name = "CONNECTED", + .tx_progress = uct_tcp_ep_progress_data_tx, + .rx_progress = uct_tcp_ep_progress_data_rx } -} +}; static inline int uct_tcp_ep_ctx_buf_empty(uct_tcp_ep_ctx_t *ctx) { @@ -39,11 +69,21 @@ static inline int uct_tcp_ep_ctx_buf_need_progress(uct_tcp_ep_ctx_t *ctx) return ctx->offset < ctx->length; } -static inline int uct_tcp_ep_can_send(uct_tcp_ep_t *ep) +static inline ucs_status_t uct_tcp_ep_check_tx_res(uct_tcp_ep_t *ep) { - ucs_assert(ep->tx.offset <= ep->tx.length); - /* TODO optimize to allow partial sends/message coalescing */ - return uct_tcp_ep_ctx_buf_empty(&ep->tx); + if (ucs_unlikely(ep->conn_state != UCT_TCP_EP_CONN_STATE_CONNECTED)) { + if (ep->conn_state == UCT_TCP_EP_CONN_STATE_CLOSED) { + return UCS_ERR_UNREACHABLE; + } + + ucs_assertv((ep->conn_state == UCT_TCP_EP_CONN_STATE_CONNECTING) || + (ep->conn_state == UCT_TCP_EP_CONN_STATE_WAITING_ACK) || + (ep->conn_state == UCT_TCP_EP_CONN_STATE_WAITING_REQ), + "ep=%p", ep); + return UCS_ERR_NO_RESOURCE; + } + + return uct_tcp_ep_ctx_buf_empty(&ep->tx) ? 
UCS_OK : UCS_ERR_NO_RESOURCE; } static inline void uct_tcp_ep_ctx_rewind(uct_tcp_ep_ctx_t *ctx) @@ -52,9 +92,10 @@ static inline void uct_tcp_ep_ctx_rewind(uct_tcp_ep_ctx_t *ctx) ctx->length = 0; } -static void uct_tcp_ep_ctx_init(uct_tcp_ep_ctx_t *ctx) +static inline void uct_tcp_ep_ctx_init(uct_tcp_ep_ctx_t *ctx) { - ctx->buf = NULL; + ctx->put_sn = UINT32_MAX; + ctx->buf = NULL; uct_tcp_ep_ctx_rewind(ctx); } @@ -65,163 +106,206 @@ static inline void uct_tcp_ep_ctx_reset(uct_tcp_ep_ctx_t *ctx) uct_tcp_ep_ctx_rewind(ctx); } -static void uct_tcp_ep_addr_cleanup(ucs_sock_addr_t *sock_addr) +static void uct_tcp_ep_addr_cleanup(struct sockaddr_in *sock_addr) { - ucs_free((void*)sock_addr->addr); - - sock_addr->addr = NULL; - sock_addr->addrlen = 0; + memset(sock_addr, 0, sizeof(*sock_addr)); } -static ucs_status_t uct_tcp_ep_addr_init(ucs_sock_addr_t *sock_addr, - const struct sockaddr *addr) +static void uct_tcp_ep_addr_init(struct sockaddr_in *sock_addr, + const struct sockaddr_in *peer_addr) { /* TODO: handle IPv4 and IPv6 */ - socklen_t addr_len = sizeof(struct sockaddr_in); - struct sockaddr *new_addr; - - if (addr == NULL) { - sock_addr->addr = NULL; - sock_addr->addrlen = 0; + if (peer_addr == NULL) { + uct_tcp_ep_addr_cleanup(sock_addr); } else { - new_addr = ucs_malloc(addr_len, "sock_addr"); - if (new_addr == NULL) { - return UCS_ERR_NO_MEMORY; - } - - sock_addr->addr = new_addr; - sock_addr->addrlen = addr_len; + *sock_addr = *peer_addr; } - - return UCS_OK; -} - -static void uct_tcp_ep_close_fd(int *fd_p) -{ - if (*fd_p != -1) { - close(*fd_p); - *fd_p = -1; - } -} - -/* Must be called with `iface::worker::async` blocked */ -static unsigned uct_tcp_ep_in_iface(uct_tcp_ep_t *ep) -{ - return !ucs_list_is_empty(&ep->list); } -/* Must be called with `iface::worker::async` blocked */ -static void uct_tcp_ep_del_from_iface(uct_tcp_iface_t *iface, - uct_tcp_ep_t *ep) +unsigned uct_tcp_ep_is_self(const uct_tcp_ep_t *ep) { - if (uct_tcp_ep_in_iface(ep)) { - 
ucs_list_del(&ep->list); - } -} + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + ucs_status_t status; + int cmp; -/* Must be called with `iface::worker::async` blocked */ -static void uct_tcp_ep_add_to_iface(uct_tcp_iface_t *iface, - uct_tcp_ep_t *ep) -{ - ucs_list_add_tail(&iface->ep_list, &ep->list); + cmp = ucs_sockaddr_cmp((const struct sockaddr*)&ep->peer_addr, + (const struct sockaddr*)&iface->config.ifaddr, + &status); + ucs_assertv(status == UCS_OK, "ep=%p", ep); + return !cmp; } static void uct_tcp_ep_cleanup(uct_tcp_ep_t *ep) -{ +{ uct_tcp_ep_addr_cleanup(&ep->peer_addr); - if (ep->tx.buf) { + if (ep->tx.buf != NULL) { uct_tcp_ep_ctx_reset(&ep->tx); } - if (ep->rx.buf) { + + if (ep->rx.buf != NULL) { uct_tcp_ep_ctx_reset(&ep->rx); } - uct_tcp_ep_close_fd(&ep->fd); + if (ep->events && (ep->fd != -1)) { + uct_tcp_ep_mod_events(ep, 0, ep->events); + } + + ucs_close_fd(&ep->fd); } static UCS_CLASS_INIT_FUNC(uct_tcp_ep_t, uct_tcp_iface_t *iface, - int fd, const struct sockaddr *dest_addr) + int fd, const struct sockaddr_in *dest_addr) { ucs_status_t status; + ucs_assertv(fd >= 0, "iface=%p", iface); + UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super) - self->peer_addr.addr = NULL; + uct_tcp_ep_addr_init(&self->peer_addr, dest_addr); - status = uct_tcp_ep_addr_init(&self->peer_addr, dest_addr); - if (status != UCS_OK) { - return status; - } + uct_tcp_ep_ctx_init(&self->tx); + uct_tcp_ep_ctx_init(&self->rx); - self->tx.buf = NULL; - self->tx.progress = (uct_tcp_ep_progress_t)ucs_empty_function_return_zero; + self->events = 0; + self->conn_retries = 0; + self->fd = fd; + self->ctx_caps = 0; + self->conn_state = UCT_TCP_EP_CONN_STATE_CLOSED; - self->rx.buf = NULL; - self->rx.progress = (uct_tcp_ep_progress_t)ucs_empty_function_return_zero; - - ucs_queue_head_init(&self->pending_q); - self->events = 0; - self->fd = fd; ucs_list_head_init(&self->list); + ucs_queue_head_init(&self->pending_q); + 
ucs_queue_head_init(&self->put_comp_q); - if (self->fd == -1) { - status = ucs_socket_create(AF_INET, SOCK_STREAM, &self->fd); - if (status != UCS_OK) { - goto err_cleanup; - } - - /* TODO use non-blocking connect */ - status = ucs_socket_connect(self->fd, dest_addr); + /* Make a socket non-blocking if an EP is created during accepting + * a connection or non-blocking connection mode is requested */ + if ((dest_addr == NULL) || iface->config.conn_nb) { + status = ucs_sys_fcntl_modfl(self->fd, O_NONBLOCK, 0); if (status != UCS_OK) { goto err_cleanup; } - - uct_tcp_ep_ctx_init(&self->tx); - self->tx.progress = uct_tcp_ep_progress_tx; - } else { - uct_tcp_ep_ctx_init(&self->rx); - self->rx.progress = uct_tcp_ep_progress_rx; - } - - status = ucs_sys_fcntl_modfl(self->fd, O_NONBLOCK, 0); - if (status != UCS_OK) { - if (fd != -1) { - /* to be closed by this function caller */ - self->fd = -1; - } - goto err_cleanup; } status = uct_tcp_iface_set_sockopt(iface, self->fd); if (status != UCS_OK) { - if (fd != -1) { - /* to be closed by this function caller */ - self->fd = -1; - } goto err_cleanup; } - UCS_ASYNC_BLOCK(iface->super.worker->async); - uct_tcp_ep_add_to_iface(iface, self); - UCS_ASYNC_UNBLOCK(iface->super.worker->async); + uct_tcp_iface_add_ep(self); ucs_debug("tcp_ep %p: created on iface %p, fd %d", self, iface, self->fd); return UCS_OK; err_cleanup: + /* need to be closed by this function caller */ + self->fd = -1; uct_tcp_ep_cleanup(self); return status; } -static UCS_CLASS_CLEANUP_FUNC(uct_tcp_ep_t) +const char *uct_tcp_ep_ctx_caps_str(uint8_t ep_ctx_caps, char *str_buffer) { - uct_tcp_iface_t *iface = ucs_derived_of(self->super.super.iface, + ucs_snprintf_zero(str_buffer, UCT_TCP_EP_CTX_CAPS_STR_MAX, "[%s:%s]", + (ep_ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_TX)) ? + "Tx" : "-", + (ep_ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_RX)) ? 
+ "Rx" : "-"); + return str_buffer; +} + +void uct_tcp_ep_change_ctx_caps(uct_tcp_ep_t *ep, uint8_t new_caps) +{ + char str_prev_ctx_caps[UCT_TCP_EP_CTX_CAPS_STR_MAX]; + char str_cur_ctx_caps[UCT_TCP_EP_CTX_CAPS_STR_MAX]; + + if (ep->ctx_caps != new_caps) { + ucs_trace("tcp_ep %p: ctx caps changed %s -> %s", ep, + uct_tcp_ep_ctx_caps_str(ep->ctx_caps, str_prev_ctx_caps), + uct_tcp_ep_ctx_caps_str(new_caps, str_cur_ctx_caps)); + ep->ctx_caps = new_caps; + } +} + +ucs_status_t uct_tcp_ep_add_ctx_cap(uct_tcp_ep_t *ep, + uct_tcp_ep_ctx_type_t cap) +{ + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_tcp_iface_t); + uint8_t prev_caps = ep->ctx_caps; + + uct_tcp_ep_change_ctx_caps(ep, ep->ctx_caps | UCS_BIT(cap)); + if (!uct_tcp_ep_is_self(ep) && (prev_caps != ep->ctx_caps)) { + if (!(prev_caps & UCT_TCP_EP_CTX_CAPS)) { + return uct_tcp_cm_add_ep(iface, ep); + } else if (ucs_test_all_flags(ep->ctx_caps, UCT_TCP_EP_CTX_CAPS)) { + uct_tcp_cm_remove_ep(iface, ep); + } + } + + return UCS_OK; +} + +ucs_status_t uct_tcp_ep_remove_ctx_cap(uct_tcp_ep_t *ep, + uct_tcp_ep_ctx_type_t cap) +{ + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + uint8_t prev_caps = ep->ctx_caps; + + uct_tcp_ep_change_ctx_caps(ep, ep->ctx_caps & ~UCS_BIT(cap)); + if (!uct_tcp_ep_is_self(ep)) { + if (ucs_test_all_flags(prev_caps, UCT_TCP_EP_CTX_CAPS)) { + return uct_tcp_cm_add_ep(iface, ep); + } else if (!(ep->ctx_caps & UCT_TCP_EP_CTX_CAPS)) { + uct_tcp_cm_remove_ep(iface, ep); + } + } + + return UCS_OK; +} + +ucs_status_t uct_tcp_ep_move_ctx_cap(uct_tcp_ep_t *from_ep, uct_tcp_ep_t *to_ep, + uct_tcp_ep_ctx_type_t ctx_cap) +{ + ucs_status_t status; + + status = uct_tcp_ep_remove_ctx_cap(from_ep, ctx_cap); + if (status != UCS_OK) { + return status; + } + + return uct_tcp_ep_add_ctx_cap(to_ep, ctx_cap); +} + +static UCS_CLASS_CLEANUP_FUNC(uct_tcp_ep_t) +{ + uct_tcp_iface_t UCS_V_UNUSED *iface = + ucs_derived_of(self->super.super.iface, 
uct_tcp_iface_t); + uct_tcp_ep_put_completion_t *put_comp; - UCS_ASYNC_BLOCK(iface->super.worker->async); - uct_tcp_ep_del_from_iface(iface, self); - UCS_ASYNC_UNBLOCK(iface->super.worker->async); + uct_tcp_ep_mod_events(self, 0, self->events); + + if (self->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_TX)) { + uct_tcp_ep_remove_ctx_cap(self, UCT_TCP_EP_CTX_TYPE_TX); + } + + if (self->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_RX)) { + uct_tcp_ep_remove_ctx_cap(self, UCT_TCP_EP_CTX_TYPE_RX); + } + + ucs_assertv(!(self->ctx_caps & UCT_TCP_EP_CTX_CAPS), "ep=%p", self); + + ucs_queue_for_each_extract(put_comp, &self->put_comp_q, elem, 1) { + ucs_free(put_comp); + } + + uct_tcp_iface_remove_ep(self); + + if (self->conn_state != UCT_TCP_EP_CONN_STATE_CLOSED) { + uct_tcp_cm_change_conn_state(self, UCT_TCP_EP_CONN_STATE_CLOSED); + } uct_tcp_ep_cleanup(self); @@ -230,16 +314,120 @@ static UCS_CLASS_CLEANUP_FUNC(uct_tcp_ep_t) UCS_CLASS_DEFINE(uct_tcp_ep_t, uct_base_ep_t); -UCS_CLASS_DEFINE_NAMED_NEW_FUNC(uct_tcp_ep_create, uct_tcp_ep_t, uct_tcp_ep_t, +UCS_CLASS_DEFINE_NAMED_NEW_FUNC(uct_tcp_ep_init, uct_tcp_ep_t, uct_tcp_ep_t, uct_tcp_iface_t*, int, - const struct sockaddr*) -UCS_CLASS_DEFINE_NAMED_DELETE_FUNC(uct_tcp_ep_destroy, uct_tcp_ep_t, uct_ep_t) + const struct sockaddr_in*) +UCS_CLASS_DEFINE_NAMED_DELETE_FUNC(uct_tcp_ep_destroy_internal, + uct_tcp_ep_t, uct_ep_t) + +void uct_tcp_ep_destroy(uct_ep_h tl_ep) +{ + uct_tcp_ep_t *ep = ucs_derived_of(tl_ep, uct_tcp_ep_t); + + if ((ep->conn_state == UCT_TCP_EP_CONN_STATE_CONNECTED) && + ucs_test_all_flags(ep->ctx_caps, UCT_TCP_EP_CTX_CAPS)) { + /* remove TX capability, but still will be able to receive data */ + uct_tcp_ep_remove_ctx_cap(ep, UCT_TCP_EP_CTX_TYPE_TX); + } else { + uct_tcp_ep_destroy_internal(tl_ep); + } +} + +void uct_tcp_ep_set_failed(uct_tcp_ep_t *ep) +{ + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); -ucs_status_t uct_tcp_ep_create_connected(const uct_ep_params_t *params, - 
uct_ep_h *ep_p) + if (ep->conn_state != UCT_TCP_EP_CONN_STATE_CLOSED) { + uct_tcp_cm_change_conn_state(ep, UCT_TCP_EP_CONN_STATE_CLOSED); + } + + uct_set_ep_failed(&UCS_CLASS_NAME(uct_tcp_ep_t), + &ep->super.super, &iface->super.super, + UCS_ERR_UNREACHABLE); +} + +static ucs_status_t +uct_tcp_ep_create_socket_and_connect(uct_tcp_iface_t *iface, + const struct sockaddr_in *dest_addr, + uct_tcp_ep_t **ep_p) +{ + uct_tcp_ep_t *ep = NULL; + ucs_status_t status; + int fd; + + /* if EP is already allocated, dest_addr can be NULL */ + ucs_assert((*ep_p != NULL) || (dest_addr != NULL)); + + status = ucs_socket_create(AF_INET, SOCK_STREAM, &fd); + if (status != UCS_OK) { + return status; + } + + if (*ep_p == NULL) { + status = uct_tcp_ep_init(iface, fd, dest_addr, &ep); + if (status != UCS_OK) { + goto err_close_fd; + } + + /* EP is responsible for this socket fd from now */ + fd = -1; + } else { + ep = *ep_p; + ep->fd = fd; + } + + status = uct_tcp_cm_conn_start(ep); + if (status != UCS_OK) { + goto err_ep_destroy; + } + + if (*ep_p == NULL) { + *ep_p = ep; + } + + return UCS_OK; + +err_ep_destroy: + if (*ep_p == NULL) { + uct_tcp_ep_destroy_internal(&ep->super.super); + } +err_close_fd: + /* fd has to be valid in case of valid EP has been + * passed to this function */ + ucs_assert((*ep_p == NULL) || (fd != -1)); + ucs_close_fd(&fd); + return status; +} + +static ucs_status_t uct_tcp_ep_create_connected(uct_tcp_iface_t *iface, + const struct sockaddr_in *dest_addr, + uct_tcp_ep_t **ep_p) +{ + ucs_status_t status; + + status = uct_tcp_ep_create_socket_and_connect(iface, dest_addr, ep_p); + if (status != UCS_OK) { + return status; + } + + status = uct_tcp_ep_add_ctx_cap(*ep_p, UCT_TCP_EP_CTX_TYPE_TX); + if (status != UCS_OK) { + goto err_ep_destroy; + } + + return UCS_OK; + +err_ep_destroy: + uct_tcp_ep_destroy_internal(&(*ep_p)->super.super); + return status; +} + +ucs_status_t uct_tcp_ep_create(const uct_ep_params_t *params, + uct_ep_h *ep_p) { uct_tcp_iface_t *iface = 
ucs_derived_of(params->iface, uct_tcp_iface_t); - uct_tcp_ep_t *tcp_ep = NULL; + uct_tcp_ep_t *ep = NULL; struct sockaddr_in dest_addr; ucs_status_t status; @@ -250,33 +438,102 @@ ucs_status_t uct_tcp_ep_create_connected(const uct_ep_params_t *params, dest_addr.sin_port = *(in_port_t*)params->iface_addr; dest_addr.sin_addr = *(struct in_addr*)params->dev_addr; - /* TODO try to reuse existing connection */ - status = uct_tcp_ep_create(iface, -1, (struct sockaddr*)&dest_addr, &tcp_ep); + do { + ep = uct_tcp_cm_search_ep(iface, &dest_addr, + UCT_TCP_EP_CTX_TYPE_RX); + if (ep) { + ucs_assert(!(ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_TX))); + /* Found EP with RX ctx, try to send the connection request + * to the remote peer, if it successful - assign TX to this EP + * and return the EP to the user, otherwise - destroy this EP + * and try to search another EP w/o TX capability or create + * new EP */ + status = uct_tcp_cm_send_event(ep, UCT_TCP_CM_CONN_REQ); + if (status != UCS_OK) { + uct_tcp_ep_destroy_internal(&ep->super.super); + ep = NULL; + } else { + status = uct_tcp_ep_add_ctx_cap(ep, UCT_TCP_EP_CTX_TYPE_TX); + if (status != UCS_OK) { + return status; + } + } + } else { + status = uct_tcp_ep_create_connected(iface, &dest_addr, &ep); + break; + } + } while (ep == NULL); + if (status == UCS_OK) { - ucs_debug("tcp_ep %p: connected to %s:%d", tcp_ep, - inet_ntoa(dest_addr.sin_addr), ntohs(dest_addr.sin_port)); - *ep_p = &tcp_ep->super.super; + /* cppcheck-suppress autoVariables */ + *ep_p = &ep->super.super; } return status; } -void uct_tcp_ep_mod_events(uct_tcp_ep_t *ep, uint32_t add, uint32_t remove) +void uct_tcp_ep_mod_events(uct_tcp_ep_t *ep, int add, int rem) { - int old_events = ep->events; - int new_events = (ep->events | add) & ~remove; + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + int old_events = ep->events; + int new_events = (ep->events | add) & ~rem; + ucs_status_t status; if (new_events != ep->events) { 
ep->events = new_events; ucs_trace("tcp_ep %p: set events to %c%c", ep, - (new_events & EPOLLIN) ? 'i' : '-', - (new_events & EPOLLOUT) ? 'o' : '-'); + (new_events & UCS_EVENT_SET_EVREAD) ? 'r' : '-', + (new_events & UCS_EVENT_SET_EVWRITE) ? 'w' : '-'); if (new_events == 0) { - uct_tcp_ep_epoll_ctl(ep, EPOLL_CTL_DEL); + status = ucs_event_set_del(iface->event_set, ep->fd); } else if (old_events != 0) { - uct_tcp_ep_epoll_ctl(ep, EPOLL_CTL_MOD); + status = ucs_event_set_mod(iface->event_set, ep->fd, + (ucs_event_set_type_t)ep->events, + (void *)ep); } else { - uct_tcp_ep_epoll_ctl(ep, EPOLL_CTL_ADD); + status = ucs_event_set_add(iface->event_set, ep->fd, + (ucs_event_set_type_t)ep->events, + (void *)ep); } + if (status != UCS_OK) { + ucs_fatal("unable to modify event set for tcp_ep %p (fd=%d)", ep, + ep->fd); + } + } +} + +static inline void uct_tcp_ep_handle_put_ack(uct_tcp_ep_t *ep, + uct_tcp_ep_put_ack_hdr_t *put_ack) +{ + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + uct_tcp_ep_put_completion_t *put_comp; + + if (put_ack->sn == ep->tx.put_sn) { + /* Since there are no other PUT operations in-flight, can remove flag + * and decrement iface outstanding operations counter */ + ucs_assert(ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_TX_WAITING_ACK)); + ep->ctx_caps &= ~UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_TX_WAITING_ACK); + uct_tcp_iface_outstanding_dec(iface); + } + + ucs_queue_for_each_extract(put_comp, &ep->put_comp_q, elem, + (UCS_CIRCULAR_COMPARE32(put_comp->wait_put_sn, + <=, put_ack->sn))) { + uct_invoke_completion(put_comp->comp, UCS_OK); + ucs_free(put_comp); + } +} + +void uct_tcp_ep_pending_queue_dispatch(uct_tcp_ep_t *ep) +{ + uct_pending_req_priv_queue_t *priv; + + uct_pending_queue_dispatch(priv, &ep->pending_q, + uct_tcp_ep_ctx_buf_empty(&ep->tx)); + if (uct_tcp_ep_ctx_buf_empty(&ep->tx)) { + ucs_assert(ucs_queue_is_empty(&ep->pending_q)); + uct_tcp_ep_mod_events(ep, 0, UCS_EVENT_SET_EVWRITE); } } @@ -285,104 
+542,332 @@ static void uct_tcp_ep_handle_disconnected(uct_tcp_ep_t *ep, { ucs_debug("tcp_ep %p: remote disconnected", ep); - uct_tcp_ep_mod_events(ep, 0, EPOLLIN); uct_tcp_ep_ctx_reset(ctx); - uct_tcp_ep_destroy(&ep->super.super); + + if (ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_TX)) { + if (ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_RX)) { + uct_tcp_ep_remove_ctx_cap(ep, UCT_TCP_EP_CTX_TYPE_RX); + } + + uct_tcp_ep_mod_events(ep, 0, ep->events); + ucs_close_fd(&ep->fd); + } else { + /* If the EP supports RX only or no capabilities set, destroy it */ + uct_tcp_ep_destroy_internal(&ep->super.super); + } } -static inline unsigned uct_tcp_ep_send(uct_tcp_ep_t *ep) +static inline ssize_t uct_tcp_ep_send(uct_tcp_ep_t *ep) { uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_tcp_iface_t); - size_t send_length; + size_t sent_length; ucs_status_t status; - send_length = ep->tx.length - ep->tx.offset; - ucs_assert(send_length > 0); + ucs_assert(ep->tx.length > ep->tx.offset); + sent_length = ep->tx.length - ep->tx.offset; - status = uct_tcp_send(ep->fd, ep->tx.buf + ep->tx.offset, &send_length); - if (status < 0) { - return 0; + status = ucs_socket_send_nb(ep->fd, UCS_PTR_BYTE_OFFSET(ep->tx.buf, ep->tx.offset), + &sent_length, NULL, NULL); + if (ucs_unlikely((status != UCS_OK) && + (status != UCS_ERR_NO_PROGRESS))) { + return status; } - ucs_trace_data("tcp_ep %p: sent %zu bytes", ep, send_length); + iface->outstanding -= sent_length; + ep->tx.offset += sent_length; - iface->outstanding -= send_length; - ep->tx.offset += send_length; - if (!uct_tcp_ep_ctx_buf_need_progress(&ep->tx)) { - uct_tcp_ep_ctx_reset(&ep->tx); - } + ucs_assert(sent_length <= SSIZE_MAX); - return send_length > 0; + return sent_length; } -static inline unsigned uct_tcp_ep_recv(uct_tcp_ep_t *ep, size_t *recv_length) +static inline void uct_tcp_ep_comp_zcopy(uct_tcp_ep_t *ep, + uct_completion_t *comp, + ucs_status_t status) { + ep->ctx_caps &= ~UCS_BIT(UCT_TCP_EP_CTX_TYPE_ZCOPY_TX); + 
if (comp != NULL) { + uct_invoke_completion(comp, status); + } +} + +static inline ssize_t uct_tcp_ep_sendv(uct_tcp_ep_t *ep) +{ + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + uct_tcp_ep_zcopy_tx_t *ctx = (uct_tcp_ep_zcopy_tx_t*)ep->tx.buf; + size_t sent_length; ucs_status_t status; - ucs_assertv(*recv_length, "ep=%p", ep); + ucs_assertv((ep->tx.offset < ep->tx.length) && + (ctx->iov_cnt > 0), "ep=%p", ep); + + status = ucs_socket_sendv_nb(ep->fd, &ctx->iov[ctx->iov_index], + ctx->iov_cnt - ctx->iov_index, + &sent_length, NULL, NULL); - status = uct_tcp_recv(ep->fd, ep->rx.buf + ep->rx.length, recv_length); if (ucs_unlikely(status != UCS_OK)) { - if (status == UCS_ERR_CANCELED) { - uct_tcp_ep_handle_disconnected(ep, &ep->rx); + if (status == UCS_ERR_NO_PROGRESS) { + ucs_assert(sent_length == 0); + return 0; } - *recv_length = 0; + + uct_tcp_ep_comp_zcopy(ep, ctx->comp, status); + return status; + } + + ep->tx.offset += sent_length; + iface->outstanding -= sent_length; + + if (ep->tx.offset != ep->tx.length) { + ucs_iov_advance(ctx->iov, ctx->iov_cnt, + &ctx->iov_index, sent_length); + } else { + uct_tcp_ep_comp_zcopy(ep, ctx->comp, UCS_OK); + } + + ucs_assert(sent_length <= SSIZE_MAX); + return sent_length; +} + +static int uct_tcp_ep_is_conn_closed_by_peer(ucs_status_t io_status) +{ + return (io_status == UCS_ERR_REJECTED) || + (io_status == UCS_ERR_CONNECTION_RESET); +} + +ucs_status_t uct_tcp_ep_handle_dropped_connect(uct_tcp_ep_t *ep, + ucs_status_t io_status) +{ + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + ucs_status_t status; + + /* if connection establishment fails, the system limits + * may not be big enough */ + if (((ep->conn_state == UCT_TCP_EP_CONN_STATE_CONNECTING) || + (ep->conn_state == UCT_TCP_EP_CONN_STATE_WAITING_ACK) || + (ep->conn_state == UCT_TCP_EP_CONN_STATE_WAITING_REQ)) && + (uct_tcp_ep_is_conn_closed_by_peer(io_status) || + (io_status == UCS_ERR_TIMED_OUT))) 
{ + uct_tcp_ep_mod_events(ep, 0, ep->events); + ucs_close_fd(&ep->fd); + + uct_tcp_cm_change_conn_state(ep, UCT_TCP_EP_CONN_STATE_CLOSED); + + status = uct_tcp_ep_create_socket_and_connect(iface, NULL, &ep); + if (status == UCS_OK) { + return UCS_OK; + } + + ucs_error("try to increase \"net.core.somaxconn\", " + "\"net.core.netdev_max_backlog\", " + "\"net.ipv4.tcp_max_syn_backlog\" to the maximum value " + "on the remote node or increase %s%s%s (=%u)", + UCS_DEFAULT_ENV_PREFIX, UCT_TCP_CONFIG_PREFIX, + UCT_TCP_CONFIG_MAX_CONN_RETRIES, + iface->config.max_conn_retries); + } + + return io_status; +} + +static ucs_status_t uct_tcp_ep_io_err_handler_cb(void *arg, + ucs_status_t io_status) +{ + uct_tcp_ep_t *ep = (uct_tcp_ep_t*)arg; + uct_tcp_iface_t UCS_V_UNUSED *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + char str_local_addr[UCS_SOCKADDR_STRING_LEN]; + char str_remote_addr[UCS_SOCKADDR_STRING_LEN]; + + if (uct_tcp_ep_is_conn_closed_by_peer(io_status) && + ((ep->conn_state == UCT_TCP_EP_CONN_STATE_ACCEPTING) || + ((ep->conn_state == UCT_TCP_EP_CONN_STATE_CONNECTED) && + (ep->ctx_caps == UCS_BIT(UCT_TCP_EP_CTX_TYPE_RX)) /* only RX cap */))) { + ucs_debug("tcp_ep %p: detected that [%s <-> %s] connection was " + "dropped by the peer", ep, + ucs_sockaddr_str((const struct sockaddr*)&iface->config.ifaddr, + str_local_addr, UCS_SOCKADDR_STRING_LEN), + ucs_sockaddr_str((const struct sockaddr*)&ep->peer_addr, + str_remote_addr, UCS_SOCKADDR_STRING_LEN)); + return UCS_OK; + } + + return uct_tcp_ep_handle_dropped_connect(ep, io_status); +} + +static inline void uct_tcp_ep_handle_recv_err(uct_tcp_ep_t *ep, + ucs_status_t status) +{ + if ((status == UCS_ERR_NO_PROGRESS) || (status == UCS_ERR_CANCELED)) { + /* If no data were read to the allocated buffer, + * we can safely reset it for futher re-use and to + * avoid overwriting this buffer, because `rx::length == 0` */ + if (ep->rx.length == 0) { + uct_tcp_ep_ctx_reset(&ep->rx); + } + } else { + 
uct_tcp_ep_handle_disconnected(ep, &ep->rx); + } +} + +static inline unsigned uct_tcp_ep_recv(uct_tcp_ep_t *ep, size_t recv_length) +{ + uct_tcp_iface_t UCS_V_UNUSED *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + ucs_status_t status; + + ucs_assertv(recv_length != 0, "ep=%p", ep); + + status = ucs_socket_recv_nb(ep->fd, UCS_PTR_BYTE_OFFSET(ep->rx.buf, + ep->rx.length), + &recv_length, uct_tcp_ep_io_err_handler_cb, ep); + if (ucs_unlikely(status != UCS_OK)) { + uct_tcp_ep_handle_recv_err(ep, status); return 0; } - ep->rx.length += *recv_length; - ucs_trace_data("tcp_ep %p: recvd %zu bytes", ep, *recv_length); + ucs_assertv(recv_length, "ep=%p", ep); + + ep->rx.length += recv_length; + ucs_trace_data("tcp_ep %p: recvd %zu bytes", ep, recv_length); + ucs_assert(ep->rx.length <= (iface->config.rx_seg_size * 2)); return 1; } -unsigned uct_tcp_ep_progress_tx(uct_tcp_ep_t *ep) +/* Forward declaration - the function depends on AM send + * functions implemented below */ +static void uct_tcp_ep_post_put_ack(uct_tcp_ep_t *ep); + +static unsigned uct_tcp_ep_progress_data_tx(uct_tcp_ep_t *ep) { - unsigned count = 0; - uct_pending_req_priv_queue_t *priv; + unsigned ret = 0; + ssize_t offset; ucs_trace_func("ep=%p", ep); - if (!uct_tcp_ep_ctx_buf_empty(&ep->tx)) { - count += uct_tcp_ep_send(ep); + if (uct_tcp_ep_ctx_buf_need_progress(&ep->tx)) { + offset = (!(ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_ZCOPY_TX)) ? 
+ uct_tcp_ep_send(ep) : uct_tcp_ep_sendv(ep)); + if (ucs_unlikely(offset < 0)) { + uct_tcp_ep_handle_disconnected(ep, &ep->tx); + return 1; + } + + ret = (offset > 0); + + ucs_trace_data("ep %p fd %d sent %zu/%zu bytes, moved by offset %zd", + ep, ep->fd, ep->tx.offset, ep->tx.length, offset); + + if (!uct_tcp_ep_ctx_buf_need_progress(&ep->tx)) { + uct_tcp_ep_ctx_reset(&ep->tx); + } + } + + if (ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_RX_SENDING_ACK)) { + uct_tcp_ep_post_put_ack(ep); } - uct_pending_queue_dispatch(priv, &ep->pending_q, uct_tcp_ep_can_send(ep)); + if (!ucs_queue_is_empty(&ep->pending_q)) { + uct_tcp_ep_pending_queue_dispatch(ep); + return ret; + } - if (uct_tcp_ep_can_send(ep)) { + if (uct_tcp_ep_ctx_buf_empty(&ep->tx)) { ucs_assert(ucs_queue_is_empty(&ep->pending_q)); - uct_tcp_ep_mod_events(ep, 0, EPOLLOUT); + uct_tcp_ep_mod_events(ep, 0, UCS_EVENT_SET_EVWRITE); } - return count; + return ret; } static inline void uct_tcp_ep_comp_recv_am(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep, uct_tcp_am_hdr_t *hdr) { - ucs_assertv(hdr->am_id < UCT_AM_ID_MAX, "invalid am id: %d", hdr->am_id); - uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_RECV, hdr->am_id, - hdr + 1, hdr->length, "RECV fd %d", ep->fd); - uct_iface_invoke_am(&iface->super, hdr->am_id, hdr + 1, - hdr->length, 0); + hdr + 1, hdr->length, + "RECV: ep %p fd %d received %zu/%zu bytes", + ep, ep->fd, ep->rx.offset, ep->rx.length); + uct_iface_invoke_am(&iface->super, hdr->am_id, hdr + 1, hdr->length, 0); +} + +static inline ucs_status_t +uct_tcp_ep_put_rx_advance(uct_tcp_ep_t *ep, uct_tcp_ep_put_req_hdr_t *put_req, + size_t recv_length) +{ + ucs_assert(!(ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_RX_SENDING_ACK))); + ucs_assert(recv_length <= put_req->length); + put_req->addr += recv_length; + put_req->length -= recv_length; + + if (!put_req->length) { + uct_tcp_ep_post_put_ack(ep); + + /* EP's ctx_caps doesn't have UCT_TCP_EP_CTX_TYPE_PUT_RX flag + * set in case of entire PUT payload 
was received through + * AM protocol */ + if (ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_RX)) { + ep->ctx_caps &= ~UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_RX); + uct_tcp_ep_ctx_reset(&ep->rx); + } + + return UCS_OK; + } + + return UCS_INPROGRESS; +} + +static inline void uct_tcp_ep_handle_put_req(uct_tcp_ep_t *ep, + uct_tcp_ep_put_req_hdr_t *put_req, + size_t extra_recvd_length) +{ + size_t copied_length; + ucs_status_t status; + + ucs_assert(put_req->addr || !put_req->length); + + copied_length = ucs_min(put_req->length, extra_recvd_length); + memcpy((void*)(uintptr_t)put_req->addr, + UCS_PTR_BYTE_OFFSET(ep->rx.buf, ep->rx.offset), + copied_length); + ep->rx.offset += copied_length; + ep->rx.put_sn = put_req->sn; + + /* Remove the flag that indicates that EP is sending PUT RX ACK in order + * to not ack the uncompleted PUT RX operation for which PUT REQ is being + * handled here. ACK for both operations will be sent after the completion + * of the last received PUT operation */ + ep->ctx_caps &= ~UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_RX_SENDING_ACK); + + status = uct_tcp_ep_put_rx_advance(ep, put_req, copied_length); + if (status == UCS_OK) { + return; + } + + ucs_assert(ep->rx.offset == ep->rx.length); + uct_tcp_ep_ctx_rewind(&ep->rx); + /* Since RX buffer and PUT request can be ovelapped, use memmove() */ + memmove(ep->rx.buf, put_req, sizeof(*put_req)); + ep->ctx_caps |= UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_RX); } -unsigned uct_tcp_ep_progress_rx(uct_tcp_ep_t *ep) +static unsigned uct_tcp_ep_progress_am_rx(uct_tcp_ep_t *ep) { uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_tcp_iface_t); + unsigned handled = 0; uct_tcp_am_hdr_t *hdr; size_t recv_length; - size_t remainder; + size_t remaining; ucs_trace_func("ep=%p", ep); if (!uct_tcp_ep_ctx_buf_need_progress(&ep->rx)) { ucs_assert(ep->rx.buf == NULL); - ep->rx.buf = ucs_mpool_get_inline(&iface->rx_mpool); if (ucs_unlikely(ep->rx.buf == NULL)) { ucs_warn("tcp_ep %p: unable to get a buffer from RX memory 
pool", ep); @@ -390,94 +875,354 @@ unsigned uct_tcp_ep_progress_rx(uct_tcp_ep_t *ep) } /* post the entire AM buffer */ - recv_length = iface->am_buf_size; - } else if (ep->rx.length - ep->rx.offset < sizeof(*hdr)) { - ucs_assert(ep->rx.buf != NULL); + recv_length = iface->config.rx_seg_size; + } else if (ep->rx.length < sizeof(*hdr)) { + ucs_assert((ep->rx.buf != NULL) && (ep->rx.offset == 0)); /* do partial receive of the remaining part of the hdr * and post the entire AM buffer */ - recv_length = iface->am_buf_size - ep->rx.length; + recv_length = iface->config.rx_seg_size - ep->rx.length; } else { - ucs_assert(ep->rx.buf != NULL); + ucs_assert((ep->rx.buf != NULL) && + ((ep->rx.length - ep->rx.offset) >= sizeof(*hdr))); /* do partial receive of the remaining user data */ - hdr = ep->rx.buf + ep->rx.offset; + hdr = UCS_PTR_BYTE_OFFSET(ep->rx.buf, ep->rx.offset); recv_length = hdr->length - (ep->rx.length - ep->rx.offset - sizeof(*hdr)); } - if (!uct_tcp_ep_recv(ep, &recv_length)) { + if (!uct_tcp_ep_recv(ep, recv_length)) { goto out; } /* Parse received active messages */ while (uct_tcp_ep_ctx_buf_need_progress(&ep->rx)) { - remainder = ep->rx.length - ep->rx.offset; - if (remainder < sizeof(*hdr)) { + remaining = ep->rx.length - ep->rx.offset; + if (remaining < sizeof(*hdr)) { /* Move the partially received hdr to the beginning of the buffer */ - memmove(ep->rx.buf, ep->rx.buf + ep->rx.offset, remainder); + memmove(ep->rx.buf, UCS_PTR_BYTE_OFFSET(ep->rx.buf, ep->rx.offset), + remaining); ep->rx.offset = 0; - ep->rx.length = remainder; + ep->rx.length = remaining; + handled++; goto out; } - hdr = ep->rx.buf + ep->rx.offset; - ucs_assert(hdr->length <= (iface->am_buf_size - sizeof(uct_tcp_am_hdr_t))); + hdr = UCS_PTR_BYTE_OFFSET(ep->rx.buf, ep->rx.offset); + ucs_assertv(hdr->length <= (iface->config.rx_seg_size - sizeof(*hdr)), + "tcp_ep %p (conn state - %s): %u vs %zu", + ep, uct_tcp_ep_cm_state[ep->conn_state].name, hdr->length, + (iface->config.rx_seg_size - 
sizeof(*hdr))); - if (remainder < sizeof(*hdr) + hdr->length) { + if (remaining < (sizeof(*hdr) + hdr->length)) { + handled++; goto out; } /* Full message was received */ ep->rx.offset += sizeof(*hdr) + hdr->length; + ucs_assert(ep->rx.offset <= ep->rx.length); + + if (ucs_likely(hdr->am_id < UCT_AM_ID_MAX)) { + uct_tcp_ep_comp_recv_am(iface, ep, hdr); + handled++; + } else if (hdr->am_id == UCT_TCP_EP_PUT_REQ_AM_ID) { + ucs_assert(hdr->length == sizeof(uct_tcp_ep_put_req_hdr_t)); + uct_tcp_ep_handle_put_req(ep, (uct_tcp_ep_put_req_hdr_t*)(hdr + 1), + ep->rx.length - ep->rx.offset); + handled++; + if (ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_RX)) { + /* It means that PUT RX is in progress and EP RX buffer + * is used to keep PUT header. So, we don't need to + * release a EP RX buffer */ + goto out; + } + } else if (hdr->am_id == UCT_TCP_EP_PUT_ACK_AM_ID) { + ucs_assert(hdr->length == sizeof(uint32_t)); + uct_tcp_ep_handle_put_ack(ep, (uct_tcp_ep_put_ack_hdr_t*)(hdr + 1)); + handled++; + } else { + ucs_assert(hdr->am_id == UCT_TCP_EP_CM_AM_ID); + handled += 1 + uct_tcp_cm_handle_conn_pkt(&ep, hdr + 1, hdr->length); + /* coverity[check_after_deref] */ + if (ep == NULL) { + goto out; + } + } - uct_tcp_ep_comp_recv_am(iface, ep, hdr); + ucs_assert(ep != NULL); } uct_tcp_ep_ctx_reset(&ep->rx); out: - return recv_length > 0; + return handled; } static inline ucs_status_t uct_tcp_ep_am_prepare(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep, uint8_t am_id, uct_tcp_am_hdr_t **hdr) { - UCT_CHECK_AM_ID(am_id); + ucs_status_t status; - if (!uct_tcp_ep_can_send(ep)) { - UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1); - return UCS_ERR_NO_RESOURCE; + status = uct_tcp_ep_check_tx_res(ep); + if (ucs_unlikely(status != UCS_OK)) { + if (ucs_likely(status == UCS_ERR_NO_RESOURCE)) { + goto err_no_res; + } + return status; } ucs_assertv(ep->tx.buf == NULL, "ep=%p", ep); ep->tx.buf = ucs_mpool_get_inline(&iface->tx_mpool); if (ucs_unlikely(ep->tx.buf == NULL)) { - 
UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1); - return UCS_ERR_NO_RESOURCE; + goto err_no_res; } *hdr = ep->tx.buf; (*hdr)->am_id = am_id; return UCS_OK; + +err_no_res: + if (ep->fd != -1) { + uct_tcp_ep_mod_events(ep, UCS_EVENT_SET_EVWRITE, 0); + } + UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1); + return UCS_ERR_NO_RESOURCE; } -static inline void uct_tcp_ep_am_send(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep, - const uct_tcp_am_hdr_t *hdr) +static unsigned uct_tcp_ep_progress_put_rx(uct_tcp_ep_t *ep) { - uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, hdr->am_id, - hdr + 1, hdr->length, "SEND fd %d", ep->fd); + uct_tcp_ep_put_req_hdr_t *put_req; + size_t recv_length; + ucs_status_t status; + + put_req = (uct_tcp_ep_put_req_hdr_t*)ep->rx.buf; + recv_length = put_req->length; + status = ucs_socket_recv_nb(ep->fd, (void*)(uintptr_t)put_req->addr, + &recv_length, + uct_tcp_ep_io_err_handler_cb, ep); + if (ucs_unlikely(status != UCS_OK)) { + uct_tcp_ep_handle_recv_err(ep, status); + return 0; + } + + ucs_assertv(recv_length, "ep=%p", ep); + + uct_tcp_ep_put_rx_advance(ep, put_req, recv_length); + + return 1; +} + +static unsigned uct_tcp_ep_progress_data_rx(uct_tcp_ep_t *ep) +{ + if (!(ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_RX))) { + return uct_tcp_ep_progress_am_rx(ep); + } else { + return uct_tcp_ep_progress_put_rx(ep); + } +} + +static unsigned uct_tcp_ep_progress_magic_number_rx(uct_tcp_ep_t *ep) +{ + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + char str_local_addr[UCS_SOCKADDR_STRING_LEN]; + char str_remote_addr[UCS_SOCKADDR_STRING_LEN]; + size_t recv_length, prev_length; + uint64_t magic_number; + + if (ep->rx.buf == NULL) { + ep->rx.buf = ucs_mpool_get_inline(&iface->rx_mpool); + if (ucs_unlikely(ep->rx.buf == NULL)) { + ucs_warn("tcp_ep %p: unable to get a buffer from RX memory pool", ep); + return 0; + } + } + + prev_length = ep->rx.length; + recv_length = 
sizeof(magic_number) - ep->rx.length; + + if (!uct_tcp_ep_recv(ep, recv_length)) { + /* Do not touch EP here as it could be destroyed during + * socket error handling */ + return 0; + } + + if (ep->rx.length < sizeof(magic_number)) { + return ((ep->rx.length - prev_length) > 0); + } + + magic_number = *(uint64_t*)ep->rx.buf; + + if (magic_number != UCT_TCP_MAGIC_NUMBER) { + /* Silently close this connection and destroy its EP */ + ucs_debug("tcp_iface %p (%s): received wrong magic number (expected: " + "%zu, received: %zu) for ep=%p (fd=%d) from %s", iface, + ucs_sockaddr_str((const struct sockaddr*)&iface->config.ifaddr, + str_local_addr, UCS_SOCKADDR_STRING_LEN), + UCT_TCP_MAGIC_NUMBER, magic_number, ep, + ep->fd, ucs_socket_getname_str(ep->fd, str_remote_addr, + UCS_SOCKADDR_STRING_LEN)); + goto err; + } + + uct_tcp_ep_ctx_reset(&ep->rx); + + uct_tcp_cm_change_conn_state(ep, UCT_TCP_EP_CONN_STATE_ACCEPTING); + + return 1; + +err: + uct_tcp_ep_destroy_internal(&ep->super.super); + return 0; +} + +static inline void +uct_tcp_ep_set_outstanding_zcopy(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep, + uct_tcp_ep_zcopy_tx_t *ctx, const void *header, + unsigned header_length, uct_completion_t *comp) +{ + ctx->comp = comp; + ep->ctx_caps |= UCS_BIT(UCT_TCP_EP_CTX_TYPE_ZCOPY_TX); + + if ((header_length != 0) && + /* check whether a user's header was sent or not */ + (ep->tx.offset < (sizeof(uct_tcp_am_hdr_t) + header_length))) { + ucs_assert(header_length <= iface->config.zcopy.max_hdr); + /* if the user's header wasn't sent completely, copy it to + * the EP TX buffer (after Zcopy context and IOVs) for + * retransmission. 
iov_len is already set to the proper value */ + ctx->iov[1].iov_base = UCS_PTR_BYTE_OFFSET(ep->tx.buf, + iface->config.zcopy.hdr_offset); + memcpy(ctx->iov[1].iov_base, header, header_length); + } + + ctx->iov_index = 0; + ucs_iov_advance(ctx->iov, ctx->iov_cnt, &ctx->iov_index, ep->tx.offset); + uct_tcp_ep_mod_events(ep, UCS_EVENT_SET_EVWRITE, 0); +} + +static inline ucs_status_t +uct_tcp_ep_am_send(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep, + const uct_tcp_am_hdr_t *hdr) +{ + ssize_t offset; ep->tx.length = sizeof(*hdr) + hdr->length; iface->outstanding += ep->tx.length; - uct_tcp_ep_send(ep); + offset = uct_tcp_ep_send(ep); + if (ucs_unlikely(offset < 0)) { + return (ucs_status_t)offset; + } + + uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, hdr->am_id, + hdr + 1, hdr->length, "SEND: ep %p fd %d sent " + "%zu/%zu bytes, moved by offset %zd", + ep, ep->fd, ep->tx.offset, ep->tx.length, offset); + + if (ucs_likely(!uct_tcp_ep_ctx_buf_need_progress(&ep->tx))) { + uct_tcp_ep_ctx_reset(&ep->tx); + } else { + uct_tcp_ep_mod_events(ep, UCS_EVENT_SET_EVWRITE, 0); + } + + return UCS_OK; +} - if (!uct_tcp_ep_ctx_buf_empty(&ep->tx)) { - uct_tcp_ep_mod_events(ep, EPOLLOUT, 0); +static const void* +uct_tcp_ep_am_sendv_get_trace_payload(uct_tcp_am_hdr_t *hdr, + const void *header, + const struct iovec *payload_iov, + int short_sendv) +{ + if (!short_sendv) { + return header; } + + /* If user requested trace data, we copy header and payload + * to EP TX buffer in order to trace correct data */ + uct_am_short_fill_data(hdr + 1, *(const uint64_t*)header, + payload_iov->iov_base, payload_iov->iov_len); + return (hdr + 1); +} + +static inline ucs_status_t +uct_tcp_ep_am_sendv(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep, + int short_sendv, uct_tcp_am_hdr_t *hdr, + size_t send_limit, const void *header, + struct iovec *iov, size_t iov_cnt) +{ + ucs_status_t status; + + ep->tx.length += hdr->length + sizeof(*hdr); + + ucs_assertv((ep->tx.length <= send_limit) && + (iov_cnt > 
0), "ep=%p", ep); + + status = ucs_socket_sendv_nb(ep->fd, iov, iov_cnt, + &ep->tx.offset, NULL, NULL); + + uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, hdr->am_id, + /* the function will be invoked only in case of + * data tracing is enabled */ + uct_tcp_ep_am_sendv_get_trace_payload(hdr, header, + &iov[2], short_sendv), + hdr->length, "SEND: ep %p fd %d sent %zu/%zu bytes, " + "moved by offset %zu, iov cnt %zu " + "[addr %p len %zu] [addr %p len %zu]", + ep, ep->fd, ep->tx.offset, ep->tx.length, + ep->tx.offset, iov_cnt, + /* print user-defined header or + * first iovec with a payload */ + ((iov_cnt > 1) ? iov[1].iov_base : NULL), + ((iov_cnt > 1) ? iov[1].iov_len : 0), + /* print first/second iovec with a payload */ + ((iov_cnt > 2) ? iov[2].iov_base : NULL), + ((iov_cnt > 2) ? iov[2].iov_len : 0)); + + iface->outstanding += ep->tx.length - ep->tx.offset; + + return status; +} + +static void uct_tcp_ep_post_put_ack(uct_tcp_ep_t *ep) +{ + uct_tcp_am_hdr_t *hdr = NULL; + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + uct_tcp_ep_put_ack_hdr_t *put_ack; + ucs_status_t status; + + /* Make sure that we are sending nothing through this EP at the moment. 
+ * This check is needed to avoid mixing AM/PUT data sent from this EP + * and this PUT ACK message */ + status = uct_tcp_ep_am_prepare(iface, ep, + UCT_TCP_EP_PUT_ACK_AM_ID, &hdr); + if (status != UCS_OK) { + if (status == UCS_ERR_NO_RESOURCE) { + ep->ctx_caps |= UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_RX_SENDING_ACK); + } else { + ucs_error("tcp_ep %p: failed to prepare AM data", ep); + } + return; + } + + /* Send PUT ACK to confirm completing PUT operations with + * the last received sequence number == ep::rx::put_sn */ + ucs_assertv(hdr != NULL, "ep=%p", ep); + hdr->length = sizeof(*put_ack); + put_ack = (uct_tcp_ep_put_ack_hdr_t*)(hdr + 1); + put_ack->sn = ep->rx.put_sn; + + uct_tcp_ep_am_send(iface, ep, hdr); + + /* If sending PUT ACK was OK, always remove SENDING ACK flag + * as the function can be called from outstanding progress */ + ep->ctx_caps &= ~UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_RX_SENDING_ACK); } ucs_status_t uct_tcp_ep_am_short(uct_ep_h uct_ep, uint8_t am_id, uint64_t header, @@ -485,25 +1230,74 @@ ucs_status_t uct_tcp_ep_am_short(uct_ep_h uct_ep, uint8_t am_id, uint64_t header { uct_tcp_ep_t *ep = ucs_derived_of(uct_ep, uct_tcp_ep_t); uct_tcp_iface_t *iface = ucs_derived_of(uct_ep->iface, uct_tcp_iface_t); + uct_tcp_am_hdr_t *hdr = NULL; + struct iovec iov[UCT_TCP_EP_AM_SHORTV_IOV_COUNT]; + uint32_t payload_length; + size_t offset; ucs_status_t status; - uct_tcp_am_hdr_t *hdr; UCT_CHECK_LENGTH(length + sizeof(header), 0, - iface->config.short_size - sizeof(uct_tcp_am_hdr_t), + iface->config.tx_seg_size - sizeof(uct_tcp_am_hdr_t), "am_short"); + UCT_CHECK_AM_ID(am_id); status = uct_tcp_ep_am_prepare(iface, ep, am_id, &hdr); if (status != UCS_OK) { return status; } - *((uint64_t*)(hdr + 1)) = header; - memcpy((uint8_t*)(hdr + 1) + sizeof(header), payload, length); - hdr->length = length + sizeof(header); + ucs_assertv(hdr != NULL, "ep=%p", ep); - uct_tcp_ep_am_send(iface, ep, hdr); - UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, length + sizeof(header)); - return 
UCS_OK; + /* Save the length of the payload, because hdr (ep::buf) + * can be released inside `uct_tcp_ep_am_send` call */ + hdr->length = payload_length = length + sizeof(header); + + if (length <= iface->config.sendv_thresh) { + uct_am_short_fill_data(hdr + 1, header, payload, length); + status = uct_tcp_ep_am_send(iface, ep, hdr); + if (ucs_unlikely(status != UCS_OK)) { + uct_tcp_ep_ctx_reset(&ep->tx); + return status; + } + + UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, payload_length); + } else { + iov[0].iov_base = hdr; + iov[0].iov_len = sizeof(*hdr); + + iov[1].iov_base = &header; + iov[1].iov_len = sizeof(header); + + iov[2].iov_base = (void*)payload; + iov[2].iov_len = length; + + status = uct_tcp_ep_am_sendv(iface, ep, 1, hdr, + iface->config.tx_seg_size, &header, + iov, UCT_TCP_EP_AM_SHORTV_IOV_COUNT); + if ((status == UCS_OK) || (status == UCS_ERR_NO_PROGRESS)) { + UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, payload_length); + + if (uct_tcp_ep_ctx_buf_need_progress(&ep->tx)) { + /* Copy only user's header and payload to the TX buffer, + * TCP AM header is placed at the beginning of the buffer */ + offset = ((ep->tx.offset >= sizeof(*hdr)) ? 
+ (ep->tx.offset - sizeof(*hdr)) : 0); + + ucs_iov_copy(&iov[1], UCT_TCP_EP_AM_SHORTV_IOV_COUNT - 1, + offset, UCS_PTR_BYTE_OFFSET(hdr + 1, offset), + ep->tx.length - sizeof(*hdr) - offset, + UCS_IOV_COPY_TO_BUF); + uct_tcp_ep_mod_events(ep, UCS_EVENT_SET_EVWRITE, 0); + return UCS_OK; + } + + ucs_assert(status == UCS_OK); + } + + uct_tcp_ep_ctx_reset(&ep->tx); + } + + return status; } ssize_t uct_tcp_ep_am_bcopy(uct_ep_h uct_ep, uint8_t am_id, @@ -512,30 +1306,199 @@ ssize_t uct_tcp_ep_am_bcopy(uct_ep_h uct_ep, uint8_t am_id, { uct_tcp_ep_t *ep = ucs_derived_of(uct_ep, uct_tcp_ep_t); uct_tcp_iface_t *iface = ucs_derived_of(uct_ep->iface, uct_tcp_iface_t); + uct_tcp_am_hdr_t *hdr = NULL; + uint32_t payload_length; ucs_status_t status; - ssize_t payload_length; - uct_tcp_am_hdr_t *hdr; + + UCT_CHECK_AM_ID(am_id); status = uct_tcp_ep_am_prepare(iface, ep, am_id, &hdr); if (status != UCS_OK) { return status; } - /* Save the length of the payload to separate variable, because hdr - * (ep::buf) can be released inside the `uct_tcp_ep_am_send` call */ + ucs_assertv(hdr != NULL, "ep=%p", ep); + + /* Save the length of the payload, because hdr (ep::buf) + * can be released inside `uct_tcp_ep_am_send` call */ hdr->length = payload_length = pack_cb(hdr + 1, arg); - uct_tcp_ep_am_send(iface, ep, hdr); + status = uct_tcp_ep_am_send(iface, ep, hdr); + if (ucs_unlikely(status != UCS_OK)) { + uct_tcp_ep_ctx_reset(&ep->tx); + return status; + } + UCT_TL_EP_STAT_OP(&ep->super, AM, BCOPY, payload_length); + return payload_length; } +static inline ucs_status_t +uct_tcp_ep_prepare_zcopy(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep, uint8_t am_id, + const void *header, unsigned header_length, + const uct_iov_t *iov, size_t iovcnt, const char *name, + size_t *zcopy_payload_p, uct_tcp_ep_zcopy_tx_t **ctx_p) +{ + uct_tcp_am_hdr_t *hdr = NULL; + size_t io_vec_cnt; + ucs_iov_iter_t uct_iov_iter; + uct_tcp_ep_zcopy_tx_t *ctx; + ucs_status_t status; + + UCT_CHECK_IOV_SIZE(iovcnt, 
iface->config.zcopy.max_iov, name); + UCT_CHECK_LENGTH(header_length, 0, iface->config.zcopy.max_hdr, name); + + status = uct_tcp_ep_am_prepare(iface, ep, am_id, &hdr); + if (ucs_unlikely(status != UCS_OK)) { + return status; + } + + ucs_assertv(hdr != NULL, "ep=%p", ep); + + ctx = ucs_derived_of(hdr, uct_tcp_ep_zcopy_tx_t); + ctx->iov_cnt = 0; + + /* TCP transport header */ + ctx->iov[ctx->iov_cnt].iov_base = hdr; + ctx->iov[ctx->iov_cnt].iov_len = sizeof(*hdr); + ctx->iov_cnt++; + + /* User-defined or TCP internal protocol header */ + if (header_length != 0) { + ucs_assert(header != NULL); + ctx->iov[ctx->iov_cnt].iov_base = (void*)header; + ctx->iov[ctx->iov_cnt].iov_len = header_length; + ctx->iov_cnt++; + } + + /* User-defined payload */ + ucs_iov_iter_init(&uct_iov_iter); + io_vec_cnt = iovcnt; + *zcopy_payload_p = uct_iov_to_iovec(&ctx->iov[ctx->iov_cnt], &io_vec_cnt, + iov, iovcnt, SIZE_MAX, &uct_iov_iter); + *ctx_p = ctx; + ctx->iov_cnt += io_vec_cnt; + + return UCS_OK; +} + +ucs_status_t uct_tcp_ep_am_zcopy(uct_ep_h uct_ep, uint8_t am_id, const void *header, + unsigned header_length, const uct_iov_t *iov, + size_t iovcnt, unsigned flags, + uct_completion_t *comp) +{ + uct_tcp_ep_t *ep = ucs_derived_of(uct_ep, uct_tcp_ep_t); + uct_tcp_iface_t *iface = ucs_derived_of(uct_ep->iface, uct_tcp_iface_t); + uct_tcp_ep_zcopy_tx_t *ctx = NULL; + size_t payload_length = 0; + ucs_status_t status; + + UCT_CHECK_LENGTH(header_length + uct_iov_total_length(iov, iovcnt), 0, + iface->config.rx_seg_size - sizeof(uct_tcp_am_hdr_t), + "am_zcopy"); + UCT_CHECK_AM_ID(am_id); + + status = uct_tcp_ep_prepare_zcopy(iface, ep, am_id, header, header_length, + iov, iovcnt, "am_zcopy", &payload_length, + &ctx); + if (ucs_unlikely(status != UCS_OK)) { + return status; + } + + ctx->super.length = payload_length + header_length; + + status = uct_tcp_ep_am_sendv(iface, ep, 0, &ctx->super, + iface->config.rx_seg_size, + header, ctx->iov, ctx->iov_cnt); + if (ucs_unlikely((status != 
UCS_OK) && (status != UCS_ERR_NO_PROGRESS))) { + goto out; + } + + UCT_TL_EP_STAT_OP(&ep->super, AM, ZCOPY, ctx->super.length); + + if (uct_tcp_ep_ctx_buf_need_progress(&ep->tx)) { + uct_tcp_ep_set_outstanding_zcopy(iface, ep, ctx, header, + header_length, comp); + return UCS_INPROGRESS; + } + + ucs_assert(status == UCS_OK); + +out: + uct_tcp_ep_ctx_reset(&ep->tx); + return status; +} + +ucs_status_t uct_tcp_ep_put_zcopy(uct_ep_h uct_ep, const uct_iov_t *iov, + size_t iovcnt, uint64_t remote_addr, + uct_rkey_t rkey, uct_completion_t *comp) +{ + uct_tcp_ep_t *ep = ucs_derived_of(uct_ep, uct_tcp_ep_t); + uct_tcp_iface_t *iface = ucs_derived_of(uct_ep->iface, uct_tcp_iface_t); + uct_tcp_ep_zcopy_tx_t *ctx = NULL; + uct_tcp_ep_put_req_hdr_t put_req = {0}; /* Suppress Cppcheck false-positive */ + ucs_status_t status; + + UCT_CHECK_LENGTH(sizeof(put_req) + uct_iov_total_length(iov, iovcnt), 0, + UCT_TCP_EP_PUT_ZCOPY_MAX - sizeof(uct_tcp_am_hdr_t), + "put_zcopy"); + + status = uct_tcp_ep_prepare_zcopy(iface, ep, UCT_TCP_EP_PUT_REQ_AM_ID, + &put_req, sizeof(put_req), + iov, iovcnt, "put_zcopy", + /* Set a payload length directly to the + * TX length, since PUT Zcopy doesn't + * set the payload length to TCP AM hdr */ + &ep->tx.length, &ctx); + if (ucs_unlikely(status != UCS_OK)) { + return status; + } + + ctx->super.length = sizeof(put_req); + put_req.addr = remote_addr; + put_req.length = ep->tx.length; + put_req.sn = ep->tx.put_sn + 1; + + status = uct_tcp_ep_am_sendv(iface, ep, 0, &ctx->super, UCT_TCP_EP_PUT_ZCOPY_MAX, + &put_req, ctx->iov, ctx->iov_cnt); + if (ucs_unlikely((status != UCS_OK) && (status != UCS_ERR_NO_PROGRESS))) { + goto out; + } + + ep->tx.put_sn++; + + if (!(ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_TX_WAITING_ACK))) { + /* Add UCT_TCP_EP_CTX_TYPE_PUT_TX_WAITING_ACK flag and increment iface + * outstanding operations counter in order to ensure returning + * UCS_INPROGRESS from flush functions and do progressing. 
+ * UCT_TCP_EP_CTX_TYPE_PUT_TX_WAITING_ACK flag has to be removed upon PUT + * ACK message receiving if there are no other PUT operations in-flight */ + ep->ctx_caps |= UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_TX_WAITING_ACK); + uct_tcp_iface_outstanding_inc(iface); + } + + UCT_TL_EP_STAT_OP(&ep->super, PUT, ZCOPY, put_req.length); + + if (uct_tcp_ep_ctx_buf_need_progress(&ep->tx)) { + uct_tcp_ep_set_outstanding_zcopy(iface, ep, ctx, &put_req, + sizeof(put_req), comp); + return UCS_INPROGRESS; + } + + ucs_assert(status == UCS_OK); + +out: + uct_tcp_ep_ctx_reset(&ep->tx); + return status; +} + ucs_status_t uct_tcp_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *req, unsigned flags) { uct_tcp_ep_t *ep = ucs_derived_of(tl_ep, uct_tcp_ep_t); - if (uct_tcp_ep_can_send(ep)) { + if (uct_tcp_ep_check_tx_res(ep) == UCS_OK) { return UCS_ERR_BUSY; } @@ -545,10 +1508,10 @@ ucs_status_t uct_tcp_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *req, } void uct_tcp_ep_pending_purge(uct_ep_h tl_ep, uct_pending_purge_callback_t cb, - void *arg) + void *arg) { - uct_tcp_ep_t *ep = ucs_derived_of(tl_ep, uct_tcp_ep_t); - uct_pending_req_priv_queue_t *priv; + uct_tcp_ep_t *ep = ucs_derived_of(tl_ep, uct_tcp_ep_t); + uct_pending_req_priv_queue_t UCS_V_UNUSED *priv; uct_pending_queue_purge(priv, &ep->pending_q, 1, cb, arg); } @@ -557,11 +1520,28 @@ ucs_status_t uct_tcp_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp) { uct_tcp_ep_t *ep = ucs_derived_of(tl_ep, uct_tcp_ep_t); + uct_tcp_ep_put_completion_t *put_comp; - if (!uct_tcp_ep_can_send(ep)) { + if (uct_tcp_ep_check_tx_res(ep) == UCS_ERR_NO_RESOURCE) { + UCT_TL_EP_STAT_FLUSH_WAIT(&ep->super); return UCS_ERR_NO_RESOURCE; } + if (ep->ctx_caps & UCS_BIT(UCT_TCP_EP_CTX_TYPE_PUT_TX_WAITING_ACK)) { + if (comp != NULL) { + put_comp = ucs_calloc(1, sizeof(*put_comp), "put completion"); + if (put_comp == NULL) { + return UCS_ERR_NO_MEMORY; + } + + put_comp->wait_put_sn = ep->tx.put_sn; + put_comp->comp = comp; + 
ucs_queue_push(&ep->put_comp_q, &put_comp->elem); + } + + return UCS_INPROGRESS; + } + UCT_TL_EP_STAT_FLUSH(&ep->super); return UCS_OK; } diff --git a/src/uct/tcp/tcp_iface.c b/src/uct/tcp/tcp_iface.c index 81ad45922a8..cad4a270911 100644 --- a/src/uct/tcp/tcp_iface.c +++ b/src/uct/tcp/tcp_iface.c @@ -1,8 +1,13 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "tcp.h" #include @@ -13,31 +18,61 @@ #include #include + +extern ucs_class_t UCS_CLASS_DECL_NAME(uct_tcp_iface_t); + static ucs_config_field_t uct_tcp_iface_config_table[] = { - {"", "MAX_SHORT=8k", NULL, + {"", "MAX_NUM_EPS=256", NULL, ucs_offsetof(uct_tcp_iface_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, + {"TX_SEG_SIZE", "8kb", + "Size of send copy-out buffer", + ucs_offsetof(uct_tcp_iface_config_t, tx_seg_size), UCS_CONFIG_TYPE_MEMUNITS}, + + {"RX_SEG_SIZE", "64kb", + "Size of receive copy-out buffer", + ucs_offsetof(uct_tcp_iface_config_t, rx_seg_size), UCS_CONFIG_TYPE_MEMUNITS}, + + {"MAX_IOV", "6", + "Maximum IOV count that can contain user-defined payload in a single\n" + "call to non-blocking vector socket send", + ucs_offsetof(uct_tcp_iface_config_t, max_iov), UCS_CONFIG_TYPE_ULONG}, + + {"SENDV_THRESH", "2kb", + "Threshold for switching from send() to sendmsg() for short active messages", + ucs_offsetof(uct_tcp_iface_config_t, sendv_thresh), UCS_CONFIG_TYPE_MEMUNITS}, + {"PREFER_DEFAULT", "y", "Give higher priority to the default network interface on the host", ucs_offsetof(uct_tcp_iface_config_t, prefer_default), UCS_CONFIG_TYPE_BOOL}, + {"PUT_ENABLE", "y", + "Enable PUT Zcopy support", + ucs_offsetof(uct_tcp_iface_config_t, put_enable), UCS_CONFIG_TYPE_BOOL}, + + {"CONN_NB", "n", + "Enable non-blocking connection establishment. 
It may improve startup " + "time, but can lead to connection resets due to high load on TCP/IP stack", + ucs_offsetof(uct_tcp_iface_config_t, conn_nb), UCS_CONFIG_TYPE_BOOL}, + {"MAX_POLL", UCS_PP_MAKE_STRING(UCT_TCP_MAX_EVENTS), "Number of times to poll on a ready socket. 0 - no polling, -1 - until drained", ucs_offsetof(uct_tcp_iface_config_t, max_poll), UCS_CONFIG_TYPE_UINT}, + {UCT_TCP_CONFIG_MAX_CONN_RETRIES, "25", + "How many connection establishment attempts should be done if dropped " + "connection was detected due to lack of system resources", + ucs_offsetof(uct_tcp_iface_config_t, max_conn_retries), UCS_CONFIG_TYPE_UINT}, + {"NODELAY", "y", "Set TCP_NODELAY socket option to disable Nagle algorithm. Setting this\n" "option usually provides better performance", ucs_offsetof(uct_tcp_iface_config_t, sockopt_nodelay), UCS_CONFIG_TYPE_BOOL}, - {"SNDBUF", "64k", - "Socket send buffer size", - ucs_offsetof(uct_tcp_iface_config_t, sockopt_sndbuf), UCS_CONFIG_TYPE_MEMUNITS}, + UCT_TCP_SEND_RECV_BUF_FIELDS(ucs_offsetof(uct_tcp_iface_config_t, sockopt)), - {"RCVBUF", "auto", - "Socket receive buffer size", - ucs_offsetof(uct_tcp_iface_config_t, sockopt_rcvbuf), UCS_CONFIG_TYPE_MEMUNITS}, + UCT_TCP_SYN_CNT(ucs_offsetof(uct_tcp_iface_config_t, syn_cnt)), UCT_IFACE_MPOOL_CONFIG_FIELDS("TX_", -1, 8, "send", ucs_offsetof(uct_tcp_iface_config_t, tx_mpool), ""), @@ -48,6 +83,7 @@ static ucs_config_field_t uct_tcp_iface_config_table[] = { {NULL} }; + static UCS_CLASS_DEFINE_DELETE_FUNC(uct_tcp_iface_t, uct_iface_t); static ucs_status_t uct_tcp_iface_get_device_address(uct_iface_h tl_iface, @@ -71,42 +107,64 @@ static int uct_tcp_iface_is_reachable(const uct_iface_h tl_iface, const uct_device_addr_t *dev_addr, const uct_iface_addr_t *iface_addr) { - uct_tcp_iface_t *iface = ucs_derived_of(tl_iface, uct_tcp_iface_t); - const in_addr_t *remote_inaddr = (const in_addr_t*)dev_addr; - in_addr_t netmask = iface->config.netmask.sin_addr.s_addr; - - return (*remote_inaddr & netmask) 
== - (iface->config.ifaddr.sin_addr.s_addr & netmask); + /* We always report that a peer is reachable. connect() call will + * fail if the peer is unreachable when creating UCT/TCP EP */ + return 1; } static ucs_status_t uct_tcp_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *attr) { uct_tcp_iface_t *iface = ucs_derived_of(tl_iface, uct_tcp_iface_t); + size_t am_buf_size = iface->config.tx_seg_size - sizeof(uct_tcp_am_hdr_t); ucs_status_t status; int is_default; - memset(attr, 0, sizeof(*attr)); + uct_base_iface_query(&iface->super, attr); + + status = uct_tcp_netif_caps(iface->if_name, &attr->latency.c, + &attr->bandwidth.shared); + if (status != UCS_OK) { + return status; + } + attr->iface_addr_len = sizeof(in_port_t); attr->device_addr_len = sizeof(struct in_addr); attr->cap.flags = UCT_IFACE_FLAG_CONNECT_TO_IFACE | UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_PENDING | - UCT_IFACE_FLAG_CB_SYNC | - UCT_IFACE_FLAG_EVENT_SEND_COMP | - UCT_IFACE_FLAG_EVENT_RECV; - - attr->cap.am.max_bcopy = iface->config.buf_size - sizeof(uct_tcp_am_hdr_t); - attr->cap.am.max_short = iface->config.short_size - sizeof(uct_tcp_am_hdr_t); - - status = uct_tcp_netif_caps(iface->if_name, &attr->latency.overhead, - &attr->bandwidth); - if (status != UCS_OK) { - return status; + UCT_IFACE_FLAG_CB_SYNC; + attr->cap.event_flags = UCT_IFACE_FLAG_EVENT_SEND_COMP | + UCT_IFACE_FLAG_EVENT_RECV | + UCT_IFACE_FLAG_EVENT_FD; + + attr->cap.am.max_short = am_buf_size; + attr->cap.am.max_bcopy = am_buf_size; + + if (iface->config.zcopy.max_iov > UCT_TCP_EP_ZCOPY_SERVICE_IOV_COUNT) { + /* AM */ + attr->cap.am.max_iov = iface->config.zcopy.max_iov - + UCT_TCP_EP_ZCOPY_SERVICE_IOV_COUNT; + attr->cap.am.max_zcopy = iface->config.rx_seg_size - + sizeof(uct_tcp_am_hdr_t); + attr->cap.am.max_hdr = iface->config.zcopy.max_hdr; + attr->cap.am.opt_zcopy_align = 1; + attr->cap.flags |= UCT_IFACE_FLAG_AM_ZCOPY; + + if (iface->config.put_enable) { + /* PUT */ + attr->cap.put.max_iov = 
iface->config.zcopy.max_iov - + UCT_TCP_EP_ZCOPY_SERVICE_IOV_COUNT; + attr->cap.put.max_zcopy = UCT_TCP_EP_PUT_ZCOPY_MAX - + UCT_TCP_EP_PUT_SERVICE_LENGTH; + attr->cap.put.opt_zcopy_align = 1; + attr->cap.flags |= UCT_IFACE_FLAG_PUT_ZCOPY; + } } - attr->latency.growth = 0; - attr->overhead = 50e-6; /* 50 usec */ + attr->bandwidth.dedicated = 0; + attr->latency.m = 0; + attr->overhead = 50e-6; /* 50 usec */ if (iface->config.prefer_default) { status = uct_tcp_netif_is_default(iface->if_name, &is_default); @@ -114,9 +172,9 @@ static ucs_status_t uct_tcp_iface_query(uct_iface_h tl_iface, uct_iface_attr_t * return status; } - attr->priority = is_default ? 0 : 1; + attr->priority = is_default ? 0 : 1; } else { - attr->priority = 0; + attr->priority = 0; } return UCS_OK; @@ -126,50 +184,45 @@ static ucs_status_t uct_tcp_iface_event_fd_get(uct_iface_h tl_iface, int *fd_p) { uct_tcp_iface_t *iface = ucs_derived_of(tl_iface, uct_tcp_iface_t); - *fd_p = iface->epfd; - return UCS_OK; + return ucs_event_set_fd_get(iface->event_set, fd_p); +} + +static void uct_tcp_iface_handle_events(void *callback_data, + int events, void *arg) +{ + unsigned *count = (unsigned*)arg; + uct_tcp_ep_t *ep = (uct_tcp_ep_t*)callback_data; + + ucs_assertv(ep->conn_state != UCT_TCP_EP_CONN_STATE_CLOSED, "ep=%p", ep); + + if (events & UCS_EVENT_SET_EVREAD) { + *count += uct_tcp_ep_cm_state[ep->conn_state].rx_progress(ep); + } + if (events & UCS_EVENT_SET_EVWRITE) { + *count += uct_tcp_ep_cm_state[ep->conn_state].tx_progress(ep); + } } unsigned uct_tcp_iface_progress(uct_iface_h tl_iface) { uct_tcp_iface_t *iface = ucs_derived_of(tl_iface, uct_tcp_iface_t); - unsigned read_events = 0; + unsigned max_events = iface->config.max_poll; unsigned count = 0; - struct epoll_event events[UCT_TCP_MAX_EVENTS]; - uct_tcp_ep_t *ep; - int i, nevents, max_events; + unsigned read_events; + ucs_status_t status; do { - max_events = ucs_min(iface->config.max_poll - read_events, - UCT_TCP_MAX_EVENTS); - - nevents = 
epoll_wait(iface->epfd, events, max_events, 0); - if (ucs_unlikely((nevents < 0))) { - if (errno == EINTR) { - /* force a new loop iteration */ - nevents = max_events; - continue; - } - ucs_error("epoll_wait(epfd=%d max=%d) failed: %m", - iface->epfd, max_events); - return 0; - } - - for (i = 0; i < nevents; ++i) { - ep = events[i].data.ptr; - if (events[i].events & EPOLLIN) { - count += ep->rx.progress(ep); - } - if (events[i].events & EPOLLOUT) { - count += ep->tx.progress(ep); - } - } - - read_events += nevents; - - ucs_trace_poll("iface=%p epoll_wait()=%d, total=%u", - iface, nevents, read_events); - } while ((read_events < iface->config.max_poll) && (nevents == max_events)); + read_events = ucs_min(ucs_sys_event_set_max_wait_events, max_events); + status = ucs_event_set_wait(iface->event_set, &read_events, + 0, uct_tcp_iface_handle_events, + (void *)&count); + max_events -= read_events; + ucs_trace_poll("iface=%p ucs_event_set_wait() returned %d: " + "read events=%u, total=%u", + iface, status, read_events, + iface->config.max_poll - max_events); + } while ((max_events > 0) && (read_events == UCT_TCP_MAX_EVENTS) && + ((status == UCS_OK) || (status == UCS_INPROGRESS))); return count; } @@ -200,37 +253,34 @@ static void uct_tcp_iface_listen_close(uct_tcp_iface_t *iface) } } -static void uct_tcp_iface_connect_handler(int listen_fd, void *arg) +static void uct_tcp_iface_connect_handler(int listen_fd, int events, void *arg) { uct_tcp_iface_t *iface = arg; struct sockaddr_in peer_addr; socklen_t addrlen; ucs_status_t status; - uct_tcp_ep_t *ep; int fd; ucs_assert(listen_fd == iface->listen_fd); - addrlen = sizeof(peer_addr); - fd = accept(iface->listen_fd, (struct sockaddr*)&peer_addr, &addrlen); - if (fd < 0) { - if ((errno != EAGAIN) && (errno != EINTR)) { - ucs_error("accept() failed: %m"); - uct_tcp_iface_listen_close(iface); + for (;;) { + addrlen = sizeof(peer_addr); + status = ucs_socket_accept(iface->listen_fd, (struct sockaddr*)&peer_addr, + &addrlen, &fd); 
+ if (status != UCS_OK) { + if (status != UCS_ERR_NO_PROGRESS) { + uct_tcp_iface_listen_close(iface); + } + return; } - return; - } - - ucs_debug("tcp_iface %p: accepted connection from %s:%d to fd %d", iface, - inet_ntoa(peer_addr.sin_addr), ntohs(peer_addr.sin_port), fd); + ucs_assert(fd != -1); - status = uct_tcp_ep_create(iface, fd, NULL, &ep); - if (status != UCS_OK) { - close(fd); - return; + status = uct_tcp_cm_handle_incoming_conn(iface, &peer_addr, fd); + if (status != UCS_OK) { + close(fd); + return; + } } - - uct_tcp_ep_mod_events(ep, EPOLLIN, 0); } ucs_status_t uct_tcp_iface_set_sockopt(uct_tcp_iface_t *iface, int fd) @@ -244,35 +294,25 @@ ucs_status_t uct_tcp_iface_set_sockopt(uct_tcp_iface_t *iface, int fd) return status; } - if (iface->sockopt.sndbuf != UCS_CONFIG_MEMUNITS_AUTO) { - status = ucs_socket_setopt(fd, SOL_SOCKET, SO_SNDBUF, - (const void*)&iface->sockopt.sndbuf, - sizeof(int)); - if (status != UCS_OK) { - return status; - } - } - - if (iface->sockopt.rcvbuf != UCS_CONFIG_MEMUNITS_AUTO) { - status = ucs_socket_setopt(fd, SOL_SOCKET, SO_RCVBUF, - (const void*)&iface->sockopt.rcvbuf, - sizeof(int)); - if (status != UCS_OK) { - return status; - } + status = ucs_socket_set_buffer_size(fd, iface->sockopt.sndbuf, + iface->sockopt.rcvbuf); + if (status != UCS_OK) { + return status; } - return UCS_OK; + return ucs_tcp_base_set_syn_cnt(fd, iface->config.syn_cnt); } static uct_iface_ops_t uct_tcp_iface_ops = { .ep_am_short = uct_tcp_ep_am_short, .ep_am_bcopy = uct_tcp_ep_am_bcopy, + .ep_am_zcopy = uct_tcp_ep_am_zcopy, + .ep_put_zcopy = uct_tcp_ep_put_zcopy, .ep_pending_add = uct_tcp_ep_pending_add, .ep_pending_purge = uct_tcp_ep_pending_purge, .ep_flush = uct_tcp_ep_flush, .ep_fence = uct_base_ep_fence, - .ep_create = uct_tcp_ep_create_connected, + .ep_create = uct_tcp_ep_create, .ep_destroy = uct_tcp_ep_destroy, .iface_flush = uct_tcp_iface_flush, .iface_fence = uct_base_iface_fence, @@ -291,64 +331,48 @@ static uct_iface_ops_t uct_tcp_iface_ops = 
{ static ucs_status_t uct_tcp_iface_listener_init(uct_tcp_iface_t *iface) { struct sockaddr_in bind_addr = iface->config.ifaddr; - socklen_t addrlen = sizeof(bind_addr); + socklen_t socklen = sizeof(bind_addr); + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; ucs_status_t status; int ret; - /* Create the server socket for accepting incoming connections */ - status = ucs_socket_create(AF_INET, SOCK_STREAM, &iface->listen_fd); - if (status != UCS_OK) { - return status; - } - - /* Set the server socket to non-blocking mode */ - status = ucs_sys_fcntl_modfl(iface->listen_fd, O_NONBLOCK, 0); + bind_addr.sin_port = 0; /* use a random port */ + status = ucs_socket_server_init((struct sockaddr *)&bind_addr, + sizeof(bind_addr), ucs_socket_max_conn(), + &iface->listen_fd); if (status != UCS_OK) { - goto err_close_sock; - } - - /* Bind socket to random available port */ - bind_addr.sin_port = 0; - ret = bind(iface->listen_fd, (struct sockaddr*)&bind_addr, sizeof(bind_addr)); - if (ret < 0) { - ucs_error("bind(fd=%d) failed: %m", iface->listen_fd); - status = UCS_ERR_IO_ERROR; - goto err_close_sock; + goto err; } /* Get the port which was selected for the socket */ - ret = getsockname(iface->listen_fd, (struct sockaddr*)&bind_addr, &addrlen); + ret = getsockname(iface->listen_fd, (struct sockaddr *)&bind_addr, &socklen); if (ret < 0) { ucs_error("getsockname(fd=%d) failed: %m", iface->listen_fd); status = UCS_ERR_IO_ERROR; goto err_close_sock; } - iface->config.ifaddr.sin_port = bind_addr.sin_port; - /* Listen for connections */ - ret = listen(iface->listen_fd, SOMAXCONN); - if (ret < 0) { - ucs_error("listen(fd=%d; backlog=%d)", iface->listen_fd, SOMAXCONN); - status = UCS_ERR_IO_ERROR; - goto err_close_sock; - } - - ucs_debug("tcp_iface %p: listening for connections on %s:%d", iface, - inet_ntoa(bind_addr.sin_addr), ntohs(bind_addr.sin_port)); + iface->config.ifaddr.sin_port = bind_addr.sin_port; /* Register event handler for incoming connections */ status = 
ucs_async_set_event_handler(iface->super.worker->async->mode, - iface->listen_fd, POLLIN|POLLERR, + iface->listen_fd, + UCS_EVENT_SET_EVREAD | + UCS_EVENT_SET_EVERR, uct_tcp_iface_connect_handler, iface, iface->super.worker->async); if (status != UCS_OK) { goto err_close_sock; } + ucs_debug("tcp_iface %p: listening for connections (fd=%d) on %s", + iface, iface->listen_fd, ucs_sockaddr_str((struct sockaddr *)&bind_addr, + ip_port_str, sizeof(ip_port_str))); return UCS_OK; err_close_sock: close(iface->listen_fd); +err: return status; } @@ -374,6 +398,11 @@ static UCS_CLASS_INIT_FUNC(uct_tcp_iface_t, uct_md_h md, uct_worker_h worker, return UCS_ERR_UNSUPPORTED; } + if (ucs_derived_of(worker, uct_priv_worker_t)->thread_mode == UCS_THREAD_MODE_MULTI) { + ucs_error("TCP transport does not support multi-threaded worker"); + return UCS_ERR_INVALID_PARAM; + } + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_tcp_iface_ops, md, worker, params, tl_config UCS_STATS_ARG((params->field_mask & @@ -383,21 +412,60 @@ static UCS_CLASS_INIT_FUNC(uct_tcp_iface_t, uct_md_h md, uct_worker_h worker, ucs_strncpy_zero(self->if_name, params->mode.device.dev_name, sizeof(self->if_name)); - self->outstanding = 0; - self->config.buf_size = config->super.max_bcopy + - sizeof(uct_tcp_am_hdr_t); - self->config.short_size = config->super.max_short + - sizeof(uct_tcp_am_hdr_t); - self->config.prefer_default = config->prefer_default; - self->config.max_poll = config->max_poll; - self->sockopt.nodelay = config->sockopt_nodelay; - self->sockopt.sndbuf = config->sockopt_sndbuf; - self->sockopt.rcvbuf = config->sockopt_rcvbuf; + self->outstanding = 0; + self->config.tx_seg_size = config->tx_seg_size + + sizeof(uct_tcp_am_hdr_t); + self->config.rx_seg_size = config->rx_seg_size + + sizeof(uct_tcp_am_hdr_t); + + if (ucs_iov_get_max() >= UCT_TCP_EP_AM_SHORTV_IOV_COUNT) { + self->config.sendv_thresh = config->sendv_thresh; + } else { + /* AM Short with non-blocking vector send can't be used */ + 
self->config.sendv_thresh = UCS_MEMUNITS_INF; + } + + /* Maximum IOV count allowed by user's configuration (considering TCP + * protocol and user's AM headers that use 1st and 2nd IOVs + * correspondingly) and system constraints */ + self->config.zcopy.max_iov = ucs_min(config->max_iov + + UCT_TCP_EP_ZCOPY_SERVICE_IOV_COUNT, + ucs_iov_get_max()); + /* Use a remaining part of TX segment for AM Zcopy header */ + self->config.zcopy.hdr_offset = (sizeof(uct_tcp_ep_zcopy_tx_t) + + sizeof(struct iovec) * + self->config.zcopy.max_iov); + if ((self->config.zcopy.hdr_offset > self->config.tx_seg_size) && + (self->config.zcopy.max_iov > UCT_TCP_EP_ZCOPY_SERVICE_IOV_COUNT)) { + ucs_error("AM Zcopy context (%zu) must be <= TX segment size (%zu). " + "It can be adjusted by decreasing maximum IOV count (%zu)", + self->config.zcopy.hdr_offset, self->config.tx_seg_size, + self->config.zcopy.max_iov); + return UCS_ERR_INVALID_PARAM; + } + + self->config.zcopy.max_hdr = self->config.tx_seg_size - + self->config.zcopy.hdr_offset; + self->config.prefer_default = config->prefer_default; + self->config.put_enable = config->put_enable; + self->config.conn_nb = config->conn_nb; + self->config.max_poll = config->max_poll; + self->config.max_conn_retries = config->max_conn_retries; + self->config.syn_cnt = config->syn_cnt; + self->sockopt.nodelay = config->sockopt_nodelay; + self->sockopt.sndbuf = config->sockopt.sndbuf; + self->sockopt.rcvbuf = config->sockopt.rcvbuf; + ucs_list_head_init(&self->ep_list); + kh_init_inplace(uct_tcp_cm_eps, &self->ep_cm_map); - self->am_buf_size = ucs_max(self->config.buf_size, self->config.short_size); + if (self->config.tx_seg_size > self->config.rx_seg_size) { + ucs_error("RX segment size (%zu) must be >= TX segment size (%zu)", + self->config.rx_seg_size, self->config.tx_seg_size); + return UCS_ERR_INVALID_PARAM; + } - status = ucs_mpool_init(&self->tx_mpool, 0, self->am_buf_size, + status = ucs_mpool_init(&self->tx_mpool, 0, self->config.tx_seg_size, 0, 
UCS_SYS_CACHE_LINE_SIZE, (config->tx_mpool.bufs_grow == 0) ? 32 : config->tx_mpool.bufs_grow, @@ -407,7 +475,7 @@ static UCS_CLASS_INIT_FUNC(uct_tcp_iface_t, uct_md_h md, uct_worker_h worker, goto err; } - status = ucs_mpool_init(&self->rx_mpool, 0, self->am_buf_size * 2, + status = ucs_mpool_init(&self->rx_mpool, 0, self->config.rx_seg_size * 2, 0, UCS_SYS_CACHE_LINE_SIZE, (config->rx_mpool.bufs_grow == 0) ? 32 : config->rx_mpool.bufs_grow, @@ -417,33 +485,27 @@ static UCS_CLASS_INIT_FUNC(uct_tcp_iface_t, uct_md_h md, uct_worker_h worker, goto err_cleanup_tx_mpool; } - if (ucs_derived_of(worker, uct_priv_worker_t)->thread_mode == UCS_THREAD_MODE_MULTI) { - ucs_error("TCP transport does not support multi-threaded worker"); - return UCS_ERR_INVALID_PARAM; - } - status = uct_tcp_netif_inaddr(self->if_name, &self->config.ifaddr, &self->config.netmask); if (status != UCS_OK) { goto err_cleanup_rx_mpool; } - self->epfd = epoll_create(1); - if (self->epfd < 0) { - ucs_error("epoll_create() failed: %m"); + status = ucs_event_set_create(&self->event_set); + if (status != UCS_OK) { status = UCS_ERR_IO_ERROR; goto err_cleanup_rx_mpool; } status = uct_tcp_iface_listener_init(self); if (status != UCS_OK) { - goto err_close_epfd; + goto err_cleanup_event_set; } return UCS_OK; -err_close_epfd: - close(self->epfd); +err_cleanup_event_set: + ucs_event_set_cleanup(self->event_set); err_cleanup_rx_mpool: ucs_mpool_cleanup(&self->rx_mpool, 1); err_cleanup_tx_mpool: @@ -452,30 +514,71 @@ static UCS_CLASS_INIT_FUNC(uct_tcp_iface_t, uct_md_h md, uct_worker_h worker, return status; } -static UCS_CLASS_CLEANUP_FUNC(uct_tcp_iface_t) +static void uct_tcp_iface_ep_list_cleanup(uct_tcp_iface_t *iface, + ucs_list_link_t *ep_list) { uct_tcp_ep_t *ep, *tmp; + + ucs_list_for_each_safe(ep, tmp, ep_list, list) { + uct_tcp_cm_purge_ep(ep); + uct_tcp_ep_destroy_internal(&ep->super.super); + } +} + +static void uct_tcp_iface_eps_cleanup(uct_tcp_iface_t *iface) +{ + ucs_list_link_t *ep_list; + + 
uct_tcp_iface_ep_list_cleanup(iface, &iface->ep_list); + + kh_foreach_value(&iface->ep_cm_map, ep_list, { + uct_tcp_iface_ep_list_cleanup(iface, ep_list); + ucs_free(ep_list); + }); + + kh_destroy_inplace(uct_tcp_cm_eps, &iface->ep_cm_map); +} + +void uct_tcp_iface_add_ep(uct_tcp_ep_t *ep) +{ + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + UCS_ASYNC_BLOCK(iface->super.worker->async); + ucs_list_add_tail(&iface->ep_list, &ep->list); + UCS_ASYNC_UNBLOCK(iface->super.worker->async); +} + +void uct_tcp_iface_remove_ep(uct_tcp_ep_t *ep) +{ + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + UCS_ASYNC_BLOCK(iface->super.worker->async); + ucs_list_del(&ep->list); + UCS_ASYNC_UNBLOCK(iface->super.worker->async); +} + +static UCS_CLASS_CLEANUP_FUNC(uct_tcp_iface_t) +{ ucs_status_t status; ucs_debug("tcp_iface %p: destroying", self); - uct_base_iface_progress_disable(&self->super.super, UCT_PROGRESS_SEND| - UCT_PROGRESS_RECV); + uct_base_iface_progress_disable(&self->super.super, + UCT_PROGRESS_SEND | + UCT_PROGRESS_RECV); status = ucs_async_remove_handler(self->listen_fd, 1); if (status != UCS_OK) { ucs_warn("failed to remove handler for server socket fd=%d", self->listen_fd); } - ucs_list_for_each_safe(ep, tmp, &self->ep_list, list) { - uct_tcp_ep_destroy(&ep->super.super); - } + uct_tcp_iface_eps_cleanup(self); ucs_mpool_cleanup(&self->rx_mpool, 1); ucs_mpool_cleanup(&self->tx_mpool, 1); uct_tcp_iface_listen_close(self); - close(self->epfd); + ucs_event_set_cleanup(self->event_set); } UCS_CLASS_DEFINE(uct_tcp_iface_t, uct_base_iface_t); @@ -483,14 +586,14 @@ static UCS_CLASS_DEFINE_NEW_FUNC(uct_tcp_iface_t, uct_iface_t, uct_md_h, uct_worker_h, const uct_iface_params_t*, const uct_iface_config_t*); -static ucs_status_t uct_tcp_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) +ucs_status_t uct_tcp_query_devices(uct_md_h md, + uct_tl_device_resource_t 
**devices_p, + unsigned *num_devices_p) { - uct_tl_resource_desc_t *resources, *tmp, *resource; + uct_tl_device_resource_t *devices, *tmp; static const char *netdev_dir = "/sys/class/net"; struct dirent *entry; - unsigned num_resources; + unsigned num_devices; ucs_status_t status; DIR *dir; @@ -501,45 +604,54 @@ static ucs_status_t uct_tcp_query_tl_resources(uct_md_h md, goto out; } - resources = NULL; - num_resources = 0; + devices = NULL; + num_devices = 0; for (;;) { errno = 0; entry = readdir(dir); if (entry == NULL) { if (errno != 0) { ucs_error("readdir(%s) failed: %m", netdev_dir); - ucs_free(resources); + ucs_free(devices); status = UCS_ERR_IO_ERROR; goto out_closedir; } break; /* no more items */ } + /* According to the sysfs(5) manual page, all of entries + * has to be a symbolic link representing one of the real + * or virtual networking devices that are visible in the + * network namespace of the process that is accessing the + * directory. Let's avoid checking files that are not a + * symbolic link, e.g. "." and ".." 
entries */ + if (entry->d_type != DT_LNK) { + continue; + } + if (!ucs_netif_is_active(entry->d_name)) { continue; } - tmp = ucs_realloc(resources, sizeof(*resources) * (num_resources + 1), - "resource desc"); + tmp = ucs_realloc(devices, sizeof(*devices) * (num_devices + 1), + "tcp devices"); if (tmp == NULL) { - ucs_free(resources); + ucs_free(devices); status = UCS_ERR_NO_MEMORY; goto out_closedir; } - resources = tmp; + devices = tmp; - resource = &resources[num_resources++]; - ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), - "%s", UCT_TCP_NAME); - ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), + ucs_snprintf_zero(devices[num_devices].name, + sizeof(devices[num_devices].name), "%s", entry->d_name); - resource->dev_type = UCT_DEVICE_TYPE_NET; + devices[num_devices].type = UCT_DEVICE_TYPE_NET; + ++num_devices; } - *num_resources_p = num_resources; - *resource_p = resources; - status = UCS_OK; + *num_devices_p = num_devices; + *devices_p = devices; + status = UCS_OK; out_closedir: closedir(dir); @@ -547,8 +659,6 @@ static ucs_status_t uct_tcp_query_tl_resources(uct_md_h md, return status; } -UCT_TL_COMPONENT_DEFINE(uct_tcp_tl, uct_tcp_query_tl_resources, uct_tcp_iface_t, - UCT_TCP_NAME, "TCP_", uct_tcp_iface_config_table, - uct_tcp_iface_config_t); -UCT_MD_REGISTER_TL(&uct_tcp_md, &uct_tcp_tl); - +UCT_TL_DEFINE(&uct_tcp_component, tcp, uct_tcp_query_devices, uct_tcp_iface_t, + UCT_TCP_CONFIG_PREFIX, uct_tcp_iface_config_table, + uct_tcp_iface_config_t); diff --git a/src/uct/tcp/tcp_listener.c b/src/uct/tcp/tcp_listener.c new file mode 100644 index 00000000000..4ae59caa65a --- /dev/null +++ b/src/uct/tcp/tcp_listener.c @@ -0,0 +1,214 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "tcp_sockcm_ep.h" + +#include +#include + + +static void uct_tcp_listener_conn_req_handler(int fd, int events, void *arg) +{ + uct_tcp_listener_t *listener = (uct_tcp_listener_t *)arg; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + struct sockaddr_storage client_addr; + ucs_async_context_t *async_ctx; + uct_tcp_sockcm_ep_t *ep; + uct_ep_params_t params; + ucs_status_t status; + socklen_t addrlen; + int conn_fd; + + ucs_assert(fd == listener->listen_fd); + + addrlen = sizeof(struct sockaddr_storage); + status = ucs_socket_accept(listener->listen_fd, + (struct sockaddr*)&client_addr, + &addrlen, &conn_fd); + if (status != UCS_OK) { + return; + } + + ucs_assert(conn_fd != -1); + + ucs_trace("server accepted a connection request (fd=%d) from client %s", + conn_fd, ucs_sockaddr_str((struct sockaddr*)&client_addr, + ip_port_str, UCS_SOCKADDR_STRING_LEN)); + + /* Set the accept_fd to non-blocking mode + * (so that send/recv won't be blocking) */ + status = ucs_sys_fcntl_modfl(conn_fd, O_NONBLOCK, 0); + if (status != UCS_OK) { + goto err; + } + + /* create the server's endpoint here. 
uct_ep_create() will return this one */ + params.field_mask = UCT_EP_PARAM_FIELD_CM | + UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS; + params.cm = listener->super.cm; + params.sockaddr_cb_flags = UCT_CB_FLAG_ASYNC; + + status = UCS_CLASS_NEW(uct_tcp_sockcm_ep_t, &ep, ¶ms); + if (status != UCS_OK) { + ucs_error("failed to create a new tcp_sockcm ep"); + goto err; + } + + /* coverity[uninit_use] */ + ep->fd = conn_fd; + ep->listener = listener; + + status = uct_tcp_sockcm_ep_set_sockopt(ep); + if (status != UCS_OK) { + goto err_delete_ep; + } + + /* Adding the ep to a list on the cm for cleanup purposes */ + ucs_list_add_tail(&listener->sockcm->ep_list, &ep->list); + + async_ctx = listener->super.cm->iface.worker->async; + status = ucs_async_set_event_handler(async_ctx->mode, conn_fd, + UCS_EVENT_SET_EVREAD | + UCS_EVENT_SET_EVERR, + uct_tcp_sa_data_handler, + ep, async_ctx); + if (status != UCS_OK) { + goto err_delete_ep; + } + + return; + +err_delete_ep: + UCS_CLASS_DELETE(uct_tcp_sockcm_ep_t, ep); +err: + ucs_close_fd(&conn_fd); +} + +UCS_CLASS_INIT_FUNC(uct_tcp_listener_t, uct_cm_h cm, + const struct sockaddr *saddr, socklen_t socklen, + const uct_listener_params_t *params) +{ + ucs_async_context_t *async_ctx; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + ucs_status_t status; + int backlog; + + UCS_CLASS_CALL_SUPER_INIT(uct_listener_t, cm); + + self->sockcm = ucs_derived_of(cm, uct_tcp_sockcm_t); + self->conn_request_cb = params->conn_request_cb; + self->user_data = (params->field_mask & UCT_LISTENER_PARAM_FIELD_USER_DATA) ? + params->user_data : NULL; + backlog = (params->field_mask & UCT_LISTENER_PARAM_FIELD_BACKLOG) ? 
+ params->backlog : ucs_socket_max_conn(); + + status = ucs_socket_server_init(saddr, socklen, backlog, &self->listen_fd); + if (status != UCS_OK) { + goto err; + } + + async_ctx = self->sockcm->super.iface.worker->async; + status = ucs_async_set_event_handler(async_ctx->mode, self->listen_fd, + UCS_EVENT_SET_EVREAD | + UCS_EVENT_SET_EVERR, + uct_tcp_listener_conn_req_handler, self, + async_ctx); + if (status != UCS_OK) { + goto err_close_socket; + } + + ucs_debug("created a TCP listener %p on cm %p with fd: %d " + "listening on %s", self, cm, self->listen_fd, + ucs_sockaddr_str(saddr, ip_port_str, UCS_SOCKADDR_STRING_LEN)); + + return UCS_OK; + +err_close_socket: + ucs_close_fd(&self->listen_fd); +err: + return status; +} + +UCS_CLASS_CLEANUP_FUNC(uct_tcp_listener_t) +{ + ucs_status_t status; + + status = ucs_async_remove_handler(self->listen_fd, 1); + if (status != UCS_OK) { + ucs_warn("failed to remove event handler for fd %d: %s", + self->listen_fd, ucs_status_string(status)); + } + + ucs_close_fd(&self->listen_fd); +} + +ucs_status_t uct_tcp_listener_reject(uct_listener_h listener, + uct_conn_request_h conn_request) +{ + uct_tcp_sockcm_ep_t *cep = (uct_tcp_sockcm_ep_t *)conn_request; + uct_tcp_sockcm_t *tcp_sockcm = uct_tcp_sockcm_ep_get_cm(cep); + char peer_str[UCS_SOCKADDR_STRING_LEN]; + ucs_status_t status; + + UCS_ASYNC_BLOCK(tcp_sockcm->super.iface.worker->async); + + ucs_assert((cep->state & UCT_TCP_SOCKCM_EP_ON_SERVER) && + !(cep->state & UCT_TCP_SOCKCM_EP_SERVER_CREATED)); + + if (cep->state & UCT_TCP_SOCKCM_EP_FAILED) { + status = UCS_ERR_NOT_CONNECTED; + goto out; + } + + ucs_trace("server ep %p (fd=%d state=%d) rejecting connection request from client: %s", + cep, cep->fd, cep->state, + uct_tcp_sockcm_cm_ep_peer_addr_str(cep, peer_str, UCS_SOCKADDR_STRING_LEN)); + + status = UCS_OK; +out: + UCS_ASYNC_UNBLOCK(tcp_sockcm->super.iface.worker->async); + /* reject the connection request by closing the endpoint which will close its fd */ + 
UCS_CLASS_DELETE(uct_tcp_sockcm_ep_t, cep); + return status; +} + +ucs_status_t uct_tcp_listener_query(uct_listener_h listener, + uct_listener_attr_t *listener_attr) +{ + uct_tcp_listener_t *tcp_listener = ucs_derived_of(listener, + uct_tcp_listener_t); + struct sockaddr_storage addr; + ucs_status_t status; + socklen_t sock_len; + + if (listener_attr->field_mask & UCT_LISTENER_ATTR_FIELD_SOCKADDR) { + sock_len = sizeof(struct sockaddr_storage); + if (getsockname(tcp_listener->listen_fd, (struct sockaddr *)&addr, + &sock_len)) { + ucs_error("getsockname failed (listener=%p) %m", tcp_listener); + return UCS_ERR_IO_ERROR; + } + + status = ucs_sockaddr_copy((struct sockaddr *)&listener_attr->sockaddr, + (const struct sockaddr *)&addr); + if (status != UCS_OK) { + return status; + } + + } + + return UCS_OK; +} + +UCS_CLASS_DEFINE(uct_tcp_listener_t, uct_listener_t); +UCS_CLASS_DEFINE_NEW_FUNC(uct_tcp_listener_t, uct_listener_t, + uct_cm_h , const struct sockaddr *, socklen_t , + const uct_listener_params_t *); +UCS_CLASS_DEFINE_DELETE_FUNC(uct_tcp_listener_t, uct_listener_t); diff --git a/src/uct/tcp/tcp_listener.h b/src/uct/tcp/tcp_listener.h new file mode 100644 index 00000000000..b8884a828bb --- /dev/null +++ b/src/uct/tcp/tcp_listener.h @@ -0,0 +1,37 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#include "tcp_sockcm.h" + +/** + * An TCP listener for incoming connections requests on the server side. 
+ */ +typedef struct uct_tcp_listener { + uct_listener_t super; + + int listen_fd; + + uct_tcp_sockcm_t *sockcm; + + /** Callback to invoke upon receving a connection request from a client */ + uct_cm_listener_conn_request_callback_t conn_request_cb; + + /** User's data to be passed as argument to the conn_request_cb */ + void *user_data; +} uct_tcp_listener_t; + + +UCS_CLASS_DECLARE_NEW_FUNC(uct_tcp_listener_t, uct_listener_t, + uct_cm_h , const struct sockaddr *, socklen_t , + const uct_listener_params_t *); +UCS_CLASS_DECLARE_DELETE_FUNC(uct_tcp_listener_t, uct_listener_t); + +ucs_status_t uct_tcp_listener_query(uct_listener_h listener, + uct_listener_attr_t *listener_attr); + +ucs_status_t uct_tcp_listener_reject(uct_listener_h listener, + uct_conn_request_h conn_request); + diff --git a/src/uct/tcp/tcp_md.c b/src/uct/tcp/tcp_md.c index a84da2cda03..acc01e782d8 100644 --- a/src/uct/tcp/tcp_md.c +++ b/src/uct/tcp/tcp_md.c @@ -1,53 +1,91 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2016. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "tcp.h" +#include "tcp_sockcm.h" +#include static ucs_status_t uct_tcp_md_query(uct_md_h md, uct_md_attr_t *attr) { - attr->cap.flags = 0; - attr->cap.max_alloc = 0; - attr->cap.reg_mem_types = 0; - attr->cap.mem_type = 0; - attr->cap.max_reg = 0; - attr->rkey_packed_size = 0; - attr->reg_cost.overhead = 0; - attr->reg_cost.growth = 0; + /* Dummy memory registration provided. 
No real memory handling exists */ + attr->cap.flags = UCT_MD_FLAG_REG | + UCT_MD_FLAG_NEED_RKEY; /* TODO ignore rkey in rma/amo ops */ + attr->cap.max_alloc = 0; + attr->cap.reg_mem_types = UCS_MEMORY_TYPES_CPU_ACCESSIBLE; + attr->cap.access_mem_type = UCS_MEMORY_TYPE_HOST; + attr->cap.detect_mem_types = 0; + attr->cap.max_reg = ULONG_MAX; + attr->rkey_packed_size = 0; + attr->reg_cost = ucs_linear_func_make(0, 0); memset(&attr->local_cpus, 0xff, sizeof(attr->local_cpus)); return UCS_OK; } -static ucs_status_t uct_tcp_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) +static ucs_status_t uct_tcp_md_mem_reg(uct_md_h md, void *address, size_t length, + unsigned flags, uct_mem_h *memh_p) { - return uct_single_md_resource(&uct_tcp_md, resources_p, num_resources_p); + /* We have to emulate memory registration. Return dummy pointer */ + *memh_p = (void*)0xdeadbeef; + return UCS_OK; } -static ucs_status_t uct_tcp_md_open(const char *md_name, const uct_md_config_t *md_config, - uct_md_h *md_p) +static ucs_status_t +uct_tcp_md_open(uct_component_t *component, const char *md_name, + const uct_md_config_t *md_config, uct_md_h *md_p) { static uct_md_ops_t md_ops = { - .close = ucs_empty_function, - .query = uct_tcp_md_query, - .mkey_pack = ucs_empty_function_return_unsupported, - .mem_reg = ucs_empty_function_return_unsupported, - .mem_dereg = ucs_empty_function_return_unsupported, - .is_mem_type_owned = (void *)ucs_empty_function_return_zero, + .close = ucs_empty_function, + .query = uct_tcp_md_query, + .mkey_pack = ucs_empty_function_return_success, + .mem_reg = uct_tcp_md_mem_reg, + .mem_dereg = ucs_empty_function_return_success, + .detect_memory_type = ucs_empty_function_return_unsupported }; static uct_md_t md = { .ops = &md_ops, - .component = &uct_tcp_md + .component = &uct_tcp_component }; *md_p = &md; return UCS_OK; } -UCT_MD_COMPONENT_DEFINE(uct_tcp_md, UCT_TCP_NAME, - uct_tcp_query_md_resources, uct_tcp_md_open, NULL, - 
ucs_empty_function_return_unsupported, - ucs_empty_function_return_success, "TCP_", - uct_md_config_table, uct_md_config_t); +static ucs_status_t uct_tcp_md_rkey_unpack(uct_component_t *component, + const void *rkey_buffer, + uct_rkey_t *rkey_p, void **handle_p) +{ + /** + * Pseudo stub function for the key unpacking + * Need rkey == 0 due to work with same process to reuse uct_base_[put|get|atomic]* + */ + *rkey_p = 0; + *handle_p = NULL; + return UCS_OK; +} + +uct_component_t uct_tcp_component = { + .query_md_resources = uct_md_query_single_md_resource, + .md_open = uct_tcp_md_open, + .cm_open = UCS_CLASS_NEW_FUNC_NAME(uct_tcp_sockcm_t), + .rkey_unpack = uct_tcp_md_rkey_unpack, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = ucs_empty_function_return_success, + .name = UCT_TCP_NAME, + .md_config = UCT_MD_DEFAULT_CONFIG_INITIALIZER, + .cm_config = { + .name = "TCP-SOCKCM connection manager", + .prefix = "TCP_CM_", + .table = uct_tcp_sockcm_config_table, + .size = sizeof(uct_tcp_sockcm_config_t), + }, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_tcp_component), + .flags = UCT_COMPONENT_FLAG_CM +}; +UCT_COMPONENT_REGISTER(&uct_tcp_component) diff --git a/src/uct/tcp/tcp_net.c b/src/uct/tcp/tcp_net.c index ed563733347..8daffb25e6c 100644 --- a/src/uct/tcp/tcp_net.c +++ b/src/uct/tcp/tcp_net.c @@ -10,9 +10,6 @@ # include "config.h" #endif -#if HAVE_IB -#include /* for ipoib header size */ -#endif #include #include #include @@ -34,9 +31,10 @@ ucs_status_t uct_tcp_netif_caps(const char *if_name, double *latency_p, uint32_t speed_mbps; ucs_status_t status; struct ifreq ifr; - size_t mtu, ll_headers; + size_t ll_headers; int speed_known; short ether_type; + size_t mtu; memset(&ifr, 0, sizeof(ifr)); @@ -51,7 +49,7 @@ ucs_status_t uct_tcp_netif_caps(const char *if_name, double *latency_p, speed_mbps = edata.speed; #endif #if HAVE_DECL_SPEED_UNKNOWN - speed_known = speed_mbps != SPEED_UNKNOWN; + speed_known = speed_mbps != 
(uint32_t)SPEED_UNKNOWN; #else speed_known = (speed_mbps != 0) && ((uint16_t)speed_mbps != (uint16_t)-1); #endif @@ -85,18 +83,16 @@ ucs_status_t uct_tcp_netif_caps(const char *if_name, double *latency_p, ETH_FCS_LEN + /* CRC */ 12; /* inter-packet gap */ break; -#if HAVE_IB case ARPHRD_INFINIBAND: - ll_headers = UCT_IB_LRH_LEN + - UCT_IB_GRH_LEN + - UCT_IB_BTH_LEN + - UCT_IB_DETH_LEN + /* UD */ - 4 + 20 + /* IPoIB */ - UCT_IB_ICRC_LEN + - UCT_IB_VCRC_LEN + - UCT_IB_DELIM_LEN; + ll_headers = /* LRH */ 8 + + /* GRH */ 40 + + /* BTH */ 12 + + /* DETH */ 8 + + /* IPoIB */ 4 + 20 + + /* ICRC */ 4 + + /* VCRC */ 2 + + /* DELIM */ 2; break; -#endif default: ll_headers = 0; break; @@ -127,7 +123,7 @@ ucs_status_t uct_tcp_netif_inaddr(const char *if_name, struct sockaddr_in *ifadd } } - if ((ifra.ifr_addr.sa_family != AF_INET) ) { + if ((ifra.ifr_addr.sa_family != AF_INET) ) { ucs_error("%s address is not INET", if_name); return UCS_ERR_INVALID_ADDR; } @@ -173,39 +169,3 @@ ucs_status_t uct_tcp_netif_is_default(const char *if_name, int *result_p) fclose(f); return UCS_OK; } - -static ucs_status_t uct_tcp_do_io(int fd, void *data, size_t *length_p, - uct_tcp_io_func_t io_func, const char *name) -{ - ssize_t ret; - - ucs_assert(*length_p > 0); - ret = io_func(fd, data, *length_p, MSG_NOSIGNAL); - if (ret == 0) { - ucs_trace("fd %d is closed", fd); - return UCS_ERR_CANCELED; /* Connection closed */ - } else if (ret < 0) { - if ((errno == EINTR) || (errno == EAGAIN)) { - *length_p = 0; - return UCS_OK; - } else { - ucs_error("%s(fd=%d data=%p length=%zu) failed: %m", - name, fd, data, *length_p); - return UCS_ERR_IO_ERROR; - } - } else { - *length_p = ret; - return UCS_OK; - } -} - -ucs_status_t uct_tcp_send(int fd, const void *data, size_t *length_p) -{ - return uct_tcp_do_io(fd, (void*)data, length_p, (uct_tcp_io_func_t)send, - "send"); -} - -ucs_status_t uct_tcp_recv(int fd, void *data, size_t *length_p) -{ - return uct_tcp_do_io(fd, data, length_p, recv, "recv"); -} diff 
--git a/src/uct/tcp/tcp_sockcm.c b/src/uct/tcp/tcp_sockcm.c new file mode 100644 index 00000000000..bfa4e2e2fdb --- /dev/null +++ b/src/uct/tcp/tcp_sockcm.c @@ -0,0 +1,209 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "tcp_sockcm_ep.h" + +#include +#include + + +ucs_config_field_t uct_tcp_sockcm_config_table[] = { + {"", "", NULL, + ucs_offsetof(uct_tcp_sockcm_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_cm_config_table)}, + + {"PRIV_DATA_LEN", "2048", + "TCP CM private data length", + ucs_offsetof(uct_tcp_sockcm_config_t, priv_data_len), UCS_CONFIG_TYPE_MEMUNITS}, + + UCT_TCP_SEND_RECV_BUF_FIELDS(ucs_offsetof(uct_tcp_sockcm_config_t, sockopt)), + + UCT_TCP_SYN_CNT(ucs_offsetof(uct_tcp_sockcm_config_t, syn_cnt)), + + {NULL} +}; + +static ucs_status_t uct_tcp_sockcm_query(uct_cm_h cm, uct_cm_attr_t *cm_attr) +{ + uct_tcp_sockcm_t *tcp_sockcm = ucs_derived_of(cm, uct_tcp_sockcm_t); + + if (cm_attr->field_mask & UCT_CM_ATTR_FIELD_MAX_CONN_PRIV) { + cm_attr->max_conn_priv = tcp_sockcm->priv_data_len; + } + + return UCS_OK; +} + +static uct_cm_ops_t uct_tcp_sockcm_ops = { + .close = UCS_CLASS_DELETE_FUNC_NAME(uct_tcp_sockcm_t), + .cm_query = uct_tcp_sockcm_query, + .listener_create = UCS_CLASS_NEW_FUNC_NAME(uct_tcp_listener_t), + .listener_reject = uct_tcp_listener_reject, + .listener_query = uct_tcp_listener_query, + .listener_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_tcp_listener_t), + .ep_create = uct_tcp_sockcm_ep_create +}; + +static void uct_tcp_sockcm_close_ep(uct_tcp_sockcm_ep_t *ep) +{ + ucs_list_del(&ep->list); + UCS_CLASS_DELETE(uct_tcp_sockcm_ep_t, ep); +} + +static inline void uct_tcp_sockcm_ep_handle_event_status(uct_tcp_sockcm_ep_t *ep, + ucs_status_t status, + int events, const char *reason) +{ + ucs_assert(UCS_STATUS_IS_ERR(status)); + ucs_assert(!(ep->state & UCT_TCP_SOCKCM_EP_FAILED)); + + ucs_trace("handling error on 
%s ep %p (fd=%d state=%d events=%d) because %s: %s ", + ((ep->state & UCT_TCP_SOCKCM_EP_ON_SERVER) ? "server" : "client"), + ep, ep->fd, ep->state, events, reason, ucs_status_string(status)); + + /* if the ep is on the server side but uct_ep_create wasn't called yet, + * destroy the ep here since uct_ep_destroy won't be called either */ + if ((ep->state & UCT_TCP_SOCKCM_EP_ON_SERVER) && + !(ep->state & UCT_TCP_SOCKCM_EP_SERVER_CREATED)) { + ucs_assert(events == UCS_EVENT_SET_EVREAD); + uct_tcp_sockcm_close_ep(ep); + } else { + uct_tcp_sockcm_ep_handle_error(ep, status); + } +} + +static ucs_status_t uct_tcp_sockcm_event_err_to_ucs_err_log(int fd, + ucs_log_level_t* log_level) +{ + int error = 0; + ucs_status_t status; + + status = ucs_socket_getopt(fd, SOL_SOCKET, SO_ERROR, (void*)&error, sizeof(error)); + if ((status != UCS_OK) || (error != ECONNREFUSED)) { + *log_level = UCS_LOG_LEVEL_ERROR; + return UCS_ERR_IO_ERROR; + } + + *log_level = UCS_LOG_LEVEL_DEBUG; + return UCS_ERR_REJECTED; +} + +void uct_tcp_sa_data_handler(int fd, int events, void *arg) +{ + uct_tcp_sockcm_ep_t *ep = (uct_tcp_sockcm_ep_t*)arg; + ucs_log_level_t log_level; + ucs_status_t status; + + ucs_assertv(ep->fd == fd, "ep->fd %d fd %d, ep_state %d", ep->fd, fd, ep->state); + + ucs_trace("ep %p on %s received event (state = %d)", ep, + (ep->state & UCT_TCP_SOCKCM_EP_ON_SERVER) ? "server" : "client", + ep->state); + + if (events & UCS_EVENT_SET_EVERR) { + status = uct_tcp_sockcm_event_err_to_ucs_err_log(fd, &log_level); + ucs_log(log_level, "error event on %s ep %p (status=%s state=%d) events=%d", + (ep->state & UCT_TCP_SOCKCM_EP_ON_SERVER) ? 
"server" : "client", + ep, ucs_status_string(status), ep->state, events); + uct_tcp_sockcm_ep_handle_event_status(ep, status, events, "event set error"); + return; + } + + /* handle a READ event first in case it is a disconnect notice from the peer */ + if (events & UCS_EVENT_SET_EVREAD) { + status = uct_tcp_sockcm_ep_recv(ep); + if (status != UCS_OK) { + uct_tcp_sockcm_ep_handle_event_status(ep, status, events, "failed to receive"); + return; + } + } + + if (events & UCS_EVENT_SET_EVWRITE) { + status = uct_tcp_sockcm_ep_send(ep); + if (status != UCS_OK) { + uct_tcp_sockcm_ep_handle_event_status(ep, status, events, "failed to send"); + return; + } + } +} + +static uct_iface_ops_t uct_tcp_sockcm_iface_ops = { + .ep_pending_purge = (uct_ep_pending_purge_func_t)ucs_empty_function, + .ep_disconnect = uct_tcp_sockcm_ep_disconnect, + .cm_ep_conn_notify = uct_tcp_sockcm_cm_ep_conn_notify, + .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_tcp_sockcm_ep_t), + .ep_put_short = (uct_ep_put_short_func_t)ucs_empty_function_return_unsupported, + .ep_put_bcopy = (uct_ep_put_bcopy_func_t)ucs_empty_function_return_unsupported, + .ep_get_bcopy = (uct_ep_get_bcopy_func_t)ucs_empty_function_return_unsupported, + .ep_am_short = (uct_ep_am_short_func_t)ucs_empty_function_return_unsupported, + .ep_am_bcopy = (uct_ep_am_bcopy_func_t)ucs_empty_function_return_unsupported, + .ep_atomic_cswap64 = (uct_ep_atomic_cswap64_func_t)ucs_empty_function_return_unsupported, + .ep_atomic64_post = (uct_ep_atomic64_post_func_t)ucs_empty_function_return_unsupported, + .ep_atomic64_fetch = (uct_ep_atomic64_fetch_func_t)ucs_empty_function_return_unsupported, + .ep_atomic_cswap32 = (uct_ep_atomic_cswap32_func_t)ucs_empty_function_return_unsupported, + .ep_atomic32_post = (uct_ep_atomic32_post_func_t)ucs_empty_function_return_unsupported, + .ep_atomic32_fetch = (uct_ep_atomic32_fetch_func_t)ucs_empty_function_return_unsupported, + .ep_pending_add = (uct_ep_pending_add_func_t)ucs_empty_function_return_unsupported, 
+ .ep_flush = (uct_ep_flush_func_t)ucs_empty_function_return_success, + .ep_fence = (uct_ep_fence_func_t)ucs_empty_function_return_unsupported, + .ep_check = (uct_ep_check_func_t)ucs_empty_function_return_unsupported, + .ep_create = (uct_ep_create_func_t)ucs_empty_function_return_unsupported, + .iface_flush = (uct_iface_flush_func_t)ucs_empty_function_return_unsupported, + .iface_fence = (uct_iface_fence_func_t)ucs_empty_function_return_unsupported, + .iface_progress_enable = ucs_empty_function, + .iface_progress_disable = ucs_empty_function, + .iface_progress = (uct_iface_progress_func_t)ucs_empty_function_return_zero, + .iface_event_fd_get = (uct_iface_event_fd_get_func_t)ucs_empty_function_return_unsupported, + .iface_event_arm = (uct_iface_event_arm_func_t)ucs_empty_function_return_unsupported, + .iface_close = ucs_empty_function, + .iface_query = (uct_iface_query_func_t)ucs_empty_function_return_unsupported, + .iface_get_device_address = (uct_iface_get_device_address_func_t)ucs_empty_function_return_unsupported, + .iface_get_address = (uct_iface_get_address_func_t)ucs_empty_function_return_unsupported, + .iface_is_reachable = (uct_iface_is_reachable_func_t)ucs_empty_function_return_zero +}; + +UCS_CLASS_INIT_FUNC(uct_tcp_sockcm_t, uct_component_h component, + uct_worker_h worker, const uct_cm_config_t *config) +{ + uct_tcp_sockcm_config_t *cm_config = ucs_derived_of(config, + uct_tcp_sockcm_config_t); + + UCS_CLASS_CALL_SUPER_INIT(uct_cm_t, &uct_tcp_sockcm_ops, + &uct_tcp_sockcm_iface_ops, worker, component); + + self->priv_data_len = cm_config->priv_data_len - + sizeof(uct_tcp_sockcm_priv_data_hdr_t); + self->sockopt_sndbuf = cm_config->sockopt.sndbuf; + self->sockopt_rcvbuf = cm_config->sockopt.rcvbuf; + self->syn_cnt = cm_config->syn_cnt; + + ucs_list_head_init(&self->ep_list); + + ucs_debug("created tcp_sockcm %p", self); + + return UCS_OK; +} + +UCS_CLASS_CLEANUP_FUNC(uct_tcp_sockcm_t) +{ + uct_tcp_sockcm_ep_t *ep, *tmp; + + 
UCS_ASYNC_BLOCK(self->super.iface.worker->async); + + ucs_list_for_each_safe(ep, tmp, &self->ep_list, list) { + uct_tcp_sockcm_close_ep(ep); + } + + UCS_ASYNC_UNBLOCK(self->super.iface.worker->async); +} + +UCS_CLASS_DEFINE(uct_tcp_sockcm_t, uct_cm_t); +UCS_CLASS_DEFINE_NEW_FUNC(uct_tcp_sockcm_t, uct_cm_t, uct_component_h, + uct_worker_h, const uct_cm_config_t *); +UCS_CLASS_DEFINE_DELETE_FUNC(uct_tcp_sockcm_t, uct_cm_t); diff --git a/src/uct/tcp/tcp_sockcm.h b/src/uct/tcp/tcp_sockcm.h new file mode 100644 index 00000000000..29a3b788a11 --- /dev/null +++ b/src/uct/tcp/tcp_sockcm.h @@ -0,0 +1,48 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#include "tcp_base.h" +#include + + +typedef struct uct_tcp_sockcm_ep uct_tcp_sockcm_ep_t; + + +/** + * A TCP connection manager + */ +typedef struct uct_tcp_sockcm { + uct_cm_t super; + size_t priv_data_len; + size_t sockopt_sndbuf; /** SO_SNDBUF */ + size_t sockopt_rcvbuf; /** SO_RCVBUF */ + unsigned syn_cnt; /** TCP_SYNCNT */ + ucs_list_link_t ep_list; /** List of endpoints */ +} uct_tcp_sockcm_t; + +/** + * TCP SOCKCM configuration. 
+ */ +typedef struct uct_tcp_sockcm_config { + uct_cm_config_t super; + size_t priv_data_len; + uct_tcp_send_recv_buf_config_t sockopt; + unsigned syn_cnt; +} uct_tcp_sockcm_config_t; + + +typedef struct uct_tcp_sockcm_priv_data_hdr { + size_t length; /** Length of the private data */ + uint8_t status; +} uct_tcp_sockcm_priv_data_hdr_t; + +extern ucs_config_field_t uct_tcp_sockcm_config_table[]; + +UCS_CLASS_DECLARE_NEW_FUNC(uct_tcp_sockcm_t, uct_cm_t, uct_component_h, + uct_worker_h, const uct_cm_config_t *); +UCS_CLASS_DECLARE_DELETE_FUNC(uct_tcp_sockcm_t, uct_cm_t); + +void uct_tcp_sa_data_handler(int fd, int events, void *arg); diff --git a/src/uct/tcp/tcp_sockcm_ep.c b/src/uct/tcp/tcp_sockcm_ep.c new file mode 100644 index 00000000000..a84a607f7b7 --- /dev/null +++ b/src/uct/tcp/tcp_sockcm_ep.c @@ -0,0 +1,870 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "tcp_sockcm_ep.h" +#include +#include +#include +#include + + +const char *uct_tcp_sockcm_cm_ep_peer_addr_str(uct_tcp_sockcm_ep_t *cep, + char *buf, size_t max) +{ + struct sockaddr_storage remote_dev_addr = {0}; /* Suppress Clang false-positive */ + socklen_t remote_dev_addr_len; + ucs_status_t status; + + /* get the device address of the remote peer associated with the connected fd */ + status = ucs_socket_getpeername(cep->fd, &remote_dev_addr, &remote_dev_addr_len); + if (status != UCS_OK) { + ucs_snprintf_safe(buf, max, "<%s>", ucs_status_string(status)); + return buf; + } + + return ucs_sockaddr_str((const struct sockaddr*)&remote_dev_addr, buf, max); +} + +void uct_tcp_sockcm_ep_close_fd(int *fd) +{ + ucs_async_remove_handler(*fd, 1); + ucs_close_fd(fd); +} + +static int uct_tcp_sockcm_ep_is_connected(uct_tcp_sockcm_ep_t *cep) +{ + return cep->state & (UCT_TCP_SOCKCM_EP_CLIENT_CONNECTED_CB_INVOKED | + UCT_TCP_SOCKCM_EP_SERVER_NOTIFY_CB_INVOKED); +} + +static void 
uct_tcp_sockcm_ep_client_connect_cb(uct_tcp_sockcm_ep_t *cep, + uct_cm_remote_data_t *remote_data, + ucs_status_t status) +{ + cep->state |= UCT_TCP_SOCKCM_EP_CLIENT_CONNECTED_CB_INVOKED; + uct_cm_ep_client_connect_cb(&cep->super, remote_data, status); +} + +static void uct_tcp_sockcm_ep_server_notify_cb(uct_tcp_sockcm_ep_t *cep, + ucs_status_t status) +{ + cep->state |= UCT_TCP_SOCKCM_EP_SERVER_NOTIFY_CB_INVOKED; + uct_cm_ep_server_conn_notify_cb(&cep->super, status); +} + +ucs_status_t uct_tcp_sockcm_ep_disconnect(uct_ep_h ep, unsigned flags) +{ + uct_tcp_sockcm_ep_t *cep = ucs_derived_of(ep, uct_tcp_sockcm_ep_t); + uct_tcp_sockcm_t *tcp_sockcm = uct_tcp_sockcm_ep_get_cm(cep); + char peer_str[UCS_SOCKADDR_STRING_LEN]; + ucs_status_t status; + int ret; + + UCS_ASYNC_BLOCK(tcp_sockcm->super.iface.worker->async); + + ucs_debug("ep %p (fd=%d state=%d) disconnecting from peer :%s", cep, cep->fd, + cep->state, uct_tcp_sockcm_cm_ep_peer_addr_str(cep, peer_str, + UCS_SOCKADDR_STRING_LEN)); + + if ((cep->state & UCT_TCP_SOCKCM_EP_FAILED) && + !(cep->state & UCT_TCP_SOCKCM_EP_GOT_DISCONNECT)) { + status = UCS_ERR_NOT_CONNECTED; + goto out; + } + + if (ucs_unlikely(cep->state & UCT_TCP_SOCKCM_EP_DISCONNECTING)) { + if (cep->state & UCT_TCP_SOCKCM_EP_GOT_DISCONNECT) { + ucs_error("duplicate call of uct_ep_disconnect on a disconnected ep " + "(fd=%d state=%d peer=%s)", cep->fd, cep->state, + uct_tcp_sockcm_cm_ep_peer_addr_str(cep, peer_str, + UCS_SOCKADDR_STRING_LEN)); + status = UCS_ERR_NOT_CONNECTED; + goto out; + } + + ucs_debug("duplicate call of uct_ep_disconnect on an ep " + "that was not disconnected yet (fd=%d state=%d). 
peer %s", + cep->fd, cep->state, + uct_tcp_sockcm_cm_ep_peer_addr_str(cep, peer_str, + UCS_SOCKADDR_STRING_LEN)); + status = UCS_INPROGRESS; + goto out; + } + + if (!uct_tcp_sockcm_ep_is_connected(cep)) { + ucs_debug("calling uct_ep_disconnect on an ep that is not " + "connected yet (fd=%d state=%d to peer %s)", cep->fd, + cep->state, uct_tcp_sockcm_cm_ep_peer_addr_str(cep, peer_str, + UCS_SOCKADDR_STRING_LEN)); + status = UCS_ERR_BUSY; + goto out; + } + + cep->state |= UCT_TCP_SOCKCM_EP_DISCONNECTING; + + /* disables further send operations but keep receive operations to get a + * message from the peer when it disconnects in order to invoke the disconnect_cb */ + ucs_assert(cep->fd != -1); + ret = shutdown(cep->fd, SHUT_WR); + if (ret == -1) { + ucs_error("ep %p: failed to shutdown on fd %d. %m", cep, cep->fd); + status = UCS_ERR_IO_ERROR; + goto out; + } + + status = UCS_OK; + +out: + UCS_ASYNC_UNBLOCK(tcp_sockcm->super.iface.worker->async); + return status; +} + +static void uct_tcp_sockcm_ep_reset_comm_ctx(uct_tcp_sockcm_ep_t *cep) +{ + cep->comm_ctx.offset = 0; + cep->comm_ctx.length = 0; +} + +static void uct_tcp_sockcm_ep_invoke_error_cb(uct_tcp_sockcm_ep_t *cep, + ucs_status_t status) +{ + uct_cm_remote_data_t remote_data; + + ucs_assert(status != UCS_OK); + + /* no errors should happen after the ep was set to failed, since its ep's fd + * was removed from the async handlers */ + ucs_assert(!(cep->state & UCT_TCP_SOCKCM_EP_FAILED)); + + if (uct_tcp_sockcm_ep_is_connected(cep)) { + /* ep is already connected, call disconnect callback */ + uct_cm_ep_disconnect_cb(&cep->super); + } else if (cep->state & UCT_TCP_SOCKCM_EP_ON_CLIENT) { + remote_data.field_mask = 0; + uct_tcp_sockcm_ep_client_connect_cb(cep, &remote_data, status); + } else { + ucs_assert(cep->state & UCT_TCP_SOCKCM_EP_ON_SERVER); + /* the server might not have a valid ep yet. 
in this case the notify_cb + * is an empty function */ + uct_tcp_sockcm_ep_server_notify_cb(cep, status); + } +} + +void uct_tcp_sockcm_ep_handle_error(uct_tcp_sockcm_ep_t *cep, ucs_status_t status) +{ + ucs_status_t async_status; + + ucs_assert(UCS_STATUS_IS_ERR(status)); + + ucs_trace("removing ep %p (fd=%d state=%d) async events handler. %s ", + cep, cep->fd, cep->state, ucs_status_string(status)); + + async_status = ucs_async_remove_handler(cep->fd, 1); + if (async_status != UCS_OK) { + ucs_warn("failed to remove fd %d from the async handlers: %s", + cep->fd, ucs_status_string(async_status)); + } + + uct_tcp_sockcm_ep_invoke_error_cb(cep, status); + cep->state |= UCT_TCP_SOCKCM_EP_FAILED; +} + +static ucs_status_t uct_tcp_sockcm_ep_handle_remote_disconnect(uct_tcp_sockcm_ep_t *cep, + ucs_status_t status) +{ + char peer_str[UCS_SOCKADDR_STRING_LEN]; + ucs_status_t cb_status; + + /* remote peer disconnected */ + ucs_debug("ep %p (fd=%d): remote peer (%s) disconnected/rejected (%s)", + cep, cep->fd, + uct_tcp_sockcm_cm_ep_peer_addr_str(cep, peer_str, UCS_SOCKADDR_STRING_LEN), + ucs_status_string(status)); + + /* if the server started sending any data that the client received, then + * it means that the server accepted the client's connection request and + * created an ep to it. therefore, the server did not reject the request + * and if we got here then the status should be UCS_ERR_CONNECTION_RESET. 
+ * otherwise, the status is UCT_TCP_SOCKCM_EP_CLIENT_GOT_REJECTED */ + if (ucs_test_all_flags(cep->state, UCT_TCP_SOCKCM_EP_ON_CLIENT | + UCT_TCP_SOCKCM_EP_DATA_SENT) && + !(cep->state & (UCT_TCP_SOCKCM_EP_HDR_RECEIVED | + UCT_TCP_SOCKCM_EP_DATA_RECEIVED))) { + cb_status = UCS_ERR_REJECTED; + cep->state |= UCT_TCP_SOCKCM_EP_CLIENT_GOT_REJECTED; + } else { + cb_status = UCS_ERR_CONNECTION_RESET; + } + + cep->state |= UCT_TCP_SOCKCM_EP_GOT_DISCONNECT; + + uct_tcp_sockcm_ep_reset_comm_ctx(cep); + return cb_status; +} + +static int uct_tcp_sockcm_ep_is_tx_rx_done(uct_tcp_sockcm_ep_t *cep) +{ + ucs_assert((cep->comm_ctx.length != 0)); + return (cep->comm_ctx.offset == cep->comm_ctx.length); +} + +/** + * This function should be called with the lock held. + */ +static ucs_status_t uct_tcp_sockcm_ep_progress_send(uct_tcp_sockcm_ep_t *cep) +{ + ucs_status_t status; + size_t sent_length; + int events; + + ucs_assert((ucs_test_all_flags(cep->state, UCT_TCP_SOCKCM_EP_ON_CLIENT | + UCT_TCP_SOCKCM_EP_PRIV_DATA_PACKED)) || + (ucs_test_all_flags(cep->state, UCT_TCP_SOCKCM_EP_ON_SERVER | + UCT_TCP_SOCKCM_EP_SERVER_CREATED | + UCT_TCP_SOCKCM_EP_DATA_RECEIVED))); + + ucs_assertv(cep->comm_ctx.offset < cep->comm_ctx.length, "ep state %d offset %zu length %zu", + cep->state, cep->comm_ctx.offset, cep->comm_ctx.length); + + sent_length = cep->comm_ctx.length - cep->comm_ctx.offset; + + status = ucs_socket_send_nb(cep->fd, + UCS_PTR_BYTE_OFFSET(cep->comm_ctx.buf, + cep->comm_ctx.offset), + &sent_length, NULL, NULL); + if ((status != UCS_OK) && (status != UCS_ERR_NO_PROGRESS)) { + if (status != UCS_ERR_CONNECTION_RESET) { /* UCS_ERR_NOT_CONNECTED cannot return from send() */ + ucs_error("ep %p failed to send %s's data (len=%zu offset=%zu status=%s)", + cep, (cep->state & UCT_TCP_SOCKCM_EP_ON_SERVER) ? "server" : "client", + cep->comm_ctx.length, cep->comm_ctx.offset, ucs_status_string(status)); + } + + /* treat all send errors as if they are disconnect from the remote peer - + * i.e. 
stop sending and receiving on this endpoint and invoke the upper + * layer callback */ + status = uct_tcp_sockcm_ep_handle_remote_disconnect(cep, status); + goto out; + } + + cep->comm_ctx.offset += sent_length; + ucs_assert(cep->comm_ctx.offset <= cep->comm_ctx.length); + + if (uct_tcp_sockcm_ep_is_tx_rx_done(cep)) { + ucs_assert(status == UCS_OK); + cep->state |= UCT_TCP_SOCKCM_EP_DATA_SENT; + + /* on the client side - if completed sending a message after the notify + * call was invoked, then this message is the notify message */ + if (cep->state & UCT_TCP_SOCKCM_EP_CLIENT_NOTIFY_CALLED) { + ucs_assert(cep->state & UCT_TCP_SOCKCM_EP_ON_CLIENT); + cep->state |= UCT_TCP_SOCKCM_EP_CLIENT_NOTIFY_SENT; + } + + uct_tcp_sockcm_ep_reset_comm_ctx(cep); + + /* wait for a message from the peer */ + events = UCS_EVENT_SET_EVREAD; + } else { + /* continue the sending when possible, and handle potential disconnect */ + events = UCS_EVENT_SET_EVREAD | UCS_EVENT_SET_EVWRITE; + } + + status = ucs_async_modify_handler(cep->fd, events); + if (status != UCS_OK) { + ucs_error("failed to modify %d event handler to %d: %s", + cep->fd, events, ucs_status_string(status)); + } + +out: + return status; +} + +ucs_status_t uct_tcp_sockcm_cm_ep_conn_notify(uct_ep_h ep) +{ + uct_tcp_sockcm_ep_t *cep = ucs_derived_of(ep, uct_tcp_sockcm_ep_t); + uct_tcp_sockcm_t *tcp_sockcm = uct_tcp_sockcm_ep_get_cm(cep); + uct_tcp_sockcm_priv_data_hdr_t *hdr; + char peer_str[UCS_SOCKADDR_STRING_LEN]; + ucs_status_t status; + + UCS_ASYNC_BLOCK(tcp_sockcm->super.iface.worker->async); + + if (cep->state & (UCT_TCP_SOCKCM_EP_DISCONNECTING | + UCT_TCP_SOCKCM_EP_FAILED)) { + status = UCS_ERR_NOT_CONNECTED; + goto out; + } + + ucs_assert(ucs_test_all_flags(cep->state, UCT_TCP_SOCKCM_EP_ON_CLIENT | + UCT_TCP_SOCKCM_EP_DATA_SENT | + UCT_TCP_SOCKCM_EP_DATA_RECEIVED | + UCT_TCP_SOCKCM_EP_CLIENT_CONNECTED_CB_INVOKED)); + ucs_assert(!(cep->state & UCT_TCP_SOCKCM_EP_CLIENT_NOTIFY_CALLED)); + + hdr = 
(uct_tcp_sockcm_priv_data_hdr_t*)cep->comm_ctx.buf; + + hdr->length = 0; /* sending only the header in the notify message */ + hdr->status = UCS_OK; + cep->comm_ctx.length = sizeof(*hdr); + + ucs_trace("ep %p sending conn notification to server: %s", cep, + uct_tcp_sockcm_cm_ep_peer_addr_str(cep, peer_str, UCS_SOCKADDR_STRING_LEN)); + + cep->state |= UCT_TCP_SOCKCM_EP_CLIENT_NOTIFY_CALLED; + status = uct_tcp_sockcm_ep_progress_send(cep); + +out: + UCS_ASYNC_UNBLOCK(tcp_sockcm->super.iface.worker->async); + return status; +} + +static ucs_status_t uct_tcp_sockcm_ep_pack_priv_data(uct_tcp_sockcm_ep_t *cep) +{ + char ifname_str[UCT_DEVICE_NAME_MAX]; + uct_tcp_sockcm_priv_data_hdr_t *hdr; + size_t priv_data_ret; + ucs_status_t status; + uct_cm_ep_priv_data_pack_args_t pack_args; + + /* get interface name associated with the connected client fd */ + status = ucs_sockaddr_get_ifname(cep->fd, ifname_str, sizeof(ifname_str)); + if (UCS_OK != status) { + goto out; + } + + hdr = (uct_tcp_sockcm_priv_data_hdr_t*)cep->comm_ctx.buf; + pack_args.field_mask = UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME; + ucs_strncpy_safe(pack_args.dev_name, ifname_str, UCT_DEVICE_NAME_MAX); + + status = uct_cm_ep_pack_cb(&cep->super, cep->super.user_data, &pack_args, + hdr + 1, + uct_tcp_sockcm_ep_get_cm(cep)->priv_data_len, + &priv_data_ret); + if (status != UCS_OK) { + goto out; + } + + hdr->length = priv_data_ret; + hdr->status = UCS_OK; + cep->comm_ctx.length = sizeof(*hdr) + hdr->length; + cep->state |= UCT_TCP_SOCKCM_EP_PRIV_DATA_PACKED; + +out: + return status; +} + +static int uct_tcp_sockcm_ep_send_skip_event(uct_tcp_sockcm_ep_t *cep) +{ + /* if the ep got a disconnect notice from the peer or had an internal local + * error, it should have removed its fd from the async handlers. 
+ * therefore, no send events should get here afterwards */ + ucs_assert(!(cep->state & (UCT_TCP_SOCKCM_EP_GOT_DISCONNECT | + UCT_TCP_SOCKCM_EP_FAILED))); + + if (cep->state & UCT_TCP_SOCKCM_EP_DISCONNECTING) { + return 1; + } else if (cep->state & UCT_TCP_SOCKCM_EP_ON_SERVER) { + return cep->state & UCT_TCP_SOCKCM_EP_DATA_SENT; + } else { + ucs_assert(cep->state & UCT_TCP_SOCKCM_EP_ON_CLIENT); + return (cep->state & UCT_TCP_SOCKCM_EP_CLIENT_NOTIFY_SENT) || + ((cep->state & UCT_TCP_SOCKCM_EP_DATA_SENT) && + !(cep->state & UCT_TCP_SOCKCM_EP_CLIENT_NOTIFY_CALLED)); + } +} + +ucs_status_t uct_tcp_sockcm_ep_send(uct_tcp_sockcm_ep_t *cep) +{ + ucs_status_t status; + + if (uct_tcp_sockcm_ep_send_skip_event(cep)) { + return UCS_OK; + } + + if (!(cep->state & UCT_TCP_SOCKCM_EP_PRIV_DATA_PACKED)) { + status = uct_tcp_sockcm_ep_pack_priv_data(cep); + if (status != UCS_OK) { + return status; + } + } + + return uct_tcp_sockcm_ep_progress_send(cep); +} + +static ucs_status_t uct_tcp_sockcm_ep_server_invoke_conn_req_cb(uct_tcp_sockcm_ep_t *cep) +{ + uct_tcp_sockcm_priv_data_hdr_t *hdr = (uct_tcp_sockcm_priv_data_hdr_t *) + cep->comm_ctx.buf; + struct sockaddr_storage remote_dev_addr = {0}; /* Suppress Clang false-positive */ + uct_cm_listener_conn_request_args_t conn_req_args; + char peer_str[UCS_SOCKADDR_STRING_LEN]; + char ifname_str[UCT_DEVICE_NAME_MAX]; + uct_cm_remote_data_t remote_data; + socklen_t remote_dev_addr_len; + ucs_sock_addr_t client_saddr; + ucs_status_t status; + + /* get the local interface name associated with the connected fd */ + status = ucs_sockaddr_get_ifname(cep->fd, ifname_str, UCT_DEVICE_NAME_MAX); + if (UCS_OK != status) { + return status; + } + + /* get the device address of the remote peer associated with the connected fd */ + status = ucs_socket_getpeername(cep->fd, &remote_dev_addr, &remote_dev_addr_len); + if (status != UCS_OK) { + return status; + } + + remote_data.field_mask = UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR | + 
UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR_LENGTH | + UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA | + UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA_LENGTH; + remote_data.dev_addr = (uct_device_addr_t *)&remote_dev_addr; + remote_data.dev_addr_length = remote_dev_addr_len; + remote_data.conn_priv_data = hdr + 1; + remote_data.conn_priv_data_length = hdr->length; + + client_saddr.addr = (struct sockaddr*)&remote_dev_addr; + client_saddr.addrlen = remote_dev_addr_len; + + conn_req_args.field_mask = UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_DEV_NAME | + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CONN_REQUEST | + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_REMOTE_DATA | + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CLIENT_ADDR; + conn_req_args.conn_request = cep; + conn_req_args.remote_data = &remote_data; + conn_req_args.client_address = client_saddr; + ucs_strncpy_safe(conn_req_args.dev_name, ifname_str, UCT_DEVICE_NAME_MAX); + + ucs_debug("fd %d: remote_data: (field_mask=%zu) dev_addr: %s (length=%zu), " + "conn_priv_data_length=%zu", cep->fd, remote_data.field_mask, + ucs_sockaddr_str((const struct sockaddr*)remote_data.dev_addr, + peer_str, UCS_SOCKADDR_STRING_LEN), + remote_data.dev_addr_length, remote_data.conn_priv_data_length); + + /* the endpoint, passed as the conn_request to the callback, will be passed + * to uct_ep_create() which will be invoked by the user and therefore moving + * over to its responsibility. 
*/ + ucs_list_del(&cep->list); + cep->listener->conn_request_cb(&cep->listener->super, cep->listener->user_data, + &conn_req_args); + + return UCS_OK; +} + +static ucs_status_t uct_tcp_sockcm_ep_client_invoke_connect_cb(uct_tcp_sockcm_ep_t *cep) +{ + uct_tcp_sockcm_priv_data_hdr_t *hdr = (uct_tcp_sockcm_priv_data_hdr_t *) + cep->comm_ctx.buf; + struct sockaddr_storage remote_dev_addr = {0}; /* Suppress Clang false-positive */ + socklen_t remote_dev_addr_len; + uct_cm_remote_data_t remote_data; + ucs_status_t status; + + ucs_assert(!(cep->state & UCT_TCP_SOCKCM_EP_GOT_DISCONNECT)); + + /* get the device address of the remote peer associated with the connected fd */ + status = ucs_socket_getpeername(cep->fd, &remote_dev_addr, &remote_dev_addr_len); + if (status != UCS_OK) { + return status; + } + + remote_data.field_mask = UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR | + UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR_LENGTH | + UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA | + UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA_LENGTH; + remote_data.dev_addr = (uct_device_addr_t *)&remote_dev_addr; + remote_data.dev_addr_length = remote_dev_addr_len; + remote_data.conn_priv_data = hdr + 1; + remote_data.conn_priv_data_length = hdr->length; + + uct_tcp_sockcm_ep_client_connect_cb(cep, &remote_data, (ucs_status_t)hdr->status); + + return status; +} + +ucs_status_t uct_tcp_sockcm_ep_server_handle_data_received(uct_tcp_sockcm_ep_t *cep) +{ + uct_tcp_sockcm_priv_data_hdr_t *hdr = (uct_tcp_sockcm_priv_data_hdr_t *) + cep->comm_ctx.buf; + ucs_status_t status; + + if (cep->state & UCT_TCP_SOCKCM_EP_DATA_SENT) { + ucs_assert(ucs_test_all_flags(cep->state, UCT_TCP_SOCKCM_EP_SERVER_CREATED | + UCT_TCP_SOCKCM_EP_DATA_RECEIVED)); + + ucs_assert(hdr->length == 0); + ucs_assert(!(cep->state & UCT_TCP_SOCKCM_EP_GOT_DISCONNECT)); + + uct_tcp_sockcm_ep_server_notify_cb(cep, (ucs_status_t)hdr->status); + + /* server to wait for any notification (disconnect) from the client */ + status = ucs_async_modify_handler(cep->fd, 
UCS_EVENT_SET_EVREAD); + } else if ((cep->state & UCT_TCP_SOCKCM_EP_DATA_RECEIVED) && + !(cep->state & UCT_TCP_SOCKCM_EP_SERVER_CREATED)) { + status = uct_tcp_sockcm_ep_server_invoke_conn_req_cb(cep); + } else { + ucs_error("unexpected state on the server endpoint: %d", cep->state); + status = UCS_ERR_IO_ERROR; + } + + return status; +} + +ucs_status_t uct_tcp_sockcm_ep_handle_data_received(uct_tcp_sockcm_ep_t *cep) +{ + ucs_status_t status; + + cep->state |= UCT_TCP_SOCKCM_EP_DATA_RECEIVED; + /* if the data was received, drop the header_received flag to receive new messages */ + cep->state &= ~UCT_TCP_SOCKCM_EP_HDR_RECEIVED; + uct_tcp_sockcm_ep_reset_comm_ctx(cep); + + if (cep->state & UCT_TCP_SOCKCM_EP_ON_SERVER) { + status = uct_tcp_sockcm_ep_server_handle_data_received(cep); + } else { + ucs_assert(cep->state & UCT_TCP_SOCKCM_EP_ON_CLIENT); + status = uct_tcp_sockcm_ep_client_invoke_connect_cb(cep); + + /* next, unless disconnected, if the client did not send a connection + * establishment notification to the server from the connect_cb, + * he will send it from the main thread */ + } + + return status; +} + +static ucs_status_t uct_tcp_sockcm_ep_recv_nb(uct_tcp_sockcm_ep_t *cep) +{ + size_t recv_length; + ucs_status_t status; + + recv_length = uct_tcp_sockcm_ep_get_cm(cep)->priv_data_len + + sizeof(uct_tcp_sockcm_priv_data_hdr_t) - cep->comm_ctx.offset; + status = ucs_socket_recv_nb(cep->fd, UCS_PTR_BYTE_OFFSET(cep->comm_ctx.buf, + cep->comm_ctx.offset), + &recv_length, NULL, NULL); + if ((status != UCS_OK) && (status != UCS_ERR_NO_PROGRESS)) { + if (status != UCS_ERR_NOT_CONNECTED) { /* ECONNRESET cannot return from recv() */ + ucs_error("ep %p (fd=%d) failed to recv client's data " + "(offset=%zu status=%s)", cep, cep->fd, cep->comm_ctx.offset, + ucs_status_string(status)); + } + + /* treat all recv errors as if they are disconnect/reject from the remote peer - + * i.e. 
stop sending and receiving on this endpoint */ + status = uct_tcp_sockcm_ep_handle_remote_disconnect(cep, status); + goto out; + } + + cep->comm_ctx.offset += recv_length; + ucs_assertv((cep->comm_ctx.length ? + cep->comm_ctx.offset <= cep->comm_ctx.length : 1), "%zu > %zu", + cep->comm_ctx.offset, cep->comm_ctx.length); + +out: + return status; +} + +ucs_status_t uct_tcp_sockcm_ep_recv(uct_tcp_sockcm_ep_t *cep) +{ + uct_tcp_sockcm_priv_data_hdr_t *hdr; + ucs_status_t status; + + /* if the ep got a disconnect notice from the peer, had an internal local + * error or the client received a reject from the server, it should have + * removed its fd from the async handlers. + * therefore, no recv events should get here afterwards */ + ucs_assert(!(cep->state & (UCT_TCP_SOCKCM_EP_GOT_DISCONNECT | + UCT_TCP_SOCKCM_EP_CLIENT_GOT_REJECTED | + UCT_TCP_SOCKCM_EP_FAILED))); + + status = uct_tcp_sockcm_ep_recv_nb(cep); + if (status != UCS_OK) { + goto out; + } + + if (!(cep->state & UCT_TCP_SOCKCM_EP_HDR_RECEIVED)) { + if (cep->comm_ctx.offset < sizeof(*hdr)) { + goto out; + } + + hdr = (uct_tcp_sockcm_priv_data_hdr_t *)cep->comm_ctx.buf; + cep->comm_ctx.length = sizeof(*hdr) + hdr->length; + ucs_assertv(cep->comm_ctx.offset <= cep->comm_ctx.length , "%zu > %zu", + cep->comm_ctx.offset, cep->comm_ctx.length); + + cep->state |= UCT_TCP_SOCKCM_EP_HDR_RECEIVED; + } + + if (uct_tcp_sockcm_ep_is_tx_rx_done(cep)) { + status = uct_tcp_sockcm_ep_handle_data_received(cep); + } + +out: + return (status == UCS_ERR_NO_PROGRESS) ?
UCS_OK : status; +} + +ucs_status_t uct_tcp_sockcm_ep_set_sockopt(uct_tcp_sockcm_ep_t *ep) +{ + uct_tcp_sockcm_t *tcp_sockcm = uct_tcp_sockcm_ep_get_cm(ep); + ucs_status_t status; + + status = ucs_socket_set_buffer_size(ep->fd, tcp_sockcm->sockopt_sndbuf, + tcp_sockcm->sockopt_rcvbuf); + if (status != UCS_OK) { + return status; + } + + return ucs_tcp_base_set_syn_cnt(ep->fd, tcp_sockcm->syn_cnt); + +} + +static ucs_status_t uct_tcp_sockcm_ep_client_init(uct_tcp_sockcm_ep_t *cep, + const uct_ep_params_t *params) +{ + uct_tcp_sockcm_t *tcp_sockcm = uct_tcp_sockcm_ep_get_cm(cep); + uct_cm_base_ep_t *cm_ep = &cep->super; + char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + const struct sockaddr *server_addr; + ucs_async_context_t *async_ctx; + ucs_status_t status; + + cep->state |= UCT_TCP_SOCKCM_EP_ON_CLIENT; + + status = UCT_CM_SET_CB(params, UCT_EP_PARAM_FIELD_SOCKADDR_CONNECT_CB_CLIENT, + cm_ep->client.connect_cb, params->sockaddr_cb_client, + uct_cm_ep_client_connect_callback_t, + ucs_empty_function); + if (status != UCS_OK) { + goto err; + } + + server_addr = params->sockaddr->addr; + status = ucs_socket_create(server_addr->sa_family, SOCK_STREAM, &cep->fd); + if (status != UCS_OK) { + goto err; + } + + /* Set the fd to non-blocking mode. 
(so that connect() won't be blocking) */ + status = ucs_sys_fcntl_modfl(cep->fd, O_NONBLOCK, 0); + if (status != UCS_OK) { + status = UCS_ERR_IO_ERROR; + goto err_close_socket; + } + + status = uct_tcp_sockcm_ep_set_sockopt(cep); + if (status != UCS_OK) { + goto err_close_socket; + } + + /* try to connect to the server */ + status = ucs_socket_connect(cep->fd, server_addr); + if (UCS_STATUS_IS_ERR(status)) { + goto err_close_socket; + } + ucs_assert((status == UCS_OK) || (status == UCS_INPROGRESS)); + + async_ctx = tcp_sockcm->super.iface.worker->async; + status = ucs_async_set_event_handler(async_ctx->mode, cep->fd, + UCS_EVENT_SET_EVWRITE, + uct_tcp_sa_data_handler, cep, + async_ctx); + if (status != UCS_OK) { + goto err_close_socket; + } + + ucs_debug("created a TCP SOCKCM endpoint (fd=%d) on tcp cm %p, " + "remote addr: %s", cep->fd, tcp_sockcm, + ucs_sockaddr_str(server_addr, ip_port_str, UCS_SOCKADDR_STRING_LEN)); + + return status; + +err_close_socket: + uct_tcp_sockcm_ep_close_fd(&cep->fd); +err: + return status; +} + +static ucs_status_t uct_tcp_sockcm_ep_server_create(uct_tcp_sockcm_ep_t *tcp_ep, + const uct_ep_params_t *params, + uct_ep_h *ep_p) +{ + uct_tcp_sockcm_t *tcp_sockcm = uct_tcp_sockcm_ep_get_cm(tcp_ep); + uct_tcp_sockcm_t *params_tcp_sockcm; + ucs_async_context_t *new_async_ctx; + ucs_status_t status; + + if (!(params->field_mask & UCT_EP_PARAM_FIELD_CM)) { + ucs_error("UCT_EP_PARAM_FIELD_CM is not set. 
field_mask 0x%lx", + params->field_mask); + status = UCS_ERR_INVALID_PARAM; + goto err; + } + + if (params->cm == NULL) { + ucs_error("cm cannot be NULL (ep=%p fd=%d)", tcp_ep, tcp_ep->fd); + status = UCS_ERR_INVALID_PARAM; + goto err; + } + + + /* check if the server opened this ep, to the client, on a CM that is + * different from the one it created its internal ep on earlier, when it + * received the connection request from the client (the cm used by its listener) */ + if (&tcp_sockcm->super != params->cm) { + status = ucs_async_remove_handler(tcp_ep->fd, 1); + if (status != UCS_OK) { + ucs_error("failed to remove fd %d from the async handlers: %s", + tcp_ep->fd, ucs_status_string(status)); + goto err; + } + } + + UCS_ASYNC_BLOCK(tcp_sockcm->super.iface.worker->async); + + UCS_CLASS_CLEANUP(uct_cm_base_ep_t, &tcp_ep->super); + + /* set the server's ep to use the cm from params and its iface + * (it could be the previous one it had - the one used by the listener or + * a new one set by the user) */ + status = UCS_CLASS_INIT(uct_cm_base_ep_t, &tcp_ep->super, params); + if (status != UCS_OK) { + ucs_error("failed to initialize a uct_cm_base_ep_t endpoint"); + goto err_unblock; + } + + params_tcp_sockcm = ucs_derived_of(params->cm, uct_tcp_sockcm_t); + ucs_assert(uct_tcp_sockcm_ep_get_cm(tcp_ep) == params_tcp_sockcm); + + status = UCT_CM_SET_CB(params, UCT_EP_PARAM_FIELD_SOCKADDR_NOTIFY_CB_SERVER, + tcp_ep->super.server.notify_cb, params->sockaddr_cb_server, + uct_cm_ep_server_conn_notify_callback_t, + ucs_empty_function); + if (status != UCS_OK) { + goto err_unblock; + } + + /* the server's endpoint was already created by the listener, return it */ + *ep_p = &tcp_ep->super.super.super; + tcp_ep->state |= UCT_TCP_SOCKCM_EP_SERVER_CREATED; + + UCS_ASYNC_UNBLOCK(tcp_sockcm->super.iface.worker->async); + + if (&tcp_sockcm->super != params->cm) { + new_async_ctx = params_tcp_sockcm->super.iface.worker->async; + status = ucs_async_set_event_handler(new_async_ctx->mode, 
tcp_ep->fd, + UCS_EVENT_SET_EVREAD | + UCS_EVENT_SET_EVERR, + uct_tcp_sa_data_handler, + tcp_ep, new_async_ctx); + if (status != UCS_OK) { + ucs_error("failed to set event handler (fd %d): %s", + tcp_ep->fd, ucs_status_string(status)); + goto err; + } + + ucs_trace("moved tcp_sockcm ep %p from cm %p to cm %p", tcp_ep, + tcp_sockcm, params_tcp_sockcm); + } + + ucs_trace("server completed endpoint creation (fd=%d cm=%p state=%d)", + tcp_ep->fd, params_tcp_sockcm, tcp_ep->state); + + /* now that the server's ep was created, can try to send data */ + ucs_async_modify_handler(tcp_ep->fd, UCS_EVENT_SET_EVWRITE | UCS_EVENT_SET_EVREAD); + return UCS_OK; + +err_unblock: + UCS_ASYNC_UNBLOCK(tcp_sockcm->super.iface.worker->async); +err: + return status; +} + +UCS_CLASS_INIT_FUNC(uct_tcp_sockcm_ep_t, const uct_ep_params_t *params) +{ + ucs_status_t status; + + UCS_CLASS_CALL_SUPER_INIT(uct_cm_base_ep_t, params); + + uct_tcp_sockcm_ep_reset_comm_ctx(self); + self->state = 0; + self->comm_ctx.buf = ucs_calloc(1, uct_tcp_sockcm_ep_get_cm(self)->priv_data_len + + sizeof(uct_tcp_sockcm_priv_data_hdr_t), + "tcp_sockcm priv data"); + if (self->comm_ctx.buf == NULL) { + ucs_error("failed to allocate memory for the ep's send/recv buf"); + status = UCS_ERR_NO_MEMORY; + goto out; + } + + if (params->field_mask & UCT_EP_PARAM_FIELD_SOCKADDR) { + status = uct_tcp_sockcm_ep_client_init(self, params); + if (status != UCS_OK) { + ucs_free(self->comm_ctx.buf); + goto out; + } + } else { + self->state |= UCT_TCP_SOCKCM_EP_ON_SERVER; + status = UCS_OK; + } + + ucs_debug("%s created an endpoint on tcp_sockcm %p id: %d state: %d", + (self->state & UCT_TCP_SOCKCM_EP_ON_SERVER) ? 
"server" : "client", + uct_tcp_sockcm_ep_get_cm(self), self->fd, self->state); + +out: + return status; +} + +ucs_status_t uct_tcp_sockcm_ep_create(const uct_ep_params_t *params, uct_ep_h *ep_p) +{ + uct_tcp_sockcm_ep_t *tcp_ep; + ucs_status_t status; + + if (params->field_mask & UCT_EP_PARAM_FIELD_SOCKADDR) { + /* create a new endpoint for the client side */ + return UCS_CLASS_NEW(uct_tcp_sockcm_ep_t, ep_p, params); + } else if (params->field_mask & UCT_EP_PARAM_FIELD_CONN_REQUEST) { + tcp_ep = (uct_tcp_sockcm_ep_t*)params->conn_request; + + status = uct_tcp_sockcm_ep_server_create(tcp_ep, params, ep_p); + if (status != UCS_OK) { + UCS_CLASS_DELETE(uct_tcp_sockcm_ep_t, tcp_ep); + } + + return status; + } else { + ucs_error("either UCT_EP_PARAM_FIELD_SOCKADDR or UCT_EP_PARAM_FIELD_CONN_REQUEST " + "has to be provided"); + return UCS_ERR_INVALID_PARAM; + } +} + +UCS_CLASS_CLEANUP_FUNC(uct_tcp_sockcm_ep_t) +{ + uct_tcp_sockcm_t *tcp_sockcm = uct_tcp_sockcm_ep_get_cm(self); + + UCS_ASYNC_BLOCK(tcp_sockcm->super.iface.worker->async); + + ucs_trace("%s destroy ep %p on cm %p", + (self->state & UCT_TCP_SOCKCM_EP_ON_SERVER) ? "server" : "client", + self, tcp_sockcm); + + ucs_free(self->comm_ctx.buf); + + uct_tcp_sockcm_ep_close_fd(&self->fd); + UCS_ASYNC_UNBLOCK(tcp_sockcm->super.iface.worker->async); +} + +UCS_CLASS_DEFINE(uct_tcp_sockcm_ep_t, uct_base_ep_t); +UCS_CLASS_DEFINE_NEW_FUNC(uct_tcp_sockcm_ep_t, uct_ep_t, const uct_ep_params_t *); +UCS_CLASS_DEFINE_DELETE_FUNC(uct_tcp_sockcm_ep_t, uct_ep_t); diff --git a/src/uct/tcp/tcp_sockcm_ep.h b/src/uct/tcp/tcp_sockcm_ep.h new file mode 100644 index 00000000000..028c6bd7d96 --- /dev/null +++ b/src/uct/tcp/tcp_sockcm_ep.h @@ -0,0 +1,75 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#include "tcp_listener.h" + + +typedef enum uct_tcp_sockcm_ep_state { + UCT_TCP_SOCKCM_EP_ON_SERVER = UCS_BIT(0), /* ep is on the server side */ + UCT_TCP_SOCKCM_EP_ON_CLIENT = UCS_BIT(1), /* ep is on the client side */ + UCT_TCP_SOCKCM_EP_SERVER_CREATED = UCS_BIT(2), /* server's ep after call to uct_ep_create */ + UCT_TCP_SOCKCM_EP_PRIV_DATA_PACKED = UCS_BIT(3), /* ep packed its private data */ + UCT_TCP_SOCKCM_EP_HDR_RECEIVED = UCS_BIT(4), /* ep received the header of a new message */ + UCT_TCP_SOCKCM_EP_DATA_SENT = UCS_BIT(5), /* ep finished sending the data */ + UCT_TCP_SOCKCM_EP_DATA_RECEIVED = UCS_BIT(6), /* ep finished receiving the data */ + UCT_TCP_SOCKCM_EP_CLIENT_CONNECTED_CB_INVOKED = UCS_BIT(7), /* ep invoked the connect_cb on the client side */ + UCT_TCP_SOCKCM_EP_SERVER_NOTIFY_CB_INVOKED = UCS_BIT(8), /* ep invoked the notify_cb on the server side */ + UCT_TCP_SOCKCM_EP_CLIENT_NOTIFY_CALLED = UCS_BIT(9), /* ep on the client called notify API call */ + UCT_TCP_SOCKCM_EP_CLIENT_NOTIFY_SENT = UCS_BIT(10), /* ep on the client sent the notify message to the server */ + UCT_TCP_SOCKCM_EP_DISCONNECTING = UCS_BIT(11), /* @ref uct_ep_disconnect was called on the ep */ + UCT_TCP_SOCKCM_EP_GOT_DISCONNECT = UCS_BIT(12), /* ep received a disconnect notice from the remote peer */ + UCT_TCP_SOCKCM_EP_FAILED = UCS_BIT(13), /* ep is in error state due to an internal local error */ + UCT_TCP_SOCKCM_EP_CLIENT_GOT_REJECTED = UCS_BIT(14), /* ep on the client side received a reject from the server + (debug flag) */ +} uct_tcp_sockcm_ep_state_t; + + +/** + * TCP SOCKCM endpoint that is opened on a connection manager + */ +struct uct_tcp_sockcm_ep { + uct_cm_base_ep_t super; + int fd; /* the fd of the socket on the ep */ + uint16_t state; /* ep state (uct_tcp_sockcm_ep_state_t) */ + uct_tcp_listener_t *listener; /* the listener the ep belongs to - used on the server side */ + ucs_list_link_t list; /* list item on the cm ep_list - used on the server side */ + struct { 
+ void *buf; /* Data buffer to send/recv */ + size_t length; /* How much data to send/recv */ + size_t offset; /* Next offset to send/recv */ + } comm_ctx; +}; + +UCS_CLASS_DECLARE(uct_tcp_sockcm_ep_t, const uct_ep_params_t *); +UCS_CLASS_DECLARE_NEW_FUNC(uct_tcp_sockcm_ep_t, uct_ep_t, const uct_ep_params_t *); +UCS_CLASS_DECLARE_DELETE_FUNC(uct_tcp_sockcm_ep_t, uct_ep_t); + +static UCS_F_ALWAYS_INLINE +uct_tcp_sockcm_t *uct_tcp_sockcm_ep_get_cm(uct_tcp_sockcm_ep_t *cep) +{ + /* return the tcp sockcm connection manager this ep is using */ + return ucs_container_of(cep->super.super.super.iface, uct_tcp_sockcm_t, + super.iface); +} + +void uct_tcp_sockcm_ep_close_fd(int *fd); + +ucs_status_t uct_tcp_sockcm_ep_create(const uct_ep_params_t *params, uct_ep_h* ep_p); + +ucs_status_t uct_tcp_sockcm_ep_disconnect(uct_ep_h ep, unsigned flags); + +ucs_status_t uct_tcp_sockcm_ep_send(uct_tcp_sockcm_ep_t *cep); + +ucs_status_t uct_tcp_sockcm_ep_recv(uct_tcp_sockcm_ep_t *cep); + +ucs_status_t uct_tcp_sockcm_ep_set_sockopt(uct_tcp_sockcm_ep_t *ep); + +ucs_status_t uct_tcp_sockcm_cm_ep_conn_notify(uct_ep_h ep); + +void uct_tcp_sockcm_ep_handle_error(uct_tcp_sockcm_ep_t *cep, ucs_status_t status); + +const char *uct_tcp_sockcm_cm_ep_peer_addr_str(uct_tcp_sockcm_ep_t *cep, + char *buf, size_t max); diff --git a/src/uct/ugni/base/ugni_def.h b/src/uct/ugni/base/ugni_def.h index c3e9720e027..ccd5b12431b 100644 --- a/src/uct/ugni/base/ugni_def.h +++ b/src/uct/ugni/base/ugni_def.h @@ -46,17 +46,25 @@ do {\ #if ENABLE_MT #define uct_ugni_check_lock_needed(_cdm) UCS_THREAD_MODE_MULTI == (_cdm)->thread_mode -#define uct_ugni_cdm_init_lock(_cdm) ucs_spinlock_init(&(_cdm)->lock) -#define uct_ugni_cdm_destroy_lock(_cdm) ucs_spinlock_destroy(&(_cdm)->lock) +#define uct_ugni_cdm_init_lock(_cdm) ucs_recursive_spinlock_init(&(_cdm)->lock, 0) +#define uct_ugni_cdm_destroy_lock(_cdm) \ + do { \ + ucs_status_t status; \ + \ + status = ucs_recursive_spinlock_destroy(&(_cdm)->lock); \ + if (status != 
UCS_OK) {\ + ucs_warn("ucs_recursive_spinlock_destroy() failed (%d)", status); \ + } \ + } while(0) #define uct_ugni_cdm_lock(_cdm) \ if (uct_ugni_check_lock_needed(_cdm)) { \ ucs_trace_async("Taking lock"); \ - ucs_spin_lock(&(_cdm)->lock); \ + ucs_recursive_spin_lock(&(_cdm)->lock); \ } #define uct_ugni_cdm_unlock(_cdm) \ if (uct_ugni_check_lock_needed(_cdm)) { \ ucs_trace_async("Releasing lock"); \ - ucs_spin_unlock(&(_cdm)->lock); \ + ucs_recursive_spin_unlock(&(_cdm)->lock); \ } #else #define uct_ugni_cdm_init_lock(x) UCS_OK diff --git a/src/uct/ugni/base/ugni_device.c b/src/uct/ugni/base/ugni_device.c index 5512f0b9433..c9e8d74a8f1 100644 --- a/src/uct/ugni/base/ugni_device.c +++ b/src/uct/ugni/base/ugni_device.c @@ -7,6 +7,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ugni_device.h" #include "ugni_md.h" #include "ugni_iface.h" @@ -36,26 +40,25 @@ static uct_ugni_job_info_t job_info = { uint32_t ugni_domain_counter = 0; -void uct_ugni_device_get_resource(const char *tl_name, uct_ugni_device_t *dev, - uct_tl_resource_desc_t *resource) +void uct_ugni_device_get_resource(uct_ugni_device_t *dev, + uct_tl_device_resource_t *tl_device) { - ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", tl_name); - ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", dev->fname); - resource->dev_type = UCT_DEVICE_TYPE_NET; + ucs_snprintf_zero(tl_device->name, sizeof(tl_device->name), "%s", dev->fname); + tl_device->type = UCT_DEVICE_TYPE_NET; } -ucs_status_t uct_ugni_query_tl_resources(uct_md_h md, const char *tl_name, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) +ucs_status_t uct_ugni_query_devices(uct_md_h md, + uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) { - uct_tl_resource_desc_t *resources; + uct_tl_device_resource_t *resources; int num_devices = job_info.num_devices; uct_ugni_device_t *devs = job_info.devices; int i; ucs_status_t 
status = UCS_OK; - resources = ucs_calloc(job_info.num_devices, sizeof(uct_tl_resource_desc_t), - "resource desc"); + resources = ucs_calloc(job_info.num_devices, sizeof(*resources), + "resource desc"); if (NULL == resources) { ucs_error("Failed to allocate memory"); num_devices = 0; @@ -65,12 +68,12 @@ ucs_status_t uct_ugni_query_tl_resources(uct_md_h md, const char *tl_name, } for (i = 0; i < job_info.num_devices; i++) { - uct_ugni_device_get_resource(tl_name, &devs[i], &resources[i]); + uct_ugni_device_get_resource(&devs[i], &resources[i]); } error: - *num_resources_p = num_devices; - *resource_p = resources; + *num_tl_devices_p = num_devices; + *tl_devices_p = resources; return status; } @@ -324,7 +327,7 @@ static ucs_status_t get_nic_address(uct_ugni_device_t *dev_p) return UCS_OK; } -ucs_status_t uct_ugni_device_create(int dev_id, int index, uct_ugni_device_t *dev_p) +ucs_status_t uct_ugni_device_create(int dev_id, int idx, uct_ugni_device_t *dev_p) { ucs_status_t status; gni_return_t ugni_rc; @@ -359,7 +362,7 @@ ucs_status_t uct_ugni_device_create(int dev_id, int index, uct_ugni_device_t *de } ucs_snprintf_zero(dev_p->fname, sizeof(dev_p->fname), "%s:%d", - dev_p->type_name, index); + dev_p->type_name, idx); return UCS_OK; } @@ -403,21 +406,23 @@ static int uct_ugni_next_power_of_two_inclusive (int value) ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device, ucs_thread_mode_t thread_mode) { - uct_ugni_job_info_t *job_info; + uct_ugni_job_info_t *j_info; int modes; gni_return_t ugni_rc; ucs_status_t status = UCS_OK; int pid_max = 32768, free_bits; FILE *fh; - job_info = uct_ugni_get_job_info(); - if (NULL == job_info) { + j_info = uct_ugni_get_job_info(); + if (NULL == j_info) { return UCS_ERR_IO_ERROR; } fh = fopen ("/proc/sys/kernel/pid_max", "r"); if (NULL != fh) { - fscanf (fh, "%d", &pid_max); + if (fscanf (fh, "%d", &pid_max) != 1) { + ucs_debug("cound not read pid_max, using default"); + } fclose (fh); } @@ -433,7 +438,7 @@ 
ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device, cdm->domain_id, getpid (), free_bits, ugni_domain_counter); modes = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_CACHED_AMO_ENABLED | GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL | GNI_CDM_MODE_FMA_SHARED; - ugni_rc = GNI_CdmCreate(cdm->domain_id, job_info->ptag, job_info->cookie, + ugni_rc = GNI_CdmCreate(cdm->domain_id, j_info->ptag, j_info->cookie, modes, &cdm->cdm_handle); if (GNI_RC_SUCCESS != ugni_rc) { ucs_error("GNI_CdmCreate failed, Error status: %s %d", diff --git a/src/uct/ugni/base/ugni_device.h b/src/uct/ugni/base/ugni_device.h index dd6c7be3c79..3c1e702de23 100644 --- a/src/uct/ugni/base/ugni_device.h +++ b/src/uct/ugni/base/ugni_device.h @@ -12,15 +12,15 @@ ucs_status_t uct_ugni_device_create(int dev_id, int index, uct_ugni_device_t *dev_p); void uct_ugni_device_destroy(uct_ugni_device_t *dev); -void uct_ugni_device_get_resource(const char *tl_name, uct_ugni_device_t *dev, - uct_tl_resource_desc_t *resource); +void uct_ugni_device_get_resource(uct_ugni_device_t *dev, + uct_tl_device_resource_t *tl_device); ucs_status_t uct_ugni_iface_get_dev_address(uct_iface_t *tl_iface, uct_device_addr_t *addr); ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device, ucs_thread_mode_t thread_mode); ucs_status_t uct_ugni_destroy_cdm(uct_ugni_cdm_t *cdm); uct_ugni_device_t *uct_ugni_device_by_name(const char *dev_name); -ucs_status_t uct_ugni_query_tl_resources(uct_md_h md, const char *tl_name, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p); +ucs_status_t uct_ugni_query_devices(uct_md_h md, + uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p); ucs_status_t init_device_list(); ucs_status_t uct_ugni_create_md_cdm(uct_ugni_cdm_t *cdm); ucs_status_t uct_ugni_create_cq(gni_cq_handle_t *cq, unsigned cq_size, uct_ugni_cdm_t *cdm); diff --git a/src/uct/ugni/base/ugni_ep.c b/src/uct/ugni/base/ugni_ep.c index 
d523d59c27e..2c2e435a854 100644 --- a/src/uct/ugni/base/ugni_ep.c +++ b/src/uct/ugni/base/ugni_ep.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ugni_ep.h" #include "ugni_iface.h" @@ -27,16 +31,18 @@ ucs_status_t uct_ugni_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *n, } ucs_arbiter_cb_result_t uct_ugni_ep_process_pending(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg){ - uct_ugni_ep_t *ep = ucs_container_of(ucs_arbiter_elem_group(elem), uct_ugni_ep_t, arb_group); + uct_ugni_ep_t *ep = ucs_container_of(group, uct_ugni_ep_t, arb_group); uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, priv); ucs_status_t rc; ep->arb_sched = 1; + ucs_trace_data("progressing pending request %p", req); rc = req->func(req); ep->arb_sched = 0; - ucs_trace_data("progress pending request %p returned %s", req, + ucs_trace_data("status returned from progress pending: %s", ucs_status_string(rc)); if (UCS_OK == rc) { @@ -52,10 +58,11 @@ ucs_arbiter_cb_result_t uct_ugni_ep_process_pending(ucs_arbiter_t *arbiter, } ucs_arbiter_cb_result_t uct_ugni_ep_abriter_purge_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { - uct_ugni_ep_t *ep = ucs_container_of(ucs_arbiter_elem_group(elem), uct_ugni_ep_t, arb_group); + uct_ugni_ep_t *ep = ucs_container_of(group, uct_ugni_ep_t, arb_group); uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, priv); uct_purge_cb_args_t *cb_args = arg; diff --git a/src/uct/ugni/base/ugni_ep.h b/src/uct/ugni/base/ugni_ep.h index 1e5115a2ef6..0d641f64d07 100644 --- a/src/uct/ugni/base/ugni_ep.h +++ b/src/uct/ugni/base/ugni_ep.h @@ -40,9 +40,11 @@ ucs_status_t uct_ugni_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *n, void uct_ugni_ep_pending_purge(uct_ep_h tl_ep, uct_pending_purge_callback_t cb, void *arg); ucs_arbiter_cb_result_t uct_ugni_ep_process_pending(ucs_arbiter_t *arbiter, 
+ ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg); ucs_arbiter_cb_result_t uct_ugni_ep_abriter_purge_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg); ucs_status_t uct_ugni_ep_flush(uct_ep_h tl_ep, unsigned flags, diff --git a/src/uct/ugni/base/ugni_iface.c b/src/uct/ugni/base/ugni_iface.c index 8a27b275a71..f4a6b1ec84b 100644 --- a/src/uct/ugni/base/ugni_iface.c +++ b/src/uct/ugni/base/ugni_iface.c @@ -1,8 +1,13 @@ /** * Copyright (c) UT-Battelle, LLC. 2014-2017. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ugni_types.h" #include "ugni_md.h" #include "ugni_device.h" diff --git a/src/uct/ugni/base/ugni_md.c b/src/uct/ugni/base/ugni_md.c index 085c8841b81..8e0b79c842d 100644 --- a/src/uct/ugni/base/ugni_md.c +++ b/src/uct/ugni/base/ugni_md.c @@ -1,9 +1,13 @@ /** * Copyright (c) UT-Battelle, LLC. 2014-2017. ALL RIGHTS RESERVED. - * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ugni_device.h" #include "ugni_iface.h" #include "ugni_md.h" @@ -16,30 +20,31 @@ UCS_CONFIG_DEFINE_ARRAY(ugni_alloc_methods, sizeof(uct_alloc_method_t), pthread_mutex_t uct_ugni_global_lock = PTHREAD_MUTEX_INITIALIZER; /* For Cray devices we have only one MD */ -static ucs_status_t uct_ugni_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) +static ucs_status_t +uct_ugni_query_md_resources(uct_component_h component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) { - if (getenv("PMI_GNI_PTAG") != NULL) { - return uct_single_md_resource(&uct_ugni_md_component, resources_p, num_resources_p); - } else { - *resources_p = NULL; - *num_resources_p = 0; - return UCS_OK; + if (getenv("PMI_GNI_PTAG") == NULL) { + return uct_md_query_empty_md_resource(resources_p, num_resources_p); } + + return uct_md_query_single_md_resource(component, resources_p, + num_resources_p); } static ucs_status_t uct_ugni_md_query(uct_md_h md, uct_md_attr_t *md_attr) { - md_attr->rkey_packed_size = 3 * sizeof(uint64_t); - md_attr->cap.flags = UCT_MD_FLAG_REG | - UCT_MD_FLAG_NEED_MEMH | - UCT_MD_FLAG_NEED_RKEY; - md_attr->cap.reg_mem_types = UCS_BIT(UCT_MD_MEM_TYPE_HOST); - md_attr->cap.mem_type = UCT_MD_MEM_TYPE_HOST; - md_attr->cap.max_alloc = 0; - md_attr->cap.max_reg = ULONG_MAX; - md_attr->reg_cost.overhead = 1000.0e-9; - md_attr->reg_cost.growth = 0.007e-9; + md_attr->rkey_packed_size = 3 * sizeof(uint64_t); + md_attr->cap.flags = UCT_MD_FLAG_REG | + UCT_MD_FLAG_NEED_MEMH | + UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.reg_mem_types = UCS_MEMORY_TYPES_CPU_ACCESSIBLE; + md_attr->cap.access_mem_type = UCS_MEMORY_TYPE_HOST; + md_attr->cap.detect_mem_types = 0; + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = ULONG_MAX; + md_attr->reg_cost = ucs_linear_func_make(1000.0e-9, 0.007e-9); memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); return UCS_OK; } @@ -120,15 
+125,16 @@ static ucs_status_t uct_ugni_rkey_pack(uct_md_h md, uct_mem_h memh, return UCS_OK; } -static ucs_status_t uct_ugni_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, - void *handle) +static ucs_status_t uct_ugni_rkey_release(uct_component_t *component, + uct_rkey_t rkey, void *handle) { ucs_assert(NULL == handle); ucs_free((void *)rkey); return UCS_OK; } -static ucs_status_t uct_ugni_rkey_unpack(uct_md_component_t *mdc, const void *rkey_buffer, +static ucs_status_t uct_ugni_rkey_unpack(uct_component_t *component, + const void *rkey_buffer, uct_rkey_t *rkey_p, void **handle_p) { const uint64_t *ptr = rkey_buffer; @@ -169,26 +175,27 @@ static void uct_ugni_md_close(uct_md_h md) pthread_mutex_unlock(&uct_ugni_global_lock); } -static ucs_status_t uct_ugni_md_open(const char *md_name, const uct_md_config_t *md_config, - uct_md_h *md_p) +static ucs_status_t +uct_ugni_md_open(uct_component_h component,const char *md_name, + const uct_md_config_t *md_config, uct_md_h *md_p) { ucs_status_t status = UCS_OK; pthread_mutex_lock(&uct_ugni_global_lock); static uct_md_ops_t md_ops = { - .close = uct_ugni_md_close, - .query = uct_ugni_md_query, - .mem_alloc = (void*)ucs_empty_function, - .mem_free = (void*)ucs_empty_function, - .mem_reg = uct_ugni_mem_reg, - .mem_dereg = uct_ugni_mem_dereg, - .mkey_pack = uct_ugni_rkey_pack, - .is_mem_type_owned = (void *)ucs_empty_function_return_zero, + .close = uct_ugni_md_close, + .query = uct_ugni_md_query, + .mem_alloc = (void*)ucs_empty_function, + .mem_free = (void*)ucs_empty_function, + .mem_reg = uct_ugni_mem_reg, + .mem_dereg = uct_ugni_mem_dereg, + .mkey_pack = uct_ugni_rkey_pack, + .detect_memory_type = ucs_empty_function_return_unsupported, }; static uct_ugni_md_t md = { .super.ops = &md_ops, - .super.component = &uct_ugni_md_component, + .super.component = &uct_ugni_component, .ref_count = 0 }; @@ -214,14 +221,22 @@ static ucs_status_t uct_ugni_md_open(const char *md_name, const uct_md_config_t return status; } - 
-UCT_MD_COMPONENT_DEFINE(uct_ugni_md_component, - UCT_UGNI_MD_NAME, - uct_ugni_query_md_resources, - uct_ugni_md_open, - NULL, - uct_ugni_rkey_unpack, - uct_ugni_rkey_release, - "UGNI_", - uct_md_config_table, - uct_md_config_t); +uct_component_t uct_ugni_component = { + .query_md_resources = uct_ugni_query_md_resources, + .md_open = uct_ugni_md_open, + .cm_open = ucs_empty_function_return_unsupported, + .rkey_unpack = uct_ugni_rkey_unpack, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = uct_ugni_rkey_release, + .name = UCT_UGNI_MD_NAME, + .md_config = { + .name = "UGNI memory domain", + .prefix = "UGNI_", + .table = uct_md_config_table, + .size = sizeof(uct_md_config_t), + }, + .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_ugni_component), + .flags = 0 +}; +UCT_COMPONENT_REGISTER(&uct_ugni_component); diff --git a/src/uct/ugni/base/ugni_md.h b/src/uct/ugni/base/ugni_md.h index 986c57463b4..c5a6a732e7c 100644 --- a/src/uct/ugni/base/ugni_md.h +++ b/src/uct/ugni/base/ugni_md.h @@ -10,6 +10,6 @@ #include "ugni_def.h" #include -extern uct_md_component_t uct_ugni_md_component; +extern uct_component_t uct_ugni_component; #endif diff --git a/src/uct/ugni/base/ugni_types.h b/src/uct/ugni/base/ugni_types.h index 63b7eab3f2a..e5c8f8cd53d 100644 --- a/src/uct/ugni/base/ugni_types.h +++ b/src/uct/ugni/base/ugni_types.h @@ -9,10 +9,13 @@ #define UCT_UGNI_TYPES_H #include "ugni_def.h" + +#include #include #include #include + typedef struct uct_ugni_device { gni_nic_device_t type; /**< Device type */ char type_name[UCT_UGNI_MAX_TYPE_NAME]; /**< Device type name */ @@ -21,20 +24,20 @@ typedef struct uct_ugni_device { uint32_t address; /**< Device address */ uint32_t cpu_id; /**< CPU attached directly to the device */ - cpu_set_t cpu_mask; /**< CPU mask */ + ucs_sys_cpuset_t cpu_mask; /**< CPU mask */ /* TBD - reference counter */ } uct_ugni_device_t; typedef struct uct_ugni_cdm { - gni_cdm_handle_t 
cdm_handle; /**< Ugni communication domain */ - gni_nic_handle_t nic_handle; /**< Ugni NIC handle */ - uct_ugni_device_t *dev; /**< Ugni device the cdm is connected to */ - ucs_thread_mode_t thread_mode; - uint32_t address; - uint32_t domain_id; + gni_cdm_handle_t cdm_handle; /**< Ugni communication domain */ + gni_nic_handle_t nic_handle; /**< Ugni NIC handle */ + uct_ugni_device_t *dev; /**< Ugni device the cdm is connected to */ + ucs_thread_mode_t thread_mode; + uint32_t address; + uint32_t domain_id; #if ENABLE_MT - ucs_spinlock_t lock; /**< Device lock */ + ucs_recursive_spinlock_t lock; /**< Device lock */ #endif } uct_ugni_cdm_t; diff --git a/src/uct/ugni/configure.m4 b/src/uct/ugni/configure.m4 index 95db667a578..a2979a860e5 100644 --- a/src/uct/ugni/configure.m4 +++ b/src/uct/ugni/configure.m4 @@ -7,21 +7,26 @@ cray_ugni_supported=no AC_ARG_WITH([ugni], - [AC_HELP_STRING([--with-ugni(=DIR)], - [Build Cray UGNI support, adding DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries])], - [], - [with_ugni=default]) + [AC_HELP_STRING([--with-ugni(=DIR)], [Build Cray UGNI support])], + [], + [with_ugni=guess]) + +AS_IF([test "x$with_ugni" != "xno"], + [AC_MSG_CHECKING([cray-ugni]) + AS_IF([$PKG_CONFIG --exists cray-ugni cray-pmi], + [AC_MSG_RESULT([yes]) + AC_SUBST([CRAY_UGNI_CFLAGS], [`$PKG_CONFIG --cflags cray-ugni cray-pmi`]) + AC_SUBST([CRAY_UGNI_LIBS], [`$PKG_CONFIG --libs cray-ugni cray-pmi`]) + uct_modules="${uct_modules}:ugni" + cray_ugni_supported=yes + AC_DEFINE([HAVE_TL_UGNI], [1], [Defined if UGNI transport exists]) + ], + [AC_MSG_RESULT([no]) + AS_IF([test "x$with_ugni" != "xguess"], + [AC_MSG_ERROR([UGNI support was requested but cray-ugni and cray-pmi packages cannot be found])]) + ]) + ]) -AS_IF([test "x$with_ugni" != "xno"], - [PKG_CHECK_MODULES([CRAY_UGNI], [cray-ugni cray-pmi], - [uct_modules+=":ugni" - cray_ugni_supported=yes - AC_DEFINE([HAVE_TL_UGNI], [1], - [Define if UGNI transport exists.])], - [AS_IF([test 
"x$with_ugni" != "xdefault"], - [AC_MSG_WARN([UGNI support was requested but cray-ugni and cray-pmi packages cannot be found]) - AC_MSG_ERROR([Cannot continue])],[])] - )]) AM_CONDITIONAL([HAVE_CRAY_UGNI], [test "x$cray_ugni_supported" = xyes]) AC_CONFIG_FILES([src/uct/ugni/Makefile]) diff --git a/src/uct/ugni/rdma/ugni_rdma_ep.c b/src/uct/ugni/rdma/ugni_rdma_ep.c index 3f0de8eb06a..f86f99a1260 100644 --- a/src/uct/ugni/rdma/ugni_rdma_ep.c +++ b/src/uct/ugni/rdma/ugni_rdma_ep.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ugni_rdma_ep.h" #include "ugni_rdma_iface.h" #include @@ -15,29 +19,29 @@ uct_mem_h _memh = _iov[0].memh; /* Endpoint operations */ -static inline void uct_ugni_invoke_orig_comp(uct_ugni_rdma_fetch_desc_t *fma, ucs_status_t status) +static inline void uct_ugni_invoke_orig_comp(uct_ugni_rdma_fetch_desc_t *fma_desc, ucs_status_t status) { - if (ucs_likely(NULL != fma->orig_comp_cb)) { - uct_invoke_completion(fma->orig_comp_cb, status); + if (ucs_likely(NULL != fma_desc->orig_comp_cb)) { + uct_invoke_completion(fma_desc->orig_comp_cb, status); } } -static inline void uct_ugni_format_fma(uct_ugni_base_desc_t *fma, gni_post_type_t type, +static inline void uct_ugni_format_fma(uct_ugni_base_desc_t *fma_desc, gni_post_type_t type, const void *buffer, uint64_t remote_addr, uct_rkey_t rkey, unsigned length, uct_ugni_ep_t *ep, uct_completion_t *comp, uct_unpack_callback_t unpack_cb) { - fma->desc.type = type; - fma->desc.cq_mode = GNI_CQMODE_GLOBAL_EVENT; - fma->desc.dlvr_mode = GNI_DLVMODE_PERFORMANCE; - fma->desc.local_addr = (uint64_t)buffer; - fma->desc.remote_addr = remote_addr; - fma->desc.remote_mem_hndl = *(gni_mem_handle_t *)rkey; - fma->desc.length = length; - fma->flush_group = ep->flush_group; - fma->comp_cb = comp; - fma->unpack_cb = unpack_cb; + fma_desc->desc.type = type; + fma_desc->desc.cq_mode = GNI_CQMODE_GLOBAL_EVENT; + fma_desc->desc.dlvr_mode = 
GNI_DLVMODE_PERFORMANCE; + fma_desc->desc.local_addr = (uint64_t)buffer; + fma_desc->desc.remote_addr = remote_addr; + fma_desc->desc.remote_mem_hndl = *(gni_mem_handle_t *)rkey; + fma_desc->desc.length = length; + fma_desc->flush_group = ep->flush_group; + fma_desc->comp_cb = comp; + fma_desc->unpack_cb = unpack_cb; } static inline void uct_ugni_format_fma_amo(uct_ugni_rdma_fetch_desc_t *amo, gni_post_type_t type, @@ -118,20 +122,20 @@ static inline ucs_status_t uct_ugni_post_rdma(uct_ugni_rdma_iface_t *iface, static inline ssize_t uct_ugni_post_fma(uct_ugni_rdma_iface_t *iface, uct_ugni_ep_t *ep, - uct_ugni_base_desc_t *fma, + uct_ugni_base_desc_t *fma_desc, ssize_t ok_status) { gni_return_t ugni_rc; if (ucs_unlikely(!uct_ugni_ep_can_send(ep))) { - ucs_mpool_put(fma); + ucs_mpool_put(fma_desc); return UCS_ERR_NO_RESOURCE; } uct_ugni_cdm_lock(&iface->super.cdm); - ugni_rc = GNI_PostFma(ep->ep, &fma->desc); + ugni_rc = GNI_PostFma(ep->ep, &fma_desc->desc); uct_ugni_cdm_unlock(&iface->super.cdm); if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) { - ucs_mpool_put(fma); + ucs_mpool_put(fma_desc); if(GNI_RC_ERROR_RESOURCE == ugni_rc || GNI_RC_ERROR_NOMEM == ugni_rc) { ucs_debug("GNI_PostFma failed, Error status: %s %d", gni_err_str[ugni_rc], ugni_rc); @@ -143,7 +147,7 @@ static inline ssize_t uct_ugni_post_fma(uct_ugni_rdma_iface_t *iface, } } - ++fma->flush_group->flush_comp.count; + ++fma_desc->flush_group->flush_comp.count; ++iface->super.outstanding; return ok_status; @@ -155,23 +159,23 @@ ucs_status_t uct_ugni_ep_put_short(uct_ep_h tl_ep, const void *buffer, { uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t); uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ugni_rdma_iface_t); - uct_ugni_base_desc_t *fma; + uct_ugni_base_desc_t *fma_desc; UCT_SKIP_ZERO_LENGTH(length); UCT_CHECK_LENGTH(length, 0, iface->config.fma_seg_size, "put_short"); UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc, - fma, return UCS_ERR_NO_RESOURCE); - 
uct_ugni_format_fma(fma, GNI_POST_FMA_PUT, buffer, + fma_desc, return UCS_ERR_NO_RESOURCE); + uct_ugni_format_fma(fma_desc, GNI_POST_FMA_PUT, buffer, remote_addr, rkey, length, ep, NULL, NULL); ucs_trace_data("Posting PUT Short, GNI_PostFma of size %"PRIx64" from %p to " "%p, with [%"PRIx64" %"PRIx64"]", - fma->desc.length, - (void *)fma->desc.local_addr, - (void *)fma->desc.remote_addr, - fma->desc.remote_mem_hndl.qword1, - fma->desc.remote_mem_hndl.qword2); + fma_desc->desc.length, + (void *)fma_desc->desc.local_addr, + (void *)fma_desc->desc.remote_addr, + fma_desc->desc.remote_mem_hndl.qword1, + fma_desc->desc.remote_mem_hndl.qword2); UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), PUT, SHORT, length); - return uct_ugni_post_fma(iface, ep, fma, UCS_OK); + return uct_ugni_post_fma(iface, ep, fma_desc, UCS_OK); } ssize_t uct_ugni_ep_put_bcopy(uct_ep_h tl_ep, uct_pack_callback_t pack_cb, @@ -184,26 +188,26 @@ ssize_t uct_ugni_ep_put_bcopy(uct_ep_h tl_ep, uct_pack_callback_t pack_cb, * pack_cb(desc + 1, arg, length); */ uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t); uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ugni_rdma_iface_t); - uct_ugni_base_desc_t *fma; + uct_ugni_base_desc_t *fma_desc; size_t length; UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_buffer, - fma, return UCS_ERR_NO_RESOURCE); + fma_desc, return UCS_ERR_NO_RESOURCE); - length = pack_cb(fma + 1, arg); - UCT_SKIP_ZERO_LENGTH(length, fma); + length = pack_cb(fma_desc + 1, arg); + UCT_SKIP_ZERO_LENGTH(length, fma_desc); UCT_CHECK_LENGTH(length, 0, iface->config.fma_seg_size, "put_bcopy"); - uct_ugni_format_fma(fma, GNI_POST_FMA_PUT, fma + 1, + uct_ugni_format_fma(fma_desc, GNI_POST_FMA_PUT, fma_desc + 1, remote_addr, rkey, length, ep, NULL, NULL); ucs_trace_data("Posting PUT BCOPY, GNI_PostFma of size %"PRIx64" from %p to " "%p, with [%"PRIx64" %"PRIx64"]", - fma->desc.length, - (void *)fma->desc.local_addr, - (void *)fma->desc.remote_addr, - 
fma->desc.remote_mem_hndl.qword1, - fma->desc.remote_mem_hndl.qword2); + fma_desc->desc.length, + (void *)fma_desc->desc.local_addr, + (void *)fma_desc->desc.remote_addr, + fma_desc->desc.remote_mem_hndl.qword1, + fma_desc->desc.remote_mem_hndl.qword2); UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), PUT, BCOPY, length); - return uct_ugni_post_fma(iface, ep, fma, length); + return uct_ugni_post_fma(iface, ep, fma_desc, length); } ucs_status_t uct_ugni_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, @@ -235,12 +239,12 @@ ucs_status_t uct_ugni_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t static void uct_ugni_amo_unpack64(uct_completion_t *self, ucs_status_t status) { - uct_ugni_rdma_fetch_desc_t *fma = (uct_ugni_rdma_fetch_desc_t *) + uct_ugni_rdma_fetch_desc_t *fma_desc = (uct_ugni_rdma_fetch_desc_t *) ucs_container_of(self, uct_ugni_rdma_fetch_desc_t, tmp); - /* Call the orignal callback and skip padding */ - *(uint64_t *)fma->user_buffer = *(uint64_t *)(fma + 1); - uct_ugni_invoke_orig_comp(fma, status); + /* Call the original callback and skip padding */ + *(uint64_t *)fma_desc->user_buffer = *(uint64_t *)(fma_desc + 1); + uct_ugni_invoke_orig_comp(fma_desc, status); } ucs_status_t uct_ugni_ep_atomic_cswap64(uct_ep_h tl_ep, uint64_t compare, uint64_t swap, @@ -249,31 +253,31 @@ ucs_status_t uct_ugni_ep_atomic_cswap64(uct_ep_h tl_ep, uint64_t compare, uint64 { uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t); uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ugni_rdma_iface_t); - uct_ugni_rdma_fetch_desc_t *fma; + uct_ugni_rdma_fetch_desc_t *fma_desc; - UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma, + UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma_desc, return UCS_ERR_NO_RESOURCE); - uct_ugni_format_fma_amo(fma, GNI_POST_AMO, GNI_FMA_ATOMIC_CSWAP, - compare, swap, fma + 1, remote_addr, + uct_ugni_format_fma_amo(fma_desc, GNI_POST_AMO, 
GNI_FMA_ATOMIC_CSWAP, + compare, swap, fma_desc + 1, remote_addr, rkey, LEN_64, ep, comp, uct_ugni_amo_unpack64, (void *)result); ucs_trace_data("Posting AMO CSWAP, GNI_PostFma of size %"PRIx64" value" "%"PRIx64" compare %"PRIx64" to %p, with [%"PRIx64" %"PRIx64"]", - fma->super.desc.length, swap, compare, - (void *)fma->super.desc.remote_addr, - fma->super.desc.remote_mem_hndl.qword1, - fma->super.desc.remote_mem_hndl.qword2); + fma_desc->super.desc.length, swap, compare, + (void *)fma_desc->super.desc.remote_addr, + fma_desc->super.desc.remote_mem_hndl.qword1, + fma_desc->super.desc.remote_mem_hndl.qword2); UCT_TL_EP_STAT_ATOMIC(ucs_derived_of(tl_ep, uct_base_ep_t)); - return uct_ugni_post_fma(iface, ep, &fma->super, UCS_INPROGRESS); + return uct_ugni_post_fma(iface, ep, &fma_desc->super, UCS_INPROGRESS); } static void uct_ugni_amo_unpack32(uct_completion_t *self, ucs_status_t status) { - uct_ugni_rdma_fetch_desc_t *fma = (uct_ugni_rdma_fetch_desc_t *) + uct_ugni_rdma_fetch_desc_t *fma_desc = (uct_ugni_rdma_fetch_desc_t *) ucs_container_of(self, uct_ugni_rdma_fetch_desc_t, tmp); - /* Call the orignal callback and skip padding */ - *(uint32_t *)fma->user_buffer = *(uint32_t *)(fma + 1); - uct_ugni_invoke_orig_comp(fma, status); + /* Call the original callback and skip padding */ + *(uint32_t *)fma_desc->user_buffer = *(uint32_t *)(fma_desc + 1); + uct_ugni_invoke_orig_comp(fma_desc, status); } ucs_status_t uct_ugni_ep_atomic_cswap32(uct_ep_h tl_ep, uint32_t compare, uint32_t swap, @@ -282,21 +286,21 @@ ucs_status_t uct_ugni_ep_atomic_cswap32(uct_ep_h tl_ep, uint32_t compare, uint32 { uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t); uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ugni_rdma_iface_t); - uct_ugni_rdma_fetch_desc_t *fma; + uct_ugni_rdma_fetch_desc_t *fma_desc; - UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma, + UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma_desc, return 
UCS_ERR_NO_RESOURCE); - uct_ugni_format_fma_amo(fma, GNI_POST_AMO, GNI_FMA_ATOMIC2_FCSWAP_S, - (uint64_t)compare, (uint64_t)swap, fma + 1, remote_addr, + uct_ugni_format_fma_amo(fma_desc, GNI_POST_AMO, GNI_FMA_ATOMIC2_FCSWAP_S, + (uint64_t)compare, (uint64_t)swap, fma_desc + 1, remote_addr, rkey, LEN_32, ep, comp, uct_ugni_amo_unpack32, (void *)result); ucs_trace_data("Posting AMO CSWAP, GNI_PostFma of size %"PRIx64" value" "%"PRIx32" compare %"PRIx32" to %p, with [%"PRIx64" %"PRIx64"]", - fma->super.desc.length, swap, compare, - (void *)fma->super.desc.remote_addr, - fma->super.desc.remote_mem_hndl.qword1, - fma->super.desc.remote_mem_hndl.qword2); + fma_desc->super.desc.length, swap, compare, + (void *)fma_desc->super.desc.remote_addr, + fma_desc->super.desc.remote_mem_hndl.qword1, + fma_desc->super.desc.remote_mem_hndl.qword2); UCT_TL_EP_STAT_ATOMIC(ucs_derived_of(tl_ep, uct_base_ep_t)); - return uct_ugni_post_fma(iface, ep, &fma->super, UCS_INPROGRESS); + return uct_ugni_post_fma(iface, ep, &fma_desc->super, UCS_INPROGRESS); } ucs_status_t uct_ugni_ep_atomic_op32(uct_ep_h tl_ep, uint32_t op, @@ -305,21 +309,21 @@ ucs_status_t uct_ugni_ep_atomic_op32(uct_ep_h tl_ep, uint32_t op, { uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t); uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ugni_rdma_iface_t); - uct_ugni_rdma_fetch_desc_t *fma; + uct_ugni_rdma_fetch_desc_t *fma_desc; - UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma, + UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma_desc, return UCS_ERR_NO_RESOURCE); - uct_ugni_format_fma_amo(fma, GNI_POST_AMO, op_type, + uct_ugni_format_fma_amo(fma_desc, GNI_POST_AMO, op_type, (uint64_t)op, 0, NULL, remote_addr, rkey, LEN_32, ep, NULL, NULL, NULL); ucs_trace_data("Posting AMO %s, GNI_PostFma of size %"PRIx64" value" "%"PRIx32" to %p, with [%"PRIx64" %"PRIx64"]", - op_str, fma->super.desc.length, op, - (void *)fma->super.desc.remote_addr, - 
fma->super.desc.remote_mem_hndl.qword1, - fma->super.desc.remote_mem_hndl.qword2); + op_str, fma_desc->super.desc.length, op, + (void *)fma_desc->super.desc.remote_addr, + fma_desc->super.desc.remote_mem_hndl.qword1, + fma_desc->super.desc.remote_mem_hndl.qword2); UCT_TL_EP_STAT_ATOMIC(ucs_derived_of(tl_ep, uct_base_ep_t)); - return uct_ugni_post_fma(iface, ep, &fma->super, UCS_OK); + return uct_ugni_post_fma(iface, ep, &fma_desc->super, UCS_OK); } ucs_status_t uct_ugni_ep_atomic32_post(uct_ep_h ep, unsigned opcode, uint32_t value, @@ -350,21 +354,21 @@ ucs_status_t uct_ugni_ep_atomic_op64(uct_ep_h tl_ep, uint64_t op, { uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t); uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ugni_rdma_iface_t); - uct_ugni_rdma_fetch_desc_t *fma; + uct_ugni_rdma_fetch_desc_t *fma_desc; - UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma, + UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma_desc, return UCS_ERR_NO_RESOURCE); - uct_ugni_format_fma_amo(fma, GNI_POST_AMO, op_type, + uct_ugni_format_fma_amo(fma_desc, GNI_POST_AMO, op_type, op, 0, NULL, remote_addr, rkey, LEN_64, ep, NULL, NULL, NULL); ucs_trace_data("Posting AMO %s, GNI_PostFma of size %"PRIx64" value" "%"PRIx64" to %p, with [%"PRIx64" %"PRIx64"]", - op_str, fma->super.desc.length, op, - (void *)fma->super.desc.remote_addr, - fma->super.desc.remote_mem_hndl.qword1, - fma->super.desc.remote_mem_hndl.qword2); + op_str, fma_desc->super.desc.length, op, + (void *)fma_desc->super.desc.remote_addr, + fma_desc->super.desc.remote_mem_hndl.qword1, + fma_desc->super.desc.remote_mem_hndl.qword2); UCT_TL_EP_STAT_ATOMIC(ucs_derived_of(tl_ep, uct_base_ep_t)); - return uct_ugni_post_fma(iface, ep, &fma->super, UCS_OK); + return uct_ugni_post_fma(iface, ep, &fma_desc->super, UCS_OK); } ucs_status_t uct_ugni_ep_atomic64_post(uct_ep_h ep, unsigned opcode, uint64_t value, @@ -396,21 +400,21 @@ ucs_status_t 
uct_ugni_ep_atomic_fop64(uct_ep_h tl_ep, uint64_t op, { uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t); uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ugni_rdma_iface_t); - uct_ugni_rdma_fetch_desc_t *fma; + uct_ugni_rdma_fetch_desc_t *fma_desc; - UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma, + UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma_desc, return UCS_ERR_NO_RESOURCE); - uct_ugni_format_fma_amo(fma, GNI_POST_AMO, op_type, - op, 0, fma + 1, remote_addr, + uct_ugni_format_fma_amo(fma_desc, GNI_POST_AMO, op_type, + op, 0, fma_desc + 1, remote_addr, rkey, LEN_64, ep, comp, uct_ugni_amo_unpack64, (void *)result); ucs_trace_data("Posting AMO %s, GNI_PostFma of size %"PRIx64" value" "%"PRIx64" to %p, with [%"PRIx64" %"PRIx64"]", - op_str, fma->super.desc.length, op, - (void *)fma->super.desc.remote_addr, - fma->super.desc.remote_mem_hndl.qword1, - fma->super.desc.remote_mem_hndl.qword2); + op_str, fma_desc->super.desc.length, op, + (void *)fma_desc->super.desc.remote_addr, + fma_desc->super.desc.remote_mem_hndl.qword1, + fma_desc->super.desc.remote_mem_hndl.qword2); UCT_TL_EP_STAT_ATOMIC(ucs_derived_of(tl_ep, uct_base_ep_t)); - return uct_ugni_post_fma(iface, ep, &fma->super, UCS_INPROGRESS); + return uct_ugni_post_fma(iface, ep, &fma_desc->super, UCS_INPROGRESS); } ucs_status_t uct_ugni_ep_atomic64_fetch(uct_ep_h ep, uct_atomic_op_t opcode, @@ -447,21 +451,21 @@ ucs_status_t uct_ugni_ep_atomic_fop32(uct_ep_h tl_ep, uint32_t op, { uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t); uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ugni_rdma_iface_t); - uct_ugni_rdma_fetch_desc_t *fma; + uct_ugni_rdma_fetch_desc_t *fma_desc; - UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma, + UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma_desc, return UCS_ERR_NO_RESOURCE); - uct_ugni_format_fma_amo(fma, GNI_POST_AMO, op_type, - 
(uint64_t)op, 0, fma + 1, remote_addr, + uct_ugni_format_fma_amo(fma_desc, GNI_POST_AMO, op_type, + (uint64_t)op, 0, fma_desc + 1, remote_addr, rkey, LEN_32, ep, comp, uct_ugni_amo_unpack32, (void *)result); ucs_trace_data("Posting AMO %s, GNI_PostFma of size %"PRIx64" value" "%"PRIx32" to %p, with [%"PRIx64" %"PRIx64"]", - op_str, fma->super.desc.length, op, - (void *)fma->super.desc.remote_addr, - fma->super.desc.remote_mem_hndl.qword1, - fma->super.desc.remote_mem_hndl.qword2); + op_str, fma_desc->super.desc.length, op, + (void *)fma_desc->super.desc.remote_addr, + fma_desc->super.desc.remote_mem_hndl.qword1, + fma_desc->super.desc.remote_mem_hndl.qword2); UCT_TL_EP_STAT_ATOMIC(ucs_derived_of(tl_ep, uct_base_ep_t)); - return uct_ugni_post_fma(iface, ep, &fma->super, UCS_INPROGRESS); + return uct_ugni_post_fma(iface, ep, &fma_desc->super, UCS_INPROGRESS); } ucs_status_t uct_ugni_ep_atomic32_fetch(uct_ep_h ep, uct_atomic_op_t opcode, @@ -493,17 +497,17 @@ ucs_status_t uct_ugni_ep_atomic32_fetch(uct_ep_h ep, uct_atomic_op_t opcode, static void uct_ugni_unalign_fma_get_cb(uct_completion_t *self, ucs_status_t status) { - uct_ugni_rdma_fetch_desc_t *fma = (uct_ugni_rdma_fetch_desc_t *) + uct_ugni_rdma_fetch_desc_t *fma_desc = (uct_ugni_rdma_fetch_desc_t *) ucs_container_of(self, uct_ugni_rdma_fetch_desc_t, tmp); /* Call the orignal callback and skip padding */ - fma->super.unpack_cb(fma->user_buffer, (char *)(fma + 1) + fma->padding, - fma->super.desc.length - fma->padding - fma->tail); + fma_desc->super.unpack_cb(fma_desc->user_buffer, (char *)(fma_desc + 1) + fma_desc->padding, + fma_desc->super.desc.length - fma_desc->padding - fma_desc->tail); - uct_ugni_invoke_orig_comp(fma, status); + uct_ugni_invoke_orig_comp(fma_desc, status); } -static inline void uct_ugni_format_get_fma(uct_ugni_rdma_fetch_desc_t *fma, +static inline void uct_ugni_format_get_fma(uct_ugni_rdma_fetch_desc_t *fma_desc, uint64_t remote_addr, uct_rkey_t rkey, unsigned length, uct_ugni_ep_t *ep, 
uct_completion_t *user_comp, @@ -515,19 +519,19 @@ static inline void uct_ugni_format_get_fma(uct_ugni_rdma_fetch_desc_t *fma, void *buffer; unsigned align_length; - fma->padding = ucs_padding_pow2(remote_addr, UGNI_GET_ALIGN); - fma->orig_comp_cb = user_comp; + fma_desc->padding = ucs_padding_pow2(remote_addr, UGNI_GET_ALIGN); + fma_desc->orig_comp_cb = user_comp; /* Make sure that the address is always aligned */ - addr = remote_addr - fma->padding; - buffer = (fma + 1); - fma->user_buffer = arg; + addr = remote_addr - fma_desc->padding; + buffer = (fma_desc + 1); + fma_desc->user_buffer = arg; /* Make sure that the length is always aligned */ - align_length = ucs_check_if_align_pow2(length + fma->padding, UGNI_GET_ALIGN) ? - ucs_align_up_pow2((length + fma->padding), UGNI_GET_ALIGN):length + fma->padding; - fma->tail = align_length - length - fma->padding; + align_length = ucs_check_if_align_pow2(length + fma_desc->padding, UGNI_GET_ALIGN) ? + ucs_align_up_pow2((length + fma_desc->padding), UGNI_GET_ALIGN):length + fma_desc->padding; + fma_desc->tail = align_length - length - fma_desc->padding; ucs_assert(ucs_check_if_align_pow2(addr, UGNI_GET_ALIGN)==0); ucs_assert(ucs_check_if_align_pow2(align_length, UGNI_GET_ALIGN)==0); - uct_ugni_format_fma(&fma->super, GNI_POST_FMA_GET, buffer, addr, rkey, align_length, + uct_ugni_format_fma(&fma_desc->super, GNI_POST_FMA_GET, buffer, addr, rkey, align_length, ep, internal_comp, unpack_cb); } @@ -566,31 +570,31 @@ ucs_status_t uct_ugni_ep_get_bcopy(uct_ep_h tl_ep, { uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t); uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ugni_rdma_iface_t); - uct_ugni_rdma_fetch_desc_t *fma; + uct_ugni_rdma_fetch_desc_t *fma_desc; UCT_SKIP_ZERO_LENGTH(length); UCT_CHECK_LENGTH(ucs_align_up_pow2(length, UGNI_GET_ALIGN), 0, iface->config.fma_seg_size, "get_bcopy"); UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_get_buffer, - fma, return UCS_ERR_NO_RESOURCE); + 
fma_desc, return UCS_ERR_NO_RESOURCE); - fma->tmp.func = uct_ugni_unalign_fma_get_cb; - fma->tmp.count = 1; - uct_ugni_format_get_fma(fma, + fma_desc->tmp.func = uct_ugni_unalign_fma_get_cb; + fma_desc->tmp.count = 1; + uct_ugni_format_get_fma(fma_desc, remote_addr, rkey, length, ep, comp, - &fma->tmp, + &fma_desc->tmp, unpack_cb, arg); ucs_trace_data("Posting GET BCOPY, GNI_PostFma of size %"PRIx64" (%lu) from %p to " "%p, with [%"PRIx64" %"PRIx64"]", - fma->super.desc.length, length, - (void *)fma->super.desc.local_addr, - (void *)fma->super.desc.remote_addr, - fma->super.desc.remote_mem_hndl.qword1, - fma->super.desc.remote_mem_hndl.qword2); + fma_desc->super.desc.length, length, + (void *)fma_desc->super.desc.local_addr, + (void *)fma_desc->super.desc.remote_addr, + fma_desc->super.desc.remote_mem_hndl.qword1, + fma_desc->super.desc.remote_mem_hndl.qword2); UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, BCOPY, length); - return uct_ugni_post_fma(iface, ep, &fma->super, UCS_INPROGRESS); + return uct_ugni_post_fma(iface, ep, &fma_desc->super, UCS_INPROGRESS); } static void assemble_composed_unaligned(uct_completion_t *self, ucs_status_t status) @@ -615,12 +619,12 @@ static void assemble_composed_unaligned(uct_completion_t *self, ucs_status_t sta static void free_composed_desc(void *arg) { uct_ugni_rdma_fetch_desc_t *desc = (uct_ugni_rdma_fetch_desc_t*)arg; - uct_ugni_rdma_fetch_desc_t *fma = ucs_container_of(desc->super.comp_cb, uct_ugni_rdma_fetch_desc_t, tmp); - uct_ugni_rdma_fetch_desc_t *rdma = fma->head; + uct_ugni_rdma_fetch_desc_t *fma_desc = ucs_container_of(desc->super.comp_cb, uct_ugni_rdma_fetch_desc_t, tmp); + uct_ugni_rdma_fetch_desc_t *rdma = fma_desc->head; if (0 == --rdma->tmp.count) { - fma->super.free_cb = rdma->super.free_cb = ucs_mpool_put; - ucs_mpool_put(fma); + fma_desc->super.free_cb = rdma->super.free_cb = ucs_mpool_put; + ucs_mpool_put(fma_desc); ucs_mpool_put(rdma); } } @@ -629,7 +633,7 @@ static ucs_status_t 
uct_ugni_ep_get_composed_fma_rdma(uct_ep_h tl_ep, void *buff uct_mem_h memh, uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp) { - uct_ugni_rdma_fetch_desc_t *fma = NULL; + uct_ugni_rdma_fetch_desc_t *fma_desc = NULL; uct_ugni_rdma_fetch_desc_t *rdma = NULL; uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t); uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ugni_rdma_iface_t); @@ -642,43 +646,43 @@ static ucs_status_t uct_ugni_ep_get_composed_fma_rdma(uct_ep_h tl_ep, void *buff fma_length = iface->config.fma_seg_size; UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_get_buffer, - fma, return UCS_ERR_NO_RESOURCE); + fma_desc, return UCS_ERR_NO_RESOURCE); UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_get, - rdma, {ucs_mpool_put(fma);return UCS_ERR_NO_RESOURCE;}); + rdma, {ucs_mpool_put(fma_desc);return UCS_ERR_NO_RESOURCE;}); rdma_remote_start = remote_addr; fma_remote_start = rdma_remote_start + rdma_length; aligned_fma_remote_start = ucs_align_up_pow2(fma_remote_start, UGNI_GET_ALIGN); /* The FMA completion is used to signal when both descs have completed. 
*/ - fma->tmp.count = 2; - fma->tmp.func = assemble_composed_unaligned; + fma_desc->tmp.count = 2; + fma_desc->tmp.func = assemble_composed_unaligned; /* The RDMA completion is used to signal when both descs have been freed */ rdma->tmp.count = 2; - uct_ugni_format_get_fma(fma, aligned_fma_remote_start, rkey, fma_length, ep, comp, &fma->tmp, NULL, NULL); - fma->tail = aligned_fma_remote_start - fma_remote_start; + uct_ugni_format_get_fma(fma_desc, aligned_fma_remote_start, rkey, fma_length, ep, comp, &fma_desc->tmp, NULL, NULL); + fma_desc->tail = aligned_fma_remote_start - fma_remote_start; uct_ugni_format_unaligned_rdma(rdma, buffer, rdma_remote_start, memh, rkey, - rdma_length+fma->tail, ep, iface->super.local_cq, - &fma->tmp); - fma->head = rdma; - rdma->head = fma; - fma->user_buffer = rdma->user_buffer = buffer; - fma->super.free_cb = rdma->super.free_cb = free_composed_desc; + rdma_length+fma_desc->tail, ep, iface->super.local_cq, + &fma_desc->tmp); + fma_desc->head = rdma; + rdma->head = fma_desc; + fma_desc->user_buffer = rdma->user_buffer = buffer; + fma_desc->super.free_cb = rdma->super.free_cb = free_composed_desc; ucs_trace_data("Posting split GET ZCOPY, GNI_PostFma of size %"PRIx64" (%lu) from %p to " "%p, with [%"PRIx64" %"PRIx64"] and GNI_PostRdma of size %"PRIx64" (%lu)" " from %p to %p, with [%"PRIx64" %"PRIx64"]", - fma->super.desc.length, length, - (void *)fma->super.desc.local_addr, - (void *)fma->super.desc.remote_addr, - fma->super.desc.remote_mem_hndl.qword1, - fma->super.desc.remote_mem_hndl.qword2, + fma_desc->super.desc.length, length, + (void *)fma_desc->super.desc.local_addr, + (void *)fma_desc->super.desc.remote_addr, + fma_desc->super.desc.remote_mem_hndl.qword1, + fma_desc->super.desc.remote_mem_hndl.qword2, rdma->super.desc.length, length, (void *)rdma->super.desc.local_addr, (void *)rdma->super.desc.remote_addr, rdma->super.desc.remote_mem_hndl.qword1, rdma->super.desc.remote_mem_hndl.qword2); 
UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, ZCOPY, length); - post_result = uct_ugni_post_fma(iface, ep, &(fma->super), UCS_INPROGRESS); + post_result = uct_ugni_post_fma(iface, ep, &(fma_desc->super), UCS_INPROGRESS); if(post_result != UCS_OK && post_result != UCS_INPROGRESS){ ucs_mpool_put(rdma); return post_result; diff --git a/src/uct/ugni/rdma/ugni_rdma_iface.c b/src/uct/ugni/rdma/ugni_rdma_iface.c index 1b6ff4f2fc4..69e1ebac557 100644 --- a/src/uct/ugni/rdma/ugni_rdma_iface.c +++ b/src/uct/ugni/rdma/ugni_rdma_iface.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ugni_rdma_ep.h" #include "ugni_rdma_iface.h" #include @@ -12,7 +16,7 @@ static ucs_config_field_t uct_ugni_rdma_iface_config_table[] = { /* This tuning controls the allocation priorities for bouncing buffers */ - { "", "MAX_SHORT=2048;MAX_BCOPY=2048;ALLOC=huge,mmap,heap", NULL, + { "", "ALLOC=huge,mmap,heap", NULL, ucs_offsetof(uct_ugni_rdma_iface_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, UCT_IFACE_MPOOL_CONFIG_FIELDS("RDMA", -1, 0, "rdma", @@ -23,19 +27,12 @@ static ucs_config_field_t uct_ugni_rdma_iface_config_table[] = { {NULL} }; -static ucs_status_t uct_ugni_rdma_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) -{ - return uct_ugni_query_tl_resources(md, UCT_UGNI_RDMA_TL_NAME, - resource_p, num_resources_p); -} - static ucs_status_t uct_ugni_rdma_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_iface, uct_ugni_rdma_iface_t); - memset(iface_attr, 0, sizeof(uct_iface_attr_t)); + uct_base_iface_query(&iface->super.super, iface_attr); + iface_attr->cap.put.max_short = iface->config.fma_seg_size; iface_attr->cap.put.max_bcopy = iface->config.fma_seg_size; iface_attr->cap.put.min_zcopy = 0; @@ -96,11 +93,12 @@ static ucs_status_t uct_ugni_rdma_iface_query(uct_iface_h tl_iface, 
uct_iface_at UCS_BIT(UCT_ATOMIC_OP_SWAP) | UCS_BIT(UCT_ATOMIC_OP_CSWAP); } - iface_attr->overhead = 80e-9; /* 80 ns */ - iface_attr->latency.overhead = 900e-9; /* 900 ns */ - iface_attr->latency.growth = 0; - iface_attr->bandwidth = 6911 * pow(1024,2); /* bytes */ - iface_attr->priority = 0; + iface_attr->overhead = 80e-9; /* 80 ns */ + iface_attr->latency = ucs_linear_func_make(900e-9, 0); /* 900 ns */ + iface_attr->bandwidth.dedicated = 6911.0 * UCS_MBYTE; /* bytes */ + iface_attr->bandwidth.shared = 0; + iface_attr->priority = 0; + return UCS_OK; } @@ -177,6 +175,8 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ugni_rdma_iface_t) ucs_mpool_cleanup(&self->free_desc, 1); } +extern ucs_class_t UCS_CLASS_DECL_NAME(uct_ugni_rdma_iface_t); + static UCS_CLASS_DEFINE_DELETE_FUNC(uct_ugni_rdma_iface_t, uct_iface_t); static uct_iface_ops_t uct_ugni_aries_rdma_iface_ops = { @@ -373,11 +373,6 @@ UCS_CLASS_DEFINE_NEW_FUNC(uct_ugni_rdma_iface_t, uct_iface_t, uct_md_h, uct_worker_h, const uct_iface_params_t*, const uct_iface_config_t*); -UCT_TL_COMPONENT_DEFINE(uct_ugni_rdma_tl_component, - uct_ugni_rdma_query_tl_resources, - uct_ugni_rdma_iface_t, - UCT_UGNI_RDMA_TL_NAME, - "UGNI_RDMA", - uct_ugni_rdma_iface_config_table, - uct_ugni_rdma_iface_config_t); -UCT_MD_REGISTER_TL(&uct_ugni_md_component, &uct_ugni_rdma_tl_component); +UCT_TL_DEFINE(&uct_ugni_component, ugni_rdma, uct_ugni_query_devices, + uct_ugni_rdma_iface_t, "UGNI_RDMA_", + uct_ugni_rdma_iface_config_table, uct_ugni_rdma_iface_config_t); diff --git a/src/uct/ugni/rdma/ugni_rdma_iface.h b/src/uct/ugni/rdma/ugni_rdma_iface.h index f068d023800..c40c74efad9 100644 --- a/src/uct/ugni/rdma/ugni_rdma_iface.h +++ b/src/uct/ugni/rdma/ugni_rdma_iface.h @@ -11,7 +11,7 @@ #include #include -#define UCT_UGNI_RDMA_TL_NAME "ugni_rdma" + #define UCT_UGNI_MAX_FMA 2048 #define UCT_UGNI_MAX_RDMA (512*1024*1024); diff --git a/src/uct/ugni/smsg/ugni_smsg_ep.c b/src/uct/ugni/smsg/ugni_smsg_ep.c index 86b0d03d97a..31b57eea11c 100644 --- 
a/src/uct/ugni/smsg/ugni_smsg_ep.c +++ b/src/uct/ugni/smsg/ugni_smsg_ep.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ugni_smsg_ep.h" #include "ugni_smsg_iface.h" #include @@ -231,7 +235,6 @@ ucs_status_t uct_ugni_smsg_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t head uct_ugni_smsg_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ugni_smsg_iface_t); uct_ugni_smsg_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_smsg_ep_t); uct_ugni_smsg_header_t *smsg_header; - uint64_t *header_data; uct_ugni_smsg_desc_t *desc; ucs_status_t rc; @@ -248,17 +251,18 @@ ucs_status_t uct_ugni_smsg_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t head smsg_header = (uct_ugni_smsg_header_t *)(desc+1); smsg_header->length = length + sizeof(header); - header_data = (uint64_t*)(smsg_header+1); - *header_data = header; - memcpy((void*)(header_data+1), payload, length); + uct_am_short_fill_data(smsg_header + 1, header, payload, length); uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_SEND, - id, header_data, length, "TX: AM_SHORT"); + id, smsg_header + 1, length, "TX: AM_SHORT"); - rc = uct_ugni_smsg_ep_am_common_send(ep, iface, id, sizeof(uct_ugni_smsg_header_t), - smsg_header, smsg_header->length, (void*)header_data, desc); + rc = uct_ugni_smsg_ep_am_common_send(ep, iface, id, + sizeof(uct_ugni_smsg_header_t), + smsg_header, smsg_header->length, + smsg_header + 1, desc); - UCT_TL_EP_STAT_OP_IF_SUCCESS(rc, ucs_derived_of(tl_ep, uct_base_ep_t), AM, SHORT, sizeof(header) + length); + UCT_TL_EP_STAT_OP_IF_SUCCESS(rc, ucs_derived_of(tl_ep, uct_base_ep_t), AM, + SHORT, sizeof(header) + length); return rc; } diff --git a/src/uct/ugni/smsg/ugni_smsg_iface.c b/src/uct/ugni/smsg/ugni_smsg_iface.c index 777ddb1abc1..0f5b53a34ac 100644 --- a/src/uct/ugni/smsg/ugni_smsg_iface.c +++ b/src/uct/ugni/smsg/ugni_smsg_iface.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ugni_smsg_iface.h" #include "ugni_smsg_ep.h" #include @@ -11,7 +15,8 @@ #include #include -#define UCT_UGNI_SMSG_TL_NAME "ugni_smsg" + +extern ucs_class_t UCS_CLASS_DECL_NAME(uct_ugni_smsg_iface_t); static ucs_config_field_t uct_ugni_smsg_iface_config_table[] = { {"", "ALLOC=huge,thp,mmap,heap", NULL, @@ -64,7 +69,7 @@ static void process_mbox(uct_ugni_smsg_iface_t *iface, uct_ugni_smsg_ep_t *ep){ /* Only one thread at a time can process mboxes for the iface. After it's done then everyone's messages have been drained. */ - if (!ucs_spin_trylock(&iface->mbox_lock)) { + if (!ucs_recursive_spin_trylock(&iface->mbox_lock)) { return; } while(1){ @@ -100,7 +105,7 @@ static void process_mbox(uct_ugni_smsg_iface_t *iface, uct_ugni_smsg_ep_t *ep){ break; } } - ucs_spin_unlock(&iface->mbox_lock); + ucs_recursive_spin_unlock(&iface->mbox_lock); } static void uct_ugni_smsg_handle_remote_overflow(uct_ugni_smsg_iface_t *iface){ @@ -183,19 +188,12 @@ static unsigned uct_ugni_smsg_progress(void *arg) return count - 2; } -static ucs_status_t uct_ugni_smsg_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) -{ - return uct_ugni_query_tl_resources(md, UCT_UGNI_SMSG_TL_NAME, - resource_p, num_resources_p); -} - static ucs_status_t uct_ugni_smsg_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { uct_ugni_smsg_iface_t *iface = ucs_derived_of(tl_iface, uct_ugni_smsg_iface_t); - memset(iface_attr, 0, sizeof(uct_iface_attr_t)); + uct_base_iface_query(&iface->super.super, iface_attr); + iface_attr->cap.am.max_short = iface->config.smsg_seg_size-sizeof(uint64_t); iface_attr->cap.am.max_bcopy = iface->config.smsg_seg_size; iface_attr->cap.am.opt_zcopy_align = 1; @@ -211,21 +209,28 @@ static ucs_status_t uct_ugni_smsg_iface_query(uct_iface_h tl_iface, uct_iface_at UCT_IFACE_FLAG_PENDING; iface_attr->overhead = 1e-6; /* 1 usec */ - iface_attr->latency.overhead = 
40e-6; /* 40 usec */ - iface_attr->latency.growth = 0; - iface_attr->bandwidth = pow(1024, 2); /* bytes */ + iface_attr->latency = ucs_linear_func_make(40e-6, 0); /* 40 usec */ + iface_attr->bandwidth.dedicated = 1.0 * UCS_MBYTE; /* bytes */ + iface_attr->bandwidth.shared = 0; iface_attr->priority = 0; + return UCS_OK; } static UCS_CLASS_CLEANUP_FUNC(uct_ugni_smsg_iface_t) { + ucs_status_t status; + uct_worker_progress_remove(self->super.super.worker, &self->super.super.prog); ucs_mpool_cleanup(&self->free_desc, 1); ucs_mpool_cleanup(&self->free_mbox, 1); uct_ugni_destroy_cq(self->remote_cq, &self->super.cdm); - ucs_spinlock_destroy(&self->mbox_lock); + + status = ucs_recursive_spinlock_destroy(&self->mbox_lock); + if (status != UCS_OK) { + ucs_warn("ucs_recursive_spinlock_destroy() failed (%d)", status); + } } static uct_iface_ops_t uct_ugni_smsg_iface_ops = { @@ -289,7 +294,7 @@ static UCS_CLASS_INIT_FUNC(uct_ugni_smsg_iface_t, uct_md_h md, uct_worker_h work smsg_attr.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT; smsg_attr.mbox_maxcredit = self->config.smsg_max_credit; smsg_attr.msg_maxsize = self->config.smsg_seg_size; - status = ucs_spinlock_init(&self->mbox_lock); + status = ucs_recursive_spinlock_init(&self->mbox_lock, 0); if (UCS_OK != status) { goto exit; } @@ -359,7 +364,7 @@ static UCS_CLASS_INIT_FUNC(uct_ugni_smsg_iface_t, uct_md_h md, uct_worker_h work clean_cq: uct_ugni_destroy_cq(self->remote_cq, &self->super.cdm); clean_lock: - ucs_spinlock_destroy(&self->mbox_lock); + ucs_recursive_spinlock_destroy(&self->mbox_lock); exit: uct_ugni_cleanup_base_iface(&self->super); ucs_error("Failed to activate interface"); @@ -371,12 +376,6 @@ UCS_CLASS_DEFINE_NEW_FUNC(uct_ugni_smsg_iface_t, uct_iface_t, uct_md_h, uct_worker_h, const uct_iface_params_t*, const uct_iface_config_t *); -UCT_TL_COMPONENT_DEFINE(uct_ugni_smsg_tl_component, - uct_ugni_smsg_query_tl_resources, - uct_ugni_smsg_iface_t, - UCT_UGNI_SMSG_TL_NAME, - "UGNI_SMSG", - 
uct_ugni_smsg_iface_config_table, - uct_ugni_iface_config_t); - -UCT_MD_REGISTER_TL(&uct_ugni_md_component, &uct_ugni_smsg_tl_component); +UCT_TL_DEFINE(&uct_ugni_component, ugni_smsg, uct_ugni_query_devices, + uct_ugni_smsg_iface_t, "UGNI_SMSG_", + uct_ugni_smsg_iface_config_table, uct_ugni_iface_config_t); diff --git a/src/uct/ugni/smsg/ugni_smsg_iface.h b/src/uct/ugni/smsg/ugni_smsg_iface.h index 876748d0be3..9e28b52b764 100644 --- a/src/uct/ugni/smsg/ugni_smsg_iface.h +++ b/src/uct/ugni/smsg/ugni_smsg_iface.h @@ -16,21 +16,21 @@ #define SMSG_MAX_SIZE 65535 typedef struct uct_ugni_smsg_iface { - uct_ugni_iface_t super; /**< Super type */ - gni_cq_handle_t remote_cq; /**< Remote completion queue */ - ucs_mpool_t free_desc; /**< Pool of FMA descriptors for + uct_ugni_iface_t super; /**< Super type */ + gni_cq_handle_t remote_cq; /**< Remote completion queue */ + ucs_mpool_t free_desc; /**< Pool of FMA descriptors for requests without bouncing buffers */ - ucs_mpool_t free_mbox; /**< Pool of mboxes for use with smsg */ - uint32_t smsg_id; /**< Id number to uniquely identify smsgs in the cq */ + ucs_mpool_t free_mbox; /**< Pool of mboxes for use with smsg */ + uint32_t smsg_id; /**< Id number to uniquely identify smsgs in the cq */ struct { - unsigned smsg_seg_size; /**< Max SMSG size */ - size_t rx_headroom; /**< The size of user defined header for am */ - uint16_t smsg_max_retransmit; - uint16_t smsg_max_credit; /**< Max credits for smsg boxes */ + unsigned smsg_seg_size; /**< Max SMSG size */ + size_t rx_headroom; /**< The size of user defined header for am */ + uint16_t smsg_max_retransmit; + uint16_t smsg_max_credit; /**< Max credits for smsg boxes */ } config; - size_t bytes_per_mbox; - uct_ugni_smsg_desc_t *smsg_list[UCT_UGNI_HASH_SIZE]; /**< A list of descriptors currently outstanding */ - ucs_spinlock_t mbox_lock; /**< Lock for processing SMSG mboxes */ + size_t bytes_per_mbox; + uct_ugni_smsg_desc_t *smsg_list[UCT_UGNI_HASH_SIZE]; /**< A list of 
descriptors currently outstanding */ + ucs_recursive_spinlock_t mbox_lock; /**< Lock for processing SMSG mboxes */ } uct_ugni_smsg_iface_t; typedef struct uct_ugni_smsg_header { diff --git a/src/uct/ugni/udt/ugni_udt_ep.c b/src/uct/ugni/udt/ugni_udt_ep.c index 250b6bb31a4..7905677be1e 100644 --- a/src/uct/ugni/udt/ugni_udt_ep.c +++ b/src/uct/ugni/udt/ugni_udt_ep.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ugni_udt_ep.h" #include "ugni_udt_iface.h" #include @@ -31,29 +35,35 @@ ucs_status_t uct_ugni_udt_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *n, } ucs_arbiter_cb_result_t uct_ugni_udt_ep_process_pending(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { + uct_ugni_ep_t *ep = ucs_container_of(group, uct_ugni_ep_t, + arb_group); + uct_ugni_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_ugni_iface_t); ucs_arbiter_cb_result_t result; - uct_ugni_ep_t *ep = ucs_container_of(ucs_arbiter_elem_group(elem), uct_ugni_ep_t, arb_group); - uct_ugni_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ugni_iface_t); - result = uct_ugni_ep_process_pending(arbiter, elem, arg); + result = uct_ugni_ep_process_pending(arbiter, group, elem, arg); if (UCS_ARBITER_CB_RESULT_REMOVE_ELEM == result) { uct_worker_progress_remove(iface->super.worker, &iface->super.prog); } return result; } -static ucs_arbiter_cb_result_t uct_ugni_udt_ep_abriter_purge_cb(ucs_arbiter_t *arbiter, - ucs_arbiter_elem_t *elem, - void *arg) +static ucs_arbiter_cb_result_t +uct_ugni_udt_ep_abriter_purge_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, + ucs_arbiter_elem_t *elem, void *arg) { - uct_ugni_ep_t *ep = ucs_container_of(ucs_arbiter_elem_group(elem), uct_ugni_ep_t, arb_group); - uct_ugni_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ugni_iface_t); + uct_ugni_ep_t *ep = ucs_container_of(group, uct_ugni_ep_t, + arb_group); + uct_ugni_iface_t *iface = 
ucs_derived_of(ep->super.super.iface, + uct_ugni_iface_t); ucs_arbiter_cb_result_t result; - result = uct_ugni_ep_abriter_purge_cb(arbiter, elem, arg); + result = uct_ugni_ep_abriter_purge_cb(arbiter, group, elem, arg); if (UCS_ARBITER_CB_RESULT_REMOVE_ELEM == result) { uct_worker_progress_remove(iface->super.worker, &iface->super.prog); } @@ -171,17 +181,16 @@ uct_ugni_udt_ep_am_common_send(const unsigned is_short, uct_ugni_udt_ep_t *ep, u sheader = uct_ugni_udt_get_sheader(desc, iface); if (is_short) { - uint64_t *hdr = (uint64_t *)uct_ugni_udt_get_spayload(desc, iface); - *hdr = header; - memcpy((void*)(hdr + 1), payload, length); + uct_am_short_fill_data(uct_ugni_udt_get_spayload(desc, iface), + header, payload, length); sheader->length = length + sizeof(header); - msg_length = sheader->length + sizeof(*sheader); + msg_length = sheader->length + sizeof(*sheader); UCT_TL_EP_STAT_OP(ucs_derived_of(ep, uct_base_ep_t), AM, SHORT, sizeof(header) + length); } else { - packed_length = pack_cb((void *)uct_ugni_udt_get_spayload(desc, iface), - arg); + packed_length = pack_cb((void *)uct_ugni_udt_get_spayload(desc, iface), + arg); sheader->length = packed_length; - msg_length = sheader->length + sizeof(*sheader); + msg_length = sheader->length + sizeof(*sheader); UCT_TL_EP_STAT_OP(ucs_derived_of(ep, uct_base_ep_t), AM, BCOPY, packed_length); } @@ -190,7 +199,7 @@ uct_ugni_udt_ep_am_common_send(const unsigned is_short, uct_ugni_udt_ep_t *ep, u is_short ? 
"TX: AM_SHORT" : "TX: AM_BCOPY"); sheader->am_id = am_id; - sheader->type = UCT_UGNI_UDT_PAYLOAD; + sheader->type = UCT_UGNI_UDT_PAYLOAD; ucs_assertv(msg_length <= GNI_DATAGRAM_MAXSIZE, "msg_length=%u", msg_length); diff --git a/src/uct/ugni/udt/ugni_udt_ep.h b/src/uct/ugni/udt/ugni_udt_ep.h index a071e21849a..acc1f4adce7 100644 --- a/src/uct/ugni/udt/ugni_udt_ep.h +++ b/src/uct/ugni/udt/ugni_udt_ep.h @@ -32,6 +32,7 @@ ssize_t uct_ugni_udt_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, ucs_status_t uct_ugni_udt_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *n, unsigned flags); ucs_arbiter_cb_result_t uct_ugni_udt_ep_process_pending(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg); void uct_ugni_udt_ep_pending_purge(uct_ep_h tl_ep, diff --git a/src/uct/ugni/udt/ugni_udt_iface.c b/src/uct/ugni/udt/ugni_udt_iface.c index 5a6f2217144..76229a2ccf3 100644 --- a/src/uct/ugni/udt/ugni_udt_iface.c +++ b/src/uct/ugni/udt/ugni_udt_iface.c @@ -4,13 +4,18 @@ * See file LICENSE for terms. 
*/ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include "ugni_udt_iface.h" #include "ugni_udt_ep.h" #include #include #include -#define UCT_UGNI_UDT_TL_NAME "ugni_udt" + +extern ucs_class_t UCS_CLASS_DECL_NAME(uct_ugni_udt_iface_t); static ucs_config_field_t uct_ugni_udt_iface_config_table[] = { {"", "ALLOC=huge,thp,mmap,heap", NULL, @@ -155,19 +160,12 @@ static void uct_ugni_udt_iface_release_desc(uct_recv_desc_t *self, void *desc) ucs_mpool_put(ugni_desc); } -static ucs_status_t uct_ugni_udt_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) -{ - return uct_ugni_query_tl_resources(md, UCT_UGNI_UDT_TL_NAME, - resource_p, num_resources_p); -} - static ucs_status_t uct_ugni_udt_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { uct_ugni_udt_iface_t *iface = ucs_derived_of(tl_iface, uct_ugni_udt_iface_t); - memset(iface_attr, 0, sizeof(uct_iface_attr_t)); + uct_base_iface_query(&iface->super.super, iface_attr); + iface_attr->cap.am.max_short = iface->config.udt_seg_size - sizeof(uct_ugni_udt_header_t); iface_attr->cap.am.max_bcopy = iface->config.udt_seg_size - @@ -185,14 +183,15 @@ static ucs_status_t uct_ugni_udt_iface_query(uct_iface_h tl_iface, uct_iface_att UCT_IFACE_FLAG_CB_ASYNC; iface_attr->overhead = 1e-6; /* 1 usec */ - iface_attr->latency.overhead = 40e-6; /* 40 usec */ - iface_attr->latency.growth = 0; - iface_attr->bandwidth = pow(1024, 2); /* bytes */ + iface_attr->latency = ucs_linear_func_make(40e-6, 0); /* 40 usec */ + iface_attr->bandwidth.dedicated = 1.0 * UCS_MBYTE; /* bytes */ + iface_attr->bandwidth.shared = 0; iface_attr->priority = 0; + return UCS_OK; } -void uct_ugni_proccess_datagram_pipe(int event_id, void *arg) { +void uct_ugni_proccess_datagram_pipe(int event_id, int events, void *arg) { uct_ugni_udt_iface_t *iface = (uct_ugni_udt_iface_t *)arg; uct_ugni_udt_ep_t *ep; uct_ugni_udt_desc_t *datagram; @@ -337,7 +336,8 @@ static 
UCS_CLASS_CLEANUP_FUNC(uct_ugni_udt_iface_t) uct_ugni_udt_clean_wildcard(self); ucs_async_remove_handler(ucs_async_pipe_rfd(&self->event_pipe),1); if (self->events_ready) { - uct_ugni_proccess_datagram_pipe(ucs_async_pipe_rfd(&self->event_pipe),self); + uct_ugni_proccess_datagram_pipe(ucs_async_pipe_rfd(&self->event_pipe), + UCS_EVENT_SET_EVREAD, self); } uct_ugni_udt_terminate_thread(self); pthread_join(self->event_thread, &dummy); @@ -441,7 +441,7 @@ static UCS_CLASS_INIT_FUNC(uct_ugni_udt_iface_t, uct_md_h md, uct_worker_h worke status = ucs_async_set_event_handler(self->super.super.worker->async->mode, ucs_async_pipe_rfd(&self->event_pipe), - POLLIN, + UCS_EVENT_SET_EVREAD, uct_ugni_proccess_datagram_pipe, self, self->super.super.worker->async); @@ -487,12 +487,6 @@ UCS_CLASS_DEFINE_NEW_FUNC(uct_ugni_udt_iface_t, uct_iface_t, uct_md_h, uct_worker_h, const uct_iface_params_t*, const uct_iface_config_t*); -UCT_TL_COMPONENT_DEFINE(uct_ugni_udt_tl_component, - uct_ugni_udt_query_tl_resources, - uct_ugni_udt_iface_t, - UCT_UGNI_UDT_TL_NAME, - "UGNI_UDT", - uct_ugni_udt_iface_config_table, - uct_ugni_iface_config_t); - -UCT_MD_REGISTER_TL(&uct_ugni_md_component, &uct_ugni_udt_tl_component); +UCT_TL_DEFINE(&uct_ugni_component, ugni_udt, uct_ugni_query_devices, + uct_ugni_udt_iface_t, "UGNI_UDT_", + uct_ugni_udt_iface_config_table, uct_ugni_iface_config_t); diff --git a/test/apps/Makefile.am b/test/apps/Makefile.am index 9fa9ab94be3..0ccb0fa5a71 100644 --- a/test/apps/Makefile.am +++ b/test/apps/Makefile.am @@ -2,13 +2,19 @@ # Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. # Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. # Copyright (C) ARM Ltd. 2017. ALL RIGHTS RESERVED. -# Copyright (C) Huawei Technologies Co.,Ltd. 2020. ALL RIGHTS RESERVED. +# Copyright (C) Huawei Technologies Co.,Ltd. 2019-2021. ALL RIGHTS RESERVED. # # See file LICENSE for terms. 
# +SUBDIRS = profiling + +if HAVE_GLIBCXX_NOTHROW +SUBDIRS += iodemo +endif + if HAVE_CXX11 -SUBDIRS = sockaddr +SUBDIRS += sockaddr endif noinst_PROGRAMS = \ diff --git a/test/apps/iodemo/Makefile.am b/test/apps/iodemo/Makefile.am new file mode 100644 index 00000000000..15a56ecc3fa --- /dev/null +++ b/test/apps/iodemo/Makefile.am @@ -0,0 +1,23 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. +# +# See file LICENSE for terms. +# + +noinst_PROGRAMS = io_demo + +noinst_HEADERS = \ + ucx_wrapper.h + +io_demo_CXXFLAGS = \ + $(BASE_CXXFLAGS) + +io_demo_CPPFLAGS = $(BASE_CPPFLAGS) + +io_demo_LDADD = \ + $(top_builddir)/src/ucs/libucs.la \ + $(top_builddir)/src/ucp/libucp.la + +io_demo_SOURCES = \ + ucx_wrapper.cc \ + io_demo.cc diff --git a/test/apps/iodemo/io_demo.cc b/test/apps/iodemo/io_demo.cc new file mode 100644 index 00000000000..e866ddc81e7 --- /dev/null +++ b/test/apps/iodemo/io_demo.cc @@ -0,0 +1,1010 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#include "ucx_wrapper.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ALIGNMENT 4096 + + +/* IO operation type */ +typedef enum { + IO_READ, + IO_WRITE, + IO_COMP +} io_op_t; + +static const char *io_op_names[] = { + "read", + "write", + "completion" +}; + +/* test options */ +typedef struct { + const char *server_addr; + int port_num; + long client_retries; + double client_timeout; + double client_runtime_limit; + size_t iomsg_size; + size_t min_data_size; + size_t max_data_size; + size_t chunk_size; + long iter_count; + long window_size; + std::vector operations; + unsigned random_seed; + size_t num_buffers; + bool verbose; +} options_t; + +#define LOG UcxLog("[DEMO]", true) +#define VERBOSE_LOG UcxLog("[DEMO]", _test_opts.verbose) + +template +class MemoryPool { +public: + MemoryPool(size_t buffer_size = 0) : + _num_allocated(0), _buffer_size(buffer_size) { + } + + ~MemoryPool() { + if (_num_allocated != _freelist.size()) { + LOG << "Some items were not freed. 
Total:" << _num_allocated + << ", current:" << _freelist.size() << "."; + } + + for (size_t i = 0; i < _freelist.size(); i++) { + delete _freelist[i]; + } + } + + T * get() { + T * item; + + if (_freelist.empty()) { + item = new T(_buffer_size, this); + _num_allocated++; + } else { + item = _freelist.back(); + _freelist.pop_back(); + } + return item; + } + + void put(T * item) { + _freelist.push_back(item); + } + +private: + std::vector _freelist; + uint32_t _num_allocated; + size_t _buffer_size; +}; + +/** + * Linear congruential generator (LCG): + * n[i + 1] = (n[i] * A + C) % M + * where A, C, M used as in glibc + */ +class IoDemoRandom { +public: + static void srand(unsigned seed) { + _seed = seed & _M; + } + + static inline int rand(int min = std::numeric_limits::min(), + int max = std::numeric_limits::max()) { + _seed = (_seed * _A + _C) & _M; + /* To resolve that LCG returns alternating even/odd values */ + if (max - min == 1) { + return (_seed & 0x100) ? max : min; + } else { + return (int)_seed % (max - min + 1) + min; + } + } + +private: + static unsigned _seed; + static const unsigned _A; + static const unsigned _C; + static const unsigned _M; +}; +unsigned IoDemoRandom::_seed = 0; +const unsigned IoDemoRandom::_A = 1103515245U; +const unsigned IoDemoRandom::_C = 12345U; +const unsigned IoDemoRandom::_M = 0x7fffffffU; + +class P2pDemoCommon : public UcxContext { +protected: + + /* IO request header */ + typedef struct { + io_op_t op; + uint32_t sn; + size_t data_size; + } iomsg_hdr_t; + + typedef enum { + XFER_TYPE_SEND, + XFER_TYPE_RECV + } xfer_type_t; + + /* Asynchronous IO message */ + class IoMessage : public UcxCallback { + public: + IoMessage(size_t buffer_size, MemoryPool* pool) { + _buffer = malloc(buffer_size); + _pool = pool; + _buffer_size = buffer_size; + } + + void init(io_op_t op, uint32_t sn, size_t data_size) { + iomsg_hdr_t *hdr = reinterpret_cast(_buffer); + assert(sizeof(*hdr) <= _buffer_size); + hdr->op = op; + hdr->sn = sn; + 
hdr->data_size = data_size; + } + + ~IoMessage() { + free(_buffer); + } + + virtual void operator()(ucs_status_t status) { + _pool->put(this); + } + + void *buffer() { + return _buffer; + } + + private: + void* _buffer; + MemoryPool* _pool; + size_t _buffer_size; + }; + + P2pDemoCommon(const options_t& test_opts) : + UcxContext(test_opts.iomsg_size), _test_opts(test_opts), + _io_msg_pool(opts().iomsg_size), _cur_buffer_idx(0), _padding(0) { + + _data_buffers.resize(opts().num_buffers); + for (size_t i = 0; i < _data_buffers.size(); ++i) { + std::string &data_buffer = _data_buffers[i]; + data_buffer.resize(opts().max_data_size + ALIGNMENT); + uintptr_t ptr = (uintptr_t)&data_buffer[0]; + _padding = ((ptr + ALIGNMENT - 1) & ~(ALIGNMENT - 1)) - ptr; + } + } + + const options_t& opts() const { + return _test_opts; + } + + inline void *buffer() { + return &_data_buffers[_cur_buffer_idx][_padding]; + } + + inline void *buffer(size_t offset) { + return &_data_buffers[_cur_buffer_idx][_padding + offset]; + } + + inline void next_buffer() { + _cur_buffer_idx = (_cur_buffer_idx + 1) % _data_buffers.size(); + assert(_cur_buffer_idx < opts().num_buffers); + } + + inline size_t get_data_size() { + return IoDemoRandom::rand(opts().min_data_size, + opts().max_data_size); + } + + bool send_io_message(UcxConnection *conn, io_op_t op, + uint32_t sn, size_t data_size) { + IoMessage *m = _io_msg_pool.get(); + m->init(op, sn, data_size); + VERBOSE_LOG << "sending IO " << io_op_names[op] << ", sn " << sn + << " data size " << data_size; + return conn->send_io_message(m->buffer(), opts().iomsg_size, m); + } + + void send_recv_data_as_chunks(UcxConnection* conn, size_t data_size, uint32_t sn, + xfer_type_t send_recv_data, + UcxCallback* callback = EmptyCallback::get()) { + size_t remaining = data_size; + while (remaining > 0) { + size_t xfer_size = std::min(opts().chunk_size, remaining); + if (send_recv_data == XFER_TYPE_SEND) { + conn->send_data(buffer(data_size - remaining), xfer_size, 
sn, callback); + } else { + conn->recv_data(buffer(data_size - remaining), xfer_size, sn, callback); + } + remaining -= xfer_size; + } + } + + void send_data_as_chunks(UcxConnection* conn, size_t data_size, uint32_t sn, + UcxCallback* callback = EmptyCallback::get()) { + send_recv_data_as_chunks(conn, data_size, sn, XFER_TYPE_SEND, callback); + } + + void recv_data_as_chunks(UcxConnection* conn, size_t data_size, uint32_t sn, + UcxCallback* callback = EmptyCallback::get()) { + send_recv_data_as_chunks(conn, data_size, sn, XFER_TYPE_RECV, callback); + } + + uint32_t get_chunk_cnt(size_t data_size) { + return (data_size + opts().chunk_size - 1) / opts().chunk_size; + } + + void send_data(UcxConnection* conn, size_t data_size, uint32_t sn, + UcxCallback* callback = EmptyCallback::get()) { + send_data_as_chunks(conn, data_size, sn, callback); + } + + void recv_data(UcxConnection* conn, size_t data_size, uint32_t sn, + UcxCallback* callback = EmptyCallback::get()) { + recv_data_as_chunks(conn, data_size, sn, callback); + } + +protected: + const options_t _test_opts; + MemoryPool _io_msg_pool; + +private: + std::vector _data_buffers; + size_t _cur_buffer_idx; + size_t _padding; +}; + + +class DemoServer : public P2pDemoCommon { +public: + // sends an IO response when done + class IoWriteResponseCallback : public UcxCallback { + public: + IoWriteResponseCallback(size_t buffer_size, + MemoryPool* pool) : + _server(NULL), _conn(NULL), _sn(0), _data_size(0), _chunk_cnt(0) { + _pool = pool; + } + + void init(DemoServer *server, UcxConnection* conn, uint32_t sn, + size_t data_size, uint32_t chunk_cnt = 1) { + _server = server; + _conn = conn; + _sn = sn; + _data_size = data_size; + _chunk_cnt = chunk_cnt; + } + + virtual void operator()(ucs_status_t status) { + if (--_chunk_cnt > 0) { + return; + } + if (status == UCS_OK) { + _server->send_io_message(_conn, IO_COMP, _sn, _data_size); + } + _pool->put(this); + } + + private: + DemoServer* _server; + UcxConnection* _conn; + 
uint32_t _sn; + size_t _data_size; + uint32_t _chunk_cnt; + MemoryPool* _pool; + }; + + DemoServer(const options_t& test_opts) : + P2pDemoCommon(test_opts), _callback_pool(0) { + } + + void run() { + struct sockaddr_in listen_addr; + memset(&listen_addr, 0, sizeof(listen_addr)); + listen_addr.sin_family = AF_INET; + listen_addr.sin_addr.s_addr = INADDR_ANY; + listen_addr.sin_port = htons(opts().port_num); + + listen((const struct sockaddr*)&listen_addr, sizeof(listen_addr)); + for (;;) { + try { + progress(); + } catch (const std::exception &e) { + std::cerr << e.what(); + } + } + } + + void handle_io_read_request(UcxConnection* conn, const iomsg_hdr_t *hdr) { + // send data + VERBOSE_LOG << "sending IO read data"; + assert(opts().max_data_size >= hdr->data_size); + + send_data(conn, hdr->data_size, hdr->sn); + + // send response as data + VERBOSE_LOG << "sending IO read response"; + IoMessage *response = _io_msg_pool.get(); + response->init(IO_COMP, hdr->sn, 0); + conn->send_data(response->buffer(), opts().iomsg_size, hdr->sn, + response); + + next_buffer(); + } + + void handle_io_write_request(UcxConnection* conn, const iomsg_hdr_t *hdr) { + VERBOSE_LOG << "receiving IO write data"; + assert(opts().max_data_size >= hdr->data_size); + assert(hdr->data_size != 0); + + IoWriteResponseCallback *w = _callback_pool.get(); + w->init(this, conn, hdr->sn, hdr->data_size, get_chunk_cnt(hdr->data_size)); + recv_data(conn, hdr->data_size, hdr->sn, w); + + next_buffer(); + } + + virtual void dispatch_connection_error(UcxConnection *conn) { + LOG << "deleting connection " << conn; + delete conn; + } + + virtual void dispatch_io_message(UcxConnection* conn, const void *buffer, + size_t length) { + const iomsg_hdr_t *hdr = reinterpret_cast(buffer); + + VERBOSE_LOG << "got io message " << io_op_names[hdr->op] << " sn " + << hdr->sn << " data size " << hdr->data_size << " conn " + << conn; + + if (hdr->op == IO_READ) { + handle_io_read_request(conn, hdr); + } else if (hdr->op == 
IO_WRITE) { + handle_io_write_request(conn, hdr); + } else { + LOG << "Invalid opcode: " << hdr->op; + } + } +protected: + MemoryPool _callback_pool; +}; + + +class DemoClient : public P2pDemoCommon { +public: + class IoReadResponseCallback : public UcxCallback { + public: + IoReadResponseCallback(size_t buffer_size, + MemoryPool* pool) : + _counter(0), _io_counter(0), _chunk_cnt(0) { + _buffer = malloc(buffer_size); + _pool = pool; + } + + void init(long *counter, uint32_t chunk_cnt = 1) { + _counter = 0; + _io_counter = counter; + _chunk_cnt = chunk_cnt; + } + + ~IoReadResponseCallback() { + free(_buffer); + } + + virtual void operator()(ucs_status_t status) { + /* wait data and response completion */ + if (++_counter < (1 + _chunk_cnt)) { + return; + } + + ++(*_io_counter); + _pool->put(this); + } + + void* buffer() { + return _buffer; + } + + private: + long _counter; + long* _io_counter; + uint32_t _chunk_cnt; + void* _buffer; + MemoryPool* _pool; + }; + + DemoClient(const options_t& test_opts) : + P2pDemoCommon(test_opts), + _num_sent(0), _num_completed(0), _status(OK), _start_time(get_time()), + _retry(0), _callback_pool(opts().iomsg_size) { + _status_str[OK] = "ok"; + _status_str[ERROR] = "error"; + _status_str[RUNTIME_EXCEEDED] = "run-time exceeded"; + _status_str[CONN_RETRIES_EXCEEDED] = "connection retries exceeded"; + } + + typedef enum { + OK, + ERROR, + RUNTIME_EXCEEDED, + CONN_RETRIES_EXCEEDED + } status_t; + + size_t do_io_read(UcxConnection *conn, uint32_t sn) { + size_t data_size = get_data_size(); + + if (!send_io_message(conn, IO_READ, sn, data_size)) { + return data_size; + } + + ++_num_sent; + IoReadResponseCallback *r = _callback_pool.get(); + r->init(&_num_completed, get_chunk_cnt(data_size)); + recv_data(conn, data_size, sn, r); + conn->recv_data(r->buffer(), opts().iomsg_size, sn, r); + next_buffer(); + + return data_size; + } + + size_t do_io_write(UcxConnection *conn, uint32_t sn) { + size_t data_size = get_data_size(); + + if 
(!send_io_message(conn, IO_WRITE, sn, data_size)) { + return data_size; + } + + ++_num_sent; + VERBOSE_LOG << "sending data " << buffer() << " size " + << data_size << " sn " << sn; + send_data(conn, data_size, sn); + next_buffer(); + + return data_size; + } + + virtual void dispatch_io_message(UcxConnection* conn, const void *buffer, + size_t length) { + const iomsg_hdr_t *hdr = reinterpret_cast(buffer); + + VERBOSE_LOG << "got io message " << io_op_names[hdr->op] << " sn " + << hdr->sn << " data size " << hdr->data_size + << " conn " << conn; + + if (hdr->op == IO_COMP) { + ++_num_completed; + } + } + + virtual void dispatch_connection_error(UcxConnection *conn) { + LOG << "setting error flag on connection " << conn; + _status = ERROR; + } + + bool wait_for_responses(long max_outstanding) { + struct timeval tv_start = {}; + bool timer_started = false; + struct timeval tv_curr, tv_diff; + long count; + + count = 0; + while (((_num_sent - _num_completed) > max_outstanding) && (_status == OK)) { + if (count < 1000) { + progress(); + ++count; + continue; + } + + count = 0; + + gettimeofday(&tv_curr, NULL); + + if (!timer_started) { + tv_start = tv_curr; + timer_started = true; + continue; + } + + timersub(&tv_curr, &tv_start, &tv_diff); + double elapsed = tv_diff.tv_sec + (tv_diff.tv_usec * 1e-6); + if (elapsed > _test_opts.client_timeout * 10) { + LOG << "timeout waiting for " << (_num_sent - _num_completed) + << " replies"; + _status = ERROR; + } + } + + return (_status == OK); + } + + UcxConnection* connect() { + struct sockaddr_in connect_addr; + memset(&connect_addr, 0, sizeof(connect_addr)); + connect_addr.sin_family = AF_INET; + connect_addr.sin_port = htons(opts().port_num); + inet_pton(AF_INET, opts().server_addr, &connect_addr.sin_addr); + + return UcxContext::connect((const struct sockaddr*)&connect_addr, + sizeof(connect_addr)); + } + + static double get_time() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec + (tv.tv_usec * 1e-6); + } 
+ + static std::string get_time_str() { + char str[80]; + struct timeval tv; + gettimeofday(&tv, NULL); + snprintf(str, sizeof(str), "[%lu.%06lu]", tv.tv_sec, tv.tv_usec); + return str; + } + + bool run() { + UcxConnection* conn = connect(); + if (!conn) { + return false; + } + + _status = OK; + + // TODO reset these values by canceling requests + _num_sent = 0; + _num_completed = 0; + + double prev_time = get_time(); + long total_iter = 0; + long total_prev_iter = 0; + std::vector info; + + for (int i = 0; i < IO_COMP; ++i) { + op_info_t op_info = {static_cast(i), 0, 0}; + info.push_back(op_info); + } + + while ((total_iter < opts().iter_count) && (_status == OK)) { + VERBOSE_LOG << " <<<< iteration " << total_iter << " >>>>"; + + if (!wait_for_responses(opts().window_size - 1)) { + break; + } + + io_op_t op = get_op(); + size_t size; + switch (op) { + case IO_READ: + size = do_io_read(conn, total_iter); + break; + case IO_WRITE: + size = do_io_write(conn, total_iter); + break; + default: + abort(); + } + + info[op].total_bytes += size; + info[op].num_iters++; + + if (((total_iter % 10) == 0) && (total_iter > total_prev_iter)) { + double curr_time = get_time(); + if (curr_time >= (prev_time + 1.0)) { + if (!wait_for_responses(0)) { + break; + } + + report_performance(total_iter - total_prev_iter, + curr_time - prev_time, info); + + total_prev_iter = total_iter; + prev_time = curr_time; + + check_time_limit(curr_time); + } + } + + ++total_iter; + } + + if (wait_for_responses(0)) { + double curr_time = get_time(); + report_performance(total_iter - total_prev_iter, + curr_time - prev_time, info); + check_time_limit(curr_time); + } + + delete conn; + return (_status == OK) || (_status == RUNTIME_EXCEEDED); + } + + // returns true if number of connection retries is exceeded + bool update_retry() { + if (++_retry >= opts().client_retries) { + /* client failed all retries */ + _status = CONN_RETRIES_EXCEEDED; + return true; + } + + LOG << "retry " << _retry << "/" << 
opts().client_retries + << " in " << opts().client_timeout << " seconds"; + usleep((int)(1e6 * opts().client_timeout)); + return false; + } + + status_t get_status() const { + return _status; + } + + const std::string& get_status_str() { + return _status_str[_status]; + } + +private: + typedef struct { + io_op_t op; + long num_iters; + size_t total_bytes; + } op_info_t; + + inline io_op_t get_op() { + if (opts().operations.size() == 1) { + return opts().operations[0]; + } + + return opts().operations[IoDemoRandom::rand( + 0, opts().operations.size() - 1)]; + } + + inline void check_time_limit(double current_time) { + if ((_status == OK) && + ((current_time - _start_time) >= opts().client_runtime_limit)) { + _status = RUNTIME_EXCEEDED; + } + } + + void report_performance(long num_iters, double elapsed, + std::vector &info) { + if (num_iters == 0) { + return; + } + + double latency_usec = (elapsed / num_iters) * 1e6; + bool first_print = true; + + for (unsigned i = 0; i < info.size(); ++i) { + op_info_t *op_info = &info[i]; + + if (!op_info->total_bytes) { + continue; + } + + if (first_print) { + std::cout << get_time_str() << " "; + first_print = false; + } else { + // print comma for non-first printouts + std::cout << ", "; + } + + double throughput_mbs = op_info->total_bytes / + elapsed / (1024.0 * 1024.0); + + std::cout << op_info->num_iters << " " + << io_op_names[op_info->op] << "s at " + << throughput_mbs << " MB/s"; + + // reset for the next round + op_info->total_bytes = 0; + op_info->num_iters = 0; + } + + if (!first_print) { + if (opts().window_size == 1) { + std::cout << ", average latency: " << latency_usec << " usec"; + } + std::cout << std::endl; + } + } + +private: + long _num_sent; + long _num_completed; + status_t _status; + std::map _status_str; + double _start_time; + unsigned _retry; +protected: + MemoryPool _callback_pool; +}; + +static int set_data_size(char *str, options_t *test_opts) +{ + const static char token = ':'; + char *val1, *val2; + 
+ if (strchr(str, token) == NULL) { + test_opts->min_data_size = + test_opts->max_data_size = strtol(str, NULL, 0); + return 0; + } + + val1 = strtok(str, ":"); + val2 = strtok(NULL, ":"); + + if ((val1 != NULL) && (val2 != NULL)) { + test_opts->min_data_size = strtol(val1, NULL, 0); + test_opts->max_data_size = strtol(val2, NULL, 0); + } else if (val1 != NULL) { + if (str[0] == ':') { + test_opts->min_data_size = 0; + test_opts->max_data_size = strtol(val1, NULL, 0); + } else { + test_opts->min_data_size = strtol(val1, NULL, 0); + } + } else { + return -1; + } + + return 0; +} + +static int set_time(char *str, double *dest_p) +{ + char units[3] = ""; + int num_fields; + double value; + double per_sec; + + if (!strcmp(str, "inf")) { + *dest_p = std::numeric_limits::max(); + return 0; + } + + num_fields = sscanf(str, "%lf%c%c", &value, &units[0], &units[1]); + if (num_fields == 1) { + per_sec = 1; + } else if ((num_fields == 2) || (num_fields == 3)) { + if (!strcmp(units, "h")) { + per_sec = 1.0 / 3600.0; + } else if (!strcmp(units, "m")) { + per_sec = 1.0 / 60.0; + } else if (!strcmp(units, "s")) { + per_sec = 1; + } else if (!strcmp(units, "ms")) { + per_sec = 1e3; + } else if (!strcmp(units, "us")) { + per_sec = 1e6; + } else if (!strcmp(units, "ns")) { + per_sec = 1e9; + } else { + return -1; + } + } else { + return -1; + } + + *(double*)dest_p = value / per_sec; + return 0; +} + +static int parse_args(int argc, char **argv, options_t *test_opts) +{ + char *str; + bool found; + int c; + + test_opts->server_addr = NULL; + test_opts->port_num = 1337; + test_opts->client_retries = std::numeric_limits::max(); + test_opts->client_timeout = 1.0; + test_opts->client_runtime_limit = std::numeric_limits::max(); + test_opts->min_data_size = 4096; + test_opts->max_data_size = 4096; + test_opts->chunk_size = std::numeric_limits::max(); + test_opts->num_buffers = 1; + test_opts->iomsg_size = 256; + test_opts->iter_count = 1000; + test_opts->window_size = 1; + 
test_opts->random_seed = std::time(NULL); + test_opts->verbose = false; + + while ((c = getopt(argc, argv, "p:c:r:d:b:i:w:k:o:t:l:s:v")) != -1) { + switch (c) { + case 'p': + test_opts->port_num = atoi(optarg); + break; + case 'c': + if (strcmp(optarg, "inf")) { + test_opts->client_retries = strtol(optarg, NULL, 0); + } + break; + case 'r': + test_opts->iomsg_size = strtol(optarg, NULL, 0); + break; + case 'd': + if (set_data_size(optarg, test_opts) == -1) { + std::cout << "invalid data size range '" << optarg << "'" << std::endl; + return -1; + } + break; + case 'b': + test_opts->num_buffers = strtol(optarg, NULL, 0); + if (test_opts->num_buffers == 0) { + std::cout << "number of buffers ('" << optarg << "')" + << " has to be > 0" << std::endl; + return -1; + } + break; + case 'i': + test_opts->iter_count = strtol(optarg, NULL, 0); + break; + case 'w': + test_opts->window_size = atoi(optarg); + break; + case 'k': + test_opts->chunk_size = strtol(optarg, NULL, 0); + break; + case 'o': + str = strtok(optarg, ","); + while (str != NULL) { + found = false; + + for (int op_it = 0; op_it < IO_COMP; ++op_it) { + if (!strcmp(io_op_names[op_it], str)) { + io_op_t op = static_cast(op_it); + if (std::find(test_opts->operations.begin(), + test_opts->operations.end(), + op) == test_opts->operations.end()) { + test_opts->operations.push_back(op); + } + found = true; + } + } + + if (!found) { + std::cout << "invalid operation name '" << str << "'" << std::endl; + return -1; + } + + str = strtok(NULL, ","); + } + + if (test_opts->operations.size() == 0) { + std::cout << "no operation names were provided '" << optarg << "'" << std::endl; + return -1; + } + break; + case 't': + if (set_time(optarg, &test_opts->client_timeout) != 0) { + std::cout << "invalid '" << optarg << "' value for client timeout" << std::endl; + return -1; + } + break; + case 'l': + if (set_time(optarg, &test_opts->client_runtime_limit) != 0) { + std::cout << "invalid '" << optarg << "' value for client 
run-time limit" << std::endl; + return -1; + } + break; + case 's': + test_opts->random_seed = strtoul(optarg, NULL, 0); + break; + case 'v': + test_opts->verbose = true; + break; + case 'h': + default: + std::cout << "Usage: io_demo [options] [server_address]" << std::endl; + std::cout << "" << std::endl; + std::cout << "Supported options are:" << std::endl; + std::cout << " -p TCP port number to use" << std::endl; + std::cout << " -o Comma-separated string of IO operations [read|write]" << std::endl; + std::cout << " NOTE: if using several IO operations, performance" << std::endl; + std::cout << " measurments may be inaccurate" << std::endl; + std::cout << " -d : Range that should be used to get data" << std::endl; + std::cout << " size of IO payload" << std::endl; + std::cout << " -b Number of IO buffers to use for communications" << std::endl; + std::cout << " -i Number of iterations to run communication" << std::endl; + std::cout << " -w Number of outstanding requests" << std::endl; + std::cout << " -k Split the data transfer to chunks of this size" << std::endl; + std::cout << " -r Size of IO request packet" << std::endl; + std::cout << " -c Number of connection retries on client" << std::endl; + std::cout << " (or \"inf\") for failure" << std::endl; + std::cout << " -t Client timeout (or \"inf\")" << std::endl; + std::cout << " -l Time limit to run the IO client (or \"inf\")" << std::endl; + std::cout << " Examples: -l 17.5s; -l 10m; 15.5h" << std::endl; + std::cout << " -s Random seed to use for randomizing" << std::endl; + std::cout << " -v Set verbose mode" << std::endl; + std::cout << "" << std::endl; + return -1; + } + } + + if (optind < argc) { + test_opts->server_addr = argv[optind]; + } + + if (test_opts->operations.size() == 0) { + test_opts->operations.push_back(IO_WRITE); + } + + return 0; +} + +static int do_server(const options_t& test_opts) +{ + DemoServer server(test_opts); + if (!server.init()) { + return -1; + } + + server.run(); + return 0; 
+} + +static int do_client(const options_t& test_opts) +{ + IoDemoRandom::srand(test_opts.random_seed); + LOG << "random seed: " << test_opts.random_seed; + + DemoClient client(test_opts); + if (!client.init()) { + return -1; + } + + for (;;) { + if (client.run()) { + /* successful run */ + break; + } + + if (client.update_retry()) { + break; + } + } + + DemoClient::status_t status = client.get_status(); + LOG << "client exit with \"" << client.get_status_str() << "\" status"; + return ((status == DemoClient::OK) || + (status == DemoClient::RUNTIME_EXCEEDED)) ? 0 : -1; +} + +int main(int argc, char **argv) +{ + options_t test_opts; + int ret; + + ret = parse_args(argc, argv, &test_opts); + if (ret < 0) { + return ret; + } + + if (test_opts.server_addr == NULL) { + return do_server(test_opts); + } else { + return do_client(test_opts); + } +} diff --git a/test/apps/iodemo/ucx_wrapper.cc b/test/apps/iodemo/ucx_wrapper.cc new file mode 100644 index 00000000000..f6ec2e1d7d5 --- /dev/null +++ b/test/apps/iodemo/ucx_wrapper.cc @@ -0,0 +1,707 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#include "ucx_wrapper.h" + +#include +#include +#include +#include +#include + +#include + + +struct ucx_request { + UcxCallback *callback; + UcxConnection *conn; + ucs_status_t status; + bool completed; + uint32_t conn_id; + size_t recv_length; + ucs_list_link_t pos; +}; + +UcxCallback::~UcxCallback() +{ +} + +void EmptyCallback::operator()(ucs_status_t status) +{ +} + +EmptyCallback* EmptyCallback::get() { + // singleton + static EmptyCallback instance; + return &instance; +} + +UcxLog::UcxLog(const char* prefix, bool enable) : _enable(enable) +{ + if (enable) { + std::cout << prefix << " "; + } +} + +UcxLog::~UcxLog() +{ + if (_enable) { + std::cout << std::endl; + } +} + +#define UCX_LOG UcxLog("[UCX]", true) + +UcxContext::UcxContext(size_t iomsg_size) : + _context(NULL), _worker(NULL), _listener(NULL), _iomsg_recv_request(NULL), + _iomsg_buffer(iomsg_size, '\0') +{ +} + +UcxContext::~UcxContext() +{ + destroy_connections(); + destroy_listener(); + destroy_worker(); + if (_context) { + ucp_cleanup(_context); + } +} + +bool UcxContext::init() +{ + if (_context && _worker) { + UCX_LOG << "context is already initialized"; + return true; + } + + /* Create context */ + ucp_params_t ucp_params; + ucp_params.field_mask = UCP_PARAM_FIELD_FEATURES | + UCP_PARAM_FIELD_REQUEST_INIT | + UCP_PARAM_FIELD_REQUEST_SIZE; + ucp_params.features = UCP_FEATURE_TAG | + UCP_FEATURE_STREAM; + ucp_params.request_init = request_init; + ucp_params.request_size = sizeof(ucx_request); + ucs_status_t status = ucp_init(&ucp_params, NULL, &_context); + if (status != UCS_OK) { + UCX_LOG << "ucp_init() failed: " << ucs_status_string(status); + return false; + } + + UCX_LOG << "created context " << _context; + + /* Create worker */ + ucp_worker_params_t worker_params; + worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; + worker_params.thread_mode = UCS_THREAD_MODE_SINGLE; + status = ucp_worker_create(_context, &worker_params, &_worker); + if (status != UCS_OK) { + 
ucp_cleanup(_context); + UCX_LOG << "ucp_worker_create() failed: " << ucs_status_string(status); + return false; + } + + UCX_LOG << "created worker " << _worker; + + recv_io_message(); + return true; +} + +bool UcxContext::listen(const struct sockaddr* saddr, size_t addrlen) +{ + ucp_listener_params_t listener_params; + + listener_params.field_mask = UCP_LISTENER_PARAM_FIELD_SOCK_ADDR | + UCP_LISTENER_PARAM_FIELD_CONN_HANDLER; + listener_params.sockaddr.addr = saddr; + listener_params.sockaddr.addrlen = addrlen; + listener_params.conn_handler.cb = connect_callback; + listener_params.conn_handler.arg = reinterpret_cast(this); + + ucs_status_t status = ucp_listener_create(_worker, &listener_params, + &_listener); + if (status != UCS_OK) { + UCX_LOG << "ucp_listener_create() failed: " << ucs_status_string(status); + return false; + } + + UCX_LOG << "started listener " << _listener << " on " + << sockaddr_str(saddr, addrlen); + return true; +} + +UcxConnection* UcxContext::connect(const struct sockaddr* saddr, size_t addrlen) +{ + UcxConnection *conn = new UcxConnection(*this, get_next_conn_id()); + if (!conn->connect(saddr, addrlen)) { + delete conn; + return NULL; + } + + add_connection(conn); + return conn; +} + +void UcxContext::progress() +{ + ucp_worker_progress(_worker); + progress_io_message(); + progress_conn_requests(); + progress_failed_connections(); +} + +void UcxContext::dispatch_new_connection(UcxConnection *conn) +{ + // To be implemented in a subclass +} + +void UcxContext::dispatch_io_message(UcxConnection* conn, const void *buffer, + size_t length) +{ + // To be implemented in a subclass +} + +void UcxContext::dispatch_connection_error(UcxConnection* conn) +{ + // To be implemented in a subclass +} + +uint32_t UcxContext::get_next_conn_id() +{ + static uint32_t conn_id = 1; + return conn_id++; +} + +void UcxContext::request_init(void *request) +{ + ucx_request *r = reinterpret_cast(request); + request_reset(r); +} + +void 
UcxContext::request_reset(ucx_request *r) +{ + r->completed = false; + r->callback = NULL; + r->conn = NULL; + r->recv_length = 0; + r->pos.next = NULL; + r->pos.prev = NULL; +} + +void UcxContext::request_release(void *request) +{ + request_reset(reinterpret_cast(request)); + ucp_request_free(request); +} + +void UcxContext::connect_callback(ucp_conn_request_h conn_req, void *arg) +{ + UcxContext *self = reinterpret_cast(arg); + UCX_LOG << "got new connection request " << conn_req; + self->_conn_requests.push_back(conn_req); +} + +void UcxContext::iomsg_recv_callback(void *request, ucs_status_t status, + ucp_tag_recv_info *info) +{ + ucx_request *r = reinterpret_cast(request); + r->completed = true; + r->conn_id = (info->sender_tag & ~IOMSG_TAG) >> 32; + r->recv_length = info->length; +} + +const std::string UcxContext::sockaddr_str(const struct sockaddr* saddr, + size_t addrlen) +{ + char buf[128]; + int port; + + if (saddr->sa_family != AF_INET) { + return ""; + } + + struct sockaddr_storage addr = {0}; + memcpy(&addr, saddr, addrlen); + + inet_ntop(AF_INET, &((struct sockaddr_in*)&addr)->sin_addr, + buf, sizeof(buf)); + port = ntohs(((struct sockaddr_in*)&addr)->sin_port); + + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ":%d", port); + return buf; +} + +ucp_worker_h UcxContext::worker() const +{ + return _worker; +} + +void UcxContext::progress_conn_requests() +{ + while (!_conn_requests.empty()) { + UcxConnection *conn = new UcxConnection(*this, get_next_conn_id()); + if (conn->accept(_conn_requests.front())) { + add_connection(conn); + dispatch_new_connection(conn); + } else { + delete conn; + } + _conn_requests.pop_front(); + } +} + +void UcxContext::progress_io_message() +{ + if (!_iomsg_recv_request->completed) { + return; + } + + uint32_t conn_id = _iomsg_recv_request->conn_id; + conn_map_t::iterator iter = _conns.find(conn_id); + if (iter == _conns.end()) { + UCX_LOG << "could not find connection with id " << conn_id; + } else { + 
UcxConnection *conn = iter->second; + dispatch_io_message(conn, &_iomsg_buffer[0], + _iomsg_recv_request->recv_length); + } + request_release(_iomsg_recv_request); + recv_io_message(); +} + +void UcxContext::progress_failed_connections() +{ + while (!_failed_conns.empty()) { + UcxConnection *conn = _failed_conns.front(); + _failed_conns.pop_front(); + dispatch_connection_error(conn); + } +} + +UcxContext::wait_status_t +UcxContext::wait_completion(ucs_status_ptr_t status_ptr, double timeout) +{ + if (status_ptr == NULL) { + return WAIT_STATUS_OK; + } else if (UCS_PTR_IS_PTR(status_ptr)) { + ucx_request *request = (ucx_request*)UCS_STATUS_PTR(status_ptr); + ucs_status_t status; + struct timeval tv_start; + gettimeofday(&tv_start, NULL); + do { + struct timeval tv_current, elapsed; + gettimeofday(&tv_current, NULL); + timersub(&tv_current, &tv_start, &elapsed); + if (elapsed.tv_sec + (elapsed.tv_usec * 1e-6) > timeout) { + return WAIT_STATUS_TIMED_OUT; + } + + ucp_worker_progress(_worker); + status = ucp_request_check_status(request); + } while (status == UCS_INPROGRESS); + request_release(request); + return (status == UCS_OK) ? 
WAIT_STATUS_OK : WAIT_STATUS_FAILED; + } else { + assert(UCS_PTR_IS_ERR(status_ptr)); + return WAIT_STATUS_FAILED; + } +} + +void UcxContext::recv_io_message() +{ + ucs_status_ptr_t status_ptr = ucp_tag_recv_nb(_worker, &_iomsg_buffer[0], + _iomsg_buffer.size(), + ucp_dt_make_contig(1), + IOMSG_TAG, IOMSG_TAG, + iomsg_recv_callback); + assert(status_ptr != NULL); + _iomsg_recv_request = reinterpret_cast(status_ptr); +} + +void UcxContext::add_connection(UcxConnection *conn) +{ + _conns[conn->id()] = conn; +} + +void UcxContext::remove_connection(UcxConnection *conn) +{ + _conns.erase(conn->id()); +} + +void UcxContext::handle_connection_error(UcxConnection *conn) +{ + remove_connection(conn); + _failed_conns.push_back(conn); +} + +void UcxContext::destroy_connections() +{ + while (!_conn_requests.empty()) { + UCX_LOG << "reject connection request " << _conn_requests.front(); + ucp_listener_reject(_listener, _conn_requests.front()); + _conn_requests.pop_front(); + } + + for (conn_map_t::iterator iter = _conns.begin(); iter != _conns.end(); ++iter) { + delete iter->second; + + } + _conns.clear(); +} + +void UcxContext::destroy_listener() +{ + if (_listener) { + ucp_listener_destroy(_listener); + } +} + +void UcxContext::destroy_worker() +{ + if (!_worker) { + return; + } + + if (_iomsg_recv_request != NULL) { + ucp_request_cancel(_worker, _iomsg_recv_request); + wait_completion(_iomsg_recv_request); + } + + ucp_worker_destroy(_worker); +} + + +#define UCX_CONN_LOG UcxLog(_log_prefix, true) + +unsigned UcxConnection::_num_instances = 0; + +UcxConnection::UcxConnection(UcxContext &context, uint32_t conn_id) : + _context(context), _conn_id(conn_id), _remote_conn_id(0), + _ep(0), _close_request(NULL) +{ + ++_num_instances; + struct sockaddr_in in_addr = {0}; + in_addr.sin_family = AF_INET; + set_log_prefix((const struct sockaddr*)&in_addr, sizeof(in_addr)); + ucs_list_head_init(&_all_requests); + UCX_CONN_LOG << "created new connection, total: " << _num_instances; +} + 
+UcxConnection::~UcxConnection() +{ + // if _ep is NULL, connection was closed and removed by error handler + if (_ep != NULL) { + disconnect(UCP_EP_CLOSE_MODE_FORCE); + } + + if (_close_request) { + _context.wait_completion(_close_request); + } + + // wait until all requests are completed + if (!ucs_list_is_empty(&_all_requests)) { + UCX_CONN_LOG << "waiting for " << ucs_list_length(&_all_requests) << + " uncompleted requests"; + } + while (!ucs_list_is_empty(&_all_requests)) { + ucp_worker_progress(_context.worker()); + } + + UCX_CONN_LOG << "released"; + --_num_instances; +} + +bool UcxConnection::connect(const struct sockaddr* saddr, socklen_t addrlen) +{ + set_log_prefix(saddr, addrlen); + + ucp_ep_params_t ep_params; + ep_params.field_mask = UCP_EP_PARAM_FIELD_FLAGS | + UCP_EP_PARAM_FIELD_SOCK_ADDR; + ep_params.flags = UCP_EP_PARAMS_FLAGS_CLIENT_SERVER; + ep_params.sockaddr.addr = saddr; + ep_params.sockaddr.addrlen = addrlen; + + return connect_common(ep_params); +} + +bool UcxConnection::accept(ucp_conn_request_h conn_req) +{ + ucp_conn_request_attr_t conn_req_attr; + conn_req_attr.field_mask = UCP_CONN_REQUEST_ATTR_FIELD_CLIENT_ADDR; + + ucs_status_t status = ucp_conn_request_query(conn_req, &conn_req_attr); + if (status == UCS_OK) { + set_log_prefix((const struct sockaddr*)&conn_req_attr.client_address, + sizeof(conn_req_attr.client_address)); + } else { + UCX_CONN_LOG << "ucp_conn_request_query() failed: " << ucs_status_string(status); + } + + ucp_ep_params_t ep_params; + ep_params.field_mask = UCP_EP_PARAM_FIELD_CONN_REQUEST; + ep_params.conn_request = conn_req; + + return connect_common(ep_params); +} + +bool UcxConnection::send_io_message(const void *buffer, size_t length, + UcxCallback* callback) +{ + ucp_tag_t tag = make_iomsg_tag(_remote_conn_id, 0); + return send_common(buffer, length, tag, callback); +} + +bool UcxConnection::send_data(const void *buffer, size_t length, uint32_t sn, + UcxCallback* callback) +{ + ucp_tag_t tag = 
make_data_tag(_remote_conn_id, sn); + return send_common(buffer, length, tag, callback); +} + +bool UcxConnection::recv_data(void *buffer, size_t length, uint32_t sn, + UcxCallback* callback) +{ + if (_ep == NULL) { + return false; + } + + ucp_tag_t tag = make_data_tag(_conn_id, sn); + ucp_tag_t tag_mask = std::numeric_limits::max(); + ucs_status_ptr_t ptr_status = ucp_tag_recv_nb(_context.worker(), buffer, + length, ucp_dt_make_contig(1), + tag, tag_mask, + data_recv_callback); + return process_request("ucp_tag_recv_nb", ptr_status, callback); +} + +void UcxConnection::cancel_all() +{ + if (ucs_list_is_empty(&_all_requests)) { + return; + } + + ucx_request *request, *tmp; + unsigned count = 0; + ucs_list_for_each_safe(request, tmp, &_all_requests, pos) { + ucp_request_cancel(_context.worker(), request); + ++count; + } + + UCX_CONN_LOG << "canceling " << count << " requests "; +} + +ucp_tag_t UcxConnection::make_data_tag(uint32_t conn_id, uint32_t sn) +{ + return (static_cast(conn_id) << 32) | sn; +} + +ucp_tag_t UcxConnection::make_iomsg_tag(uint32_t conn_id, uint32_t sn) +{ + return UcxContext::IOMSG_TAG | make_data_tag(conn_id, sn); +} + +void UcxConnection::stream_send_callback(void *request, ucs_status_t status) +{ +} + +void UcxConnection::stream_recv_callback(void *request, ucs_status_t status, + size_t recv_len) +{ +} + +void UcxConnection::common_request_callback(void *request, ucs_status_t status) +{ + ucx_request *r = reinterpret_cast(request); + + assert(!r->completed); + if (r->callback) { + // already processed by send function + (*r->callback)(status); + r->conn->request_completed(r); + UcxContext::request_release(r); + } else { + // not yet processed by "process_request" + r->completed = true; + r->status = status; + } +} + +void UcxConnection::data_recv_callback(void *request, ucs_status_t status, + ucp_tag_recv_info *info) +{ + common_request_callback(request, status); +} + +void UcxConnection::error_callback(void *arg, ucp_ep_h ep, ucs_status_t 
status) +{ + reinterpret_cast(arg)->handle_connection_error(status); +} + +void UcxConnection::set_log_prefix(const struct sockaddr* saddr, + socklen_t addrlen) +{ + std::stringstream ss; + ss << "[UCX-connection #" << _conn_id << " " << + UcxContext::sockaddr_str(saddr, addrlen) << "]"; + memset(_log_prefix, 0, MAX_LOG_PREFIX_SIZE); + int length = ss.str().length(); + if (length >= MAX_LOG_PREFIX_SIZE) { + length = MAX_LOG_PREFIX_SIZE - 1; + } + memcpy(_log_prefix, ss.str().c_str(), length); +} + +bool UcxConnection::connect_common(ucp_ep_params_t& ep_params) +{ + UcxContext::wait_status_t wait_status; + + // create endpoint + ep_params.field_mask |= UCP_EP_PARAM_FIELD_ERR_HANDLER | + UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; + ep_params.err_mode = UCP_ERR_HANDLING_MODE_PEER; + ep_params.err_handler.cb = error_callback; + ep_params.err_handler.arg = reinterpret_cast(this); + + ucs_status_t status = ucp_ep_create(_context.worker(), &ep_params, &_ep); + if (status != UCS_OK) { + UCX_LOG << "ucp_ep_create() failed: " << ucs_status_string(status); + return false; + } + + UCX_CONN_LOG << "created endpoint " << _ep << ", exchanging connection id"; + + const ucp_datatype_t dt_int = ucp_dt_make_contig(sizeof(uint32_t)); + + // receive remote connection id + size_t recv_len; + void *rreq = ucp_stream_recv_nb(_ep, &_remote_conn_id, 1, dt_int, + stream_recv_callback, &recv_len, + UCP_STREAM_RECV_FLAG_WAITALL); + + // send local connection id + void *sreq = ucp_stream_send_nb(_ep, &_conn_id, 1, dt_int, + stream_send_callback, 0); + wait_status = _context.wait_completion(sreq, 5); + if (wait_status != UcxContext::WAIT_STATUS_OK) { + UCX_CONN_LOG << "failed to send remote connection id"; + ep_close(UCP_EP_CLOSE_MODE_FORCE); + if (wait_status == UcxContext::WAIT_STATUS_TIMED_OUT) { + _context.wait_completion(sreq); + } + // wait for receive request as well, which should be canceled by ep close + _context.wait_completion(rreq); + return false; + } + + // wait to complete receiving 
remote connection id + wait_status = _context.wait_completion(rreq, 5); + if (wait_status != UcxContext::WAIT_STATUS_OK) { + UCX_CONN_LOG << "failed to receive remote connection id"; + ep_close(UCP_EP_CLOSE_MODE_FORCE); + if (wait_status == UcxContext::WAIT_STATUS_TIMED_OUT) { + _context.wait_completion(rreq); + } + return false; + } + + UCX_CONN_LOG << "remote id is " << _remote_conn_id; + return true; +} + +bool UcxConnection::send_common(const void *buffer, size_t length, ucp_tag_t tag, + UcxCallback* callback) +{ + if (_ep == NULL) { + return false; + } + + ucs_status_ptr_t ptr_status = ucp_tag_send_nb(_ep, buffer, length, + ucp_dt_make_contig(1), tag, + common_request_callback); + return process_request("ucp_tag_send_nb", ptr_status, callback); +} + +void UcxConnection::request_started(ucx_request *r) +{ + ucs_list_add_tail(&_all_requests, &r->pos); +} + +void UcxConnection::request_completed(ucx_request *r) +{ + assert(r->conn == this); + ucs_list_del(&r->pos); +} + +void UcxConnection::handle_connection_error(ucs_status_t status) +{ + UCX_CONN_LOG << "detected error: " << ucs_status_string(status); + + if (_remote_conn_id) { + disconnect(UCP_EP_CLOSE_MODE_FORCE); + _context.handle_connection_error(this); + } +} + +void UcxConnection::disconnect(enum ucp_ep_close_mode mode) +{ + _context.remove_connection(this); + cancel_all(); + ep_close(mode); +} + +void UcxConnection::ep_close(enum ucp_ep_close_mode mode) +{ + static const char *mode_str[] = {"force", "flush"}; + if (_ep == NULL) { + /* already closed */ + return; + } + + assert(!_close_request); + + UCX_CONN_LOG << "closing ep " << _ep << " mode " << mode_str[mode]; + _close_request = ucp_ep_close_nb(_ep, mode); + _ep = NULL; +} + +bool UcxConnection::process_request(const char *what, + ucs_status_ptr_t ptr_status, + UcxCallback* callback) +{ + ucs_status_t status; + + if (ptr_status == NULL) { + (*callback)(UCS_OK); + return true; + } else if (UCS_PTR_IS_ERR(ptr_status)) { + status = 
UCS_PTR_STATUS(ptr_status); + UCX_CONN_LOG << what << "failed with status: " + << ucs_status_string(status); + (*callback)(status); + return false; + } else { + // pointer to request + ucx_request *r = reinterpret_cast(ptr_status); + if (r->completed) { + // already completed by callback + status = r->status; + (*callback)(status); + UcxContext::request_release(r); + return status == UCS_OK; + } else { + // will be completed by callback + r->callback = callback; + r->conn = this; + request_started(r); + return true; + } + } +} + + diff --git a/test/apps/iodemo/ucx_wrapper.h b/test/apps/iodemo/ucx_wrapper.h new file mode 100644 index 00000000000..78497417d7c --- /dev/null +++ b/test/apps/iodemo/ucx_wrapper.h @@ -0,0 +1,237 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef IODEMO_UCX_WRAPPER_H_ +#define IODEMO_UCX_WRAPPER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_LOG_PREFIX_SIZE 64 + +/* Forward declarations */ +class UcxConnection; +struct ucx_request; + +/* + * UCX callback for send/receive completion + */ +class UcxCallback { +public: + virtual ~UcxCallback(); + virtual void operator()(ucs_status_t status) = 0; +}; + + +/* + * Empty callback singleton + */ +class EmptyCallback : public UcxCallback { +public: + /// @override + virtual void operator()(ucs_status_t status); + + static EmptyCallback* get(); +}; + + +/* + * Logger which can be enabled/disabled + */ +class UcxLog { +public: + UcxLog(const char* prefix, bool enable); + + ~UcxLog(); + + template + const UcxLog& operator<<(const T &t) const { + if (_enable) { + std::cout << t; + } + return *this; + } + +private: + const bool _enable; +}; + + +/** + * Holds UCX global context and worker + */ +class UcxContext { +public: + UcxContext(size_t iomsg_size); + + virtual ~UcxContext(); + + bool init(); + + bool listen(const struct sockaddr* saddr, size_t addrlen); + + 
UcxConnection* connect(const struct sockaddr* saddr, size_t addrlen); + + void progress(); + +protected: + + // Called when new connection is created on server side + virtual void dispatch_new_connection(UcxConnection *conn); + + // Called when new IO message is received + virtual void dispatch_io_message(UcxConnection* conn, const void *buffer, + size_t length); + + // Called when there is a fatal failure on the connection + virtual void dispatch_connection_error(UcxConnection* conn); + +private: + typedef enum { + WAIT_STATUS_OK, + WAIT_STATUS_FAILED, + WAIT_STATUS_TIMED_OUT + } wait_status_t; + + friend class UcxConnection; + + static const ucp_tag_t IOMSG_TAG = 1ull << 63; + + static uint32_t get_next_conn_id(); + + static void request_init(void *request); + + static void request_reset(ucx_request *r); + + static void request_release(void *request); + + static void connect_callback(ucp_conn_request_h conn_req, void *arg); + + static void iomsg_recv_callback(void *request, ucs_status_t status, + ucp_tag_recv_info *info); + + static const std::string sockaddr_str(const struct sockaddr* saddr, + size_t addrlen); + + ucp_worker_h worker() const; + + void progress_conn_requests(); + + void progress_io_message(); + + void progress_failed_connections(); + + wait_status_t wait_completion(ucs_status_ptr_t status_ptr, + double timeout = 1e6); + + void recv_io_message(); + + void add_connection(UcxConnection *conn); + + void remove_connection(UcxConnection *conn); + + void handle_connection_error(UcxConnection *conn); + + void destroy_connections(); + + void destroy_listener(); + + void destroy_worker(); + + typedef std::map conn_map_t; + + ucp_context_h _context; + ucp_worker_h _worker; + ucp_listener_h _listener; + conn_map_t _conns; + ucx_request* _iomsg_recv_request; + std::string _iomsg_buffer; + std::deque _conn_requests; + std::deque _failed_conns; +}; + + +class UcxConnection { +public: + UcxConnection(UcxContext& context, uint32_t conn_id); + + ~UcxConnection(); 
+ + bool connect(const struct sockaddr* saddr, socklen_t addrlen); + + bool accept(ucp_conn_request_h conn_req); + + bool send_io_message(const void *buffer, size_t length, + UcxCallback* callback = EmptyCallback::get()); + + bool send_data(const void *buffer, size_t length, uint32_t sn, + UcxCallback* callback = EmptyCallback::get()); + + bool recv_data(void *buffer, size_t length, uint32_t sn, + UcxCallback* callback = EmptyCallback::get()); + + void cancel_all(); + + uint32_t id() const { + return _conn_id; + } + +private: + static ucp_tag_t make_data_tag(uint32_t conn_id, uint32_t sn); + + static ucp_tag_t make_iomsg_tag(uint32_t conn_id, uint32_t sn); + + static void stream_send_callback(void *request, ucs_status_t status); + + static void stream_recv_callback(void *request, ucs_status_t status, + size_t recv_len); + + static void common_request_callback(void *request, ucs_status_t status); + + static void data_recv_callback(void *request, ucs_status_t status, + ucp_tag_recv_info *info); + + static void error_callback(void *arg, ucp_ep_h ep, ucs_status_t status); + + void set_log_prefix(const struct sockaddr* saddr, socklen_t addrlen); + + bool connect_common(ucp_ep_params_t& ep_params); + + bool send_common(const void *buffer, size_t length, ucp_tag_t tag, + UcxCallback* callback); + + void request_started(ucx_request *r); + + void request_completed(ucx_request *r); + + void handle_connection_error(ucs_status_t status); + + void disconnect(enum ucp_ep_close_mode mode); + + void ep_close(enum ucp_ep_close_mode mode); + + bool process_request(const char *what, ucs_status_ptr_t ptr_status, + UcxCallback* callback); + + static unsigned _num_instances; + + UcxContext& _context; + uint32_t _conn_id; + uint32_t _remote_conn_id; + char _log_prefix[MAX_LOG_PREFIX_SIZE]; + ucp_ep_h _ep; + void* _close_request; + ucs_list_link_t _all_requests; +}; + +#endif + diff --git a/test/apps/profiling/Makefile.am b/test/apps/profiling/Makefile.am new file mode 100644 index 
00000000000..f9b521b7ff8 --- /dev/null +++ b/test/apps/profiling/Makefile.am @@ -0,0 +1,11 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. +# + +noinst_PROGRAMS = ucx_profiling + +ucx_profiling_SOURCES = ucx_profiling.c +ucx_profiling_CFLAGS = $(BASE_CFLAGS) +ucx_profiling_CPPFLAGS = $(BASE_CPPFLAGS) +ucx_profiling_LDADD = $(top_builddir)/src/ucs/libucs.la +ucx_profiling_LDFLAGS = -lm diff --git a/test/examples/ucx_profiling.c b/test/apps/profiling/ucx_profiling.c similarity index 100% rename from test/examples/ucx_profiling.c rename to test/apps/profiling/ucx_profiling.c diff --git a/test/apps/sockaddr/sa_main.cc b/test/apps/sockaddr/sa_main.cc index 57ccf881495..958c1ecff61 100644 --- a/test/apps/sockaddr/sa_main.cc +++ b/test/apps/sockaddr/sa_main.cc @@ -331,6 +331,7 @@ void application::pton(const dest_t& dst, struct sockaddr_storage& saddr, case AF_INET: reinterpret_cast(&saddr)->sin_port = htons(dst.port); + /* cppcheck-suppress internalAstError */ addr = &reinterpret_cast(&saddr)->sin_addr; addrlen = sizeof(struct sockaddr_in); addr_datalen = sizeof(struct in_addr); diff --git a/test/apps/test_dlopen_cfg_print.c b/test/apps/test_dlopen_cfg_print.c index 21e6c402fef..35baf0edcf0 100644 --- a/test/apps/test_dlopen_cfg_print.c +++ b/test/apps/test_dlopen_cfg_print.c @@ -30,6 +30,8 @@ static void* do_dlopen_or_exit(const char *filename) int main(int argc, char **argv) { + typedef void (*print_all_opts_func_t)(FILE*, const char *, int); + const char *ucs_filename = QUOTE(UCS_LIB_PATH); const char *uct_filename = QUOTE(UCT_LIB_PATH); void *ucs_handle, *uct_handle; @@ -44,9 +46,9 @@ int main(int argc, char **argv) } /* print all config table, to force going over the global list in ucs */ - void (*print_all_opts)(FILE*, int) = - dlsym(ucs_handle, "ucs_config_parser_print_all_opts"); - print_all_opts(stdout, 0); + print_all_opts_func_t print_all_opts = + (print_all_opts_func_t)dlsym(ucs_handle, "ucs_config_parser_print_all_opts"); + 
print_all_opts(stdout, "TEST_", 0); dlclose(ucs_handle); printf("done\n"); diff --git a/test/apps/test_link_map.c b/test/apps/test_link_map.c index 29167e58845..6683333cb8d 100644 --- a/test/apps/test_link_map.c +++ b/test/apps/test_link_map.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include #include diff --git a/test/apps/test_ucp_dlopen.c b/test/apps/test_ucp_dlopen.c index 4951a23fd40..0cd31472420 100644 --- a/test/apps/test_ucp_dlopen.c +++ b/test/apps/test_ucp_dlopen.c @@ -4,6 +4,10 @@ * See file LICENSE for terms. */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include #include @@ -18,17 +22,25 @@ int test_ucp_init(void *handle) { - ucs_status_t (*ucp_init_version_f)(unsigned, unsigned, const ucp_params_t*, - const ucp_config_t*, ucp_context_h*); - void (*ucp_context_print_info_f)(const ucp_context_h, FILE*); - void (*ucp_cleanup_f)(ucp_context_h); + typedef ucs_status_t (*ucp_init_version_func_t)(unsigned, unsigned, + const ucp_params_t *, + const ucp_config_t *, + ucp_context_h *); + typedef void (*ucp_context_print_info_func_t)(const ucp_context_h, FILE*); + typedef void (*ucp_cleanup_func_t)(ucp_context_h); + + ucp_init_version_func_t ucp_init_version_f; + ucp_context_print_info_func_t ucp_context_print_info_f; + ucp_cleanup_func_t ucp_cleanup_f; ucp_params_t ucp_params; ucs_status_t status; ucp_context_h ucph; - ucp_init_version_f = dlsym(handle, "ucp_init_version"); - ucp_cleanup_f = dlsym(handle, "ucp_cleanup"); - ucp_context_print_info_f = dlsym(handle, "ucp_context_print_info"); + ucp_init_version_f = (ucp_init_version_func_t)dlsym(handle, + "ucp_init_version"); + ucp_cleanup_f = (ucp_cleanup_func_t)dlsym(handle, "ucp_cleanup"); + ucp_context_print_info_f = (ucp_context_print_info_func_t)dlsym(handle, + "ucp_context_print_info"); if (!ucp_init_version_f || !ucp_cleanup_f || !ucp_context_print_info_f) { fprintf(stderr, "failed to get UCP function pointers\n"); 
@@ -77,7 +89,8 @@ int main(int argc, char **argv) MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (ptr2 == MAP_FAILED) { fprintf(stderr, "mmmap() failed: %m\n"); - return -1; + ret = -1; + goto failed_mmap; } /* load ucp */ @@ -85,23 +98,23 @@ int main(int argc, char **argv) handle = dlopen(filename, RTLD_NOW | RTLD_LOCAL); if (handle == NULL) { fprintf(stderr, "failed to open %s: %m\n", filename); - return -1; + ret = -1; + goto failed_dlopen; } /* init ucp */ ret = test_ucp_init(handle); - if (ret) { - return -1; - } /* unload ucp */ dlclose(handle); +failed_dlopen: /* relase the memory - could break if UCM is unloaded */ munmap(ptr2, alloc_size); +failed_mmap: free(ptr1); printf("done\n"); - return 0; + return ret; } diff --git a/test/apps/test_ucs_dlopen.c b/test/apps/test_ucs_dlopen.c index e101df1dfca..87ef1af7b8a 100644 --- a/test/apps/test_ucs_dlopen.c +++ b/test/apps/test_ucs_dlopen.c @@ -4,7 +4,10 @@ * See file LICENSE for terms. */ -#define _GNU_SOURCE +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + #include #include #include @@ -24,12 +27,17 @@ static void vm_unmap_cb(ucm_event_type_t event_type, ucm_event_t *event, int test_ucm_set_event_handler(void *handle) { - ucs_status_t (*ucm_set_event_handler_f)(int events, int priority, - ucm_event_callback_t cb, void *arg); + typedef ucs_status_t (*ucm_set_event_handler_func_t)(int events, + int priority, + ucm_event_callback_t cb, + void *arg); + + ucm_set_event_handler_func_t ucm_set_event_handler_f; ucs_status_t status; dlerror(); - ucm_set_event_handler_f = dlsym(handle, "ucm_set_event_handler"); + ucm_set_event_handler_f = (ucm_set_event_handler_func_t)dlsym(handle, + "ucm_set_event_handler"); if (ucm_set_event_handler_f == NULL) { fprintf(stderr, "failed to resolve ucm_set_event_handler(): %s\n", dlerror()); @@ -73,7 +81,8 @@ int main(int argc, char **argv) MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (ptr2 == MAP_FAILED) { fprintf(stderr, "mmmap() failed: %m\n"); - return -1; + ret = -1; + goto failed_mmap; } /* 
load ucm */ @@ -82,23 +91,23 @@ int main(int argc, char **argv) handle = dlopen(filename, RTLD_NOW); if (handle == NULL) { fprintf(stderr, "failed to open %s: %s\n", filename, dlerror()); - return -1; + ret = -1; + goto failed_dlopen; } /* init ucm */ ret = test_ucm_set_event_handler(handle); - if (ret < 0) { - return ret; - } /* unload ucp */ dlclose(handle); +failed_dlopen: /* release the memory - could break if UCM is unloaded */ munmap(ptr2, alloc_size); +failed_mmap: free(ptr1); printf("done\n"); - return 0; + return ret; } diff --git a/test/apps/test_ucx_tls.py b/test/apps/test_ucx_tls.py index cf4d643c440..d5e6d04207b 100755 --- a/test/apps/test_ucx_tls.py +++ b/test/apps/test_ucx_tls.py @@ -11,17 +11,19 @@ import re import commands from distutils.version import LooseVersion +from optparse import OptionParser #expected AM transport selections per given number of eps mlx4_am = { - 2 : "rc", - 16 : "rc", - 32 : "rc", - 64 : "rc", - 256 : "ud", - 1024 : "ud", - 1000000 : "ud", + 2 : "rc_verbs", + 16 : "rc_verbs", + 32 : "rc_verbs", + 64 : "rc_verbs", + 256 : "ud_verbs", + 512 : "ud_verbs", + 1024 : "ud_verbs", + 1000000 : "ud_verbs", } mlx5_am = { @@ -30,6 +32,7 @@ 32 : "rc_mlx5", 64 : "dc_mlx5", 256 : "dc_mlx5", + 512 : "dc_mlx5", 1024 : "dc_mlx5", 1000000 : "dc_mlx5", } @@ -40,6 +43,7 @@ 32 : "rc_mlx5", 64 : "rc_mlx5", 256 : "ud_mlx5", + 512 : "ud_mlx5", 1024 : "ud_mlx5", 1000000 : "ud_mlx5", } @@ -51,18 +55,20 @@ 32 : "rc_mlx5", 64 : "rc_mlx5", 256 : "rc_mlx5", + 512 : "rc_mlx5", 1024 : "rc_mlx5", 1000000 : "rc_mlx5", } mlx4_am_override = { - 2 : "rc", - 16 : "rc", - 32 : "rc", - 64 : "rc", - 256 : "rc", - 1024 : "rc", - 1000000 : "rc", + 2 : "rc_verbs", + 16 : "rc_verbs", + 32 : "rc_verbs", + 64 : "rc_verbs", + 256 : "rc_verbs", + 512 : "rc_verbs", + 1024 : "rc_verbs", + 1000000 : "rc_verbs", } am_tls = { @@ -74,9 +80,18 @@ "mlx5_override" : mlx5_am_override } -def find_am_transport(dev, neps, override = 0) : +def exec_cmd(cmd): + if options.verbose: + 
print cmd + + status, output = commands.getstatusoutput(cmd) + if options.verbose: + print "return code " + str(status) + print output - ucx_info = bin_prefix+"/ucx_info -e -u t" + return status, output + +def find_am_transport(dev, neps, override = 0) : os.putenv("UCX_TLS", "ib") os.putenv("UCX_NET_DEVICES", dev) @@ -84,13 +99,14 @@ def find_am_transport(dev, neps, override = 0) : if (override): os.putenv("UCX_NUM_EPS", "2") - status, output = commands.getstatusoutput(ucx_info + " -n " + str(neps) + " | grep am") - #print output + status, output = exec_cmd(ucx_info + ucx_info_args + str(neps) + " | grep am") - match = re.search(r'\d+:(\S+)/\S+', output) + os.unsetenv("UCX_TLS") + os.unsetenv("UCX_NET_DEVICES") + + match = re.search(r'\d+:(\S+)/\S+', output) if match: am_tls = match.group(1) - #print am_tls if (override): os.unsetenv("UCX_NUM_EPS") @@ -98,26 +114,72 @@ def find_am_transport(dev, neps, override = 0) : else: return "no am tls" +def test_fallback_from_rc(dev, neps) : -if len(sys.argv) > 1: - bin_prefix = sys.argv[1] + "/bin" -else: + os.putenv("UCX_TLS", "ib") + os.putenv("UCX_NET_DEVICES", dev) + + status,output = exec_cmd(ucx_info + ucx_info_args + str(neps) + " | grep rc") + + os.unsetenv("UCX_TLS") + os.unsetenv("UCX_NET_DEVICES") + + if output != "": + print "RC transport must not be used when estimated number of EPs = " + str(neps) + sys.exit(1) + + os.putenv("UCX_TLS", "rc,ud,tcp") + + status,output_rc = exec_cmd(ucx_info + ucx_info_args + str(neps) + " | grep rc") + + status,output_tcp = exec_cmd(ucx_info + ucx_info_args + str(neps) + " | grep tcp") + + if output_rc != "" or output_tcp != "": + print "RC/TCP transports must not be used when estimated number of EPs = " + str(neps) + sys.exit(1) + + os.unsetenv("UCX_TLS") + +parser = OptionParser() +parser.add_option("-p", "--prefix", metavar="PATH", help = "root UCX directory") +parser.add_option("-v", "--verbose", action="store_true", \ + help = "verbose output", default=False) +(options, 
args) = parser.parse_args() + +if options.prefix == None: bin_prefix = "./src/tools/info" +else: + bin_prefix = options.prefix + "/bin" -status, output = commands.getstatusoutput("ibv_devinfo -l | tail -n +2 | sed -e 's/^[ \t]*//' | head -n -1 ") +if not (os.path.isdir(bin_prefix)): + print "directory \"" + bin_prefix + "\" does not exist" + parser.print_help() + exit(1) + +ucx_info = bin_prefix + "/ucx_info" +ucx_info_args = " -e -u t -n " + +status, output = exec_cmd(ucx_info + " -c | grep -e \"UCX_RC_.*_MAX_NUM_EPS\"") +match = re.findall(r'\S+=(\d+)', output) +if match: + rc_max_num_eps = int(max(match)) +else: + rc_max_num_eps = 0 + +status, output = exec_cmd("ibv_devinfo -l | tail -n +2 | sed -e 's/^[ \t]*//' | head -n -1 ") dev_list = output.splitlines() port = "1" for dev in sorted(dev_list): - status, dev_attrs = commands.getstatusoutput("ibv_devinfo -d " + dev + " -i " + port) + status, dev_attrs = exec_cmd("ibv_devinfo -d " + dev + " -i " + port) if dev_attrs.find("PORT_ACTIVE") == -1: continue driver_name = os.path.basename(os.readlink("/sys/class/infiniband/%s/device/driver" % dev)) dev_name = driver_name.split("_")[0] # should be mlx4 or mlx5 if not dev_name in ['mlx4', 'mlx5']: - print "Invalid device name: ", dev_name - sys.exit(1) + print "Skipping unknown device: ", dev_name + continue if dev_attrs.find("Ethernet") == -1: dev_tl_map = am_tls[dev_name] @@ -147,5 +209,8 @@ def find_am_transport(dev, neps, override = 0) : if dev_tl_override_map[n_eps] != tl: sys.exit(1) + if n_eps >= (rc_max_num_eps * 2): + test_fallback_from_rc(dev + ':' + port, n_eps) + sys.exit(0) diff --git a/test/examples/Makefile.am b/test/examples/Makefile.am deleted file mode 100644 index 82f82489541..00000000000 --- a/test/examples/Makefile.am +++ /dev/null @@ -1,58 +0,0 @@ -# -# Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. -# -# Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. -# See file LICENSE for terms. 
-# - -examplesdir = $(pkgdatadir)/examples -dist_examples_DATA = \ - ucx_hello_world.h \ - ucp_hello_world.c \ - uct_hello_world.c \ - ucx_profiling.c \ - ucp_client_server.c - -EXAMPLE_CCLD_FLAGS = -lucs -I$(includedir) -L$(libdir) -Wall -Werror -Wl,-rpath,$(libdir) - -installcheck-local: - @echo "INSTALLCHECK: Compiling examples with installed library" - $(CC) -o uct_hello_world $(examplesdir)/uct_hello_world.c -luct $(EXAMPLE_CCLD_FLAGS) -pedantic - $(CC) -o ucp_hello_world $(examplesdir)/ucp_hello_world.c -lucp $(EXAMPLE_CCLD_FLAGS) -pedantic - $(CC) -o ucp_client_server $(examplesdir)/ucp_client_server.c -lucp $(EXAMPLE_CCLD_FLAGS) -pedantic - $(CC) -o ucx_profiling $(examplesdir)/ucx_profiling.c -lm $(EXAMPLE_CCLD_FLAGS) - $(RM) *.o uct_hello_world ucp_hello_world ucp_client_server ucx_profiling - -if HAVE_EXAMPLES - -noinst_PROGRAMS = \ - ucp_hello_world \ - uct_hello_world \ - ucx_profiling \ - ucp_client_server - -ucp_hello_world_SOURCES = ucp_hello_world.c -ucp_hello_world_CFLAGS = $(BASE_CFLAGS) -pedantic -ucp_hello_world_CPPFLAGS = $(BASE_CPPFLAGS) -ucp_hello_world_LDADD = $(top_builddir)/src/ucs/libucs.la \ - $(top_builddir)/src/ucp/libucp.la - -uct_hello_world_SOURCES = uct_hello_world.c -uct_hello_world_CFLAGS = $(BASE_CFLAGS) -pedantic -uct_hello_world_CPPFLAGS = $(BASE_CPPFLAGS) -uct_hello_world_LDADD = $(top_builddir)/src/ucs/libucs.la \ - $(top_builddir)/src/uct/libuct.la - -ucp_client_server_SOURCES = ucp_client_server.c -ucp_client_server_CFLAGS = $(BASE_CFLAGS) -pedantic -ucp_client_server_CPPFLAGS = $(BASE_CPPFLAGS) -ucp_client_server_LDADD = $(top_builddir)/src/ucs/libucs.la \ - $(top_builddir)/src/ucp/libucp.la - -ucx_profiling_SOURCES = ucx_profiling.c -ucx_profiling_CFLAGS = $(BASE_CFLAGS) -ucx_profiling_CPPFLAGS = $(BASE_CPPFLAGS) -ucx_profiling_LDADD = $(top_builddir)/src/ucs/libucs.la -ucx_profiling_LDFLAGS = -lm - -endif diff --git a/test/examples/ucp_client_server.c b/test/examples/ucp_client_server.c deleted file mode 100644 index 
05fb0846172..00000000000 --- a/test/examples/ucp_client_server.c +++ /dev/null @@ -1,498 +0,0 @@ -/** -* Copyright (C) Mellanox Technologies Ltd. 2018. ALL RIGHTS RESERVED. -* -* See file LICENSE for terms. -*/ - -/* - * UCP client - server example utility - * ----------------------------------------------- - * - * Server side: - * - * ./ucp_client_server - * - * Client side: - * - * ./ucp_client_server -a - * - * Notes: - * - * - The server will listen to incoming connection requests on INADDR_ANY. - * - The client needs to pass the IP address of the server side to connect to - * as an argument to the test. - * - Currently, the passed IP needs to be an IPoIB or a RoCE address. - * - The port which the server side would listen on can be modified with the - * '-p' option and should be used on both sides. The default port to use is - * 13337. - */ - -#include - -#include /* memset */ -#include /* inet_addr */ -#include /* getopt */ -#include /* atoi */ - - -const char test_message[] = "UCX Client-Server Hello World"; -static uint16_t server_port = 13337; - -#define TEST_STRING_LEN sizeof(test_message) - - -/** - * Server context to be used in the user's accept callback. - * It holds the server's endpoint which will be created upon accepting a - * connection request from the client. - */ -typedef struct ucx_server_ctx { - ucp_ep_h ep; -} ucx_server_ctx_t; - - -/** - * Stream request context. Holds a value to indicate whether or not the - * request is completed. - */ -typedef struct test_req { - int complete; -} test_req_t; - - -/** - * The callback on the receiving side, which is invoked upon receiving the - * stream message. 
- */ -static void stream_recv_cb(void *request, ucs_status_t status, size_t length) -{ - test_req_t *req = request; - - req->complete = 1; - - printf("stream_recv_cb returned with status %d (%s), length: %lu\n", - status, ucs_status_string(status), length); -} - -/** - * The callback on the sending side, which is invoked after finishing sending - * the stream message. - */ -static void stream_send_cb(void *request, ucs_status_t status) -{ - test_req_t *req = request; - - req->complete = 1; - - printf("stream_send_cb returned with status %d (%s)\n", - status, ucs_status_string(status)); -} - -/** - * The callback on the server side which is invoked upon receiving a connection - * request from the client. - */ -static void server_accept_cb(ucp_ep_h ep, void *arg) -{ - ucx_server_ctx_t *context = arg; - - /* Save the server's endpoint in the user's context, for future usage */ - context->ep = ep; -} - -/** - * Set an address for the server to listen on - INADDR_ANY on a well known port. - */ -void set_listen_addr(struct sockaddr_in *listen_addr) -{ - /* The server will listen on INADDR_ANY */ - memset(listen_addr, 0, sizeof(struct sockaddr_in)); - listen_addr->sin_family = AF_INET; - listen_addr->sin_addr.s_addr = INADDR_ANY; - listen_addr->sin_port = htons(server_port); -} - -/** - * Set an address to connect to. A given IP address on a well known port. - */ -void set_connect_addr(const char *address_str, struct sockaddr_in *connect_addr) -{ - memset(connect_addr, 0, sizeof(struct sockaddr_in)); - connect_addr->sin_family = AF_INET; - connect_addr->sin_addr.s_addr = inet_addr(address_str); - connect_addr->sin_port = htons(server_port); -} - -/** - * Initialize the server side. The server starts listening on the set address - * and waits for its connected endpoint to be created. 
- */ -static int start_server(ucp_worker_h ucp_worker, ucx_server_ctx_t *context, - ucp_listener_h *listener) -{ - struct sockaddr_in listen_addr; - ucp_listener_params_t params; - ucs_status_t status; - - set_listen_addr(&listen_addr); - - params.field_mask = UCP_LISTENER_PARAM_FIELD_SOCK_ADDR | - UCP_LISTENER_PARAM_FIELD_ACCEPT_HANDLER; - params.sockaddr.addr = (const struct sockaddr*)&listen_addr; - params.sockaddr.addrlen = sizeof(listen_addr); - params.accept_handler.cb = server_accept_cb; - params.accept_handler.arg = context; - - /* Create a listener on the server side to listen on the given address.*/ - status = ucp_listener_create(ucp_worker, ¶ms, listener); - if (status != UCS_OK) { - fprintf(stderr, "failed to listen (%s)\n", ucs_status_string(status)); - } - - return status; -} - -/** - * Initialize the client side. Create an endpoint from the client side to be - * connected to the remote server (to the given IP). - */ -static int start_client(ucp_worker_h ucp_worker, const char *ip, - ucp_ep_h *client_ep) -{ - ucp_ep_params_t ep_params; - struct sockaddr_in connect_addr; - ucs_status_t status; - - set_connect_addr(ip, &connect_addr); - - /* - * Endpoint field mask bits: - * UCP_EP_PARAM_FIELD_FLAGS - Use the value of the 'flags' field. - * UCP_EP_PARAM_FIELD_SOCK_ADDR - Use a remote sockaddr to connect - * to the remote peer. - * UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE - Error handling mode - this flag - * is temporarily required since the - * endpoint will be closed with - * UCP_EP_CLOSE_MODE_FORCE which - * requires this mode. - * Once UCP_EP_CLOSE_MODE_FORCE is - * removed, the error handling mode - * will be removed. 
- */ - ep_params.field_mask = UCP_EP_PARAM_FIELD_FLAGS | - UCP_EP_PARAM_FIELD_SOCK_ADDR | - UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; - ep_params.err_mode = UCP_ERR_HANDLING_MODE_PEER; - ep_params.flags = UCP_EP_PARAMS_FLAGS_CLIENT_SERVER; - ep_params.sockaddr.addr = (struct sockaddr*)&connect_addr; - ep_params.sockaddr.addrlen = sizeof(connect_addr); - - status = ucp_ep_create(ucp_worker, &ep_params, client_ep); - if (status != UCS_OK) { - fprintf(stderr, "failed to connect to %s (%s)\n", ip, ucs_status_string(status)); - } - - return status; -} - -/** - * Print the received message on the server side or the sent data on the client - * side. - */ -static void print_result(int is_server, char *recv_message) -{ - if (is_server) { - printf("UCX data message was received\n"); - printf("\n\n----- UCP TEST SUCCESS -------\n\n"); - printf("%s", recv_message); - printf("\n\n------------------------------\n\n"); - } else { - printf("\n\n-----------------------------------------\n\n"); - printf("Client sent message: \n%s.\nlength: %ld\n", - test_message, TEST_STRING_LEN); - printf("\n-----------------------------------------\n\n"); - } -} - -/** - * Progress the request until it completes. - */ -static ucs_status_t request_wait(ucp_worker_h ucp_worker, test_req_t *request) -{ - ucs_status_t status; - - /* if operation was completed immediately */ - if (request == NULL) { - return UCS_OK; - } - - if (UCS_PTR_IS_ERR(request)) { - return UCS_PTR_STATUS(request); - } - - while (request->complete == 0) { - ucp_worker_progress(ucp_worker); - } - status = ucp_request_check_status(request); - - /* This request may be reused so initialize it for next time */ - request->complete = 0; - ucp_request_free(request); - - return status; -} - -/** - * Send and receive a message using the Stream API. - * The client sends a message to the server and waits until the send it completed. - * The server receives a message from the client and waits for its completion. 
- */ -static int send_recv_stream(ucp_worker_h ucp_worker, ucp_ep_h ep, int is_server) -{ - char recv_message[TEST_STRING_LEN]= ""; - test_req_t *request; - size_t length; - int ret = 0; - ucs_status_t status; - - if (!is_server) { - /* Client sends a message to the server using the stream API */ - request = ucp_stream_send_nb(ep, test_message, 1, - ucp_dt_make_contig(TEST_STRING_LEN), - stream_send_cb, 0); - } else { - /* Server receives a message from the client using the stream API */ - request = ucp_stream_recv_nb(ep, &recv_message, 1, - ucp_dt_make_contig(TEST_STRING_LEN), - stream_recv_cb, &length, - UCP_STREAM_RECV_FLAG_WAITALL); - } - - status = request_wait(ucp_worker, request); - if (status != UCS_OK){ - fprintf(stderr, "unable to %s UCX message (%s)\n", - is_server ? "receive": "send", - ucs_status_string(status)); - ret = -1; - } else { - print_result(is_server, recv_message); - } - - return ret; -} - -/** - * Close the given endpoint. - * Currently closing the endpoint with UCP_EP_CLOSE_MODE_FORCE since we currently - * cannot rely on the client side to be present during the server's endpoint - * closing process. - */ -static void ep_close(ucp_worker_h ucp_worker, ucp_ep_h ep) -{ - ucs_status_t status; - void *close_req; - - close_req = ucp_ep_close_nb(ep, UCP_EP_CLOSE_MODE_FORCE); - if (UCS_PTR_IS_PTR(close_req)) { - do { - ucp_worker_progress(ucp_worker); - status = ucp_request_check_status(close_req); - } while (status == UCS_INPROGRESS); - - ucp_request_free(close_req); - } else if (UCS_PTR_STATUS(close_req) != UCS_OK) { - fprintf(stderr, "failed to close ep %p\n", (void*)ep); - } -} - -/** - * A callback to be invoked by UCX in order to initialize the user's request. - */ -static void request_init(void *request) -{ - test_req_t *req = request; - req->complete = 0; -} - -/** - * Print this application's usage help message. 
- */ -static void usage() -{ - fprintf(stderr, "Usage: ucp_client_server [parameters]\n"); - fprintf(stderr, "UCP client-server example utility\n"); - fprintf(stderr, "\nParameters are:\n"); - fprintf(stderr, " -a Set IP address of the server " - "(required for client and should not be specified " - "for the server)\n"); - fprintf(stderr, " -p Set alternative server port (default:13337)\n"); - fprintf(stderr, "\n"); -} - -/** - * Parse the command line arguments. - */ -static int parse_cmd(int argc, char *const argv[], char **server_addr) -{ - int c = 0; - int port; - - opterr = 0; - - while ((c = getopt(argc, argv, "a:p:")) != -1) { - switch (c) { - case 'a': - *server_addr = optarg; - break; - case 'p': - port = atoi(optarg); - if ((port < 0) || (port > UINT16_MAX)) { - fprintf(stderr, "Wrong server port number %d\n", server_port); - return -1; - } - server_port = port; - break; - default: - usage(); - return -1; - } - } - - return 0; -} - -/** - * Initialize the UCP context and worker. 
- */ -static int init_context(ucp_context_h *ucp_context, ucp_worker_h *ucp_worker) -{ - /* UCP objects */ - ucp_worker_params_t worker_params; - ucp_params_t ucp_params; - ucs_status_t status; - int ret = 0; - - memset(&ucp_params, 0, sizeof(ucp_params)); - memset(&worker_params, 0, sizeof(worker_params)); - - /* UCP initialization */ - ucp_params.field_mask = UCP_PARAM_FIELD_FEATURES | - UCP_PARAM_FIELD_REQUEST_SIZE | - UCP_PARAM_FIELD_REQUEST_INIT; - ucp_params.features = UCP_FEATURE_STREAM; - - ucp_params.request_size = sizeof(test_req_t); - ucp_params.request_init = request_init; - - status = ucp_init(&ucp_params, NULL, ucp_context); - if (status != UCS_OK) { - fprintf(stderr, "failed to ucp_init (%s)\n", ucs_status_string(status)); - ret = -1; - goto err; - } - - worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; - worker_params.thread_mode = UCS_THREAD_MODE_SINGLE; - - status = ucp_worker_create(*ucp_context, &worker_params, ucp_worker); - if (status != UCS_OK) { - fprintf(stderr, "failed to ucp_worker_create (%s)\n", ucs_status_string(status)); - ret = -1; - goto err_cleanup; - } - - return ret; - -err_cleanup: - ucp_cleanup(*ucp_context); - -err: - return ret; -} - - -int main(int argc, char **argv) -{ - ucx_server_ctx_t context; - int is_server, ret; - char *server_addr = NULL; - - /* UCP objects */ - ucp_context_h ucp_context; - ucp_listener_h listener; - ucp_worker_h ucp_worker; - ucs_status_t status; - ucp_ep_h ep; - - ret = parse_cmd(argc, argv, &server_addr); - if (ret != 0) { - goto err; - } - - /* Initialize the UCX required objects */ - ret = init_context(&ucp_context, &ucp_worker); - if (ret != 0) { - goto err; - } - - /* Client-Server initialization */ - if (server_addr == NULL) { - /* Server side */ - is_server = 1; - - /* Initialize the server's endpoint to NULL. Once the server's endpoint - * is created, this field will have a valid value. 
*/ - context.ep = NULL; - - status = start_server(ucp_worker, &context, &listener); - if (status != UCS_OK) { - fprintf(stderr, "failed to start server\n"); - goto err_worker; - } - - /* Server is always up */ - printf("Waiting for connection...\n"); - while (1) { - /* Wait for the server's callback to set the context->ep field, thus - * indicating that the server's endpoint was created and is ready to - * be used. The client side should initiate the connection, leading - * to this ep's creation */ - if (context.ep == NULL) { - ucp_worker_progress(ucp_worker); - } else { - /* Client-Server communication via Stream API */ - send_recv_stream(ucp_worker, context.ep, is_server); - - /* Close the endpoint to the client */ - ep_close(ucp_worker, context.ep); - - /* Initialize server's endpoint for the next connection with a new - * client */ - context.ep = NULL; - printf("Waiting for connection...\n"); - }; - } - } else { - /* Client side */ - is_server = 0; - status = start_client(ucp_worker, server_addr, &ep); - if (status != UCS_OK) { - fprintf(stderr, "failed to start client\n"); - goto err_worker; - } - - /* Client-Server communication via Stream API */ - ret = send_recv_stream(ucp_worker, ep, is_server); - - /* Close the endpoint to the server */ - ep_close(ucp_worker, ep); - } - -err_worker: - ucp_worker_destroy(ucp_worker); - - ucp_cleanup(ucp_context); - -err: - return ret; -} diff --git a/test/examples/ucx_hello_world.h b/test/examples/ucx_hello_world.h deleted file mode 100644 index 6de567981d3..00000000000 --- a/test/examples/ucx_hello_world.h +++ /dev/null @@ -1,123 +0,0 @@ -/** -* Copyright (C) Mellanox Technologies Ltd. 2001-2016. ALL RIGHTS RESERVED. -* -* See file LICENSE for terms. 
-*/ - -#ifndef UCX_HELLO_WORLD_H -#define UCX_HELLO_WORLD_H - -#include -#include -#include -#include -#include -#include - -#define CHKERR_JUMP(_cond, _msg, _label) \ -do { \ - if (_cond) { \ - fprintf(stderr, "Failed to %s\n", _msg); \ - goto _label; \ - } \ -} while (0) - -int server_connect(uint16_t server_port) -{ - struct sockaddr_in inaddr; - int lsock = -1; - int dsock = -1; - int optval = 1; - int ret; - - lsock = socket(AF_INET, SOCK_STREAM, 0); - CHKERR_JUMP(lsock < 0, "open server socket", err); - - optval = 1; - ret = setsockopt(lsock, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)); - CHKERR_JUMP(ret < 0, "server setsockopt()", err_sock); - - inaddr.sin_family = AF_INET; - inaddr.sin_port = htons(server_port); - inaddr.sin_addr.s_addr = INADDR_ANY; - memset(inaddr.sin_zero, 0, sizeof(inaddr.sin_zero)); - ret = bind(lsock, (struct sockaddr*)&inaddr, sizeof(inaddr)); - CHKERR_JUMP(ret < 0, "bind server", err_sock); - - ret = listen(lsock, 0); - CHKERR_JUMP(ret < 0, "listen server", err_sock); - - fprintf(stdout, "Waiting for connection...\n"); - - /* Accept next connection */ - dsock = accept(lsock, NULL, NULL); - CHKERR_JUMP(dsock < 0, "accept server", err_sock); - - close(lsock); - - return dsock; - -err_sock: - close(lsock); - -err: - return -1; -} - -int client_connect(const char *server, uint16_t server_port) -{ - struct sockaddr_in conn_addr; - struct hostent *he; - int connfd; - int ret; - - connfd = socket(AF_INET, SOCK_STREAM, 0); - CHKERR_JUMP(connfd < 0, "open client socket", err); - - he = gethostbyname(server); - CHKERR_JUMP((he == NULL || he->h_addr_list == NULL), "found a host", err_conn); - - conn_addr.sin_family = he->h_addrtype; - conn_addr.sin_port = htons(server_port); - - memcpy(&conn_addr.sin_addr, he->h_addr_list[0], he->h_length); - memset(conn_addr.sin_zero, 0, sizeof(conn_addr.sin_zero)); - - ret = connect(connfd, (struct sockaddr*)&conn_addr, sizeof(conn_addr)); - CHKERR_JUMP(ret < 0, "connect client", err_conn); - - return 
connfd; - -err_conn: - close(connfd); -err: - return -1; -} - -static int barrier(int oob_sock) -{ - int dummy = 0; - ssize_t res; - - res = send(oob_sock, &dummy, sizeof(dummy), 0); - if (res < 0) { - return res; - } - - res = recv(oob_sock, &dummy, sizeof(dummy), 0); - - /* number of received bytes should be the same as sent */ - return !(res == sizeof(dummy)); -} - -static void generate_test_string(char *str, int size) -{ - int i; - - for (i = 0; i < (size - 1); ++i) { - str[i] = 'A' + (i % 26); - } - str[i] = 0; -} - -#endif /* UCX_HELLO_WORLD_H */ diff --git a/test/gtest/Makefile.am b/test/gtest/Makefile.am index 11074cf0e5e..c1b567d0875 100644 --- a/test/gtest/Makefile.am +++ b/test/gtest/Makefile.am @@ -2,7 +2,9 @@ # Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. # Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. # Copyright (C) The University of Tennessee and the University of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED. -# Copyright (C) Huawei Technologies Co.,Ltd. 2020. ALL RIGHTS RESERVED. +# Copyright (C) Los Alamos National Security, LLC. 2018 ALL RIGHTS RESERVED. +# Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. +# Copyright (C) Huawei Technologies Co.,Ltd. 2019-2021. ALL RIGHTS RESERVED. # # See file LICENSE for terms. 
# @@ -11,7 +13,7 @@ if HAVE_GTEST # Set default configuration for running tests UCX_HANDLE_ERRORS ?= freeze -UCX_LOG_LEVEL ?= info +UCX_LOG_LEVEL ?= warn UCX_LOG_PRINT_ENABLE ?= y GTEST_FILTER ?= * GTEST_EXTRA_ARGS ?= @@ -34,6 +36,8 @@ VALGRIND_ARGS = \ --track-origins=yes \ --fair-sched=try \ --num-callers=25 \ + --error-exitcode=1 \ + --child-silent-after-fork=yes \ --suppressions=$(top_srcdir)/contrib/valgrind.supp \ $(VALGRIND_EXTRA_ARGS) @@ -44,8 +48,14 @@ gtest_LDADD = \ $(top_builddir)/src/ucs/libucs.la \ $(top_builddir)/src/uct/libuct.la \ $(top_builddir)/src/ucm/libucm.la \ - $(top_builddir)/src/ucp/libucp.la \ - $(top_builddir)/src/ucg/libucg.la \ + $(top_builddir)/src/ucp/libucp.la + +if HAVE_UCG +gtest_LDADD += \ + $(top_builddir)/src/ucg/libucg.la +endif + +gtest_LDADD += \ $(top_builddir)/src/tools/perf/lib/libucxperf.la \ $(OPENMP_CFLAGS) \ $(GTEST_LIBS) @@ -62,12 +72,15 @@ gtest_CPPFLAGS = \ gtest_LDFLAGS = $(GTEST_LDFLAGS) -no-install -Wl,-dynamic-list-data gtest_CFLAGS = $(BASE_CFLAGS) -gtest_CXXFLAGS = $(BASE_CXXFLAGS) $(GTEST_CXXFLAGS) -fno-tree-vectorize \ - -DGTEST_UCM_HOOK_LIB_DIR="\"${abs_builddir}/ucm/test_dlopen/.libs\"" +gtest_CXXFLAGS = \ + $(BASE_CXXFLAGS) $(GTEST_CXXFLAGS) \ + -DGTEST_UCM_HOOK_LIB_DIR="\"${abs_builddir}/ucm/test_dlopen/.libs\"" gtest_SOURCES = \ common/gtest-all.cc \ common/main.cc \ + common/test_gtest_cmn.cc \ + common/mem_buffer.cc \ common/test_helpers.cc \ common/test_obj_size.cc \ common/test_watchdog.cc \ @@ -102,10 +115,13 @@ gtest_SOURCES = \ uct/uct_p2p_test.cc \ uct/uct_test.cc \ uct/test_stats.cc \ + ucs/test_event_set.cc \ ucs/test_stats_filter.cc \ uct/test_peer_failure.cc \ uct/test_tag.cc \ + uct/tcp/test_tcp.cc \ \ + ucp/test_ucp_am.cc \ ucp/test_ucp_stream.cc \ ucp/test_ucp_peer_failure.cc \ ucp/test_ucp_atomic.cc \ @@ -114,6 +130,7 @@ gtest_SOURCES = \ ucp/test_ucp_mmap.cc \ ucp/test_ucp_mem_type.cc \ ucp/test_ucp_perf.cc \ + ucp/test_ucp_proto.cc \ ucp/test_ucp_rma.cc \ ucp/test_ucp_rma_mt.cc \ 
ucp/test_ucp_tag_cancel.cc \ @@ -123,6 +140,7 @@ gtest_SOURCES = \ ucp/test_ucp_tag_perf.cc \ ucp/test_ucp_tag_probe.cc \ ucp/test_ucp_tag_xfer.cc \ + ucp/test_ucp_tag_mem_type.cc \ ucp/test_ucp_tag.cc \ ucp/test_ucp_context.cc \ ucp/test_ucp_wireup.cc \ @@ -152,12 +170,18 @@ gtest_SOURCES = \ ucs/test_strided_alloc.cc \ ucs/test_string.cc \ ucs/test_sys.cc \ + ucs/test_topo.cc \ ucs/test_sock.cc \ ucs/test_time.cc \ ucs/test_twheel.cc \ ucs/test_frag_list.cc \ ucs/test_type.cc \ ucs/test_log.cc \ + ucs/test_iov.cc \ + ucs/arch/test_x86_64.cc + +if HAVE_UCG +gtest_SOURCES += \ ucg/ucg_test.cc \ ucg/test_plan.cc \ ucg/test_topo_info.cc \ @@ -169,26 +193,30 @@ gtest_SOURCES = \ ucg/test_ucg_context.cc \ ucg/test_builtin.cc \ ucg/test_cb.cc +endif if HAVE_IB gtest_SOURCES += \ uct/ib/test_ib.cc \ + uct/ib/test_ib_md.cc \ uct/ib/test_cq_moderation.cc \ - uct/ib/test_ib_xfer.cc + uct/ib/test_ib_xfer.cc \ + uct/ib/test_ib_pkey.cc \ + uct/ib/test_ib_event.cc gtest_CPPFLAGS += \ $(IBVERBS_CPPFLAGS) gtest_LDADD += \ $(IBVERBS_LDFLAGS) \ $(top_builddir)/src/uct/ib/libuct_ib.la -if HAVE_MLX5_HW +if HAVE_DEVX gtest_SOURCES += \ - uct/ib/test_ib_md.cc + uct/ib/test_devx.cc endif if HAVE_TL_UD gtest_SOURCES += \ uct/ib/ud_base.cc \ uct/ib/test_ud.cc \ - uct/ib/test_ud_slow_timer.cc \ + uct/ib/test_ud_timer.cc \ uct/ib/test_ud_pending.cc \ uct/ib/test_ud_ds.cc endif @@ -209,13 +237,31 @@ endif # HAVE_IB if HAVE_CUDA gtest_SOURCES += \ ucm/cuda_hooks.cc +gtest_CPPFLAGS += \ + $(CUDA_CPPFLAGS) gtest_LDADD += \ $(CUDA_LDFLAGS) \ $(top_builddir)/src/uct/cuda/libuct_cuda.la endif +if HAVE_HIP +if HAVE_GNUXX11 +gtest_SOURCES += \ + ucm/rocm_hooks.cc +gtest_CPPFLAGS += \ + $(HIP_CPPFLAGS) +gtest_CXXFLAGS += \ + $(HIP_CXXFLAGS) +gtest_LDADD += \ + $(HIP_LDFLAGS) \ + $(HIP_LIBS) \ + $(top_builddir)/src/uct/rocm/libuct_rocm.la +endif +endif + noinst_HEADERS = \ common/gtest.h \ + common/mem_buffer.h \ common/test.h \ common/test_helpers.h \ common/test_perf.h \ @@ -223,6 +269,7 @@ 
noinst_HEADERS = \ \ uct/ib/test_rc.h \ uct/ib/ud_base.h \ + uct/ib/test_ib.h \ uct/test_amo.h \ uct/test_p2p_mix.h \ uct/test_p2p_rma.h \ @@ -234,10 +281,14 @@ noinst_HEADERS = \ ucp/test_ucp_memheap.h \ ucp/test_ucp_tag.h \ ucp/ucp_test.h \ - ucp/ucp_datatype.h \ + ucp/ucp_datatype.h + +if HAVE_UCG +noinst_HEADERS += \ ucg/ucg_test.h \ ucg/ucg_plan_test.h \ ucg/test_op.h +endif .PHONY: test test gdb valgrind fix_rpath ucx diff --git a/test/gtest/common/gtest-all.cc b/test/gtest/common/gtest-all.cc index 704b9f64101..fa67e68e6ab 100644 --- a/test/gtest/common/gtest-all.cc +++ b/test/gtest/common/gtest-all.cc @@ -4260,6 +4260,12 @@ void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) { void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { ColoredPrintf(COLOR_GREEN, "[ RUN ] "); PrintTestName(test_info.test_case_name(), test_info.name()); + if (test_info.type_param() != NULL) { + printf(" <%s>", test_info.type_param()); + } + if (test_info.value_param() != NULL) { + printf(" <%s>", test_info.value_param()); + } printf("\n"); fflush(stdout); } diff --git a/test/gtest/common/main.cc b/test/gtest/common/main.cc index d98b7325d04..c8f39543d6a 100644 --- a/test/gtest/common/main.cc +++ b/test/gtest/common/main.cc @@ -14,10 +14,12 @@ #include "test_helpers.h" #include "tap.h" + static int ucs_gtest_random_seed = -1; int ucs::perf_retry_count = 0; /* 0 - don't check performance */ double ucs::perf_retry_interval = 1.0; + void parse_test_opts(int argc, char **argv) { int c; while ((c = getopt(argc, argv, "s:p:i:")) != -1) { @@ -42,7 +44,7 @@ static void modify_config_for_valgrind(const char *name, const char *value) { char full_name[128]; - snprintf(full_name, sizeof(full_name), "%s%s", UCS_CONFIG_PREFIX, name); + snprintf(full_name, sizeof(full_name), "%s%s", UCS_DEFAULT_ENV_PREFIX, name); if (getenv(full_name) == NULL) { UCS_TEST_MESSAGE << " Setting for valgrind: " << full_name << "=" << value; @@ -51,8 +53,8 @@ static void 
modify_config_for_valgrind(const char *name, const char *value) } int main(int argc, char **argv) { - // coverity[fun_call_w_exception]: uncaught exceptions cause nonzero exit anyway, so don't warn. - ::testing::InitGoogleTest(&argc, argv); + // coverity[fun_call_w_exception]: uncaught exceptions cause nonzero exit anyway, so don't warn. + ::testing::InitGoogleTest(&argc, argv); char *str = getenv("GTEST_TAP"); int ret; @@ -84,6 +86,7 @@ int main(int argc, char **argv) { modify_config_for_valgrind("CM_TIMEOUT", "600ms"); modify_config_for_valgrind("TCP_TX_BUFS_GROW", "512"); modify_config_for_valgrind("TCP_RX_BUFS_GROW", "512"); + modify_config_for_valgrind("TCP_RX_SEG_SIZE", "16k"); ucm_global_opts.enable_malloc_reloc = 1; /* Test reloc hooks with valgrind, though it's generally unsafe. */ } @@ -100,5 +103,7 @@ int main(int argc, char **argv) { ucs::watchdog_stop(); + ucs::analyze_test_results(); + return ret; } diff --git a/test/gtest/common/mem_buffer.cc b/test/gtest/common/mem_buffer.cc new file mode 100644 index 00000000000..fcad446aecc --- /dev/null +++ b/test/gtest/common/mem_buffer.cc @@ -0,0 +1,293 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "mem_buffer.h" + +#include +#include +#include + +#if HAVE_CUDA +# include +# include + +#define CUDA_CALL(_code) \ + do { \ + cudaError_t cerr = _code; \ + if (cerr != cudaSuccess) { \ + UCS_TEST_ABORT(# _code << " failed"); \ + } \ + } while (0) + +#endif + +#if HAVE_ROCM +# include + +#define ROCM_CALL(_code) \ + do { \ + hipError_t cerr = _code; \ + if (cerr != hipSuccess) { \ + UCS_TEST_ABORT(# _code << " failed"); \ + } \ + } while (0) + +#endif + + +std::vector mem_buffer::supported_mem_types() +{ + static std::vector vec; + + if (vec.empty()) { + vec.push_back(UCS_MEMORY_TYPE_HOST); +#if HAVE_CUDA + vec.push_back(UCS_MEMORY_TYPE_CUDA); + vec.push_back(UCS_MEMORY_TYPE_CUDA_MANAGED); +#endif +#if HAVE_ROCM + vec.push_back(UCS_MEMORY_TYPE_ROCM); + vec.push_back(UCS_MEMORY_TYPE_ROCM_MANAGED); +#endif + } + + return vec; +} + +void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type) +{ + void *ptr; + + switch (mem_type) { + case UCS_MEMORY_TYPE_HOST: + ptr = malloc(size); + if (ptr == NULL) { + UCS_TEST_ABORT("malloc() failed"); + } + return ptr; +#if HAVE_CUDA + case UCS_MEMORY_TYPE_CUDA: + CUDA_CALL(cudaMalloc(&ptr, size)); + return ptr; + case UCS_MEMORY_TYPE_CUDA_MANAGED: + CUDA_CALL(cudaMallocManaged(&ptr, size)); + return ptr; +#endif +#if HAVE_ROCM + case UCS_MEMORY_TYPE_ROCM: + ROCM_CALL(hipMalloc(&ptr, size)); + return ptr; + case UCS_MEMORY_TYPE_ROCM_MANAGED: + ROCM_CALL(hipMallocManaged(&ptr, size)); + return ptr; +#endif + default: + UCS_TEST_SKIP_R(std::string(ucs_memory_type_names[mem_type]) + + " memory is not supported"); + } +} + +void mem_buffer::release(void *ptr, ucs_memory_type_t mem_type) +{ + switch (mem_type) { + case UCS_MEMORY_TYPE_HOST: + free(ptr); + break; +#if HAVE_CUDA + case UCS_MEMORY_TYPE_CUDA: + case UCS_MEMORY_TYPE_CUDA_MANAGED: + CUDA_CALL(cudaFree(ptr)); + break; +#endif +#if HAVE_ROCM + case UCS_MEMORY_TYPE_ROCM: + case 
UCS_MEMORY_TYPE_ROCM_MANAGED: + ROCM_CALL(hipFree(ptr)); + break; +#endif + default: + break; + } +} + +void mem_buffer::pattern_fill(void *buffer, size_t length, uint64_t seed) +{ + uint64_t *ptr = (uint64_t*)buffer; + char *end = (char *)buffer + length; + + while ((char*)(ptr + 1) <= end) { + *ptr = seed; + seed = pat(seed); + ++ptr; + } + memcpy(ptr, &seed, end - (char*)ptr); +} + +void mem_buffer::pattern_check(const void *buffer, size_t length, uint64_t seed) +{ + const char* end = (const char*)buffer + length; + const uint64_t *ptr = (const uint64_t*)buffer; + + while ((const char*)(ptr + 1) <= end) { + if (*ptr != seed) { + UCS_TEST_ABORT("At offset " << ((const char*)ptr - (const char*)buffer) << ": " << + "Expected: 0x" << std::hex << seed << " " << + "Got: 0x" << std::hex << (*ptr) << std::dec); + } + seed = pat(seed); + ++ptr; + } + + size_t remainder = (end - (const char*)ptr); + if (remainder > 0) { + ucs_assert(remainder < sizeof(*ptr)); + uint64_t mask = UCS_MASK_SAFE(remainder * 8 * sizeof(char)); + uint64_t value = 0; + memcpy(&value, ptr, remainder); + if (value != (seed & mask)) { + UCS_TEST_ABORT("At offset " << ((const char*)ptr - (const char*)buffer) << + " (remainder " << remainder << ") : " << + "Expected: 0x" << std::hex << (seed & mask) << " " << + "Mask: 0x" << std::hex << mask << " " << + "Got: 0x" << std::hex << value << std::dec); + } + } +} + +void mem_buffer::pattern_check(const void *buffer, size_t length) +{ + if (length > sizeof(uint64_t)) { + pattern_check(buffer, length, *(const uint64_t*)buffer); + } +} + +void mem_buffer::pattern_fill(void *buffer, size_t length, uint64_t seed, + ucs_memory_type_t mem_type) +{ + if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(mem_type)) { + pattern_fill(buffer, length, seed); + } else { + ucs::auto_buffer temp(length); + pattern_fill(*temp, length, seed); + copy_to(buffer, *temp, length, mem_type); + } +} + +void mem_buffer::pattern_check(const void *buffer, size_t length, uint64_t seed, + 
ucs_memory_type_t mem_type) +{ + if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(mem_type)) { + pattern_check(buffer, length, seed); + } else { + ucs::auto_buffer temp(length); + copy_from(*temp, buffer, length, mem_type); + pattern_check(*temp, length, seed); + } +} + +void mem_buffer::copy_to(void *dst, const void *src, size_t length, + ucs_memory_type_t dst_mem_type) +{ + switch (dst_mem_type) { + case UCS_MEMORY_TYPE_HOST: + case UCS_MEMORY_TYPE_CUDA_MANAGED: + case UCS_MEMORY_TYPE_ROCM_MANAGED: + memcpy(dst, src, length); + break; +#if HAVE_CUDA + case UCS_MEMORY_TYPE_CUDA: + CUDA_CALL(cudaMemcpy(dst, src, length, cudaMemcpyHostToDevice)); + CUDA_CALL(cudaDeviceSynchronize()); + break; +#endif +#if HAVE_ROCM + case UCS_MEMORY_TYPE_ROCM: + ROCM_CALL(hipMemcpy(dst, src, length, hipMemcpyHostToDevice)); + ROCM_CALL(hipDeviceSynchronize()); + break; +#endif + default: + abort_wrong_mem_type(dst_mem_type); + } +} + +void mem_buffer::copy_from(void *dst, const void *src, size_t length, + ucs_memory_type_t src_mem_type) +{ + switch (src_mem_type) { + case UCS_MEMORY_TYPE_HOST: + case UCS_MEMORY_TYPE_CUDA_MANAGED: + case UCS_MEMORY_TYPE_ROCM_MANAGED: + memcpy(dst, src, length); + break; +#if HAVE_CUDA + case UCS_MEMORY_TYPE_CUDA: + CUDA_CALL(cudaMemcpy(dst, src, length, cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaDeviceSynchronize()); + break; +#endif +#if HAVE_ROCM + case UCS_MEMORY_TYPE_ROCM: + ROCM_CALL(hipMemcpy(dst, src, length, hipMemcpyDeviceToHost)); + ROCM_CALL(hipDeviceSynchronize()); + break; +#endif + default: + abort_wrong_mem_type(src_mem_type); + } +} + +bool mem_buffer::compare(const void *expected, const void *buffer, + size_t length, ucs_memory_type_t mem_type) +{ + if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(mem_type)) { + return memcmp(expected, buffer, length) == 0; + } else { + ucs::auto_buffer temp(length); + copy_from(*temp, buffer, length, mem_type); + return memcmp(expected, *temp, length) == 0; + } +} + +std::string mem_buffer::mem_type_name(ucs_memory_type_t 
mem_type) +{ + return ucs_memory_type_names[mem_type]; +} + +void mem_buffer::abort_wrong_mem_type(ucs_memory_type_t mem_type) { + UCS_TEST_ABORT("Wrong buffer memory type " + mem_type_name(mem_type)); +} + +uint64_t mem_buffer::pat(uint64_t prev) { + /* LFSR pattern */ + static const uint64_t polynom = 1337; + return (prev << 1) | (__builtin_parityl(prev & polynom) & 1); +} + +mem_buffer::mem_buffer(size_t size, ucs_memory_type_t mem_type) : + m_mem_type(mem_type), m_ptr(allocate(size, mem_type)), m_size(size) { +} + +mem_buffer::~mem_buffer() { + release(ptr(), mem_type()); +} + +ucs_memory_type_t mem_buffer::mem_type() const { + return m_mem_type; +} + +void *mem_buffer::ptr() const { + return m_ptr; +} + +size_t mem_buffer::size() const { + return m_size; +} diff --git a/test/gtest/common/mem_buffer.h b/test/gtest/common/mem_buffer.h new file mode 100644 index 00000000000..134e36684d0 --- /dev/null +++ b/test/gtest/common/mem_buffer.h @@ -0,0 +1,82 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef GTEST_MEM_BUFFER_H_ +#define GTEST_MEM_BUFFER_H_ + +#include +#include +#include +#include + + +/** + * Wrapper and utility functions for memory type buffers, e.g buffers which are + * not necessarily allocated on host memory, such as cuda, rocm, etc. 
+ */ +class mem_buffer { +public: + static std::vector supported_mem_types(); + + /* allocate buffer of a given memory type */ + static void *allocate(size_t size, ucs_memory_type_t mem_type); + + /* release buffer of a given memory type */ + static void release(void *ptr, ucs_memory_type_t mem_type); + + /* fill pattern in a host-accessible buffer */ + static void pattern_fill(void *buffer, size_t length, uint64_t seed); + + /* check pattern in a host-accessible buffer */ + static void pattern_check(const void *buffer, size_t length, uint64_t seed); + + /* check pattern in a host-accessible buffer, take seed from 1st word */ + static void pattern_check(const void *buffer, size_t length); + + /* fill pattern in a memtype buffer */ + static void pattern_fill(void *buffer, size_t length, uint64_t seed, + ucs_memory_type_t mem_type); + + /* check pattern in a memtype buffer */ + static void pattern_check(const void *buffer, size_t length, uint64_t seed, + ucs_memory_type_t mem_type); + + /* copy from host memory to memtype buffer */ + static void copy_to(void *dst, const void *src, size_t length, + ucs_memory_type_t dst_mem_type); + + /* copy from memtype buffer to host memory */ + static void copy_from(void *dst, const void *src, size_t length, + ucs_memory_type_t src_mem_type); + + /* compare memtype buffer with host memory, return true if equal */ + static bool compare(const void *expected, const void *buffer, + size_t length, ucs_memory_type_t mem_type); + + /* return the string name of a memory type */ + static std::string mem_type_name(ucs_memory_type_t mem_type); + + mem_buffer(size_t size, ucs_memory_type_t mem_type); + virtual ~mem_buffer(); + + ucs_memory_type_t mem_type() const; + + void *ptr() const; + + size_t size() const; + +private: + static void abort_wrong_mem_type(ucs_memory_type_t mem_type); + + static uint64_t pat(uint64_t prev); + + const ucs_memory_type_t m_mem_type; + void * const m_ptr; + const size_t m_size; +}; + + +#endif diff --git 
a/test/gtest/common/test.cc b/test/gtest/common/test.cc index 1ee096a171d..4642bd6ba7f 100644 --- a/test/gtest/common/test.cc +++ b/test/gtest/common/test.cc @@ -10,6 +10,8 @@ #include #include +#include + namespace ucs { pthread_mutex_t test_base::m_logger_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -86,7 +88,7 @@ void test_base::get_config(const std::string& name, std::string& value, size_t m max); if (status != UCS_OK) { GTEST_FAIL() << "Invalid UCS configuration for " << name - << ", error message: " << ucs_status_string(status) + << ": " << ucs_status_string(status) << "(" << status << ")"; } } @@ -126,7 +128,9 @@ void test_base::pop_config() ucs_log_func_rc_t test_base::count_warns_logger(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap) + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) { pthread_mutex_lock(&m_logger_mutex); if (level == UCS_LOG_LEVEL_ERROR) { @@ -141,14 +145,28 @@ test_base::count_warns_logger(const char *file, unsigned line, const char *funct std::string test_base::format_message(const char *message, va_list ap) { const size_t buffer_size = ucs_log_get_buffer_size(); - char buf[buffer_size]; - vsnprintf(buf, buffer_size, message, ap); - return std::string(buf); + std::string buf(buffer_size, '\0'); + vsnprintf(&buf[0], buffer_size, message, ap); + buf.resize(strlen(buf.c_str())); + return buf; +} + +void test_base::push_debug_message_with_limit(std::vector& vec, + const std::string& message, + const size_t limit) { + if (vec.size() >= limit) { + UCS_TEST_ABORT("aborting after " + ucs::to_string(vec.size()) + + " error messages (" + message + ")"); + } + + vec.push_back(message); } ucs_log_func_rc_t test_base::hide_errors_logger(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap) + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char 
*message, va_list ap) { if (level == UCS_LOG_LEVEL_ERROR) { pthread_mutex_lock(&m_logger_mutex); @@ -160,13 +178,16 @@ test_base::hide_errors_logger(const char *file, unsigned line, const char *funct pthread_mutex_unlock(&m_logger_mutex); } - ucs_log_default_handler(file, line, function, level, message, ap); + ucs_log_default_handler(file, line, function, level, + &ucs_global_opts.log_component, message, ap); return UCS_LOG_FUNC_RC_STOP; } ucs_log_func_rc_t test_base::hide_warns_logger(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap) + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) { if (level == UCS_LOG_LEVEL_WARN) { pthread_mutex_lock(&m_logger_mutex); @@ -178,13 +199,16 @@ test_base::hide_warns_logger(const char *file, unsigned line, const char *functi pthread_mutex_unlock(&m_logger_mutex); } - ucs_log_default_handler(file, line, function, level, message, ap); + ucs_log_default_handler(file, line, function, level, + &ucs_global_opts.log_component, message, ap); return UCS_LOG_FUNC_RC_STOP; } ucs_log_func_rc_t test_base::wrap_errors_logger(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap) + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) { /* Ignore warnings about empty memory pool */ if (level == UCS_LOG_LEVEL_ERROR) { @@ -192,7 +216,7 @@ test_base::wrap_errors_logger(const char *file, unsigned line, const char *funct std::istringstream iss(format_message(message, ap)); std::string text; while (getline(iss, text, '\n')) { - m_errors.push_back(text); + push_debug_message_with_limit(m_errors, text, 1000); UCS_TEST_MESSAGE << "< " << text << " >"; } pthread_mutex_unlock(&m_logger_mutex); @@ -209,10 +233,12 @@ void test_base::SetUpProxy() { m_num_errors_before = m_total_errors; m_errors.clear(); + m_warnings.clear(); 
m_num_log_handlers_before = ucs_log_num_handlers(); ucs_log_push_handler(count_warns_logger); try { + check_skip_test(); init(); m_initialized = true; m_state = RUNNING; @@ -229,6 +255,8 @@ void test_base::TearDownProxy() { m_state == ABORTED, "state=%d", m_state); + watchdog_signal(); + if (m_initialized) { cleanup(); } @@ -307,11 +335,7 @@ void test_base::TestBodyProxy() { m_state = ABORTED; throw; } - } else if (m_state == SKIPPED) { - } else if (m_state == ABORTED) { } - - watchdog_signal(); } void test_base::skipped(const test_skip_exception& e) { @@ -322,6 +346,8 @@ void test_base::skipped(const test_skip_exception& e) { detail::message_stream("SKIP") << "(" << reason << ")"; } m_state = SKIPPED; + skipped_tests.insert(::testing::UnitTest:: + GetInstance()->current_test_info()); } void test_base::init() { diff --git a/test/gtest/common/test.h b/test/gtest/common/test.h index 0991dd29677..a60c2534340 100644 --- a/test/gtest/common/test.h +++ b/test/gtest/common/test.h @@ -7,6 +7,12 @@ #ifndef UCS_TEST_BASE_H #define UCS_TEST_BASE_H +/* gcc 4.3.4 compilation */ +#ifndef UINT8_MAX +#define __STDC_LIMIT_MACROS +#include +#endif + #include "test_helpers.h" #include @@ -40,7 +46,7 @@ class test_base { protected: class scoped_log_handler { - public: +public: scoped_log_handler(ucs_log_func_t handler) { ucs_log_push_handler(handler); } @@ -64,23 +70,33 @@ class test_base { virtual void init(); bool barrier(); + virtual void check_skip_test() = 0; + virtual void test_body() = 0; static ucs_log_func_rc_t count_warns_logger(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap); + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap); static ucs_log_func_rc_t hide_errors_logger(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap); + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + 
const char *message, va_list ap); static ucs_log_func_rc_t hide_warns_logger(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap); + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap); static ucs_log_func_rc_t wrap_errors_logger(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap); + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap); state_t m_state; bool m_initialized; @@ -101,6 +117,10 @@ class test_base { private: void skipped(const test_skip_exception& e); void run(); + static void push_debug_message_with_limit(std::vector& vec, + const std::string& message, + const size_t limit); + static void *thread_func(void *arg); pthread_barrier_t m_barrier; @@ -126,6 +146,15 @@ class test : public testing::Test, public test_base { UCS_TEST_BASE_IMPL; }; +/* + * Base class from generic tests with user-defined parameter + */ +template +class test_with_param : public testing::TestWithParam, public test_base { +public: + UCS_TEST_BASE_IMPL; +}; + /** * UCT/UCP tests common storage for tests entities */ @@ -169,31 +198,37 @@ class entities_storage { /* * Helper macro */ -#define UCS_TEST_(test_case_name, test_name, parent_class, parent_id, num_threads, ...)\ -class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\ - public:\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {\ +#define UCS_TEST_(test_case_name, test_name, parent_class, parent_id, \ + num_threads, skip_cond, skip_reason, ...) 
\ +class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() { \ set_num_threads(num_threads); \ UCS_PP_FOREACH(UCS_TEST_SET_CONFIG, _, __VA_ARGS__) \ } \ - private:\ - virtual void test_body();\ + private: \ + virtual void check_skip_test() { \ + if (skip_cond) { \ + UCS_TEST_SKIP_R(skip_reason); \ + } \ + } \ + virtual void test_body(); \ static ::testing::TestInfo* const test_info_;\ GTEST_DISALLOW_COPY_AND_ASSIGN_(\ GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\ -};\ +}; \ \ ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\ - ::test_info_ =\ - ::testing::internal::MakeAndRegisterTestInfo(\ + ::test_info_ = \ + ::testing::internal::MakeAndRegisterTestInfo( \ #test_case_name, \ (num_threads == 1) ? #test_name : #test_name "/mt_" #num_threads, \ "", "", \ (parent_id), \ parent_class::SetUpTestCase, \ parent_class::TearDownTestCase, \ - new ::testing::internal::TestFactoryImpl<\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\ + new ::testing::internal::TestFactoryImpl< \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>); \ void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::test_body() @@ -202,39 +237,56 @@ void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::test_body() */ #define UCS_TEST_F(test_fixture, test_name, ...)\ UCS_TEST_(test_fixture, test_name, test_fixture, \ - ::testing::internal::GetTypeId(), 1, __VA_ARGS__) + ::testing::internal::GetTypeId(), \ + 1, 0, "", __VA_ARGS__) + + +/* + * Define test fixture with modified configuration and check skip condition + */ +#define UCS_TEST_SKIP_COND_F(test_fixture, test_name, skip_cond, ...) 
\ + UCS_TEST_(test_fixture, test_name, test_fixture, \ + ::testing::internal::GetTypeId(), \ + 1, skip_cond, #skip_cond, __VA_ARGS__) /* * Define test fixture with multiple threads */ -#define UCS_MT_TEST_F(test_fixture, test_name, num_threads, ...)\ +#define UCS_MT_TEST_F(test_fixture, test_name, num_threads, ...) \ UCS_TEST_(test_fixture, test_name, test_fixture, \ - ::testing::internal::GetTypeId(), num_threads, __VA_ARGS__) + ::testing::internal::GetTypeId(), \ + num_threads, 0, "", __VA_ARGS__) /* * Helper macro */ -#define UCS_TEST_P_(test_case_name, test_name, num_threads, ...) \ +#define UCS_TEST_P_(test_case_name, test_name, num_threads, \ + skip_cond, skip_reason, ...) \ class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ : public test_case_name { \ public: \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() { \ set_num_threads(num_threads); \ UCS_PP_FOREACH(UCS_TEST_SET_CONFIG, _, __VA_ARGS__); \ } \ virtual void test_body(); \ private: \ + virtual void check_skip_test() { \ + if (skip_cond) { \ + UCS_TEST_SKIP_R(skip_reason); \ + } \ + } \ static int AddToRegistry() { \ - ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ - GetTestCasePatternHolder(\ - #test_case_name, __FILE__, __LINE__)->AddTestPattern(\ - #test_case_name, \ - (num_threads == 1) ? #test_name : #test_name "/mt_" #num_threads, \ - new ::testing::internal::TestMetaFactory< \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \ - return 0; \ + ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ + GetTestCasePatternHolder( \ + #test_case_name, __FILE__, __LINE__)->AddTestPattern( \ + #test_case_name, \ + (num_threads == 1) ? 
#test_name : #test_name "/mt_" #num_threads, \ + new ::testing::internal::TestMetaFactory< \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \ + return 0; \ } \ static int gtest_registering_dummy_; \ GTEST_DISALLOW_COPY_AND_ASSIGN_(\ @@ -250,13 +302,20 @@ void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::test_body() * Define parameterized test with modified configuration */ #define UCS_TEST_P(test_case_name, test_name, ...) \ - UCS_TEST_P_(test_case_name, test_name, 1, __VA_ARGS__) + UCS_TEST_P_(test_case_name, test_name, 1, 0, "", __VA_ARGS__) + + +/* + * Define parameterized test with modified configuration and check skip condition + */ +#define UCS_TEST_SKIP_COND_P(test_case_name, test_name, skip_cond, ...) \ + UCS_TEST_P_(test_case_name, test_name, 1, skip_cond, #skip_cond, __VA_ARGS__) /* * Define parameterized test with multiple threads */ #define UCS_MT_TEST_P(test_case_name, test_name, num_threads, ...) \ - UCS_TEST_P_(test_case_name, test_name, num_threads, __VA_ARGS__) + UCS_TEST_P_(test_case_name, test_name, num_threads, 0, "", __VA_ARGS__) #endif diff --git a/test/gtest/common/test_gtest_cmn.cc b/test/gtest/common/test_gtest_cmn.cc new file mode 100644 index 00000000000..580dcca690b --- /dev/null +++ b/test/gtest/common/test_gtest_cmn.cc @@ -0,0 +1,18 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#include "test.h" +#include "test_helpers.h" + + +class gtest_common : public ucs::test { +}; + + +UCS_TEST_F(gtest_common, auto_ptr) { + ucs::auto_ptr p(new int); +} + diff --git a/test/gtest/common/test_helpers.cc b/test/gtest/common/test_helpers.cc index a373ed7b446..5e380c2b7a4 100644 --- a/test/gtest/common/test_helpers.cc +++ b/test/gtest/common/test_helpers.cc @@ -9,16 +9,23 @@ #include #include #include +#include + #include +#include namespace ucs { +typedef std::pair test_result_t; + const double test_timeout_in_sec = 60.; const double watchdog_timeout_default = 900.; // 15 minutes static test_watchdog_t watchdog; +std::set< const ::testing::TestInfo*> skipped_tests; + void *watchdog_func(void *arg) { int ret = 0; @@ -208,6 +215,104 @@ void watchdog_stop() pthread_mutex_destroy(&watchdog.mutex); } +static bool test_results_cmp(const test_result_t &a, const test_result_t &b) +{ + return a.second > b.second; +} + +void analyze_test_results() +{ + // GTEST_REPORT_LONGEST_TESTS=100 will report TOP-100 longest tests + /* coverity[tainted_data_return] */ + char *env_p = getenv("GTEST_REPORT_LONGEST_TESTS"); + if (env_p == NULL) { + return; + } + + size_t total_skipped_cnt = skipped_tests.size(); + ::testing::TimeInMillis total_skipped_time = 0; + size_t max_name_size = 0; + std::set< const ::testing::TestInfo*>::iterator skipped_it; + int top_n; + + if (!strcmp(env_p, "*")) { + top_n = std::numeric_limits::max(); + } else { + top_n = atoi(env_p); + if (!top_n) { + return; + } + } + + ::testing::UnitTest *unit_test = ::testing::UnitTest::GetInstance(); + std::vector test_results; + + if (unit_test == NULL) { + ADD_FAILURE() << "Unable to get the Unit Test instance"; + return; + } + + for (int i = 0; i < unit_test->total_test_case_count(); i++) { + const ::testing::TestCase *test_case = unit_test->GetTestCase(i); + if (test_case == NULL) { + ADD_FAILURE() << "Unable to get the Test Case instance with index " + << i; + return; + } + + for (int i = 0; i < 
test_case->total_test_count(); i++) { + const ::testing::TestInfo *test = test_case->GetTestInfo(i); + if (test == NULL) { + ADD_FAILURE() << "Unable to get the Test Info instance with index " + << i; + return; + } + + if (test->should_run()) { + const ::testing::TestResult *result = test->result(); + std::string test_name = test->test_case_name(); + + test_name += "."; + test_name += test->name(); + + test_results.push_back(std::make_pair(test_name, + result->elapsed_time())); + + max_name_size = std::max(test_name.size(), max_name_size); + + skipped_it = skipped_tests.find(test); + if (skipped_it != skipped_tests.end()) { + total_skipped_time += result->elapsed_time(); + skipped_tests.erase(skipped_it); + } + } + } + } + + std::sort(test_results.begin(), test_results.end(), test_results_cmp); + + top_n = std::min((int)test_results.size(), top_n); + if (!top_n) { + return; + } + + // Print TOP- slowest tests + int max_index_size = ucs::to_string(top_n).size(); + std::cout << std::endl << "TOP-" << top_n << " longest tests:" << std::endl; + + for (int i = 0; i < top_n; i++) { + std::cout << std::setw(max_index_size - ucs::to_string(i + 1).size() + 1) + << (i + 1) << ". 
" << test_results[i].first + << std::setw(max_name_size - test_results[i].first.size() + 3) + << " - " << test_results[i].second << " ms" << std::endl; + } + + // Print skipped tests statistics + std::cout << std::endl << "Skipped tests: count - " + << total_skipped_cnt << ", time - " + << total_skipped_time << " ms" << std::endl; +} + int test_time_multiplier() { int factor = 1; @@ -228,14 +333,18 @@ ucs_time_t get_deadline(double timeout_in_sec) int max_tcp_connections() { - int max_conn = 65535 - 1024; /* limit on number of ports */ + static int max_conn = 0; - /* Limit numer of endpoints to number of open files, for TCP */ - struct rlimit rlim; - int ret = getrlimit(RLIMIT_NOFILE, &rlim); - if (ret == 0) { - /* assume no more than 100 fd-s are already used */ - max_conn = ucs_min((static_cast(rlim.rlim_cur) - 100) / 2, max_conn); + if (!max_conn) { + max_conn = 65535 - 1024; /* limit on number of ports */ + + /* Limit numer of endpoints to number of open files, for TCP */ + struct rlimit rlim; + int ret = getrlimit(RLIMIT_NOFILE, &rlim); + if (ret == 0) { + /* assume no more than 100 fd-s are already used */ + max_conn = ucs_min((static_cast(rlim.rlim_cur) - 100) / 2, max_conn); + } } return max_conn; @@ -272,6 +381,39 @@ scoped_setenv::~scoped_setenv() { } } +ucx_env_cleanup::ucx_env_cleanup() { + const size_t prefix_len = strlen(UCS_DEFAULT_ENV_PREFIX); + char **envp; + + for (envp = environ; *envp != NULL; ++envp) { + std::string env_var = *envp; + + if ((env_var.find("=") != std::string::npos) && + (env_var.find(UCS_DEFAULT_ENV_PREFIX, 0, prefix_len) != std::string::npos)) { + ucx_env_storage.push_back(env_var); + } + } + + for (size_t i = 0; i < ucx_env_storage.size(); i++) { + std::string var_name = + ucx_env_storage[i].substr(0, ucx_env_storage[i].find("=")); + + unsetenv(var_name.c_str()); + } +} + +ucx_env_cleanup::~ucx_env_cleanup() { + while (!ucx_env_storage.empty()) { + std::string var_name = + ucx_env_storage.back().substr(0, 
ucx_env_storage.back().find("=")); + std::string var_value = + ucx_env_storage.back().substr(ucx_env_storage.back().find("=") + 1); + + setenv(var_name.c_str(), var_value.c_str(), 1); + ucx_env_storage.pop_back(); + } +} + void safe_sleep(double sec) { ucs_time_t current_time = ucs_get_time(); ucs_time_t end_time = current_time + ucs_time_from_sec(sec); @@ -291,42 +433,95 @@ bool is_inet_addr(const struct sockaddr* ifa_addr) { (ifa_addr->sa_family == AF_INET6); } -bool is_rdmacm_netdev(const char *ifa_name) { +static std::vector read_dir(const std::string& path) +{ + std::vector result; struct dirent *entry; - char path[PATH_MAX]; - char dev_name[16]; - char guid_buf[32]; DIR *dir; - snprintf(path, PATH_MAX, "/sys/class/net/%s/device/infiniband", ifa_name); - dir = opendir(path); + dir = opendir(path.c_str()); if (dir == NULL) { - return false; + goto out_close; } - /* read IB device name */ - for (;;) { - entry = readdir(dir); - if (entry == NULL) { - closedir(dir); - return false; - } else if (entry->d_name[0] != '.') { - ucs_strncpy_zero(dev_name, entry->d_name, sizeof(dev_name)); - break; + for (entry = readdir(dir); entry != NULL; entry = readdir(dir)) { + if (entry->d_name[0] != '.') { + result.push_back(entry->d_name); } } + +out_close: closedir(dir); + return result; +} + +static std::set get_all_rdmacm_net_devices() +{ + static const std::string sysfs_ib_dir = "/sys/class/infiniband"; + static const std::string sysfs_net_dir = "/sys/class/net"; + static const std::string ndevs_fmt = sysfs_ib_dir + + "/%s/ports/%d/gid_attrs/ndevs/0"; + static const std::string node_guid_fmt = sysfs_ib_dir + "/%s/node_guid"; + std::set devices; + char dev_name[32]; + char guid_buf[32]; + ssize_t nread; + int port_num; + + std::vector ndevs = read_dir(sysfs_net_dir); + + /* Enumerate IPoIB and RoCE devices which have direct mapping to an RDMA + * device. 
+ */ + for (size_t i = 0; i < ndevs.size(); ++i) { + std::string infiniband_dir = sysfs_net_dir + "/" + ndevs[i] + + "/device/infiniband"; + if (!read_dir(infiniband_dir).empty()) { + devices.insert(ndevs[i]); + } + } + + /* Enumerate all RoCE devices, including bonding (RoCE LAG). Some devices + * can be found again, but std::set will eliminate the duplicates. + */ + std::vector rdma_devs = read_dir(sysfs_ib_dir); + for (size_t i = 0; i < rdma_devs.size(); ++i) { + const char *ndev_name = rdma_devs[i].c_str(); + + for (port_num = 1; port_num <= 2; ++port_num) { + nread = ucs_read_file_str(dev_name, sizeof(dev_name), 1, + ndevs_fmt.c_str(), ndev_name, port_num); + if (nread <= 0) { + continue; + } + + memset(guid_buf, 0, sizeof(guid_buf)); + nread = ucs_read_file_str(guid_buf, sizeof(guid_buf), 1, + node_guid_fmt.c_str(), ndev_name); + if (nread <= 0) { + continue; + } - /* read node guid */ - memset(guid_buf, 0, sizeof(guid_buf)); - ssize_t nread = ucs_read_file(guid_buf, sizeof(guid_buf), 1, - "/sys/class/infiniband/%s/node_guid", dev_name); - if (nread < 0) { - return false; + /* use the device if node_guid != 0 */ + if (strstr(guid_buf, "0000:0000:0000:0000") == NULL) { + devices.insert(ucs_strtrim(dev_name)); + } + } } - /* use the device if node_guid != 0 */ - return strstr(guid_buf, "0000:0000:0000:0000") == NULL; + return devices; +} + +bool is_rdmacm_netdev(const char *ifa_name) { + static bool initialized = false; + static std::set devices; + + if (!initialized) { + devices = get_all_rdmacm_net_devices(); + initialized = true; + } + + return devices.find(ifa_name) != devices.end(); } uint16_t get_port() { @@ -355,7 +550,7 @@ uint16_t get_port() { EXPECT_EQ(ret, 0); EXPECT_LT(1023, ntohs(ret_addr.sin_port)) ; - port = ret_addr.sin_port; + port = ntohs(ret_addr.sin_port); close(sock_fd); return port; } @@ -364,6 +559,134 @@ void *mmap_fixed_address() { return (void*)0xff0000000; } +std::string compact_string(const std::string &str, size_t length) +{ + if 
(str.length() <= length * 2) { + return str; + } + + return str.substr(0, length) + "..." + str.substr(str.length() - length); +} + +sock_addr_storage::sock_addr_storage() : m_size(0), m_is_valid(false) { + memset(&m_storage, 0, sizeof(m_storage)); +} + +sock_addr_storage::sock_addr_storage(const ucs_sock_addr_t &ucs_sock_addr) { + if (sizeof(m_storage) < ucs_sock_addr.addrlen) { + memset(&m_storage, 0, sizeof(m_storage)); + m_size = 0; + m_is_valid = false; + } else { + set_sock_addr(*ucs_sock_addr.addr, ucs_sock_addr.addrlen); + } +} + +void sock_addr_storage::set_sock_addr(const struct sockaddr &addr, + const size_t size) { + ASSERT_GE(sizeof(m_storage), size); + ASSERT_TRUE(ucs::is_inet_addr(&addr)); + memcpy(&m_storage, &addr, size); + m_size = size; + m_is_valid = true; +} + +void sock_addr_storage::reset_to_any() { + ASSERT_TRUE(m_is_valid); + + if (get_sock_addr_ptr()->sa_family == AF_INET) { + struct sockaddr_in sin = {0}; + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = INADDR_ANY; + sin.sin_port = get_port(); + + set_sock_addr(*(struct sockaddr*)&sin, sizeof(sin)); + } else { + ASSERT_EQ(get_sock_addr_ptr()->sa_family, AF_INET6); + struct sockaddr_in6 sin = {0}; + + sin.sin6_family = AF_INET6; + sin.sin6_addr = in6addr_any; + sin.sin6_port = get_port(); + + set_sock_addr(*(struct sockaddr*)&sin, sizeof(sin)); + } +} + +bool +sock_addr_storage::operator==(const struct sockaddr_storage &sockaddr) const { + ucs_status_t status; + int result = ucs_sockaddr_cmp(get_sock_addr_ptr(), + (const struct sockaddr*)&sockaddr, &status); + ASSERT_UCS_OK(status); + return result == 0; +} + +void sock_addr_storage::set_port(uint16_t port) { + if (get_sock_addr_ptr()->sa_family == AF_INET) { + struct sockaddr_in *addr_in = (struct sockaddr_in *)&m_storage; + addr_in->sin_port = htons(port); + } else { + ASSERT_TRUE(get_sock_addr_ptr()->sa_family == AF_INET6); + struct sockaddr_in6 *addr_in = (struct sockaddr_in6 *)&m_storage; + addr_in->sin6_port = htons(port); + } 
+} + +uint16_t sock_addr_storage::get_port() const { + if (get_sock_addr_ptr()->sa_family == AF_INET) { + struct sockaddr_in *addr_in = (struct sockaddr_in *)&m_storage; + return ntohs(addr_in->sin_port); + } else { + EXPECT_TRUE(get_sock_addr_ptr()->sa_family == AF_INET6); + + struct sockaddr_in6 *addr_in = (struct sockaddr_in6 *)&m_storage; + return ntohs(addr_in->sin6_port); + } +} + +size_t sock_addr_storage::get_addr_size() const { + return m_size; +} + +ucs_sock_addr_t sock_addr_storage::to_ucs_sock_addr() const { + ucs_sock_addr_t addr; + + addr.addr = get_sock_addr_ptr(); + addr.addrlen = m_size; + return addr; +} + +std::string sock_addr_storage::to_str() const { + char str[UCS_SOCKADDR_STRING_LEN]; + return ucs_sockaddr_str(get_sock_addr_ptr(), str, sizeof(str)); +} + +const struct sockaddr* sock_addr_storage::get_sock_addr_ptr() const { + return m_is_valid ? (struct sockaddr *)(&m_storage) : NULL; +} + +std::ostream& operator<<(std::ostream& os, const sock_addr_storage& sa_storage) +{ + return os << ucs::sockaddr_to_str(sa_storage.get_sock_addr_ptr()); +} + +auto_buffer::auto_buffer(size_t size) : m_ptr(malloc(size)) { + if (!m_ptr) { + UCS_TEST_ABORT("Failed to allocate memory"); + } +} + +auto_buffer::~auto_buffer() +{ + free(m_ptr); +} + +void* auto_buffer::operator*() const { + return m_ptr; +}; + namespace detail { message_stream::message_stream(const std::string& title) { @@ -382,4 +705,21 @@ message_stream::~message_stream() { } // detail +std::vector > supported_mem_type_pairs() { + static std::vector > result; + + if (result.empty()) { + result = ucs::make_pairs(mem_buffer::supported_mem_types()); + } + + return result; +} + +void skip_on_address_sanitizer() +{ +#ifdef __SANITIZE_ADDRESS__ + UCS_TEST_SKIP_R("Address sanitizer"); +#endif +} + } // ucs diff --git a/test/gtest/common/test_helpers.h b/test/gtest/common/test_helpers.h index 9294fff9ada..e9f6558f387 100644 --- a/test/gtest/common/test_helpers.h +++ b/test/gtest/common/test_helpers.h 
@@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2012. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (c) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. * * See file LICENSE for terms. @@ -10,11 +10,15 @@ #include "gtest.h" +#include + +#include #include #include #include #include #include + #include #include #include @@ -86,6 +90,15 @@ } while (0) +#define ASSERT_UCS_OK_OR_BUSY(_expr) \ + do { \ + ucs_status_t _status = (_expr); \ + if (((_status) != UCS_OK) && ((_status) != UCS_ERR_BUSY)) { \ + UCS_TEST_ABORT("Error: " << ucs_status_string(_status)); \ + } \ + } while (0) + + #define ASSERT_UCS_PTR_OK(_expr) \ do { \ ucs_status_ptr_t _status = (_expr); \ @@ -97,7 +110,7 @@ #define EXPECT_UD_CHECK(_val1, _val2, _exp_ud, _exp_non_ud) \ do { \ - if ((GetParam()->tl_name == "ud") || (GetParam()->tl_name == "ud_mlx5")) { \ + if (has_ud()) { \ EXPECT_##_exp_ud(_val1, _val2); \ } else { \ EXPECT_##_exp_non_ud(_val1, _val2); \ @@ -159,11 +172,14 @@ } + namespace ucs { extern const double test_timeout_in_sec; extern const double watchdog_timeout_default; +extern std::set< const ::testing::TestInfo*> skipped_tests; + typedef enum { WATCHDOG_STOP, WATCHDOG_RUN, @@ -194,6 +210,8 @@ int watchdog_get_kill_signal(); int watchdog_start(); void watchdog_stop(); +void analyze_test_results(); + class test_abort_exception : public std::exception { }; @@ -248,6 +266,7 @@ ucs_time_t get_deadline(double timeout_in_sec = test_timeout_in_sec); */ int max_tcp_connections(); + /** * Signal-safe sleep. */ @@ -279,6 +298,12 @@ uint16_t get_port(); void *mmap_fixed_address(); +/* + * Returns a compacted string with just head and tail, e.g "xxx...yyy" + */ +std::string compact_string(const std::string &str, size_t length); + + /** * Return the IP address of the given interface address. 
*/ @@ -289,6 +314,42 @@ std::string sockaddr_to_str(const S *saddr) { buffer, UCS_SOCKADDR_STRING_LEN); } +/** + * Wrapper for struct sockaddr_storage to unify work flow for IPv4 and IPv6 + */ +class sock_addr_storage { +public: + sock_addr_storage(); + + sock_addr_storage(const ucs_sock_addr_t &ucs_sock_addr); + + void set_sock_addr(const struct sockaddr &addr, const size_t size); + + void reset_to_any(); + + bool operator==(const struct sockaddr_storage &sockaddr) const; + + void set_port(uint16_t port); + + uint16_t get_port() const; + + size_t get_addr_size() const; + + ucs_sock_addr_t to_ucs_sock_addr() const; + + std::string to_str() const; + + const struct sockaddr* get_sock_addr_ptr() const; + +private: + struct sockaddr_storage m_storage; + size_t m_size; + bool m_is_valid; +}; + + +std::ostream& operator<<(std::ostream& os, const sock_addr_storage& sa_storage); + /* * For gtest's EXPECT_EQ @@ -316,6 +377,11 @@ static inline int rand() { return ::rand(); } +static inline void srand(unsigned seed) { + /* coverity[dont_call] */ + return ::srand(seed); +} + void fill_random(void *data, size_t size); /* C can be vector or string */ @@ -333,7 +399,7 @@ static void fill_random(C& c, size_t size) { template static inline T random_upper() { return static_cast((rand() / static_cast(RAND_MAX)) * - std::numeric_limits::max()); + static_cast(std::numeric_limits::max())); } template @@ -371,6 +437,14 @@ class scoped_setenv { std::string m_old_value; }; +class ucx_env_cleanup { +public: + ucx_env_cleanup(); + ~ucx_env_cleanup(); +private: + std::vector ucx_env_storage; +}; + template std::string to_string(const T& value) { std::stringstream ss; @@ -378,6 +452,13 @@ std::string to_string(const T& value) { return ss.str(); } +template +std::string to_hex_string(const T& value) { + std::stringstream ss; + ss << std::hex << value; + return ss.str(); +} + template T from_string(const std::string& str) { T value; @@ -569,6 +650,10 @@ class handle { return m_initialized ? 
m_value : NULL; } + T operator->() const { + return get(); + } + private: void release() { @@ -588,6 +673,55 @@ class handle { ArgT m_dtor_arg; }; +/* simplified version of std::auto_ptr which was deprecated in newer stdc++ + * versions in favor of unique_ptr */ +template +class auto_ptr { +public: + auto_ptr() : m_ptr(NULL) { + } + + auto_ptr(T* ptr) : m_ptr(NULL) { + reset(ptr); + } + + ~auto_ptr() { + reset(); + } + + void reset(T* ptr = NULL) { + if (m_ptr) { + delete m_ptr; + } + m_ptr = ptr; + } + + operator T*() const { + return m_ptr; + } + + T* operator->() const { + return m_ptr; + } + +private: + auto_ptr(const auto_ptr&); /* disable copy */ + auto_ptr operator=(const auto_ptr&); /* disable assign */ + + T* m_ptr; +}; + +#define UCS_TEST_TRY_CREATE_HANDLE(_t, _handle, _dtor, _ctor, ...) \ + ({ \ + _t h; \ + ucs_status_t status = _ctor(__VA_ARGS__, &h); \ + ASSERT_UCS_OK_OR_BUSY(status); \ + if (status == UCS_OK) { \ + _handle.reset(h, _dtor); \ + } \ + status; \ + }) + #define UCS_TEST_CREATE_HANDLE(_t, _handle, _dtor, _ctor, ...) \ { \ _t h; \ @@ -596,6 +730,17 @@ class handle { _handle.reset(h, _dtor); \ } +#define UCS_TEST_CREATE_HANDLE_IF_SUPPORTED(_t, _handle, _dtor, _ctor, ...) 
\ + { \ + _t h; \ + ucs_status_t status = _ctor(__VA_ARGS__, &h); \ + if (status == UCS_ERR_UNSUPPORTED) { \ + UCS_TEST_SKIP_R(std::string("Unsupported operation: ") + \ + UCS_PP_MAKE_STRING(_ctor)); \ + } \ + ASSERT_UCS_OK(status); \ + _handle.reset(h, _dtor); \ + } class size_value { public: @@ -632,6 +777,23 @@ static inline O& operator<<(O& os, const size_value& sz) return os; } + +class auto_buffer { +public: + auto_buffer(size_t size); + ~auto_buffer(); + void* operator*() const; +private: + void *m_ptr; +}; + + +template +static void deleter(T *ptr) { + delete ptr; +} + + extern int perf_retry_count; extern double perf_retry_interval; @@ -680,6 +842,65 @@ class message_stream { } // detail +/** + * N-ary Cartesian product over the N vectors provided in the input vector + * The cardinality of the result vector: + * output.size = input[0].size * input[1].size * ... * input[input.size].size + */ +template +void cartesian_product(std::vector > &final_output, + std::vector &cur_output, + typename std::vector > + ::const_iterator cur_input, + typename std::vector > + ::const_iterator end_input) { + if (cur_input == end_input) { + final_output.push_back(cur_output); + return; + } + + const std::vector &cur_vector = *cur_input; + + cur_input++; + + for (typename std::vector::const_iterator iter = + cur_vector.begin(); iter != cur_vector.end(); ++iter) { + cur_output.push_back(*iter); + ucs::cartesian_product(final_output, cur_output, + cur_input, end_input); + cur_output.pop_back(); + } +} + +template +void cartesian_product(std::vector > &output, + const std::vector > &input) +{ + std::vector cur_output; + cartesian_product(output, cur_output, input.begin(), input.end()); +} + +template +std::vector > make_pairs(const std::vector &input_vec) { + std::vector > result; + std::vector > input; + + input.push_back(input_vec); + input.push_back(input_vec); + + ucs::cartesian_product(result, input); + + return result; +} + +std::vector > supported_mem_type_pairs(); + + 
+/** + * Skip test if address sanitizer is enabled + */ +void skip_on_address_sanitizer(); + } // ucs #endif /* UCS_TEST_HELPERS_H */ diff --git a/test/gtest/common/test_obj_size.cc b/test/gtest/common/test_obj_size.cc index 8164a109506..91a4dd64f8f 100644 --- a/test/gtest/common/test_obj_size.cc +++ b/test/gtest/common/test_obj_size.cc @@ -41,29 +41,29 @@ UCS_TEST_F(test_obj_size, size) { #if ENABLE_DEBUG_DATA UCS_TEST_SKIP_R("Debug data"); -#elif ENABLE_STATS +#elif defined (ENABLE_STATS) UCS_TEST_SKIP_R("Statistic enabled"); -#elif ENABLE_ASSERT +#elif UCS_ENABLE_ASSERT UCS_TEST_SKIP_R("Assert enabled"); #else EXPECTED_SIZE(ucp_ep_t, 64); - EXPECTED_SIZE(ucp_request_t, 232); + EXPECTED_SIZE(ucp_request_t, 240); EXPECTED_SIZE(ucp_recv_desc_t, 48); EXPECTED_SIZE(uct_ep_t, 8); EXPECTED_SIZE(uct_base_ep_t, 8); EXPECTED_SIZE(uct_rkey_bundle_t, 24); EXPECTED_SIZE(uct_self_ep_t, 8); - EXPECTED_SIZE(uct_tcp_ep_t, 128); + EXPECTED_SIZE(uct_tcp_ep_t, 160); # if HAVE_TL_RC - EXPECTED_SIZE(uct_rc_ep_t, 80); - EXPECTED_SIZE(uct_rc_verbs_ep_t, 88); + EXPECTED_SIZE(uct_rc_ep_t, 64); + EXPECTED_SIZE(uct_rc_verbs_ep_t, 96); # endif # if HAVE_TL_DC EXPECTED_SIZE(uct_dc_mlx5_ep_t, 32); # endif # if HAVE_TL_UD - EXPECTED_SIZE(uct_ud_ep_t, 240); - EXPECTED_SIZE(uct_ud_verbs_ep_t, 256); + EXPECTED_SIZE(uct_ud_ep_t, 248); + EXPECTED_SIZE(uct_ud_verbs_ep_t, 264); # endif #endif } diff --git a/test/gtest/common/test_perf.cc b/test/gtest/common/test_perf.cc index c3a812393a0..0000e1a3e54 100644 --- a/test/gtest/common/test_perf.cc +++ b/test/gtest/common/test_perf.cc @@ -12,7 +12,6 @@ extern "C" { #include #include -#include } #include #include @@ -113,7 +112,7 @@ void test_perf::rte::exchange_vec(void *rte_group, void * req) } void test_perf::rte::report(void *rte_group, const ucx_perf_result_t *result, - void *arg, int is_final) + void *arg, int is_final, int is_multi_thread) { } @@ -193,12 +192,11 @@ test_perf::test_result test_perf::run_multi_threaded(const test_spec &test, unsi 
params.alignment = ucs_get_page_size(); params.max_outstanding = test.max_outstanding; if (ucs::test_time_multiplier() == 1) { - params.warmup_iter = test.iters / 10; - params.max_iter = test.iters; + params.warmup_iter = test.iters / 10; + params.max_iter = test.iters; } else { - params.warmup_iter = 0; - params.max_iter = ucs_min(20u, - test.iters / ucs::test_time_multiplier()); + params.warmup_iter = 0; + params.max_iter = ucs_min(20u, test.iters / ucs::test_time_multiplier()); } params.max_time = 0.0; params.report_interval = 1.0; @@ -206,7 +204,7 @@ test_perf::test_result test_perf::run_multi_threaded(const test_spec &test, unsi params.rte = &rte::test_rte; params.report_arg = NULL; ucs_strncpy_zero(params.uct.dev_name, dev_name.c_str(), sizeof(params.uct.dev_name)); - ucs_strncpy_zero(params.uct.tl_name , tl_name.c_str(), sizeof(params.uct.tl_name)); + ucs_strncpy_zero(params.uct.tl_name , tl_name.c_str(), sizeof(params.uct.tl_name)); params.uct.data_layout = (uct_perf_data_layout_t)test.data_layout; params.uct.fc_window = UCT_PERF_TEST_MAX_FC_WINDOW; params.msg_size_cnt = test.msglencnt; diff --git a/test/gtest/common/test_perf.h b/test/gtest/common/test_perf.h index 795000b1402..3b1a836a0c9 100644 --- a/test/gtest/common/test_perf.h +++ b/test/gtest/common/test_perf.h @@ -74,7 +74,7 @@ class test_perf { static void exchange_vec(void *rte_group, void * req); static void report(void *rte_group, const ucx_perf_result_t *result, - void *arg, int is_final); + void *arg, int is_final, int is_multi_thread); static ucx_perf_rte_t test_rte; diff --git a/test/gtest/configure.m4 b/test/gtest/configure.m4 index 8b518416e10..aff6c1de23b 100644 --- a/test/gtest/configure.m4 +++ b/test/gtest/configure.m4 @@ -4,8 +4,26 @@ # See file LICENSE for terms. 
# +AC_LANG_PUSH([C++]) + +CHECK_COMPILER_FLAG([-fno-tree-vectorize], [-fno-tree-vectorize], + [AC_LANG_SOURCE([[int main(int argc, char** argv){return 0;}]])], + [GTEST_CXXFLAGS="$GTEST_CXXFLAGS -fno-tree-vectorize"], + []) + +# error #236: controlling expression is constant +CHECK_COMPILER_FLAG([--diag_suppress 236], [--diag_suppress 236], + [AC_LANG_SOURCE([[int main(int argc, char** argv){return 0;}]])], + [GTEST_CXXFLAGS="$GTEST_CXXFLAGS --diag_suppress 236"], + []) + +AC_LANG_POP([C++]) + +AC_SUBST([GTEST_CXXFLAGS], [$GTEST_CXXFLAGS]) + test_modules="" m4_include([test/gtest/ucm/test_dlopen/configure.m4]) +m4_include([test/gtest/ucm/test_dlopen/rpath-subdir/configure.m4]) m4_include([test/gtest/ucs/test_module/configure.m4]) AC_DEFINE_UNQUOTED([test_MODULES], ["${test_modules}"], [Test loadable modules]) AC_CONFIG_FILES([test/gtest/Makefile]) diff --git a/test/gtest/ucm/cuda_hooks.cc b/test/gtest/ucm/cuda_hooks.cc index 9e2ce5a191f..a0dddf90cc8 100644 --- a/test/gtest/ucm/cuda_hooks.cc +++ b/test/gtest/ucm/cuda_hooks.cc @@ -7,6 +7,13 @@ #include #include +extern "C" { +#include +#include +#include +#include +} + static ucm_event_t alloc_event, free_event; static void cuda_mem_alloc_callback(ucm_event_type_t event_type, @@ -80,14 +87,14 @@ class cuda_hooks : public ucs::test { void check_mem_alloc_events(void *ptr, size_t size, - int expect_mem_type = UCM_MEM_TYPE_CUDA) { + int expect_mem_type = UCS_MEMORY_TYPE_CUDA) { ASSERT_EQ(ptr, alloc_event.mem_type.address); ASSERT_EQ(size, alloc_event.mem_type.size); ASSERT_EQ(expect_mem_type, alloc_event.mem_type.mem_type); } void check_mem_free_events(void *ptr, size_t size, - int expect_mem_type = UCM_MEM_TYPE_CUDA) { + int expect_mem_type = UCS_MEMORY_TYPE_CUDA) { ASSERT_EQ(ptr, free_event.mem_type.address); ASSERT_EQ(expect_mem_type, free_event.mem_type.mem_type); } @@ -142,7 +149,7 @@ UCS_TEST_F(cuda_hooks, test_cuMemAllocManaged) { ret = cuMemAllocManaged(&dptr, 64, CU_MEM_ATTACH_GLOBAL); ASSERT_EQ(ret, 
CUDA_SUCCESS); - check_mem_alloc_events((void *)dptr, 64, UCM_MEM_TYPE_CUDA_MANAGED); + check_mem_alloc_events((void *)dptr, 64, UCS_MEMORY_TYPE_CUDA_MANAGED); ret = cuMemFree(dptr); ASSERT_EQ(ret, CUDA_SUCCESS); @@ -213,7 +220,7 @@ UCS_TEST_F(cuda_hooks, test_cudaMallocManaged) { ret = cudaMallocManaged(&ptr, 64, cudaMemAttachGlobal); ASSERT_EQ(ret, cudaSuccess); - check_mem_alloc_events(ptr, 64, UCM_MEM_TYPE_CUDA_MANAGED); + check_mem_alloc_events(ptr, 64, UCS_MEMORY_TYPE_CUDA_MANAGED); ret = cudaFree(ptr); ASSERT_EQ(ret, cudaSuccess); @@ -233,3 +240,21 @@ UCS_TEST_F(cuda_hooks, test_cudaMallocPitch) { ASSERT_EQ(ret, cudaSuccess); check_mem_free_events(devPtr, 0); } + +UCS_TEST_F(cuda_hooks, test_get_mem_type_current_device_info) { + ucs_sys_bus_id_t bus_id_ref = {0xffff, 0xff, 0xff, 0xff}; + ucs_sys_bus_id_t bus_id; + cudaError_t ret; + void *devPtr; + ucs_status_t status; + + ret = cudaMalloc(&devPtr, 64); + ASSERT_EQ(ret, cudaSuccess); + + status = ucm_get_mem_type_current_device_info(UCS_MEMORY_TYPE_CUDA, &bus_id); + ASSERT_UCS_OK(status); + ASSERT_NE(memcmp(&bus_id, &bus_id_ref, sizeof(bus_id)), 0); + + ret = cudaFree(devPtr); + ASSERT_EQ(ret, cudaSuccess); +} diff --git a/test/gtest/ucm/malloc_hook.cc b/test/gtest/ucm/malloc_hook.cc index 7086e62f13d..c754f474ffd 100644 --- a/test/gtest/ucm/malloc_hook.cc +++ b/test/gtest/ucm/malloc_hook.cc @@ -36,6 +36,42 @@ extern "C" { _prev = _value; \ } + +class malloc_hook_test_no_events : public ucs::test { +protected: + virtual ~malloc_hook_test_no_events() + { + } + + static void empty_event_callback(ucm_event_type_t event_type, + ucm_event_t *event, void *arg) + { + } + + virtual void init() + { + ucs::test::init(); + m_enable_events = ucm_global_opts.enable_events; + ucm_global_opts.enable_events = 0; + } + + virtual void cleanup() + { + ucm_global_opts.enable_events = m_enable_events; + ucs::test::cleanup(); + } + + int m_enable_events; +}; + +UCS_TEST_F(malloc_hook_test_no_events, empty_event) { + ucs_status_t 
status; + status = ucm_set_event_handler(0, 0, empty_event_callback, NULL); + ASSERT_UCS_OK(status); + ucm_unset_event_handler(0, empty_event_callback, NULL); +} + + template class mhook_thread { public: @@ -68,13 +104,14 @@ class mhook_thread { template class mmap_event { public: - mmap_event(T *test): m_test(test), m_events(0) + mmap_event(T *test): m_test(test), m_events(0), m_external_events(0) { } ~mmap_event() { unset(); + unset_external(); } ucs_status_t set(int events) @@ -88,6 +125,13 @@ class mmap_event { return status; } + ucs_status_t set_external(int events) + { + ucm_set_external_event(events); + m_external_events |= events; + return UCS_OK; + } + void unset() { if (m_events) { @@ -97,9 +141,18 @@ class mmap_event { } } + void unset_external() + { + if (m_external_events) { + ucm_unset_external_event(m_external_events); + m_external_events = 0; + } + } + protected: T *m_test; int m_events; + int m_external_events; static void mem_event_callback(ucm_event_type_t event_type, ucm_event_t *event, @@ -176,6 +229,8 @@ class malloc_hook : public ucs::test { ucs_status_t status; mmap_event event(this); + ucs::skip_on_address_sanitizer(); + m_got_event = 0; ucm_malloc_state_reset(128 * 1024, 128 * 1024); malloc_trim(0); @@ -195,14 +250,6 @@ class malloc_hook : public ucs::test { event.unset(); } - void skip_on_bistro() { - /* BISTRO is disabled under valgrind, we may run tests */ - if ((ucm_global_opts.mmap_hook_mode == UCM_MMAP_HOOK_BISTRO) && - !RUNNING_ON_VALGRIND) { - UCS_TEST_SKIP_R("skipping on BISTRO hooks"); - } - } - public: static int small_alloc_count; static const size_t small_alloc_size = 10000; @@ -211,6 +258,15 @@ class malloc_hook : public ucs::test { static volatile int bistro_call_counter; }; +static bool skip_on_bistro() { + return (ucm_global_opts.mmap_hook_mode == UCM_MMAP_HOOK_BISTRO); +} + +static bool skip_on_bistro_without_valgrind() { + /* BISTRO is disabled under valgrind, we may run tests */ + return (skip_on_bistro() && 
!RUNNING_ON_VALGRIND); +} + int malloc_hook::small_alloc_count = 1000 / ucs::test_time_multiplier(); volatile int malloc_hook::bistro_call_counter = 0; @@ -292,6 +348,7 @@ void test_thread::test() { const size_t small_alloc_size = malloc_hook::small_alloc_size; int num_ptrs_in_range; static volatile uint32_t total_ptrs_in_range = 0; + char *test_str; /* Allocate some pointers with old heap manager */ for (unsigned i = 0; i < 10; ++i) { @@ -341,6 +398,7 @@ void test_thread::test() { void *ptr = malloc(large_alloc_size); EXPECT_GE(m_map_size, large_alloc_size + small_map_size) << m_name; EXPECT_TRUE(is_ptr_in_range(ptr, large_alloc_size, m_map_ranges)) << m_name; + EXPECT_GE(malloc_usable_size(ptr), large_alloc_size); free(ptr); EXPECT_GE(m_unmap_size, large_alloc_size) << m_name; @@ -354,12 +412,20 @@ void test_thread::test() { /* Test setenv */ pthread_mutex_lock(&lock); setenv("TEST", "VALUE", 1); - EXPECT_EQ(std::string("VALUE"), getenv("TEST")); + test_str = getenv("TEST"); + if (test_str != NULL) { + EXPECT_EQ(std::string("VALUE"), test_str); + } else { + UCS_TEST_ABORT("getenv(\"TEST\") returned NULL"); + } pthread_mutex_unlock(&lock); /* Test username */ ucs_get_user_name(); + /* Test usable size */ + EXPECT_GE(malloc_usable_size(ptr_r), small_alloc_size); + /* Test realloc */ ptr_r = realloc(ptr_r, small_alloc_size / 2); free(ptr_r); @@ -441,9 +507,8 @@ void test_thread::test() { m_event.unset(); } -UCS_TEST_F(malloc_hook, single_thread) { - skip_on_bistro(); - +UCS_TEST_SKIP_COND_F(malloc_hook, single_thread, + skip_on_bistro_without_valgrind()) { pthread_barrier_t barrier; pthread_barrier_init(&barrier, NULL, 1); { @@ -452,15 +517,14 @@ UCS_TEST_F(malloc_hook, single_thread) { pthread_barrier_destroy(&barrier); } -UCS_TEST_F(malloc_hook, multi_threads) { +UCS_TEST_SKIP_COND_F(malloc_hook, multi_threads, + skip_on_bistro_without_valgrind()) { typedef mhook_thread thread_t; static const int num_threads = 8; ucs::ptr_vector threads; pthread_barrier_t barrier; 
- skip_on_bistro(); - malloc_trim(0); pthread_barrier_init(&barrier, NULL, num_threads); @@ -479,15 +543,12 @@ UCS_TEST_F(malloc_hook, asprintf) { (void)dlerror(); } -UCS_TEST_F(malloc_hook, fork) { +UCS_TEST_SKIP_COND_F(malloc_hook, fork, "broken") { static const int num_processes = 4; pthread_barrier_t barrier; std::vector pids; pid_t pid; - UCS_TEST_SKIP_R("broken"); - /* coverity[unreachable] */ - for (int i = 0; i < num_processes; ++i) { pid = fork(); if (pid == 0) { @@ -641,6 +702,7 @@ class mmap_hooks { void *buffer; int shmid; ucs_status_t status; + int num_threads; EXPECT_EQ(0u, m_mapped_size) << m_name; EXPECT_EQ(0u, m_unmapped_size) << m_name; @@ -726,8 +788,17 @@ class mmap_hooks { } /* 8. sbrk call - single thread only */ - { - if (!RUNNING_ON_VALGRIND && m_num_threads < 2) { + if (!RUNNING_ON_VALGRIND && (m_num_threads < 2)) { + num_threads = 0; + ucs_sys_enum_threads(enum_threads_cb, &num_threads); + // use sbrk() only if there are 3 threads in the system: + // 1. main thread + // 2. watchdog thread + // 3. 
test thread + // otherwise, another thread can call use malloc/free in same time, + // leading to heap corruption + + if (num_threads <= 3) { /* valgrind failed when sbrk is called directly, * also sbrk is not thread safe */ @@ -754,11 +825,16 @@ class mmap_hooks { std::string m_name; pthread_barrier_t *m_barrier; mmap_event m_event; + + static ucs_status_t enum_threads_cb(pid_t tid, void *ctx) + { + (*(int*)ctx)++; + return UCS_OK; + } }; UCS_TEST_F(malloc_hook_cplusplus, new_delete) { - const size_t size = 8 * 1000 * 1000; set(); @@ -777,23 +853,19 @@ UCS_TEST_F(malloc_hook_cplusplus, new_delete) { malloc_trim(0); - EXPECT_GE(m_unmapped_size, size * 3); + EXPECT_GE(m_unmapped_size, size); unset(); } -UCS_TEST_F(malloc_hook_cplusplus, dynamic_mmap_enable) { - if (RUNNING_ON_VALGRIND) { - UCS_TEST_SKIP_R("skipping on valgrind"); - } - skip_on_bistro(); +UCS_TEST_SKIP_COND_F(malloc_hook_cplusplus, dynamic_mmap_enable, + RUNNING_ON_VALGRIND || skip_on_bistro()) { EXPECT_TRUE(ucm_global_opts.enable_dynamic_mmap_thresh); test_dynamic_mmap_thresh(); } -UCS_TEST_F(malloc_hook_cplusplus, dynamic_mmap_disable) { - skip_on_bistro(); - +UCS_TEST_SKIP_COND_F(malloc_hook_cplusplus, dynamic_mmap_disable, + skip_on_bistro_without_valgrind()) { ucm_global_opts.enable_dynamic_mmap_thresh = 0; test_dynamic_mmap_thresh(); @@ -803,15 +875,14 @@ extern "C" { int ucm_dlmallopt_get(int); }; -UCS_TEST_F(malloc_hook_cplusplus, mallopt) { +UCS_TEST_SKIP_COND_F(malloc_hook_cplusplus, mallopt, + skip_on_bistro_without_valgrind()) { int v; int trim_thresh, mmap_thresh; char *p; size_t size; - skip_on_bistro(); - /* This test can not be run with the other * tests because it assumes that malloc hooks * are not initialized @@ -866,12 +937,7 @@ UCS_TEST_F(malloc_hook_cplusplus, mallopt) { unset(); } -UCS_TEST_F(malloc_hook_cplusplus, mmap_ptrs) { - - if (RUNNING_ON_VALGRIND) { - UCS_TEST_SKIP_R("skipping on valgrind"); - } - +UCS_TEST_SKIP_COND_F(malloc_hook_cplusplus, mmap_ptrs, 
RUNNING_ON_VALGRIND) { ucm_global_opts.enable_dynamic_mmap_thresh = 0; set(); @@ -964,7 +1030,7 @@ UCS_TEST_F(malloc_hook_cplusplus, remap_override_multi_threads) { typedef int (munmap_f_t)(void *addr, size_t len); -UCS_TEST_F(malloc_hook, bistro_patch) { +UCS_TEST_SKIP_COND_F(malloc_hook, bistro_patch, RUNNING_ON_VALGRIND) { const char *symbol = "munmap"; ucm_bistro_restore_point_t *rp = NULL; ucs_status_t status; @@ -974,17 +1040,13 @@ UCS_TEST_F(malloc_hook, bistro_patch) { uint64_t UCS_V_UNUSED patched; uint64_t UCS_V_UNUSED origin; - if (RUNNING_ON_VALGRIND) { - UCS_TEST_SKIP_R("skipping on valgrind"); - } - /* set hook to mmap call */ status = ucm_bistro_patch(symbol, (void*)bistro_hook<0>::munmap, &rp); ASSERT_UCS_OK(status); - EXPECT_NE((intptr_t)rp, NULL); + EXPECT_NE((intptr_t)rp, 0); munmap_f = (munmap_f_t*)ucm_bistro_restore_addr(rp); - EXPECT_NE((intptr_t)munmap_f, NULL); + EXPECT_NE((intptr_t)munmap_f, 0); /* save partial body of patched function */ patched = *(uint64_t*)munmap_f; @@ -1019,14 +1081,10 @@ UCS_TEST_F(malloc_hook, bistro_patch) { #endif } -UCS_TEST_F(malloc_hook, test_event) { +UCS_TEST_SKIP_COND_F(malloc_hook, test_event, RUNNING_ON_VALGRIND) { mmap_event event(this); ucs_status_t status; - if (RUNNING_ON_VALGRIND) { - UCS_TEST_SKIP_R("skipping on valgrind"); - } - status = event.set(UCM_EVENT_VM_MAPPED | UCM_EVENT_VM_UNMAPPED); ASSERT_UCS_OK(status); @@ -1034,20 +1092,13 @@ UCS_TEST_F(malloc_hook, test_event) { ASSERT_UCS_OK(status); } -UCS_TEST_F(malloc_hook, test_event_failed) { +UCS_TEST_SKIP_COND_F(malloc_hook, test_event_failed, + RUNNING_ON_VALGRIND || !skip_on_bistro()) { mmap_event event(this); ucs_status_t status; const char *symbol_munmap = "munmap"; const char *symbol_madvise = "madvise"; - if (RUNNING_ON_VALGRIND) { - UCS_TEST_SKIP_R("skipping on valgrind"); - } - - if (ucm_global_opts.mmap_hook_mode != UCM_MMAP_HOOK_BISTRO) { - UCS_TEST_SKIP_R("skipping on non-BISTRO hooks"); - } - status = event.set(UCM_EVENT_MUNMAP | 
UCM_EVENT_VM_UNMAPPED); ASSERT_UCS_OK(status); @@ -1056,6 +1107,8 @@ UCS_TEST_F(malloc_hook, test_event_failed) { bistro_patch patch(symbol_munmap, (void*)bistro_hook<0>::munmap); EXPECT_TRUE(ucm_test_events(UCM_EVENT_MUNMAP) == UCS_ERR_UNSUPPORTED); EXPECT_TRUE(ucm_test_events(UCM_EVENT_VM_UNMAPPED) == UCS_ERR_UNSUPPORTED); + EXPECT_TRUE(ucm_test_events(UCM_EVENT_MUNMAP | UCM_EVENT_VM_UNMAPPED) == + UCS_ERR_UNSUPPORTED); } /* set hook to madvise call */ { @@ -1065,18 +1118,67 @@ UCS_TEST_F(malloc_hook, test_event_failed) { } } -UCS_TEST_F(malloc_hook, test_event_unmap) { +UCS_TEST_SKIP_COND_F(malloc_hook, test_external_event, + RUNNING_ON_VALGRIND || !skip_on_bistro()) { mmap_event event(this); ucs_status_t status; - const char *symbol = "munmap"; + const char *symbol_munmap = "munmap"; + const char *symbol_madvise = "madvise"; - if (RUNNING_ON_VALGRIND) { - UCS_TEST_SKIP_R("skipping on valgrind"); - } + status = event.set_external(UCM_EVENT_VM_UNMAPPED); + ASSERT_UCS_OK(status); - if (ucm_global_opts.mmap_hook_mode != UCM_MMAP_HOOK_BISTRO) { - UCS_TEST_SKIP_R("skipping on non-BISTRO hooks"); + /* set hook to munmap call */ + { + bistro_patch patch(symbol_munmap, (void*)bistro_hook<0>::munmap); + /* OK due to UCM_EVENT_MUNMAP is not external */ + EXPECT_TRUE(ucm_test_external_events(UCM_EVENT_MUNMAP) == UCS_OK); + /* should fail */ + EXPECT_TRUE(ucm_test_external_events(UCM_EVENT_VM_UNMAPPED) == UCS_ERR_UNSUPPORTED); + /* should fail */ + EXPECT_TRUE(ucm_test_external_events(UCM_EVENT_MUNMAP | UCM_EVENT_VM_UNMAPPED) == + UCS_ERR_UNSUPPORTED); } + /* set hook to madvise call */ + { + bistro_patch patch(symbol_madvise, (void*)bistro_hook<0>::madvise); + /* OK due to UCM_EVENT_MADVISE is not external */ + EXPECT_TRUE(ucm_test_external_events(UCM_EVENT_MADVISE) == UCS_OK); + EXPECT_TRUE(ucm_test_external_events(UCM_EVENT_VM_UNMAPPED) == UCS_ERR_UNSUPPORTED); + } + /* set hook to munmap/madvise call which notify vm_unmap */ + { + bistro_patch patch_unmap(symbol_munmap, 
(void*)bistro_hook<1>::munmap); + bistro_patch patch_advise(symbol_madvise, (void*)bistro_hook<1>::madvise); + /* OK due to UCM_EVENT_MUNMAP is not external */ + EXPECT_TRUE(ucm_test_external_events(UCM_EVENT_MUNMAP) == UCS_OK); + /* OK due to UCM_EVENT_MADVISE is not external */ + EXPECT_TRUE(ucm_test_external_events(UCM_EVENT_MADVISE) == UCS_OK); + /* should be OK */ + EXPECT_TRUE(ucm_test_external_events(UCM_EVENT_VM_UNMAPPED) == UCS_OK); + EXPECT_TRUE(ucm_test_external_events(UCM_EVENT_MUNMAP | UCM_EVENT_VM_UNMAPPED) == + UCS_OK); + } + /* set hook to munmap & madvise call, but madvise does NOT notify vm_unmap */ + { + bistro_patch patch_unmap(symbol_munmap, (void*)bistro_hook<1>::munmap); + bistro_patch patch_advise(symbol_madvise, (void*)bistro_hook<0>::madvise); + /* OK due to UCM_EVENT_MUNMAP is not external */ + EXPECT_TRUE(ucm_test_external_events(UCM_EVENT_MUNMAP) == UCS_OK); + /* OK due to UCM_EVENT_MADVISE is not external */ + EXPECT_TRUE(ucm_test_external_events(UCM_EVENT_MADVISE) == UCS_OK); + /* should fail */ + EXPECT_TRUE(ucm_test_external_events(UCM_EVENT_VM_UNMAPPED) == UCS_ERR_UNSUPPORTED); + EXPECT_TRUE(ucm_test_external_events(UCM_EVENT_MUNMAP | UCM_EVENT_VM_UNMAPPED) == + UCS_ERR_UNSUPPORTED); + } +} + +UCS_TEST_SKIP_COND_F(malloc_hook, test_event_unmap, + RUNNING_ON_VALGRIND || !skip_on_bistro()) { + mmap_event event(this); + ucs_status_t status; + const char *symbol = "munmap"; status = event.set(UCM_EVENT_MMAP | UCM_EVENT_MUNMAP | UCM_EVENT_VM_UNMAPPED); ASSERT_UCS_OK(status); @@ -1097,72 +1199,237 @@ UCS_TEST_F(malloc_hook, test_event_unmap) { EXPECT_TRUE(status == UCS_OK); } -/* test for mmap events are fired from non-direct load modules - * we are trying to load lib1, from lib1 load lib2, and - * fire mmap event from lib2 */ -UCS_TEST_F(malloc_hook, dlopen) { +class malloc_hook_dlopen : public malloc_hook { +protected: + class library { + public: + typedef void* (*loader_t)(const char*, int); + + library(loader_t loader, const 
std::string &name = ""): + m_loader(loader), m_name(name), m_lib(NULL) + { + } + + ~library() + { + close(); + } + + void *open(const std::string name = "") + { + if (!name.empty()) { + m_name = name; + } + + close(); + + return (m_lib = m_loader(m_name.empty() ? NULL : m_name.c_str(), RTLD_NOW)); + } + + void attach(void *lib) + { + close(); + m_lib = lib; + } + + void close() + { + if (m_lib != NULL) { + dlclose(m_lib); + m_lib = NULL; + } + } + + operator bool() + { + return m_lib != NULL; + } + + void* sym(const char *name) + { + return dlsym(m_lib, name); + } + + protected: + loader_t m_loader; + std::string m_name; + void *m_lib; + }; + +public: + typedef library::loader_t loader_t; + + static std::string get_lib_dir() { #ifndef GTEST_UCM_HOOK_LIB_DIR # error "Missing build configuration" #else - typedef void (fire_mmap_f)(void); - typedef void* (load_lib_f)(const char *path); - - const char *libdlopen_load = "/libdlopen_test_do_load.so"; - const char *libdlopen_mmap = "/libdlopen_test_do_mmap.so"; - const char *load_lib = "load_lib"; - const char *fire_mmap = "fire_mmap"; - - std::string lib_load; - std::string lib_mmap; - void *lib; - void *lib2; - load_lib_f *load; - fire_mmap_f *fire; - ucs_status_t status; - mmap_event event(this); + return GTEST_UCM_HOOK_LIB_DIR; +#endif + } - status = event.set(UCM_EVENT_VM_MAPPED); - ASSERT_UCS_OK(status); + static std::string get_lib_path_do_load() { + return get_lib_dir() + "/libdlopen_test_do_load.so"; + } - lib_load = std::string(GTEST_UCM_HOOK_LIB_DIR) + libdlopen_load; - lib_mmap = std::string(GTEST_UCM_HOOK_LIB_DIR) + libdlopen_mmap; + static std::string get_lib_path_do_mmap() { + return get_lib_dir() + "/libdlopen_test_do_mmap.so"; + } - UCS_TEST_MESSAGE << "Loading " << lib_load; - UCS_TEST_MESSAGE << "Loading " << lib_mmap; + static std::string get_lib_path_do_load_rpath() { + return get_lib_dir() + "/libdlopen_test_do_load_rpath.so"; + } - lib = dlopen(lib_load.c_str(), RTLD_NOW); - 
EXPECT_NE((uintptr_t)lib, (uintptr_t)NULL); - if (!lib) { - goto no_lib; + static std::string get_lib_path_do_load_sub_rpath() { + return "libdlopen_test_rpath.so"; // library should be located using rpath } - load = (load_lib_f*)dlsym(lib, load_lib); - EXPECT_NE((uintptr_t)load, (uintptr_t)NULL); - if (!load) { - goto no_load; + /* test for mmap events are fired from non-direct load modules + * we are trying to load lib1, from lib1 load lib2, and + * fire mmap event from lib2 */ + void test_indirect_dlopen(loader_t loader) + { + typedef void (*fire_mmap_f)(void); + typedef void* (*load_lib_f)(const char *path, void* (*func)(const char*, int)); + + const char *load_lib = "load_lib"; + const char *fire_mmap = "fire_mmap"; + + library lib(loader, get_lib_path_do_load()); + library lib2(NULL); // lib2 is used for attach only + load_lib_f load; + fire_mmap_f fire; + ucs_status_t status; + mmap_event event(this); + + status = event.set(UCM_EVENT_VM_MAPPED); + ASSERT_UCS_OK(status); + + lib.open(); + ASSERT_TRUE(lib); + + load = (load_lib_f)lib.sym(load_lib); + ASSERT_TRUE(load != NULL); + + lib2.attach(load(get_lib_path_do_mmap().c_str(), loader)); + ASSERT_TRUE(lib2); + + fire = (fire_mmap_f)lib2.sym(fire_mmap); + ASSERT_TRUE(fire != NULL); + + m_got_event = 0; + fire(); + EXPECT_GT(m_got_event, 0); } - lib2 = load(lib_mmap.c_str()); - EXPECT_NE((uintptr_t)lib2, (uintptr_t)NULL); - if (!lib2) { - goto no_load; + /* Test for rpath section of caller module is processes */ + void test_rpath_dlopen(loader_t loader) + { + typedef void* (*load_lib_f)(const char *path, void* (*func)(const char*, int)); + + const char *load_lib = "load_lib"; + + library lib(loader); + library lib2(NULL); // lib2 is used for attach only + load_lib_f load; + ucs_status_t status; + mmap_event event(this); + + /* in case if reloc mode is used - it force hook dlopen */ + status = event.set(UCM_EVENT_VM_MAPPED); + ASSERT_UCS_OK(status); + + /* first check that without rpath library located in 
subdirectory could not be loaded */ + lib.open(get_lib_path_do_load()); + ASSERT_TRUE(lib); + if (!lib) { + return; + } + + load = (load_lib_f)lib.sym(load_lib); + ASSERT_TRUE(load != NULL); + + lib2.attach(load(get_lib_path_do_load_sub_rpath().c_str(), loader)); + ASSERT_FALSE(lib2); + + /* next check that rpath helps to load library located in subdirectory */ + /* don't care about opened libs - it will be closed automatically */ + lib.open(get_lib_path_do_load_rpath()); + ASSERT_TRUE(lib); + if (!lib) { + return; + } + + load = (load_lib_f)lib.sym(load_lib); + ASSERT_TRUE(load != NULL); + + lib2.attach(load(get_lib_path_do_load_sub_rpath().c_str(), loader)); + ASSERT_TRUE(lib2); } - fire = (fire_mmap_f*)dlsym(lib2, fire_mmap); - EXPECT_NE((uintptr_t)fire, (uintptr_t)NULL); - if (!fire) { - goto no_fire; + void test_dlopen_null(loader_t loader) + { + library lib(loader); + + lib.open(); + ASSERT_TRUE(lib); } +}; - m_got_event = 0; - fire(); - EXPECT_GT(m_got_event, 0); +UCS_TEST_F(malloc_hook_dlopen, indirect_dlopen) { + test_indirect_dlopen(dlopen); +} + +UCS_TEST_F(malloc_hook_dlopen, indirect_ucm_dlopen) { + test_indirect_dlopen(ucm_dlopen); +} + +UCS_TEST_F(malloc_hook_dlopen, rpath_dlopen) { + test_rpath_dlopen(dlopen); +} + +UCS_TEST_F(malloc_hook_dlopen, rpath_ucm_dlopen) { + test_rpath_dlopen(ucm_dlopen); +} + +UCS_TEST_F(malloc_hook_dlopen, ucm_dlopen_null_dlopen) { + test_dlopen_null(dlopen); +} + +UCS_TEST_F(malloc_hook_dlopen, ucm_dlopen_null_ucm_dlopen) { + test_dlopen_null(ucm_dlopen); +} + +UCS_MT_TEST_F(malloc_hook_dlopen, dlopen_mt_with_memtype, 2) { +#ifndef GTEST_UCM_HOOK_LIB_DIR +# error "Missing build configuration" +#endif + mmap_event event(this); + + ucs_status_t status = event.set(UCM_EVENT_VM_MAPPED | + UCM_EVENT_MEM_TYPE_ALLOC | + UCM_EVENT_MEM_TYPE_FREE); + ASSERT_UCS_OK(status); + + const std::string path = get_lib_path_do_mmap(); + static uint32_t count = 0; + + for (int i = 0; i < 100 / ucs::test_time_multiplier(); ++i) { + /* Tests 
that calling dlopen() from 2 threads does not deadlock, if for + * example we install memtype relocation patches and call dladdr() while + * iterating over loaded libraries. + */ + if (ucs_atomic_fadd32(&count, 1) % 2) { + void *lib1 = dlopen(get_lib_path_do_mmap().c_str(), RTLD_LAZY); + ASSERT_TRUE(lib1 != NULL); + dlclose(lib1); + } else { + void *lib2 = dlopen(get_lib_path_do_load().c_str(), RTLD_LAZY); + ASSERT_TRUE(lib2 != NULL); + dlclose(lib2); + } + + barrier(); + } -no_fire: - dlclose(lib2); -no_load: - dlclose(lib); -no_lib: event.unset(); -#endif /* GTEST_UCM_HOOK_LIB_DIR */ } diff --git a/test/gtest/ucm/rocm_hooks.cc b/test/gtest/ucm/rocm_hooks.cc new file mode 100644 index 00000000000..0ab08afb7cb --- /dev/null +++ b/test/gtest/ucm/rocm_hooks.cc @@ -0,0 +1,193 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. +* Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. +* See file LICENSE for terms. +*/ +#include +#include +#include + +static ucm_event_t alloc_event, free_event; + +static void rocm_mem_alloc_callback(ucm_event_type_t event_type, + ucm_event_t *event, void *arg) +{ + alloc_event.mem_type.address = event->mem_type.address; + alloc_event.mem_type.size = event->mem_type.size; + alloc_event.mem_type.mem_type = event->mem_type.mem_type; +} + +static void rocm_mem_free_callback(ucm_event_type_t event_type, + ucm_event_t *event, void *arg) +{ + free_event.mem_type.address = event->mem_type.address; + free_event.mem_type.size = event->mem_type.size; + free_event.mem_type.mem_type = event->mem_type.mem_type; +} + +class rocm_hooks : public ucs::test { +protected: + + virtual void init() { + int dev_count; + ucs_status_t result; + hipError_t ret; + ucs::test::init(); + + ret = hipGetDeviceCount(&dev_count); + if ((ret != hipSuccess) || (dev_count < 1)) { + UCS_TEST_SKIP_R("no ROCm device detected"); + } + + if (hipSetDevice(0) != hipSuccess) { + UCS_TEST_SKIP_R("can't set ROCm device"); + } + + /* 
install memory hooks */ + result = ucm_set_event_handler(UCM_EVENT_MEM_TYPE_ALLOC, 0, + rocm_mem_alloc_callback, + reinterpret_cast(this)); + ASSERT_UCS_OK(result); + + result = ucm_set_event_handler(UCM_EVENT_MEM_TYPE_FREE, 0, + rocm_mem_free_callback, + reinterpret_cast(this)); + ASSERT_UCS_OK(result); + } + + virtual void cleanup() { + ucm_unset_event_handler(UCM_EVENT_MEM_TYPE_ALLOC, + rocm_mem_alloc_callback, + reinterpret_cast(this)); + ucm_unset_event_handler(UCM_EVENT_MEM_TYPE_FREE, + rocm_mem_free_callback, + reinterpret_cast(this)); + ucs::test::cleanup(); + } + + void check_mem_alloc_events(void *ptr, size_t size, + int expect_mem_type = UCS_MEMORY_TYPE_ROCM) { + ASSERT_EQ(ptr, alloc_event.mem_type.address); + ASSERT_EQ(size, alloc_event.mem_type.size); + ASSERT_EQ(expect_mem_type, alloc_event.mem_type.mem_type); + } + + void check_mem_free_events(void *ptr, size_t size, + int expect_mem_type = UCS_MEMORY_TYPE_ROCM) { + ASSERT_EQ(ptr, free_event.mem_type.address); + ASSERT_EQ(expect_mem_type, free_event.mem_type.mem_type); + } + +}; + +UCS_TEST_F(rocm_hooks, test_hipMem_Alloc_Free) { + hipError_t ret; + void *dptr, *dptr1; + + /* small allocation */ + ret = hipMalloc(&dptr, 64); + ASSERT_EQ(ret, hipSuccess); + check_mem_alloc_events((void *)dptr, 64); + + ret = hipFree(dptr); + ASSERT_EQ(ret, hipSuccess); + check_mem_free_events((void *)dptr, 64); + + /* large allocation */ + ret = hipMalloc(&dptr, (256 * UCS_MBYTE)); + ASSERT_EQ(ret, hipSuccess); + check_mem_alloc_events((void *)dptr, (256 * UCS_MBYTE)); + + ret = hipFree(dptr); + ASSERT_EQ(ret, hipSuccess); + check_mem_free_events((void *)dptr, (256 * UCS_MBYTE)); + + /* multiple allocations, hipfree in reverse order */ + ret = hipMalloc(&dptr, (1 * UCS_MBYTE)); + ASSERT_EQ(ret, hipSuccess); + check_mem_alloc_events((void *)dptr, (1 * UCS_MBYTE)); + + ret = hipMalloc(&dptr1, (1 * UCS_MBYTE)); + ASSERT_EQ(ret, hipSuccess); + check_mem_alloc_events((void *)dptr1, (1 * UCS_MBYTE)); + + ret = 
hipFree(dptr1); + ASSERT_EQ(ret, hipSuccess); + check_mem_free_events((void *)dptr1, (1 * UCS_MBYTE)); + + ret = hipFree(dptr); + ASSERT_EQ(ret, hipSuccess); + check_mem_free_events((void *)dptr, (1 * UCS_MBYTE)); +} + +UCS_TEST_F(rocm_hooks, test_hipMallocManaged) { + hipError_t ret; + void * dptr; + + ret = hipMallocManaged(&dptr, 64); + ASSERT_EQ(ret, hipSuccess); + check_mem_alloc_events((void *)dptr, 64); + + ret = hipFree(dptr); + ASSERT_EQ(ret, hipSuccess); + check_mem_free_events((void *)dptr, 0, UCS_MEMORY_TYPE_ROCM_MANAGED); +} + +UCS_TEST_F(rocm_hooks, test_hipMallocPitch) { + hipError_t ret; + void * dptr; + size_t pitch; + + ret = hipMallocPitch(&dptr, &pitch, 4, 8); + ASSERT_EQ(ret, hipSuccess); + check_mem_alloc_events((void *)dptr, (pitch * 8)); + + ret = hipFree(dptr); + ASSERT_EQ(ret, hipSuccess); + check_mem_free_events((void *)dptr, 0); +} + +UCS_TEST_F(rocm_hooks, test_hip_Malloc_Free) { + hipError_t ret; + void *ptr, *ptr1; + + /* small allocation */ + ret = hipMalloc(&ptr, 64); + ASSERT_EQ(ret, hipSuccess); + check_mem_alloc_events(ptr, 64); + + ret = hipFree(ptr); + ASSERT_EQ(ret, hipSuccess); + check_mem_free_events(ptr, 64); + + /* large allocation */ + ret = hipMalloc(&ptr, (256 * UCS_MBYTE)); + ASSERT_EQ(ret, hipSuccess); + check_mem_alloc_events(ptr, (256 * UCS_MBYTE)); + + ret = hipFree(ptr); + ASSERT_EQ(ret, hipSuccess); + check_mem_free_events(ptr, (256 * UCS_MBYTE)); + + /* multiple allocations, rocmfree in reverse order */ + ret = hipMalloc(&ptr, (1 * UCS_MBYTE)); + ASSERT_EQ(ret, hipSuccess); + check_mem_alloc_events(ptr, (1 * UCS_MBYTE)); + + ret = hipMalloc(&ptr1, (1 * UCS_MBYTE)); + ASSERT_EQ(ret, hipSuccess); + check_mem_alloc_events(ptr1, (1 * UCS_MBYTE)); + + ret = hipFree(ptr1); + ASSERT_EQ(ret, hipSuccess); + check_mem_free_events(ptr1, (1 * UCS_MBYTE)); + + ret = hipFree(ptr); + ASSERT_EQ(ret, hipSuccess); + check_mem_free_events(ptr, (1 * UCS_MBYTE)); + + /* hipFree with NULL */ + ret = hipFree(NULL); + ASSERT_EQ(ret, 
hipSuccess); +} + diff --git a/test/gtest/ucm/test_dlopen/Makefile.am b/test/gtest/ucm/test_dlopen/Makefile.am index 5aaf5d0e42f..3f729a6a8f4 100644 --- a/test/gtest/ucm/test_dlopen/Makefile.am +++ b/test/gtest/ucm/test_dlopen/Makefile.am @@ -7,10 +7,16 @@ noinst_lib_LTLIBRARIES = \ libdlopen_test_do_mmap.la \ - libdlopen_test_do_load.la + libdlopen_test_do_load.la \ + libdlopen_test_do_load_rpath.la -libdlopen_test_do_mmap_la_SOURCES = dlopen_test_do_mmap.c -libdlopen_test_do_load_la_SOURCES = dlopen_test_do_load.c +libdlopen_test_do_mmap_la_SOURCES = dlopen_test_do_mmap.c +libdlopen_test_do_load_la_SOURCES = dlopen_test_do_load.c +libdlopen_test_do_load_rpath_la_SOURCES = dlopen_test_do_load.c noinst_libdir = ${PWD}/.noinst +libdlopen_test_do_load_rpath_la_CPPFLAGS = -I$(top_srcdir)/src +libdlopen_test_do_load_la_CPPFLAGS = -I$(top_srcdir)/src +libdlopen_test_do_load_rpath_la_LDFLAGS = -R=${PWD}/rpath-subdir/.libs +SUBDIRS = rpath-subdir diff --git a/test/gtest/ucm/test_dlopen/dlopen_test_do_load.c b/test/gtest/ucm/test_dlopen/dlopen_test_do_load.c index a6dae7e5fc2..63c7f072b60 100644 --- a/test/gtest/ucm/test_dlopen/dlopen_test_do_load.c +++ b/test/gtest/ucm/test_dlopen/dlopen_test_do_load.c @@ -4,9 +4,12 @@ * See file LICENSE for terms. */ +#include + #include -void* load_lib(const char *path) +UCS_F_NOOPTIMIZE /* prevent using tail recursion unwind */ +void* load_lib(const char *path, void* (*load_func)(const char*, int)) { - return dlopen(path, RTLD_NOW); + return (load_func ? load_func : dlopen)(path, RTLD_NOW); } diff --git a/test/gtest/ucm/test_dlopen/rpath-subdir/Makefile.am b/test/gtest/ucm/test_dlopen/rpath-subdir/Makefile.am new file mode 100644 index 00000000000..67ea3067f1c --- /dev/null +++ b/test/gtest/ucm/test_dlopen/rpath-subdir/Makefile.am @@ -0,0 +1,14 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +# +# See file LICENSE for terms. 
+# + + +noinst_lib_LTLIBRARIES = \ + libdlopen_test_rpath.la + +libdlopen_test_rpath_la_SOURCES = dlopen_test_rpath.c +noinst_libdir = ${PWD}/.noinst + + diff --git a/test/gtest/ucm/test_dlopen/rpath-subdir/configure.m4 b/test/gtest/ucm/test_dlopen/rpath-subdir/configure.m4 new file mode 100644 index 00000000000..b12180272c5 --- /dev/null +++ b/test/gtest/ucm/test_dlopen/rpath-subdir/configure.m4 @@ -0,0 +1,7 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +# +# See file LICENSE for terms. +# + +AC_CONFIG_FILES([test/gtest/ucm/test_dlopen/rpath-subdir/Makefile]) diff --git a/test/gtest/ucm/test_dlopen/rpath-subdir/dlopen_test_rpath.c b/test/gtest/ucm/test_dlopen/rpath-subdir/dlopen_test_rpath.c new file mode 100644 index 00000000000..335f6a6951e --- /dev/null +++ b/test/gtest/ucm/test_dlopen/rpath-subdir/dlopen_test_rpath.c @@ -0,0 +1,10 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +int dummy() +{ + return 0; +} diff --git a/test/gtest/ucp/test_ucp_am.cc b/test/gtest/ucp/test_ucp_am.cc new file mode 100644 index 00000000000..23628f65506 --- /dev/null +++ b/test/gtest/ucp/test_ucp_am.cc @@ -0,0 +1,295 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. + * Copyright (c) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. + * Copyright (C) Los Alamos National Security, LLC. 2018. ALL RIGHTS RESERVED. 
+ * + */ +#include +#include +#include +#include +#include + +#include "ucp_datatype.h" +#include "ucp_test.h" + +#define NUM_MESSAGES 17 + +#define UCP_REALLOC_ID 1000 +#define UCP_SEND_ID 0 +#define UCP_REPLY_ID 1 +#define UCP_RELEASE 1 + +class test_ucp_am_base : public ucp_test { +public: + int sent_ams; + int replies; + int recv_ams; + void *reply; + void *for_release[NUM_MESSAGES]; + int release; + + static ucp_params_t get_ctx_params() { + ucp_params_t params = ucp_test::get_ctx_params(); + params.field_mask |= UCP_PARAM_FIELD_FEATURES; + params.features = UCP_FEATURE_AM; + return params; + } + + static ucs_status_t ucp_process_am_cb(void *arg, void *data, + size_t length, + ucp_ep_h reply_ep, + unsigned flags); + + static ucs_status_t ucp_process_reply_cb(void *arg, void *data, + size_t length, + ucp_ep_h reply_ep, + unsigned flags); + + ucs_status_t am_handler(test_ucp_am_base *me, void *data, + size_t length, unsigned flags); +}; + +ucs_status_t test_ucp_am_base::ucp_process_reply_cb(void *arg, void *data, + size_t length, + ucp_ep_h reply_ep, + unsigned flags) +{ + test_ucp_am_base *self = reinterpret_cast(arg); + self->replies++; + return UCS_OK; +} + +ucs_status_t test_ucp_am_base::ucp_process_am_cb(void *arg, void *data, + size_t length, + ucp_ep_h reply_ep, + unsigned flags) +{ + test_ucp_am_base *self = reinterpret_cast(arg); + + if (reply_ep) { + self->reply = ucp_am_send_nb(reply_ep, UCP_REPLY_ID, NULL, 1, + ucp_dt_make_contig(0), + (ucp_send_callback_t) ucs_empty_function, + 0); + EXPECT_FALSE(UCS_PTR_IS_ERR(self->reply)); + } + + return self->am_handler(self, data, length, flags); +} + +ucs_status_t test_ucp_am_base::am_handler(test_ucp_am_base *me, void *data, + size_t length, unsigned flags) +{ + ucs_status_t status; + std::vector cmp(length, (char)length); + std::vector databuf(length, 'r'); + + memcpy(&databuf[0], data, length); + + EXPECT_EQ(cmp, databuf); + + bool has_desc = flags & UCP_CB_PARAM_FLAG_DATA; + if (me->release) { + 
me->for_release[me->recv_ams] = has_desc ? data : NULL; + status = has_desc ? UCS_INPROGRESS : UCS_OK; + } else { + status = UCS_OK; + } + + me->recv_ams++; + return status; +} + +class test_ucp_am : public test_ucp_am_base { +public: + ucp_ep_params_t get_ep_params() { + ucp_ep_params_t params = test_ucp_am_base::get_ep_params(); + params.field_mask |= UCP_EP_PARAM_FIELD_FLAGS; + params.flags |= UCP_EP_PARAMS_FLAGS_NO_LOOPBACK; + return params; + } + + virtual void init() { + modify_config("MAX_EAGER_LANES", "2"); + + ucp_test::init(); + sender().connect(&receiver(), get_ep_params()); + receiver().connect(&sender(), get_ep_params()); + } + +protected: + void do_set_am_handler_realloc_test(); + void do_send_process_data_test(int test_release, uint16_t am_id, + int send_reply); + void do_send_process_data_iov_test(size_t size); + void set_handlers(uint16_t am_id); + void set_reply_handlers(); +}; + +void test_ucp_am::set_reply_handlers() +{ + ucp_worker_set_am_handler(sender().worker(), UCP_REPLY_ID, + ucp_process_reply_cb, this, + UCP_AM_FLAG_WHOLE_MSG); + ucp_worker_set_am_handler(receiver().worker(), UCP_REPLY_ID, + ucp_process_reply_cb, this, + UCP_AM_FLAG_WHOLE_MSG); +} + +void test_ucp_am::set_handlers(uint16_t am_id) +{ + ucp_worker_set_am_handler(sender().worker(), am_id, + ucp_process_am_cb, this, + UCP_AM_FLAG_WHOLE_MSG); + ucp_worker_set_am_handler(receiver().worker(), am_id, + ucp_process_am_cb, this, + UCP_AM_FLAG_WHOLE_MSG); +} + +void test_ucp_am::do_send_process_data_test(int test_release, uint16_t am_id, + int send_reply) +{ + size_t buf_size = pow(2, NUM_MESSAGES - 2); + ucs_status_ptr_t sstatus = NULL; + std::vector buf(buf_size); + + recv_ams = 0; + sent_ams = 0; + replies = 0; + this->release = test_release; + + for (size_t i = 0; i < buf_size + 1; i = i ? 
(i * 2) : 1) { + for (size_t j = 0; j < i; j++) { + buf[j] = i; + } + + reply = NULL; + sstatus = ucp_am_send_nb(sender().ep(), am_id, + buf.data(), 1, ucp_dt_make_contig(i), + (ucp_send_callback_t) ucs_empty_function, + send_reply); + + EXPECT_FALSE(UCS_PTR_IS_ERR(sstatus)); + wait(sstatus); + sent_ams++; + + if (send_reply) { + while (sent_ams != replies) { + progress(); + } + + if (reply != NULL) { + ucp_request_release(reply); + } + } + } + + while (sent_ams != recv_ams) { + progress(); + } + + if (send_reply) { + while (sent_ams != replies) { + progress(); + } + } + + if (test_release) { + for(int i = 0; i < recv_ams; i++) { + if (for_release[i] != NULL) { + ucp_am_data_release(receiver().worker(), for_release[i]); + } + } + } +} + +void test_ucp_am::do_send_process_data_iov_test(size_t size) +{ + ucs_status_ptr_t sstatus; + size_t index; + size_t i; + + recv_ams = 0; + sent_ams = 0; + release = 0; + + const size_t iovcnt = 2; + std::vector sendbuf(size * iovcnt, 0); + + ucs::fill_random(sendbuf); + + set_handlers(0); + + for (i = 1; i < size; i *= 2) { + for (size_t iov_it = 0; iov_it < iovcnt; iov_it++) { + for (index = 0; index < i; index++) { + sendbuf[(iov_it * i) + index] = i * 2; + } + } + + ucp::data_type_desc_t send_dt_desc(DATATYPE_IOV, sendbuf.data(), + i * iovcnt, iovcnt); + + sstatus = ucp_am_send_nb(sender().ep(), 0, + send_dt_desc.buf(), iovcnt, DATATYPE_IOV, + (ucp_send_callback_t) ucs_empty_function, 0); + wait(sstatus); + EXPECT_FALSE(UCS_PTR_IS_ERR(sstatus)); + sent_ams++; + } + + while (sent_ams != recv_ams) { + progress(); + } +} + +void test_ucp_am::do_set_am_handler_realloc_test() +{ + set_handlers(UCP_SEND_ID); + do_send_process_data_test(0, UCP_SEND_ID, 0); + + set_handlers(UCP_REALLOC_ID); + do_send_process_data_test(0, UCP_REALLOC_ID, 0); + + set_handlers(UCP_SEND_ID + 1); + do_send_process_data_test(0, UCP_SEND_ID + 1, 0); +} + +UCS_TEST_P(test_ucp_am, send_process_am) +{ + set_handlers(UCP_SEND_ID); + do_send_process_data_test(0, 
UCP_SEND_ID, 0); + + set_reply_handlers(); + do_send_process_data_test(0, UCP_SEND_ID, UCP_AM_SEND_REPLY); +} + +UCS_TEST_P(test_ucp_am, send_process_am_release) +{ + set_handlers(UCP_SEND_ID); + do_send_process_data_test(UCP_RELEASE, 0, 0); +} + +UCS_TEST_P(test_ucp_am, send_process_iov_am) +{ + ucs::detail::message_stream ms("INFO"); + + for (unsigned i = 1; i <= 7; ++i) { + size_t max = (long)pow(10.0, i); + long count = ucs_max((long)(5000.0 / sqrt(max) / + ucs::test_time_multiplier()), 3); + ms << count << "x10^" << i << " " << std::flush; + for (long j = 0; j < count; ++j) { + size_t size = ucs::rand() % max + 1; + do_send_process_data_iov_test(size); + } + } +} + +UCS_TEST_P(test_ucp_am, set_am_handler_realloc) +{ + do_set_am_handler_realloc_test(); +} + +UCP_INSTANTIATE_TEST_CASE(test_ucp_am) diff --git a/test/gtest/ucp/test_ucp_atomic.cc b/test/gtest/ucp/test_ucp_atomic.cc index 53322c65242..04f6fe5ec5b 100644 --- a/test/gtest/ucp/test_ucp_atomic.cc +++ b/test/gtest/ucp/test_ucp_atomic.cc @@ -81,6 +81,23 @@ ucs_status_t test_ucp_atomic::ucp_atomic_post_nbi(ucp_ep_h ep, ucp_atomic_post_o return ucp_atomic_post(ep, opcode, value, sizeof(T), (uintptr_t)remote_addr, rkey); } +template +ucs_status_t test_ucp_atomic::ucp_atomic_post_nbx(ucp_ep_h ep, ucp_atomic_op_t opcode, + T value, void *remote_addr, + ucp_rkey_h rkey) +{ + ucp_request_param_t param; + + param.op_attr_mask = UCP_OP_ATTR_FIELD_DATATYPE; + param.datatype = ucp_dt_make_contig(sizeof(T)); + ucs_status_ptr_t sptr = ucp_atomic_op_nbx(ep, opcode, &value, 1, + (uint64_t)remote_addr, rkey, + ¶m); + EXPECT_FALSE(UCS_PTR_IS_PTR(sptr)); + + return UCS_PTR_STATUS(sptr); +} + template void test_ucp_atomic::nb_post(entity *e, size_t max_size, void *memheap_addr, ucp_rkey_h rkey, std::string& expected_data) @@ -102,6 +119,22 @@ void test_ucp_atomic::nb_post(entity *e, size_t max_size, void *memheap_addr, *(T*)&expected_data[0] = atomic_op_val(val, prev); } +template +void test_ucp_atomic::nbx_post(entity *e, 
size_t max_size, void *memheap_addr, + ucp_rkey_h rkey, std::string& expected_data) +{ + T val, prev; + + prev = *(T*)memheap_addr; + val = (T)ucs::rand() * (T)ucs::rand(); + + ASSERT_UCS_OK(test_ucp_atomic::ucp_atomic_post_nbx(e->ep(), OP, val, + memheap_addr, rkey)); + + expected_data.resize(sizeof(T)); + *(T*)&expected_data[0] = nbx_atomic_op_val(val, prev); +} + template void test_ucp_atomic::unaligned_nb_post(entity *e, size_t max_size, void *memheap_addr, ucp_rkey_h rkey, @@ -211,21 +244,41 @@ UCS_TEST_P(test_ucp_atomic32, atomic_add_nb) { test(&test_ucp_atomic32::nb_post, true); } +UCS_TEST_P(test_ucp_atomic32, atomic_add_nbx) { + test(&test_ucp_atomic32::nbx_post, false); + test(&test_ucp_atomic32::nbx_post, true); +} + UCS_TEST_P(test_ucp_atomic32, atomic_and_nb) { test(&test_ucp_atomic32::nb_post, false); test(&test_ucp_atomic32::nb_post, true); } +UCS_TEST_P(test_ucp_atomic32, atomic_and_nbx) { + test(&test_ucp_atomic32::nbx_post, false); + test(&test_ucp_atomic32::nbx_post, true); +} + UCS_TEST_P(test_ucp_atomic32, atomic_or_nb) { test(&test_ucp_atomic32::nb_post, false); test(&test_ucp_atomic32::nb_post, true); } +UCS_TEST_P(test_ucp_atomic32, atomic_or_nbx) { + test(&test_ucp_atomic32::nbx_post, false); + test(&test_ucp_atomic32::nbx_post, true); +} + UCS_TEST_P(test_ucp_atomic32, atomic_xor_nb) { test(&test_ucp_atomic32::nb_post, false); test(&test_ucp_atomic32::nb_post, true); } +UCS_TEST_P(test_ucp_atomic32, atomic_xor_nbx) { + test(&test_ucp_atomic32::nbx_post, false); + test(&test_ucp_atomic32::nbx_post, true); +} + UCS_TEST_P(test_ucp_atomic32, atomic_fadd_nb) { test(&test_ucp_atomic32::nb_fetch, false); test(&test_ucp_atomic32::nb_fetch, true); @@ -277,21 +330,41 @@ UCS_TEST_P(test_ucp_atomic64, atomic_add_nb) { test(&test_ucp_atomic64::nb_post, true); } +UCS_TEST_P(test_ucp_atomic64, atomic_add_nbx) { + test(&test_ucp_atomic64::nbx_post, false); + test(&test_ucp_atomic64::nbx_post, true); +} + UCS_TEST_P(test_ucp_atomic64, atomic_and_nb) { 
test(&test_ucp_atomic64::nb_post, false); test(&test_ucp_atomic64::nb_post, true); } +UCS_TEST_P(test_ucp_atomic64, atomic_and_nbx) { + test(&test_ucp_atomic64::nbx_post, false); + test(&test_ucp_atomic64::nbx_post, true); +} + UCS_TEST_P(test_ucp_atomic64, atomic_or_nb) { test(&test_ucp_atomic64::nb_post, false); test(&test_ucp_atomic64::nb_post, true); } +UCS_TEST_P(test_ucp_atomic64, atomic_or_nbx) { + test(&test_ucp_atomic64::nbx_post, false); + test(&test_ucp_atomic64::nbx_post, true); +} + UCS_TEST_P(test_ucp_atomic64, atomic_xor_nb) { test(&test_ucp_atomic64::nb_post, false); test(&test_ucp_atomic64::nb_post, true); } +UCS_TEST_P(test_ucp_atomic64, atomic_xor_nbx) { + test(&test_ucp_atomic64::nbx_post, false); + test(&test_ucp_atomic64::nbx_post, true); +} + UCS_TEST_P(test_ucp_atomic64, atomic_fadd_nb) { test(&test_ucp_atomic64::nb_fetch, false); test(&test_ucp_atomic64::nb_fetch, true); diff --git a/test/gtest/ucp/test_ucp_atomic.h b/test/gtest/ucp/test_ucp_atomic.h index 0c48efed87d..b8be4cccdfd 100644 --- a/test/gtest/ucp/test_ucp_atomic.h +++ b/test/gtest/ucp/test_ucp_atomic.h @@ -45,6 +45,10 @@ class test_ucp_atomic : public test_ucp_memheap { void nb_post(entity *e, size_t max_size, void *memheap_addr, ucp_rkey_h rkey, std::string& expected_data); + template + void nbx_post(entity *e, size_t max_size, void *memheap_addr, + ucp_rkey_h rkey, std::string& expected_data); + template void nb_fetch(entity *e, size_t max_size, void *memheap_addr, ucp_rkey_h rkey, std::string& expected_data); @@ -67,6 +71,24 @@ class test_ucp_atomic : public test_ucp_memheap { } } + template + T nbx_atomic_op_val(T v1, T v2) + { + /* coverity[switch_selector_expr_is_constant] */ + switch (OP) { + case UCP_ATOMIC_OP_ADD: + return v1 + v2; + case UCP_ATOMIC_OP_AND: + return v1 & v2; + case UCP_ATOMIC_OP_OR: + return v1 | v2; + case UCP_ATOMIC_OP_XOR: + return v1 ^ v2; + default: + return 0; + } + } + template T atomic_fop_val(T v1, T v2) { @@ -93,6 +115,12 @@ class 
test_ucp_atomic : public test_ucp_memheap { ucs_status_t ucp_atomic_post_nbi(ucp_ep_h ep, ucp_atomic_post_op_t opcode, T value, void *remote_addr, ucp_rkey_h rkey); + + template + ucs_status_t ucp_atomic_post_nbx(ucp_ep_h ep, ucp_atomic_op_t opcode, + T value, void *remote_addr, + ucp_rkey_h rkey); + template ucs_status_ptr_t ucp_atomic_fetch(ucp_ep_h ep, ucp_atomic_fetch_op_t opcode, T value, T *result, diff --git a/test/gtest/ucp/test_ucp_context.cc b/test/gtest/ucp/test_ucp_context.cc index 964a2cd1377..7372fd2526d 100644 --- a/test/gtest/ucp/test_ucp_context.cc +++ b/test/gtest/ucp/test_ucp_context.cc @@ -58,8 +58,8 @@ UCS_TEST_P(test_ucp_aliases, aliases) { create_entity(); } -UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_aliases, rc, "rc") -UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_aliases, rc_x, "rc_x") +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_aliases, rcv, "rc_v") +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_aliases, rcx, "rc_x") UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_aliases, ud, "ud") UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_aliases, ud_mlx5, "ud_mlx5") UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_aliases, ugni, "ugni") diff --git a/test/gtest/ucp/test_ucp_fence.cc b/test/gtest/ucp/test_ucp_fence.cc index 4e5afc4efa0..391fd4ea3b3 100644 --- a/test/gtest/ucp/test_ucp_fence.cc +++ b/test/gtest/ucp/test_ucp_fence.cc @@ -117,71 +117,23 @@ class test_ucp_fence : public test_ucp_atomic { protected: void test_fence(send_func_t send1, send_func_t send2, size_t alignment) { static const size_t memheap_size = sizeof(uint64_t); - ucs_status_t status; - - ucp_mem_map_params_t params; - ucp_mem_attr_t mem_attr; - ucp_mem_h memh; - void *memheap = NULL; - - void *rkey_buffer; - size_t rkey_buffer_size; - ucp_rkey_h rkey; - uint32_t error = 0; sender().connect(&receiver(), get_ep_params()); flush_worker(sender()); /* avoid deadlock for blocking amo */ - params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | - UCP_MEM_MAP_PARAM_FIELD_LENGTH | - UCP_MEM_MAP_PARAM_FIELD_FLAGS; - params.length = 
memheap_size; - params.flags = GetParam().variant; - if (params.flags & UCP_MEM_MAP_FIXED) { - params.address = ucs::mmap_fixed_address(); - params.flags |= UCP_MEM_MAP_ALLOCATE; - } else { - memheap = malloc(memheap_size); - params.address = memheap; - params.flags = params.flags & (~UCP_MEM_MAP_ALLOCATE); - } + mapped_buffer buffer(memheap_size, receiver(), 0); - status = ucp_mem_map(receiver().ucph(), ¶ms, &memh); - ASSERT_UCS_OK(status); + EXPECT_LE(memheap_size, buffer.size()); + memset(buffer.ptr(), 0, memheap_size); - mem_attr.field_mask = UCP_MEM_ATTR_FIELD_ADDRESS | - UCP_MEM_ATTR_FIELD_LENGTH; - status = ucp_mem_query(memh, &mem_attr); - ASSERT_UCS_OK(status); - EXPECT_LE(memheap_size, mem_attr.length); - if (!memheap) { - memheap = mem_attr.address; - } - memset(memheap, 0, memheap_size); - - status = ucp_rkey_pack(receiver().ucph(), memh, &rkey_buffer, &rkey_buffer_size); - ASSERT_UCS_OK(status); - - status = ucp_ep_rkey_unpack(sender().ep(), rkey_buffer, &rkey); - ASSERT_UCS_OK(status); - - ucp_rkey_buffer_release(rkey_buffer); - - run_workers(send1, send2, &sender(), rkey, memheap, 1, &error); + run_workers(send1, send2, &sender(), buffer.rkey(sender()), + buffer.ptr(), 1, &error); EXPECT_EQ(error, (uint32_t)0); - ucp_rkey_destroy(rkey); - status = ucp_mem_unmap(receiver().ucph(), memh); - ASSERT_UCS_OK(status); - disconnect(sender()); disconnect(receiver()); - - if (!(GetParam().variant & UCP_MEM_MAP_FIXED)) { - free(memheap); - } } static ucp_params_t get_ctx_params() { diff --git a/test/gtest/ucp/test_ucp_mem_type.cc b/test/gtest/ucp/test_ucp_mem_type.cc index 6e99e2ab2fe..ea27a90f453 100644 --- a/test/gtest/ucp/test_ucp_mem_type.cc +++ b/test/gtest/ucp/test_ucp_mem_type.cc @@ -5,13 +5,28 @@ */ #include "ucp_test.h" +#include + extern "C" { -#include "uct/api/uct.h" -#include "ucp/core/ucp_context.h" -#include "ucp/core/ucp_mm.h" +#include +#include +#include } +#define UCP_INSTANTIATE_TEST_CASE_MEMTYPE(_test_case, _name, _mem_type) \ + 
INSTANTIATE_TEST_CASE_P(_name, _test_case, \ + testing::ValuesIn(_test_case::enum_test_params( \ + _test_case::get_ctx_params(), \ + #_test_case, _mem_type))); + +#define UCP_INSTANTIATE_TEST_CASE_MEMTYPES(_test_case) \ + UCP_INSTANTIATE_TEST_CASE_MEMTYPE(_test_case, host, UCS_MEMORY_TYPE_HOST) \ + UCP_INSTANTIATE_TEST_CASE_MEMTYPE(_test_case, cuda, UCS_MEMORY_TYPE_CUDA) \ + UCP_INSTANTIATE_TEST_CASE_MEMTYPE(_test_case, cuda_managed, UCS_MEMORY_TYPE_CUDA_MANAGED) \ + UCP_INSTANTIATE_TEST_CASE_MEMTYPE(_test_case, rocm, UCS_MEMORY_TYPE_ROCM) \ + UCP_INSTANTIATE_TEST_CASE_MEMTYPE(_test_case, rocm_managed, UCS_MEMORY_TYPE_ROCM_MANAGED) + class test_ucp_mem_type : public ucp_test { public: static ucp_params_t get_ctx_params() { @@ -19,24 +34,98 @@ class test_ucp_mem_type : public ucp_test { params.features |= UCP_FEATURE_TAG; return params; } + + static std::vector + enum_test_params(const ucp_params_t& ctx_params, + const std::string& test_case_name, ucs_memory_type_t mem_type) + { + std::vector result; + + std::vector mem_types = + mem_buffer::supported_mem_types(); + if (std::find(mem_types.begin(), mem_types.end(), mem_type) != + mem_types.end()) { + generate_test_params_variant(ctx_params, "all", test_case_name, + "all", mem_type, result); + } + + return result; + } + +protected: + ucs_memory_type_t mem_type() const { + return static_cast(GetParam().variant); + } }; -UCS_TEST_P(test_ucp_mem_type, detect_host) { - ucs_status_t status; - uct_memory_type_t mem_type; - void *ptr; - size_t size = 256; +UCS_TEST_P(test_ucp_mem_type, detect) { + + const size_t size = 256; + const ucs_memory_type_t alloc_mem_type = mem_type(); - sender().connect(&sender(), get_ep_params()); + mem_buffer b(size, alloc_mem_type); - ptr = malloc(size); - EXPECT_TRUE(ptr != NULL); + ucs_memory_type_t detected_mem_type = + ucp_memory_type_detect(sender().ucph(), b.ptr(), size); + EXPECT_EQ(alloc_mem_type, detected_mem_type); +} - status = ucp_memory_type_detect_mds(sender().ucph(), ptr, size, 
&mem_type); - ASSERT_UCS_OK(status); - EXPECT_EQ(UCT_MD_MEM_TYPE_HOST, mem_type); +UCP_INSTANTIATE_TEST_CASE_MEMTYPES(test_ucp_mem_type) - free(ptr); +class test_ucp_mem_type_alloc_before_init : public test_ucp_mem_type { +public: + static ucp_params_t get_ctx_params() { + ucp_params_t params = ucp_test::get_ctx_params(); + params.features |= UCP_FEATURE_TAG; + return params; + } + + test_ucp_mem_type_alloc_before_init() { + m_size = 10000; + } + + virtual void init() { + m_send_buffer.reset(new mem_buffer(m_size, mem_type())); + m_recv_buffer.reset(new mem_buffer(m_size, mem_type())); + test_ucp_mem_type::init(); + } + + virtual void cleanup() { + test_ucp_mem_type::cleanup(); + m_send_buffer.reset(); + m_recv_buffer.reset(); + } + + static const uint64_t SEED = 0x1111111111111111lu; +protected: + size_t m_size; + ucs::auto_ptr m_send_buffer, m_recv_buffer; +}; + +UCS_TEST_P(test_ucp_mem_type_alloc_before_init, xfer) { + sender().connect(&receiver(), get_ep_params()); + + EXPECT_EQ(mem_type(), ucp_memory_type_detect(sender().ucph(), + m_send_buffer->ptr(), m_size)); + EXPECT_EQ(mem_type(), ucp_memory_type_detect(receiver().ucph(), + m_recv_buffer->ptr(), m_size)); + + mem_buffer::pattern_fill(m_send_buffer->ptr(), m_size, SEED, mem_type()); + + for (int i = 0; i < 3; ++i) { + mem_buffer::pattern_fill(m_recv_buffer->ptr(), m_size, 0, mem_type()); + + void *sreq = ucp_tag_send_nb(sender().ep(), m_send_buffer->ptr(), m_size, + ucp_dt_make_contig(1), 1, + (ucp_send_callback_t)ucs_empty_function); + void *rreq = ucp_tag_recv_nb(receiver().worker(), m_recv_buffer->ptr(), + m_size, ucp_dt_make_contig(1), 1, 1, + (ucp_tag_recv_callback_t)ucs_empty_function); + wait(sreq); + wait(rreq); + + mem_buffer::pattern_check(m_recv_buffer->ptr(), m_size, SEED, mem_type()); + } } -UCP_INSTANTIATE_TEST_CASE(test_ucp_mem_type) +UCP_INSTANTIATE_TEST_CASE_MEMTYPES(test_ucp_mem_type_alloc_before_init) diff --git a/test/gtest/ucp/test_ucp_memheap.cc b/test/gtest/ucp/test_ucp_memheap.cc 
index f1eacf8902b..21f3df43b91 100644 --- a/test/gtest/ucp/test_ucp_memheap.cc +++ b/test/gtest/ucp/test_ucp_memheap.cc @@ -46,7 +46,7 @@ void test_ucp_memheap::test_nonblocking_implicit_stream_xfer(nonblocking_send_fu } if (size == DEFAULT_SIZE) { - size = ucs_max((size_t)ucs::rand() % (12*1024), alignment); + size = ucs_max((size_t)ucs::rand() % (12 * UCS_KBYTE), alignment); } memheap_size = max_iter * size + alignment; @@ -153,7 +153,7 @@ void test_ucp_memheap::test_blocking_xfer(blocking_send_func_t send, } if (memheap_size == DEFAULT_SIZE) { - memheap_size = 3 * 1024; + memheap_size = 3 * UCS_KBYTE; zero_offset = 1; } diff --git a/test/gtest/ucp/test_ucp_mmap.cc b/test/gtest/ucp/test_ucp_mmap.cc index 965634b3ddc..b1015918895 100644 --- a/test/gtest/ucp/test_ucp_mmap.cc +++ b/test/gtest/ucp/test_ucp_mmap.cc @@ -8,6 +8,7 @@ extern "C" { #include #include +#include } class test_ucp_mmap : public test_ucp_memheap { @@ -26,6 +27,11 @@ class test_ucp_mmap : public test_ucp_memheap { } } + virtual void init() { + ucs::skip_on_address_sanitizer(); + test_ucp_memheap::init(); + } + protected: bool resolve_rma(entity *e, ucp_rkey_h rkey); bool resolve_amo(entity *e, ucp_rkey_h rkey); @@ -76,10 +82,13 @@ bool test_ucp_mmap::resolve_amo(entity *e, ucp_rkey_h rkey) bool test_ucp_mmap::resolve_rma_bw(entity *e, ucp_rkey_h rkey) { + ucp_ep_config_t *ep_config = ucp_ep_config(e->ep()); ucp_lane_index_t lane; uct_rkey_t uct_rkey; - lane = ucp_rkey_get_rma_bw_lane(rkey, e->ep(), UCT_MD_MEM_TYPE_HOST, &uct_rkey, 0); + lane = ucp_rkey_find_rma_lane(e->ucph(), ep_config, UCS_MEMORY_TYPE_HOST, + ep_config->tag.rndv.get_zcopy_lanes, rkey, 0, + &uct_rkey); if (lane != UCP_NULL_LANE) { return true; } else { @@ -113,6 +122,9 @@ void test_ucp_mmap::test_rkey_management(entity *e, ucp_mem_h memh, bool is_dumm } ASSERT_UCS_OK(status); + /* Test ucp_rkey_packed_md_map() */ + EXPECT_EQ(rkey->md_map, ucp_rkey_packed_md_map(rkey_buffer)); + bool have_rma = resolve_rma(e, rkey); bool have_amo 
= resolve_amo(e, rkey); bool have_rma_bw = resolve_rma_bw(e, rkey); @@ -153,7 +165,7 @@ UCS_TEST_P(test_ucp_mmap, alloc) { sender().connect(&sender(), get_ep_params()); for (int i = 0; i < 1000 / ucs::test_time_multiplier(); ++i) { - size_t size = ucs::rand() % (1024 * 1024); + size_t size = ucs::rand() % (UCS_MBYTE); ucp_mem_h memh; ucp_mem_map_params_t params; @@ -184,7 +196,7 @@ UCS_TEST_P(test_ucp_mmap, reg) { sender().connect(&sender(), get_ep_params()); for (int i = 0; i < 1000 / ucs::test_time_multiplier(); ++i) { - size_t size = ucs::rand() % (1024 * 1024); + size_t size = ucs::rand() % (UCS_MBYTE); void *ptr = malloc(size); ucs::fill_random(ptr, size); @@ -261,7 +273,7 @@ UCS_TEST_P(test_ucp_mmap, alloc_advise) { sender().connect(&sender(), get_ep_params()); - size_t size = 128 * (1024 * 1024); + size_t size = 128 * UCS_MBYTE; ucp_mem_h memh; ucp_mem_map_params_t params; @@ -306,7 +318,7 @@ UCS_TEST_P(test_ucp_mmap, reg_advise) { sender().connect(&sender(), get_ep_params()); - size_t size = 128 * 1024 * 1024; + size_t size = 128 * UCS_MBYTE; void *ptr = malloc(size); ucs::fill_random(ptr, size); diff --git a/test/gtest/ucp/test_ucp_peer_failure.cc b/test/gtest/ucp/test_ucp_peer_failure.cc index 87e21657085..7e8ffe28fa3 100644 --- a/test/gtest/ucp/test_ucp_peer_failure.cc +++ b/test/gtest/ucp/test_ucp_peer_failure.cc @@ -8,8 +8,9 @@ #include "ucp_datatype.h" extern "C" { -#include /* for testing memory consumption */ -#include // for debug +#include /* for testing EP RNDV configuration */ +#include /* for debug */ +#include /* for testing memory consumption */ } class test_ucp_peer_failure : public ucp_test { @@ -44,6 +45,12 @@ class test_ucp_peer_failure : public ucp_test { entity& failing_receiver(); void *send_nb(ucp_ep_h ep, ucp_rkey_h rkey); void *recv_nb(entity& e); + static ucs_log_func_rc_t + warn_unreleased_rdesc_handler(const char *file, unsigned line, + const char *function, + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, 
+ const char *message, va_list ap); void fail_receiver(); void smoke_test(bool stable_pair); static void unmap_memh(ucp_mem_h memh, ucp_context_h context); @@ -118,8 +125,6 @@ void test_ucp_peer_failure::set_timeouts() { m_env.push_back(new ucs::scoped_setenv("UCX_RC_TIMEOUT", "10ms")); m_env.push_back(new ucs::scoped_setenv("UCX_RC_RNR_TIMEOUT", "10ms")); m_env.push_back(new ucs::scoped_setenv("UCX_RC_RETRY_COUNT", "2")); - std::string ud_timeout = ucs::to_string(3 * ucs::test_time_multiplier()) + "s"; - m_env.push_back(new ucs::scoped_setenv("UCX_UD_TIMEOUT", ud_timeout.c_str())); } void test_ucp_peer_failure::err_cb(void *arg, ucp_ep_h ep, ucs_status_t status) { @@ -169,6 +174,24 @@ void *test_ucp_peer_failure::recv_nb(entity& e) { } } +ucs_log_func_rc_t +test_ucp_peer_failure::warn_unreleased_rdesc_handler(const char *file, unsigned line, + const char *function, + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) +{ + if (level == UCS_LOG_LEVEL_WARN) { + std::string err_str = format_message(message, ap); + + if (err_str.find("unexpected tag-receive descriptor") != std::string::npos) { + return UCS_LOG_FUNC_RC_STOP; + } + } + + return UCS_LOG_FUNC_RC_CONTINUE; +} + void test_ucp_peer_failure::fail_receiver() { /* TODO: need to handle non-empty TX window in UD EP destructor", * see debug message (ud_ep.c:220) @@ -179,7 +202,13 @@ void test_ucp_peer_failure::fail_receiver() { // TODO use force-close to close connections flush_worker(failing_receiver()); m_failing_memh.reset(); - failing_receiver().cleanup(); + { + /* transform warning messages about unreleased TM rdescs to test + * message that are expected here, since we closed receiver w/o + * reading the messages that were potentially received */ + scoped_log_handler slh(warn_unreleased_rdesc_handler); + failing_receiver().cleanup(); + } } void test_ucp_peer_failure::smoke_test(bool stable_pair) { @@ -269,6 +298,11 @@ void 
test_ucp_peer_failure::do_test(size_t msg_size, int pre_msg_count, set_rkeys(); + /* Since we don't want to test peer failure on a stable pair + * and don't expect EP timeout error on those EPs, + * run traffic on a stable pair to connect it */ + smoke_test(true); + if (!(GetParam().variant & FAIL_IMM)) { /* if not fail immediately, run traffic on failing pair to connect it */ smoke_test(false); @@ -286,6 +320,11 @@ void test_ucp_peer_failure::do_test(size_t msg_size, int pre_msg_count, } EXPECT_EQ(UCS_OK, m_err_status); + + /* Since UCT/UD EP has a SW implementation of reliablity on which peer + * failure mechanism is based, we should set small UCT/UD EP timeout + * for UCT/UD EPs for sender's UCP EP to reduce testing time */ + double prev_ib_ud_timeout = sender().set_ib_ud_timeout(3.); { scoped_log_handler slh(wrap_errors_logger); @@ -326,6 +365,8 @@ void test_ucp_peer_failure::do_test(size_t msg_size, int pre_msg_count, ucp_ep_h ep = sender().revoke_ep(0, FAILING_EP_INDEX); + m_failing_rkey.reset(); + void *creq = ucp_ep_close_nb(ep, UCP_EP_CLOSE_MODE_FORCE); wait(creq); @@ -346,12 +387,21 @@ void test_ucp_peer_failure::do_test(size_t msg_size, int pre_msg_count, } } + /* Since we won't test peer failure anymore, reset UCT/UD EP timeout to the + * default value to avoid possible UD EP timeout errors under high load */ + sender().set_ib_ud_timeout(prev_ib_ud_timeout); + /* Check workability of stable pair */ smoke_test(true); /* Check that TX polling is working well */ while (sender().progress()); + /* Destroy rkeys before destroying the worker (which also destroys the + * endpoints) */ + m_failing_rkey.reset(); + m_stable_rkey.reset(); + /* When all requests on sender are done we need to prevent LOCAL_FLUSH * in test teardown. 
Receiver is killed and doesn't respond on FC requests */ @@ -359,21 +409,31 @@ void test_ucp_peer_failure::do_test(size_t msg_size, int pre_msg_count, } UCS_TEST_P(test_ucp_peer_failure, basic) { - do_test(1024, /* msg_size */ + do_test(UCS_KBYTE, /* msg_size */ 0, /* pre_msg_cnt */ false, /* force_close */ false /* must_fail */); } +UCS_TEST_P(test_ucp_peer_failure, rndv_disable) { + const size_t size_max = std::numeric_limits::max(); + + sender().connect(&receiver(), get_ep_params(), STABLE_EP_INDEX); + EXPECT_EQ(size_max, ucp_ep_config(sender().ep())->tag.rndv.am_thresh); + EXPECT_EQ(size_max, ucp_ep_config(sender().ep())->tag.rndv.rma_thresh); + EXPECT_EQ(size_max, ucp_ep_config(sender().ep())->tag.rndv_send_nbr.am_thresh); + EXPECT_EQ(size_max, ucp_ep_config(sender().ep())->tag.rndv_send_nbr.rma_thresh); +} + UCS_TEST_P(test_ucp_peer_failure, zcopy, "ZCOPY_THRESH=1023") { - do_test(1024, /* msg_size */ + do_test(UCS_KBYTE, /* msg_size */ 0, /* pre_msg_cnt */ false, /* force_close */ true /* must_fail */); } -UCS_TEST_P(test_ucp_peer_failure, bcopy_multi, "MAX_BCOPY?=512", "RC_TM_ENABLE?=n") { - do_test(1024, /* msg_size */ +UCS_TEST_P(test_ucp_peer_failure, bcopy_multi, "SEG_SIZE?=512", "RC_TM_ENABLE?=n") { + do_test(UCS_KBYTE, /* msg_size */ 0, /* pre_msg_cnt */ false, /* force_close */ false /* must_fail */); @@ -386,15 +446,12 @@ UCS_TEST_P(test_ucp_peer_failure, force_close, "RC_FC_ENABLE?=n") { false /* must_fail */); } -UCS_TEST_P(test_ucp_peer_failure, disable_sync_send) { +UCS_TEST_SKIP_COND_P(test_ucp_peer_failure, disable_sync_send, + !(GetParam().variant & TEST_TAG)) { const size_t max_size = UCS_MBYTE; std::vector buf(max_size, 0); void *req; - if (!(GetParam().variant & TEST_TAG)) { - UCS_TEST_SKIP_R("Skip non-tagged variant"); - } - sender().connect(&receiver(), get_ep_params()); /* Make sure API is disabled for any size and data type */ diff --git a/test/gtest/ucp/test_ucp_perf.cc b/test/gtest/ucp/test_ucp_perf.cc index 0109bca2964..ebed657a573 
100644 --- a/test/gtest/ucp/test_ucp_perf.cc +++ b/test/gtest/ucp/test_ucp_perf.cc @@ -26,7 +26,9 @@ class test_ucp_perf : public ucp_test, public test_perf { static ucs_log_func_rc_t log_handler(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap) { + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) { // Ignore errors that transport cannot reach peer if (level == UCS_LOG_LEVEL_ERROR) { std::string err_str = format_message(message, ap); @@ -39,124 +41,124 @@ class test_ucp_perf : public ucp_test, public test_perf { return UCS_LOG_FUNC_RC_CONTINUE; } - static test_spec tests[]; + const static test_spec tests[]; }; -test_perf::test_spec test_ucp_perf::tests[] = +const test_perf::test_spec test_ucp_perf::tests[] = { { "tag latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_PINGPONG, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 60.0, 0 }, { "tag iov latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_PINGPONG, - UCP_PERF_DATATYPE_IOV, 8192, 3, { 1024, 1024, 1024 }, 1, 100000l, + UCP_PERF_DATATYPE_IOV, 8192, 3, { 1024, 1024, 1024 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 60.0, 0 }, { "tag mr", "Mpps", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 2000000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.1, 100.0, 0 }, { "tag sync mr", "Mpps", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG_SYNC, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 2000000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 200000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.05, 100.0, 0}, { "tag wild 
mr", "Mpps", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 2000000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.1, 100.0, UCX_PERF_TEST_FLAG_TAG_WILDCARD }, { "tag bw", "MB/sec", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_LAST, 0, 1, { 2048 }, 1, 100000l, + UCT_PERF_DATA_LAYOUT_LAST, 0, 1, { 2048 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 100.0, 100000.0 }, { "tag bw_zcopy_multi", "MB/sec", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_LAST, 0, 1, { 2048 }, 16, 100000l, + UCT_PERF_DATA_LAYOUT_LAST, 0, 1, { 2048 }, 16, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 100.0, 100000.0 }, { "put latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, 0 }, { "put rate", "Mpps", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 2000000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.5, 100.0, 0 }, { "put bw", "MB/sec", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 2048 }, 1, 100000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 2048 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 200.0, 100000.0, 0 }, { "get latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_GET, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, 0 }, 
{ "get bw", "MB/sec", UCX_PERF_API_UCP, UCX_PERF_CMD_GET, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 16384 }, 1, 10000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 16384 }, 1, 10000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 200.0, 100000.0, 0 }, { "stream latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_STREAM, UCX_PERF_TEST_TYPE_PINGPONG, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, 0 }, { "stream bw", "MB/sec", UCX_PERF_API_UCP, UCX_PERF_CMD_STREAM, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 16384 }, 1, 10000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 16384 }, 1, 10000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 200.0, 100000.0, 0 }, { "stream recv-data latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_STREAM, UCX_PERF_TEST_TYPE_PINGPONG, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, UCX_PERF_TEST_FLAG_STREAM_RECV_DATA }, { "stream recv-data bw", "MB/sec", UCX_PERF_API_UCP, UCX_PERF_CMD_STREAM, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 16384 }, 1, 10000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 16384 }, 1, 10000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 200.0, 100000.0, UCX_PERF_TEST_FLAG_STREAM_RECV_DATA }, { "atomic add rate", "Mpps", UCX_PERF_API_UCP, UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 1000000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 1000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.1, 500.0, 0 }, { "atomic fadd latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_FADD, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 
}, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, 0 }, { "atomic swap latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_SWAP, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, 0 }, { "atomic cswap latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_CSWAP, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000l, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, 0 }, @@ -165,24 +167,30 @@ test_perf::test_spec test_ucp_perf::tests[] = UCS_TEST_P(test_ucp_perf, envelope) { - /* Run all tests */ - std::stringstream ss; - ss << GetParam(); bool check_perf = true; - if (ss.str().find("tcp") != std::string::npos) { + size_t max_iter = std::numeric_limits::max(); + + if (has_transport("tcp")) { check_perf = false; + max_iter = 1000lu; } + std::stringstream ss; + ss << GetParam(); /* coverity[tainted_string_argument] */ ucs::scoped_setenv tls("UCX_TLS", ss.str().c_str()); ucs::scoped_setenv warn_invalid("UCX_WARN_INVALID_CONFIG", "no"); - for (test_spec *test = tests; test->title != NULL; ++test) { + /* Run all tests */ + for (const test_spec *test_iter = tests; test_iter->title != NULL; ++test_iter) { + test_spec test = *test_iter; + if (ucs_arch_get_cpu_model() == UCS_CPU_MODEL_ARM_AARCH64) { - test->max *= UCP_ARM_PERF_TEST_MULTIPLIER; - test->min /= UCP_ARM_PERF_TEST_MULTIPLIER; + test.max *= UCP_ARM_PERF_TEST_MULTIPLIER; + test.min /= UCP_ARM_PERF_TEST_MULTIPLIER; } - run_test(*test, 0, check_perf, "", ""); + test.iters = ucs_min(test.iters, max_iter); + run_test(test, 0, check_perf, "", ""); } } diff --git a/test/gtest/ucp/test_ucp_proto.cc b/test/gtest/ucp/test_ucp_proto.cc new file mode 100644 index 00000000000..995d2950ee5 --- /dev/null +++ 
b/test/gtest/ucp/test_ucp_proto.cc @@ -0,0 +1,16 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#include + +extern "C" { +#include +#include +} + +class test_proto : public ucs::test { +}; + diff --git a/test/gtest/ucp/test_ucp_rma.cc b/test/gtest/ucp/test_ucp_rma.cc index 741c2e3dafa..e0e00796d48 100644 --- a/test/gtest/ucp/test_ucp_rma.cc +++ b/test/gtest/ucp/test_ucp_rma.cc @@ -13,16 +13,6 @@ class test_ucp_rma : public test_ucp_memheap { private: static void send_completion(void *request, ucs_status_t status){} public: - void init() { - ucp_test::init(); - - // TODO: need to investigate the slowness of the disabled tests - if ((GetParam().transports.front().compare("dc_x") == 0) && - (GetParam().variant == UCP_MEM_MAP_NONBLOCK)) { - UCS_TEST_SKIP_R("skipping this test until the slowness is resolved"); - } - } - static ucp_params_t get_ctx_params() { ucp_params_t params = ucp_test::get_ctx_params(); params.features |= UCP_FEATURE_RMA; @@ -115,8 +105,6 @@ void test_ucp_rma::test_message_sizes(blocking_send_func_t func, size_t *msizes, } } -static const size_t MEG = 1024 * 1024ULL; - UCS_TEST_P(test_ucp_rma, nbi_small) { size_t sizes[] = { 8, 24, 96, 120, 250, 0}; @@ -135,12 +123,9 @@ UCS_TEST_P(test_ucp_rma, nbi_med) { sizes, 100, 1); } -UCS_TEST_P(test_ucp_rma, nbi_large) { - size_t sizes[] = { 1 * MEG, 3 * MEG, 9 * MEG, 17 * MEG, 32 * MEG, 0}; - - if (RUNNING_ON_VALGRIND) { - UCS_TEST_SKIP_R("skipping on valgrind"); - } +UCS_TEST_SKIP_COND_P(test_ucp_rma, nbi_large, RUNNING_ON_VALGRIND) { + size_t sizes[] = { 1 * UCS_MBYTE, 3 * UCS_MBYTE, 9 * UCS_MBYTE, + 17 * UCS_MBYTE, 32 * UCS_MBYTE, 0}; test_message_sizes(static_cast(&test_ucp_rma::nonblocking_put_nbi), sizes, 3, 1); @@ -166,12 +151,9 @@ UCS_TEST_P(test_ucp_rma, nb_med) { sizes, 100, 1); } -UCS_TEST_P(test_ucp_rma, nb_large) { - size_t sizes[] = { 1 * MEG, 3 * MEG, 9 * MEG, 17 * MEG, 32 * MEG, 0}; - - if (RUNNING_ON_VALGRIND) { - 
UCS_TEST_SKIP_R("skipping on valgrind"); - } +UCS_TEST_SKIP_COND_P(test_ucp_rma, nb_large, RUNNING_ON_VALGRIND) { + size_t sizes[] = { 1 * UCS_MBYTE, 3 * UCS_MBYTE, 9 * UCS_MBYTE, + 17 * UCS_MBYTE, 32 * UCS_MBYTE, 0}; test_message_sizes(static_cast(&test_ucp_rma::nonblocking_put_nb), sizes, 3, 1); diff --git a/test/gtest/ucp/test_ucp_rma_mt.cc b/test/gtest/ucp/test_ucp_rma_mt.cc index 8ca56c85a3f..120b0355251 100644 --- a/test/gtest/ucp/test_ucp_rma_mt.cc +++ b/test/gtest/ucp/test_ucp_rma_mt.cc @@ -53,7 +53,6 @@ class test_ucp_rma_mt : public ucp_test { }; UCS_TEST_P(test_ucp_rma_mt, put_get) { - int i; ucs_status_t st; uint64_t orig_data[MT_TEST_NUM_THREADS] GTEST_ATTRIBUTE_UNUSED_; uint64_t target_data[MT_TEST_NUM_THREADS] GTEST_ATTRIBUTE_UNUSED_; @@ -78,26 +77,35 @@ UCS_TEST_P(test_ucp_rma_mt, put_get) { st = ucp_rkey_pack(receiver().ucph(), memh, &rkey_buffer, &rkey_buffer_size); ASSERT_UCS_OK(st); - ucp_rkey_h *rkey; + std::vector rkey; + rkey.resize(MT_TEST_NUM_THREADS); - rkey = (ucp_rkey_h *)malloc(sizeof(ucp_rkey_h) * sender().get_num_workers()); - for (i = 0; i < sender().get_num_workers(); i++) { - st = ucp_ep_rkey_unpack(sender().ep(i), rkey_buffer, &rkey[i]); - ASSERT_UCS_OK(st); + /* test parallel rkey unpack */ +#if _OPENMP && ENABLE_MT +#pragma omp parallel for + for (int i = 0; i < MT_TEST_NUM_THREADS; i++) { + int worker_index = 0; + if (GetParam().thread_type == MULTI_THREAD_CONTEXT) { + worker_index = i; + } + ucs_status_t status = ucp_ep_rkey_unpack(sender().ep(worker_index), + rkey_buffer, &rkey[i]); + ASSERT_UCS_OK(status); } +#endif ucp_rkey_buffer_release(rkey_buffer); /* test blocking PUT */ - for (i = 0; i < MT_TEST_NUM_THREADS; i++) { + for (int i = 0; i < MT_TEST_NUM_THREADS; i++) { orig_data[i] = 0xdeadbeefdeadbeef + 10 * i; target_data[i] = 0; } #if _OPENMP && ENABLE_MT #pragma omp parallel for - for (i = 0; i < MT_TEST_NUM_THREADS; i++) { + for (int i = 0; i < MT_TEST_NUM_THREADS; i++) { int worker_index = 0; if 
(GetParam().thread_type == MULTI_THREAD_CONTEXT) { @@ -106,7 +114,7 @@ UCS_TEST_P(test_ucp_rma_mt, put_get) { void* req = ucp_put_nb(sender().ep(worker_index), &orig_data[i], sizeof(uint64_t), (uintptr_t)((uint64_t*)memheap + i), - rkey[worker_index], send_cb); + rkey[i], send_cb); wait(req, worker_index); flush_worker(sender(), worker_index); @@ -117,14 +125,14 @@ UCS_TEST_P(test_ucp_rma_mt, put_get) { /* test nonblocking PUT */ - for (i = 0; i < MT_TEST_NUM_THREADS; i++) { + for (int i = 0; i < MT_TEST_NUM_THREADS; i++) { orig_data[i] = 0xdeadbeefdeadbeef + 10 * i; target_data[i] = 0; } #if _OPENMP && ENABLE_MT #pragma omp parallel for - for (i = 0; i < MT_TEST_NUM_THREADS; i++) { + for (int i = 0; i < MT_TEST_NUM_THREADS; i++) { ucs_status_t status; int worker_index = 0; @@ -132,7 +140,7 @@ UCS_TEST_P(test_ucp_rma_mt, put_get) { worker_index = i; status = ucp_put_nbi(sender().ep(worker_index), &orig_data[i], sizeof(uint64_t), - (uintptr_t)((uint64_t*)memheap + i), rkey[worker_index]); + (uintptr_t)((uint64_t*)memheap + i), rkey[i]); ASSERT_UCS_OK_OR_INPROGRESS(status); flush_worker(sender(), worker_index); @@ -143,14 +151,14 @@ UCS_TEST_P(test_ucp_rma_mt, put_get) { /* test blocking GET */ - for (i = 0; i < MT_TEST_NUM_THREADS; i++) { + for (int i = 0; i < MT_TEST_NUM_THREADS; i++) { orig_data[i] = 0; target_data[i] = 0xdeadbeefdeadbeef + 10 * i; } #if _OPENMP && ENABLE_MT #pragma omp parallel for - for (i = 0; i < MT_TEST_NUM_THREADS; i++) { + for (int i = 0; i < MT_TEST_NUM_THREADS; i++) { int worker_index = 0; if (GetParam().thread_type == MULTI_THREAD_CONTEXT) { @@ -159,7 +167,7 @@ UCS_TEST_P(test_ucp_rma_mt, put_get) { void *req = ucp_get_nb(sender().ep(worker_index), &orig_data[i], sizeof(uint64_t), (uintptr_t)((uint64_t*)memheap + i), - rkey[worker_index], send_cb); + rkey[i], send_cb); wait(req, worker_index); flush_worker(sender(), worker_index); @@ -170,14 +178,14 @@ UCS_TEST_P(test_ucp_rma_mt, put_get) { /* test nonblocking GET */ - for (i = 0; i < 
MT_TEST_NUM_THREADS; i++) { + for (int i = 0; i < MT_TEST_NUM_THREADS; i++) { orig_data[i] = 0; target_data[i] = 0xdeadbeefdeadbeef + 10 * i; } #if _OPENMP && ENABLE_MT #pragma omp parallel for - for (i = 0; i < MT_TEST_NUM_THREADS; i++) { + for (int i = 0; i < MT_TEST_NUM_THREADS; i++) { ucs_status_t status; int worker_index = 0; @@ -185,7 +193,7 @@ UCS_TEST_P(test_ucp_rma_mt, put_get) { worker_index = i; status = ucp_get_nbi(sender().ep(worker_index), &orig_data[i], sizeof(uint64_t), - (uintptr_t)((uint64_t *)memheap + i), rkey[worker_index]); + (uintptr_t)((uint64_t *)memheap + i), rkey[i]); ASSERT_UCS_OK_OR_INPROGRESS(status); flush_worker(sender(), worker_index); @@ -194,10 +202,12 @@ UCS_TEST_P(test_ucp_rma_mt, put_get) { } #endif - for (i = 0; i < sender().get_num_workers(); i++) { +#if _OPENMP && ENABLE_MT +#pragma omp parallel for + for (int i = 0; i < MT_TEST_NUM_THREADS; i++) { ucp_rkey_destroy(rkey[i]); } - free(rkey); +#endif st = ucp_mem_unmap(receiver().ucph(), memh); ASSERT_UCS_OK(st); diff --git a/test/gtest/ucp/test_ucp_sockaddr.cc b/test/gtest/ucp/test_ucp_sockaddr.cc index 89e408d7e43..6a61e1e8770 100644 --- a/test/gtest/ucp/test_ucp_sockaddr.cc +++ b/test/gtest/ucp/test_ucp_sockaddr.cc @@ -5,16 +5,22 @@ */ #include "ucp_test.h" +#include "common/test.h" +#include "ucp/ucp_test.h" #include #include #include #include +extern "C" { +#include +} + #define UCP_INSTANTIATE_ALL_TEST_CASE(_test_case) \ UCP_INSTANTIATE_TEST_CASE (_test_case) \ UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, shm, "shm") \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, dc_ud, "dc_x,ud,ud_x,mm") \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, dc_ud, "dc_x,ud_v,ud_x,mm") \ UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, no_ud_ud_x, "dc_x,mm") \ /* dc_ud case is for testing handling of a large worker address on * UCT_IFACE_FLAG_CONNECT_TO_IFACE transports (dc_x) */ @@ -32,19 +38,55 @@ class test_ucp_sockaddr : public ucp_test { } enum { - MT_PARAM_VARIANT = DEFAULT_PARAM_VARIANT + 1, /* 
Enabled worker level - multi-threading */ - CONN_REQ_TAG, /* Accepting by ucp_conn_request_h, + CONN_REQ_TAG = DEFAULT_PARAM_VARIANT + 1, /* Accepting by ucp_conn_request_h, send/recv by TAG API */ CONN_REQ_STREAM /* Accepting by ucp_conn_request_h, send/recv by STREAM API */ }; + enum { + TEST_MODIFIER_MASK = UCS_MASK(16), + TEST_MODIFIER_MT = UCS_BIT(16), + TEST_MODIFIER_CM = UCS_BIT(17) + }; + + enum { + SEND_DIRECTION_C2S = UCS_BIT(0), /* send data from client to server */ + SEND_DIRECTION_S2C = UCS_BIT(1), /* send data from server to client */ + SEND_DIRECTION_BIDI = SEND_DIRECTION_C2S | SEND_DIRECTION_S2C /* bidirectional send */ + }; + typedef enum { SEND_RECV_TAG, SEND_RECV_STREAM } send_recv_type_t; + ucs::sock_addr_storage m_test_addr; + + void init() { + if (GetParam().variant & TEST_MODIFIER_CM) { + modify_config("SOCKADDR_CM_ENABLE", "yes"); + } + get_sockaddr(); + ucp_test::init(); + skip_loopback(); + } + + static void + enum_test_params_with_modifier(const ucp_params_t& ctx_params, + const std::string& name, + const std::string& test_case_name, + const std::string& tls, + std::vector &result, + unsigned modifier) + { + generate_test_params_variant(ctx_params, name, test_case_name, tls, + modifier, result, SINGLE_THREAD); + generate_test_params_variant(ctx_params, name, test_case_name, tls, + modifier | TEST_MODIFIER_MT, result, + MULTI_THREAD_WORKER); + } + static std::vector enum_test_params(const ucp_params_t& ctx_params, const std::string& name, @@ -54,70 +96,111 @@ class test_ucp_sockaddr : public ucp_test { std::vector result = ucp_test::enum_test_params(ctx_params, name, test_case_name, tls); - generate_test_params_variant(ctx_params, name, test_case_name, tls, - MT_PARAM_VARIANT, result, - MULTI_THREAD_WORKER); - generate_test_params_variant(ctx_params, name, test_case_name, tls, - CONN_REQ_TAG, result); - generate_test_params_variant(ctx_params, name, test_case_name, tls, - CONN_REQ_STREAM, result); + 
enum_test_params_with_modifier(ctx_params, name, test_case_name, tls, + result, CONN_REQ_TAG); + enum_test_params_with_modifier(ctx_params, name, test_case_name, tls, + result, CONN_REQ_TAG | TEST_MODIFIER_CM); + enum_test_params_with_modifier(ctx_params, name, test_case_name, tls, + result, CONN_REQ_STREAM); + enum_test_params_with_modifier(ctx_params, name, test_case_name, tls, + result, CONN_REQ_STREAM | TEST_MODIFIER_CM); return result; } static ucs_log_func_rc_t detect_error_logger(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap) + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) { if (level == UCS_LOG_LEVEL_ERROR) { - std::string err_str = format_message(message, ap); - if ((strstr(err_str.c_str(), "no supported sockaddr auxiliary transports found for")) || - (strstr(err_str.c_str(), "sockaddr aux resources addresses")) || - (strstr(err_str.c_str(), "no peer failure handler")) || - (strstr(err_str.c_str(), "connection request failed on listener")) || + static std::vector stop_list; + if (stop_list.empty()) { + stop_list.push_back("no supported sockaddr auxiliary transports found for"); + stop_list.push_back("sockaddr aux resources addresses"); + stop_list.push_back("no peer failure handler"); + stop_list.push_back("connection request failed on listener"); /* when the "peer failure" error happens, it is followed by: */ - (strstr(err_str.c_str(), "received event RDMA_CM_EVENT_UNREACHABLE"))) { - UCS_TEST_MESSAGE << err_str; - return UCS_LOG_FUNC_RC_STOP; + stop_list.push_back("received event RDMA_CM_EVENT_UNREACHABLE"); + stop_list.push_back(ucs_status_string(UCS_ERR_UNREACHABLE)); + stop_list.push_back(ucs_status_string(UCS_ERR_UNSUPPORTED)); + } + + std::string err_str = format_message(message, ap); + for (size_t i = 0; i < stop_list.size(); ++i) { + if (err_str.find(stop_list[i]) != std::string::npos) { + UCS_TEST_MESSAGE << err_str; + 
return UCS_LOG_FUNC_RC_STOP; + } } } return UCS_LOG_FUNC_RC_CONTINUE; } - void get_listen_addr(struct sockaddr_in *listen_addr) { + void get_sockaddr() { + std::vector saddrs; struct ifaddrs* ifaddrs; + ucs_status_t status; + size_t size; int ret = getifaddrs(&ifaddrs); ASSERT_EQ(ret, 0); for (struct ifaddrs *ifa = ifaddrs; ifa != NULL; ifa = ifa->ifa_next) { - if (ucs_netif_is_active(ifa->ifa_name) && - ucs::is_inet_addr(ifa->ifa_addr) && + if (ucs_netif_flags_is_active(ifa->ifa_flags) && + ucs::is_inet_addr(ifa->ifa_addr) && ucs::is_rdmacm_netdev(ifa->ifa_name)) { - *listen_addr = *(struct sockaddr_in*)(void*)ifa->ifa_addr; - listen_addr->sin_port = ucs::get_port(); - freeifaddrs(ifaddrs); - return; + saddrs.push_back(ucs::sock_addr_storage()); + status = ucs_sockaddr_sizeof(ifa->ifa_addr, &size); + ASSERT_UCS_OK(status); + saddrs.back().set_sock_addr(*ifa->ifa_addr, size); + saddrs.back().set_port(0); /* listen on any port then update */ } } + freeifaddrs(ifaddrs); - UCS_TEST_SKIP_R("No interface for testing"); - } - void inaddr_any_addr(struct sockaddr_in *addr, in_port_t port) - { - memset(addr, 0, sizeof(struct sockaddr_in)); - addr->sin_family = AF_INET; - addr->sin_addr.s_addr = INADDR_ANY; - addr->sin_port = port; + if (saddrs.empty()) { + UCS_TEST_SKIP_R("No interface for testing"); + } + + static const std::string dc_tls[] = { "dc", "dc_x", "ib" }; + + bool has_dc = has_any_transport( + std::vector(dc_tls, dc_tls + ucs_array_size(dc_tls))); + + /* FIXME: select random interface, except for DC transport, which do not + yet support having different gid_index for different UCT + endpoints on same iface */ + int saddr_idx = has_dc ? 
0 : (ucs::rand() % saddrs.size()); + m_test_addr = saddrs[saddr_idx]; } - void start_listener(ucp_test_base::entity::listen_cb_type_t cb_type, - const struct sockaddr* addr) + void start_listener(ucp_test_base::entity::listen_cb_type_t cb_type) { - ucs_status_t status = receiver().listen(cb_type, addr, sizeof(*addr)); + ucs_time_t deadline = ucs::get_deadline(); + ucs_status_t status; + + do { + status = receiver().listen(cb_type, m_test_addr.get_sock_addr_ptr(), + m_test_addr.get_addr_size(), + get_server_ep_params()); + } while ((status == UCS_ERR_BUSY) && (ucs_get_time() < deadline)); + if (status == UCS_ERR_UNREACHABLE) { - UCS_TEST_SKIP_R("cannot listen to " + ucs::sockaddr_to_str(addr)); + UCS_TEST_SKIP_R("cannot listen to " + m_test_addr.to_str()); } + + ASSERT_UCS_OK(status); + ucp_listener_attr_t attr; + uint16_t port; + + attr.field_mask = UCP_LISTENER_ATTR_FIELD_SOCKADDR; + ASSERT_UCS_OK(ucp_listener_query(receiver().listenerh(), &attr)); + ASSERT_UCS_OK(ucs_sockaddr_get_port( + (const struct sockaddr *)&attr.sockaddr, &port)); + m_test_addr.set_port(port); + UCS_TEST_MESSAGE << "server listening on " << m_test_addr.to_str(); } static void scomplete_cb(void *req, ucs_status_t status) @@ -255,9 +338,9 @@ class test_ucp_sockaddr : public ucp_test { progress(); ep_count = ucp_stream_worker_poll(to.worker(), &poll_eps, 1, 0); } while (ep_count == 0); - ASSERT_EQ(1, ep_count); - EXPECT_EQ(to.ep(), poll_eps.ep); - EXPECT_EQ((void *)0xdeadbeef, poll_eps.user_data); + ASSERT_EQ(1, ep_count); + EXPECT_EQ(to.ep(), poll_eps.ep); + EXPECT_EQ(&to, poll_eps.user_data); recv_req = ucp_stream_recv_nb(to.ep(), &recv_data, 1, ucp_dt_make_contig(sizeof(recv_data)), @@ -280,72 +363,83 @@ class test_ucp_sockaddr : public ucp_test { { ucs_time_t deadline = ucs::get_deadline(); - while ((receiver().get_num_eps() == 0) && (m_err_handler_count == 0) && - (ucs_get_time() < deadline)) { + while ((receiver().get_num_eps() == 0) && + (sender().get_err_num() == 0) && (ucs_get_time() 
< deadline)) { check_events(sender().worker(), receiver().worker(), wakeup, NULL); } - return (m_err_handler_count == 0) && (receiver().get_num_eps() > 0); + + return (sender().get_err_num() == 0) && (receiver().get_num_eps() > 0); } void wait_for_reject(entity &e, bool wakeup) { ucs_time_t deadline = ucs::get_deadline(); - while ((e.get_rejected_cntr() == 0) && - (ucs_get_time() < deadline)) { + while ((e.get_err_num_rejected() == 0) && (ucs_get_time() < deadline)) { check_events(sender().worker(), receiver().worker(), wakeup, NULL); } + EXPECT_GT(deadline, ucs_get_time()); - EXPECT_EQ(1ul, e.get_rejected_cntr()); + EXPECT_EQ(1ul, e.get_err_num_rejected()); } virtual ucp_ep_params_t get_ep_params() { ucp_ep_params_t ep_params = ucp_test::get_ep_params(); ep_params.field_mask |= UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE | - UCP_EP_PARAM_FIELD_ERR_HANDLER | - UCP_EP_PARAM_FIELD_USER_DATA; + UCP_EP_PARAM_FIELD_ERR_HANDLER; /* The error handling requirement is needed since we need to take * care of a case where the client gets an error. 
In case ucp needs to * handle a large worker address but neither ud nor ud_x are present */ ep_params.err_mode = UCP_ERR_HANDLING_MODE_PEER; ep_params.err_handler.cb = err_handler_cb; ep_params.err_handler.arg = NULL; - ep_params.user_data = reinterpret_cast(this); return ep_params; } - void client_ep_connect(struct sockaddr *connect_addr) + virtual ucp_ep_params_t get_server_ep_params() { + return get_ep_params(); + } + + void client_ep_connect() { ucp_ep_params_t ep_params = get_ep_params(); ep_params.field_mask |= UCP_EP_PARAM_FIELD_FLAGS | - UCP_EP_PARAM_FIELD_SOCK_ADDR; + UCP_EP_PARAM_FIELD_SOCK_ADDR | + UCP_EP_PARAM_FIELD_USER_DATA; ep_params.flags = UCP_EP_PARAMS_FLAGS_CLIENT_SERVER; - ep_params.sockaddr.addr = connect_addr; - ep_params.sockaddr.addrlen = sizeof(*connect_addr); + ep_params.sockaddr.addr = m_test_addr.get_sock_addr_ptr(); + ep_params.sockaddr.addrlen = m_test_addr.get_addr_size(); + ep_params.user_data = &sender(); sender().connect(&receiver(), ep_params); } - void connect_and_send_recv(struct sockaddr *connect_addr, bool wakeup) + void connect_and_send_recv(bool wakeup, uint64_t flags) { { scoped_log_handler slh(detect_error_logger); - client_ep_connect(connect_addr); + client_ep_connect(); if (!wait_for_server_ep(wakeup)) { UCS_TEST_SKIP_R("cannot connect to server"); } } - send_recv(sender(), receiver(), - (GetParam().variant == CONN_REQ_STREAM) ? 
SEND_RECV_STREAM : - SEND_RECV_TAG, wakeup, cb_type()); + if (flags & SEND_DIRECTION_C2S) { + send_recv(sender(), receiver(), send_recv_type(), wakeup, + cb_type()); + } + + if (flags & SEND_DIRECTION_S2C) { + send_recv(receiver(), sender(), send_recv_type(), wakeup, + cb_type()); + } } - void connect_and_reject(struct sockaddr *connect_addr, bool wakeup) + void connect_and_reject(bool wakeup) { { scoped_log_handler slh(detect_error_logger); - client_ep_connect(connect_addr); + client_ep_connect(); /* Check reachability with tagged send */ send_recv(sender(), receiver(), SEND_RECV_TAG, wakeup, ucp_test_base::entity::LISTEN_CB_REJECT); @@ -354,123 +448,370 @@ class test_ucp_sockaddr : public ucp_test { wait_for_reject(sender(), wakeup); } - void listen_and_communicate(ucp_test_base::entity::listen_cb_type_t cb_type, - bool wakeup) + void listen_and_communicate(bool wakeup, uint64_t flags) { - struct sockaddr_in connect_addr; - get_listen_addr(&connect_addr); + UCS_TEST_MESSAGE << "Testing " << m_test_addr.to_str(); - UCS_TEST_MESSAGE << "Testing " - << ucs::sockaddr_to_str( - (const struct sockaddr*)&connect_addr); - - start_listener(cb_type, (const struct sockaddr*)&connect_addr); - connect_and_send_recv((struct sockaddr*)&connect_addr, wakeup); + start_listener(cb_type()); + connect_and_send_recv(wakeup, flags); } - void listen_and_reject(ucp_test_base::entity::listen_cb_type_t cb_type, - bool wakeup) + void listen_and_reject(bool wakeup) { - struct sockaddr_in connect_addr; - get_listen_addr(&connect_addr); + UCS_TEST_MESSAGE << "Testing " << m_test_addr.to_str(); - UCS_TEST_MESSAGE << "Testing " - << ucs::sockaddr_to_str( - (const struct sockaddr*)&connect_addr); - start_listener(cb_type, (const struct sockaddr*)&connect_addr); - connect_and_reject((struct sockaddr*)&connect_addr, wakeup); + start_listener(ucp_test_base::entity::LISTEN_CB_REJECT); + connect_and_reject(wakeup); } + void one_sided_disconnect(entity &e, enum ucp_ep_close_mode mode) { + void *req = 
e.disconnect_nb(0, 0, mode); + ucs_time_t deadline = ucs_time_from_sec(10.0) + ucs_get_time(); + while (!is_request_completed(req) && (ucs_get_time() < deadline)) { + /* TODO: replace the progress() with e().progress() when + async progress is implemented. */ + progress(); + }; - static void err_handler_cb(void *arg, ucp_ep_h ep, ucs_status_t status) { - test_ucp_sockaddr *self = reinterpret_cast(arg); - ucp_test::err_handler_cb(static_cast(self), ep, status); - - if (status == UCS_ERR_REJECTED) { - entity *e = self->get_entity_by_ep(ep); - if (e != NULL) { - e->inc_rejected_cntr(); - return; - } + e.close_ep_req_free(req); + } + + void concurrent_disconnect(enum ucp_ep_close_mode mode) { + ASSERT_EQ(2ul, entities().size()); + ASSERT_EQ(1, sender().get_num_workers()); + ASSERT_EQ(1, sender().get_num_eps()); + ASSERT_EQ(1, receiver().get_num_workers()); + ASSERT_EQ(1, receiver().get_num_eps()); + + void *sender_ep_close_req = sender().disconnect_nb(0, 0, mode); + void *receiver_ep_close_req = receiver().disconnect_nb(0, 0, mode); + + ucs_time_t deadline = ucs::get_deadline(); + while ((!is_request_completed(sender_ep_close_req) || + !is_request_completed(receiver_ep_close_req)) && + (ucs_get_time() < deadline)) { + progress(); } + sender().close_ep_req_free(sender_ep_close_req); + receiver().close_ep_req_free(receiver_ep_close_req); + } + + static void err_handler_cb(void *arg, ucp_ep_h ep, ucs_status_t status) { + ucp_test::err_handler_cb(arg, ep, status); + /* The current expected errors are only from the err_handle test * and from transports where the worker address is too long but ud/ud_x * are not present, or ud/ud_x are present but their addresses are too - * long as well */ - if (status != UCS_ERR_UNREACHABLE) { + * long as well, in addition we can get disconnect events during test + * teardown. 
+ */ + switch (status) { + case UCS_ERR_REJECTED: + case UCS_ERR_UNREACHABLE: + case UCS_ERR_CONNECTION_RESET: + UCS_TEST_MESSAGE << "ignoring error " < 0) { - UCS_TEST_SKIP_R("Not parameterized test"); - } +UCS_TEST_P(test_ucp_sockaddr, onesided_disconnect_c2s) { + listen_and_communicate(false, SEND_DIRECTION_C2S); + one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); +} - listen_and_reject(ucp_test_base::entity::LISTEN_CB_REJECT, false); +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, onesided_disconnect_s2c, + no_close_protocol()) { + listen_and_communicate(false, SEND_DIRECTION_S2C); + one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); } -UCS_TEST_P(test_ucp_sockaddr, err_handle) { +UCS_TEST_P(test_ucp_sockaddr, onesided_disconnect_bidi) { + listen_and_communicate(false, SEND_DIRECTION_BIDI); + one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); +} - struct sockaddr_in listen_addr; +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, concurrent_disconnect, + no_close_protocol()) { + listen_and_communicate(false, 0); + concurrent_disconnect(UCP_EP_CLOSE_MODE_FLUSH); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, concurrent_disconnect_c2s, + no_close_protocol()) { + listen_and_communicate(false, SEND_DIRECTION_C2S); + concurrent_disconnect(UCP_EP_CLOSE_MODE_FLUSH); +} - get_listen_addr(&listen_addr); +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, concurrent_disconnect_s2c, + no_close_protocol()) { + listen_and_communicate(false, SEND_DIRECTION_S2C); + concurrent_disconnect(UCP_EP_CLOSE_MODE_FLUSH); +} + +UCS_TEST_P(test_ucp_sockaddr, concurrent_disconnect_bidi) { + listen_and_communicate(false, SEND_DIRECTION_BIDI); + concurrent_disconnect(UCP_EP_CLOSE_MODE_FLUSH); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, concurrent_disconnect_force, + no_close_protocol()) { + listen_and_communicate(false, 0); + concurrent_disconnect(UCP_EP_CLOSE_MODE_FORCE); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, concurrent_disconnect_force_c2s, + no_close_protocol()) { + 
listen_and_communicate(false, SEND_DIRECTION_C2S); + concurrent_disconnect(UCP_EP_CLOSE_MODE_FORCE); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, concurrent_disconnect_force_s2c, + no_close_protocol()) { + listen_and_communicate(false, SEND_DIRECTION_S2C); + concurrent_disconnect(UCP_EP_CLOSE_MODE_FORCE); +} +UCS_TEST_P(test_ucp_sockaddr, concurrent_disconnect_force_bidi) { + listen_and_communicate(false, SEND_DIRECTION_BIDI); + concurrent_disconnect(UCP_EP_CLOSE_MODE_FORCE); +} + +UCS_TEST_P(test_ucp_sockaddr, listen_inaddr_any) { + /* save testing address */ + ucs::sock_addr_storage test_addr(m_test_addr); + m_test_addr.reset_to_any(); + + UCS_TEST_MESSAGE << "Testing " << m_test_addr.to_str(); + + start_listener(cb_type()); + /* get the actual port which was selected by listener */ + test_addr.set_port(m_test_addr.get_port()); + /* restore address */ + m_test_addr = test_addr; + connect_and_send_recv(false, SEND_DIRECTION_C2S); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, reject, nonparameterized_test()) { + listen_and_reject(false); +} + +UCS_TEST_P(test_ucp_sockaddr, listener_query) { + ucp_listener_attr_t listener_attr; + ucs_status_t status; + + listener_attr.field_mask = UCP_LISTENER_ATTR_FIELD_SOCKADDR; + + UCS_TEST_MESSAGE << "Testing " << m_test_addr.to_str(); + + start_listener(cb_type()); + status = ucp_listener_query(receiver().listenerh(), &listener_attr); + EXPECT_UCS_OK(status); + + EXPECT_EQ(m_test_addr, listener_attr.sockaddr); +} + +UCS_TEST_P(test_ucp_sockaddr, err_handle) { + + ucs::sock_addr_storage listen_addr(m_test_addr.to_ucs_sock_addr()); ucs_status_t status = receiver().listen(cb_type(), - (const struct sockaddr*)&listen_addr, - sizeof(listen_addr)); + m_test_addr.get_sock_addr_ptr(), + m_test_addr.get_addr_size(), + get_server_ep_params()); if (status == UCS_ERR_UNREACHABLE) { - UCS_TEST_SKIP_R("cannot listen to " + ucs::sockaddr_to_str(&listen_addr)); + UCS_TEST_SKIP_R("cannot listen to " + m_test_addr.to_str()); } /* make the 
client try to connect to a non-existing port on the server side */ - listen_addr.sin_port = 1; + m_test_addr.set_port(1); { scoped_log_handler slh(wrap_errors_logger); - client_ep_connect((struct sockaddr*)&listen_addr); + client_ep_connect(); /* allow for the unreachable event to arrive before restoring errors */ - wait_for_flag(&m_err_handler_count); + wait_for_flag(&sender().get_err_num()); } - EXPECT_EQ(1, m_err_handler_count); + EXPECT_EQ(1u, sender().get_err_num()); } UCP_INSTANTIATE_ALL_TEST_CASE(test_ucp_sockaddr) +class test_ucp_sockaddr_destroy_ep_on_err : public test_ucp_sockaddr { +public: + test_ucp_sockaddr_destroy_ep_on_err() { + /* Set small TL timeouts to reduce testing time */ + m_env.push_back(new ucs::scoped_setenv("UCX_RC_TIMEOUT", "10ms")); + m_env.push_back(new ucs::scoped_setenv("UCX_RC_RNR_TIMEOUT", "10ms")); + m_env.push_back(new ucs::scoped_setenv("UCX_RC_RETRY_COUNT", "2")); + } + + virtual ucp_ep_params_t get_server_ep_params() { + ucp_ep_params_t params = test_ucp_sockaddr::get_server_ep_params(); + + params.field_mask |= UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE | + UCP_EP_PARAM_FIELD_ERR_HANDLER | + UCP_EP_PARAM_FIELD_USER_DATA; + params.err_mode = UCP_ERR_HANDLING_MODE_PEER; + params.err_handler.cb = err_handler_cb; + params.err_handler.arg = NULL; + params.user_data = &receiver(); + return params; + } + + static void err_handler_cb(void *arg, ucp_ep_h ep, ucs_status_t status) { + test_ucp_sockaddr::err_handler_cb(arg, ep, status); + entity *e = reinterpret_cast(arg); + e->disconnect_nb(0, 0, UCP_EP_CLOSE_MODE_FORCE); + } + +private: + ucs::ptr_vector m_env; +}; + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, empty, + no_close_protocol()) { + listen_and_communicate(false, 0); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, s2c, + no_close_protocol()) { + listen_and_communicate(false, SEND_DIRECTION_S2C); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, c2s, + no_close_protocol()) { + 
listen_and_communicate(false, SEND_DIRECTION_C2S); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, bidi, + no_close_protocol()) { + listen_and_communicate(false, SEND_DIRECTION_BIDI); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_client_cforce, + no_close_protocol()) { + listen_and_communicate(false, 0); + scoped_log_handler slh(wrap_errors_logger); + one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FORCE); + one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FLUSH); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_c2s_cforce, + no_close_protocol()) { + listen_and_communicate(false, SEND_DIRECTION_C2S); + scoped_log_handler slh(wrap_errors_logger); + one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FORCE); + one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FLUSH); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_s2c_cforce, + no_close_protocol()) { + listen_and_communicate(false, SEND_DIRECTION_S2C); + scoped_log_handler slh(wrap_errors_logger); + one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FORCE); + one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FLUSH); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_bidi_cforce, + no_close_protocol()) { + listen_and_communicate(false, SEND_DIRECTION_BIDI); + scoped_log_handler slh(wrap_errors_logger); + one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FORCE); + one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FLUSH); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_client_sforce, + no_close_protocol()) { + listen_and_communicate(false, 0); + scoped_log_handler slh(wrap_errors_logger); + one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FORCE); + one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_c2s_sforce, + no_close_protocol()) { + listen_and_communicate(false, SEND_DIRECTION_C2S); + 
scoped_log_handler slh(wrap_errors_logger); + one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FORCE); + one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_s2c_sforce, + no_close_protocol()) { + listen_and_communicate(false, SEND_DIRECTION_S2C); + scoped_log_handler slh(wrap_errors_logger); + one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FORCE); + one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_bidi_sforce, + no_close_protocol()) { + listen_and_communicate(false, SEND_DIRECTION_BIDI); + scoped_log_handler slh(wrap_errors_logger); + one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FORCE); + one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); +} + +UCP_INSTANTIATE_ALL_TEST_CASE(test_ucp_sockaddr_destroy_ep_on_err) class test_ucp_sockaddr_with_wakeup : public test_ucp_sockaddr { public: - static ucp_params_t get_ctx_params() { ucp_params_t params = test_ucp_sockaddr::get_ctx_params(); params.features |= UCP_FEATURE_WAKEUP; @@ -478,16 +819,27 @@ class test_ucp_sockaddr_with_wakeup : public test_ucp_sockaddr { } }; -UCS_TEST_P(test_ucp_sockaddr_with_wakeup, wakeup) { - listen_and_communicate(cb_type(), true); +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_with_wakeup, wakeup, + no_close_protocol()) { + listen_and_communicate(true, 0); } -UCS_TEST_P(test_ucp_sockaddr_with_wakeup, reject) { - if (GetParam().variant > 0) { - UCS_TEST_SKIP_R("Invalid test parameter"); - } +UCS_TEST_P(test_ucp_sockaddr_with_wakeup, wakeup_c2s) { + listen_and_communicate(true, SEND_DIRECTION_C2S); +} - listen_and_reject(ucp_test_base::entity::LISTEN_CB_REJECT, true); +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_with_wakeup, wakeup_s2c, + no_close_protocol()) { + listen_and_communicate(true, SEND_DIRECTION_S2C); +} + +UCS_TEST_P(test_ucp_sockaddr_with_wakeup, wakeup_bidi) { + listen_and_communicate(true, SEND_DIRECTION_BIDI); +} + 
+UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_with_wakeup, reject, + nonparameterized_test()) { + listen_and_reject(true); } UCP_INSTANTIATE_ALL_TEST_CASE(test_ucp_sockaddr_with_wakeup) @@ -511,30 +863,388 @@ UCS_TEST_P(test_ucp_sockaddr_with_rma_atomic, wireup) { /* This test makes sure that the client-server flow works when the required * features are RMA/ATOMIC. With these features, need to make sure that * there is a lane for ucp-wireup (an am_lane should be created and used) */ - struct sockaddr_in connect_addr; - get_listen_addr(&connect_addr); - - UCS_TEST_MESSAGE << "Testing " << ucs::sockaddr_to_str((const struct sockaddr*)&connect_addr); - - start_listener(cb_type(), (const struct sockaddr*)&connect_addr); + UCS_TEST_MESSAGE << "Testing " << m_test_addr.to_str(); + start_listener(cb_type()); { scoped_log_handler slh(wrap_errors_logger); - client_ep_connect((struct sockaddr*)&connect_addr); + client_ep_connect(); /* allow the err_handler callback to be invoked if needed */ if (!wait_for_server_ep(false)) { - EXPECT_EQ(1, m_err_handler_count); + EXPECT_EQ(1ul, sender().get_err_num()); UCS_TEST_SKIP_R("cannot connect to server"); } - EXPECT_EQ(0, m_err_handler_count); + EXPECT_EQ(0ul, sender().get_err_num()); /* even if server EP is created, in case of long address, wireup will be * done later, need to communicate */ - send_recv(sender(), receiver(), (GetParam().variant == CONN_REQ_STREAM) ? 
- SEND_RECV_STREAM : SEND_RECV_TAG, false, cb_type()); + send_recv(sender(), receiver(), send_recv_type(), false, cb_type()); } } UCP_INSTANTIATE_ALL_TEST_CASE(test_ucp_sockaddr_with_rma_atomic) + + +class test_ucp_sockaddr_protocols : public test_ucp_sockaddr { +public: + static ucp_params_t get_ctx_params() { + ucp_params_t params = test_ucp_sockaddr::get_ctx_params(); + params.field_mask |= UCP_PARAM_FIELD_FEATURES; + params.features |= UCP_FEATURE_RMA | UCP_FEATURE_AM; + /* Atomics not supported for now because need to emulate the case + * of using different device than the one selected by default on the + * worker for atomic operations */ + return params; + } + + static std::vector + enum_test_params(const ucp_params_t& ctx_params, + const std::string& name, + const std::string& test_case_name, + const std::string& tls) + { + std::vector result; + enum_test_params_with_modifier(ctx_params, name, test_case_name, tls, + result, TEST_MODIFIER_CM); + return result; + } + + virtual void init() { + test_ucp_sockaddr::init(); + start_listener(cb_type()); + client_ep_connect(); + } + + void get_nb(std::string& send_buf, std::string& recv_buf, ucp_rkey_h rkey, + std::vector& reqs) + { + reqs.push_back(ucp_get_nb(sender().ep(), &send_buf[0], send_buf.size(), + (uintptr_t)&recv_buf[0], rkey, scomplete_cb)); + } + + void put_nb(std::string& send_buf, std::string& recv_buf, ucp_rkey_h rkey, + std::vector& reqs) + { + reqs.push_back(ucp_put_nb(sender().ep(), &send_buf[0], send_buf.size(), + (uintptr_t)&recv_buf[0], rkey, scomplete_cb)); + reqs.push_back(ucp_ep_flush_nb(sender().ep(), 0, scomplete_cb)); + } + +protected: + typedef void (test_ucp_sockaddr_protocols::*rma_nb_func_t)( + std::string&, std::string&, ucp_rkey_h, std::vector&); + + void compare_buffers(std::string& send_buf, std::string& recv_buf) + { + EXPECT_TRUE(send_buf == recv_buf) + << "send_buf: '" << ucs::compact_string(send_buf, 20) << "', " + << "recv_buf: '" << ucs::compact_string(recv_buf, 20) << "'"; + 
} + + void test_tag_send_recv(size_t size, bool is_exp, bool is_sync = false) + { + std::string send_buf(size, 'x'); + std::string recv_buf(size, 'y'); + + void *rreq = NULL, *sreq = NULL; + + if (is_exp) { + rreq = ucp_tag_recv_nb(receiver().worker(), &recv_buf[0], size, + ucp_dt_make_contig(1), 0, 0, rtag_complete_cb); + } + + if (is_sync) { + sreq = ucp_tag_send_sync_nb(sender().ep(), &send_buf[0], size, + ucp_dt_make_contig(1), 0, scomplete_cb); + } else { + sreq = ucp_tag_send_nb(sender().ep(), &send_buf[0], size, + ucp_dt_make_contig(1), 0, scomplete_cb); + } + + if (!is_exp) { + short_progress_loop(); + rreq = ucp_tag_recv_nb(receiver().worker(), &recv_buf[0], size, + ucp_dt_make_contig(1), 0, 0, rtag_complete_cb); + } + + wait(sreq); + wait(rreq); + + compare_buffers(send_buf, recv_buf); + } + + void wait_for_server_ep() + { + if (!test_ucp_sockaddr::wait_for_server_ep(false)) { + UCS_TEST_ABORT("server endpoint is not created"); + } + } + + void test_stream_send_recv(size_t size, bool is_exp) + { + std::string send_buf(size, 'x'); + std::string recv_buf(size, 'y'); + size_t recv_length; + void *rreq, *sreq; + + if (is_exp) { + wait_for_server_ep(); + rreq = ucp_stream_recv_nb(receiver().ep(), &recv_buf[0], size, + ucp_dt_make_contig(1), rstream_complete_cb, + &recv_length, UCP_STREAM_RECV_FLAG_WAITALL); + sreq = ucp_stream_send_nb(sender().ep(), &send_buf[0], size, + ucp_dt_make_contig(1), scomplete_cb, 0); + } else { + sreq = ucp_stream_send_nb(sender().ep(), &send_buf[0], size, + ucp_dt_make_contig(1), scomplete_cb, 0); + short_progress_loop(); + wait_for_server_ep(); + rreq = ucp_stream_recv_nb(receiver().ep(), &recv_buf[0], size, + ucp_dt_make_contig(1), rstream_complete_cb, + &recv_length, UCP_STREAM_RECV_FLAG_WAITALL); + } + + wait(sreq); + wait(rreq); + + compare_buffers(send_buf, recv_buf); + } + + void register_mem(entity* initiator, entity* target, void *buffer, + size_t length, ucp_mem_h *memh_p, ucp_rkey_h *rkey_p) + { + ucp_mem_map_params_t 
params = {0}; + params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | + UCP_MEM_MAP_PARAM_FIELD_LENGTH; + params.address = buffer; + params.length = length; + + ucs_status_t status = ucp_mem_map(target->ucph(), ¶ms, memh_p); + ASSERT_UCS_OK(status); + + void *rkey_buffer; + size_t rkey_buffer_size; + status = ucp_rkey_pack(target->ucph(), *memh_p, &rkey_buffer, + &rkey_buffer_size); + ASSERT_UCS_OK(status); + + status = ucp_ep_rkey_unpack(initiator->ep(), rkey_buffer, rkey_p); + ASSERT_UCS_OK(status); + + ucp_rkey_buffer_release(rkey_buffer); + } + + void test_rma(size_t size, rma_nb_func_t rma_func) + { + std::string send_buf(size, 'x'); + std::string recv_buf(size, 'y'); + + ucp_mem_h memh; + ucp_rkey_h rkey; + + register_mem(&sender(), &receiver(), &recv_buf[0], size, &memh, &rkey); + + std::vector reqs; + (this->*rma_func)(send_buf, recv_buf, rkey, reqs); + + while (!reqs.empty()) { + wait(reqs.back()); + reqs.pop_back(); + } + + compare_buffers(send_buf, recv_buf); + + ucp_rkey_destroy(rkey); + ucs_status_t status = ucp_mem_unmap(receiver().ucph(), memh); + ASSERT_UCS_OK(status); + } + + void test_am_send_recv(size_t size) + { + std::string sb(size, 'x'); + + bool am_received = false; + ucp_worker_set_am_handler(receiver().worker(), 0, + rx_am_msg_cb, &am_received, 0); + + ucs_status_ptr_t sreq = ucp_am_send_nb(sender().ep(), 0, &sb[0], size, + ucp_dt_make_contig(1), + scomplete_cb, 0); + wait(sreq); + wait_for_flag(&am_received); + EXPECT_TRUE(am_received); + + ucp_worker_set_am_handler(receiver().worker(), 0, NULL, NULL, 0); + } + +private: + static ucs_status_t rx_am_msg_cb(void *arg, void *data, size_t length, + ucp_ep_h reply_ep, unsigned flags) { + volatile bool *am_rx = reinterpret_cast(arg); + EXPECT_FALSE(*am_rx); + *am_rx = true; + return UCS_OK; + } +}; + +UCS_TEST_P(test_ucp_sockaddr_protocols, tag_zcopy_4k_exp, + "ZCOPY_THRESH=2k", "RNDV_THRESH=inf") +{ + test_tag_send_recv(4 * UCS_KBYTE, true); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, 
tag_zcopy_64k_exp, + "ZCOPY_THRESH=2k", "RNDV_THRESH=inf") +{ + test_tag_send_recv(64 * UCS_KBYTE, true); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, tag_zcopy_4k_exp_sync, + "ZCOPY_THRESH=2k", "RNDV_THRESH=inf") +{ + test_tag_send_recv(4 * UCS_KBYTE, true, true); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, tag_zcopy_64k_exp_sync, + "ZCOPY_THRESH=2k", "RNDV_THRESH=inf") +{ + test_tag_send_recv(64 * UCS_KBYTE, true, true); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, tag_rndv_exp, "RNDV_THRESH=10k") +{ + test_tag_send_recv(64 * UCS_KBYTE, true); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, tag_zcopy_4k_unexp, + "ZCOPY_THRESH=2k", "RNDV_THRESH=inf") +{ + test_tag_send_recv(4 * UCS_KBYTE, false); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, tag_zcopy_64k_unexp, + "ZCOPY_THRESH=2k", "RNDV_THRESH=inf") +{ + test_tag_send_recv(64 * UCS_KBYTE, false); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, tag_zcopy_4k_unexp_sync, + "ZCOPY_THRESH=2k", "RNDV_THRESH=inf") +{ + test_tag_send_recv(4 * UCS_KBYTE, false, true); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, tag_zcopy_64k_unexp_sync, + "ZCOPY_THRESH=2k", "RNDV_THRESH=inf") +{ + test_tag_send_recv(64 * UCS_KBYTE, false, true); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, tag_rndv_unexp, "RNDV_THRESH=10k") +{ + test_tag_send_recv(64 * UCS_KBYTE, false); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, stream_bcopy_4k_exp, "ZCOPY_THRESH=inf") +{ + test_stream_send_recv(4 * UCS_KBYTE, true); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, stream_bcopy_4k_unexp, + "ZCOPY_THRESH=inf") +{ + test_stream_send_recv(4 * UCS_KBYTE, false); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, stream_bcopy_64k_exp, "ZCOPY_THRESH=inf") +{ + test_stream_send_recv(64 * UCS_KBYTE, true); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, stream_bcopy_64k_unexp, + "ZCOPY_THRESH=inf") +{ + test_stream_send_recv(64 * UCS_KBYTE, false); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, stream_zcopy_64k_exp, "ZCOPY_THRESH=2k") +{ + test_stream_send_recv(64 * 
UCS_KBYTE, true); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, stream_zcopy_64k_unexp, + "ZCOPY_THRESH=2k") +{ + test_stream_send_recv(64 * UCS_KBYTE, false); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, get_bcopy_small) +{ + test_rma(8, &test_ucp_sockaddr_protocols::get_nb); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, get_bcopy, "ZCOPY_THRESH=inf") +{ + test_rma(64 * UCS_KBYTE, &test_ucp_sockaddr_protocols::get_nb); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, get_zcopy, "ZCOPY_THRESH=10k") +{ + test_rma(64 * UCS_KBYTE, &test_ucp_sockaddr_protocols::get_nb); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, put_bcopy_small) +{ + test_rma(8, &test_ucp_sockaddr_protocols::put_nb); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, put_bcopy, "ZCOPY_THRESH=inf") +{ + test_rma(64 * UCS_KBYTE, &test_ucp_sockaddr_protocols::put_nb); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, put_zcopy, "ZCOPY_THRESH=10k") +{ + test_rma(64 * UCS_KBYTE, &test_ucp_sockaddr_protocols::put_nb); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, am_short) +{ + test_am_send_recv(1); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, am_bcopy_1k, "ZCOPY_THRESH=inf") +{ + test_am_send_recv(1 * UCS_KBYTE); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, am_bcopy_64k, "ZCOPY_THRESH=inf") +{ + test_am_send_recv(64 * UCS_KBYTE); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, am_zcopy_1k, "ZCOPY_THRESH=512") +{ + test_am_send_recv(1 * UCS_KBYTE); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols, am_zcopy_64k, "ZCOPY_THRESH=512") +{ + test_am_send_recv(64 * UCS_KBYTE); +} + + +/* Only IB transports support CM for now + * For DC case, allow fallback to UD if DC is not supported + */ +#define UCP_INSTANTIATE_CM_TEST_CASE(_test_case) \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, dcudx, "dc_x,ud") \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, ud, "ud_v") \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, udx, "ud_x") \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, rc, "rc_v") \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, rcx, 
"rc_x") \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, ib, "ib") + +UCP_INSTANTIATE_CM_TEST_CASE(test_ucp_sockaddr_protocols) diff --git a/test/gtest/ucp/test_ucp_stream.cc b/test/gtest/ucp/test_ucp_stream.cc index e7c99d2862b..b3afe6c876f 100644 --- a/test/gtest/ucp/test_ucp_stream.cc +++ b/test/gtest/ucp/test_ucp_stream.cc @@ -33,12 +33,13 @@ class test_ucp_stream_base : public ucp_test { size_t test_ucp_stream_base::wait_stream_recv(void *request) { + ucs_time_t deadline = ucs::get_deadline(); ucs_status_t status; size_t length; do { progress(); status = ucp_stream_recv_request_test(request, &length); - } while (status == UCS_INPROGRESS); + } while ((status == UCS_INPROGRESS) && (ucs_get_time() < deadline)); ASSERT_UCS_OK(status); ucp_request_free(request); @@ -62,6 +63,53 @@ class test_ucp_stream_onesided : public test_ucp_stream_base { } }; +UCS_TEST_P(test_ucp_stream_onesided, recv_not_connected_ep_cleanup) { + receiver().connect(&sender(), get_ep_params()); + + uint64_t recv_data = 0; + size_t length; + void *rreq = ucp_stream_recv_nb(receiver().ep(), &recv_data, 1, + ucp_dt_make_contig(sizeof(uint64_t)), + ucp_recv_cb, &length, + UCP_STREAM_RECV_FLAG_WAITALL); + EXPECT_TRUE(UCS_PTR_IS_PTR(rreq)); + EXPECT_EQ(UCS_INPROGRESS, ucp_request_check_status(rreq)); + disconnect(receiver()); + EXPECT_EQ(UCS_ERR_CANCELED, ucp_request_check_status(rreq)); + ucp_request_free(rreq); +} + +UCS_TEST_P(test_ucp_stream_onesided, recv_connected_ep_cleanup) { + skip_loopback(); + sender().connect(&receiver(), get_ep_params()); + receiver().connect(&sender(), get_ep_params()); + + uint64_t send_data = ucs::rand(); + uint64_t recv_data = 0; + ucp_datatype_t dt = ucp_dt_make_contig(sizeof(uint64_t)); + + ucp::data_type_desc_t send_dt_desc(dt, &send_data, sizeof(send_data)); + void *sreq = stream_send_nb(send_dt_desc); + + size_t recvd_length; + void *rreq = ucp_stream_recv_nb(receiver().ep(), &recv_data, 1, dt, + ucp_recv_cb, &recvd_length, + UCP_STREAM_RECV_FLAG_WAITALL); + + 
EXPECT_EQ(sizeof(send_data), wait_stream_recv(rreq)); + EXPECT_EQ(send_data, recv_data); + wait(sreq); + + rreq = ucp_stream_recv_nb(receiver().ep(), &recv_data, 1, dt, ucp_recv_cb, + &recvd_length, UCP_STREAM_RECV_FLAG_WAITALL); + EXPECT_TRUE(UCS_PTR_IS_PTR(rreq)); + EXPECT_EQ(UCS_INPROGRESS, ucp_request_check_status(rreq)); + disconnect(sender()); + disconnect(receiver()); + EXPECT_EQ(UCS_ERR_CANCELED, ucp_request_check_status(rreq)); + ucp_request_free(rreq); +} + UCS_TEST_P(test_ucp_stream_onesided, send_recv_no_ep) { /* connect from sender side only and send */ @@ -141,13 +189,14 @@ class test_ucp_stream : public test_ucp_stream_base void test_ucp_stream::do_send_recv_data_test(ucp_datatype_t datatype) { - std::vector sbuf(16 * 1024 * 1024, 's'); size_t ssize = 0; /* total send size in bytes */ + std::vector sbuf(16 * UCS_MBYTE, 's'); std::vector check_pattern; ucs_status_ptr_t sstatus; /* send all msg sizes*/ - for (size_t i = 3; i < sbuf.size(); i *= 2) { + for (size_t i = 3; i < sbuf.size(); + i *= (2 * ucs::test_time_multiplier())) { if (UCP_DT_IS_GENERIC(datatype)) { for (size_t j = 0; j < i; ++j) { check_pattern.push_back(char(j)); @@ -171,7 +220,7 @@ void test_ucp_stream::do_send_recv_data_test(ucp_datatype_t datatype) do { progress(); rdata = ucp_stream_recv_data_nb(receiver().ep(), &length); - if (UCS_PTR_STATUS(rdata) == UCS_OK) { + if (rdata == NULL) { continue; } @@ -189,8 +238,8 @@ void test_ucp_stream::do_send_recv_test(ucp_datatype_t datatype) { const size_t dt_elem_size = UCP_DT_IS_CONTIG(datatype) ? ucp_contig_dt_elem_size(datatype) : 1; - std::vector sbuf(16 * 1024 * 1024, 's'); - size_t ssize = 0; /* total send size */ + size_t ssize = 0; /* total send size */ + std::vector sbuf(16 * UCS_MBYTE, 's'); ucs_status_ptr_t sstatus; std::vector check_pattern; @@ -268,7 +317,7 @@ void test_ucp_stream::do_send_exp_recv_test(ucp_datatype_t datatype) { const size_t dt_elem_size = UCP_DT_IS_CONTIG(datatype) ? 
ucp_contig_dt_elem_size(datatype) : 1; - const size_t msg_size = dt_elem_size * 1024 * 1024; + const size_t msg_size = dt_elem_size * UCS_MBYTE; const size_t n_msgs = 10; std::vector > rbufs(n_msgs, @@ -348,17 +397,17 @@ void test_ucp_stream::do_send_recv_data_recv_test(ucp_datatype_t datatype) { const size_t dt_elem_size = UCP_DT_IS_CONTIG(datatype) ? ucp_contig_dt_elem_size(datatype) : 1; - std::vector sbuf(16 * 1024 * 1024, 's'); - size_t ssize = 0; /* total send size */ + size_t ssize = 0; /* total send size */ + size_t roffset = 0; + size_t send_i = dt_elem_size; + size_t recv_i = 0; + std::vector sbuf(16 * UCS_MBYTE, 's'); ucs_status_ptr_t sstatus; std::vector check_pattern; std::vector rbuf; - size_t roffset = 0; ucs_status_ptr_t rdata; size_t length; - size_t send_i = dt_elem_size; - size_t recv_i = 0; do { if (send_i < sbuf.size()) { rbuf.resize(rbuf.size() + send_i, 'r'); @@ -377,7 +426,7 @@ void test_ucp_stream::do_send_recv_data_recv_test(ucp_datatype_t datatype) if ((++recv_i % 2) || ((ssize - roffset) < dt_elem_size)) { rdata = ucp_stream_recv_data_nb(receiver().ep(), &length); - if (UCS_PTR_STATUS(rdata) == UCS_OK) { + if (rdata == NULL) { continue; } @@ -459,7 +508,6 @@ UCS_TEST_P(test_ucp_stream, send_recv_generic) { ASSERT_UCS_OK(status); do_send_recv_test(dt); ucp_dt_destroy(dt); - } UCS_TEST_P(test_ucp_stream, send_exp_recv_8) { @@ -516,7 +564,7 @@ UCS_TEST_P(test_ucp_stream, send_recv_data_recv_iov) { } UCS_TEST_P(test_ucp_stream, send_zero_ending_iov_recv_data) { - const size_t min_size = 1024; + const size_t min_size = UCS_KBYTE; const size_t max_size = min_size * 64; const size_t iov_num = 8; /* must be divisible by 4 without a * remainder, caught on mlx5 based TLs @@ -608,13 +656,13 @@ class test_ucp_stream_many2one : public test_ucp_stream_base { void test_ucp_stream_many2one::init() { - /* Skip entities creation */ - test_base::init(); - if (is_self()) { UCS_TEST_SKIP_R("self"); } + /* Skip entities creation */ + test_base::init(); + for 
(size_t i = 0; i < m_nsenders + 1; ++i) { create_entity(); } diff --git a/test/gtest/ucp/test_ucp_tag.cc b/test/gtest/ucp/test_ucp_tag.cc index 732e1417551..4259208abaa 100644 --- a/test/gtest/ucp/test_ucp_tag.cc +++ b/test/gtest/ucp/test_ucp_tag.cc @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. * * See file LICENSE for terms. @@ -41,6 +41,15 @@ void test_ucp_tag::init() ucp::dt_gen_finish_count = 0; } +void test_ucp_tag::enable_tag_mp_offload() +{ + m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_ENABLE", "y")); + m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_MP_SRQ_ENABLE", "try")); + m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_MP_NUM_STRIDES", "8")); + m_env.push_back(new ucs::scoped_setenv("UCX_IB_MLX5_DEVX_OBJECTS", + "dct,dcsrq,rcsrq,rcqp")); +} + void test_ucp_tag::request_init(void *request) { struct request *req = (struct request *)request; @@ -170,158 +179,237 @@ int test_ucp_tag::get_worker_index(int buf_index) } test_ucp_tag::request * -test_ucp_tag::send_nb(const void *buffer, size_t count, ucp_datatype_t datatype, - ucp_tag_t tag, int buf_index) +test_ucp_tag::send(entity &sender, send_type_t type, const void *buffer, + size_t count, ucp_datatype_t datatype, ucp_tag_t tag, + int buf_index) { int worker_index = get_worker_index(buf_index); request *req; + ucs_status_t status; + + switch (type) { + case SEND_B: + case SEND_NB: + req = (request*)ucp_tag_send_nb(sender.ep(worker_index), buffer, count, + datatype, tag, send_callback); + if ((req != NULL) && (type == SEND_B)) { + wait(req, get_worker_index(buf_index)); + request_release(req); + return NULL; + } - req = (request*)ucp_tag_send_nb(sender().ep(worker_index), buffer, count, datatype, - tag, send_callback); - if (UCS_PTR_IS_ERR(req)) { - ASSERT_UCS_OK(UCS_PTR_STATUS(req)); + if (UCS_PTR_IS_ERR(req)) { 
+ ASSERT_UCS_OK(UCS_PTR_STATUS(req)); + } + break; + case SEND_NBR: + req = request_alloc(); + status = ucp_tag_send_nbr(sender.ep(worker_index), buffer, count, + datatype, tag, req); + ASSERT_UCS_OK_OR_INPROGRESS(status); + if (status == UCS_OK) { + request_free(req); + return (request*)UCS_STATUS_PTR(UCS_OK); + } + break; + case SEND_SYNC_NB: + return (request*)ucp_tag_send_sync_nb(sender.ep(worker_index), buffer, + count, datatype, tag, send_callback); + default: + return NULL; } + return req; } test_ucp_tag::request * -test_ucp_tag::send_nbr(const void *buffer, size_t count, - ucp_datatype_t datatype, - ucp_tag_t tag, int buf_index) +test_ucp_tag::send_nb(const void *buffer, size_t count, ucp_datatype_t datatype, + ucp_tag_t tag, int buf_index) { - int worker_index = get_worker_index(buf_index); - ucs_status_t status; - request *req; - - req = request_alloc(); - - status = ucp_tag_send_nbr(sender().ep(worker_index), buffer, count, datatype, - tag, req); + return send(sender(), SEND_NB, buffer, count, datatype, tag, buf_index); +} - ASSERT_UCS_OK_OR_INPROGRESS(status); - if (status == UCS_OK) { - request_free(req); - return (request *)UCS_STATUS_PTR(UCS_OK); - } - return req; +test_ucp_tag::request * +test_ucp_tag::send_nbr(const void *buffer, size_t count, + ucp_datatype_t datatype, + ucp_tag_t tag, int buf_index) +{ + return send(sender(), SEND_NBR, buffer, count, datatype, tag, buf_index); } void test_ucp_tag::send_b(const void *buffer, size_t count, ucp_datatype_t datatype, ucp_tag_t tag, int buf_index) { - request *req = send_nb(buffer, count, datatype, tag, buf_index); - - if (req != NULL) { - wait(req, get_worker_index(buf_index)); - request_release(req); - } + send(sender(), SEND_B, buffer, count, datatype, tag, buf_index); } test_ucp_tag::request * test_ucp_tag::send_sync_nb(const void *buffer, size_t count, ucp_datatype_t datatype, ucp_tag_t tag, int buf_index) { - int worker_index = get_worker_index(buf_index); - - return 
(request*)ucp_tag_send_sync_nb(sender().ep(worker_index), buffer, count, - datatype, tag, send_callback); + return send(sender(), SEND_SYNC_NB, buffer, count, datatype, tag, buf_index); } test_ucp_tag::request* -test_ucp_tag::recv_nb(void *buffer, size_t count, ucp_datatype_t dt, - ucp_tag_t tag, ucp_tag_t tag_mask, int buf_index) +test_ucp_tag::recv(entity &receiver, recv_type_t type, void *buffer, + size_t count, ucp_datatype_t datatype, + ucp_tag_t tag, ucp_tag_t tag_mask, + ucp_tag_recv_info_t *info, int buf_index) { - return is_external_request() ? - recv_req_nb(buffer, count, dt, tag, tag_mask, buf_index) : - recv_cb_nb(buffer, count, dt, tag, tag_mask, buf_index); -} - -test_ucp_tag::request* -test_ucp_tag::recv_req_nb(void *buffer, size_t count, ucp_datatype_t dt, - ucp_tag_t tag, ucp_tag_t tag_mask, int buf_index) -{ - request *req = request_alloc(); int worker_index = get_worker_index(buf_index); + request *req; + ucs_status_t status; - ucs_status_t status = ucp_tag_recv_nbr(receiver().worker(worker_index), buffer, count, - dt, tag, tag_mask, req); - if ((status != UCS_OK) && (status != UCS_INPROGRESS)) { - UCS_TEST_ABORT("ucp_tag_recv_nb returned status " << - ucs_status_string(status)); + switch (type) { + case RECV_B: + case RECV_NB: + req = (request*)ucp_tag_recv_nb(receiver.worker(worker_index), buffer, count, + datatype, tag, tag_mask, recv_callback); + if (type == RECV_NB) { + if (UCS_PTR_IS_ERR(req)) { + ASSERT_UCS_OK(UCS_PTR_STATUS(req)); + } else if (req == NULL) { + UCS_TEST_ABORT("ucp_tag_recv_nb returned NULL"); + } + } else { + if (UCS_PTR_IS_ERR(req)) { + return req; + } else if (req == NULL) { + UCS_TEST_ABORT("ucp_tag_recv_nb returned NULL"); + } else { + wait(req, worker_index); + status = req->status; + *info = req->info; + request_release(req); + return (request*)UCS_STATUS_PTR(status); + } + } + break; + case RECV_BR: + case RECV_NBR: + req = request_alloc(); + status = ucp_tag_recv_nbr(receiver.worker(worker_index), buffer, + count, 
datatype, tag, tag_mask, req); + if (type == RECV_NBR) { + if (UCS_STATUS_IS_ERR(status)) { + UCS_TEST_ABORT("ucp_tag_recv_nb returned status " << + ucs_status_string(status)); + } + } else { + if (!UCS_STATUS_IS_ERR(status)) { + wait(req, worker_index); + status = req->status; + *info = req->info; + request_release(req); + return (request*)UCS_STATUS_PTR(status); + } + } + break; + default: + return NULL; } + return req; } test_ucp_tag::request* -test_ucp_tag::recv_cb_nb(void *buffer, size_t count, ucp_datatype_t dt, - ucp_tag_t tag, ucp_tag_t tag_mask, int buf_index) +test_ucp_tag::recv_nb(void *buffer, size_t count, ucp_datatype_t datatype, + ucp_tag_t tag, ucp_tag_t tag_mask, int buf_index) { - int worker_index = get_worker_index(buf_index); - - request *req = (request*) ucp_tag_recv_nb(receiver().worker(worker_index), buffer, count, - dt, tag, tag_mask, recv_callback); - if (UCS_PTR_IS_ERR(req)) { - ASSERT_UCS_OK(UCS_PTR_STATUS(req)); - } else if (req == NULL) { - UCS_TEST_ABORT("ucp_tag_recv_nb returned NULL"); - } - return req; + recv_type_t type = is_external_request() ? RECV_NBR : RECV_NB; + return recv(receiver(), type, buffer, count, datatype, + tag, tag_mask, NULL, buf_index); } ucs_status_t -test_ucp_tag::recv_b(void *buffer, size_t count, ucp_datatype_t dt, ucp_tag_t tag, - ucp_tag_t tag_mask, ucp_tag_recv_info_t *info, int buf_index) +test_ucp_tag::recv_b(void *buffer, size_t count, ucp_datatype_t datatype, + ucp_tag_t tag, ucp_tag_t tag_mask, + ucp_tag_recv_info_t *info, int buf_index) { - return is_external_request() ? - recv_req_b(buffer, count, dt, tag, tag_mask, info, buf_index) : - recv_cb_b(buffer, count, dt, tag, tag_mask, info, buf_index); + recv_type_t type = is_external_request() ? 
RECV_BR : RECV_B; + request* req = recv(receiver(), type, buffer, count, datatype, + tag, tag_mask, info, buf_index); + return UCS_PTR_STATUS(req); } -ucs_status_t test_ucp_tag::recv_cb_b(void *buffer, size_t count, ucp_datatype_t datatype, - ucp_tag_t tag, ucp_tag_t tag_mask, - ucp_tag_recv_info_t *info, int buf_index) +bool test_ucp_tag::is_external_request() { - int worker_index = get_worker_index(buf_index); - ucs_status_t status; - request *req; + return false; +} - req = (request*)ucp_tag_recv_nb(receiver().worker(worker_index), buffer, count, datatype, - tag, tag_mask, recv_callback); - if (UCS_PTR_IS_ERR(req)) { - return UCS_PTR_STATUS(req); - } else if (req == NULL) { - UCS_TEST_ABORT("ucp_tag_recv_nb returned NULL"); - } else { - wait(req, worker_index); - status = req->status; - *info = req->info; - request_release(req); - return status; +ucp_context_attr_t test_ucp_tag::ctx_attr; + + +class test_ucp_tag_limits : public test_ucp_tag { +public: + test_ucp_tag_limits() { + m_test_offload = GetParam().variant; + m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_ENABLE", + ucs::to_string(m_test_offload).c_str())); } -} -ucs_status_t test_ucp_tag::recv_req_b(void *buffer, size_t count, ucp_datatype_t datatype, - ucp_tag_t tag, ucp_tag_t tag_mask, - ucp_tag_recv_info_t *info, int buf_index) -{ - int worker_index = get_worker_index(buf_index); - request *req = request_alloc(); - - ucs_status_t status = ucp_tag_recv_nbr(receiver().worker(worker_index), buffer, count, - datatype, tag, tag_mask, req); - if ((status == UCS_OK) || (status == UCS_INPROGRESS)) { - wait(req, worker_index); - status = req->status; - *info = req->info; + void init() { + test_ucp_tag::init(); + check_offload_support(m_test_offload); + } + + std::vector + static enum_test_params(const ucp_params_t& ctx_params, + const std::string& name, + const std::string& test_case_name, + const std::string& tls) + { + std::vector result; + generate_test_params_variant(ctx_params, name, test_case_name, + 
tls, false, result); + generate_test_params_variant(ctx_params, name, test_case_name + "/offload", + tls, true, result); + return result; + } + +protected: + bool m_test_offload; +}; + +UCS_TEST_P(test_ucp_tag_limits, check_max_short_rndv_thresh_zero, "RNDV_THRESH=0") { + size_t max_short = + static_cast(ucp_ep_config(sender().ep())->tag.eager.max_short + 1); + + // (maximal short + 1) <= RNDV thresh + EXPECT_LE(max_short, + ucp_ep_config(sender().ep())->tag.rndv.am_thresh); + EXPECT_LE(max_short, + ucp_ep_config(sender().ep())->tag.rndv.rma_thresh); + + // (maximal short + 1) <= RNDV send_nbr thresh + EXPECT_LE(max_short, + ucp_ep_config(sender().ep())->tag.rndv_send_nbr.am_thresh); + EXPECT_LE(max_short, + ucp_ep_config(sender().ep())->tag.rndv_send_nbr.rma_thresh); + + if (m_test_offload) { + // There is a lower bound for rndv threshold with tag offload. We should + // not send messages smaller than SW RNDV request size, because receiver + // may temporarily store this request in the user buffer (which will + // result in crash if the request does not fit user buffer). 
+ size_t min_rndv = ucp_ep_tag_offload_min_rndv_thresh(ucp_ep_config(sender().ep())); + + EXPECT_GT(min_rndv, 0ul); // min_rndv should be RTS size at least + EXPECT_GE(min_rndv, + ucp_ep_config(sender().ep())->tag.rndv_send_nbr.am_thresh); + EXPECT_GE(min_rndv, + ucp_ep_config(sender().ep())->tag.rndv_send_nbr.rma_thresh); } - request_release(req); - return status; } -bool test_ucp_tag::is_external_request() -{ - return false; +UCS_TEST_P(test_ucp_tag_limits, check_max_short_zcopy_thresh_zero, "ZCOPY_THRESH=0") { + size_t max_short = + static_cast(ucp_ep_config(sender().ep())->tag.eager.max_short + 1); + + // (maximal short + 1) <= ZCOPY thresh + EXPECT_LE(max_short, + ucp_ep_config(sender().ep())->tag.eager.zcopy_thresh[0]); } -ucp_context_attr_t test_ucp_tag::ctx_attr; +UCP_INSTANTIATE_TEST_CASE(test_ucp_tag_limits) diff --git a/test/gtest/ucp/test_ucp_tag.h b/test/gtest/ucp/test_ucp_tag.h index e7331d80b22..c8149db60a4 100644 --- a/test/gtest/ucp/test_ucp_tag.h +++ b/test/gtest/ucp/test_ucp_tag.h @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
@@ -15,6 +15,20 @@ class test_ucp_tag : public ucp_test { public: static ucp_params_t get_ctx_params(); + enum send_type_t { + SEND_NB, + SEND_NBR, + SEND_B, + SEND_SYNC_NB + }; + + enum recv_type_t { + RECV_NB, + RECV_NBR, + RECV_B, + RECV_BR + }; + protected: enum { RECV_REQ_INTERNAL = DEFAULT_PARAM_VARIANT, @@ -32,6 +46,8 @@ class test_ucp_tag : public ucp_test { virtual void init(); + void enable_tag_mp_offload(); + static void request_init(void *request); static request* request_alloc(); @@ -45,17 +61,26 @@ class test_ucp_tag : public ucp_test { static void recv_callback(void *request, ucs_status_t status, ucp_tag_recv_info_t *info); - request * send_nb(const void *buffer, size_t count, ucp_datatype_t datatype, - ucp_tag_t tag, int ep_index = 0); + request* send(entity &sender, send_type_t type, const void *buffer, + size_t count, ucp_datatype_t datatype, ucp_tag_t tag, + int ep_index = 0); - request * send_nbr(const void *buffer, size_t count, ucp_datatype_t datatype, - ucp_tag_t tag, int ep_index = 0); + request* send_nb(const void *buffer, size_t count, ucp_datatype_t datatype, + ucp_tag_t tag, int ep_index = 0); + + request* send_nbr(const void *buffer, size_t count, ucp_datatype_t datatype, + ucp_tag_t tag, int ep_index = 0); void send_b(const void *buffer, size_t count, ucp_datatype_t datatype, ucp_tag_t tag, int buf_index = 0); - request * send_sync_nb(const void *buffer, size_t count, ucp_datatype_t datatype, - ucp_tag_t tag, int buf_index = 0); + request* send_sync_nb(const void *buffer, size_t count, ucp_datatype_t datatype, + ucp_tag_t tag, int buf_index = 0); + + request* recv(entity &receiver, recv_type_t type, void *buffer, + size_t count, ucp_datatype_t dt, ucp_tag_t tag, + ucp_tag_t tag_mask, ucp_tag_recv_info_t *info, + int buf_index = 0); request* recv_nb(void *buffer, size_t count, ucp_datatype_t dt, ucp_tag_t tag, ucp_tag_t tag_mask, int buf_index = 0); @@ -89,8 +114,11 @@ class test_ucp_tag : public ucp_test { virtual bool 
is_external_request(); static ucp_context_attr_t ctx_attr; + ucs::ptr_vector m_env; + private: int get_worker_index(int buf_index); + public: int count; }; diff --git a/test/gtest/ucp/test_ucp_tag_cancel.cc b/test/gtest/ucp/test_ucp_tag_cancel.cc index 186a5afe748..81e3aace957 100644 --- a/test/gtest/ucp/test_ucp_tag_cancel.cc +++ b/test/gtest/ucp/test_ucp_tag_cancel.cc @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. * * See file LICENSE for terms. @@ -9,6 +9,9 @@ #include +extern "C" { +#include +} class test_ucp_tag_cancel : public test_ucp_tag { }; @@ -32,4 +35,36 @@ UCS_TEST_P(test_ucp_tag_cancel, cancel_exp) { request_release(req); } +// Test that cancelling already matched (but not yet completed) request does +// not produce any error. GH bug #4490. +UCS_TEST_P(test_ucp_tag_cancel, cancel_matched, "RNDV_THRESH=32K") { + uint64_t small_data = 0; + ucp_tag_t tag = 0xfafa; + size_t size = 50000; + + std::vector sbuf(size, 0); + std::vector rbuf(size, 0); + + request *rreq1 = recv_nb(&rbuf[0], rbuf.size(), DATATYPE, tag, + UCP_TAG_MASK_FULL); + request *rreq2 = recv_nb(&small_data, sizeof(small_data), DATATYPE, tag, + UCP_TAG_MASK_FULL); + + request *sreq1 = send_nb(&sbuf[0], sbuf.size(), DATATYPE, tag); + request *sreq2 = send_nb(&small_data, sizeof(small_data), DATATYPE, tag); + + wait_and_validate(rreq2); + + if (!rreq1->completed) { + ucp_request_cancel(receiver().worker(), rreq1); + } else { + UCS_TEST_MESSAGE << "nothing to cancel"; + } + + wait_and_validate(rreq1); + wait_and_validate(sreq1); + wait_and_validate(sreq2); +} + + UCP_INSTANTIATE_TEST_CASE(test_ucp_tag_cancel) diff --git a/test/gtest/ucp/test_ucp_tag_match.cc b/test/gtest/ucp/test_ucp_tag_match.cc index 9d0919c9632..e1e9e2d5e72 100644 --- a/test/gtest/ucp/test_ucp_tag_match.cc +++ 
b/test/gtest/ucp/test_ucp_tag_match.cc @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. * * See file LICENSE for terms. @@ -8,18 +8,27 @@ #include "test_ucp_tag.h" #include +extern "C" { +#include +#include +} using namespace ucs; /* For vector serialization */ class test_ucp_tag_match : public test_ucp_tag { public: - virtual void init() - { - m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_ENABLE", "y")); + test_ucp_tag_match() { + // TODO: test offload and offload MP as different variants + enable_tag_mp_offload(); if (RUNNING_ON_VALGRIND) { - m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_MAX_BCOPY", "8k")); + m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_SEG_SIZE", "8k")); + m_env.push_back(new ucs::scoped_setenv("UCX_TCP_RX_SEG_SIZE", "8k")); } + } + + virtual void init() + { modify_config("TM_THRESH", "1"); test_ucp_tag::init(); @@ -53,7 +62,6 @@ class test_ucp_tag_match : public test_ucp_tag { } static ucs_status_t m_req_status; - ucs::ptr_vector m_env; }; ucs_status_t test_ucp_tag_match::m_req_status = UCS_OK; @@ -78,11 +86,9 @@ UCS_TEST_P(test_ucp_tag_match, send_recv_unexp) { EXPECT_EQ(send_data, recv_data); } -UCS_TEST_P(test_ucp_tag_match, send_recv_unexp_rqfree) { - if (GetParam().variant == RECV_REQ_EXTERNAL) { - UCS_TEST_SKIP_R("request free cannot be used for external requests"); - } - +UCS_TEST_SKIP_COND_P(test_ucp_tag_match, send_recv_unexp_rqfree, + /* request free cannot be used for external requests */ + (GetParam().variant == RECV_REQ_EXTERNAL)) { request *my_recv_req; uint64_t send_data = 0xdeadbeefdeadbeef; uint64_t recv_data = 0; @@ -167,7 +173,7 @@ UCS_TEST_P(test_ucp_tag_match, send2_nb_recv_exp_medium) { request_release(my_recv_req); } -UCS_TEST_P(test_ucp_tag_match, send2_nb_recv_medium_wildcard, "RNDV_THRESH=-1") { 
+UCS_TEST_P(test_ucp_tag_match, send2_nb_recv_medium_wildcard, "RNDV_THRESH=inf") { static const size_t size = 3000000; entity &sender2 = sender(); @@ -444,7 +450,53 @@ UCS_TEST_P(test_ucp_tag_match, sync_send_unexp) { request_release(my_send_req); } -UCS_TEST_P(test_ucp_tag_match, sync_send_unexp_rndv, "RNDV_THRESH=1048576") { +UCP_INSTANTIATE_TEST_CASE(test_ucp_tag_match) + +class test_ucp_tag_match_rndv : public test_ucp_tag_match { +public: + enum { + RNDV_SCHEME_AUTO = 0, + RNDV_SCHEME_PUT_ZCOPY, + RNDV_SCHEME_GET_ZCOPY + }; + + static const std::string rndv_schemes[]; + + void init() { + ASSERT_LE(GetParam().variant, (int)RNDV_SCHEME_GET_ZCOPY); + modify_config("RNDV_SCHEME", rndv_schemes[GetParam().variant]); + + test_ucp_tag_match::init(); + } + + std::vector + static enum_test_params(const ucp_params_t& ctx_params, + const std::string& name, + const std::string& test_case_name, + const std::string& tls) + { + std::vector result; + generate_test_params_variant(ctx_params, name, + test_case_name + "/rndv_" + + rndv_schemes[RNDV_SCHEME_AUTO], + tls, RNDV_SCHEME_AUTO, result); + generate_test_params_variant(ctx_params, name, + test_case_name + "/rndv_" + + rndv_schemes[RNDV_SCHEME_PUT_ZCOPY], + tls, RNDV_SCHEME_PUT_ZCOPY, result); + generate_test_params_variant(ctx_params, name, + test_case_name + "/rndv_" + + rndv_schemes[RNDV_SCHEME_GET_ZCOPY], + tls, RNDV_SCHEME_GET_ZCOPY, result); + return result; + } +}; + +const std::string test_ucp_tag_match_rndv::rndv_schemes[] = { "auto", + "put_zcopy", + "get_zcopy" }; + +UCS_TEST_P(test_ucp_tag_match_rndv, sync_send_unexp, "RNDV_THRESH=1048576") { static const size_t size = 1148576; request *my_send_req; ucp_tag_recv_info_t info; @@ -479,7 +531,7 @@ UCS_TEST_P(test_ucp_tag_match, sync_send_unexp_rndv, "RNDV_THRESH=1048576") { request_release(my_send_req); } -UCS_TEST_P(test_ucp_tag_match, rndv_req_exp, "RNDV_THRESH=1048576") { +UCS_TEST_P(test_ucp_tag_match_rndv, req_exp, "RNDV_THRESH=1048576") { static const size_t 
size = 1148576; request *my_send_req, *my_recv_req; @@ -513,7 +565,7 @@ UCS_TEST_P(test_ucp_tag_match, rndv_req_exp, "RNDV_THRESH=1048576") { request_release(my_recv_req); } -UCS_TEST_P(test_ucp_tag_match, rndv_rts_unexp, "RNDV_THRESH=1048576") { +UCS_TEST_P(test_ucp_tag_match_rndv, rts_unexp, "RNDV_THRESH=1048576") { static const size_t size = 1148576; request *my_send_req; ucp_tag_recv_info_t info; @@ -545,7 +597,7 @@ UCS_TEST_P(test_ucp_tag_match, rndv_rts_unexp, "RNDV_THRESH=1048576") { EXPECT_EQ(sendbuf, recvbuf); } -UCS_TEST_P(test_ucp_tag_match, rndv_truncated, "RNDV_THRESH=1048576") { +UCS_TEST_P(test_ucp_tag_match_rndv, truncated, "RNDV_THRESH=1048576") { static const size_t size = 1148576; request *my_send_req; ucp_tag_recv_info_t info; @@ -573,7 +625,49 @@ UCS_TEST_P(test_ucp_tag_match, rndv_truncated, "RNDV_THRESH=1048576") { wait_and_validate(my_send_req); } -UCS_TEST_P(test_ucp_tag_match, rndv_req_exp_auto_thresh, "RNDV_THRESH=auto") { +UCS_TEST_P(test_ucp_tag_match_rndv, post_larger_recv, "RNDV_THRESH=0") { + /* small send size should probably be lower than minimum GET Zcopy + * size supported by IB TLs */ + static const size_t small_send_size = 16; + static const size_t small_recv_size = small_send_size * 2; + static const size_t large_send_size = 1148576; + static const size_t large_recv_size = large_send_size + 1 * UCS_KBYTE; + /* array of [send][recv] sizes */ + static const size_t sizes[][2] = { { small_send_size, small_recv_size }, + { large_send_size, large_recv_size } }; + request *my_send_req, *my_recv_req; + + for (unsigned i = 0; i < ucs_array_size(sizes); i++) { + size_t send_size = sizes[i][0]; + size_t recv_size = sizes[i][1]; + std::vector sendbuf(send_size, 0); + std::vector recvbuf(recv_size, 0); + + ucs::fill_random(sendbuf); + ucs::fill_random(recvbuf); + + my_recv_req = recv_nb(&recvbuf[0], recvbuf.size(), DATATYPE, 0x1337, 0xffff); + ASSERT_TRUE(!UCS_PTR_IS_ERR(my_recv_req)); + EXPECT_FALSE(my_recv_req->completed); + + 
my_send_req = send_nb(&sendbuf[0], sendbuf.size(), DATATYPE, 0x111337); + ASSERT_TRUE(!UCS_PTR_IS_ERR(my_send_req)); + + wait(my_recv_req); + + EXPECT_EQ(sendbuf.size(), my_recv_req->info.length); + EXPECT_EQ(recvbuf.size(), ((ucp_request_t*)my_recv_req - 1)->recv.length); + EXPECT_EQ((ucp_tag_t)0x111337, my_recv_req->info.sender_tag); + EXPECT_TRUE(my_recv_req->completed); + EXPECT_NE(sendbuf, recvbuf); + EXPECT_TRUE(std::equal(sendbuf.begin(), sendbuf.end(), recvbuf.begin())); + + wait_and_validate(my_send_req); + request_release(my_recv_req); + } +} + +UCS_TEST_P(test_ucp_tag_match_rndv, req_exp_auto_thresh, "RNDV_THRESH=auto") { static const size_t size = 1148576; request *my_send_req, *my_recv_req; @@ -608,11 +702,11 @@ UCS_TEST_P(test_ucp_tag_match, rndv_req_exp_auto_thresh, "RNDV_THRESH=auto") { request_release(my_recv_req); } -UCS_TEST_P(test_ucp_tag_match, rndv_exp_huge_mix) { - const size_t sizes[] = { 1000, 2000, 2500ul * 1024 * 1024 }; +UCS_TEST_P(test_ucp_tag_match_rndv, exp_huge_mix) { + const size_t sizes[] = { 1000, 2000, 8000, 2500ul * UCS_MBYTE }; /* small sizes should warm-up tag cache */ - for (int i = 0; i < 3; ++i) { + for (unsigned i = 0; i < ucs_array_size(sizes); ++i) { const size_t size = sizes[i] / ucs::test_time_multiplier(); request *my_send_req, *my_recv_req; @@ -640,4 +734,67 @@ UCS_TEST_P(test_ucp_tag_match, rndv_exp_huge_mix) { } } -UCP_INSTANTIATE_TEST_CASE(test_ucp_tag_match) +UCS_TEST_P(test_ucp_tag_match_rndv, bidir_multi_exp_post, "RNDV_THRESH=0") { + const size_t sizes[] = { 8 * UCS_KBYTE, 128 * UCS_KBYTE, 512 * UCS_KBYTE, + 8 * UCS_MBYTE, 128 * UCS_MBYTE, 512 * UCS_MBYTE }; + + receiver().connect(&sender(), get_ep_params()); + + for (unsigned i = 0; i < ucs_array_size(sizes); ++i) { + const size_t size = sizes[i] / + ucs::test_time_multiplier() / + ucs::test_time_multiplier(); + const size_t count = ucs_max((size_t)(5000.0 / sqrt(sizes[i]) / + ucs::test_time_multiplier()), 3lu); + std::vector sreqs; + std::vector rreqs; + 
std::vector > sbufs; + std::vector > rbufs; + + sbufs.resize(count * 2); + rbufs.resize(count * 2); + + for (size_t repeat = 0; repeat < count * 2; ++repeat) { + entity &send_e = repeat < count ? sender() : receiver(); + entity &recv_e = repeat < count ? receiver() : sender(); + request *my_send_req, *my_recv_req; + + sbufs[repeat].resize(size, 0); + rbufs[repeat].resize(size, 0); + ucs::fill_random(sbufs[repeat]); + + my_recv_req = recv(recv_e, RECV_NB, + &rbufs[repeat][0], rbufs[repeat].size(), + DATATYPE, 0x1337, 0xffff, NULL); + ASSERT_TRUE(!UCS_PTR_IS_ERR(my_recv_req)); + EXPECT_FALSE(my_recv_req->completed); + + my_send_req = send(send_e, SEND_NB, + &sbufs[repeat][0], sbufs[repeat].size(), + DATATYPE, 0x111337); + ASSERT_TRUE(!UCS_PTR_IS_ERR(my_send_req)); + + sreqs.push_back(my_send_req); + rreqs.push_back(my_recv_req); + } + + for (size_t repeat = 0; repeat < count * 2; ++repeat) { + request *my_send_req, *my_recv_req; + + my_recv_req = rreqs[repeat]; + my_send_req = sreqs[repeat]; + + wait(my_recv_req); + + EXPECT_EQ(sbufs[repeat].size(), my_recv_req->info.length); + EXPECT_EQ((ucp_tag_t)0x111337, my_recv_req->info.sender_tag); + EXPECT_TRUE(my_recv_req->completed); + EXPECT_EQ(sbufs[repeat], rbufs[repeat]); + + wait_and_validate(my_send_req); + request_free(my_recv_req); + } + } +} + +UCP_INSTANTIATE_TEST_CASE(test_ucp_tag_match_rndv) diff --git a/test/gtest/ucp/test_ucp_tag_mem_type.cc b/test/gtest/ucp/test_ucp_tag_mem_type.cc new file mode 100644 index 00000000000..7cc352cf601 --- /dev/null +++ b/test/gtest/ucp/test_ucp_tag_mem_type.cc @@ -0,0 +1,210 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#include "test_ucp_tag.h" +#include + +#include "ucp_datatype.h" + +extern "C" { +#include +#include +} + +#include + + +class test_ucp_tag_mem_type: public test_ucp_tag { +public: + enum { + VARIANT_DEFAULT = UCS_BIT(0), + VARIANT_GDR_OFF = UCS_BIT(1), + VARIANT_TAG_OFFLOAD = UCS_BIT(2), + VARIANT_MAX = UCS_BIT(3) + }; + + void init() { + int mem_type_pair_index = GetParam().variant % mem_type_pairs.size(); + int varient_index = GetParam().variant / mem_type_pairs.size(); + + if (varient_index & VARIANT_GDR_OFF) { + m_env.push_back(new ucs::scoped_setenv("UCX_IB_GPU_DIRECT_RDMA", "n")); + } + + if (varient_index & VARIANT_TAG_OFFLOAD) { + enable_tag_mp_offload(); + + if (RUNNING_ON_VALGRIND) { + m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_SEG_SIZE", "8k")); + m_env.push_back(new ucs::scoped_setenv("UCX_TCP_RX_SEG_SIZE", "8k")); + } + } + + m_send_mem_type = mem_type_pairs[mem_type_pair_index][0]; + m_recv_mem_type = mem_type_pairs[mem_type_pair_index][1]; + + modify_config("MAX_EAGER_LANES", "2"); + modify_config("MAX_RNDV_LANES", "2"); + + test_ucp_tag::init(); + } + + void cleanup() { + test_ucp_tag::cleanup(); + } + + std::vector + static enum_test_params(const ucp_params_t& ctx_params, + const std::string& name, + const std::string& test_case_name, + const std::string& tls) { + + std::vector result; + int count = 0; + + for (int i = 0; i < VARIANT_MAX; i++) { + for (std::vector >::const_iterator iter = + mem_type_pairs.begin(); iter != mem_type_pairs.end(); ++iter) { + generate_test_params_variant(ctx_params, name, test_case_name + "/" + + std::string(ucs_memory_type_names[(*iter)[0]]) + + "<->" + std::string(ucs_memory_type_names[(*iter)[1]]), + tls, count++, result); + } + } + + return result; + } + + static std::vector > mem_type_pairs; + +protected: + + size_t do_xfer(const void *sendbuf, void *recvbuf, size_t count, + ucp_datatype_t send_dt, ucp_datatype_t recv_dt, + bool expected, bool truncated, bool extended); + + ucs_memory_type_t 
m_send_mem_type; + ucs_memory_type_t m_recv_mem_type; + +private: + + static const uint64_t SENDER_TAG = 0x111337; + static const uint64_t RECV_MASK = 0xffff; + static const uint64_t RECV_TAG = 0x1337; +}; + +std::vector > +test_ucp_tag_mem_type::mem_type_pairs = ucs::supported_mem_type_pairs(); + +size_t test_ucp_tag_mem_type::do_xfer(const void *sendbuf, void *recvbuf, + size_t count, ucp_datatype_t send_dt, + ucp_datatype_t recv_dt, bool expected, + bool truncated, bool extended) +{ + size_t recv_count = count; + size_t send_count = count; + size_t recvd = 0; + request *rreq, *sreq; + + if (truncated) { + recv_count /= 2; + } + + if (extended) { + send_count /= 2; + } + + if (expected) { + rreq = recv_nb(recvbuf, recv_count, recv_dt, RECV_TAG, RECV_MASK); + sreq = send_nb(sendbuf, send_count, send_dt, SENDER_TAG); + } else { + sreq = send_nb(sendbuf, send_count, send_dt, SENDER_TAG); + + wait_for_unexpected_msg(receiver().worker(), 10.0); + + rreq = recv_nb(recvbuf, recv_count, recv_dt, RECV_TAG, RECV_MASK); + } + + /* progress both sender and receiver */ + wait(rreq); + if (sreq != NULL) { + wait(sreq); + request_release(sreq); + } + + recvd = rreq->info.length; + if (!truncated) { + EXPECT_UCS_OK(rreq->status); + EXPECT_EQ((ucp_tag_t)SENDER_TAG, rreq->info.sender_tag); + } else { + EXPECT_EQ(UCS_ERR_MESSAGE_TRUNCATED, rreq->status); + } + + request_release(rreq); + return recvd; +}; + +UCS_TEST_P(test_ucp_tag_mem_type, basic) +{ + ucp_datatype_t type = ucp_dt_make_contig(1); + + UCS_TEST_MESSAGE << "TEST: " + << ucs_memory_type_names[m_send_mem_type] << " <-> " + << ucs_memory_type_names[m_recv_mem_type]; + + for (unsigned i = 1; i <= 7; ++i) { + size_t max = (long)pow(10.0, i); + size_t length = ucs::rand() % max + 1; + + mem_buffer m_recv_mem_buf(length, m_recv_mem_type); + mem_buffer m_send_mem_buf(length, m_send_mem_type); + + mem_buffer::pattern_fill(m_recv_mem_buf.ptr(), m_recv_mem_buf.size(), + 1, m_recv_mem_buf.mem_type()); + + 
mem_buffer::pattern_fill(m_send_mem_buf.ptr(), m_send_mem_buf.size(), + 2, m_send_mem_buf.mem_type()); + + size_t recvd = do_xfer(m_send_mem_buf.ptr(), m_recv_mem_buf.ptr(), + length, type, type, true, false, false); + ASSERT_EQ(length, recvd); + mem_buffer::pattern_check(m_recv_mem_buf.ptr(), length, + 2, m_recv_mem_buf.mem_type()); + } +} + +UCS_TEST_P(test_ucp_tag_mem_type, xfer_mismatch_length) +{ + ucp_datatype_t type = ucp_dt_make_contig(1); + size_t length = ucs::rand() % ((ssize_t)pow(10.0, 7)); + + UCS_TEST_MESSAGE << "TEST: " + << ucs_memory_type_names[m_send_mem_type] << " <-> " + << ucs_memory_type_names[m_recv_mem_type] << " length: " + << length; + + mem_buffer m_recv_mem_buf(length, m_recv_mem_type); + mem_buffer m_send_mem_buf(length, m_send_mem_type); + + mem_buffer::pattern_fill(m_recv_mem_buf.ptr(), m_recv_mem_buf.size(), + 1, m_recv_mem_buf.mem_type()); + + mem_buffer::pattern_fill(m_send_mem_buf.ptr(), m_send_mem_buf.size(), + 2, m_send_mem_buf.mem_type()); + + /* truncated */ + do_xfer(m_send_mem_buf.ptr(), m_recv_mem_buf.ptr(), + length, type, type, true, true, false); + + /* extended recv buffer */ + size_t recvd = do_xfer(m_send_mem_buf.ptr(), m_recv_mem_buf.ptr(), + length, type, type, true, false, true); + ASSERT_EQ(length / 2, recvd); + +} + + +UCP_INSTANTIATE_TEST_CASE_GPU_AWARE(test_ucp_tag_mem_type); diff --git a/test/gtest/ucp/test_ucp_tag_mt.cc b/test/gtest/ucp/test_ucp_tag_mt.cc index b6ecb6a522b..90e1c12fb8f 100644 --- a/test/gtest/ucp/test_ucp_tag_mt.cc +++ b/test/gtest/ucp/test_ucp_tag_mt.cc @@ -53,19 +53,18 @@ class test_ucp_tag_mt : public test_ucp_tag { }; UCS_TEST_P(test_ucp_tag_mt, send_recv) { - int i; uint64_t send_data[MT_TEST_NUM_THREADS] GTEST_ATTRIBUTE_UNUSED_; uint64_t recv_data[MT_TEST_NUM_THREADS] GTEST_ATTRIBUTE_UNUSED_; ucp_tag_recv_info_t info[MT_TEST_NUM_THREADS] GTEST_ATTRIBUTE_UNUSED_; - for (i = 0; i < MT_TEST_NUM_THREADS; i++) { + for (int i = 0; i < MT_TEST_NUM_THREADS; i++) { send_data[i] = 
0xdeadbeefdeadbeef + 10 * i; recv_data[i] = 0; } #if _OPENMP && ENABLE_MT #pragma omp parallel for - for (i = 0; i < MT_TEST_NUM_THREADS; i++) { + for (int i = 0; i < MT_TEST_NUM_THREADS; i++) { ucs_status_t status; int worker_index = 0; diff --git a/test/gtest/ucp/test_ucp_tag_offload.cc b/test/gtest/ucp/test_ucp_tag_offload.cc index 6bc74a3a697..e1681a2ac58 100644 --- a/test/gtest/ucp/test_ucp_tag_offload.cc +++ b/test/gtest/ucp/test_ucp_tag_offload.cc @@ -9,16 +9,24 @@ #include "ucp_datatype.h" extern "C" { +#include #include #include } +#define UCP_INSTANTIATE_TAG_OFFLOAD_TEST_CASE(_test_case) \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, dcx, "dc_x") \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, rcx, "rc_x") + class test_ucp_tag_offload : public test_ucp_tag { public: + test_ucp_tag_offload() { + // TODO: test offload and offload MP as different variants + enable_tag_mp_offload(); + } void init() { - m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_ENABLE", "y")); test_ucp_tag::init(); check_offload_support(true); } @@ -32,13 +40,30 @@ class test_ucp_tag_offload : public test_ucp_tag { return req; } + request* recv_nb_exp(void *buffer, size_t count, ucp_datatype_t dt, + ucp_tag_t tag, ucp_tag_t tag_mask) + { + request *req1 = recv_nb_and_check(buffer, count, DATATYPE, tag, + UCP_TAG_MASK_FULL); + + // Post and cancel another receive to make sure the first one was offloaded + size_t size = receiver().worker()->context->config.ext.tm_thresh + 1; + std::vector tbuf(size, 0); + request *req2 = recv_nb_and_check(&tbuf[0], size, DATATYPE, tag, + UCP_TAG_MASK_FULL); + req_cancel(receiver(), req2); + + return req1; + } + void send_recv(entity &se, ucp_tag_t tag, size_t length) { std::vector sendbuf(length); std::vector recvbuf(length); - request *rreq = recv_nb_and_check(&recvbuf[0], length, DATATYPE, tag, - UCP_TAG_MASK_FULL); + request *rreq = recv_nb_exp(&recvbuf[0], length, DATATYPE, tag, + UCP_TAG_MASK_FULL); + request *sreq = (request*)ucp_tag_send_nb(se.ep(), 
&sendbuf[0], length, DATATYPE, tag, send_callback); if (UCS_PTR_IS_ERR(sreq)) { @@ -63,9 +88,6 @@ class test_ucp_tag_offload : public test_ucp_tag { wait(req); request_free(req); } - -protected: - ucs::ptr_vector m_env; }; UCS_TEST_P(test_ucp_tag_offload, post_after_cancel) @@ -175,8 +197,8 @@ UCS_TEST_P(test_ucp_tag_offload, post_dif_buckets) } } -UCS_TEST_P(test_ucp_tag_offload, force_thresh_basic, "TM_FORCE_THRESH=4096", - "TM_THRESH=1024") +UCS_TEST_P(test_ucp_tag_offload, force_thresh_basic, "TM_FORCE_THRESH=4k", + "TM_THRESH=1k") { uint64_t small_val = 0xFAFA; const size_t big_size = 5000; @@ -213,8 +235,8 @@ UCS_TEST_P(test_ucp_tag_offload, force_thresh_basic, "TM_FORCE_THRESH=4096", } } -UCS_TEST_P(test_ucp_tag_offload, force_thresh_blocked, "TM_FORCE_THRESH=4096", - "TM_THRESH=1024") +UCS_TEST_P(test_ucp_tag_offload, force_thresh_blocked, "TM_FORCE_THRESH=4k", + "TM_THRESH=1k") { uint64_t small_val = 0xFAFA; const size_t big_size = 5000; @@ -270,8 +292,39 @@ UCS_TEST_P(test_ucp_tag_offload, force_thresh_blocked, "TM_FORCE_THRESH=4096", } } +// Check that worker will not try to connect tag offload capable iface with +// the peer which does not support tag offload (e.g CX-5 and CX-4). In this +// case connection attempt should fail (due to peer unreachable) or some other +// transport should be selected (if available). Otherwise connect can hang, +// because some transports (e.g. rcx) have different ep address type for +// interfaces which support tag_offload. 
+UCS_TEST_P(test_ucp_tag_offload, connect) +{ + m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_ENABLE", "n")); + + entity *e = create_entity(true); + // Should be: + // - either complete ok + // - or force skipping the test (because peer is unreachable) + e->connect(&receiver(), get_ep_params()); +} + +UCS_TEST_P(test_ucp_tag_offload, small_rndv, "RNDV_THRESH=0", "TM_THRESH=0") +{ + activate_offload(sender()); + send_recv(sender(), 0x11ul, 0ul); + send_recv(sender(), 0x11ul, 1ul); +} + +UCS_TEST_P(test_ucp_tag_offload, small_sw_rndv, "RNDV_THRESH=0", "TM_THRESH=0", + "TM_SW_RNDV=y") +{ + activate_offload(sender()); + send_recv(sender(), 0x11ul, 0ul); + send_recv(sender(), 0x11ul, 1ul); +} -UCP_INSTANTIATE_TEST_CASE(test_ucp_tag_offload) +UCP_INSTANTIATE_TAG_OFFLOAD_TEST_CASE(test_ucp_tag_offload) class test_ucp_tag_offload_multi : public test_ucp_tag_offload { @@ -399,11 +452,161 @@ UCS_TEST_P(test_ucp_tag_offload_multi, recv_from_multi) // Do not include SM transports, because they would be selected for tag matching. // And since they do not support TM offload, this test would be skipped. 
-UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_tag_offload_multi, all_rcdc, - "\\rc,\\ud,rc_x,dc_x") +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_tag_offload_multi, all_rcdc, "rc,dc") -#if ENABLE_STATS +class test_ucp_tag_offload_selection : public test_ucp_tag_offload { +public: + test_ucp_tag_offload_selection() { + m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_ENABLE", "y")); + } + + static uct_device_type_t get_dev_type(ucp_ep_h ep, ucp_rsc_index_t idx) { + return ep->worker->context->tl_rscs[idx].tl_rsc.dev_type; + } + + static bool lane_shm_or_self(ucp_ep_h ep, ucp_rsc_index_t idx) { + uct_device_type_t dev_type = get_dev_type(ep, idx); + return (dev_type == UCT_DEVICE_TYPE_SHM) || (dev_type == UCT_DEVICE_TYPE_SELF); + } +}; + +UCS_TEST_P(test_ucp_tag_offload_selection, tag_lane) +{ + ucp_ep_h ep = sender().ep(); + bool has_tag_offload = false; + bool has_shm_or_self = false; + + for (ucp_rsc_index_t idx = 0; idx < sender().ucph()->num_tls; ++idx) { + if (lane_shm_or_self(ep, idx)) { + has_shm_or_self = true; + } + + uct_iface_attr_t *attr = ucp_worker_iface_get_attr(sender().worker(), idx); + if (attr->cap.flags & UCT_IFACE_FLAG_TAG_EAGER_BCOPY) { + // We do not have transports with partial tag offload support + EXPECT_TRUE(attr->cap.flags & UCT_IFACE_FLAG_TAG_RNDV_ZCOPY); + has_tag_offload = true; + } + } + + ucp_ep_config_t *ep_config = ucp_ep_config(ep); + + if (has_tag_offload && !has_shm_or_self) { + EXPECT_TRUE(ucp_ep_is_tag_offload_enabled(ep_config)); + EXPECT_EQ(ep_config->key.tag_lane, ep_config->tag.lane); + } else { + // If shm or self transports exist they would be used for tag matching + // rather than network offload + EXPECT_FALSE(ucp_ep_is_tag_offload_enabled(ep_config)); + EXPECT_EQ(ep_config->key.am_lane, ep_config->tag.lane); + } +} + +UCP_INSTANTIATE_TAG_OFFLOAD_TEST_CASE(test_ucp_tag_offload_selection); +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_tag_offload_selection, self_rcx, + "self,rc_x"); + + +class test_ucp_tag_offload_gpu : public 
test_ucp_tag_offload { +public: + test_ucp_tag_offload_gpu() { + modify_config("RNDV_THRESH", "1024"); + } + + std::vector + static enum_test_params(const ucp_params_t& ctx_params, + const std::string& name, + const std::string& test_case_name, + const std::string& tls) + { + std::vector result; + + generate_test_params_variant(ctx_params, name, test_case_name, + tls, UCS_MEMORY_TYPE_CUDA, result); + generate_test_params_variant(ctx_params, name, test_case_name, + tls, UCS_MEMORY_TYPE_ROCM, result); + + return result; + } + +protected: + ucs_memory_type_t mem_type() const { + return static_cast(GetParam().variant); + } +}; + +// Test that expected SW RNDV request is handled properly when receive buffer +// is allocated on GPU memory. +UCS_TEST_P(test_ucp_tag_offload_gpu, sw_rndv_to_gpu_mem, "TM_SW_RNDV=y") +{ + activate_offload(sender()); + + size_t size = 2048; + ucp_tag_t tag = 0xCAFEBABEul; + // Test will be skipped here if GPU mem is not supported + mem_buffer rbuf(size, mem_type()); + request *rreq = recv_nb_exp(rbuf.ptr(), size, DATATYPE, tag, + UCP_TAG_MASK_FULL); + + std::vector sendbuf(size); // can send from any memory + request *sreq = (request*)ucp_tag_send_nb(sender().ep(), &sendbuf[0], + size, DATATYPE, tag, + send_callback); + wait_and_validate(rreq); + wait_and_validate(sreq); +} + +// Test that small buffers wich can be scattered to CQE are not posted to the +// HW. Otherwise it may segfault, while copying data from CQE to the +// (potentially) GPU buffer. 
+UCS_TEST_P(test_ucp_tag_offload_gpu, rx_scatter_to_cqe, "TM_THRESH=1") +{ + activate_offload(sender()); + + size_t size = 8; + ucp_tag_t tag = 0xCAFEBABEul; + // Test will be skipped here if GPU mem is not supported + mem_buffer rbuf(size, mem_type()); + request *rreq = recv_nb_exp(rbuf.ptr(), size, DATATYPE, tag, + UCP_TAG_MASK_FULL); + uint64_t sbuf = 0ul; + request *sreq = (request*)ucp_tag_send_nb(sender().ep(), &sbuf, sizeof(sbuf), + DATATYPE, tag, send_callback); + wait_and_validate(rreq); + wait_and_validate(sreq); +} + +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_tag_offload_gpu, rc_dc_gpu, + "dc_x,rc_x," UCP_TEST_GPU_COPY_TLS) + +class test_ucp_tag_offload_status : public test_ucp_tag { +public: + test_ucp_tag_offload_status() { + m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_ENABLE", "y")); + } + + static ucp_params_t get_ctx_params() { + ucp_params_t params = ucp_test::get_ctx_params(); + // Do not pass UCP_FEATURE_TAG feature to check that UCT will not + // initialize tag offload infrastructure in this case. 
+ params.features = UCP_FEATURE_RMA; + return params; + } +}; + +UCS_TEST_P(test_ucp_tag_offload_status, check_offload_status) +{ + for (ucp_rsc_index_t i = 0; i < sender().ucph()->num_tls; ++i) { + EXPECT_FALSE(ucp_worker_iface_get_attr(sender().worker(), i)->cap.flags & + (UCT_IFACE_FLAG_TAG_EAGER_BCOPY | + UCT_IFACE_FLAG_TAG_RNDV_ZCOPY)); + } +} + +UCP_INSTANTIATE_TAG_OFFLOAD_TEST_CASE(test_ucp_tag_offload_status) + +#ifdef ENABLE_STATS class test_ucp_tag_offload_stats : public test_ucp_tag_offload_multi { public: @@ -427,7 +630,9 @@ class test_ucp_tag_offload_stats : public test_ucp_tag_offload_multi { UCP_TAG_MASK_FULL); // Post and cancel another receive to make sure the first one was offloaded - request *req2 = recv_nb_and_check(buffer, count, DATATYPE, tag, + size_t size = receiver().worker()->context->config.ext.tm_thresh + 1; + std::vector tbuf(size, 0); + request *req2 = recv_nb_and_check(&tbuf[0], size, DATATYPE, tag, UCP_TAG_MASK_FULL); req_cancel(receiver(), req2); @@ -483,14 +688,14 @@ class test_ucp_tag_offload_stats : public test_ucp_tag_offload_multi { } }; -UCS_TEST_P(test_ucp_tag_offload_stats, post, "TM_THRESH=1") +UCS_TEST_P(test_ucp_tag_offload_stats, post, "TM_THRESH=128") { - uint64_t dummy; uint64_t tag = 0x11; + std::vector dummy(256, 0); activate_offload(sender()); - request *rreq = recv_nb(&dummy, sizeof(dummy), DATATYPE, tag, + request *rreq = recv_nb(dummy.data(), dummy.size(), DATATYPE, tag, UCP_TAG_MASK_FULL); wait_counter(worker_offload_stats(receiver()), @@ -502,10 +707,10 @@ UCS_TEST_P(test_ucp_tag_offload_stats, post, "TM_THRESH=1") UCP_WORKER_STAT_TAG_OFFLOAD_CANCELED); } -UCS_TEST_P(test_ucp_tag_offload_stats, block, "TM_THRESH=1") +UCS_TEST_P(test_ucp_tag_offload_stats, block, "TM_THRESH=128") { uint64_t tag = 0x11; - std::vector buf(64, 0); + std::vector buf(256, 0); activate_offload(sender()); @@ -574,6 +779,68 @@ UCS_TEST_P(test_ucp_tag_offload_stats, sw_rndv, "RNDV_THRESH=1000") test_send_recv(size, true, 
UCP_WORKER_STAT_TAG_OFFLOAD_MATCHED_SW_RNDV); } -UCP_INSTANTIATE_TEST_CASE(test_ucp_tag_offload_stats) +UCS_TEST_P(test_ucp_tag_offload_stats, force_sw_rndv, "TM_SW_RNDV=y", + "RNDV_THRESH=1000") +{ + size_t size = 2048; // Size bigger than RNDV thresh + + // Offload is not activated, so the first message should arrive unexpectedly + test_send_recv(size, false, UCP_WORKER_STAT_TAG_OFFLOAD_RX_UNEXP_SW_RNDV); + test_send_recv(size, false, UCP_WORKER_STAT_TAG_OFFLOAD_MATCHED_SW_RNDV); +} + + +UCP_INSTANTIATE_TAG_OFFLOAD_TEST_CASE(test_ucp_tag_offload_stats) + + +class test_ucp_tag_offload_stats_gpu : public test_ucp_tag_offload_stats { +public: + test_ucp_tag_offload_stats_gpu() { + m_env.push_back(new ucs::scoped_setenv("UCX_IB_GPU_DIRECT_RDMA", "n")); + } + + std::vector + static enum_test_params(const ucp_params_t& ctx_params, + const std::string& name, + const std::string& test_case_name, + const std::string& tls) + { + std::vector result; + + generate_test_params_variant(ctx_params, name, test_case_name, + tls, UCS_MEMORY_TYPE_CUDA, result); + generate_test_params_variant(ctx_params, name, test_case_name, + tls, UCS_MEMORY_TYPE_ROCM, result); + + return result; + } + +protected: + ucs_memory_type_t mem_type() const { + return static_cast(GetParam().variant); + } +}; + +UCS_TEST_P(test_ucp_tag_offload_stats_gpu, block_gpu_no_gpu_direct, + "TM_THRESH=128") +{ + activate_offload(sender()); + + size_t size = 2048; + // Test will be skipped here if GPU mem is not supported + mem_buffer rbuf(size, mem_type()); + request *rreq = recv_nb_and_check(rbuf.ptr(), size, DATATYPE, 0x11, + UCP_TAG_MASK_FULL); + + wait_counter(worker_offload_stats(receiver()), + UCP_WORKER_STAT_TAG_OFFLOAD_BLOCK_MEM_REG); + + validate_offload_counter(UCP_WORKER_STAT_TAG_OFFLOAD_POSTED, 0ul); + + req_cancel(receiver(), rreq); +} + +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_tag_offload_stats_gpu, rc_dc_gpu, + "dc_x,rc_x," UCP_TEST_GPU_COPY_TLS) #endif diff --git a/test/gtest/ucp/test_ucp_tag_probe.cc 
b/test/gtest/ucp/test_ucp_tag_probe.cc index 28700c3071c..de7266d98fc 100644 --- a/test/gtest/ucp/test_ucp_tag_probe.cc +++ b/test/gtest/ucp/test_ucp_tag_probe.cc @@ -12,6 +12,16 @@ class test_ucp_tag_probe : public test_ucp_tag { public: + test_ucp_tag_probe() { + if (has_transport("tcp")) { + /* Decrease `TX_SEG_SIZE` and `RX_SEG_SIZE` parameters + * for TCP transport to be able fully consume receive + * buffer by 100-byte messages */ + m_env.push_back(new ucs::scoped_setenv("UCX_TCP_TX_SEG_SIZE", "4k")); + m_env.push_back(new ucs::scoped_setenv("UCX_TCP_RX_SEG_SIZE", "4k")); + } + } + /* The parameters mean the following: * - s_size and r_size: send and recv buffer sizes. * Can be different for checking message transaction error @@ -107,7 +117,6 @@ class test_ucp_tag_probe : public test_ucp_tag { ++count; } } - }; diff --git a/test/gtest/ucp/test_ucp_tag_xfer.cc b/test/gtest/ucp/test_ucp_tag_xfer.cc index 068da456323..fb2b2d13f21 100644 --- a/test/gtest/ucp/test_ucp_tag_xfer.cc +++ b/test/gtest/ucp/test_ucp_tag_xfer.cc @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
@@ -10,6 +10,7 @@ #include "ucp_datatype.h" extern "C" { +#include #include #include } @@ -23,25 +24,48 @@ class test_ucp_tag_xfer : public test_ucp_tag { VARIANT_DEFAULT, VARIANT_ERR_HANDLING, VARIANT_RNDV_PUT_ZCOPY, + VARIANT_RNDV_GET_ZCOPY, VARIANT_RNDV_AUTO, VARIANT_SEND_NBR, }; + test_ucp_tag_xfer() { + // TODO: test offload and offload MP as different variants + enable_tag_mp_offload(); + + if (RUNNING_ON_VALGRIND) { + // Alow using TM MP offload for messages with a size of at least + // 10000 bytes by setting HW TM segment size to 10 kB, since each + // packet in TM MP offload is MTU-size buffer (i.e., in most cases + // it is 4 kB segments) + m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_SEG_SIZE", "10k")); + m_env.push_back(new ucs::scoped_setenv("UCX_TCP_RX_SEG_SIZE", "8k")); + } + } + virtual void init() { if (GetParam().variant == VARIANT_RNDV_PUT_ZCOPY) { modify_config("RNDV_SCHEME", "put_zcopy"); + } else if (GetParam().variant == VARIANT_RNDV_GET_ZCOPY) { + modify_config("RNDV_SCHEME", "get_zcopy"); } else if (GetParam().variant == VARIANT_RNDV_AUTO) { modify_config("RNDV_SCHEME", "auto"); } modify_config("MAX_EAGER_LANES", "2"); modify_config("MAX_RNDV_LANES", "2"); - m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_ENABLE", "y")); - if (RUNNING_ON_VALGRIND) { - m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_MAX_BCOPY", "8k")); - } + test_ucp_tag::init(); } + bool skip_on_ib_dc() { +#if HAVE_DC_DV + // skip due to DCI stuck bug + return has_transport("dc_x"); +#else + return false; +#endif + } + std::vector static enum_test_params(const ucp_params_t& ctx_params, const std::string& name, @@ -57,6 +81,9 @@ class test_ucp_tag_xfer : public test_ucp_tag { generate_test_params_variant(ctx_params, name, test_case_name + "/rndv_put_zcopy", tls, VARIANT_RNDV_PUT_ZCOPY, result); + generate_test_params_variant(ctx_params, name, + test_case_name + "/rndv_get_zcopy", tls, + VARIANT_RNDV_GET_ZCOPY, result); generate_test_params_variant(ctx_params, name, 
test_case_name + "/rndv_auto", tls, VARIANT_RNDV_AUTO, result); @@ -116,7 +143,6 @@ class test_ucp_tag_xfer : public test_ucp_tag { static const uint64_t SENDER_TAG = 0x111337; static const uint64_t RECV_MASK = 0xffff; static const uint64_t RECV_TAG = 0x1337; - ucs::ptr_vector m_env; }; @@ -233,6 +259,7 @@ void test_ucp_tag_xfer::test_run_xfer(bool send_contig, bool recv_contig, &send_dt, &recv_dt); /* coverity[var_deref_model] */ + /* coverity[var_deref_op] */ recvd = do_xfer(&sendbuf[0], &recvbuf[0], count, send_dt, recv_dt, expected, sync, truncated); if (!truncated) { @@ -263,10 +290,8 @@ void test_ucp_tag_xfer::test_xfer_probe(bool send_contig, bool recv_contig, ucp_tag_recv_info_t info; request *rreq, *sreq; - if (&sender() == &receiver()) { - /* the self transport doesn't do rndv and completes the send immediately */ - UCS_TEST_SKIP_R("loop-back unsupported"); - } + /* the self transport doesn't do rndv and completes the send immediately */ + skip_loopback(); ucp::dt_gen_start_count = 0; ucp::dt_gen_finish_count = 0; @@ -495,8 +520,8 @@ size_t test_ucp_tag_xfer::do_xfer(const void *sendbuf, void *recvbuf, void test_ucp_tag_xfer::test_xfer_len_offset() { const size_t max_offset = 128; - const size_t max_length = 64 * 1024; - const size_t min_length = 1024; + const size_t max_length = 64 * UCS_KBYTE; + const size_t min_length = UCS_KBYTE; const size_t offset_step = 16; const size_t length_step = 16; const size_t buf_size = max_length + max_offset + 2; @@ -508,9 +533,6 @@ void test_ucp_tag_xfer::test_xfer_len_offset() ucs::detail::message_stream *ms; skip_err_handling(); - if (RUNNING_ON_VALGRIND) { - UCS_TEST_SKIP_R("valgrind"); - } EXPECT_EQ(posix_memalign(&send_buf, 8192, buf_size), 0); EXPECT_EQ(posix_memalign(&recv_buf, 8192, buf_size), 0); @@ -571,6 +593,10 @@ UCS_TEST_P(test_ucp_tag_xfer, generic_unexp) { test_xfer(&test_ucp_tag_xfer::test_xfer_generic, false, false, false); } +UCS_TEST_P(test_ucp_tag_xfer, generic_unexp_truncated) { + 
test_xfer(&test_ucp_tag_xfer::test_xfer_generic, false, false, true); +} + UCS_TEST_P(test_ucp_tag_xfer, iov_exp) { test_xfer(&test_ucp_tag_xfer::test_xfer_iov, true, false, false); } @@ -587,12 +613,8 @@ UCS_TEST_P(test_ucp_tag_xfer, generic_err_exp) { test_xfer(&test_ucp_tag_xfer::test_xfer_generic_err, true, false, false); } -UCS_TEST_P(test_ucp_tag_xfer, generic_err_unexp) { -#if HAVE_DC_DV - if (GetParam().transports.front().compare("dc_x") == 0) { - UCS_TEST_SKIP_R("DCI stuck bug"); - } -#endif +UCS_TEST_SKIP_COND_P(test_ucp_tag_xfer, generic_err_unexp, + skip_on_ib_dc()) { test_xfer(&test_ucp_tag_xfer::test_xfer_generic_err, false, false, false); } @@ -727,7 +749,7 @@ UCS_TEST_P(test_ucp_tag_xfer, send_contig_recv_contig_exp_sync_rndv_truncated, /* because ucp_tag_send_req return status (instead request) if send operation * completed immediately */ skip_loopback(); - test_run_xfer(true, true, true, false, false); + test_run_xfer(true, true, true, true, true); } UCS_TEST_P(test_ucp_tag_xfer, send_contig_recv_contig_unexp_rndv, "RNDV_THRESH=1000", @@ -747,7 +769,7 @@ UCS_TEST_P(test_ucp_tag_xfer, send_contig_recv_contig_unexp_sync_rndv, "RNDV_THR UCS_TEST_P(test_ucp_tag_xfer, send_contig_recv_contig_unexp_sync_rndv_truncated, "RNDV_THRESH=1000", "ZCOPY_THRESH=1248576") { - test_run_xfer(true, true, false, false, true); + test_run_xfer(true, true, false, true, true); } UCS_TEST_P(test_ucp_tag_xfer, send_contig_recv_contig_exp_rndv_probe, "RNDV_THRESH=1000", @@ -777,7 +799,7 @@ UCS_TEST_P(test_ucp_tag_xfer, send_generic_recv_generic_exp_sync_rndv_truncated, /* because ucp_tag_send_req return status (instead request) if send operation * completed immediately */ skip_loopback(); - test_run_xfer(false, false, true, false, false); + test_run_xfer(false, false, true, true, true); } UCS_TEST_P(test_ucp_tag_xfer, send_generic_recv_generic_unexp_rndv, "RNDV_THRESH=1000") { @@ -794,7 +816,7 @@ UCS_TEST_P(test_ucp_tag_xfer, send_generic_recv_generic_unexp_sync_rndv, 
"RNDV_T UCS_TEST_P(test_ucp_tag_xfer, send_generic_recv_generic_unexp_sync_rndv_truncated, "RNDV_THRESH=1000") { - test_run_xfer(false, false, false, false, true); + test_run_xfer(false, false, false, true, true); } UCS_TEST_P(test_ucp_tag_xfer, send_generic_recv_generic_exp_rndv_probe, "RNDV_THRESH=1000") { @@ -823,7 +845,7 @@ UCS_TEST_P(test_ucp_tag_xfer, send_generic_recv_contig_exp_sync_rndv_truncated, /* because ucp_tag_send_req return status (instead request) if send operation * completed immediately */ skip_loopback(); - test_run_xfer(false, true, true, false, false); + test_run_xfer(false, true, true, true, true); } UCS_TEST_P(test_ucp_tag_xfer, send_generic_recv_contig_unexp_rndv, "RNDV_THRESH=1000") { @@ -840,7 +862,7 @@ UCS_TEST_P(test_ucp_tag_xfer, send_generic_recv_contig_unexp_sync_rndv, "RNDV_TH UCS_TEST_P(test_ucp_tag_xfer, send_generic_recv_contig_unexp_sync_rndv_truncated, "RNDV_THRESH=1000") { - test_run_xfer(false, true, false, false, true); + test_run_xfer(false, true, false, true, true); } UCS_TEST_P(test_ucp_tag_xfer, send_generic_recv_contig_exp_rndv_probe, "RNDV_THRESH=1000") { @@ -953,13 +975,14 @@ UCS_TEST_P(test_ucp_tag_xfer, send_contig_recv_generic_exp_rndv_probe_zcopy, "RN test_xfer_probe(true, false, true, false); } -UCS_TEST_P(test_ucp_tag_xfer, test_xfer_len_offset, "RNDV_THRESH=1000") { +UCS_TEST_SKIP_COND_P(test_ucp_tag_xfer, test_xfer_len_offset, + RUNNING_ON_VALGRIND, "RNDV_THRESH=1000") { test_xfer_len_offset(); } UCS_TEST_P(test_ucp_tag_xfer, iov_with_empty_buffers, "ZCOPY_THRESH=512") { const size_t iovcnt = ucp::data_type_desc_t::MAX_IOV; - const size_t size = 1024; + const size_t size = UCS_KBYTE; const int expected = 1; const int sync = 0; const int truncated = 0; @@ -996,7 +1019,7 @@ UCS_TEST_P(test_ucp_tag_xfer, iov_with_empty_buffers, "ZCOPY_THRESH=512") { UCP_INSTANTIATE_TEST_CASE(test_ucp_tag_xfer) -#if ENABLE_STATS +#ifdef ENABLE_STATS class test_ucp_tag_stats : public test_ucp_tag_xfer { public: @@ -1028,14 
+1051,54 @@ class test_ucp_tag_stats : public test_ucp_tag_xfer { return e.worker()->stats; } - void validate_counters(uint64_t tx_cntr, uint64_t rx_cntr) { + unsigned get_rx_stat(unsigned counter) { + return UCS_STATS_GET_COUNTER(worker_stats(receiver()), counter); + } + + void validate_counters(unsigned tx_counter, unsigned rx_counter) { uint64_t cnt; - cnt = UCS_STATS_GET_COUNTER(ep_stats(sender()), tx_cntr); + cnt = UCS_STATS_GET_COUNTER(ep_stats(sender()), tx_counter); EXPECT_EQ(1ul, cnt); - cnt = UCS_STATS_GET_COUNTER(worker_stats(receiver()), rx_cntr); + cnt = get_rx_stat(rx_counter); EXPECT_EQ(1ul, cnt); } + bool has_xpmem() { + return ucp_context_find_tl_md(receiver().ucph(), "xpmem") != NULL; + } + + bool has_get_zcopy() { + return has_transport("rc_v") || has_transport("rc_x") || + has_transport("dc_x") || + (ucp_context_find_tl_md(receiver().ucph(), "cma") != NULL) || + (ucp_context_find_tl_md(receiver().ucph(), "knem") != NULL); + } + + void validate_rndv_counters() { + unsigned get_zcopy = get_rx_stat(UCP_WORKER_STAT_TAG_RX_RNDV_GET_ZCOPY); + unsigned send_rtr = get_rx_stat(UCP_WORKER_STAT_TAG_RX_RNDV_SEND_RTR); + unsigned rkey_ptr = get_rx_stat(UCP_WORKER_STAT_TAG_RX_RNDV_RKEY_PTR); + + UCS_TEST_MESSAGE << "get_zcopy: " << get_zcopy + << " send_rtr: " << send_rtr + << " rkey_ptr: " << rkey_ptr; + EXPECT_EQ(1, get_zcopy + send_rtr + rkey_ptr); + + if (has_xpmem()) { + /* rkey_ptr expected to be selected if xpmem is available */ + EXPECT_EQ(1u, rkey_ptr); + } else if (has_get_zcopy()) { + /* if any transports supports get_zcopy, expect it to be used */ + EXPECT_EQ(1u, get_zcopy); + } else { + /* Could be a transport which supports get_zcopy that wasn't + * accounted for, or fallback to RTR. In any case, rkey_ptr is not + * expected to be used. 
+ */ + EXPECT_EQ(1u, send_rtr + get_zcopy); + } + } + }; @@ -1092,6 +1155,7 @@ UCS_TEST_P(test_ucp_tag_stats, rndv_expected, "RNDV_THRESH=1000") { test_run_xfer(true, true, true, false, false); validate_counters(UCP_EP_STAT_TAG_TX_RNDV, UCP_WORKER_STAT_TAG_RX_RNDV_EXP); + validate_rndv_counters(); } UCS_TEST_P(test_ucp_tag_stats, rndv_unexpected, "RNDV_THRESH=1000") { @@ -1099,6 +1163,7 @@ UCS_TEST_P(test_ucp_tag_stats, rndv_unexpected, "RNDV_THRESH=1000") { test_run_xfer(true, true, false, false, false); validate_counters(UCP_EP_STAT_TAG_TX_RNDV, UCP_WORKER_STAT_TAG_RX_RNDV_UNEXP); + validate_rndv_counters(); } UCP_INSTANTIATE_TEST_CASE(test_ucp_tag_stats) diff --git a/test/gtest/ucp/test_ucp_wakeup.cc b/test/gtest/ucp/test_ucp_wakeup.cc index 4d1ae6a6545..3c279d25430 100644 --- a/test/gtest/ucp/test_ucp_wakeup.cc +++ b/test/gtest/ucp/test_ucp_wakeup.cc @@ -109,7 +109,14 @@ UCS_TEST_P(test_ucp_wakeup, efd) EXPECT_EQ(send_data, recv_data); } -UCS_TEST_P(test_ucp_wakeup, tx_wait, "ZCOPY_THRESH=10000") +/* This test doesn't progress receiver's worker, while + * waiting for the events on a sender's worker fd. So, + * this causes the hang due to lack of the progress during + * TCP CM message exchange (TCP doesn't have an async progress + * for such events) + * TODO: add async progress for TCP connections */ +UCS_TEST_SKIP_COND_P(test_ucp_wakeup, tx_wait, + has_transport("tcp"), "ZCOPY_THRESH=10000") { const ucp_datatype_t DATATYPE = ucp_dt_make_contig(1); const size_t COUNT = 20000; diff --git a/test/gtest/ucp/test_ucp_wireup.cc b/test/gtest/ucp/test_ucp_wireup.cc index 3868691df6c..3e3ace36294 100644 --- a/test/gtest/ucp/test_ucp_wireup.cc +++ b/test/gtest/ucp/test_ucp_wireup.cc @@ -4,8 +4,6 @@ * See file LICENSE for terms. 
*/ -#define __STDC_LIMIT_MACROS - #include "ucp_test.h" #include "common/test.h" #include "ucp/ucp_test.h" @@ -15,8 +13,8 @@ extern "C" { #include -#include #include +#include } class test_ucp_wireup : public ucp_test { @@ -26,14 +24,15 @@ class test_ucp_wireup : public ucp_test { const std::string& name, const std::string& test_case_name, const std::string& tls, - uint64_t features); + uint64_t features, bool test_all = 0); protected: enum { TEST_RMA = UCS_BIT(0), TEST_TAG = UCS_BIT(1), TEST_STREAM = UCS_BIT(2), - UNIFIED_MODE = UCS_BIT(3) + UNIFIED_MODE = UCS_BIT(3), + TEST_AMO = UCS_BIT(4) }; typedef uint64_t elem_type; @@ -70,17 +69,24 @@ class test_ucp_wireup : public ucp_test { static void tag_recv_completion(void *request, ucs_status_t status, ucp_tag_recv_info_t *info); -private: + void rkeys_cleanup(); + + void memhs_cleanup(); + + void clear_recv_data(); + + void fill_send_data(); + + ucp_rkey_h get_rkey(ucp_ep_h ep, ucp_mem_h memh); + +protected: vec_type m_send_data; vec_type m_recv_data; ucs::handle m_memh_sender; ucs::handle m_memh_receiver; std::vector< ucs::handle > m_rkeys; - void clear_recv_data(); - - ucp_rkey_h get_rkey(ucp_ep_h ep, ucp_mem_h memh); - +private: static void stream_recv_completion(void *request, ucs_status_t status, size_t length); @@ -92,7 +98,7 @@ test_ucp_wireup::enum_test_params_features(const ucp_params_t& ctx_params, const std::string& name, const std::string& test_case_name, const std::string& tls, - uint64_t features) + uint64_t features, bool test_all) { std::vector result; ucp_params_t tmp_ctx_params = ctx_params; @@ -120,11 +126,25 @@ test_ucp_wireup::enum_test_params_features(const ucp_params_t& ctx_params, generate_test_params_variant(tmp_ctx_params, name, test_case_name + "/stream", tls, TEST_STREAM, result); - generate_test_params_variant(tmp_ctx_params, name, test_case_name + "/tag", + generate_test_params_variant(tmp_ctx_params, name, test_case_name + "/stream", tls, TEST_STREAM | UNIFIED_MODE, result); } + if 
(features & (UCP_FEATURE_AMO32 | UCP_FEATURE_AMO64)) { + tmp_ctx_params.features = (UCP_FEATURE_AMO32 | UCP_FEATURE_AMO64); + generate_test_params_variant(tmp_ctx_params, name, test_case_name + "/amo", + tls, TEST_AMO, result); + } + + if (test_all) { + uint64_t all_flags = (TEST_TAG | TEST_RMA | TEST_STREAM); + tmp_ctx_params.features = features; + generate_test_params_variant(tmp_ctx_params, name, test_case_name + "/all", + tls, all_flags, result); + generate_test_params_variant(tmp_ctx_params, name, test_case_name + "/all", + tls, all_flags | UNIFIED_MODE, result); + } return result; } @@ -148,7 +168,7 @@ void test_ucp_wireup::init() m_send_data.resize(BUFFER_LENGTH, 0); m_recv_data.resize(BUFFER_LENGTH, 0); - if (GetParam().variant & TEST_RMA) { + if (GetParam().variant & (TEST_RMA | TEST_AMO)) { ucs_status_t status; ucp_mem_map_params_t params; ucp_mem_h memh; @@ -194,10 +214,18 @@ ucp_rkey_h test_ucp_wireup::get_rkey(ucp_ep_h ep, ucp_mem_h memh) return rkey; } -void test_ucp_wireup::cleanup() { +void test_ucp_wireup::rkeys_cleanup() { m_rkeys.clear(); +} + +void test_ucp_wireup::memhs_cleanup() { m_memh_sender.reset(); m_memh_receiver.reset(); +} + +void test_ucp_wireup::cleanup() { + rkeys_cleanup(); + memhs_cleanup(); ucp_test::cleanup(); } @@ -358,17 +386,27 @@ class test_ucp_wireup_1sided : public test_ucp_wireup { return enum_test_params_features(ctx_params, name, test_case_name, tls, UCP_FEATURE_RMA | UCP_FEATURE_TAG); } + + test_ucp_wireup_1sided() { + for (ucp_lane_index_t i = 0; i < UCP_MAX_LANES; ++i) { + m_lanes2remote[i] = i; + } + } + + ucp_lane_index_t m_lanes2remote[UCP_MAX_LANES]; }; UCS_TEST_P(test_ucp_wireup_1sided, address) { ucs_status_t status; size_t size; void *buffer; - unsigned order[UCP_MAX_RESOURCES]; std::set packed_dev_priorities, unpacked_dev_priorities; ucp_rsc_index_t tl; - status = ucp_address_pack(sender().worker(), NULL, -1, order, &size, &buffer); + status = ucp_address_pack(sender().worker(), NULL, + 
std::numeric_limits::max(), + UCP_ADDRESS_PACK_FLAGS_ALL, m_lanes2remote, &size, + &buffer); ASSERT_UCS_OK(status); ASSERT_TRUE(buffer != NULL); ASSERT_GT(size, 0ul); @@ -383,7 +421,8 @@ UCS_TEST_P(test_ucp_wireup_1sided, address) { ucp_unpacked_address unpacked_address; - status = ucp_address_unpack(sender().worker(), buffer, &unpacked_address); + status = ucp_address_unpack(sender().worker(), buffer, + UCP_ADDRESS_PACK_FLAGS_ALL, &unpacked_address); ASSERT_UCS_OK(status); EXPECT_EQ(sender().worker()->uuid, unpacked_address.uuid); @@ -394,9 +433,8 @@ UCS_TEST_P(test_ucp_wireup_1sided, address) { EXPECT_LE(unpacked_address.address_count, static_cast(sender().ucph()->num_tls)); - for (const ucp_address_entry_t *ae = unpacked_address.address_list; - ae < unpacked_address.address_list + unpacked_address.address_count; - ++ae) { + const ucp_address_entry_t *ae; + ucp_unpacked_address_for_each(ae, &unpacked_address) { unpacked_dev_priorities.insert(ae->iface_attr.priority); } @@ -409,20 +447,50 @@ UCS_TEST_P(test_ucp_wireup_1sided, address) { ASSERT_TRUE(packed_dev_priorities == unpacked_dev_priorities); } +UCS_TEST_P(test_ucp_wireup_1sided, ep_address, "IB_NUM_PATHS?=2") { + ucs_status_t status; + size_t size; + void *buffer; + + sender().connect(&receiver(), get_ep_params()); + + status = ucp_address_pack(sender().worker(), sender().ep(), + std::numeric_limits::max(), + UCP_ADDRESS_PACK_FLAGS_ALL, m_lanes2remote, &size, + &buffer); + ASSERT_UCS_OK(status); + ASSERT_TRUE(buffer != NULL); + + ucp_unpacked_address unpacked_address; + + status = ucp_address_unpack(sender().worker(), buffer, + UCP_ADDRESS_PACK_FLAGS_ALL, &unpacked_address); + ASSERT_UCS_OK(status); + + EXPECT_EQ(sender().worker()->uuid, unpacked_address.uuid); + EXPECT_LE(unpacked_address.address_count, + static_cast(sender().ucph()->num_tls)); + + ucs_free(unpacked_address.address_list); + ucs_free(buffer); +} + UCS_TEST_P(test_ucp_wireup_1sided, empty_address) { ucs_status_t status; size_t size; void 
*buffer; - unsigned order[UCP_MAX_RESOURCES]; - status = ucp_address_pack(sender().worker(), NULL, 0, order, &size, &buffer); + status = ucp_address_pack(sender().worker(), NULL, 0, + UCP_ADDRESS_PACK_FLAGS_ALL, m_lanes2remote, &size, + &buffer); ASSERT_UCS_OK(status); ASSERT_TRUE(buffer != NULL); ASSERT_GT(size, 0ul); ucp_unpacked_address unpacked_address; - status = ucp_address_unpack(sender().worker(), buffer, &unpacked_address); + status = ucp_address_unpack(sender().worker(), buffer, + UCP_ADDRESS_PACK_FLAGS_ALL, &unpacked_address); ASSERT_UCS_OK(status); EXPECT_EQ(sender().worker()->uuid, unpacked_address.uuid); @@ -471,7 +539,8 @@ UCS_TEST_P(test_ucp_wireup_1sided, stress_connect) { for (int i = 0; i < 30; ++i) { sender().connect(&receiver(), get_ep_params()); send_recv(sender().ep(), receiver().worker(), receiver().ep(), 1, - 10000 / ucs::test_time_multiplier()); + 10000 / (ucs::test_time_multiplier() * + ucs::test_time_multiplier())); if (!is_loopback()) { receiver().connect(&sender(), get_ep_params()); } @@ -484,11 +553,15 @@ UCS_TEST_P(test_ucp_wireup_1sided, stress_connect) { } UCS_TEST_P(test_ucp_wireup_1sided, stress_connect2) { - int count = ucs_min(1000 / ucs::test_time_multiplier(), max_connections() / 2); + int max_count = (int)ucs_max(10, + (1000.0 / (ucs::test_time_multiplier() * + ucs::test_time_multiplier()))); + int count = ucs_min(max_count, max_connections() / 2); + for (int i = 0; i < count; ++i) { sender().connect(&receiver(), get_ep_params()); send_recv(sender().ep(), receiver().worker(), receiver().ep(), 1, 1); - if (&sender() != &receiver()) { + if (!is_loopback()) { receiver().connect(&sender(), get_ep_params()); } @@ -582,14 +655,15 @@ UCS_TEST_P(test_ucp_wireup_1sided, disconnect_nb_onesided) { std::vector sreqs; send_nb(sender().ep(), 1000, 1000, sreqs); - void *dreq = sender().disconnect_nb(); - if (!UCS_PTR_IS_PTR(dreq)) { - ASSERT_UCS_OK(UCS_PTR_STATUS(dreq)); + void *req = sender().disconnect_nb(); + ucs_time_t deadline = 
ucs::get_deadline(); + while (!is_request_completed(req) && (ucs_get_time() < deadline)) { + progress(); } - wait(dreq); - recv_b(receiver().worker(), receiver().ep(), 1000, 1000); + sender().close_ep_req_free(req); + recv_b(receiver().worker(), receiver().ep(), 1000, 1000); waitall(sreqs); } @@ -694,10 +768,8 @@ UCS_TEST_P(test_ucp_wireup_2sided, no_loopback_with_delay) { test_connect_loopback(true, false); } -UCS_TEST_P(test_ucp_wireup_2sided, async_connect) { - if (!(GetParam().ctx_params.features & UCP_FEATURE_TAG)) { - UCS_TEST_SKIP_R("The test requires UCP_FEATURE_TAG"); - } +UCS_TEST_SKIP_COND_P(test_ucp_wireup_2sided, async_connect, + !(GetParam().ctx_params.features & UCP_FEATURE_TAG)) { sender().connect(&receiver(), get_ep_params()); ucp_ep_h send_ep = sender().ep(); std::vector reqs; @@ -710,6 +782,7 @@ UCS_TEST_P(test_ucp_wireup_2sided, async_connect) { while(!(send_ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED) && (ucs_get_time() < deadline)) { ucp_worker_progress(sender().worker()); + ucp_worker_progress(receiver().worker()); } reqs.push_back(ucp_tag_recv_nb(receiver().worker(), NULL, 0, DT_U64, 1, @@ -802,3 +875,564 @@ UCS_TEST_P(test_ucp_wireup_errh_peer, msg_before_ep_create) { } UCP_INSTANTIATE_TEST_CASE(test_ucp_wireup_errh_peer) + +class test_ucp_wireup_fallback : public test_ucp_wireup { +public: + test_ucp_wireup_fallback() { + m_num_lanes = 0; + } + + static std::vector + enum_test_params(const ucp_params_t& ctx_params, const std::string& name, + const std::string& test_case_name, const std::string& tls) + { + return enum_test_params_features(ctx_params, name, test_case_name, tls, + UCP_FEATURE_TAG | UCP_FEATURE_RMA | + UCP_FEATURE_STREAM, 1); + } + + void init() { + /* do nothing */ + } + + void cleanup() { + /* do nothing */ + } + + bool check_scalable_tls(const ucp_worker_h worker, size_t est_num_eps) { + ucp_rsc_index_t rsc_index; + + ucs_for_each_bit(rsc_index, worker->context->tl_bitmap) { + ucp_md_index_t md_index = 
worker->context->tl_rscs[rsc_index].md_index; + const uct_md_attr_t *md_attr = &worker->context->tl_mds[md_index].attr; + + if ((worker->context->tl_rscs[rsc_index].flags & UCP_TL_RSC_FLAG_AUX) || + (md_attr->cap.flags & UCT_MD_FLAG_SOCKADDR) || + (worker->context->tl_rscs[rsc_index].tl_rsc.dev_type == UCT_DEVICE_TYPE_ACC)) { + // Skip TLs for wireup and CM and acceleration TLs + continue; + } + + if (ucp_worker_iface_get_attr(worker, rsc_index)->max_num_eps >= est_num_eps) { + EXPECT_TRUE((worker->scalable_tl_bitmap & UCS_BIT(rsc_index)) != 0); + return true; + } else { + EXPECT_TRUE((worker->scalable_tl_bitmap & UCS_BIT(rsc_index)) == 0); + } + } + + return false; + } + + bool test_est_num_eps_fallback(size_t est_num_eps, + unsigned long &min_max_num_eps) { + size_t num_lanes = 0; + bool res = true; + bool has_only_unscalable; + + min_max_num_eps = UCS_ULUNITS_INF; + + UCS_TEST_MESSAGE << "Testing " << est_num_eps << " number of EPs"; + modify_config("NUM_EPS", ucs::to_string(est_num_eps).c_str()); + test_ucp_wireup::init(); + + sender().connect(&receiver(), get_ep_params()); + if (!is_loopback()) { + receiver().connect(&sender(), get_ep_params()); + } + send_recv(sender().ep(), receiver().worker(), receiver().ep(), 1, 1); + flush_worker(sender()); + + has_only_unscalable = !check_scalable_tls(sender().worker(), + est_num_eps); + + for (ucp_lane_index_t lane = 0; + lane < ucp_ep_num_lanes(sender().ep()); lane++) { + uct_ep_h uct_ep = sender().ep()->uct_eps[lane]; + if (uct_ep == NULL) { + continue; + } + + uct_iface_attr_t iface_attr; + ucs_status_t status = uct_iface_query(uct_ep->iface, &iface_attr); + ASSERT_UCS_OK(status); + + num_lanes++; + + if (!has_only_unscalable && (iface_attr.max_num_eps < est_num_eps)) { + res = false; + goto out; + } + + if (iface_attr.max_num_eps < min_max_num_eps) { + min_max_num_eps = iface_attr.max_num_eps; + } + } + +out: + test_ucp_wireup::cleanup(); + + if (est_num_eps == 1) { + m_num_lanes = num_lanes; + } else if 
(has_only_unscalable) { + /* If has only unscalable transports, check that the number of + * lanes is the same as for the case when "est_num_eps == 1" */ + res = (num_lanes == m_num_lanes); + } + + return res; + } + +private: + + /* The number of lanes activated for the case when "est_num_eps == 1" */ + size_t m_num_lanes; +}; + +UCS_TEST_P(test_ucp_wireup_fallback, est_num_eps_fallback) { + unsigned long test_min_max_eps, min_max_eps; + + test_est_num_eps_fallback(1, test_min_max_eps); + + size_t prev_min_max_eps = 0; + while ((test_min_max_eps != UCS_ULUNITS_INF) && + /* number of EPs was changed between iterations */ + (test_min_max_eps != prev_min_max_eps)) { + if (test_min_max_eps > 1) { + EXPECT_TRUE(test_est_num_eps_fallback(test_min_max_eps - 1, + min_max_eps)); + } + + EXPECT_TRUE(test_est_num_eps_fallback(test_min_max_eps, + min_max_eps)); + + EXPECT_TRUE(test_est_num_eps_fallback(test_min_max_eps + 1, + min_max_eps)); + prev_min_max_eps = test_min_max_eps; + test_min_max_eps = min_max_eps; + } +} + +/* Test fallback from RC to UD, since RC isn't scalable enough + * as its iface max_num_eps attribute = 256 by default */ +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wireup_fallback, + rc_ud, "rc_x,rc_v,ud_x,ud_v") +/* Test fallback selection of UD only TLs, since TCP shouldn't + * be used for any lanes */ +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wireup_fallback, + ud_tcp, "ud_x,ud_v,tcp") +/* Test two scalable enough transports */ +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wireup_fallback, + dc_ud, "dc_x,ud_x,ud_v") +/* Test unsacalable transports only */ +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wireup_fallback, + rc, "rc_x,rc_v") +/* Test all available IB transports */ +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wireup_fallback, + ib, "ib") +/* Test on TCP only */ +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wireup_fallback, + tcp, "tcp") + +class test_ucp_wireup_unified : public test_ucp_wireup { +public: + static std::vector + enum_test_params(const ucp_params_t& 
ctx_params, const std::string& name, + const std::string& test_case_name, const std::string& tls) + { + std::vector result; + ucp_params_t tmp_ctx_params = ctx_params; + + tmp_ctx_params.features = UCP_FEATURE_TAG; + + generate_test_params_variant(tmp_ctx_params, name, test_case_name + "/uni", + tls, TEST_TAG | UNIFIED_MODE, result); + return result; + } + + bool context_has_tls(ucp_context_h ctx, const std::string& tl, + ucp_rsc_index_t md_idx) + { + for (ucp_rsc_index_t idx = 0; idx < ctx->num_tls; ++idx) { + if (ctx->tl_rscs[idx].md_index != md_idx) { + continue; + } + + if (!strcmp(ctx->tl_rscs[idx].tl_rsc.tl_name, tl.c_str())) { + return true; + } + } + + return false; + } + + bool worker_has_tls(ucp_worker_h worker, const std::string& tl, + ucp_rsc_index_t md_idx) + { + ucp_context_h ctx = worker->context; + + for (unsigned i = 0; i < worker->num_ifaces; ++i) { + ucp_worker_iface_t *wiface = worker->ifaces[i]; + ucp_rsc_index_t md_idx_it = ctx->tl_rscs[wiface->rsc_index].md_index; + + if (md_idx_it != md_idx) { + continue; + } + + char* name = ctx->tl_rscs[wiface->rsc_index].tl_rsc.tl_name; + if (!strcmp(name, tl.c_str())) { + return true; + } + } + return false; + } + + void check_unified_ifaces(entity *e, + const std::string& better_tl, + const std::string& tl) + { + ucp_context_h ctx = e->ucph(); + ucp_worker_h worker = e->worker(); + + for (ucp_rsc_index_t i = 0; i < ctx->num_mds; ++i) { + if (!(context_has_tls(ctx, better_tl, i) && + context_has_tls(ctx, tl, i))) { + continue; + } + + ASSERT_TRUE(ctx->num_tls > worker->num_ifaces); + EXPECT_TRUE(worker_has_tls(worker, better_tl, i)); + EXPECT_FALSE(worker_has_tls(worker, tl, i)); + } + } +}; + + +UCS_TEST_P(test_ucp_wireup_unified, select_best_ifaces) +{ + // Accelerated transports have better performance charasteristics than their + // verbs counterparts. Check that corresponding verbs transports are not used + // by workers in unified mode. 
+ check_unified_ifaces(&sender(), "rc_mlx5", "rc_verbs"); + check_unified_ifaces(&sender(), "ud_mlx5", "ud_verbs"); + + // RC and DC has similar capabilities, but RC has better latency while + // estimated number of endpoints is relatively small. + // sender() is created with 1 ep, so RC should be selected over DC. + check_unified_ifaces(&sender(), "rc_mlx5", "dc_mlx5"); + + // Set some big enough number of endpoints for DC to be more performance + // efficient than RC. Now check that DC is selected over RC. + modify_config("NUM_EPS", "1000"); + entity *e = create_entity(); + check_unified_ifaces(e, "dc_mlx5", "rc_mlx5"); +} + +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wireup_unified, rc, "rc") +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wireup_unified, ud, "ud") +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wireup_unified, rc_dc, "rc,dc") + +class test_ucp_wireup_fallback_amo : public test_ucp_wireup { + void init() { + size_t device_atomics_cnt = 0; + + test_ucp_wireup::init(); + + for (ucp_rsc_index_t idx = 0; idx < sender().ucph()->num_tls; ++idx) { + uct_iface_attr_t *attr = ucp_worker_iface_get_attr(sender().worker(), + idx); + if (attr->cap.flags & UCT_IFACE_FLAG_ATOMIC_DEVICE) { + device_atomics_cnt++; + } + } + bool device_atomics_supported = sender().worker()->atomic_tls != 0; + + test_ucp_wireup::cleanup(); + + if (!device_atomics_supported || !device_atomics_cnt) { + UCS_TEST_SKIP_R("there are no TLs that support device atomics"); + } + } + + void cleanup() { + /* do nothing */ + } + +protected: + + bool use_device_amo(ucp_ep_h ep) { + ucp_ep_config_t *ep_config = ucp_ep_config(ep); + + for (ucp_lane_index_t lane = 0; lane < UCP_MAX_LANES; ++lane) { + if (ep_config->key.amo_lanes[lane] != UCP_NULL_LANE) { + return (ucp_ep_get_iface_attr(ep, lane)->cap.flags & + UCT_IFACE_FLAG_ATOMIC_DEVICE); + } + } + + return false; + } + + size_t get_min_max_num_eps(ucp_ep_h ep) { + unsigned long min_max_num_eps = UCS_ULUNITS_INF; + + for (ucp_lane_index_t lane = 0; lane < 
ucp_ep_num_lanes(ep); lane++) { + uct_iface_attr_t *iface_attr = ucp_ep_get_iface_attr(ep, lane); + + if (iface_attr->max_num_eps < min_max_num_eps) { + min_max_num_eps = iface_attr->max_num_eps; + } + } + + return min_max_num_eps; + } + + size_t test_wireup_fallback_amo(const std::vector &tls, + size_t est_num_eps, bool should_use_device_amo) { + unsigned long min_max_num_eps = UCS_ULUNITS_INF; + + UCS_TEST_MESSAGE << "Testing " << est_num_eps << " number of EPs"; + modify_config("NUM_EPS", ucs::to_string(est_num_eps).c_str()); + + // Create new entity and add to to the end of vector + // (thus it will be receiver without any connections) + create_entity(false); + + ucp_test_param params = GetParam(); + for (std::vector::const_iterator i = tls.begin(); + i != tls.end(); ++i) { + params.transports.clear(); + params.transports.push_back(*i); + create_entity(true, params); + sender().connect(&receiver(), get_ep_params()); + + EXPECT_EQ(should_use_device_amo, use_device_amo(sender().ep())); + + size_t max_num_eps = get_min_max_num_eps(sender().ep()); + if (max_num_eps < min_max_num_eps) { + min_max_num_eps = max_num_eps; + } + } + + test_ucp_wireup::cleanup(); + + return min_max_num_eps; + } + +public: + + static ucp_params_t get_ctx_params() { + ucp_params_t params = test_ucp_wireup::get_ctx_params(); + params.field_mask |= UCP_PARAM_FIELD_FEATURES; + params.features |= (UCP_FEATURE_AMO32 | + UCP_FEATURE_AMO64); + return params; + } +}; + +class test_ucp_wireup_amo : public test_ucp_wireup { +public: + typedef struct { + test_ucp_wireup_amo *test; + } request_t; + + static ucp_params_t get_ctx_params() { + ucp_params_t params = test_ucp_wireup::get_ctx_params(); + params.field_mask |= UCP_PARAM_FIELD_REQUEST_SIZE; + params.request_size = sizeof(request_t); + return params; + } + + static std::vector + enum_test_params(const ucp_params_t& ctx_params, const std::string& name, + const std::string& test_case_name, const std::string& tls) + { + uint64_t amo_features; + + 
EXPECT_TRUE((sizeof(elem_type) == 4ul) || (sizeof(elem_type) == 8ul)); + amo_features = (sizeof(elem_type) == 4ul) ? UCP_FEATURE_AMO32 : + UCP_FEATURE_AMO64; + return enum_test_params_features(ctx_params, name, test_case_name, tls, + amo_features, false); + } + +protected: + ucp_rkey_h get_rkey(const entity &e) { + if (&sender() == &e) { + return test_ucp_wireup::get_rkey(e.ep(), m_memh_receiver); + } else if (&receiver() == &e) { + return test_ucp_wireup::get_rkey(e.ep(), m_memh_sender); + } + + return NULL; + } + + void add_rkey(ucp_rkey_h rkey) { + ASSERT_NE((ucp_rkey_h)NULL, rkey); + m_rkeys.push_back(ucs::handle(rkey, ucp_rkey_destroy)); + } + + void fill_send_data() { + m_send_data[0] = ucs_generate_uuid(0); + } + + static void flush_cb(void *req, ucs_status_t status) { + request_t *request = (request_t *)req; + + ASSERT_UCS_OK(status); + request->test->rkeys_cleanup(); + request->test->memhs_cleanup(); + } +}; + +UCS_TEST_P(test_ucp_wireup_amo, relese_key_after_flush) { + fill_send_data(); + clear_recv_data(); + + sender().connect(&receiver(), get_ep_params()); + + ucp_rkey_h rkey = get_rkey(sender()); + add_rkey(rkey); + + ucs_status_t status = ucp_atomic_post(sender().ep(), UCP_ATOMIC_POST_OP_ADD, + m_send_data[0], sizeof(elem_type), + (uint64_t)&m_recv_data[0], rkey); + ASSERT_UCS_OK(status); + request_t *req = (request_t *)ucp_ep_flush_nb(sender().ep(), 0, flush_cb); + if (UCS_PTR_IS_PTR(req)) { + req->test = this; + wait(req); + } else { + ASSERT_UCS_OK(UCS_PTR_STATUS(req)); + } +} + +UCP_INSTANTIATE_TEST_CASE(test_ucp_wireup_amo) + +UCS_TEST_P(test_ucp_wireup_fallback_amo, different_amo_types) { + std::vector tls; + + /* the 1st peer support RC only (device atomics) */ + tls.push_back("rc"); + /* the 2nd peer support RC and SHM (device and CPU atomics) */ + tls.push_back("rc,shm"); + + size_t min_max_num_eps = test_wireup_fallback_amo(tls, 1, 1); + test_wireup_fallback_amo(tls, min_max_num_eps + 1, 0); +} + 
+UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wireup_fallback_amo, + shm_rc, "shm,rc_x,rc_v") + +/* NOTE: this fixture is NOT inherited from test_ucp_wireup, because we want to + * create our own entities. + */ +class test_ucp_wireup_asymmetric : public ucp_test { +protected: + virtual void init() { + static const char *ibdev_sysfs_dir = "/sys/class/infiniband"; + + DIR *dir = opendir(ibdev_sysfs_dir); + if (dir == NULL) { + UCS_TEST_SKIP_R(std::string(ibdev_sysfs_dir) + " not found"); + } + + for (;;) { + struct dirent *entry = readdir(dir); + if (entry == NULL) { + break; + } + + if (entry->d_name[0] == '.') { + continue; + } + + m_ib_devices.push_back(entry->d_name); + } + + closedir(dir); + } + + void tag_sendrecv(size_t size) { + std::string send_data(size, 's'); + std::string recv_data(size, 'x'); + + ucs_status_ptr_t sreq = ucp_tag_send_nb( + sender().ep(0), &send_data[0], size, + ucp_dt_make_contig(1), 1, + (ucp_send_callback_t)ucs_empty_function); + ucs_status_ptr_t rreq = ucp_tag_recv_nb( + receiver().worker(), &recv_data[0], size, + ucp_dt_make_contig(1), 1, 1, + (ucp_tag_recv_callback_t)ucs_empty_function); + wait(sreq); + wait(rreq); + + EXPECT_EQ(send_data, recv_data); + } + + /* Generate a pci_bw configuration string for IB devices, which assigns + * the speed ai+b for device i. 
+ */ + std::string pci_bw_config(int a, int b) { + std::string config_str; + for (size_t i = 0; i < m_ib_devices.size(); ++i) { + config_str += m_ib_devices[i] + ":" + + ucs::to_string((a * i) + b) + "Gbps"; + if (i != (m_ib_devices.size() - 1)) { + config_str += ","; + } + } + return config_str; + } + + std::vector m_ib_devices; + +public: + static ucp_params_t get_ctx_params() { + ucp_params_t params = ucp_test::get_ctx_params(); + params.field_mask |= UCP_PARAM_FIELD_FEATURES; + params.features = UCP_FEATURE_TAG; + return params; + } +}; + +/* + * Force asymmetric configuration by different PCI_BW settings + */ +UCS_TEST_SKIP_COND_P(test_ucp_wireup_asymmetric, connect, is_self()) { + + /* Enable cross-dev connection */ + /* coverity[tainted_string_argument] */ + ucs::scoped_setenv path_mtu_env("UCX_RC_PATH_MTU", "1024"); + + { + std::string config_str = pci_bw_config(20, 20); + UCS_TEST_MESSAGE << "creating sender: " << config_str; + /* coverity[tainted_string_argument] */ + ucs::scoped_setenv pci_bw_env("UCX_IB_PCI_BW", config_str.c_str()); + create_entity(); + } + + { + std::string config_str = pci_bw_config(-20, m_ib_devices.size() * 20); + UCS_TEST_MESSAGE << "creating receiver: " << config_str; + /* coverity[tainted_string_argument] */ + ucs::scoped_setenv pci_bw_env("UCX_IB_PCI_BW", config_str.c_str()); + create_entity(); + } + + sender().connect(&receiver(), get_ep_params()); + receiver().connect(&sender(), get_ep_params()); + + ucp_ep_print_info(sender().ep(), stdout); + ucp_ep_print_info(receiver().ep(), stdout); + + tag_sendrecv(1); + tag_sendrecv(100000); + tag_sendrecv(1000000); +} + +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wireup_asymmetric, rcv, "rc_v") +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wireup_asymmetric, rcx, "rc_x") +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wireup_asymmetric, ib, "ib") diff --git a/test/gtest/ucp/ucp_test.cc b/test/gtest/ucp/ucp_test.cc index b209a023571..70a07f7ed2a 100644 --- a/test/gtest/ucp/ucp_test.cc +++ 
b/test/gtest/ucp/ucp_test.cc @@ -4,12 +4,19 @@ */ #include "ucp_test.h" - #include +#include + +extern "C" { +#include +#if HAVE_IB +#include +#endif #include #include -#include +} +#include namespace ucp { const uint32_t MAGIC = 0xd7d7d7d7U; @@ -31,7 +38,7 @@ std::ostream& operator<<(std::ostream& os, const ucp_test_param& test_param) const ucp_datatype_t ucp_test::DATATYPE = ucp_dt_make_contig(1); const ucp_datatype_t ucp_test::DATATYPE_IOV = ucp_dt_make_iov(); -ucp_test::ucp_test() : m_err_handler_count(0) { +ucp_test::ucp_test() { ucs_status_t status; status = ucp_config_read(NULL, NULL, &m_ucp_config); ASSERT_UCS_OK(status); @@ -73,6 +80,24 @@ void ucp_test::init() { } } +static bool check_transport(const std::string check_tl_name, + const std::vector& tl_names) { + return (std::find(tl_names.begin(), tl_names.end(), + check_tl_name) != tl_names.end()); +} + +bool ucp_test::has_transport(const std::string& tl_name) const { + return check_transport(tl_name, GetParam().transports); +} + +bool ucp_test::has_any_transport(const std::vector& tl_names) const { + const std::vector& all_tl_names = GetParam().transports; + + return std::find_first_of(all_tl_names.begin(), all_tl_names.end(), + tl_names.begin(), tl_names.end()) != + all_tl_names.end(); +} + bool ucp_test::is_self() const { return "self" == GetParam().transports.front(); } @@ -81,9 +106,9 @@ ucp_test_base::entity* ucp_test::create_entity(bool add_in_front) { return create_entity(add_in_front, GetParam()); } -ucp_test_base::entity* ucp_test::create_entity(bool add_in_front, - const ucp_test_param &test_param) { - entity *e = new entity(test_param, m_ucp_config, get_worker_params()); +ucp_test_base::entity* +ucp_test::create_entity(bool add_in_front, const ucp_test_param &test_param) { + entity *e = new entity(test_param, m_ucp_config, get_worker_params(), this); if (add_in_front) { m_entities.push_front(e); } else { @@ -92,20 +117,6 @@ ucp_test_base::entity* ucp_test::create_entity(bool add_in_front, 
return e; } -ucp_test::entity* ucp_test::get_entity_by_ep(ucp_ep_h ep) { - ucs::ptr_vector::const_iterator e_it; - for (e_it = entities().begin(); e_it != entities().end(); ++e_it) { - for (int w_idx = 0; w_idx < (*e_it)->get_num_workers(); ++w_idx) { - for (int ep_idx = 0; ep_idx < (*e_it)->get_num_eps(w_idx); ++ep_idx) { - if (ep == (*e_it)->ep(w_idx, ep_idx)) { - return *e_it; - } - } - } - } - return NULL; -} - ucp_params_t ucp_test::get_ctx_params() { ucp_params_t params; memset(¶ms, 0, sizeof(params)); @@ -157,21 +168,24 @@ void ucp_test::flush_worker(const entity &e, int worker_index) wait(request, worker_index); } -void ucp_test::disconnect(const entity& entity) { - for (int i = 0; i < entity.get_num_workers(); i++) { - if (m_err_handler_count == 0) { - flush_worker(entity, i); - } +void ucp_test::disconnect(entity& e) { + bool has_failed_entity = false; + for (ucs::ptr_vector::const_iterator iter = entities().begin(); + !has_failed_entity && (iter != entities().end()); ++iter) { + has_failed_entity = ((*iter)->get_err_num() > 0); + } - for (int j = 0; j < entity.get_num_eps(i); j++) { - void *dreq = entity.disconnect_nb(i, j, m_err_handler_count == 0 ? 
- UCP_EP_CLOSE_MODE_FLUSH : - UCP_EP_CLOSE_MODE_FORCE); - if (!UCS_PTR_IS_PTR(dreq)) { - ASSERT_UCS_OK(UCS_PTR_STATUS(dreq)); - } - wait(dreq, i); + for (int i = 0; i < e.get_num_workers(); i++) { + enum ucp_ep_close_mode close_mode; + + if (has_failed_entity) { + close_mode = UCP_EP_CLOSE_MODE_FORCE; + } else { + flush_worker(e, i); + close_mode = UCP_EP_CLOSE_MODE_FLUSH; } + + e.close_all_eps(*this, i, close_mode); } } @@ -188,10 +202,11 @@ void ucp_test::wait(void *req, int worker_index) } ucs_status_t status; + ucs_time_t deadline = ucs::get_deadline(); do { progress(worker_index); status = ucp_request_check_status(req); - } while (status == UCS_INPROGRESS); + } while ((status == UCS_INPROGRESS) && (ucs_get_time() < deadline)); if (status != UCS_OK) { /* UCS errors are suppressed in case of error handling tests */ @@ -207,8 +222,7 @@ void ucp_test::set_ucp_config(ucp_config_t *config) { } int ucp_test::max_connections() { - std::vector::const_iterator end = GetParam().transports.end(); - if (std::find(GetParam().transports.begin(), end, "tcp") != end) { + if (has_transport("tcp")) { return ucs::max_tcp_connections(); } else { return std::numeric_limits::max(); @@ -351,8 +365,9 @@ bool ucp_test::check_test_param(const std::string& name, ucp_test_base::entity::entity(const ucp_test_param& test_param, ucp_config_t* ucp_config, - const ucp_worker_params_t& worker_params) - : m_rejected_cntr(0) + const ucp_worker_params_t& worker_params, + const ucp_test_base *test_owner) + : m_err_cntr(0), m_rejected_cntr(0) { ucp_test_param entity_param = test_param; ucp_worker_params_t local_worker_params = worker_params; @@ -379,8 +394,9 @@ ucp_test_base::entity::entity(const ucp_test_param& test_param, { scoped_log_handler slh(hide_errors_logger); - UCS_TEST_CREATE_HANDLE(ucp_context_h, m_ucph, ucp_cleanup, ucp_init, - &entity_param.ctx_params, ucp_config); + UCS_TEST_CREATE_HANDLE_IF_SUPPORTED(ucp_context_h, m_ucph, ucp_cleanup, + ucp_init, &entity_param.ctx_params, + 
ucp_config); } m_workers.resize(num_workers); @@ -433,17 +449,55 @@ void ucp_test_base::entity::connect(const entity* other, } } +/* + * Checks if the client's address matches any IP address on the server's side. + */ +bool ucp_test_base::entity::verify_client_address(struct sockaddr_storage + *client_address) +{ + struct ifaddrs* ifaddrs; + + if (getifaddrs(&ifaddrs) != 0) { + return false; + } + + for (struct ifaddrs *ifa = ifaddrs; ifa != NULL; ifa = ifa->ifa_next) { + if (ucs_netif_flags_is_active(ifa->ifa_flags) && + ucs::is_inet_addr(ifa->ifa_addr)) + { + if (!ucs_sockaddr_ip_cmp((const struct sockaddr*)client_address, + ifa->ifa_addr)) { + freeifaddrs(ifaddrs); + return true; + } + } + } + + freeifaddrs(ifaddrs); + return false; +} + ucp_ep_h ucp_test_base::entity::accept(ucp_worker_h worker, ucp_conn_request_h conn_request) { - ucp_ep_h ep; - ucp_ep_params_t ep_params; - ep_params.field_mask = UCP_EP_PARAM_FIELD_USER_DATA | - UCP_EP_PARAM_FIELD_CONN_REQUEST; - ep_params.user_data = (void *)0xdeadbeef; + ucp_ep_params_t ep_params = *m_server_ep_params; + ucp_conn_request_attr_t attr; + ucs_status_t status; + ucp_ep_h ep; + + attr.field_mask = UCP_CONN_REQUEST_ATTR_FIELD_CLIENT_ADDR; + status = ucp_conn_request_query(conn_request, &attr); + EXPECT_TRUE((status == UCS_OK) || (status == UCS_ERR_UNSUPPORTED)); + if (status == UCS_OK) { + EXPECT_TRUE(verify_client_address(&attr.client_address)); + } + + ep_params.field_mask |= UCP_EP_PARAM_FIELD_CONN_REQUEST | + UCP_EP_PARAM_FIELD_USER_DATA; + ep_params.user_data = reinterpret_cast(this); ep_params.conn_request = conn_request; - ucs_status_t status = ucp_ep_create(worker, &ep_params, &ep); + status = ucp_ep_create(worker, &ep_params, &ep); if (status == UCS_ERR_UNREACHABLE) { UCS_TEST_SKIP_R("Skipping due an unreachable destination (unsupported " "feature or no supported transport to send partial " @@ -476,6 +530,16 @@ void ucp_test_base::entity::empty_send_completion(void *r, ucs_status_t status) void 
ucp_test_base::entity::accept_ep_cb(ucp_ep_h ep, void *arg) { entity *self = reinterpret_cast(arg); int worker_index = 0; /* TODO pass worker index in arg */ + + /* take error handler from test fixture and add user data */ + ucp_ep_params_t ep_params = *self->m_server_ep_params; + ep_params.field_mask &= UCP_EP_PARAM_FIELD_ERR_HANDLER; + ep_params.field_mask |= UCP_EP_PARAM_FIELD_USER_DATA; + ep_params.user_data = reinterpret_cast(self); + + void *req = ucp_ep_modify_nb(ep, &ep_params); + ASSERT_UCS_PTR_OK(req); /* don't expect this operation to block */ + self->set_ep(ep, worker_index, self->get_num_eps(worker_index)); } @@ -508,13 +572,59 @@ void ucp_test_base::entity::fence(int worker_index) const { ASSERT_UCS_OK(status); } -void* ucp_test_base::entity::disconnect_nb(int worker_index, int ep_index, - enum ucp_ep_close_mode mode) const { +void *ucp_test_base::entity::disconnect_nb(int worker_index, int ep_index, + enum ucp_ep_close_mode mode) { ucp_ep_h ep = revoke_ep(worker_index, ep_index); if (ep == NULL) { return NULL; } - return ucp_ep_close_nb(ep, mode); + + void *req = ucp_ep_close_nb(ep, mode); + if (UCS_PTR_IS_PTR(req)) { + m_close_ep_reqs.push_back(req); + return req; + } + + ASSERT_UCS_OK(UCS_PTR_STATUS(req)); + return NULL; +} + +void ucp_test_base::entity::close_ep_req_free(void *close_req) { + if (close_req == NULL) { + return; + } + + ucs_status_t status = UCS_PTR_IS_ERR(close_req) ? 
UCS_PTR_STATUS(close_req) : + ucp_request_check_status(close_req); + ASSERT_NE(UCS_INPROGRESS, status) << "free not completed EP close request"; + if (status != UCS_OK) { + UCS_TEST_MESSAGE << "ucp_ep_close_nb completed with status " + << ucs_status_string(status); + } + + m_close_ep_reqs.erase(std::find(m_close_ep_reqs.begin(), + m_close_ep_reqs.end(), close_req)); + ucp_request_free(close_req); +} + +void ucp_test_base::entity::close_all_eps(const ucp_test &test, int worker_idx, + enum ucp_ep_close_mode mode) { + for (int j = 0; j < get_num_eps(worker_idx); j++) { + disconnect_nb(worker_idx, j, mode); + } + + ucs_time_t deadline = ucs::get_deadline(); + while (!m_close_ep_reqs.empty() && (ucs_get_time() < deadline)) { + void *req = m_close_ep_reqs.front(); + while (!is_request_completed(req)) { + test.progress(worker_idx); + } + + close_ep_req_free(req); + } + + EXPECT_TRUE(m_close_ep_reqs.empty()) << m_close_ep_reqs.size() + << " endpoints were not closed"; } void ucp_test_base::entity::destroy_worker(int worker_index) { @@ -545,7 +655,9 @@ ucp_ep_h ucp_test_base::entity::revoke_ep(int worker_index, int ep_index) const ucs_status_t ucp_test_base::entity::listen(listen_cb_type_t cb_type, const struct sockaddr* saddr, - socklen_t addrlen, int worker_index) + socklen_t addrlen, + const ucp_ep_params_t& ep_params, + int worker_index) { ucp_listener_params_t params; ucp_listener_h listener; @@ -574,6 +686,9 @@ ucs_status_t ucp_test_base::entity::listen(listen_cb_type_t cb_type, UCS_TEST_ABORT("invalid test parameter"); } + m_server_ep_params.reset(new ucp_ep_params_t(ep_params), + ucs::deleter); + ucs_status_t status; { scoped_log_handler wrap_err(wrap_errors_logger); @@ -583,10 +698,13 @@ ucs_status_t ucp_test_base::entity::listen(listen_cb_type_t cb_type, if (status == UCS_OK) { m_listener.reset(listener, ucp_listener_destroy); } else { - /* throw error if status is not (UCS_OK or UCS_ERR_UNREACHABLE). 
+ /* throw error if status is not (UCS_OK or UCS_ERR_UNREACHABLE or + * UCS_ERR_BUSY). * UCS_ERR_INVALID_PARAM may also return but then the test should fail */ - EXPECT_EQ(UCS_ERR_UNREACHABLE, status); + EXPECT_TRUE((status == UCS_ERR_UNREACHABLE) || + (status == UCS_ERR_BUSY)) << ucs_status_string(status); } + return status; } @@ -602,6 +720,10 @@ ucp_context_h ucp_test_base::entity::ucph() const { return m_ucph; } +ucp_listener_h ucp_test_base::entity::listenerh() const { + return m_listener; +} + unsigned ucp_test_base::entity::progress(int worker_index) { ucp_worker_h ucp_worker = worker(worker_index); @@ -630,14 +752,25 @@ int ucp_test_base::entity::get_num_eps(int worker_index) const { return m_workers[worker_index].second.size(); } -size_t ucp_test_base::entity::get_rejected_cntr() const { - return m_rejected_cntr; +void ucp_test_base::entity::add_err(ucs_status_t status) { + switch (status) { + case UCS_ERR_REJECTED: + ++m_rejected_cntr; + /* fall through */ + default: + ++m_err_cntr; + } + + EXPECT_EQ(1ul, m_err_cntr) << "error callback is called more than once"; } -void ucp_test_base::entity::inc_rejected_cntr() { - ++m_rejected_cntr; +const size_t &ucp_test_base::entity::get_err_num_rejected() const { + return m_rejected_cntr; } +const size_t &ucp_test_base::entity::get_err_num() const { + return m_err_cntr; +} void ucp_test_base::entity::warn_existing_eps() const { for (size_t worker_index = 0; worker_index < m_workers.size(); ++worker_index) { @@ -650,6 +783,31 @@ void ucp_test_base::entity::warn_existing_eps() const { } } +double ucp_test_base::entity::set_ib_ud_timeout(double timeout_sec) +{ + double prev_timeout_sec = 0.; +#if HAVE_IB + for (ucp_rsc_index_t rsc_index = 0; + rsc_index < ucph()->num_tls; ++rsc_index) { + ucp_worker_iface_t *wiface = ucp_worker_iface(worker(), rsc_index); + // check if the iface is ud transport + if (wiface->iface->ops.iface_flush == uct_ud_iface_flush) { + uct_ud_iface_t *iface = + ucs_derived_of(wiface->iface, 
uct_ud_iface_t); + + uct_ud_enter(iface); + if (!prev_timeout_sec) { + prev_timeout_sec = ucs_time_to_sec(iface->config.peer_timeout); + } + + iface->config.peer_timeout = ucs_time_from_sec(timeout_sec); + uct_ud_leave(iface); + } + } +#endif + return prev_timeout_sec; +} + void ucp_test_base::entity::cleanup() { m_listener.reset(); m_workers.clear(); @@ -671,3 +829,57 @@ void ucp_test_base::entity::ep_destructor(ucp_ep_h ep, entity *e) EXPECT_EQ(UCS_OK, status); ucp_request_release(req); } + +bool ucp_test_base::is_request_completed(void *request) { + return (request == NULL) || + (ucp_request_check_status(request) != UCS_INPROGRESS); +} + +ucp_test::mapped_buffer::mapped_buffer(size_t size, const entity& entity, + int flags, ucs_memory_type_t mem_type) : + mem_buffer(size, mem_type), m_entity(entity), m_memh(NULL), + m_rkey_buffer(NULL) +{ + ucs_status_t status; + + if (flags & (UCP_MEM_MAP_ALLOCATE|UCP_MEM_MAP_FIXED)) { + UCS_TEST_ABORT("mapped_buffer does not support allocation by UCP"); + } + + ucp_mem_map_params_t params; + params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | + UCP_MEM_MAP_PARAM_FIELD_LENGTH | + UCP_MEM_MAP_PARAM_FIELD_FLAGS; + params.flags = flags; + params.address = ptr(); + params.length = size; + + status = ucp_mem_map(m_entity.ucph(), ¶ms, &m_memh); + ASSERT_UCS_OK(status); + + size_t rkey_buffer_size; + status = ucp_rkey_pack(m_entity.ucph(), m_memh, &m_rkey_buffer, + &rkey_buffer_size); + ASSERT_UCS_OK(status); +} + +ucp_test::mapped_buffer::~mapped_buffer() +{ + ucp_rkey_buffer_release(m_rkey_buffer); + ucs_status_t status = ucp_mem_unmap(m_entity.ucph(), m_memh); + EXPECT_UCS_OK(status); +} + +ucs::handle ucp_test::mapped_buffer::rkey(const entity& entity) const +{ + ucp_rkey_h rkey; + + ucs_status_t status = ucp_ep_rkey_unpack(entity.ep(), m_rkey_buffer, &rkey); + ASSERT_UCS_OK(status); + return ucs::handle(rkey, ucp_rkey_destroy); +} + +ucp_mem_h ucp_test::mapped_buffer::memh() const +{ + return m_memh; +} diff --git 
a/test/gtest/ucp/ucp_test.h b/test/gtest/ucp/ucp_test.h index 76c431ad9b9..6a23ed8c3a3 100644 --- a/test/gtest/ucp/ucp_test.h +++ b/test/gtest/ucp/ucp_test.h @@ -8,6 +8,7 @@ #include #include +#include /* ucp version compile time test */ #if (UCP_API_VERSION != UCP_VERSION(UCP_API_MAJOR,UCP_API_MINOR)) @@ -18,7 +19,15 @@ #include -#define MT_TEST_NUM_THREADS 4 +#if _OPENMP +#include "omp.h" +#endif + +#if _OPENMP && ENABLE_MT +#define MT_TEST_NUM_THREADS omp_get_max_threads() +#else +#define MT_TEST_NUM_THREADS 4 +#endif namespace ucp { @@ -33,18 +42,21 @@ struct ucp_test_param { int thread_type; }; +class ucp_test; // forward declaration + class ucp_test_base : public ucs::test_base { public: enum { SINGLE_THREAD = 42, - MULTI_THREAD_CONTEXT, - MULTI_THREAD_WORKER + MULTI_THREAD_CONTEXT, /* workers are single-threaded, context is mt-shared */ + MULTI_THREAD_WORKER /* workers are multi-threaded, cotnext is mt-single */ }; class entity { - typedef std::vector > ep_vec_t; + typedef std::vector > ep_vec_t; typedef std::vector, ep_vec_t> > worker_vec_t; + typedef std::deque close_ep_reqs_t; public: typedef enum { @@ -54,13 +66,16 @@ class ucp_test_base : public ucs::test_base { } listen_cb_type_t; entity(const ucp_test_param& test_param, ucp_config_t* ucp_config, - const ucp_worker_params_t& worker_params); + const ucp_worker_params_t& worker_params, + const ucp_test_base* test_owner); ~entity(); void connect(const entity* other, const ucp_ep_params_t& ep_params, int ep_idx = 0, int do_set_ep = 1); + bool verify_client_address(struct sockaddr_storage *client_address); + ucp_ep_h accept(ucp_worker_h worker, ucp_conn_request_h conn_request); void* modify_ep(const ucp_ep_params_t& ep_params, int worker_idx = 0, @@ -73,12 +88,18 @@ class ucp_test_base : public ucs::test_base { void fence(int worker_index = 0) const; void* disconnect_nb(int worker_index = 0, int ep_index = 0, - enum ucp_ep_close_mode mode = UCP_EP_CLOSE_MODE_FLUSH) const; + enum ucp_ep_close_mode mode = 
UCP_EP_CLOSE_MODE_FLUSH); + + void close_ep_req_free(void *close_req); + + void close_all_eps(const ucp_test &test, int wirker_idx, + enum ucp_ep_close_mode mode = UCP_EP_CLOSE_MODE_FLUSH); void destroy_worker(int worker_index = 0); ucs_status_t listen(listen_cb_type_t cb_type, const struct sockaddr *saddr, socklen_t addrlen, + const ucp_ep_params_t& ep_params, int worker_index = 0); ucp_ep_h ep(int worker_index = 0, int ep_index = 0) const; @@ -89,18 +110,24 @@ class ucp_test_base : public ucs::test_base { ucp_context_h ucph() const; + ucp_listener_h listenerh() const; + unsigned progress(int worker_index = 0); int get_num_workers() const; int get_num_eps(int worker_index = 0) const; - void inc_rejected_cntr(); + void add_err(ucs_status_t status); + + const size_t &get_err_num_rejected() const; - size_t get_rejected_cntr() const; + const size_t &get_err_num() const; void warn_existing_eps() const; + double set_ib_ud_timeout(double timeout_sec); + void cleanup(); static void ep_destructor(ucp_ep_h ep, entity *e); @@ -110,7 +137,10 @@ class ucp_test_base : public ucs::test_base { worker_vec_t m_workers; ucs::handle m_listener; std::queue m_conn_reqs; + close_ep_reqs_t m_close_ep_reqs; + size_t m_err_cntr; size_t m_rejected_cntr; + ucs::handle m_server_ep_params; private: static void empty_send_completion(void *r, ucs_status_t status); @@ -120,6 +150,8 @@ class ucp_test_base : public ucs::test_base { void set_ep(ucp_ep_h ep, int worker_index, int ep_index); }; + + static bool is_request_completed(void *req); }; /** @@ -167,25 +199,33 @@ class ucp_test : public ucp_test_base, void stats_activate(); void stats_restore(); +private: + static void set_ucp_config(ucp_config_t *config, + const ucp_test_param& test_param); + static bool check_test_param(const std::string& name, + const std::string& test_case_name, + const ucp_test_param& test_param); + protected: virtual void init(); bool is_self() const; virtual void cleanup(); + virtual bool has_transport(const 
std::string& tl_name) const; + bool has_any_transport(const std::vector& tl_names) const; entity* create_entity(bool add_in_front = false); entity* create_entity(bool add_in_front, const ucp_test_param& test_param); - entity* get_entity_by_ep(ucp_ep_h ep); unsigned progress(int worker_index = 0) const; void short_progress_loop(int worker_index = 0) const; void flush_ep(const entity &e, int worker_index = 0, int ep_index = 0); void flush_worker(const entity &e, int worker_index = 0); - void disconnect(const entity& entity); + void disconnect(entity& entity); void wait(void *req, int worker_index = 0); void set_ucp_config(ucp_config_t *config); int max_connections(); static void err_handler_cb(void *arg, ucp_ep_h ep, ucs_status_t status) { - ucp_test *self = reinterpret_cast(arg); - self->m_err_handler_count++; + entity *e = reinterpret_cast(arg); + e->add_err(status); } template @@ -196,17 +236,25 @@ class ucp_test : public ucp_test_base, } } -private: - static void set_ucp_config(ucp_config_t *config, - const ucp_test_param& test_param); - static bool check_test_param(const std::string& name, - const std::string& test_case_name, - const ucp_test_param& test_param); - -protected: - volatile int m_err_handler_count; static const ucp_datatype_t DATATYPE; static const ucp_datatype_t DATATYPE_IOV; + +protected: + class mapped_buffer : public mem_buffer { + public: + mapped_buffer(size_t size, const entity& entity, int flags = 0, + ucs_memory_type_t mem_type = UCS_MEMORY_TYPE_HOST); + virtual ~mapped_buffer(); + + ucs::handle rkey(const entity& entity) const; + + ucp_mem_h memh() const; + + private: + const entity& m_entity; + ucp_mem_h m_memh; + void* m_rkey_buffer; + }; }; @@ -215,7 +263,7 @@ std::ostream& operator<<(std::ostream& os, const ucp_test_param& test_param); /** * Instantiate the parameterized test case a combination of transports. * - * @param _test_case Test case class, derived from uct_test. + * @param _test_case Test case class, derived from ucp_test. 
* @param _name Instantiation name. * @param ... Transport names. */ @@ -230,17 +278,42 @@ std::ostream& operator<<(std::ostream& os, const ucp_test_param& test_param); /** * Instantiate the parameterized test case for all transport combinations. * - * @param _test_case Test case class, derived from uct_test. + * @param _test_case Test case class, derived from ucp_test. */ #define UCP_INSTANTIATE_TEST_CASE(_test_case) \ UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, dcx, "dc_x") \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, ud, "ud") \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, ud, "ud_v") \ UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, udx, "ud_x") \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, rc, "rc") \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, rc, "rc_v") \ UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, rcx, "rc_x") \ UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, shm_ib, "shm,ib") \ UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, ugni, "ugni") \ UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, self, "self") \ UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, tcp, "tcp") + +/** + * The list of GPU copy TLs + */ +#define UCP_TEST_GPU_COPY_TLS "cuda_copy,rocm_copy" + + +/** + * Instantiate the parameterized test case for all transport combinations + * with GPU memory awareness + * + * @param _test_case Test case class, derived from ucp_test. 
+ */ +#define UCP_INSTANTIATE_TEST_CASE_GPU_AWARE(_test_case) \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, dcx, "dc_x," UCP_TEST_GPU_COPY_TLS) \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, ud, "ud_v," UCP_TEST_GPU_COPY_TLS) \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, udx, "ud_x," UCP_TEST_GPU_COPY_TLS) \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, rc, "rc_v," UCP_TEST_GPU_COPY_TLS) \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, rcx, "rc_x," UCP_TEST_GPU_COPY_TLS) \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, shm_ib, "shm,ib," UCP_TEST_GPU_COPY_TLS) \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, shm_ib_ipc, "shm,ib,cuda_ipc,rocm_ipc," \ + UCP_TEST_GPU_COPY_TLS) \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, ugni, "ugni," UCP_TEST_GPU_COPY_TLS) \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, tcp, "tcp," UCP_TEST_GPU_COPY_TLS) + #endif diff --git a/test/gtest/ucs/arch/test_x86_64.cc b/test/gtest/ucs/arch/test_x86_64.cc new file mode 100644 index 00000000000..2e440dc6dda --- /dev/null +++ b/test/gtest/ucs/arch/test_x86_64.cc @@ -0,0 +1,105 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#if defined(__x86_64__) + +#include +extern "C" { +#include +#include +#include +} + +#include + +class test_arch : public ucs::test { +protected: + /* have to add wrapper for ucs_memcpy_relaxed because pure "C" inline call could + * not be used as template argument */ + static inline void *memcpy_relaxed(void *dst, const void *src, size_t size) + { + return ucs_memcpy_relaxed(dst, src, size); + } + + template + double measure_memcpy_bandwidth(size_t size) + { + ucs_time_t start_time, end_time; + void *src, *dst; + double result = 0.0; + int iter; + + src = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (src == MAP_FAILED) { + goto out; + } + + dst = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (dst == MAP_FAILED) { + goto out_unmap_src; + } + + memset(dst, 0, size); + memset(src, 0, size); + memcpy(dst, src, size); + + iter = 0; + start_time = ucs_get_time(); + do { + C(dst, src, size); + end_time = ucs_get_time(); + ++iter; + } while (end_time < start_time + ucs_time_from_sec(0.5)); + + result = size * iter / ucs_time_to_sec(end_time - start_time); + + munmap(dst, size); + out_unmap_src: + munmap(src, size); + out: + return result; + } +}; + +UCS_TEST_SKIP_COND_F(test_arch, memcpy, RUNNING_ON_VALGRIND || !ucs::perf_retry_count) { + const double diff = 0.95; /* allow 5% fluctuations */ + const double timeout = 30; /* 30 seconds to complete test successfully */ + double memcpy_bw = 0; + double memcpy_relax_bw = 0; + double secs; + size_t size; + char memunits_str[256]; + char thresh_min_str[16]; + char thresh_max_str[16]; + int i; + + ucs_memunits_to_str(ucs_global_opts.arch.builtin_memcpy_min, + thresh_min_str, sizeof(thresh_min_str)); + ucs_memunits_to_str(ucs_global_opts.arch.builtin_memcpy_max, + thresh_max_str, sizeof(thresh_max_str)); + UCS_TEST_MESSAGE << "Using memcpy relaxed for size " << + thresh_min_str << ".." 
<< + thresh_max_str; + for (size = 4096; size <= 256 * UCS_MBYTE; size *= 2) { + secs = ucs_get_accurate_time(); + for (i = 0; ucs_get_accurate_time() - secs < timeout; i++) { + memcpy_bw = measure_memcpy_bandwidth(size); + memcpy_relax_bw = measure_memcpy_bandwidth(size); + if (memcpy_relax_bw / memcpy_bw >= diff) { + break; + } + usleep(1000); /* allow other tasks to complete */ + } + ucs_memunits_to_str(size, memunits_str, sizeof(memunits_str)); + UCS_TEST_MESSAGE << memunits_str << + " memcpy: " << (memcpy_bw / UCS_GBYTE) << + "GB/s memcpy relaxed: " << (memcpy_relax_bw / UCS_GBYTE) << + "GB/s iterations: " << i + 1; + EXPECT_GE(memcpy_relax_bw / memcpy_bw, diff); + } +} + +#endif diff --git a/test/gtest/ucs/test_algorithm.cc b/test/gtest/ucs/test_algorithm.cc index 2fa98cebdb9..2e4026d98ab 100644 --- a/test/gtest/ucs/test_algorithm.cc +++ b/test/gtest/ucs/test_algorithm.cc @@ -25,15 +25,12 @@ class test_algorithm : public ucs::test { return compare_func(elem1, elem2); } - - - static void * MAGIC; + static void *MAGIC; }; -void * test_algorithm::MAGIC = (void*)0xdeadbeef1ee7a880ull; +void *test_algorithm::MAGIC = (void*)0xdeadbeef1ee7a880ull; UCS_TEST_F(test_algorithm, qsort_r) { - for (int i = 0; i < 1000 / ucs::test_time_multiplier(); ++i) { unsigned nmemb = ucs::rand() % 100; @@ -50,9 +47,76 @@ UCS_TEST_F(test_algorithm, qsort_r) { } } -UCS_TEST_F(test_algorithm, crc16_string) { - UCS_TEST_MESSAGE << "crc16 of '123456789' is 0x" << std::hex << - ucs_crc16_string("123456789") << std::dec; - EXPECT_NE(ucs_crc16_string("123456789"), - ucs_crc16_string("12345")); +UCS_TEST_F(test_algorithm, crc16) { + std::string test_str; + + test_str = ""; + EXPECT_EQ(0u, ucs_crc16_string(test_str.c_str())); + + test_str = "0"; + EXPECT_EQ(0xc1fbu, ucs_crc16_string(test_str.c_str())); + + test_str = "01"; + EXPECT_EQ(0x99efu, ucs_crc16_string(test_str.c_str())); + + test_str = "012"; + EXPECT_EQ(0xfd89u, ucs_crc16_string(test_str.c_str())); + + test_str = "0123"; + 
EXPECT_EQ(0xea54u, ucs_crc16_string(test_str.c_str())); + + test_str = "01234"; + EXPECT_EQ(0x9394u, ucs_crc16_string(test_str.c_str())); + + test_str = "012345"; + EXPECT_EQ(0x4468u, ucs_crc16_string(test_str.c_str())); + + test_str = "0123456"; + EXPECT_EQ(0x4bc7u, ucs_crc16_string(test_str.c_str())); + + test_str = "01234567"; + EXPECT_EQ(0x07bcu, ucs_crc16_string(test_str.c_str())); + + test_str = "012345678"; + EXPECT_EQ(0x3253u, ucs_crc16_string(test_str.c_str())); + + test_str = "0123456789"; + EXPECT_EQ(0x3c16u, ucs_crc16_string(test_str.c_str())); +} + +UCS_TEST_F(test_algorithm, crc32) { + std::string test_str; + + test_str = ""; + EXPECT_EQ(0u, ucs_crc32(0, test_str.c_str(), test_str.size())); + + test_str = "0"; + EXPECT_EQ(0xf4dbdf21ul, ucs_crc32(0, test_str.c_str(), test_str.size())); + + test_str = "01"; + EXPECT_EQ(0xcf412436ul, ucs_crc32(0, test_str.c_str(), test_str.size())); + + test_str = "012"; + EXPECT_EQ(0xd5a06ab0ul, ucs_crc32(0, test_str.c_str(), test_str.size())); + + test_str = "0123"; + EXPECT_EQ(0xa6669d7dul, ucs_crc32(0, test_str.c_str(), test_str.size())); + + test_str = "01234"; + EXPECT_EQ(0xdda47024ul, ucs_crc32(0, test_str.c_str(), test_str.size())); + + test_str = "012345"; + EXPECT_EQ(0xb86f6b0ful, ucs_crc32(0, test_str.c_str(), test_str.size())); + + test_str = "0123456"; + EXPECT_EQ(0x8dbf08eeul, ucs_crc32(0, test_str.c_str(), test_str.size())); + + test_str = "01234567"; + EXPECT_EQ(0x2d803af5ul, ucs_crc32(0, test_str.c_str(), test_str.size())); + + test_str = "012345678"; + EXPECT_EQ(0x37fad1baul, ucs_crc32(0, test_str.c_str(), test_str.size())); + + test_str = "0123456789"; + EXPECT_EQ(0xa684c7c6ul, ucs_crc32(0, test_str.c_str(), test_str.size())); } diff --git a/test/gtest/ucs/test_arbiter.cc b/test/gtest/ucs/test_arbiter.cc index fbdaa2cac33..c6c8a6b23b6 100644 --- a/test/gtest/ucs/test_arbiter.cc +++ b/test/gtest/ucs/test_arbiter.cc @@ -18,6 +18,7 @@ class test_arbiter : public ucs::test { protected: static 
ucs_arbiter_cb_result_t resched_groups(ucs_arbiter_t *arbitrer, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { @@ -73,7 +74,7 @@ class test_arbiter : public ucs::test { for (j = 0; j < nelems_per_group; j++) { if (push_head) { int rev_j = nelems_per_group - 1 - j; - ucs_arbiter_group_push_head_elem(NULL, &groups[i], + ucs_arbiter_group_push_head_elem(&groups[i], &elems[nelems_per_group*i+rev_j]); } else { ucs_arbiter_group_push_elem(&groups[i], @@ -119,6 +120,7 @@ class test_arbiter : public ucs::test { } static ucs_arbiter_cb_result_t dispatch_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { @@ -127,18 +129,18 @@ class test_arbiter : public ucs::test { } static ucs_arbiter_cb_result_t dispatch_dummy_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; } - ucs_arbiter_cb_result_t desched_group(ucs_arbiter_elem_t *elem) + ucs_arbiter_cb_result_t desched_group(ucs_arbiter_group_t *group, + ucs_arbiter_elem_t *elem) { - ucs_arbiter_group_t *g = ucs_arbiter_elem_group(elem); - //ucs_warn("desched group %d", m_count); m_count++; - ucs_arbiter_group_schedule(&m_arb2, g); + ucs_arbiter_group_schedule(&m_arb2, group); return UCS_ARBITER_CB_RESULT_DESCHED_GROUP; } @@ -149,22 +151,25 @@ class test_arbiter : public ucs::test { } static ucs_arbiter_cb_result_t desched_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { test_arbiter *self = (test_arbiter *)arg; - return self->desched_group(elem); + return self->desched_group(group, elem); } static ucs_arbiter_cb_result_t remove_cb(ucs_arbiter_t *arbiter, - ucs_arbiter_elem_t *elem, - void *arg) + ucs_arbiter_group_t *group, + ucs_arbiter_elem_t *elem, + void *arg) { test_arbiter *self = (test_arbiter *)arg; return self->remove_elem(elem); } static ucs_arbiter_cb_result_t stop_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, 
ucs_arbiter_elem_t *elem, void *arg) { @@ -172,6 +177,7 @@ class test_arbiter : public ucs::test { } static ucs_arbiter_cb_result_t purge_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { @@ -181,6 +187,7 @@ class test_arbiter : public ucs::test { } static ucs_arbiter_cb_result_t count_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { @@ -193,6 +200,7 @@ class test_arbiter : public ucs::test { } static ucs_arbiter_cb_result_t purge_cond_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { @@ -209,6 +217,7 @@ class test_arbiter : public ucs::test { static ucs_arbiter_cb_result_t purge_dummy_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, ucs_arbiter_elem_t *elem, void *arg) { @@ -312,7 +321,7 @@ UCS_TEST_F(test_arbiter, purge_cond) { for (int j = 0; j < num_elems; ++j) { arb_elem *e = new arb_elem; - if (ucs::rand() % 2) { + if ((ucs::rand() % 2) == 0) { e->release = true; ++purged_count[i]; } else { @@ -422,7 +431,7 @@ UCS_TEST_F(test_arbiter, multiple_dispatch) { ucs_arbiter_dispatch(&arbiter, 1, dispatch_cb, this); - ASSERT_TRUE(arbiter.current == NULL); + ASSERT_TRUE(ucs_arbiter_is_empty(&arbiter)); /* Release detached groups */ for (unsigned i = 0; i < m_num_groups; ++i) { @@ -471,7 +480,7 @@ UCS_TEST_F(test_arbiter, resched) { m_count = 0; ucs_arbiter_dispatch_nonempty(&arbiter, 3, remove_cb, this); EXPECT_EQ(1, m_count); - ASSERT_TRUE(arbiter.current == NULL); + ASSERT_TRUE(ucs_arbiter_is_empty(&arbiter)); ucs_arbiter_group_cleanup(&group2); ucs_arbiter_group_cleanup(&group1); @@ -543,6 +552,38 @@ UCS_TEST_F(test_arbiter, move_group) { ucs_arbiter_cleanup(&m_arb2); } +/* Simulates a bug fixed in UCX GH issue #5382 + * (https://github.com/openucx/ucx/issues/5382). 
+ * The failing flow (with DC transport) is: + * - DCI waiting arbiter is being dispatched + * - In the dispatch callback group is scheduled to the TX waiting arbiter and + * UCS_ARBITER_CB_RESULT_DESCHED_GROUP is returned from the callback + * - Now the group is scheduled on TX waiting arbiter + * - ucs_arbiter_group_desched is called in uct_dc_mlx5_iface_dci_put + * - ARBITER_CHECK assert fails + */ +UCS_TEST_F(test_arbiter, move_group_and_desched) { + + ucs_arbiter_group_t group1; + ucs_arbiter_elem_t elem1; + + ucs_arbiter_init(&m_arb1); + ucs_arbiter_init(&m_arb2); + + ucs_arbiter_group_init(&group1); + ucs_arbiter_elem_init(&elem1); + ucs_arbiter_group_push_elem(&group1, &elem1); + ucs_arbiter_group_schedule(&m_arb1, &group1); + + m_count = 0; + ucs_arbiter_dispatch(&m_arb1, 1, desched_cb, this); + EXPECT_EQ(1, m_count); + ucs_arbiter_group_desched(&m_arb2, &group1); + + ucs_arbiter_cleanup(&m_arb1); + ucs_arbiter_cleanup(&m_arb2); +} + UCS_TEST_F(test_arbiter, push_head_scheduled) { ucs_arbiter_group_t group1; @@ -560,8 +601,8 @@ UCS_TEST_F(test_arbiter, push_head_scheduled) { ucs_arbiter_elem_init(&elem3.elem); elem1.count = elem2.count = elem3.count = 0; - ucs_arbiter_group_push_head_elem(&m_arb1, &group1, &elem1.elem); - ucs_arbiter_group_push_head_elem(&m_arb1, &group2, &elem2.elem); + ucs_arbiter_group_push_head_elem(&group1, &elem1.elem); + ucs_arbiter_group_push_head_elem(&group2, &elem2.elem); ucs_arbiter_group_schedule(&m_arb1, &group1); ucs_arbiter_group_schedule(&m_arb1, &group2); @@ -574,7 +615,7 @@ UCS_TEST_F(test_arbiter, push_head_scheduled) { EXPECT_EQ(0, elem3.count); /* Adding new head elem to group2 */ - ucs_arbiter_group_push_head_elem(&m_arb1, &group2, &elem3.elem); + ucs_arbiter_group_push_head_elem(&group2, &elem3.elem); m_count = 0; ucs_arbiter_dispatch(&m_arb1, 1, count_cb, this); @@ -588,9 +629,9 @@ UCS_TEST_F(test_arbiter, push_head_scheduled) { EXPECT_EQ(3, m_count); /* Add to single scheduled group */ - 
ucs_arbiter_group_push_head_elem(&m_arb1, &group2, &elem2.elem); + ucs_arbiter_group_push_head_elem(&group2, &elem2.elem); ucs_arbiter_group_schedule(&m_arb1, &group2); - ucs_arbiter_group_push_head_elem(&m_arb1, &group2, &elem3.elem); + ucs_arbiter_group_push_head_elem(&group2, &elem3.elem); m_count = 0; elem2.count = elem3.count = 0; @@ -711,7 +752,7 @@ UCS_TEST_F(test_arbiter, result_stop) { for (int i = 0; i < N + 3; i++) { ucs_arbiter_dispatch(&m_arb1, 1, stop_cb, this); /* arbiter current position must not change on STOP */ - EXPECT_EQ(m_arb1.current, groups[0].tail->next); + EXPECT_EQ(m_arb1.list.next, &groups[0].tail->next->list); } m_count = 0; @@ -723,3 +764,316 @@ UCS_TEST_F(test_arbiter, result_stop) { delete [] groups; delete [] elems; } + +class test_arbiter_resched_from_dispatch : public ucs::test { +public: + virtual void init() { + ucs::test::init(); + ucs_arbiter_init(&m_arb); + ucs_arbiter_group_init(&m_group1); + ucs_arbiter_group_init(&m_group2); + ucs_arbiter_elem_init(&m_elem); + } + + virtual void cleanup() { + ucs_arbiter_cleanup(&m_arb); + ucs::test::cleanup(); + } + +protected: + + /* the callback pushes the elem on group2 and schedules it */ + virtual ucs_arbiter_cb_result_t dispatch(ucs_arbiter_group_t *group, + ucs_arbiter_elem_t *elem) + { + if (m_moved) { + return UCS_ARBITER_CB_RESULT_STOP; + } else { + EXPECT_EQ(&m_elem, elem); + ucs_arbiter_group_push_elem(&m_group2, elem); + ucs_arbiter_group_schedule(&m_arb, &m_group2); + m_moved = true; + } + return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; + } + + static ucs_arbiter_cb_result_t purge_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, + ucs_arbiter_elem_t *elem, void *arg) + { + return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; + } + + static ucs_arbiter_cb_result_t dispatch_cb(ucs_arbiter_t *arbiter, + ucs_arbiter_group_t *group, + ucs_arbiter_elem_t *elem, + void *arg) + { + test_arbiter_resched_from_dispatch *self = + reinterpret_cast(arg); + EXPECT_EQ(&self->m_arb, arbiter); + 
return self->dispatch(group, elem); + } + + void check_group_state(ucs_arbiter_group_t *group, bool is_scheduled) + { + EXPECT_EQ(is_scheduled, ucs_arbiter_group_is_scheduled(group)); + EXPECT_EQ(is_scheduled, !ucs_arbiter_group_is_empty(group)); + } + + ucs_arbiter_t m_arb; + ucs_arbiter_group_t m_group1, m_group2; + ucs_arbiter_elem_t m_elem; + bool m_moved; +}; + +/* from the arbiter dispatch callback, reschedule the element on another group, + * and remove it from current group + */ +UCS_TEST_F(test_arbiter_resched_from_dispatch, remove_and_resched) { + + m_moved = false; + + ucs_arbiter_group_push_elem(&m_group1, &m_elem); + ucs_arbiter_group_schedule(&m_arb, &m_group1); + + /* group1 should be scheduled, group2 not */ + check_group_state(&m_group1, true); + check_group_state(&m_group2, false); + + ucs_arbiter_dispatch(&m_arb, 1, dispatch_cb, this); + + /* the dispatch should deschedule group1 and schedule group2 instead */ + check_group_state(&m_group1, false); + check_group_state(&m_group2, true); + + ucs_arbiter_group_purge(&m_arb, &m_group2, purge_cb, NULL); +} + +class test_arbiter_random_resched : public test_arbiter_resched_from_dispatch { +public: + test_arbiter_random_resched(); + +protected: + virtual ucs_arbiter_cb_result_t dispatch(ucs_arbiter_group_t *_group, + ucs_arbiter_elem_t *elem); + + void do_test_loop(unsigned num_groups, unsigned elems_per_group, + unsigned dispatch_per_group); + +private: + typedef struct { + ucs_arbiter_group_t super; + unsigned num_elems; + } arb_group_t; + + void reset_counters(); + + void add_new_elem(arb_group_t *group); + + void do_test(unsigned iteration_num, unsigned num_groups, + unsigned elems_per_group, unsigned dispatch_per_group); + + std::vector m_groups; + unsigned m_num_dispatch; + unsigned m_num_only; + unsigned m_num_added; + unsigned m_num_removed; + unsigned m_num_push_self; + unsigned m_num_push_another; + unsigned m_num_next_group; + unsigned m_num_desched; + unsigned m_num_resched; +}; + 
+test_arbiter_random_resched::test_arbiter_random_resched() +{ + reset_counters(); +} + +void test_arbiter_random_resched::reset_counters() +{ + m_num_dispatch = 0; + m_num_only = 0; + m_num_added = 0; + m_num_removed = 0; + m_num_push_self = 0; + m_num_push_another = 0; + m_num_next_group = 0; + m_num_desched = 0; + m_num_resched = 0; +} + +void test_arbiter_random_resched::add_new_elem(arb_group_t *group) +{ + ucs_arbiter_elem_t *elem = new ucs_arbiter_elem_t; + + ucs_arbiter_elem_init(elem); + ucs_arbiter_group_push_elem(&group->super, elem); + ++group->num_elems; + ++m_num_added; +} + +ucs_arbiter_cb_result_t +test_arbiter_random_resched::dispatch(ucs_arbiter_group_t *_group, + ucs_arbiter_elem_t *elem) +{ + arb_group_t *group = ucs_derived_of(_group, arb_group_t); + arb_group_t *new_group; + + ++m_num_dispatch; + + /* Test ucs_arbiter_group_num_elems() */ + EXPECT_EQ(group->num_elems, ucs_arbiter_group_num_elems(&group->super)); + + /* We should be able to reschedule this group to another place */ + EXPECT_FALSE(ucs_arbiter_group_is_scheduled(&group->super)); + + /* Test ucs_arbiter_elem_is_only() */ + if (group->num_elems == 1) { + ++m_num_only; + EXPECT_TRUE(ucs_arbiter_elem_is_only(elem)); + } + + /* Randomly add few more elements to same group */ + while ((ucs::rand() % 4) == 0) { + add_new_elem(group); + if ((ucs::rand() % 2) == 0) { + ucs_arbiter_group_schedule(&m_arb, &group->super); + } + } + + if ((ucs::rand() % 2) == 0) { + /* Remove the current element. + * Must remove elements with higher probability than adding to avoid + * infinite loop. + */ + if ((ucs::rand() % 4) == 0) { + /* push the removed element to a random group. It could be either + * the current group or a new group, both cases should work. 
*/ + new_group = &m_groups[ucs::rand() % m_groups.size()]; + ucs_arbiter_group_push_elem(&new_group->super, elem); + + if (new_group == group) { + ++m_num_push_self; + if ((ucs::rand() % 2) == 0) { + ucs_arbiter_group_schedule(&m_arb, &new_group->super); + } + } else { + /* schedule the new group if it's now the current one */ + ++m_num_push_another; + ucs_arbiter_group_schedule(&m_arb, &new_group->super); + } + + ++new_group->num_elems; + } else { + /* Element is removed permanently, so invalidate and delete it */ + ++m_num_removed; + memset(elem, 0xBB, sizeof(*elem)); + delete elem; + } + --group->num_elems; + return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; + } else { + /* Don't remove the current element, do some other random group action + * instead. + */ + int action = ucs::rand() % 3; + switch (action) { + case 0: + ++m_num_next_group; + return UCS_ARBITER_CB_RESULT_NEXT_GROUP; + case 1: + /* Reschedule the group on same arbiter to keep it going */ + ucs_arbiter_group_schedule(&m_arb, &group->super); + ++m_num_desched; + return UCS_ARBITER_CB_RESULT_DESCHED_GROUP; + case 2: + default: + ++m_num_resched; + return UCS_ARBITER_CB_RESULT_RESCHED_GROUP; + } + } +} + +void test_arbiter_random_resched::do_test(unsigned iteration_num, + unsigned num_groups, + unsigned elems_per_group, + unsigned dispatch_per_group) +{ + arb_group_t* group; + + UCS_TEST_MESSAGE << "Iteration " << iteration_num << ": " + << num_groups << " m_groups, " + << elems_per_group << " elements each"; + + /* Add elements and groups */ + m_groups.resize(num_groups); + for (unsigned group_index = 0; group_index < num_groups; ++group_index) { + group = &m_groups[group_index]; + ucs_arbiter_group_init(&group->super); + group->num_elems = 0; + + for (unsigned i = 0; i < elems_per_group; ++i) { + add_new_elem(group); + } + ucs_arbiter_group_schedule(&m_arb, &group->super); + + /* Test arbiter helper functions */ + EXPECT_EQ(elems_per_group, ucs_arbiter_group_num_elems(&group->super)); + if (elems_per_group 
== 0) { + EXPECT_TRUE(ucs_arbiter_group_is_empty(&group->super)); + } + } + + EXPECT_EQ(num_groups * elems_per_group, m_num_added); + + /* Dispatch arbiter until it becomes empty */ + do { + ucs_arbiter_dispatch(&m_arb, dispatch_per_group, dispatch_cb, + reinterpret_cast(this)); + } while (!ucs_arbiter_is_empty(&m_arb)); + + /* Show counters */ + UCS_TEST_MESSAGE << " added: " << m_num_added + << " removed: " << m_num_removed; + UCS_TEST_MESSAGE << " dispatch: " << m_num_dispatch + << " only: " << m_num_only + << " push self: " << m_num_push_self + << " push another: " << m_num_push_another; + UCS_TEST_MESSAGE << " desched: " << m_num_desched + << " resched: " << m_num_resched + << " next_group: " << m_num_next_group; + + /* Check counters */ + EXPECT_EQ(m_num_added, m_num_removed); + + /* Make sure all is removed */ + for (unsigned group_index = 0; group_index < num_groups; ++group_index) { + group = &m_groups[group_index]; + EXPECT_EQ(0u, group->num_elems); + EXPECT_TRUE(ucs_arbiter_group_is_empty(&group->super)); + } +} + +void test_arbiter_random_resched::do_test_loop(unsigned num_groups, + unsigned elems_per_group, + unsigned dispatch_per_group) +{ + for (unsigned i = 0; i < 5; ++i) { + reset_counters(); + do_test(i, num_groups, elems_per_group, dispatch_per_group); + } +} + +UCS_TEST_F(test_arbiter_random_resched, one_elem_one_group) { + do_test_loop(1, 1, 1); +} + +UCS_TEST_F(test_arbiter_random_resched, one_elem_many_groups) { + do_test_loop(42, 1, 1); +} + +UCS_TEST_F(test_arbiter_random_resched, many_elems_many_groups) { + do_test_loop(42, 10, 4); +} diff --git a/test/gtest/ucs/test_async.cc b/test/gtest/ucs/test_async.cc index 996215879ba..858dba26d61 100644 --- a/test/gtest/ucs/test_async.cc +++ b/test/gtest/ucs/test_async.cc @@ -48,7 +48,7 @@ class base { virtual void ack_event() = 0; virtual int event_id() = 0; - static void cb(int id, void *arg) { + static void cb(int id, int events, void *arg) { base *self = reinterpret_cast(arg); self->handler(); } 
@@ -79,9 +79,10 @@ class base_event : public base { } void set_handler(ucs_async_context_t *async) { - ucs_status_t status = ucs_async_set_event_handler(mode(), event_fd(), - POLLIN, cb, this, - async); + ucs_status_t status = + ucs_async_set_event_handler(mode(), event_fd(), + UCS_EVENT_SET_EVREAD, + cb, this, async); ASSERT_UCS_OK(status); base::set_handler(); } @@ -94,9 +95,13 @@ class base_event : public base { ucs_async_pipe_push(&m_event_pipe); } + void reset() { + ucs_async_pipe_drain(&m_event_pipe); + } + protected: virtual void ack_event() { - ucs_async_pipe_drain(&m_event_pipe); + reset(); } private: @@ -225,8 +230,10 @@ class local_timer : public local, public base_timer { public: + static const int TIMER_INTERVAL_USEC = 1000; + local_timer(ucs_async_mode_t mode) : local(mode), base_timer(mode) { - set_timer(&m_async, ucs_time_from_usec(1000)); + set_timer(&m_async, ucs_time_from_usec(TIMER_INTERVAL_USEC)); } ~local_timer() { @@ -242,7 +249,7 @@ public ucs::test_base { protected: static const int COUNT = 40; static const unsigned SLEEP_USEC = 1000; - static const int TIMER_RETRIES = 100; + static const int NUM_RETRIES = 100; static const int TIMER_EXP_COUNT = COUNT / 4; void suspend(double scale = 1.0) { @@ -272,6 +279,20 @@ public ucs::test_base { suspend(scale); } } + + template + void expect_count_GE(E& event, int value) { + for (int retry = 0; retry < NUM_RETRIES; ++retry) { + suspend_and_poll(&event, COUNT); + if (event.count() >= value) { + return; + } + UCS_TEST_MESSAGE << "retry " << (retry + 1); + } + EXPECT_GE(event.count(), value) << "after " << int(NUM_RETRIES) + << " retries"; + } + }; template @@ -364,24 +385,15 @@ class test_async_mt : public test_async { LOCAL* m_ev[NUM_THREADS]; }; - UCS_TEST_P(test_async, global_event) { global_event ge(GetParam()); ge.push_event(); - suspend_and_poll(&ge, COUNT); - EXPECT_GE(ge.count(), 1); + expect_count_GE(ge, 1); } UCS_TEST_P(test_async, global_timer) { global_timer gt(GetParam()); - for (int i = 0; 
i < TIMER_RETRIES; ++i) { - suspend_and_poll(>, COUNT * 4); - if (gt.count() >= COUNT) { - break; - } - UCS_TEST_MESSAGE << "retry " << (i + 1); - } - EXPECT_GE(gt.count(), int(COUNT)); + expect_count_GE(gt, COUNT); } UCS_TEST_P(test_async, max_events, "ASYNC_MAX_EVENTS=4") { @@ -423,8 +435,8 @@ UCS_TEST_P(test_async, max_events, "ASYNC_MAX_EVENTS=4") { } UCS_TEST_P(test_async, many_timers) { - - for (int count = 0; count < 4010; ++count) { + int max_iters = 4010 / ucs::test_time_multiplier(); + for (int count = 0; count < max_iters; ++count) { std::vector timers; ucs_status_t status; int timer_id; @@ -447,32 +459,24 @@ UCS_TEST_P(test_async, many_timers) { UCS_TEST_P(test_async, ctx_event) { local_event le(GetParam()); le.push_event(); - suspend_and_poll(&le, COUNT); - EXPECT_GE(le.count(), 1); + expect_count_GE(le, 1); } UCS_TEST_P(test_async, ctx_timer) { local_timer lt(GetParam()); - for (int i = 0; i < TIMER_RETRIES; ++i) { - suspend_and_poll(<, COUNT * 4); - if (lt.count() >= TIMER_EXP_COUNT) { - break; - } - UCS_TEST_MESSAGE << "retry " << (i + 1); - } - EXPECT_GE(lt.count(), int(TIMER_EXP_COUNT)); + expect_count_GE(lt, TIMER_EXP_COUNT); } UCS_TEST_P(test_async, two_timers) { local_timer lt1(GetParam()); local_timer lt2(GetParam()); - for (int i = 0; i < TIMER_RETRIES; ++i) { + for (int retry = 0; retry < NUM_RETRIES; ++retry) { suspend_and_poll2(<1, <2, COUNT * 4); if ((lt1.count() >= TIMER_EXP_COUNT) && (lt2.count() >= TIMER_EXP_COUNT)) { break; } - UCS_TEST_MESSAGE << "retry " << (i + 1); + UCS_TEST_MESSAGE << "retry " << (retry + 1); } EXPECT_GE(lt1.count(), int(TIMER_EXP_COUNT)); EXPECT_GE(lt2.count(), int(TIMER_EXP_COUNT)); @@ -480,15 +484,23 @@ UCS_TEST_P(test_async, two_timers) { UCS_TEST_P(test_async, ctx_event_block) { local_event le(GetParam()); + int count = 0; - le.block(); - le.push_event(); - suspend_and_poll(&le, COUNT); - EXPECT_EQ(0, le.count()); - le.unblock(); + for (int retry = 0; retry < NUM_RETRIES; ++retry) { + le.block(); + count = 
le.count(); + le.push_event(); + suspend_and_poll(&le, COUNT); + EXPECT_EQ(count, le.count()); + le.unblock(); - le.check_miss(); - EXPECT_GE(le.count(), 1); + le.check_miss(); + if (le.count() > count) { + break; + } + UCS_TEST_MESSAGE << "retry " << (retry + 1); + } + EXPECT_GT(le.count(), count); } UCS_TEST_P(test_async, ctx_event_block_two_miss) { @@ -526,21 +538,22 @@ UCS_TEST_P(test_async, ctx_event_block_two_miss) { UCS_TEST_P(test_async, ctx_timer_block) { local_timer lt(GetParam()); + int count = 0; - for (int i = 0; i < TIMER_RETRIES; ++i) { + for (int retry = 0; retry < NUM_RETRIES; ++retry) { lt.block(); - int count = lt.count(); + count = lt.count(); suspend_and_poll(<, COUNT); EXPECT_EQ(count, lt.count()); lt.unblock(); lt.check_miss(); - if (lt.count() >= 1) { + if (lt.count() > count) { break; } - UCS_TEST_MESSAGE << "retry " << (i + 1); + UCS_TEST_MESSAGE << "retry " << (retry + 1); } - EXPECT_GE(lt.count(), 1); /* Timer could expire again after unblock */ + EXPECT_GT(lt.count(), count); /* Timer could expire again after unblock */ } UCS_TEST_P(test_async, modify_event) { @@ -548,8 +561,7 @@ UCS_TEST_P(test_async, modify_event) { int count; le.push_event(); - suspend_and_poll(&le, COUNT); - EXPECT_GE(le.count(), 1); + expect_count_GE(le, 1); ucs_async_modify_handler(le.event_id(), 0); sleep(1); @@ -557,18 +569,12 @@ UCS_TEST_P(test_async, modify_event) { le.push_event(); suspend_and_poll(&le, COUNT); EXPECT_EQ(le.count(), count); + le.reset(); - ucs_async_modify_handler(le.event_id(), POLLIN); + ucs_async_modify_handler(le.event_id(), UCS_EVENT_SET_EVREAD); count = le.count(); le.push_event(); - for (int i = 0; i < TIMER_RETRIES; ++i) { - suspend_and_poll(&le, 1); - if (le.count() > count) { - break; - } - UCS_TEST_MESSAGE << "retry " << (i + 1); - } - EXPECT_GT(le.count(), count); + expect_count_GE(le, count + 1); ucs_async_modify_handler(le.event_id(), 0); sleep(1); @@ -597,6 +603,43 @@ UCS_TEST_P(test_async, warn_block) { } } +class 
local_timer_long_handler : public local_timer { +public: + local_timer_long_handler(ucs_async_mode_t mode, int sleep_usec) : + local_timer(mode), m_sleep_usec(sleep_usec) { + } + + virtual void handler() { + /* The handler would sleep long enough to increment the counter after + * main thread already considers it removed - unless the main thread + * waits for handler completion properly. + * It sleeps only once to avoid timer overrun deadlock in signal mode. + */ + ucs::safe_usleep(m_sleep_usec * 2); + m_sleep_usec = 0; + local_timer::handler(); + } + + int m_sleep_usec; +}; + +UCS_TEST_P(test_async, remove_sync) { + + /* create another handler so that removing the timer would not have to + * completely cleanup the async context, and race condition could happen + */ + local_timer le(GetParam()); + + for (int retry = 0; retry < NUM_RETRIES; ++retry) { + local_timer_long_handler lt(GetParam(), SLEEP_USEC * 2); + suspend_and_poll(<, 1); + lt.unset_handler(true); + int count = lt.count(); + suspend_and_poll(<, 1); + ASSERT_EQ(count, lt.count()); + } +} + class local_timer_remove_handler : public local_timer { public: local_timer_remove_handler(ucs_async_mode_t mode) : local_timer(mode) { @@ -611,11 +654,8 @@ class local_timer_remove_handler : public local_timer { UCS_TEST_P(test_async, timer_unset_from_handler) { local_timer_remove_handler lt(GetParam()); - ucs_time_t deadline = ucs_get_time() + ucs_time_from_sec(10.0); - do { - suspend_and_poll(<, 1); - } while ((lt.count() == 0) && (ucs_get_time() < deadline)); - EXPECT_GE(lt.count(), 1); + + expect_count_GE(lt, 1); suspend_and_poll(<, COUNT); EXPECT_LE(lt.count(), 5); /* timer could fire multiple times before we remove it */ int count = lt.count(); @@ -625,26 +665,39 @@ UCS_TEST_P(test_async, timer_unset_from_handler) { class local_event_remove_handler : public local_event { public: - local_event_remove_handler(ucs_async_mode_t mode) : local_event(mode) { + local_event_remove_handler(ucs_async_mode_t mode, bool 
sync) : + local_event(mode), m_sync(sync) { } protected: virtual void handler() { base::handler(); - unset_handler(false); + unset_handler(m_sync); } + +private: + bool m_sync; }; -UCS_TEST_P(test_async, event_unset_from_handler) { - local_event_remove_handler le(GetParam()); +class test_async_event_unset_from_handler : public test_async { +protected: + void test_unset_from_handler(bool sync) { + local_event_remove_handler le(GetParam(), sync); - le.push_event(); - suspend_and_poll(&le, COUNT); - EXPECT_EQ(1, le.count()); + for (int iter = 0; iter < 5; ++iter) { + le.push_event(); + expect_count_GE(le, 1); + EXPECT_EQ(1, le.count()); + } + } +}; - le.push_event(); - suspend_and_poll(&le, COUNT); - EXPECT_EQ(1, le.count()); +UCS_TEST_P(test_async_event_unset_from_handler, sync) { + test_unset_from_handler(true); +} + +UCS_TEST_P(test_async_event_unset_from_handler, async) { + test_unset_from_handler(false); } class local_event_add_handler : public local_event { @@ -671,15 +724,16 @@ class local_event_add_handler : public local_event { } protected: - static void dummy_cb(int id, void *arg) { + static void dummy_cb(int id, int events, void *arg) { } virtual void handler() { base::handler(); if (!m_event_set) { - ucs_status_t status = ucs_async_set_event_handler(mode(), m_pipefd[0], - POLLIN, dummy_cb, - this, &m_async); + ucs_status_t status = + ucs_async_set_event_handler(mode(), m_pipefd[0], + UCS_EVENT_SET_EVREAD, + dummy_cb, this, &m_async); ASSERT_UCS_OK(status); m_event_set = true; } @@ -703,33 +757,39 @@ typedef test_async_mt test_async_timer_mt; /* * Run multiple threads which all process events independently. 
*/ -UCS_TEST_P(test_async_event_mt, multithread) { - if (!(HAVE_DECL_F_SETOWN_EX)) { - UCS_TEST_SKIP; - } - - spawn(); +UCS_TEST_SKIP_COND_P(test_async_event_mt, multithread, + !(HAVE_DECL_F_SETOWN_EX)) { + const int exp_min_count = (int)(COUNT * 0.5); + int min_count = 0; + for (int retry = 0; retry < NUM_RETRIES; ++retry) { + spawn(); + for (int j = 0; j < COUNT; ++j) { + for (unsigned i = 0; i < NUM_THREADS; ++i) { + event(i)->push_event(); + suspend(); + } + } + suspend(); + stop(); - for (int j = 0; j < COUNT; ++j) { + min_count = std::numeric_limits::max(); for (unsigned i = 0; i < NUM_THREADS; ++i) { - event(i)->push_event(); - suspend(); + int count = thread_count(i); + min_count = ucs_min(count, min_count); + } + if (min_count >= exp_min_count) { + break; } - } - - suspend(); - - stop(); - for (unsigned i = 0; i < NUM_THREADS; ++i) { - int count = thread_count(i); - EXPECT_GE(count, (int)(COUNT * 0.4)); + UCS_TEST_MESSAGE << "retry " << (retry + 1); } + EXPECT_GE(min_count, exp_min_count); } + UCS_TEST_P(test_async_timer_mt, multithread) { const int exp_min_count = (int)(COUNT * 0.10); int min_count = 0; - for (int r = 0; r < TIMER_RETRIES; ++r) { + for (int retry = 0; retry < NUM_RETRIES; ++retry) { spawn(); suspend(2 * COUNT); stop(); @@ -746,15 +806,18 @@ UCS_TEST_P(test_async_timer_mt, multithread) { EXPECT_GE(min_count, exp_min_count); } -INSTANTIATE_TEST_CASE_P(signal, test_async, ::testing::Values(UCS_ASYNC_MODE_SIGNAL)); -INSTANTIATE_TEST_CASE_P(thread_spinlock, test_async, ::testing::Values(UCS_ASYNC_MODE_THREAD_SPINLOCK)); -INSTANTIATE_TEST_CASE_P(thread_mutex, test_async, ::testing::Values(UCS_ASYNC_MODE_THREAD_MUTEX)); -INSTANTIATE_TEST_CASE_P(poll, test_async, ::testing::Values(UCS_ASYNC_MODE_POLL)); -INSTANTIATE_TEST_CASE_P(signal, test_async_event_mt, ::testing::Values(UCS_ASYNC_MODE_SIGNAL)); -INSTANTIATE_TEST_CASE_P(thread_spinlock, test_async_event_mt, ::testing::Values(UCS_ASYNC_MODE_THREAD_SPINLOCK)); 
-INSTANTIATE_TEST_CASE_P(thread_mutex, test_async_event_mt, ::testing::Values(UCS_ASYNC_MODE_THREAD_MUTEX)); -INSTANTIATE_TEST_CASE_P(poll, test_async_event_mt, ::testing::Values(UCS_ASYNC_MODE_POLL)); -INSTANTIATE_TEST_CASE_P(signal, test_async_timer_mt, ::testing::Values(UCS_ASYNC_MODE_SIGNAL)); -INSTANTIATE_TEST_CASE_P(thread_spinlock, test_async_timer_mt, ::testing::Values(UCS_ASYNC_MODE_THREAD_SPINLOCK)); -INSTANTIATE_TEST_CASE_P(thread_mutex, test_async_timer_mt, ::testing::Values(UCS_ASYNC_MODE_THREAD_MUTEX)); -INSTANTIATE_TEST_CASE_P(poll, test_async_timer_mt, ::testing::Values(UCS_ASYNC_MODE_POLL)); +std::ostream& operator<<(std::ostream& os, ucs_async_mode_t mode) +{ + return os << ucs_async_mode_names[mode]; +} + +#define INSTANTIATE_ASYNC_TEST_CASES(_test_fixture) \ + INSTANTIATE_TEST_CASE_P(signal, _test_fixture, ::testing::Values(UCS_ASYNC_MODE_SIGNAL)); \ + INSTANTIATE_TEST_CASE_P(thread_spinlock, _test_fixture, ::testing::Values(UCS_ASYNC_MODE_THREAD_SPINLOCK)); \ + INSTANTIATE_TEST_CASE_P(thread_mutex, _test_fixture, ::testing::Values(UCS_ASYNC_MODE_THREAD_MUTEX)); \ + INSTANTIATE_TEST_CASE_P(poll, _test_fixture, ::testing::Values(UCS_ASYNC_MODE_POLL)); + +INSTANTIATE_ASYNC_TEST_CASES(test_async); +INSTANTIATE_ASYNC_TEST_CASES(test_async_event_unset_from_handler); +INSTANTIATE_ASYNC_TEST_CASES(test_async_event_mt); +INSTANTIATE_ASYNC_TEST_CASES(test_async_timer_mt); diff --git a/test/gtest/ucs/test_callbackq.cc b/test/gtest/ucs/test_callbackq.cc index 5342522a006..705dbdc9134 100644 --- a/test/gtest/ucs/test_callbackq.cc +++ b/test/gtest/ucs/test_callbackq.cc @@ -5,21 +5,25 @@ */ #include +#include + extern "C" { #include #include #include } + class test_callbackq : public ucs::test_base, public ::testing::TestWithParam { protected: enum { - COMMAND_NONE, COMMAND_REMOVE_SELF, - COMMAND_ADD_ANOTHER + COMMAND_ENQUEUE_KEY, + COMMAND_ADD_ANOTHER, + COMMAND_NONE }; struct callback_ctx { @@ -64,6 +68,9 @@ class test_callbackq : case COMMAND_ADD_ANOTHER: 
add(ctx->to_add); break; + case COMMAND_ENQUEUE_KEY: + m_keys_queue.push_back(ctx->key); + break; case COMMAND_NONE: default: break; @@ -91,9 +98,14 @@ class test_callbackq : cb_flags() | flags); } + void remove(int callback_id) + { + ucs_callbackq_remove(&m_cbq, callback_id); + } + void remove(callback_ctx *ctx) { - ucs_callbackq_remove(&m_cbq, ctx->callback_id); + remove(ctx->callback_id); } void add_safe(callback_ctx *ctx, unsigned flags = 0) @@ -133,6 +145,7 @@ class test_callbackq : } ucs_callbackq_t m_cbq; + std::deque m_keys_queue; }; UCS_TEST_P(test_callbackq, single) { @@ -362,3 +375,55 @@ UCS_TEST_F(test_callbackq_noflags, remove_if) { } } +UCS_TEST_F(test_callbackq_noflags, ordering) { + static const int UNUSED_CB_KEY = -1; + static const int num_callbacks = 100; + std::vector ctxs(num_callbacks); + std::deque gc_list; + std::deque oneshot_callback_keys; + + for (int i = 0; i < num_callbacks; ++i) { + callback_ctx& r_ctx = ctxs[i]; + + // randomize: either permanent callback with key=i or oneshot callback + // with key=-1 + init_ctx(&r_ctx); + unsigned cb_flags = 0; + if (ucs::rand() % 2) { + // oneshot callback, which must stay in order + r_ctx.key = i; + r_ctx.command = COMMAND_ENQUEUE_KEY; + cb_flags = UCS_CALLBACKQ_FLAG_ONESHOT; + oneshot_callback_keys.push_back(i); + } else { + // permanent + r_ctx.key = UNUSED_CB_KEY; + if (ucs::rand() % 2) { + // do-nothing callback + r_ctx.command = COMMAND_NONE; + } else { + // non-one-shot callback which removes itself - for more fun + r_ctx.command = COMMAND_REMOVE_SELF; + } + } + + add(&r_ctx, cb_flags); + + if (r_ctx.command == COMMAND_NONE) { + // we need to remove callbacks which don't remove themselves in the + // end of the test + gc_list.push_back(r_ctx.callback_id); + } + } + + dispatch(10); + + // make sure the ONESHOT callbacks were executed in order + EXPECT_EQ(oneshot_callback_keys, m_keys_queue); + + // remove remaining callbacks + while (!gc_list.empty()) { + remove(gc_list.front()); + 
gc_list.pop_front(); + } +} diff --git a/test/gtest/ucs/test_config.cc b/test/gtest/ucs/test_config.cc index 220e90bf1b6..120e26ce1a9 100644 --- a/test/gtest/ucs/test_config.cc +++ b/test/gtest/ucs/test_config.cc @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2014. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -80,8 +80,13 @@ typedef struct { double bw_mbits; double bw_gbits; double bw_tbits; + double bw_auto; ucs_config_bw_spec_t can_pci_bw; /* CAN-bus */ + + int air_conditioning; + int abs; + int transmission; } car_opts_t; @@ -121,6 +126,9 @@ ucs_config_field_t engine_opts_table[] = { {"POWER_ALIAS", NULL, "Engine power", ucs_offsetof(engine_opts_t, power), UCS_CONFIG_TYPE_ULUNITS}, + {"FUEL_LEVEL", "", "This is electric car", + UCS_CONFIG_DEPRECATED_FIELD_OFFSET, UCS_CONFIG_TYPE_DEPRECATED}, + {NULL} }; @@ -137,6 +145,9 @@ ucs_config_field_t car_opts_table[] = { {"PRICE_ALIAS", NULL, "Price", ucs_offsetof(car_opts_t, price), UCS_CONFIG_TYPE_UINT}, + {"DRIVER", "", "AI drives a car", + UCS_CONFIG_DEPRECATED_FIELD_OFFSET, UCS_CONFIG_TYPE_DEPRECATED}, + {"BRAND", "Chevy", "Car brand", ucs_offsetof(car_opts_t, brand), UCS_CONFIG_TYPE_STRING}, @@ -179,14 +190,48 @@ ucs_config_field_t car_opts_table[] = { {"BW_TBITS", "1024Tbs", "Bandwidth in tbits", ucs_offsetof(car_opts_t, bw_tbits), UCS_CONFIG_TYPE_BW}, + {"BW_AUTO", "auto", "Auto bandwidth value", + ucs_offsetof(car_opts_t, bw_auto), UCS_CONFIG_TYPE_BW}, + {"CAN_BUS_BW", "mlx5_0:1024Tbs", "Bandwidth in tbits of CAN-bus", ucs_offsetof(car_opts_t, can_pci_bw), UCS_CONFIG_TYPE_BW_SPEC}, + {"AIR_CONDITIONING", "on", "Air conditioning mode", + ucs_offsetof(car_opts_t, air_conditioning), UCS_CONFIG_TYPE_ON_OFF}, + + {"ABS", "off", "ABS mode", + ucs_offsetof(car_opts_t, abs), UCS_CONFIG_TYPE_ON_OFF}, + + {"TRANSMISSION", "auto", "Transmission mode", + 
ucs_offsetof(car_opts_t, transmission), UCS_CONFIG_TYPE_ON_OFF_AUTO}, + {NULL} }; +static std::vector config_err_exp_str; + class test_config : public ucs::test { protected: + static ucs_log_func_rc_t + config_error_handler(const char *file, unsigned line, const char *function, + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) + { + // Ignore errors that invalid input parameters as it is expected + if (level == UCS_LOG_LEVEL_WARN) { + std::string err_str = format_message(message, ap); + + for (size_t i = 0; i < config_err_exp_str.size(); i++) { + if (err_str.find(config_err_exp_str[i]) != std::string::npos) { + UCS_TEST_MESSAGE << err_str; + return UCS_LOG_FUNC_RC_STOP; + } + } + } + + return UCS_LOG_FUNC_RC_CONTINUE; + } /* * Wrapper class for car options parser. @@ -263,7 +308,7 @@ class test_config : public ucs::test { size_t dump_size; char line_buf[1024]; char alias[128]; - car_opts opts(NULL, NULL); + car_opts opts(UCS_DEFAULT_ENV_PREFIX, NULL); memset(alias, 0, sizeof(alias)); @@ -271,7 +316,7 @@ class test_config : public ucs::test { dump_data = NULL; FILE *file = open_memstream(&dump_data, &dump_size); ucs_config_parser_print_opts(file, "", *opts, car_opts_table, - prefix, + prefix, UCS_DEFAULT_ENV_PREFIX, (ucs_config_print_flags_t)flags); /* Sanity check - all lines begin with UCS_ */ @@ -320,7 +365,7 @@ class test_config : public ucs::test { }; UCS_TEST_F(test_config, parse_default) { - car_opts opts(NULL, "TEST"); + car_opts opts(UCS_DEFAULT_ENV_PREFIX, "TEST"); EXPECT_EQ(999U, opts->price); EXPECT_EQ(std::string("Chevy"), opts->brand); @@ -330,7 +375,7 @@ UCS_TEST_F(test_config, parse_default) { EXPECT_EQ(COLOR_RED, opts->coach.driver_seat.color); EXPECT_EQ(COLOR_BLUE, opts->coach.passenger_seat.color); EXPECT_EQ(COLOR_BLACK, opts->coach.rear_seat.color); - EXPECT_EQ(UCS_CONFIG_ULUNITS_AUTO, opts->vin); + EXPECT_EQ(UCS_ULUNITS_AUTO, opts->vin); EXPECT_EQ(200UL, opts->engine.power); 
EXPECT_EQ(1024.0, opts->bw_bytes); @@ -344,9 +389,14 @@ UCS_TEST_F(test_config, parse_default) { EXPECT_EQ(UCS_MBYTE * 128.0, opts->bw_mbits); EXPECT_EQ(UCS_GBYTE * 128.0, opts->bw_gbits); EXPECT_EQ(UCS_TBYTE * 128.0, opts->bw_tbits); + EXPECT_TRUE(UCS_CONFIG_BW_IS_AUTO(opts->bw_auto)); EXPECT_EQ(UCS_TBYTE * 128.0, opts->can_pci_bw.bw); EXPECT_EQ(std::string("mlx5_0"), opts->can_pci_bw.name); + + EXPECT_EQ(UCS_CONFIG_ON, opts->air_conditioning); + EXPECT_EQ(UCS_CONFIG_OFF, opts->abs); + EXPECT_EQ(UCS_CONFIG_AUTO, opts->transmission); } UCS_TEST_F(test_config, clone) { @@ -359,7 +409,7 @@ UCS_TEST_F(test_config, clone) { /* coverity[tainted_string_argument] */ ucs::scoped_setenv env2("UCX_PRICE_ALIAS", "0"); - car_opts opts(NULL, NULL); + car_opts opts(UCS_DEFAULT_ENV_PREFIX, NULL); EXPECT_EQ(COLOR_WHITE, opts->color); EXPECT_EQ(0U, opts->price); @@ -373,7 +423,7 @@ UCS_TEST_F(test_config, clone) { } UCS_TEST_F(test_config, set_get) { - car_opts opts(NULL, NULL); + car_opts opts(UCS_DEFAULT_ENV_PREFIX, NULL); EXPECT_EQ(COLOR_RED, opts->color); EXPECT_EQ(std::string(color_names[COLOR_RED]), std::string(opts.get("COLOR"))); @@ -398,7 +448,7 @@ UCS_TEST_F(test_config, set_get_with_table_prefix) { /* coverity[tainted_string_argument] */ ucs::scoped_setenv env2("UCX_CARS_COLOR", "white"); - car_opts opts(NULL, "CARS_"); + car_opts opts(UCS_DEFAULT_ENV_PREFIX, "CARS_"); EXPECT_EQ(COLOR_WHITE, opts->color); EXPECT_EQ(std::string(color_names[COLOR_WHITE]), std::string(opts.get("COLOR"))); @@ -408,9 +458,9 @@ UCS_TEST_F(test_config, set_get_with_env_prefix) { /* coverity[tainted_string_argument] */ ucs::scoped_setenv env1("UCX_COLOR", "black"); /* coverity[tainted_string_argument] */ - ucs::scoped_setenv env2("UCX_TEST_COLOR", "white"); + ucs::scoped_setenv env2("TEST_UCX_COLOR", "white"); - car_opts opts("TEST", NULL); + car_opts opts("TEST_" UCS_DEFAULT_ENV_PREFIX, NULL); EXPECT_EQ(COLOR_WHITE, opts->color); EXPECT_EQ(std::string(color_names[COLOR_WHITE]), 
std::string(opts.get("COLOR"))); @@ -428,20 +478,59 @@ UCS_TEST_F(test_config, performance) { /* Now test the time */ UCS_TEST_TIME_LIMIT(0.05) { - car_opts opts(NULL, NULL); + car_opts opts(UCS_DEFAULT_ENV_PREFIX, NULL); } } +UCS_TEST_F(test_config, unused) { + ucs::ucx_env_cleanup env_cleanup; + + /* set to warn about unused env vars */ + ucs_global_opts.warn_unused_env_vars = 1; + + const std::string warn_str = "unused env variable"; + const std::string unused_var1 = "UCX_UNUSED_VAR1"; + /* coverity[tainted_string_argument] */ + ucs::scoped_setenv env1(unused_var1.c_str(), "unused"); + + { + config_err_exp_str.push_back(warn_str + ": " + unused_var1); + scoped_log_handler log_handler(config_error_handler); + car_opts opts(UCS_DEFAULT_ENV_PREFIX, NULL); + + ucs_config_parser_warn_unused_env_vars_once(UCS_DEFAULT_ENV_PREFIX); + + config_err_exp_str.pop_back(); + } + + { + const std::string unused_var2 = "TEST_UNUSED_VAR2"; + /* coverity[tainted_string_argument] */ + ucs::scoped_setenv env2(unused_var2.c_str(), "unused"); + + config_err_exp_str.push_back(warn_str + ": " + unused_var2); + scoped_log_handler log_handler(config_error_handler); + car_opts opts("TEST_", NULL); + + ucs_config_parser_warn_unused_env_vars_once("TEST_"); + + config_err_exp_str.pop_back(); + } + + /* reset to not warn about unused env vars */ + ucs_global_opts.warn_unused_env_vars = 0; +} + UCS_TEST_F(test_config, dump) { /* aliases must not be counted here */ - test_config_print_opts(UCS_CONFIG_PRINT_CONFIG, 24u); + test_config_print_opts(UCS_CONFIG_PRINT_CONFIG, 28u); } UCS_TEST_F(test_config, dump_hidden) { /* aliases must be counted here */ test_config_print_opts((UCS_CONFIG_PRINT_CONFIG | UCS_CONFIG_PRINT_HIDDEN), - 29u); + 35u); } UCS_TEST_F(test_config, dump_hidden_check_alias_name) { @@ -449,10 +538,42 @@ UCS_TEST_F(test_config, dump_hidden_check_alias_name) { test_config_print_opts((UCS_CONFIG_PRINT_CONFIG | UCS_CONFIG_PRINT_HIDDEN | UCS_CONFIG_PRINT_DOC), - 29u); + 35u); 
test_config_print_opts((UCS_CONFIG_PRINT_CONFIG | UCS_CONFIG_PRINT_HIDDEN | UCS_CONFIG_PRINT_DOC), - 29u, "TEST_"); + 35u, "TEST_"); +} + +UCS_TEST_F(test_config, deprecated) { + /* set to warn about unused env vars */ + ucs_global_opts.warn_unused_env_vars = 1; + + const std::string warn_str = " is deprecated"; + const std::string deprecated_var1 = "UCX_DRIVER"; + /* coverity[tainted_string_argument] */ + ucs::scoped_setenv env1(deprecated_var1.c_str(), "Taxi driver"); + config_err_exp_str.push_back(deprecated_var1 + warn_str); + + { + scoped_log_handler log_handler(config_error_handler); + car_opts opts(UCS_DEFAULT_ENV_PREFIX, NULL); + } + + { + const std::string deprecated_var2 = "UCX_ENGINE_FUEL_LEVEL"; + /* coverity[tainted_string_argument] */ + ucs::scoped_setenv env2(deprecated_var2.c_str(), "58"); + config_err_exp_str.push_back(deprecated_var2 + warn_str); + + scoped_log_handler log_handler_vars(config_error_handler); + car_opts opts(UCS_DEFAULT_ENV_PREFIX, NULL); + config_err_exp_str.pop_back(); + } + + config_err_exp_str.pop_back(); + + /* reset to not warn about unused env vars */ + ucs_global_opts.warn_unused_env_vars = 0; } diff --git a/test/gtest/ucs/test_datatype.cc b/test/gtest/ucs/test_datatype.cc index 2b1652e572d..bcc0bed7b69 100644 --- a/test/gtest/ucs/test_datatype.cc +++ b/test/gtest/ucs/test_datatype.cc @@ -1,15 +1,18 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2014. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2020. ALL RIGHTS RESERVED. * See file LICENSE for terms. 
*/ #include extern "C" { #include +#include #include #include #include +#include } #include @@ -21,9 +24,11 @@ class test_datatype : public ucs::test { typedef struct { int i; ucs_list_link_t list; + ucs_hlist_link_t hlist; ucs_queue_elem_t queue; } elem_t; + UCS_TEST_F(test_datatype, list_basic) { ucs_list_link_t head; @@ -84,6 +89,81 @@ UCS_TEST_F(test_datatype, list_splice) { } } +UCS_TEST_F(test_datatype, hlist_basic) { + elem_t elem1, elem2, elem3; + ucs_hlist_head_t head; + std::vector v; + elem_t *elem; + + elem1.i = 1; + elem2.i = 2; + elem3.i = 3; + + /* initialize list, should be empty */ + ucs_hlist_head_init(&head); + EXPECT_TRUE(ucs_hlist_is_empty(&head)); + + /* add one element to head */ + ucs_hlist_add_head(&head, &elem1.hlist); + EXPECT_FALSE(ucs_hlist_is_empty(&head)); + + EXPECT_EQ(&elem1, ucs_hlist_head_elem(&head, elem_t, hlist)); + + /* test iteration over single-element list */ + v.clear(); + ucs_hlist_for_each(elem, &head, hlist) { + v.push_back(elem->i); + } + ASSERT_EQ(1ul, v.size()); + EXPECT_EQ(1, v[0]); + + ucs_hlist_del(&head, &elem1.hlist); + EXPECT_TRUE(ucs_hlist_is_empty(&head)); + + /* when list is empty, extract_head should return NULL */ + ucs_hlist_link_t *helem = ucs_hlist_extract_head(&head); + EXPECT_TRUE(helem == NULL); + + /* test iteration over empty list */ + v.clear(); + ucs_hlist_for_each(elem, &head, hlist) { + v.push_back(elem->i); + } + ASSERT_EQ(0ul, v.size()); + + /* add one element to head and extract it */ + ucs_hlist_add_head(&head, &elem1.hlist); + elem = ucs_list_extract_head_elem(&head, elem_t, hlist); + EXPECT_EQ(&elem1, elem); + + /* add 3 elements */ + ucs_hlist_add_tail(&head, &elem2.hlist); + ucs_hlist_add_head(&head, &elem1.hlist); + ucs_hlist_add_tail(&head, &elem3.hlist); + + /* iterate without extract */ + v.clear(); + ucs_hlist_for_each(elem, &head, hlist) { + v.push_back(elem->i); + } + ASSERT_EQ(3ul, v.size()); + EXPECT_EQ(1, v[0]); + EXPECT_EQ(2, v[1]); + EXPECT_EQ(3, v[2]); + + /* iterate and 
extract */ + v.clear(); + ucs_hlist_for_each_extract(elem, &head, hlist) { + v.push_back(elem->i); + } + ASSERT_EQ(3ul, v.size()); + EXPECT_EQ(1, v[0]); + EXPECT_EQ(2, v[1]); + EXPECT_EQ(3, v[2]); + + EXPECT_TRUE(ucs_hlist_is_empty(&head)); +} + UCS_TEST_F(test_datatype, queue) { ucs_queue_head_t head; @@ -222,15 +302,12 @@ UCS_TEST_F(test_datatype, queue_iter) { } } -UCS_TEST_F(test_datatype, queue_perf) { +UCS_TEST_SKIP_COND_F(test_datatype, queue_perf, + (ucs::test_time_multiplier() > 1)) { const size_t count = 100000000ul; ucs_queue_head_t head; ucs_queue_elem_t elem; - if (ucs::test_time_multiplier() > 1) { - UCS_TEST_SKIP; - } - ucs_queue_head_init(&head); ucs_queue_push(&head, &elem); elem.next = NULL; @@ -340,27 +417,28 @@ UCS_TEST_F(test_datatype, queue_extract_if) { UCS_TEST_F(test_datatype, ptr_array_basic) { ucs_ptr_array_t pa; - uint32_t value; - int a = 1, b = 2, c = 3, d = 4; + int a = 1, b = 2, c = 3, d = 4, e = 5, f = 6, g = 7; unsigned index; - ucs_ptr_array_init(&pa, 3, "ptr_array test"); + ucs_ptr_array_init(&pa, "ptr_array test"); - index = ucs_ptr_array_insert(&pa, &a, &value); + index = ucs_ptr_array_insert(&pa, &a); EXPECT_EQ(0u, index); - EXPECT_EQ(3u, value); - index = ucs_ptr_array_insert(&pa, &b, &value); + index = ucs_ptr_array_insert(&pa, &b); EXPECT_EQ(1u, index); - EXPECT_EQ(3u, value); - index = ucs_ptr_array_insert(&pa, &c, &value); + index = ucs_ptr_array_insert(&pa, &c); EXPECT_EQ(2u, index); - EXPECT_EQ(3u, value); - index = ucs_ptr_array_insert(&pa, &d, &value); - EXPECT_EQ(3u, index); - EXPECT_EQ(3u, value); + ucs_ptr_array_set(&pa, 3, &d); + + index = ucs_ptr_array_insert(&pa, &e); + EXPECT_EQ(4u, index); + + ucs_ptr_array_set(&pa, 6, &f); + + ucs_ptr_array_set(&pa, 100, &g); void *vc; int present = ucs_ptr_array_lookup(&pa, 2, vc); @@ -373,13 +451,28 @@ UCS_TEST_F(test_datatype, ptr_array_basic) { present = ucs_ptr_array_lookup(&pa, 2, vc); EXPECT_EQ(&d, vc); + ucs_ptr_array_set(&pa, 2, &g); + present = 
ucs_ptr_array_lookup(&pa, 2, vc); + EXPECT_EQ(&g, vc); + + present = ucs_ptr_array_lookup(&pa, 6, vc); + EXPECT_EQ(&f, vc); + + present = ucs_ptr_array_lookup(&pa, 100, vc); + EXPECT_EQ(&g, vc); + EXPECT_FALSE(ucs_ptr_array_lookup(&pa, 5, vc)); + EXPECT_FALSE(ucs_ptr_array_lookup(&pa, 99, vc)); + EXPECT_FALSE(ucs_ptr_array_lookup(&pa, 101, vc)); EXPECT_FALSE(ucs_ptr_array_lookup(&pa, 5005, vc)); - ucs_ptr_array_remove(&pa, 0, 0); - ucs_ptr_array_remove(&pa, 1, 0); - ucs_ptr_array_remove(&pa, 2, 0); - ucs_ptr_array_remove(&pa, 3, 0); + ucs_ptr_array_remove(&pa, 0); + ucs_ptr_array_remove(&pa, 1); + ucs_ptr_array_remove(&pa, 2); + ucs_ptr_array_remove(&pa, 3); + ucs_ptr_array_remove(&pa, 4); + ucs_ptr_array_remove(&pa, 6); + ucs_ptr_array_remove(&pa, 100); ucs_ptr_array_cleanup(&pa); } @@ -387,19 +480,17 @@ UCS_TEST_F(test_datatype, ptr_array_basic) { UCS_TEST_F(test_datatype, ptr_array_random) { const unsigned count = 10000 / ucs::test_time_multiplier(); ucs_ptr_array_t pa; - uint32_t value; - - ucs_ptr_array_init(&pa, 5, "ptr_array test"); + unsigned expeced_count = count; + ucs_ptr_array_init(&pa, "ptr_array test"); std::map map; /* Insert phase */ for (unsigned i = 0; i < count; ++i) { void *ptr = malloc(0); - unsigned index = ucs_ptr_array_insert(&pa, ptr, &value); + unsigned index = ucs_ptr_array_insert(&pa, ptr); EXPECT_TRUE(map.end() == map.find(index)); - EXPECT_EQ(5u, value); map[index] = ptr; } @@ -407,6 +498,7 @@ UCS_TEST_F(test_datatype, ptr_array_random) { for (unsigned i = 0; i < count / 10; ++i) { int remove_count = ucs::rand() % 10; + expeced_count -= remove_count; for (int j = 0; j < remove_count; ++j) { unsigned to_remove = ucs::rand() % map.size(); std::map::iterator iter = map.begin(); @@ -418,104 +510,98 @@ UCS_TEST_F(test_datatype, ptr_array_random) { EXPECT_EQ(ptr, map[index]); free(ptr); - ucs_ptr_array_remove(&pa, index, index * index); + ucs_ptr_array_remove(&pa, index); EXPECT_FALSE(ucs_ptr_array_lookup(&pa, index, ptr)); map.erase(index); 
} int insert_count = ucs::rand() % 10; + expeced_count += insert_count; for (int j = 0; j < insert_count; ++j) { void *ptr = malloc(0); - unsigned index = ucs_ptr_array_insert(&pa, ptr, &value); + unsigned index = ucs_ptr_array_insert(&pa, ptr); EXPECT_TRUE(map.end() == map.find(index)); - EXPECT_TRUE(index * index == value || 5u == value); map[index] = ptr; } } + unsigned count_elements = 0; /* remove all */ void *ptr; unsigned index; ucs_ptr_array_for_each(ptr, index, &pa) { EXPECT_EQ(ptr, map[index]); - ucs_ptr_array_remove(&pa, index, 0); + ucs_ptr_array_remove(&pa, index); free(ptr); + count_elements++; } - ucs_ptr_array_cleanup(&pa); -} - -UCS_TEST_F(test_datatype, ptr_array_placeholder) { - ucs_ptr_array_t pa; - uint32_t value; - int a = 1; - unsigned index; - - ucs_ptr_array_init(&pa, 3, "ptr_array test"); - - index = ucs_ptr_array_insert(&pa, &a, &value); - EXPECT_EQ(0u, index); - EXPECT_EQ(3u, value); - - ucs_ptr_array_remove(&pa, index, 4); - - index = ucs_ptr_array_insert(&pa, &a, &value); - EXPECT_EQ(0u, index); - EXPECT_EQ(4u, value); - - ucs_ptr_array_remove(&pa, index, 0); + EXPECT_EQ(count_elements, expeced_count); ucs_ptr_array_cleanup(&pa); } -UCS_TEST_F(test_datatype, ptr_array_perf) { +UCS_TEST_SKIP_COND_F(test_datatype, ptr_array_perf, + (ucs::test_time_multiplier() > 1)) { const unsigned count = 10000000; ucs_ptr_array_t pa; - uint32_t value; - - if (ucs::test_time_multiplier() > 1) { - UCS_TEST_SKIP; - } ucs_time_t insert_start_time = ucs_get_time(); - ucs_ptr_array_init(&pa, 0, "ptr_array test"); + ucs_ptr_array_init(&pa, "ptr_array test"); for (unsigned i = 0; i < count; ++i) { - EXPECT_EQ(i, ucs_ptr_array_insert(&pa, NULL, &value)); + EXPECT_EQ(i, ucs_ptr_array_insert(&pa, NULL)); } ucs_time_t lookup_start_time = ucs_get_time(); for (unsigned i = 0; i < count; ++i) { - void *ptr; + void *ptr GTEST_ATTRIBUTE_UNUSED_; int present = ucs_ptr_array_lookup(&pa, i, ptr); ASSERT_TRUE(present); } + ucs_time_t foreach_start_time = ucs_get_time(); + 
unsigned index; + void *element; + unsigned count_elements = 0; + ucs_ptr_array_for_each(element, index, &pa) { + void *ptr GTEST_ATTRIBUTE_UNUSED_; + int present = ucs_ptr_array_lookup(&pa, index, ptr); + element = NULL; + ASSERT_TRUE(present); + ASSERT_TRUE(element == NULL); + count_elements++; + } + + EXPECT_EQ(count_elements, count); + ucs_time_t remove_start_time = ucs_get_time(); for (unsigned i = 0; i < count; ++i) { - ucs_ptr_array_remove(&pa, i, 0); + ucs_ptr_array_remove(&pa, i); } ucs_time_t end_time = ucs_get_time(); ucs_ptr_array_cleanup(&pa); - double insert_ns = ucs_time_to_nsec(lookup_start_time - insert_start_time) / count; - double lookup_ns = ucs_time_to_nsec(remove_start_time - lookup_start_time) / count; - double remove_ns = ucs_time_to_nsec(end_time - remove_start_time) / count; + double insert_ns = ucs_time_to_nsec(lookup_start_time - insert_start_time) / count; + double lookup_ns = ucs_time_to_nsec(foreach_start_time - lookup_start_time) / count; + double foreach_ns = ucs_time_to_nsec(remove_start_time - foreach_start_time) / count; + double remove_ns = ucs_time_to_nsec(end_time - remove_start_time) / count; UCS_TEST_MESSAGE << "Timings (nsec): insert " << insert_ns << " lookup: " << - lookup_ns << " remove: " << remove_ns; + lookup_ns << " remove: " << remove_ns << " Foreach: " << foreach_ns; if (ucs::perf_retry_count) { EXPECT_LT(insert_ns, 1000.0); EXPECT_LT(remove_ns, 1000.0); -#ifdef __x86_64__ - EXPECT_LT(lookup_ns, 15.0); -#else - EXPECT_LT(lookup_ns, 100.0); -#endif + + if (ucs_arch_get_cpu_vendor() != UCS_CPU_VENDOR_GENERIC_ARM) { + EXPECT_LT(lookup_ns, 60.0); + } else { + EXPECT_LT(lookup_ns, 100.0); + } } } @@ -529,3 +615,183 @@ UCS_TEST_F(test_datatype, ptr_status) { void *ptr2 = (void*)(uintptr_t)(UCS_ERR_LAST + 1); EXPECT_TRUE(UCS_PTR_IS_ERR(ptr2)); } + +UCS_TEST_F(test_datatype, ptr_array_locked_basic) { + ucs_ptr_array_locked_t pa; + int a = 1, b = 2, c = 3, d = 4, e = 5, f = 6, g = 7; + unsigned index; + + 
ucs_ptr_array_locked_init(&pa, "ptr_array_locked test"); + + index = ucs_ptr_array_locked_insert(&pa, &a); + EXPECT_EQ(0u, index); + + index = ucs_ptr_array_locked_insert(&pa, &b); + EXPECT_EQ(1u, index); + + index = ucs_ptr_array_locked_insert(&pa, &c); + EXPECT_EQ(2u, index); + + ucs_ptr_array_locked_set(&pa, 3, &d); + + index = ucs_ptr_array_locked_insert(&pa, &e); + EXPECT_EQ(4u, index); + + ucs_ptr_array_locked_set(&pa, 6, &f); + + ucs_ptr_array_locked_set(&pa, 100, &g); + + void *vc; + int present = ucs_ptr_array_locked_lookup(&pa, 2, &vc); + ASSERT_TRUE(present); + EXPECT_EQ(&c, vc); + + vc = ucs_ptr_array_locked_replace(&pa, 2, &d); + EXPECT_EQ(&c, vc); + + present = ucs_ptr_array_locked_lookup(&pa, 2, &vc); + EXPECT_EQ(&d, vc); + + ucs_ptr_array_locked_set(&pa, 2, &g); + present = ucs_ptr_array_locked_lookup(&pa, 2, &vc); + EXPECT_EQ(&g, vc); + + present = ucs_ptr_array_locked_lookup(&pa, 6, &vc); + EXPECT_EQ(&f, vc); + + present = ucs_ptr_array_locked_lookup(&pa, 100, &vc); + EXPECT_EQ(&g, vc); + + EXPECT_FALSE(ucs_ptr_array_locked_lookup(&pa, 5, &vc)); + EXPECT_FALSE(ucs_ptr_array_locked_lookup(&pa, 99, &vc)); + EXPECT_FALSE(ucs_ptr_array_locked_lookup(&pa, 101, &vc)); + EXPECT_FALSE(ucs_ptr_array_locked_lookup(&pa, 5005, &vc)); + + ucs_ptr_array_locked_remove(&pa, 0); + ucs_ptr_array_locked_remove(&pa, 1); + ucs_ptr_array_locked_remove(&pa, 2); + ucs_ptr_array_locked_remove(&pa, 3); + ucs_ptr_array_locked_remove(&pa, 4); + ucs_ptr_array_locked_remove(&pa, 6); + ucs_ptr_array_locked_remove(&pa, 100); + + ucs_ptr_array_locked_cleanup(&pa); +} + +UCS_TEST_F(test_datatype, ptr_array_locked_random) { + const unsigned count = 10000 / ucs::test_time_multiplier(); + ucs_ptr_array_locked_t pa; + + ucs_ptr_array_locked_init(&pa, "ptr_array test"); + + std::map map; + + /* Insert phase */ + for (unsigned i = 0; i < count; ++i) { + void *ptr = malloc(0); + unsigned index = ucs_ptr_array_locked_insert(&pa, ptr); + + EXPECT_TRUE(map.end() == map.find(index)); + 
map[index] = ptr; + } + + /* Remove + insert */ + for (unsigned i = 0; i < count / 10; ++i) { + int remove_count = ucs::rand() % 10; + for (int j = 0; j < remove_count; ++j) { + unsigned to_remove = ucs::rand() % map.size(); + std::map::iterator iter = map.begin(); + std::advance(iter, to_remove); + unsigned index = iter->first; + + void *ptr = NULL; + EXPECT_TRUE(ucs_ptr_array_locked_lookup(&pa, index, &ptr)); + EXPECT_EQ(ptr, map[index]); + free(ptr); + + ucs_ptr_array_locked_remove(&pa, index); + + EXPECT_FALSE(ucs_ptr_array_locked_lookup(&pa, index, &ptr)); + + map.erase(index); + } + + int insert_count = ucs::rand() % 10; + for (int j = 0; j < insert_count; ++j) { + void *ptr = malloc(0); + unsigned index = ucs_ptr_array_locked_insert(&pa, ptr); + + EXPECT_TRUE(map.end() == map.find(index)); + map[index] = ptr; + } + } + + /* remove all */ + void *ptr; + unsigned index; + ucs_ptr_array_locked_for_each(ptr, index, &pa) { + EXPECT_EQ(ptr, map[index]); + ucs_ptr_array_locked_remove(&pa, index); + free(ptr); + } + + ucs_ptr_array_locked_cleanup(&pa); +} + +UCS_TEST_SKIP_COND_F(test_datatype, ptr_array_locked_perf, + (ucs::test_time_multiplier() > 1)) { + const unsigned count = 10000000; + ucs_ptr_array_locked_t pa; + + ucs_time_t insert_start_time = ucs_get_time(); + ucs_ptr_array_locked_init(&pa, "ptr_array test"); + for (unsigned i = 0; i < count; ++i) { + EXPECT_EQ(i, ucs_ptr_array_locked_insert(&pa, NULL)); + } + + ucs_time_t lookup_start_time = ucs_get_time(); + for (unsigned i = 0; i < count; ++i) { + void *ptr GTEST_ATTRIBUTE_UNUSED_; + int present = ucs_ptr_array_locked_lookup(&pa, i, &ptr); + ASSERT_TRUE(present); + } + + ucs_time_t foreach_start_time = ucs_get_time(); + unsigned index; + void *element; + ucs_ptr_array_locked_for_each(element, index, &pa) { + void *ptr GTEST_ATTRIBUTE_UNUSED_; + int present = ucs_ptr_array_locked_lookup(&pa, index, &ptr); + ASSERT_TRUE(present); + ASSERT_TRUE(element == NULL); + } + + ucs_time_t remove_start_time = 
ucs_get_time(); + for (unsigned i = 0; i < count; ++i) { + ucs_ptr_array_locked_remove(&pa, i); + } + + ucs_time_t end_time = ucs_get_time(); + + ucs_ptr_array_locked_cleanup(&pa); + + double insert_ns = ucs_time_to_nsec(lookup_start_time - insert_start_time) / count; + double lookup_ns = ucs_time_to_nsec(foreach_start_time - lookup_start_time) / count; + double foreach_ns = ucs_time_to_nsec(remove_start_time - foreach_start_time) / count; + double remove_ns = ucs_time_to_nsec(end_time - remove_start_time) / count; + + UCS_TEST_MESSAGE << "Locked array timings (nsec): insert " << insert_ns << " lookup: " << + lookup_ns << " remove: " << remove_ns << " Foreach: " << foreach_ns; + + if (ucs::perf_retry_count) { + EXPECT_LT(insert_ns, 1000.0); + EXPECT_LT(remove_ns, 1000.0); + + if (ucs_arch_get_cpu_vendor() != UCS_CPU_VENDOR_GENERIC_ARM) { + EXPECT_LT(lookup_ns, 60.0); + } else { + EXPECT_LT(lookup_ns, 100.0); + } + } +} + diff --git a/test/gtest/ucs/test_debug.cc b/test/gtest/ucs/test_debug.cc index 95570f57925..1ddc4adb3fa 100644 --- a/test/gtest/ucs/test_debug.cc +++ b/test/gtest/ucs/test_debug.cc @@ -15,20 +15,7 @@ extern "C" { extern "C" { -int UCS_F_NOINLINE my_cool_function(unsigned *lineno, void **address) -{ - int a; - - a = 5; - - ucs_compiler_fence(); -label1: *lineno = __LINE__; - ucs_compiler_fence(); - *address = &&label1; - - ++a; - return a; -} +void UCS_F_NOINLINE my_cool_function(unsigned *lineno) { *lineno = __LINE__; }; } @@ -61,20 +48,19 @@ UCS_TEST_F(test_debug, lookup_invalid) { EXPECT_EQ(UCS_ERR_NO_ELEM, status); } -UCS_TEST_F(test_debug, lookup_address) { +UCS_TEST_SKIP_COND_F(test_debug, lookup_address, BULLSEYE_ON) { unsigned lineno; - void *address; - - if (BULLSEYE_ON) { - UCS_TEST_SKIP; - } - my_cool_function(&lineno, &address); + my_cool_function(&lineno); ucs_debug_address_info info; - ucs_status_t status = ucs_debug_lookup_address(address, &info); + ucs_status_t status = ucs_debug_lookup_address((void*)&my_cool_function, + &info); 
ASSERT_UCS_OK(status); + UCS_TEST_MESSAGE << info.source_file << ":" << info.line_number << + " " << info.function << "()"; + EXPECT_NE(std::string::npos, std::string(info.file.path).find("gtest")); #ifdef HAVE_DETAILED_BACKTRACE diff --git a/test/gtest/ucs/test_event_set.cc b/test/gtest/ucs/test_event_set.cc new file mode 100644 index 00000000000..1f1fa39e276 --- /dev/null +++ b/test/gtest/ucs/test_event_set.cc @@ -0,0 +1,284 @@ +/** +* Copyright (C) Hiroyuki Sato. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#include +extern "C" { +#include +#include +#include +} + +#define MAX_BUF_LEN 255 + +static const char *UCS_EVENT_SET_TEST_STRING = "ucs_event_set test string"; +static const char *UCS_EVENT_SET_EXTRA_STRING = "ucs_event_set extra string"; +static const int UCS_EVENT_SET_EXTRA_NUM = 0xFF; + +enum { + UCS_EVENT_SET_EXTERNAL_FD = UCS_BIT(0), +}; + +class test_event_set : public ucs::test_base, + public ::testing::TestWithParam { +public: + static const char *evfd_data; + static pthread_barrier_t barrier; + + typedef void* (*event_set_pthread_callback_t)(void *arg); + + enum event_set_op_t { + EVENT_SET_OP_ADD, + EVENT_SET_OP_MOD, + EVENT_SET_OP_DEL + }; + + UCS_TEST_BASE_IMPL; + +protected: + void init() { + if (GetParam() & UCS_EVENT_SET_EXTERNAL_FD) { + m_ext_fd = epoll_create(1); + ASSERT_TRUE(m_ext_fd > 0); + } else { + m_ext_fd = -1; + } + } + + void cleanup() { + if (GetParam() & UCS_EVENT_SET_EXTERNAL_FD) { + ASSERT_NE(-1, m_ext_fd); + close(m_ext_fd); + m_ext_fd = -1; + } + } + + static void* event_set_read_func(void *arg) { + int *fd = (int *)arg; + int n; + + n = write(fd[1], evfd_data, strlen(test_event_set::evfd_data)); + if (n == -1) { + ADD_FAILURE(); + } + + thread_barrier(); + return 0; + } + + static void* event_set_tmo_func(void *arg) { + thread_barrier(); + return 0; + } + + void event_set_init(event_set_pthread_callback_t func) { + ucs_status_t status; + int ret; + + if (pipe(m_pipefd) == -1) { + 
UCS_TEST_ABORT("pipe() failed with error - " << + strerror(errno)); + } + + ret = pthread_barrier_init(&barrier, NULL, 2); + if (ret) { + UCS_TEST_ABORT("pthread_barrier_init() failed with error - " << + strerror(errno)); + } + + ret = pthread_create(&m_tid, NULL, func, (void *)&m_pipefd); + if (ret) { + UCS_TEST_ABORT("pthread_create() failed with error - " << + strerror(errno)); + } + + if (GetParam() & UCS_EVENT_SET_EXTERNAL_FD) { + status = ucs_event_set_create_from_fd(&m_event_set, m_ext_fd); + } else { + status = ucs_event_set_create(&m_event_set); + } + ASSERT_UCS_OK(status); + EXPECT_TRUE(m_event_set != NULL); + } + + void event_set_cleanup() { + ucs_event_set_cleanup(m_event_set); + + pthread_join(m_tid, NULL); + pthread_barrier_destroy(&barrier); + + close(m_pipefd[0]); + close(m_pipefd[1]); + } + + void event_set_ctl(event_set_op_t op, int fd, int events) { + ucs_status_t status = UCS_OK; + + switch (op) { + case EVENT_SET_OP_ADD: + status = ucs_event_set_add(m_event_set, fd, + (ucs_event_set_type_t)events, + (void *)(uintptr_t)fd); + break; + case EVENT_SET_OP_MOD: + status = ucs_event_set_mod(m_event_set, fd, + (ucs_event_set_type_t)events, + (void *)(uintptr_t)fd); + break; + case EVENT_SET_OP_DEL: + status = ucs_event_set_del(m_event_set, fd); + break; + default: + UCS_TEST_ABORT("unknown event set operation - " << op); + } + + EXPECT_UCS_OK(status); + } + + void event_set_wait(unsigned exp_event, int timeout_ms, + ucs_event_set_handler_t handler, void *arg) { + unsigned nread = ucs_sys_event_set_max_wait_events; + ucs_status_t status; + + /* Check for events on pipe fd */ + status = ucs_event_set_wait(m_event_set, &nread, 0, handler, arg); + EXPECT_EQ(exp_event, nread); + EXPECT_UCS_OK(status); + } + + static void thread_barrier() { + int ret = pthread_barrier_wait(&barrier); + EXPECT_TRUE((ret == 0) || (ret == PTHREAD_BARRIER_SERIAL_THREAD)); + } + + int m_pipefd[2]; + int m_ext_fd; + pthread_t m_tid; + ucs_sys_event_set_t *m_event_set; +}; + 
+const char *test_event_set::evfd_data = UCS_EVENT_SET_TEST_STRING; + +pthread_barrier_t test_event_set::barrier; + +static void event_set_func1(void *callback_data, int events, void *arg) +{ + char buf[MAX_BUF_LEN]; + char *extra_str = (char *)((void**)arg)[0]; + int *extra_num = (int *)((void**)arg)[1]; + int n; + int fd = (int)(uintptr_t)callback_data; + memset(buf, 0, MAX_BUF_LEN); + + EXPECT_EQ(UCS_EVENT_SET_EVREAD, events); + + n = read(fd, buf, MAX_BUF_LEN); + if (n == -1) { + ADD_FAILURE(); + return; + } + EXPECT_EQ(0, strcmp(UCS_EVENT_SET_TEST_STRING, buf)); + EXPECT_EQ(0, strcmp(UCS_EVENT_SET_EXTRA_STRING, extra_str)); + EXPECT_EQ(UCS_EVENT_SET_EXTRA_NUM, *extra_num); +} + +static void event_set_func2(void *callback_data, int events, void *arg) +{ + EXPECT_EQ(UCS_EVENT_SET_EVWRITE, events); +} + +static void event_set_func3(void *callback_data, int events, void *arg) +{ + ADD_FAILURE(); +} + +static void event_set_func4(void *callback_data, int events, void *arg) +{ + EXPECT_EQ(UCS_EVENT_SET_EVREAD, events); +} + +UCS_TEST_P(test_event_set, ucs_event_set_read_thread) { + void *arg[] = { (void*)UCS_EVENT_SET_EXTRA_STRING, + (void*)&UCS_EVENT_SET_EXTRA_NUM }; + + event_set_init(event_set_read_func); + event_set_ctl(EVENT_SET_OP_ADD, m_pipefd[0], + UCS_EVENT_SET_EVREAD); + + thread_barrier(); + + event_set_wait(1u, -1, event_set_func1, arg); + + event_set_ctl(EVENT_SET_OP_DEL, m_pipefd[0], 0); + event_set_cleanup(); +} + +UCS_TEST_P(test_event_set, ucs_event_set_write_thread) { + event_set_init(event_set_read_func); + event_set_ctl(EVENT_SET_OP_ADD, m_pipefd[1], + UCS_EVENT_SET_EVWRITE); + + thread_barrier(); + + event_set_wait(1u, -1, event_set_func2, NULL); + + event_set_ctl(EVENT_SET_OP_DEL, m_pipefd[1], 0); + event_set_cleanup(); +} + +UCS_TEST_P(test_event_set, ucs_event_set_tmo_thread) { + event_set_init(event_set_tmo_func); + event_set_ctl(EVENT_SET_OP_ADD, m_pipefd[0], + UCS_EVENT_SET_EVREAD); + + thread_barrier(); + + event_set_wait(0u, 0, 
event_set_func3, NULL); + + event_set_ctl(EVENT_SET_OP_DEL, m_pipefd[0], 0); + event_set_cleanup(); +} + +UCS_TEST_P(test_event_set, ucs_event_set_trig_modes) { + void *arg[] = { (void*)UCS_EVENT_SET_EXTRA_STRING, + (void*)&UCS_EVENT_SET_EXTRA_NUM }; + + event_set_init(event_set_read_func); + event_set_ctl(EVENT_SET_OP_ADD, m_pipefd[0], + UCS_EVENT_SET_EVREAD); + + thread_barrier(); + + /* Test level-triggered mode (default) */ + for (int i = 0; i < 10; i++) { + event_set_wait(1u, 0, event_set_func4, NULL); + } + + /* Test edge-triggered mode */ + /* Set edge-triggered mode */ + event_set_ctl(EVENT_SET_OP_MOD, m_pipefd[0], + (ucs_event_set_type_t)(UCS_EVENT_SET_EVREAD | + UCS_EVENT_SET_EDGE_TRIGGERED)); + + /* Should have only one event to read */ + event_set_wait(1u, 0, event_set_func4, NULL); + + /* Should read nothing */ + for (int i = 0; i < 10; i++) { + event_set_wait(0u, 0, event_set_func1, arg); + } + + /* Call the function below directly to read + * all outstanding data from pipe fd */ + event_set_func1((void*)(uintptr_t)m_pipefd[0], UCS_EVENT_SET_EVREAD, arg); + + event_set_ctl(EVENT_SET_OP_DEL, m_pipefd[0], 0); + event_set_cleanup(); +} + +INSTANTIATE_TEST_CASE_P(ext_fd, test_event_set, + ::testing::Values(static_cast( + UCS_EVENT_SET_EXTERNAL_FD))); +INSTANTIATE_TEST_CASE_P(int_fd, test_event_set, ::testing::Values(0)); diff --git a/test/gtest/ucs/test_frag_list.cc b/test/gtest/ucs/test_frag_list.cc index a0fc165c8c4..dd55e9872f1 100644 --- a/test/gtest/ucs/test_frag_list.cc +++ b/test/gtest/ucs/test_frag_list.cc @@ -57,9 +57,8 @@ void frag_list::init_pkts(pkt *packets, int n) void frag_list::init() { 
-99,7 +98,7 @@ UCS_TEST_F(frag_list, in_order_rcv) { err = ucs_frag_list_insert(&m_frags, &pkt, i); EXPECT_EQ(UCS_FRAG_LIST_INSERT_FAST, err); } -#if ENABLE_STATS +#ifdef ENABLE_STATS EXPECT_EQ((ucs_stats_counter_t)1, UCS_STATS_GET_COUNTER(m_frags.stats, UCS_FRAG_LIST_STAT_BURSTS)); EXPECT_EQ((ucs_stats_counter_t)9, UCS_STATS_GET_COUNTER(m_frags.stats, UCS_FRAG_LIST_STAT_BURST_LEN)); EXPECT_EQ((ucs_stats_counter_t)0, UCS_STATS_GET_COUNTER(m_frags.stats, UCS_FRAG_LIST_STAT_GAPS)); @@ -154,7 +153,7 @@ UCS_TEST_F(frag_list, one_hole) { i++; } EXPECT_EQ((unsigned)5, i); -#if ENABLE_STATS +#ifdef ENABLE_STATS EXPECT_EQ((ucs_stats_counter_t)2, UCS_STATS_GET_COUNTER(m_frags.stats, UCS_FRAG_LIST_STAT_BURSTS)); EXPECT_EQ((ucs_stats_counter_t)10, UCS_STATS_GET_COUNTER(m_frags.stats, UCS_FRAG_LIST_STAT_BURST_LEN)); EXPECT_EQ((ucs_stats_counter_t)1, UCS_STATS_GET_COUNTER(m_frags.stats, UCS_FRAG_LIST_STAT_GAPS)); @@ -222,7 +221,7 @@ UCS_TEST_F(frag_list, two_holes_basic) { i++; } EXPECT_EQ((unsigned)20, i); -#if ENABLE_STATS +#ifdef ENABLE_STATS EXPECT_EQ((ucs_stats_counter_t)7, UCS_STATS_GET_COUNTER(m_frags.stats, UCS_FRAG_LIST_STAT_BURSTS)); EXPECT_EQ((ucs_stats_counter_t)19, UCS_STATS_GET_COUNTER(m_frags.stats, UCS_FRAG_LIST_STAT_BURST_LEN)); EXPECT_EQ((ucs_stats_counter_t)2, UCS_STATS_GET_COUNTER(m_frags.stats, UCS_FRAG_LIST_STAT_GAPS)); diff --git a/test/gtest/ucs/test_iov.cc b/test/gtest/ucs/test_iov.cc new file mode 100644 index 00000000000..58d24c1fa77 --- /dev/null +++ b/test/gtest/ucs/test_iov.cc @@ -0,0 +1,167 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#include + +extern "C" { +#include +#include +} + + +class test_ucs_iov : public ucs::test { +protected: + struct iov1_t { + char length_padding[128]; + size_t length; + char buffer_padding[64]; + void *buffer; + }; + + struct iov2_t { + char length_padding[64]; + size_t length; + char buffer_padding[256]; + void *buffer; + }; + + template + void iov_set_length(T *iov, size_t length) { + iov->length = length; + } + + template + void iov_set_buffer(T *iov, void *buffer) { + iov->buffer = buffer; + } + + template + size_t iov_get_length(T *iov) { + return iov->length; + } + + template + void *iov_get_buffer(T *iov) { + return iov->buffer; + } + + template + size_t iov_converter(T1 *src_iov, size_t *src_iov_cnt_p, + T2 *dst_iov, size_t dst_iov_cnt, + size_t max_length, ucs_iov_iter_t *iov_iter_p) { + return ucs_iov_converter(src_iov, src_iov_cnt_p, + iov_set_buffer, iov_set_length, + dst_iov, dst_iov_cnt, + iov_get_buffer, iov_get_length, + max_length, iov_iter_p); + } + + void expect_zero_changes(size_t res_cnt, size_t res_length, + const ucs_iov_iter_t *iov_iter) { + EXPECT_EQ(0lu, res_cnt); + EXPECT_EQ(0lu, res_length); + EXPECT_EQ(0lu, iov_iter->iov_index); + EXPECT_EQ(0lu, iov_iter->buffer_offset); + } + + template + void test_iov_type_pair(T1 *iov1, size_t iov1_cnt, + T2 *iov2, size_t iov2_cnt, + size_t max_length) { + size_t res_total_length = 0; + size_t exp_total_length = 0; + size_t cnt, length; + ucs_iov_iter_t iov_iter; + + iov1 = new T1[iov1_cnt]; + ASSERT_TRUE(iov1 != NULL); + iov2 = new T2[iov2_cnt]; + ASSERT_TRUE(iov2 != NULL); + + for (size_t i = 0; i < iov2_cnt; i++) { + iov_set_buffer(&iov2[i], (void*)0x1); + iov_set_length(&iov2[i], i); + exp_total_length += iov_get_length(&iov2[i]); + } + + ucs_iov_iter_init(&iov_iter); + + while (iov_iter.iov_index < iov2_cnt) { + cnt = iov1_cnt; + length = iov_converter(iov1, &cnt, + iov2, iov2_cnt, + max_length, &iov_iter); + EXPECT_TRUE((iov_iter.iov_index == iov2_cnt) || + (length == max_length) || 
(cnt == iov1_cnt)); + res_total_length += length; + } + + EXPECT_EQ(exp_total_length, res_total_length); + + ucs_iov_iter_init(&iov_iter); + cnt = 0; + length = iov_converter((T1*)NULL, &cnt, + iov2, iov2_cnt, + max_length, &iov_iter); + expect_zero_changes(cnt, length, &iov_iter); + + ucs_iov_iter_init(&iov_iter); + cnt = iov1_cnt; + length = iov_converter(iov1, &cnt, + (T2*)NULL, 0, + max_length, &iov_iter); + expect_zero_changes(cnt, length, &iov_iter); + + ucs_iov_iter_init(&iov_iter); + cnt = iov1_cnt; + length = iov_converter(iov1, &cnt, + iov2, iov2_cnt, + 0, &iov_iter); + expect_zero_changes(cnt, length, &iov_iter); + + delete[] iov1; + delete[] iov2; + } +}; + +UCS_TEST_F(test_ucs_iov, total_length) { + const size_t iov_cnt = 1024; + size_t total_length = 0; + struct iovec *iov; + + iov = new struct iovec[iov_cnt]; + ASSERT_TRUE(iov != NULL); + + for (size_t i = 0; i < iov_cnt; i++) { + iov[i].iov_len = i; + total_length += iov[i].iov_len; + } + + EXPECT_EQ(total_length, ucs_iovec_total_length(iov, iov_cnt)); + + delete[] iov; +} + +UCS_TEST_F(test_ucs_iov, iov_to_iov) { + const size_t iov1_cnt = 16; + const size_t iov2_cnt = 1024; + const size_t max_length = 1024; + void *iov_buf1 = NULL; + void *iov_buf2 = NULL; + + test_iov_type_pair(static_cast(iov_buf1), iov1_cnt, + static_cast(iov_buf2), iov2_cnt, + max_length); + test_iov_type_pair(static_cast(iov_buf1), iov1_cnt, + static_cast(iov_buf2), iov2_cnt, + max_length); + test_iov_type_pair(static_cast(iov_buf1), iov1_cnt, + static_cast(iov_buf2), iov2_cnt, + max_length); + test_iov_type_pair(static_cast(iov_buf1), iov1_cnt, + static_cast(iov_buf2), iov2_cnt, + max_length); +} diff --git a/test/gtest/ucs/test_log.cc b/test/gtest/ucs/test_log.cc index 89ca3307244..3221cdb5a80 100644 --- a/test/gtest/ucs/test_log.cc +++ b/test/gtest/ucs/test_log.cc @@ -4,19 +4,19 @@ */ #include +#include +#include +#include + extern "C" { #include +#include } class log_test : public ucs::test { public: virtual void init() { 
- char ucs_log_spec[70]; - const char *default_tmp_dir = "/tmp"; - const char *tmp_dir; - ucs::test::init(); - /* skip because logger does not support file * output on valgrind */ @@ -24,16 +24,35 @@ class log_test : public ucs::test { UCS_TEST_SKIP_R("skipping on valgrind"); } + const char *default_tmp_dir = "/tmp"; + + ucs::test::init(); + ucs_log_cleanup(); push_config(); - tmp_dir = getenv("TMPDIR"); + const char *tmp_dir = getenv("TMPDIR"); if (tmp_dir == NULL) { tmp_dir = default_tmp_dir; } - snprintf(logfile, sizeof(logfile), "%s/gtest_ucs_log.%d", tmp_dir, getpid()); - /* coverity[tainted_string] */ - unlink(logfile); - snprintf(ucs_log_spec, sizeof(ucs_log_spec), "file:%s", logfile); + + tmp_dir_path = tmp_dir; + template_name = "gtest_ucs_log." + ucs::to_string(getpid()); + + std::string logfile = template_grep_name = + tmp_dir_path + "/" + template_name; + + /* add date/time to the log file name in order to track how many + * different log files were created during testing */ + logfile += ".%t"; + + /* add `*` to the template grep name to be able searching a test + * string in all possible file names with the time specified */ + template_grep_name += ".*"; + + /* remove already created files with the similar file name */ + log_files_foreach(&log_test::remove_file); + + std::string ucs_log_spec = "file:" + logfile; modify_config("LOG_FILE", ucs_log_spec); modify_config("LOG_LEVEL", "info"); ucs_log_init(); @@ -44,30 +63,117 @@ class log_test : public ucs::test { m_num_log_handlers_before = 0; pop_config(); check_log_file(); - unlink(logfile); + unsigned files_count = log_files_foreach(&log_test::remove_file); + EXPECT_LE(files_count, ucs_global_opts.log_file_rotate + 1); + EXPECT_NE(0, files_count); ucs_log_init(); ucs::test::cleanup(); } + void remove_file(const std::string &name, void *arg) { + unlink(name.c_str()); + } + + typedef void (log_test::*log_file_foreach_cb)(const std::string &name, + void *arg); + + unsigned 
log_files_foreach(log_file_foreach_cb cb, void *arg = NULL) { + DIR *dir = opendir(tmp_dir_path.c_str()); + struct dirent *entry; + unsigned files_count = 0; + + while ((entry = readdir(dir)) != NULL) { + if (strstr(entry->d_name, template_name.c_str()) != NULL) { + std::string full_file_name = tmp_dir_path + "/" + + ucs::to_string(entry->d_name); + (this->*cb)(full_file_name, arg); + files_count++; + } + } + closedir(dir); + + return files_count; + } + + void test_file_cur_size(const std::string &log_file_name, void *arg) { + FILE *logfile_fp = fopen(log_file_name.c_str(), "r"); + ASSERT_TRUE(logfile_fp != NULL); + + ucs_log_flush(); + + int ret = fseek(logfile_fp, 0, SEEK_END); + EXPECT_EQ(0, ret); + + long cur_size = ftell(logfile_fp); + EXPECT_LE(static_cast(cur_size), ucs_global_opts.log_file_size); + + fclose(logfile_fp); + + m_log_files_set.insert(log_file_name); + } + virtual void check_log_file() { - ADD_FAILURE(); + ADD_FAILURE() << read_logfile(); + } + + bool do_grep(const std::string &needle) { + unsigned num_retries = 0; + std::string cmd_str = ""; + std::string system_ret_str = ""; + + while (num_retries++ < GREP_RETRIES) { + /* if this is the last retry, allow printing the grep output */ + std::string grep_cmd = ucs_likely(num_retries != GREP_RETRIES) ? 
+ "grep -q" : "grep"; + cmd_str = grep_cmd + " '" + needle + "' " + template_grep_name; + int ret = system(cmd_str.c_str()); + if (ret == 0) { + return true; + } else { + system_ret_str = "return value: "; + if (ret == -1) { + system_ret_str += ucs::to_string(ret) + + ", errno: " + ucs::to_string(errno); + } else { + system_ret_str += ucs::to_string(WEXITSTATUS(ret)); + } + } + + ucs_log_flush(); + } + + UCS_TEST_MESSAGE << "\"" << cmd_str << "\" failed after " + << num_retries - 1 << " iterations (" + << system_ret_str << ")"; + + return false; } - int do_grep(const char *needle) { - char cmd[128]; + void read_logfile(const std::string &log_file_name, void *arg) { + std::stringstream *ss = (std::stringstream*)arg; + std::ifstream ifs(log_file_name.c_str()); + *ss << log_file_name << ":" << std::endl << ifs.rdbuf() << std::endl; + } - snprintf(cmd, sizeof(cmd), "grep '%s' %s", needle, logfile); - return system(cmd); + std::string read_logfile() { + std::stringstream ss; + log_files_foreach(&log_test::read_logfile, &ss); + return ss.str(); } protected: - char logfile[64]; + std::string template_name; + std::string template_grep_name; + std::string tmp_dir_path; + std::set m_log_files_set; + + static const unsigned GREP_RETRIES = 20; }; class log_test_info : public log_test { virtual void check_log_file() { - if (do_grep("UCX INFO hello world")) { - ADD_FAILURE(); + if (!do_grep("UCX INFO hello world")) { + ADD_FAILURE() << read_logfile(); } } }; @@ -79,15 +185,15 @@ UCS_TEST_F(log_test_info, hello) { class log_test_print : public log_test { virtual void check_log_file() { - if (do_grep("UCX PRINT debug message")) { + if (!do_grep("UCX PRINT debug message")) { if (ucs_global_opts.log_print_enable) { /* not found but it should be there */ - ADD_FAILURE(); + ADD_FAILURE() << read_logfile(); } } else { if (!ucs_global_opts.log_print_enable) { /* found but prints disabled!!! 
*/ - ADD_FAILURE(); + ADD_FAILURE() << read_logfile(); } } } @@ -101,3 +207,112 @@ UCS_TEST_F(log_test_print, print_off) { ucs_print("debug message"); } + +class log_test_file_size : public log_test { +protected: + virtual void check_log_file() { + unsigned files_count = log_files_foreach(&log_test_file_size:: + test_file_cur_size); + EXPECT_LE(files_count, ucs_global_opts.log_file_rotate + 1); + } + + virtual void check_log_file(const std::string &test_str) { + check_log_file(); + EXPECT_TRUE(do_grep(test_str)); + } + + void generate_random_str(std::string &s, size_t len) { + static const char possible_vals[] = + "0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"; + + ASSERT_TRUE(len != 0); + + for (size_t i = 0; i < len; ++i) { + s += possible_vals[ucs::rand() % + (ucs_array_size(possible_vals) - 1)]; + } + } + + void print_random_str() { + size_t entry_size = ucs::rand() % ucs_log_get_buffer_size(); + if (entry_size == 0) { + entry_size = 1; + } + + std::string entry_buf; + + generate_random_str(entry_buf, entry_size); + /* use %s here in order to satisfy the "format-security" compilation + * flag requirements */ + ucs_info("%s", entry_buf.c_str()); + + /* to not waste a lot of time grepping the test string */ + if (entry_size < 128) { + check_log_file(entry_buf); + } else { + check_log_file(); + } + } + + void test_log_file_max_size() { + const unsigned num_iters = 4; + + for (unsigned i = 0; i < num_iters; i++) { + size_t set_size, exp_files_count; + + do { + print_random_str(); + + set_size = m_log_files_set.size(); + exp_files_count = (ucs_global_opts.log_file_rotate + 1); + } while ((set_size == 0) || + ((set_size % exp_files_count) != 0)); + } + + EXPECT_EQ(m_log_files_set.size(), + ucs_global_opts.log_file_rotate + 1); + } +}; + +const std::string small_file_size = ucs::to_string(UCS_ALLOCA_MAX_SIZE); + +UCS_TEST_F(log_test_file_size, small_file, "LOG_FILE_SIZE=" + + small_file_size) { + test_log_file_max_size(); +} + 
+UCS_TEST_F(log_test_file_size, large_file, "LOG_FILE_SIZE=8k") { + test_log_file_max_size(); +} + +UCS_TEST_F(log_test_file_size, small_files, "LOG_FILE_SIZE=" + + small_file_size, + "LOG_FILE_ROTATE=4") { + test_log_file_max_size(); +} + +UCS_TEST_F(log_test_file_size, large_files, "LOG_FILE_SIZE=8k", + "LOG_FILE_ROTATE=4") { + test_log_file_max_size(); +} + + +class log_test_backtrace : public log_test { + virtual void check_log_file() { + if (!do_grep("print_backtrace")) { + ADD_FAILURE() << read_logfile(); + } + +#ifdef HAVE_DETAILED_BACKTRACE + if (!do_grep("main")) { + ADD_FAILURE() << read_logfile(); + } +#endif + } +}; + +UCS_TEST_F(log_test_backtrace, backtrace) { + ucs_log_print_backtrace(UCS_LOG_LEVEL_INFO); +} diff --git a/test/gtest/ucs/test_math.cc b/test/gtest/ucs/test_math.cc index 16cc44aecc8..7fcd417221e 100644 --- a/test/gtest/ucs/test_math.cc +++ b/test/gtest/ucs/test_math.cc @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -201,3 +202,68 @@ UCS_TEST_F(test_math, for_each_bit) { } EXPECT_EQ(UCS_BIT(63), gen_mask); } + +UCS_TEST_F(test_math, linear_func) { + ucs_linear_func_t func[2]; + double x, y[2]; + + /* Generate 2 random functions */ + x = ucs::rand() / (double)RAND_MAX; + for (unsigned i = 0; i < 2; ++i) { + func[i] = ucs_linear_func_make(ucs::rand() / (double)RAND_MAX, + ucs::rand() / (double)RAND_MAX); + y[i] = ucs_linear_func_apply(func[i], x); + } + + /* Add */ + ucs_linear_func_t sum_func = ucs_linear_func_add(func[0], func[1]); + double y_sum = ucs_linear_func_apply(sum_func, x); + EXPECT_NEAR(y[0] + y[1], y_sum, 1e-6); + + /* Add in-place */ + ucs_linear_func_t sum_func_inplace = func[0]; + ucs_linear_func_add_inplace(&sum_func_inplace, func[1]); + double y_sum_inplace = ucs_linear_func_apply(sum_func_inplace, x); + EXPECT_NEAR(y[0] + y[1], y_sum_inplace, 1e-6); + + /* Subtract */ + ucs_linear_func_t diff_func = ucs_linear_func_sub(func[0], func[1]); + double y_diff = ucs_linear_func_apply(diff_func, x); + 
EXPECT_NEAR(y[0] - y[1], y_diff, 1e-6); + + /* Intersect */ + double x_intersect = 0; + ucs_status_t status; + status = ucs_linear_func_intersect(func[0], func[1], &x_intersect); + ASSERT_EQ(UCS_OK, status); + double y_intersect[2]; + for (unsigned i = 0; i < 2; ++i) { + y_intersect[i] = ucs_linear_func_apply(func[i], x_intersect); + } + EXPECT_NEAR(y_intersect[0], y_intersect[1], 1e-6); + + /* Invalid intersect - parallel functions */ + ucs_linear_func_t tmp_func = func[0]; + tmp_func.c = func[0].c + 1.0; + status = ucs_linear_func_intersect(func[0], tmp_func, &x_intersect); + ASSERT_EQ(UCS_ERR_INVALID_PARAM, status); + + /* Invalid intersect - infinite point */ + ucs_linear_func_t tmp_func1 = ucs_linear_func_make(1000, DBL_MIN * 3); + ucs_linear_func_t tmp_func2 = ucs_linear_func_make(2000, DBL_MIN * 2); + status = ucs_linear_func_intersect(tmp_func1, tmp_func2, + &x_intersect); + ASSERT_EQ(UCS_ERR_INVALID_PARAM, status) << x_intersect; + + /* Compose */ + ucs_linear_func_t compose_func = ucs_linear_func_compose(func[0], func[1]); + double y_compose = ucs_linear_func_apply(compose_func, x); + double y_compose_exp = ucs_linear_func_apply(func[0], y[1]); + EXPECT_NEAR(y_compose_exp, y_compose, 1e-6); + + /* Add value of */ + ucs_linear_func_t added_func = func[0]; + ucs_linear_func_add_value_at(&added_func, func[1], x); + double y_added_func = ucs_linear_func_apply(added_func, x); + EXPECT_NEAR(y[0] + y[1], y_added_func, 1e-6); +} diff --git a/test/gtest/ucs/test_memtrack.cc b/test/gtest/ucs/test_memtrack.cc index 830890f7247..7ef5ccd34ef 100644 --- a/test/gtest/ucs/test_memtrack.cc +++ b/test/gtest/ucs/test_memtrack.cc @@ -19,7 +19,7 @@ extern "C" { #include -#if ENABLE_MEMTRACK +#ifdef ENABLE_MEMTRACK class test_memtrack : public ucs::test { protected: @@ -159,12 +159,17 @@ UCS_TEST_F(test_memtrack, sysv) { UCS_TEST_F(test_memtrack, memalign_realloc) { void* ptr; + int ret; - ptr = ucs_memalign(10, ALLOC_SIZE, ALLOC_NAME); + ret = ucs_posix_memalign(&ptr, 8, 
ALLOC_SIZE, ALLOC_NAME); + ASSERT_EQ(0, ret); ASSERT_NE((void *)NULL, ptr); ucs_free(ptr); + /* Silence coverity warning. */ + ptr = NULL; - ptr = ucs_memalign(1000, ALLOC_SIZE, ALLOC_NAME); + ret = ucs_posix_memalign(&ptr, 1024, ALLOC_SIZE, ALLOC_NAME); + ASSERT_EQ(0, ret); ASSERT_NE((void *)NULL, ptr); ptr = ucs_realloc(ptr, 2*ALLOC_SIZE, ALLOC_NAME); diff --git a/test/gtest/ucs/test_memtype_cache.cc b/test/gtest/ucs/test_memtype_cache.cc index eb3dd08ad43..e891db0efc3 100644 --- a/test/gtest/ucs/test_memtype_cache.cc +++ b/test/gtest/ucs/test_memtype_cache.cc @@ -5,76 +5,505 @@ */ #include -#if HAVE_CUDA -#include -#include -#endif -extern "C" { +#include + +#include #include -} +#include +extern "C" { +#include +} -class test_memtype_cache : public ucs::test { +class test_memtype_cache : public ucs::test_with_param { protected: + test_memtype_cache() : m_memtype_cache(NULL) { + } virtual void init() { - ucs_status_t status; - - ucs::test::init(); - status = ucs_memtype_cache_create(&m_memtype_cache); + ucs::test_with_param::init(); + ucs_status_t status = ucs_memtype_cache_create(&m_memtype_cache); ASSERT_UCS_OK(status); } virtual void cleanup() { - ucs_memtype_cache_destroy(m_memtype_cache); - ucs::test::cleanup(); + if (m_memtype_cache != NULL) { + ucs_memtype_cache_destroy(m_memtype_cache); + } + + ucs::test_with_param::cleanup(); + } + + void check_lookup(const void *ptr, size_t size, + bool expect_found, + ucs_memory_type_t expected_type = UCS_MEMORY_TYPE_LAST) const { + if (!size) { + return; + } + + ucs_memory_type_t mem_type; + ucs_status_t status = ucs_memtype_cache_lookup(m_memtype_cache, ptr, + size, &mem_type); + + if (!expect_found || (expected_type == UCS_MEMORY_TYPE_HOST)) { + /* memory type should be not found or unknown */ + EXPECT_TRUE((status == UCS_ERR_NO_ELEM) || + ((status == UCS_OK) && (mem_type == UCS_MEMORY_TYPE_LAST))) + << "ptr=" << ptr << " size=" << size << ": " + << ucs_status_string(status) + << " memtype=" << 
mem_buffer::mem_type_name(mem_type); + } else { + EXPECT_UCS_OK(status); + EXPECT_EQ(expected_type, mem_type) << "ptr=" << ptr << " size=" << size; + } + } + + void test_lookup_found(const void *ptr, size_t size, + ucs_memory_type_t expected_type) const { + check_lookup(ptr, size, true, expected_type); + } + + void test_lookup_notfound(const void *ptr, size_t size) const { + check_lookup(ptr, size, false); + } + + void test_ptr_found(const void *ptr, size_t size, + ucs_memory_type_t expected_type) const { + test_lookup_found(ptr, size, expected_type); + test_lookup_found(ptr, size / 2, expected_type); + test_lookup_found(ptr, 1, expected_type); + test_lookup_found(UCS_PTR_BYTE_OFFSET(ptr, size - 1), + 1, expected_type); + test_lookup_found(ptr, 0, expected_type); + } + + void test_region_found(const mem_buffer &b) const { + test_ptr_found(b.ptr(), b.size(), b.mem_type()); + } + + void test_region_not_found(const mem_buffer &b) const { + test_ptr_not_found(b.ptr(), b.size()); + } + + void test_ptr_not_found(const void *ptr, size_t size) const { + /* memtype cache is aligned by Page Table defined constant, + * so need to step by this value to make something not found */ + test_lookup_notfound(ptr, size + UCS_PGT_ADDR_ALIGN); + test_lookup_notfound(UCS_PTR_BYTE_OFFSET(ptr, size), 1 + UCS_PGT_ADDR_ALIGN); + } + + void test_ptr_released(const void *ptr, size_t size) const { + test_lookup_notfound(ptr, size); + test_lookup_notfound(ptr, 1); + } + + mem_buffer* allocate_mem_buffer(size_t size, ucs_memory_type_t mem_type, + std::vector *allocated_buffers = NULL, + bool test_not_found = true) const { + mem_buffer *buf = new mem_buffer(size, mem_type); + + if (allocated_buffers != NULL) { + allocated_buffers->push_back(buf); + } + + test_region_found(*buf); + + if (test_not_found) { + test_region_not_found(*buf); + } + + return buf; + } + + void release_mem_buffer(mem_buffer *buf, + std::vector > *released_ptrs, + std::vector *allocated_buffers = NULL) const { + if 
(allocated_buffers != NULL) { + allocated_buffers->pop_back(); + } + + released_ptrs->push_back(std::make_pair(buf->ptr(), buf->size())); + + delete buf; + } + + void test_ptrs_released(std::vector > *released_ptrs) const { + while (!released_ptrs->empty()) { + void *ptr = released_ptrs->back().first; + size_t size = released_ptrs->back().second; + + test_ptr_released(ptr, size); + test_ptr_not_found(ptr, size); + + released_ptrs->pop_back(); + } + } + + void release_buffers(std::vector *allocated_buffers) const { + std::vector > released_ptrs; + + while (!allocated_buffers->empty()) { + release_mem_buffer(allocated_buffers->back(), + &released_ptrs, allocated_buffers); + } + + test_ptrs_released(&released_ptrs); + } + + size_t get_test_step(size_t portions = 64) const { + return (RUNNING_ON_VALGRIND ? + (ucs_get_page_size() / 2 - 1) : + (ucs_get_page_size() / portions)); + } + + void test_memtype_cache_alloc_diff_mem_types(bool keep_buffers, + bool same_size_buffers) { + const size_t step = get_test_step(); + const size_t inner_step = (same_size_buffers ? 
+ ucs_get_page_size() : step); + std::vector > released_ptrs; + std::vector allocated_buffers; + + const std::vector supported_mem_types = + mem_buffer::supported_mem_types(); + + /* The tests try to allocate two buffers with different memory types */ + for (std::vector::const_iterator iter = + supported_mem_types.begin(); + iter != supported_mem_types.end(); ++iter) { + for (size_t i = 1; i <= ucs_get_page_size(); i += step) { + mem_buffer *buf1 = allocate_mem_buffer(i, GetParam(), + &allocated_buffers, 0); + + for (size_t j = 1; j <= ucs_get_page_size(); j += inner_step) { + mem_buffer *buf2 = allocate_mem_buffer(j, *iter, + &allocated_buffers, + 0); + if (!keep_buffers) { + release_mem_buffer(buf2, &released_ptrs); + } + } + + if (!keep_buffers) { + release_mem_buffer(buf1, &released_ptrs); + } + } + + if (keep_buffers) { + /* release allocated buffers */ + release_buffers(&allocated_buffers); + } else { + /* test released buffers */ + test_ptrs_released(&released_ptrs); + } + } + } + + struct region_info { + void *start; + void *end; + ucs_memory_type_t mem_type; + + region_info(size_t start, size_t end, + ucs_memory_type_t mem_type) : + start(reinterpret_cast(start)), + end(reinterpret_cast(end)), + mem_type(mem_type) {} + }; + + void generate_test_remove_subintervals( + const std::vector &insert_regions, + size_t interval_start_offset, size_t interval_end_offset, + std::vector &remove_regions) { + // add regions that will be removed as intervals + for (std::vector::const_iterator iter = + insert_regions.begin(); iter != insert_regions.end(); ++iter) { + remove_regions.push_back(region_info(reinterpret_cast(iter->start) + + interval_start_offset, + reinterpret_cast(iter->end) - + interval_end_offset, + UCS_MEMORY_TYPE_LAST)); + } + + // add regions that will be removed as remaining intervals + for (std::vector::const_iterator iter = + insert_regions.begin(); iter != insert_regions.end(); ++iter) { + if (interval_start_offset) { + 
remove_regions.push_back(region_info(reinterpret_cast(iter->start), + reinterpret_cast(iter->start) + + interval_start_offset, + UCS_MEMORY_TYPE_LAST)); + } + + if (interval_end_offset) { + remove_regions.push_back(region_info(reinterpret_cast(iter->end) - + interval_end_offset, + reinterpret_cast(iter->end), + UCS_MEMORY_TYPE_LAST)); + } + } + } + + void test_region_insert_and_remove_subintervals(const std::vector ®ions, + size_t interval_start_offset, + size_t interval_end_offset, + std::vector &remove_regions) { + generate_test_remove_subintervals(regions, interval_start_offset, + interval_end_offset, remove_regions); + + // insert new regions + for (std::vector::const_iterator iter = + regions.begin(); iter != regions.end(); ++iter) { + size_t size = UCS_PTR_BYTE_DIFF(iter->start, iter->end); + memtype_cache_update(iter->start, size, iter->mem_type); + test_ptr_found(iter->start, size, iter->mem_type); + } + + // remove subintervals + for (std::vector::const_iterator iter = + remove_regions.begin(); iter != remove_regions.end(); ++iter) { + size_t size = UCS_PTR_BYTE_DIFF(iter->start, iter->end); + memtype_cache_remove(iter->start, size); + test_ptr_released(iter->start, size); + } + + // now all buffers released, check that can't find them + for (std::vector::const_iterator iter = + regions.begin(); iter != regions.end(); ++iter) { + size_t size = UCS_PTR_BYTE_DIFF(iter->start, iter->end); + test_ptr_released(iter->start, size); + test_ptr_not_found(iter->start, size); + } + } + + void memtype_cache_update(const void *ptr, size_t size, + ucs_memory_type_t mem_type) { + if (mem_type == UCS_MEMORY_TYPE_HOST) { + return; + } + + ucs_memtype_cache_update(m_memtype_cache, ptr, size, mem_type); } + void memtype_cache_update(const mem_buffer &b) { + memtype_cache_update(b.ptr(), b.size(), b.mem_type()); + } + + void memtype_cache_remove(const void *ptr, size_t size) { + ucs_memtype_cache_remove(m_memtype_cache, ptr, size); + } + +private: ucs_memtype_cache_t 
*m_memtype_cache; }; -#if HAVE_CUDA -UCS_TEST_F(test_memtype_cache, basic_cuda) { - cudaError_t cerr; +UCS_TEST_P(test_memtype_cache, basic) { + const size_t size = 64; void *ptr; - ucm_mem_type_t ucm_mem_type; - ucs_status_t status; - - /* set cuda device */ - if (cudaSetDevice(0) != cudaSuccess) { - UCS_TEST_SKIP_R("can't set cuda device"); - } - - cerr = cudaMalloc(&ptr, 64); - EXPECT_EQ(cerr, cudaSuccess); - status = ucs_memtype_cache_lookup(m_memtype_cache, ptr, 64, &ucm_mem_type); - EXPECT_UCS_OK(status); - EXPECT_EQ(ucm_mem_type, UCM_MEM_TYPE_CUDA); - status = ucs_memtype_cache_lookup(m_memtype_cache, ptr, 32, &ucm_mem_type); - EXPECT_UCS_OK(status); - EXPECT_EQ(ucm_mem_type, UCM_MEM_TYPE_CUDA); - status = ucs_memtype_cache_lookup(m_memtype_cache, (void *)((uintptr_t)ptr + 1), 7, &ucm_mem_type); - EXPECT_UCS_OK(status); - EXPECT_EQ(ucm_mem_type, UCM_MEM_TYPE_CUDA); - status = ucs_memtype_cache_lookup(m_memtype_cache, ptr, 1, &ucm_mem_type); - EXPECT_UCS_OK(status); - EXPECT_EQ(ucm_mem_type, UCM_MEM_TYPE_CUDA); - status = ucs_memtype_cache_lookup(m_memtype_cache, (void *)((uintptr_t) ptr + 63), 1, &ucm_mem_type); - EXPECT_UCS_OK(status); - EXPECT_EQ(ucm_mem_type, UCM_MEM_TYPE_CUDA); - status = ucs_memtype_cache_lookup(m_memtype_cache, ptr, 0, &ucm_mem_type); - EXPECT_UCS_OK(status); - EXPECT_EQ(ucm_mem_type, UCM_MEM_TYPE_CUDA); - status = ucs_memtype_cache_lookup(m_memtype_cache, ptr, 65, &ucm_mem_type); - EXPECT_TRUE(status == UCS_ERR_NO_ELEM); - status = ucs_memtype_cache_lookup(m_memtype_cache, (void *)((uintptr_t) ptr + 64), 1, &ucm_mem_type); - EXPECT_TRUE(status == UCS_ERR_NO_ELEM); - - cerr = cudaFree(ptr); - EXPECT_EQ(cerr, cudaSuccess); - status = ucs_memtype_cache_lookup(m_memtype_cache, ptr, 64, &ucm_mem_type); - EXPECT_TRUE(status == UCS_ERR_NO_ELEM); - status = ucs_memtype_cache_lookup(m_memtype_cache, ptr, 1, &ucm_mem_type); - EXPECT_TRUE(status == UCS_ERR_NO_ELEM); + + { + mem_buffer b(size, GetParam()); + + test_region_found(b); + 
test_region_not_found(b); + + ptr = b.ptr(); + } + + /* buffer is released */ + test_ptr_released(ptr, size); + test_ptr_not_found(ptr, size); +} + +UCS_TEST_P(test_memtype_cache, update_non_contig_regions_and_remove_subintervals) { + std::vector insert_regions; + std::vector remove_regions; + size_t start, end; + + const size_t region_size = UCS_BIT(28); + const size_t interval_start_offset = UCS_BIT(27); + + // insert [0x7f6ef0000000 .. 0x7f6f00000000] + start = 0x7f6ef0000000; + end = start + region_size; + test_memtype_cache::region_info region_info1(start, end, GetParam()); + insert_regions.push_back(region_info1); + + // insert [0x7f6f2c021000 .. 0x7f6f3c021000] + start = 0x7f6f2c021000; + end = start + region_size; + test_memtype_cache::region_info region_info2(start, end, + UCS_MEMORY_TYPE_LAST); + insert_regions.push_back(region_info2); + + // insert [0x7f6f42000000 .. 0x7f6f52000000] + start = 0x7f6f42000000; + end = start + region_size; + test_memtype_cache::region_info region_info3(start, end, + UCS_MEMORY_TYPE_LAST); + insert_regions.push_back(region_info3); + + test_region_insert_and_remove_subintervals(insert_regions, + interval_start_offset, + 0, remove_regions); +} + +UCS_TEST_P(test_memtype_cache, update_adjacent_regions_and_remove_subintervals) { + std::vector insert_regions; + std::vector remove_regions; + size_t start, end; + + const size_t region_size = UCS_BIT(28); + const size_t interval_start_offset = UCS_BIT(27); + + // insert [0x7f6ef0000000 .. 0x7f6f00000000] + start = 0x7f6ef0000000; + end = start + region_size; + test_memtype_cache::region_info region_info1(0x7f6ef0000000, 0x7f6f00000000, + GetParam()); + insert_regions.push_back(region_info1); + + // insert [0x7f6f00000000 .. 0x7f6f10000000] + start = end; + end = start + region_size; + test_memtype_cache::region_info region_info2(reinterpret_cast + (region_info1.end), + 0x7f6f40000000, GetParam()); + insert_regions.push_back(region_info2); + + // insert [0x7f6f10000000 .. 
0x7f6f20000000] + start = end; + end = start + region_size; + test_memtype_cache::region_info region_info3(reinterpret_cast + (region_info2.end), + 0x7f6f48000000, GetParam()); + insert_regions.push_back(region_info3); + + test_region_insert_and_remove_subintervals(insert_regions, + interval_start_offset, + 0, remove_regions); +} + +UCS_TEST_P(test_memtype_cache, shared_page_regions) { + const std::vector supported_mem_types = + mem_buffer::supported_mem_types(); + const size_t size = 1000000; + + for (std::vector::const_iterator iter = + supported_mem_types.begin(); + iter != supported_mem_types.end(); ++iter) { + + std::vector > released_ptrs; + + /* Create two buffers that possibly will share one page + * + * < shared page > + * || || + * \/ || + * +----------------------+ || + * buf1: | | | | | | \/ + * +----------------------+----------------------+ + * buf2: | | | | | | + * +----------------------+ + */ + mem_buffer *buf1 = allocate_mem_buffer(size, GetParam()); + mem_buffer *buf2 = allocate_mem_buffer(size, *iter); + + test_region_found(*buf1); + test_region_found(*buf2); + + release_mem_buffer(buf2, &released_ptrs); + + /* check that `buf1` was not released accidentally + * after releasing `buf2` */ + test_region_found(*buf1); + + release_mem_buffer(buf1, &released_ptrs); + + /* buffer `buf1` and `buf2` are released */ + test_ptrs_released(&released_ptrs); + } +} + +UCS_TEST_P(test_memtype_cache, diff_mem_types_same_bufs) { + test_memtype_cache_alloc_diff_mem_types(false, true); +} + +UCS_TEST_P(test_memtype_cache, diff_mem_types_same_bufs_keep_mem) { + test_memtype_cache_alloc_diff_mem_types(true, true); +} + +UCS_TEST_P(test_memtype_cache, diff_mem_types_diff_bufs) { + test_memtype_cache_alloc_diff_mem_types(false, false); +} + +UCS_TEST_P(test_memtype_cache, diff_mem_types_diff_bufs_keep_mem) { + test_memtype_cache_alloc_diff_mem_types(true, false); } -#endif + +INSTANTIATE_TEST_CASE_P(mem_type, test_memtype_cache, + 
::testing::ValuesIn(mem_buffer::supported_mem_types())); + +class test_memtype_cache_deferred_create : public test_memtype_cache { +protected: + virtual void init() { + /* do nothing */ + } + + void test_unknown_region_found(const mem_buffer &b) const { + test_ptr_found(b.ptr(), b.size(), + ((b.mem_type() == UCS_MEMORY_TYPE_HOST) ? + UCS_MEMORY_TYPE_HOST : + UCS_MEMORY_TYPE_LAST)); + } + + void test_alloc_before_init(size_t buf_size, bool test_adjacent, + size_t overlap_size) { + void *ptr; + + { + mem_buffer b(buf_size, GetParam()); + + test_memtype_cache::init(); + + test_unknown_region_found(b); + test_region_not_found(b); + + if (test_adjacent) { + /* add two adjacent regions: */ + memtype_cache_update(b.ptr(), b.size() / 2, b.mem_type()); + test_ptr_found(b.ptr(), b.size() / 2, b.mem_type()); + memtype_cache_update(UCS_PTR_BYTE_OFFSET(b.ptr(), + b.size() / 2 - overlap_size), + b.size() / 2 + 1, b.mem_type()); + test_ptr_found(b.ptr(), b.size() / 2, b.mem_type()); + } else { + memtype_cache_update(b); + } + + /* check that able to find the entire region */ + test_region_found(b); + + ptr = b.ptr(); + } + + /* buffer is released */ + test_ptr_released(ptr, buf_size); + test_ptr_not_found(ptr, buf_size); + } +}; + +UCS_TEST_P(test_memtype_cache_deferred_create, allocate_and_update) { + test_alloc_before_init(1000000, false, 0); +} + +UCS_TEST_P(test_memtype_cache_deferred_create, lookup_adjacent_regions) { + test_alloc_before_init(1000000, true, 0); +} + +UCS_TEST_P(test_memtype_cache_deferred_create, lookup_overlapped_regions) { + test_alloc_before_init(1000000, true, 1); +} + +INSTANTIATE_TEST_CASE_P(mem_type, test_memtype_cache_deferred_create, + ::testing::ValuesIn(mem_buffer::supported_mem_types())); diff --git a/test/gtest/ucs/test_module/configure.m4 b/test/gtest/ucs/test_module/configure.m4 index f59252dd3a7..2e594d2d27b 100644 --- a/test/gtest/ucs/test_module/configure.m4 +++ b/test/gtest/ucs/test_module/configure.m4 @@ -4,5 +4,5 @@ # See file LICENSE 
for terms. # -test_modules+=":module" +test_modules="${test_modules}:module" AC_CONFIG_FILES([test/gtest/ucs/test_module/Makefile]) diff --git a/test/gtest/ucs/test_mpmc.cc b/test/gtest/ucs/test_mpmc.cc index 548075c56bd..6653574ecf2 100644 --- a/test/gtest/ucs/test_mpmc.cc +++ b/test/gtest/ucs/test_mpmc.cc @@ -15,7 +15,7 @@ extern "C" { class test_mpmc : public ucs::test { protected: static const unsigned MPMC_SIZE = 100; - static const uint32_t SENTINEL = 0x7fffffffu; + static const uint64_t SENTINEL = 0x7fffffffu; static const unsigned NUM_THREADS = 4; @@ -44,7 +44,7 @@ class test_mpmc : public ucs::test { static void * consumer_thread_func(void *arg) { ucs_mpmc_queue_t *mpmc = reinterpret_cast(arg); ucs_status_t status; - uint32_t value; + uint64_t value; size_t count; count = 0; @@ -81,7 +81,7 @@ UCS_TEST_F(test_mpmc, basic) { EXPECT_FALSE(ucs_mpmc_queue_is_empty(&mpmc)); - uint32_t value; + uint64_t value; status = ucs_mpmc_queue_pull(&mpmc, &value); ASSERT_UCS_OK(status); diff --git a/test/gtest/ucs/test_mpool.cc b/test/gtest/ucs/test_mpool.cc index d27e841c79f..5d720ab1e31 100644 --- a/test/gtest/ucs/test_mpool.cc +++ b/test/gtest/ucs/test_mpool.cc @@ -26,7 +26,9 @@ class test_mpool : public ucs::test { static ucs_log_func_rc_t mpool_log_handler(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap) + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) { // Ignore errors that invalid input parameters as it is expected if (level == UCS_LOG_LEVEL_ERROR) { diff --git a/test/gtest/ucs/test_pgtable.cc b/test/gtest/ucs/test_pgtable.cc index 3444e290ccb..1a4669a641a 100644 --- a/test/gtest/ucs/test_pgtable.cc +++ b/test/gtest/ucs/test_pgtable.cc @@ -14,7 +14,6 @@ extern "C" { #include #include - class test_pgtable : public ucs::test { protected: @@ -103,6 +102,23 @@ class test_pgtable : public ucs::test { return count; } + void test_search_region(const 
ucs_pgt_region_t ®ion) + { + search_result_t result; + + result = search(region.start, region.end - 1); + EXPECT_EQ(1u, result.size()); + EXPECT_EQ(®ion, result.front()); + + result = search(region.start, region.end); + EXPECT_EQ(1u, result.size()); + EXPECT_EQ(®ion, result.front()); + + result = search(region.start, region.end + 1); + EXPECT_EQ(1u, result.size()); + EXPECT_EQ(®ion, result.front()); + } + private: static ucs_pgt_dir_t *pgd_alloc(const ucs_pgtable_t *pgtable) { return new ucs_pgt_dir_t; @@ -142,7 +158,7 @@ UCS_TEST_F(test_pgtable, basic) { EXPECT_EQ(®ion, lookup(0x4033ff)); EXPECT_TRUE(NULL == lookup(0x403400)); EXPECT_TRUE(NULL == lookup(0x0)); - EXPECT_TRUE(NULL == lookup(-1)); + EXPECT_TRUE(NULL == lookup(std::numeric_limits::max())); EXPECT_EQ(1u, num_regions()); remove(®ion); @@ -168,14 +184,15 @@ UCS_TEST_F(test_pgtable, lookup_adjacent) { UCS_TEST_F(test_pgtable, multi_search) { for (int count = 0; count < 10; ++count) { ucs::ptr_vector regions; - ucs_pgt_addr_t min = ULONG_MAX; + ucs_pgt_addr_t min = std::numeric_limits::max(); ucs_pgt_addr_t max = 0; /* generate random regions */ unsigned num_regions = 0; for (int i = 0; i < 200 / ucs::test_time_multiplier(); ++i) { ucs_pgt_addr_t start = (ucs::rand() & 0x7fffffff) << 24; - size_t size = ucs_min((size_t)ucs::rand(), ULONG_MAX - start); + size_t size = ucs_min((size_t)ucs::rand(), + std::numeric_limits::max() - start); ucs_pgt_addr_t end = start + ucs_align_down(size, UCS_PGT_ADDR_ALIGN); if (count_overlap(regions, start, end)) { /* Make sure regions do not overlap */ @@ -211,11 +228,8 @@ UCS_TEST_F(test_pgtable, multi_search) { } } -UCS_TEST_F(test_pgtable, invalid_param) { - if (UCS_PGT_ADDR_ALIGN == 1) { - UCS_TEST_SKIP; - } - +UCS_TEST_SKIP_COND_F(test_pgtable, invalid_param, + (UCS_PGT_ADDR_ALIGN == 1)) { ucs_pgt_region_t region1 = {0x4000, 0x4001}; insert(®ion1, UCS_ERR_INVALID_PARAM); @@ -262,13 +276,135 @@ UCS_TEST_F(test_pgtable, search_large_region) { ucs_pgt_region_t region = 
{0x3c03cb00, 0x3c03f600}; insert(®ion, UCS_OK); - search_result_t result = search(0x36990000, 0x3c810000); + search_result_t result; + + result = search(0x36990000, 0x3c810000); EXPECT_EQ(1u, result.size()); EXPECT_EQ(®ion, result.front()); + result = search(region.start - 1, region.start); + EXPECT_EQ(1u, result.size()); + + result = search(region.start, region.start + 1); + EXPECT_EQ(1u, result.size()); + EXPECT_EQ(®ion, result.front()); + + result = search(region.end - 1, region.end); + EXPECT_EQ(1u, result.size()); + EXPECT_EQ(®ion, result.front()); + + result = search(region.end, region.end + 1); + EXPECT_EQ(0u, result.size()); + remove(®ion); } +UCS_TEST_F(test_pgtable, search_non_contig_regions) { + const size_t region_size = UCS_BIT(28); + size_t start, end; + + // insert [0x7f6ef0000000 .. 0x7f6f00000000] + start = 0x7f6ef0000000; + end = start + region_size; + ucs_pgt_region_t region1 = {start, end}; + insert(®ion1, UCS_OK); + + // insert [0x7f6f2c021000 .. 0x7f6f3c021000] + start = 0x7f6f2c021000; + end = start + region_size; + ucs_pgt_region_t region2 = {start, end}; + insert(®ion2, UCS_OK); + + // insert [0x7f6f42000000 .. 0x7f6f52000000] + start = 0x7f6f42000000; + end = start + region_size; + ucs_pgt_region_t region3 = {start, end}; + insert(®ion3, UCS_OK); + + search_result_t result; + + // search the 1st region + test_search_region(region1); + + // search the 2nd region + test_search_region(region2); + + // search the 3rd region + test_search_region(region3); + + remove(®ion1); + remove(®ion2); + remove(®ion3); +} + +UCS_TEST_F(test_pgtable, search_adjacent_regions) { + const size_t region_size = UCS_BIT(28); + size_t start, end; + + // insert [0x7f6ef0000000 .. 0x7f6f00000000] + start = 0x7f6ef0000000; + end = start + region_size; + ucs_pgt_region_t region1 = {start, end}; + insert(®ion1, UCS_OK); + + // insert [0x7f6f00000000 .. 
0x7f6f10000000] + start = end; + end = start + region_size; + ucs_pgt_region_t region2 = {region1.end, 0x7f6f40000000}; + insert(®ion2, UCS_OK); + + // insert [0x7f6f10000000 .. 0x7f6f20000000] + start = end; + end = start + region_size; + ucs_pgt_region_t region3 = {region2.end, 0x7f6f48000000}; + insert(®ion3, UCS_OK); + + search_result_t result; + + // search the 1st region + result = search(region1.start, region1.end - 1); + EXPECT_EQ(1u, result.size()); + EXPECT_EQ(®ion1, result.front()); + + result = search(region1.start, region1.end); + EXPECT_EQ(2u, result.size()); + EXPECT_EQ(®ion1, result.front()); + + result = search(region1.start, region1.end + 1); + EXPECT_EQ(2u, result.size()); + EXPECT_EQ(®ion1, result.front()); + + // search the 2nd region + result = search(region2.start, region2.end - 1); + EXPECT_EQ(1u, result.size()); + EXPECT_EQ(®ion2, result.front()); + + result = search(region2.start, region2.end); + EXPECT_EQ(2u, result.size()); + EXPECT_EQ(®ion2, result.front()); + + result = search(region2.start, region2.end + 1); + EXPECT_EQ(2u, result.size()); + EXPECT_EQ(®ion2, result.front()); + + // search the 3rd region + result = search(region3.start, region3.end - 1); + EXPECT_EQ(1u, result.size()); + EXPECT_EQ(®ion3, result.front()); + + result = search(region3.start, region3.end); + EXPECT_EQ(1u, result.size()); + EXPECT_EQ(®ion3, result.front()); + + result = search(region3.start, region3.end + 1); + EXPECT_EQ(1u, result.size()); + EXPECT_EQ(®ion3, result.front()); + + remove(®ion1); + remove(®ion2); + remove(®ion3); +} + class test_pgtable_perf : public test_pgtable { protected: @@ -444,10 +580,8 @@ UCS_TEST_F(test_pgtable_perf, basic) { purge(); } -UCS_TEST_F(test_pgtable_perf, workloads) { - if (ucs::test_time_multiplier() != 1) { - UCS_TEST_SKIP; - } +UCS_TEST_SKIP_COND_F(test_pgtable_perf, workloads, + (ucs::test_time_multiplier() != 1)) { measure_workload(UCS_MASK(28), 1024, @@ -478,4 +612,3 @@ UCS_TEST_F(test_pgtable_perf, workloads) { 
false, 0.8); } - diff --git a/test/gtest/ucs/test_profile.cc b/test/gtest/ucs/test_profile.cc index 11b9b9c7a95..4ca1ce34481 100644 --- a/test/gtest/ucs/test_profile.cc +++ b/test/gtest/ucs/test_profile.cc @@ -11,11 +11,10 @@ extern "C" { #include } +#include #include -#include - -#if HAVE_PROFILING +#ifdef HAVE_PROFILING class scoped_profile { public: @@ -23,6 +22,7 @@ class scoped_profile { const char *mode) : m_test(test), m_file_name(file_name) { ucs_profile_global_cleanup(); + ucs_profile_reset_locations(); m_test.push_config(); m_test.modify_config("PROFILE_MODE", mode); m_test.modify_config("PROFILE_FILE", m_file_name.c_str()); @@ -47,18 +47,47 @@ class scoped_profile { const std::string m_file_name; }; -class test_profile : public ucs::test { +class test_profile : public testing::TestWithParam, + public ucs::test_base { public: - static const char* UCS_PROFILE_FILENAME; - static const int MIN_LINE; - static const int MAX_LINE; + test_profile(); + ~test_profile(); - void test_header(ucs_profile_header_t *hdr, unsigned exp_mode); - void test_locations(ucs_profile_location_t *locations, unsigned num_locations, - uint64_t exp_count); -}; + UCS_TEST_BASE_IMPL; + +protected: + static const int MIN_LINE; + static const int MAX_LINE; + static const unsigned NUM_LOCAITONS; + static const char* PROFILE_FILENAME; + + + std::set m_tids; + pthread_spinlock_t m_tids_lock; + + struct thread_param { + test_profile *test; + int iters; + }; -const char* test_profile::UCS_PROFILE_FILENAME = "test.prof"; + void add_tid(int tid); + + static void *profile_thread_func(void *arg); + + int num_threads() const; + + void run_profiled_code(int num_iters); + + void test_header(const ucs_profile_header_t *hdr, unsigned exp_mode, + const void **ptr); + void test_locations(const ucs_profile_location_t *locations, + unsigned num_locations, const void **ptr); + void test_thread_locations(const ucs_profile_thread_header_t *thread_hdr, + unsigned num_locations, uint64_t exp_count, + unsigned 
exp_num_records, const void **ptr); + + void do_test(unsigned int_mode, const std::string& str_mode); +}; static int sum(int a, int b) { @@ -84,27 +113,106 @@ UCS_PROFILE_FUNC(int, profile_test_func2, (a, b), int a, int b) return UCS_PROFILE_CALL(sum, a, b); } -const int test_profile::MAX_LINE = __LINE__; +const int test_profile::MAX_LINE = __LINE__; +const unsigned test_profile::NUM_LOCAITONS = 12u; +const char* test_profile::PROFILE_FILENAME = "test.prof"; -void test_profile::test_header(ucs_profile_header_t *hdr, unsigned exp_mode) +test_profile::test_profile() { + pthread_spin_init(&m_tids_lock, 0); +} + +test_profile::~test_profile() +{ + pthread_spin_destroy(&m_tids_lock); +} + +void test_profile::add_tid(int tid) +{ + pthread_spin_lock(&m_tids_lock); + m_tids.insert(tid); + pthread_spin_unlock(&m_tids_lock); +} + +void *test_profile::profile_thread_func(void *arg) +{ + const thread_param *param = (const thread_param*)arg; + + param->test->add_tid(ucs_get_tid()); + + for (int i = 0; i < param->iters; ++i) { + profile_test_func1(); + profile_test_func2(1, 2); + } + + return NULL; +} + +int test_profile::num_threads() const +{ + return GetParam(); +} + +void test_profile::run_profiled_code(int num_iters) +{ + int ret; + thread_param param; + + param.iters = num_iters; + param.test = this; + + if (num_threads() == 1) { + profile_thread_func(¶m); + } else { + std::vector threads; + + for (int i = 0; i < num_threads(); ++i) { + pthread_t profile_thread; + ret = pthread_create(&profile_thread, NULL, profile_thread_func, + (void*)¶m); + if (ret < 0) { + ADD_FAILURE() << "pthread_create failed: " << strerror(errno); + break; + } + + threads.push_back(profile_thread); + } + + while (!threads.empty()) { + void *result; + ret = pthread_join(threads.back(), &result); + if (ret < 0) { + ADD_FAILURE() << "pthread_join failed: " << strerror(errno); + } + + threads.pop_back(); + } + } +} + +void test_profile::test_header(const ucs_profile_header_t *hdr, unsigned exp_mode, + 
const void **ptr) +{ + EXPECT_EQ(UCS_PROFILE_FILE_VERSION, hdr->version); EXPECT_EQ(std::string(ucs_get_host_name()), std::string(hdr->hostname)); EXPECT_EQ(getpid(), (pid_t)hdr->pid); EXPECT_EQ(exp_mode, hdr->mode); + EXPECT_EQ(NUM_LOCAITONS, hdr->num_locations); + EXPECT_EQ((uint32_t)num_threads(), hdr->num_threads); EXPECT_NEAR(hdr->one_second / ucs_time_from_sec(1.0), 1.0, 0.01); + + *ptr = hdr + 1; } -void test_profile::test_locations(ucs_profile_location_t *locations, - unsigned num_locations, uint64_t exp_count) +void test_profile::test_locations(const ucs_profile_location_t *locations, + unsigned num_locations, const void **ptr) { std::set loc_names; for (unsigned i = 0; i < num_locations; ++i) { - ucs_profile_location_t *loc = &locations[i]; + const ucs_profile_location_t *loc = &locations[i]; EXPECT_EQ(std::string(basename(__FILE__)), std::string(loc->file)); EXPECT_GE(loc->line, MIN_LINE); EXPECT_LE(loc->line, MAX_LINE); - EXPECT_LT(loc->total_time, ucs_time_from_sec(1.0) * ucs::test_time_multiplier()); - EXPECT_EQ(exp_count, locations[i].count); loc_names.insert(loc->name); } @@ -115,59 +223,176 @@ void test_profile::test_locations(ucs_profile_location_t *locations, EXPECT_NE(loc_names.end(), loc_names.find("sum")); EXPECT_NE(loc_names.end(), loc_names.find("allocate")); EXPECT_NE(loc_names.end(), loc_names.find("work")); + + *ptr = locations + num_locations; } -UCS_TEST_F(test_profile, accum) { - scoped_profile p(*this, UCS_PROFILE_FILENAME, "accum"); - profile_test_func1(); - profile_test_func2(1, 2); +void test_profile::test_thread_locations( + const ucs_profile_thread_header_t *thread_hdr, + unsigned num_locations, uint64_t exp_count, + unsigned exp_num_records, const void **ptr) +{ + const ucs_profile_thread_location_t *loc; - std::string data = p.read(); - ucs_profile_header_t *hdr = reinterpret_cast(&data[0]); - test_header(hdr, UCS_BIT(UCS_PROFILE_MODE_ACCUM)); + EXPECT_NE(m_tids.end(), m_tids.find(thread_hdr->tid)); + EXPECT_EQ(exp_num_records, 
thread_hdr->num_records); + + EXPECT_LE(thread_hdr->end_time, ucs_get_time()); + EXPECT_LE(thread_hdr->start_time, thread_hdr->end_time); + EXPECT_LE(thread_hdr->end_time - thread_hdr->start_time, + ucs_time_from_sec(1.0) * ucs::test_time_multiplier() * (1 + exp_count)); - EXPECT_EQ(12u, hdr->num_locations); - test_locations(reinterpret_cast(hdr + 1), - hdr->num_locations, - 1); + for (unsigned i = 0; i < num_locations; ++i) { + loc = &reinterpret_cast + (thread_hdr + 1)[i]; + EXPECT_EQ(exp_count, loc->count); + EXPECT_LE(loc->total_time, + ucs_time_from_sec(1.0) * ucs::test_time_multiplier() * exp_count); + } - EXPECT_EQ(0u, hdr->num_records); + *ptr = reinterpret_cast(thread_hdr + 1) + + num_locations; } -UCS_TEST_F(test_profile, log) { - static const int ITER = 3; - scoped_profile p(*this, UCS_PROFILE_FILENAME, "log"); - for (int i = 0; i < ITER; ++i) { - profile_test_func1(); - profile_test_func2(1, 2); - } +void test_profile::do_test(unsigned int_mode, const std::string& str_mode) +{ + const int ITER = 5; + uint64_t exp_count = (int_mode & UCS_BIT(UCS_PROFILE_MODE_ACCUM)) ? + ITER : 0; + uint64_t exp_num_records = (int_mode & UCS_BIT(UCS_PROFILE_MODE_LOG)) ? 
+ (NUM_LOCAITONS * ITER) : 0; + + + scoped_profile p(*this, PROFILE_FILENAME, str_mode.c_str()); + run_profiled_code(ITER); std::string data = p.read(); - ucs_profile_header_t *hdr = reinterpret_cast(&data[0]); - test_header(hdr, UCS_BIT(UCS_PROFILE_MODE_LOG)); - - EXPECT_EQ(12u, hdr->num_locations); - ucs_profile_location_t *locations = reinterpret_cast(hdr + 1); - test_locations(locations, hdr->num_locations, 0); - - EXPECT_EQ(12 * ITER, (int)hdr->num_records); - ucs_profile_record_t *records = reinterpret_cast(locations + - hdr->num_locations); - uint64_t prev_ts = records[0].timestamp; - for (uint64_t i = 0; i < hdr->num_records; ++i) { - ucs_profile_record_t *rec = &records[i]; - EXPECT_GE(rec->location, 0u); - EXPECT_LT(rec->location, 12u); - EXPECT_GE(rec->timestamp, prev_ts); - prev_ts = rec->timestamp; - ucs_profile_location_t *loc = &locations[rec->location]; - if ((loc->type == UCS_PROFILE_TYPE_REQUEST_NEW) || - (loc->type == UCS_PROFILE_TYPE_REQUEST_EVENT) || - (loc->type == UCS_PROFILE_TYPE_REQUEST_FREE)) - { - EXPECT_EQ((uintptr_t)&test_request, rec->param64); + const void *ptr = &data[0]; + + /* Read and test file header */ + const ucs_profile_header_t *hdr = + reinterpret_cast(ptr); + test_header(hdr, int_mode, &ptr); + + /* Read and test global locations */ + const ucs_profile_location_t *locations = + reinterpret_cast(ptr); + test_locations(locations, hdr->num_locations, &ptr); + + /* Read and test threads */ + for (int i = 0; i < num_threads(); ++i) { + const ucs_profile_thread_header_t *thread_hdr = + reinterpret_cast(ptr); + + test_thread_locations(thread_hdr, hdr->num_locations, exp_count, + exp_num_records, &ptr); + + const ucs_profile_record_t *records = + reinterpret_cast(ptr); + uint64_t prev_ts = records[0].timestamp; + for (uint64_t i = 0; i < thread_hdr->num_records; ++i) { + const ucs_profile_record_t *rec = &records[i]; + + /* test location index */ + EXPECT_GE(rec->location, 0u); + EXPECT_LT(rec->location, uint32_t(NUM_LOCAITONS)); + 
+ /* test timestamp */ + EXPECT_GE(rec->timestamp, prev_ts); + prev_ts = rec->timestamp; + + /* test param64 */ + const ucs_profile_location_t *loc = &locations[rec->location]; + if ((loc->type == UCS_PROFILE_TYPE_REQUEST_NEW) || + (loc->type == UCS_PROFILE_TYPE_REQUEST_EVENT) || + (loc->type == UCS_PROFILE_TYPE_REQUEST_FREE)) + { + EXPECT_EQ((uintptr_t)&test_request, rec->param64); + } + } + + ptr = records + thread_hdr->num_records; + } + + EXPECT_EQ(&data[data.size()], ptr) << data.size(); +} + +UCS_TEST_P(test_profile, accum) { + do_test(UCS_BIT(UCS_PROFILE_MODE_ACCUM), "accum"); +} + +UCS_TEST_P(test_profile, log) { + do_test(UCS_BIT(UCS_PROFILE_MODE_LOG), "log"); +} + +UCS_TEST_P(test_profile, log_accum) { + do_test(UCS_BIT(UCS_PROFILE_MODE_LOG) | UCS_BIT(UCS_PROFILE_MODE_ACCUM), + "log,accum"); +} + +INSTANTIATE_TEST_CASE_P(st, test_profile, ::testing::Values(1)); +INSTANTIATE_TEST_CASE_P(mt, test_profile, ::testing::Values(2, 4, 8)); + +class test_profile_perf : public test_profile { +}; + +UCS_TEST_SKIP_COND_P(test_profile_perf, overhead, RUNNING_ON_VALGRIND) { + +#if defined(__x86_64__) || defined(__powerpc64__) + const double EXP_OVERHEAD_NSEC = 100.0; +#else + const double EXP_OVERHEAD_NSEC = 150.0; +#endif + const int ITERS = 100; + const int WARMUP_ITERS = 5; + const int COUNT = 100000; + double overhead_nsec = 0.0; + + scoped_profile p(*this, PROFILE_FILENAME, "accum"); + + for (int retry = 0; retry < (ucs::perf_retry_count + 1); ++retry) { + ucs_time_t time_profile_on = 0; + ucs_time_t time_profile_off = 0; + + for (int i = 0; i < WARMUP_ITERS + ITERS; ++i) { + ucs_time_t t; + + t = ucs_get_time(); + for (volatile int j = 0; j < COUNT;) { + ++j; + } + if (i > WARMUP_ITERS) { + time_profile_off += ucs_get_time() - t; + } + + t = ucs_get_time(); + for (volatile int j = 0; j < COUNT;) { + UCS_PROFILE_CODE("test") { + ++j; + } + } + if (i > WARMUP_ITERS) { + time_profile_on += ucs_get_time() - t; + } + } + + overhead_nsec = 
ucs_time_to_nsec(time_profile_on - time_profile_off) / + COUNT / ITERS; + UCS_TEST_MESSAGE << "overhead: " << overhead_nsec << " nsec"; + + if (!ucs::perf_retry_count) { + UCS_TEST_MESSAGE << "not validating performance"; + return; /* Success */ + } else if (overhead_nsec < EXP_OVERHEAD_NSEC) { + return; /* Success */ + } else { + ucs::safe_sleep(ucs::perf_retry_interval); } } + + EXPECT_LT(overhead_nsec, EXP_OVERHEAD_NSEC) << "Profiling overhead is too high"; } +INSTANTIATE_TEST_CASE_P(st, test_profile_perf, ::testing::Values(1)); + #endif diff --git a/test/gtest/ucs/test_rcache.cc b/test/gtest/ucs/test_rcache.cc index 3fa041608dd..f8da479b97f 100644 --- a/test/gtest/ucs/test_rcache.cc +++ b/test/gtest/ucs/test_rcache.cc @@ -14,6 +14,35 @@ extern "C" { #include #include } +#include + + +class test_rcache_basic : public ucs::test { +}; + +UCS_TEST_F(test_rcache_basic, create_fail) { + static const ucs_rcache_ops_t ops = { + NULL, NULL, NULL + }; + ucs_rcache_params_t params = { + sizeof(ucs_rcache_region_t), + UCS_PGT_ADDR_ALIGN, + ucs_get_page_size(), + UCS_BIT(30), /* non-existing event */ + 1000, + &ops, + NULL, + 0 + }; + + ucs_rcache_t *rcache; + ucs_status_t status = ucs_rcache_create(¶ms, "test", + ucs_stats_get_root(), &rcache); + EXPECT_NE(UCS_OK, status); /* should fail */ + if (status == UCS_OK) { + ucs_rcache_destroy(rcache); + } +} class test_rcache : public ucs::test { @@ -42,10 +71,11 @@ class test_rcache : public ucs::test { UCM_EVENT_VM_UNMAPPED, 1000, &ops, - reinterpret_cast(this) + reinterpret_cast(this), + 0 }; - UCS_TEST_CREATE_HANDLE(ucs_rcache_t*, m_rcache, ucs_rcache_destroy, - ucs_rcache_create, ¶ms, "test", ucs_stats_get_root()); + UCS_TEST_CREATE_HANDLE_IF_SUPPORTED(ucs_rcache_t*, m_rcache, ucs_rcache_destroy, + ucs_rcache_create, ¶ms, "test", ucs_stats_get_root()); } virtual void cleanup() { @@ -96,7 +126,7 @@ class test_rcache : public ucs::test { region->super.super.end - region->super.super.start); EXPECT_EQ(uint32_t(MAGIC), 
region->magic); region->magic = 0; - uint32_t prev = ucs_atomic_fadd32(&m_reg_count, -1); + uint32_t prev = ucs_atomic_fsub32(&m_reg_count, 1); EXPECT_GT(prev, 0u); } @@ -175,7 +205,7 @@ static uintptr_t virt_to_phys(uintptr_t address) fd = open(pagemap_file, O_RDONLY); if (fd < 0) { ucs_error("failed to open %s: %m", pagemap_file); - pa = -1; + pa = std::numeric_limits::max(); goto out; } @@ -183,7 +213,7 @@ static uintptr_t virt_to_phys(uintptr_t address) ret = lseek(fd, offset, SEEK_SET); if (ret != offset) { ucs_error("failed to seek in %s to offset %zu: %m", pagemap_file, offset); - pa = -1; + pa = std::numeric_limits::max(); goto out_close; } @@ -191,7 +221,7 @@ static uintptr_t virt_to_phys(uintptr_t address) if (ret != sizeof(entry)) { ucs_error("read from %s at offset %zu returned %ld: %m", pagemap_file, offset, ret); - pa = -1; + pa = std::numeric_limits::max(); goto out_close; } @@ -199,7 +229,7 @@ static uintptr_t virt_to_phys(uintptr_t address) pfn = entry & ((1ULL << 54) - 1); pa = (pfn * page_size) | (address & (page_size - 1)); } else { - pa = -1; /* Page not present */ + pa = std::numeric_limits::max(); /* Page not present */ } out_close: @@ -535,7 +565,9 @@ class test_rcache_no_register : public test_rcache { static ucs_log_func_rc_t log_handler(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap) + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) { /* Ignore warnings about empty memory pool */ if ((level == UCS_LOG_LEVEL_WARN) && strstr(message, "failed to register")) { @@ -608,7 +640,7 @@ UCS_MT_TEST_F(test_rcache_no_register, merge_invalid_prot_slow, 5) munmap(mem, size1+size2); } -#if ENABLE_STATS +#ifdef ENABLE_STATS class test_rcache_stats : public test_rcache { protected: @@ -630,7 +662,7 @@ class test_rcache_stats : public test_rcache { } int get_counter(int stat) { - return (int)UCS_STATS_GET_COUNTER(m_rcache.get()->stats, 
stat); + return (int)UCS_STATS_GET_COUNTER(m_rcache->stats, stat); } /* a helper function for stats tests debugging */ @@ -688,11 +720,37 @@ UCS_TEST_F(test_rcache_stats, unmap_dereg) { r1 = get(mem, size1); put(r1); + /* Should generate umap event and invalidate the memory */ + munmap(mem, size1); + EXPECT_EQ(1, get_counter(UCS_RCACHE_UNMAP_INVALIDATES)); + + /* when doing another rcache operation, the region is actually destroyed */ + mem = alloc_pages(size1, PROT_READ|PROT_WRITE); + r1 = get(mem, size1); + put(r1); + EXPECT_GE(get_counter(UCS_RCACHE_UNMAPS), 1); + EXPECT_EQ(1, get_counter(UCS_RCACHE_DEREGS)); + + /* cleanup */ + munmap(mem, size1); +} + +UCS_TEST_F(test_rcache_stats, unmap_dereg_with_lock) { + static const size_t size1 = 1024 * 1024; + void *mem = alloc_pages(size1, PROT_READ|PROT_WRITE); + region *r1; + + r1 = get(mem, size1); + put(r1); + /* Should generate umap event but no dereg or unmap invalidation. * We can have more unmap events if releasing the region structure triggers * releasing memory back to the OS. 
*/ + pthread_rwlock_wrlock(&m_rcache->pgt_lock); munmap(mem, size1); + pthread_rwlock_unlock(&m_rcache->pgt_lock); + EXPECT_GE(get_counter(UCS_RCACHE_UNMAPS), 1); EXPECT_EQ(0, get_counter(UCS_RCACHE_UNMAP_INVALIDATES)); EXPECT_EQ(0, get_counter(UCS_RCACHE_DEREGS)); @@ -742,8 +800,11 @@ UCS_TEST_F(test_rcache_stats, hits_slow) { mem2 = alloc_pages(size1, PROT_READ|PROT_WRITE); r1 = get(mem2, size1); - /* generate unmap event */ + /* generate unmap event under lock, to roce using invalidation queue */ + pthread_rwlock_rdlock(&m_rcache->pgt_lock); munmap(mem1, size1); + pthread_rwlock_unlock(&m_rcache->pgt_lock); + EXPECT_EQ(1, get_counter(UCS_RCACHE_UNMAPS)); EXPECT_EQ(2, get_counter(UCS_RCACHE_GETS)); @@ -770,3 +831,81 @@ UCS_TEST_F(test_rcache_stats, hits_slow) { munmap(mem2, size1); } #endif + + +class test_rcache_pfn : public ucs::test { +public: + void test_pfn(void *address, unsigned page_num) + { + pfn_enum_t ctx; + ucs_status_t status; + + ctx.page_num = page_num; + status = ucs_sys_enum_pfn((uintptr_t)address, + page_num, enum_pfn_cb, &ctx); + ASSERT_UCS_OK(status); + /* we expect that we got exact page_num PFN calls */ + ASSERT_EQ(page_num, ctx.page.size()); + ASSERT_EQ(page_num, ctx.pfn.size()); + } + +protected: + typedef std::set page_set_t; + typedef std::set pfn_set_t; + typedef struct { + unsigned page_num; + page_set_t page; + pfn_set_t pfn; + } pfn_enum_t; + + static void enum_pfn_cb(unsigned page_num, unsigned long pfn, void *ctx) + { + pfn_enum_t *data = (pfn_enum_t*)ctx; + + EXPECT_LT(page_num, data->page_num); + /* we expect that every page will have a unique page_num and a + * unique PFN */ + EXPECT_EQ(data->pfn.end(), data->pfn.find(pfn)); + EXPECT_EQ(data->page.end(), data->page.find(page_num)); + data->pfn.insert(pfn); + data->page.insert(page_num); + } +}; + +UCS_TEST_F(test_rcache_pfn, enum_pfn) { + const int MAX_PAGE_NUM = 1024 * 100; /* 400Mb max buffer */ + size_t page_size = ucs_get_page_size(); + void *region; + unsigned i; + size_t 
len; + unsigned long pfn; + ucs_status_t status; + + /* stack page could not be mapped into zero region, if we get 0 here it + * means the kernel does not provide PFNs */ + status = ucs_sys_get_pfn((uintptr_t)&pfn, 1, &pfn); + ASSERT_UCS_OK(status); + if (pfn == 0) { + /* stack page could not be mapped into zero region */ + UCS_TEST_SKIP_R("PFN is not supported"); + } + + /* initialize stream here to avoid incorrect debug output */ + ucs::detail::message_stream ms("PAGES"); + + for (i = 1; i < MAX_PAGE_NUM; i *= 2) { + len = page_size * i; + ms << i << " "; + region = mmap(NULL, len, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_TRUE(region != MAP_FAILED); + memset(region, 0, len); /* ensure that pages are mapped */ + /* test region aligned by page size */ + test_pfn(region, i); + if (i > 1) { /* test pfn on mid-of-page address */ + test_pfn(UCS_PTR_BYTE_OFFSET(region, page_size / 2), i - 1); + } + + munmap(region, len); + } +} diff --git a/test/gtest/ucs/test_sock.cc b/test/gtest/ucs/test_sock.cc index a8fe6c4900c..3a0c5f70126 100644 --- a/test/gtest/ucs/test_sock.cc +++ b/test/gtest/ucs/test_sock.cc @@ -11,6 +11,7 @@ extern "C" { #include + static std::string socket_err_exp_str; class test_socket : public ucs::test { @@ -19,7 +20,9 @@ class test_socket : public ucs::test { static ucs_log_func_rc_t socket_error_handler(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap) + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) { // Ignore errors that invalid input parameters as it is expected if (level == UCS_LOG_LEVEL_ERROR) { @@ -48,14 +51,14 @@ UCS_TEST_F(test_socket, sockaddr_sizeof) { /* Check with wrong IPv4 */ { size = 0; - EXPECT_EQ(UCS_OK, ucs_sockaddr_sizeof((const struct sockaddr*)&sa_in, &size)); + EXPECT_UCS_OK(ucs_sockaddr_sizeof((const struct sockaddr*)&sa_in, &size)); EXPECT_EQ(sizeof(struct sockaddr_in), size); } 
/* Check with wrong IPv6 */ { size = 0; - EXPECT_EQ(UCS_OK, ucs_sockaddr_sizeof((const struct sockaddr*)&sa_in6, &size)); + EXPECT_UCS_OK(ucs_sockaddr_sizeof((const struct sockaddr*)&sa_in6, &size)); EXPECT_EQ(sizeof(struct sockaddr_in6), size); } @@ -73,11 +76,11 @@ UCS_TEST_F(test_socket, sockaddr_sizeof) { } UCS_TEST_F(test_socket, sockaddr_get_port) { - const unsigned sin_port = 5555; + const uint16_t sin_port = 5555; struct sockaddr_in sa_in; struct sockaddr_in6 sa_in6; struct sockaddr_un sa_un; - unsigned port = 0; + uint16_t port = 0; sa_in.sin_family = AF_INET; sa_in.sin_port = htons(sin_port); @@ -88,14 +91,14 @@ UCS_TEST_F(test_socket, sockaddr_get_port) { /* Check with wrong IPv4 */ { port = 0; - EXPECT_EQ(UCS_OK, ucs_sockaddr_get_port((const struct sockaddr*)&sa_in, &port)); + EXPECT_UCS_OK(ucs_sockaddr_get_port((const struct sockaddr*)&sa_in, &port)); EXPECT_EQ(sin_port, port); } /* Check with wrong IPv6 */ { port = 0; - EXPECT_EQ(UCS_OK, ucs_sockaddr_get_port((const struct sockaddr*)&sa_in6, &port)); + EXPECT_UCS_OK(ucs_sockaddr_get_port((const struct sockaddr*)&sa_in6, &port)); EXPECT_EQ(sin_port, port); } @@ -152,7 +155,7 @@ UCS_TEST_F(test_socket, sockaddr_get_inet_addr) { } UCS_TEST_F(test_socket, sockaddr_str) { - const unsigned port = 65534; + const uint16_t port = 65534; const char *ipv4_addr = "192.168.122.157"; const char *ipv6_addr = "fe80::218:e7ff:fe16:fb97"; struct sockaddr_in sa_in; @@ -260,3 +263,162 @@ UCS_TEST_F(test_socket, socket_setopt) { close(fd); } + +static void sockaddr_cmp_test(int sa_family, const char *ip_addr1, + const char *ip_addr2, unsigned port1, + unsigned port2, struct sockaddr *sa1, + struct sockaddr *sa2) +{ + int cmp_res1, cmp_res2; + ucs_status_t status; + + sa1->sa_family = sa_family; + sa2->sa_family = sa_family; + + inet_pton(sa_family, ip_addr1, + const_cast(ucs_sockaddr_get_inet_addr(sa1))); + inet_pton(sa_family, ip_addr2, + const_cast(ucs_sockaddr_get_inet_addr(sa2))); + + status = 
ucs_sockaddr_set_port(sa1, port1); + ASSERT_UCS_OK(status); + status = ucs_sockaddr_set_port(sa2, port2); + ASSERT_UCS_OK(status); + + const void *addr1 = ucs_sockaddr_get_inet_addr(sa1); + const void *addr2 = ucs_sockaddr_get_inet_addr(sa2); + + ASSERT_TRUE(addr1 != NULL); + ASSERT_TRUE(addr2 != NULL); + + size_t addr_size = ((sa_family == AF_INET) ? + sizeof(UCS_SOCKET_INET_ADDR(sa1)) : + sizeof(UCS_SOCKET_INET6_ADDR(sa1))); + + // `sa1` vs `sa2` + { + int addr_cmp_res = memcmp(addr1, addr2, addr_size); + int port_cmp_res = + (port1 == port2) ? 0 : ((port1 < port2) ? -1 : 1); + int expected_cmp_res = + addr_cmp_res ? addr_cmp_res : port_cmp_res; + + cmp_res1 = ucs_sockaddr_cmp(sa1, sa2, &status); + EXPECT_UCS_OK(status); + EXPECT_EQ(expected_cmp_res, cmp_res1); + + // Call w/o `status` provided + cmp_res2 = ucs_sockaddr_cmp(sa1, sa2, &status); + EXPECT_EQ(cmp_res1, cmp_res2); + } + + // `sa2` vs `sa1` + { + int addr_cmp_res = memcmp(addr2, addr1, addr_size); + int port_cmp_res = + (port2 == port1) ? 0 : ((port2 < port1) ? -1 : 1); + int expected_cmp_res = + addr_cmp_res ? 
addr_cmp_res : port_cmp_res; + + cmp_res1 = ucs_sockaddr_cmp(sa2, sa1, &status); + EXPECT_UCS_OK(status); + EXPECT_EQ(expected_cmp_res, cmp_res1); + + // Call w/o `status` provided + cmp_res2 = ucs_sockaddr_cmp(sa2, sa1, &status); + EXPECT_EQ(cmp_res1, cmp_res2); + } +} + +UCS_TEST_F(test_socket, sockaddr_cmp) { + const unsigned port1 = 65534; + const unsigned port2 = 65533; + const char *ipv4_addr1 = "192.168.122.157"; + const char *ipv4_addr2 = "192.168.123.157"; + const char *ipv6_addr1 = "fe80::218:e7ff:fe16:fb97"; + const char *ipv6_addr2 = "fe80::219:e7ff:fe16:fb97"; + struct sockaddr_in sa_in_1 = { 0 }; + struct sockaddr_in sa_in_2 = { 0 }; + struct sockaddr_in6 sa_in6_1 = { 0 }; + struct sockaddr_in6 sa_in6_2 = { 0 }; + + // Same addresses; same ports + sockaddr_cmp_test(AF_INET, ipv4_addr1, ipv4_addr1, + port1, port1, + (struct sockaddr*)&sa_in_1, + (struct sockaddr*)&sa_in_2); + sockaddr_cmp_test(AF_INET6, ipv6_addr1, ipv6_addr1, + port1, port1, + (struct sockaddr*)&sa_in6_1, + (struct sockaddr*)&sa_in6_2); + + // Same addresses; different ports + sockaddr_cmp_test(AF_INET, ipv4_addr1, ipv4_addr1, + port1, port2, + (struct sockaddr*)&sa_in_1, + (struct sockaddr*)&sa_in_2); + sockaddr_cmp_test(AF_INET6, ipv6_addr1, ipv6_addr1, + port1, port2, + (struct sockaddr*)&sa_in6_1, + (struct sockaddr*)&sa_in6_2); + + // Different addresses; same ports + sockaddr_cmp_test(AF_INET, ipv4_addr1, ipv4_addr2, + port1, port1, + (struct sockaddr*)&sa_in_1, + (struct sockaddr*)&sa_in_2); + sockaddr_cmp_test(AF_INET6, ipv6_addr1, ipv6_addr2, + port1, port1, + (struct sockaddr*)&sa_in6_1, + (struct sockaddr*)&sa_in6_2); + + // Different addresses; different ports + sockaddr_cmp_test(AF_INET, ipv4_addr1, ipv4_addr2, + port1, port2, + (struct sockaddr*)&sa_in_1, + (struct sockaddr*)&sa_in_2); + sockaddr_cmp_test(AF_INET6, ipv6_addr1, ipv6_addr2, + port1, port2, + (struct sockaddr*)&sa_in6_1, + (struct sockaddr*)&sa_in6_2); +} + +static void sockaddr_cmp_err_test(const struct 
sockaddr *sa1, + const struct sockaddr *sa2) +{ + ucs_status_t status; + int result; + + result = ucs_sockaddr_cmp((const struct sockaddr*)sa1, + (const struct sockaddr*)sa2, + &status); + EXPECT_EQ(UCS_ERR_INVALID_PARAM, status); + EXPECT_TRUE(result > 0); + + // Call w/o `status` provided + result = ucs_sockaddr_cmp((const struct sockaddr*)sa1, + (const struct sockaddr*)sa2, + NULL); + EXPECT_TRUE(result > 0); +} + +UCS_TEST_F(test_socket, sockaddr_cmp_err) { + // Check with wrong sa_family + struct sockaddr_un sa_un; + struct sockaddr_in sa_in; + + sa_un.sun_family = AF_UNIX; + sa_in.sin_family = AF_INET; + + socket_err_exp_str = "unknown address family: "; + scoped_log_handler log_handler(socket_error_handler); + + sockaddr_cmp_err_test((const struct sockaddr*)&sa_un, + (const struct sockaddr*)&sa_un); + + sockaddr_cmp_err_test((const struct sockaddr*)&sa_in, + (const struct sockaddr*)&sa_un); + + sockaddr_cmp_err_test((const struct sockaddr*)&sa_un, + (const struct sockaddr*)&sa_in); +} diff --git a/test/gtest/ucs/test_stats.cc b/test/gtest/ucs/test_stats.cc index d29dfa05cbd..354cf4e8588 100644 --- a/test/gtest/ucs/test_stats.cc +++ b/test/gtest/ucs/test_stats.cc @@ -12,7 +12,7 @@ extern "C" { #include #include -#if ENABLE_STATS +#ifdef ENABLE_STATS #define NUM_DATA_NODES 20 class stats_test : public ucs::test { diff --git a/test/gtest/ucs/test_stats_filter.cc b/test/gtest/ucs/test_stats_filter.cc index 833412a32c2..48609f3a040 100644 --- a/test/gtest/ucs/test_stats_filter.cc +++ b/test/gtest/ucs/test_stats_filter.cc @@ -14,7 +14,7 @@ extern "C" { #include #include -#if ENABLE_STATS +#ifdef ENABLE_STATS class stats_filter_test : public ucs::test { public: diff --git a/test/gtest/ucs/test_string.cc b/test/gtest/ucs/test_string.cc index 9e3e39aedd3..55c5d027bd1 100644 --- a/test/gtest/ucs/test_string.cc +++ b/test/gtest/ucs/test_string.cc @@ -6,6 +6,8 @@ #include extern "C" { +#include +#include #include } @@ -19,3 +21,85 @@ UCS_TEST_F(test_string, trim) { char 
str2[] = " foo foo "; EXPECT_EQ("foo foo", std::string(ucs_strtrim(str2))); } + +class test_string_buffer : public ucs::test { +}; + + +UCS_TEST_F(test_string_buffer, appendf) { + ucs_string_buffer_t strb; + + ucs_string_buffer_init(&strb); + + ucs_string_buffer_appendf(&strb, "%s", "We,"); + ucs_string_buffer_appendf(&strb, "%s", " Created,"); + ucs_string_buffer_appendf(&strb, "%s-%s", " The", "Monster"); + + EXPECT_EQ("We, Created, The-Monster", + std::string(ucs_string_buffer_cstr(&strb))); + + ucs_string_buffer_cleanup(&strb); +} + +UCS_TEST_F(test_string_buffer, append_long) { + ucs_string_buffer_t strb; + std::string str, exp_str; + + str.resize(100); + std::fill(str.begin(), str.end(), 'e'); + + ucs_string_buffer_init(&strb); + + for (unsigned i = 0; i < 10; ++i) { + ucs_string_buffer_appendf(&strb, "%s", str.c_str()); + exp_str += str; + EXPECT_EQ(exp_str.c_str(), std::string(ucs_string_buffer_cstr(&strb))); + } + + ucs_string_buffer_cleanup(&strb); +} + +UCS_TEST_F(test_string_buffer, rtrim) { + static const char *test_string = "wabbalubbadabdab"; + ucs_string_buffer_t strb; + + ucs_string_buffer_init(&strb); + ucs_string_buffer_appendf(&strb, "%s%s", test_string, ",,"); + ucs_string_buffer_rtrim(&strb, ","); + EXPECT_EQ(std::string(test_string), ucs_string_buffer_cstr(&strb)); + ucs_string_buffer_cleanup(&strb); + + ucs_string_buffer_init(&strb); + ucs_string_buffer_appendf(&strb, "%s%s", test_string, " \t \n \r "); + ucs_string_buffer_rtrim(&strb, NULL); + EXPECT_EQ(std::string(test_string), ucs_string_buffer_cstr(&strb)); + ucs_string_buffer_cleanup(&strb); +} + +class test_string_set : public ucs::test { +}; + +UCS_TEST_F(test_string_set, add) { + ucs_string_set_t sset; + + ucs_string_set_init(&sset); + + ucs_string_set_add(&sset, "We"); + ucs_string_set_addf(&sset, "%s", "Created"); + ucs_string_set_addf(&sset, "%s-%s", "The", "Monster"); + + EXPECT_TRUE (ucs_string_set_contains(&sset, "We")); + EXPECT_FALSE(ucs_string_set_contains(&sset, "Created 
")); + EXPECT_TRUE (ucs_string_set_contains(&sset, "Created")); + + ucs_string_buffer_t strb; + ucs_string_buffer_init(&strb); + ucs_string_set_print_sorted(&sset, &strb, ","); + + EXPECT_EQ("Created,The-Monster,We", + std::string(ucs_string_buffer_cstr(&strb))); + + ucs_string_buffer_cleanup(&strb); + + ucs_string_set_cleanup(&sset); +} diff --git a/test/gtest/ucs/test_sys.cc b/test/gtest/ucs/test_sys.cc index 1538b6eaf71..dffcfd59311 100644 --- a/test/gtest/ucs/test_sys.cc +++ b/test/gtest/ucs/test_sys.cc @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2012. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -11,6 +11,7 @@ extern "C" { #include #include #include +#include } #include @@ -28,6 +29,17 @@ class test_sys : public ucs::test { ucs_memunits_to_str(size, buf, sizeof(buf)); EXPECT_EQ(std::string(expected), buf); } + + static void check_cache_type(ucs_cpu_cache_type_t type, const char *name) + { + size_t cache; + char memunits[32]; + + cache = ucs_cpu_get_cache_size(type); + + ucs_memunits_to_str(cache, memunits, sizeof(memunits)); + UCS_TEST_MESSAGE << name << " cache: " << memunits; + } }; UCS_TEST_F(test_sys, uuid) { @@ -46,25 +58,26 @@ UCS_TEST_F(test_sys, machine_guid) { } UCS_TEST_F(test_sys, spinlock) { - ucs_spinlock_t lock; + ucs_recursive_spinlock_t lock; pthread_t self; self = pthread_self(); - ucs_spinlock_init(&lock); + ucs_recursive_spinlock_init(&lock, 0); - ucs_spin_lock(&lock); - EXPECT_TRUE(ucs_spin_is_owner(&lock, self)); + ucs_recursive_spin_lock(&lock); + EXPECT_TRUE(ucs_recursive_spin_is_owner(&lock, self)); /* coverity[double_lock] */ - ucs_spin_lock(&lock); - EXPECT_TRUE(ucs_spin_is_owner(&lock, self)); + ucs_recursive_spin_lock(&lock); + EXPECT_TRUE(ucs_recursive_spin_is_owner(&lock, self)); - ucs_spin_unlock(&lock); - EXPECT_TRUE(ucs_spin_is_owner(&lock, self)); + ucs_recursive_spin_unlock(&lock); + 
EXPECT_TRUE(ucs_recursive_spin_is_owner(&lock, self)); - ucs_spin_unlock(&lock); - EXPECT_FALSE(ucs_spin_is_owner(&lock, self)); + /* coverity[double_unlock] */ + ucs_recursive_spin_unlock(&lock); + EXPECT_FALSE(ucs_recursive_spin_is_owner(&lock, self)); } UCS_TEST_F(test_sys, get_mem_prot) { @@ -134,5 +147,12 @@ UCS_TEST_F(test_sys, memunits_to_str) { test_memunits(UCS_GBYTE, "1G"); test_memunits(2 * UCS_GBYTE, "2G"); test_memunits(UCS_TBYTE, "1T"); - test_memunits(UCS_TBYTE * 1024, "1024T"); + test_memunits(UCS_TBYTE * 1024, "1P"); +} + +UCS_TEST_F(test_sys, cpu_cache) { + check_cache_type(UCS_CPU_CACHE_L1d, "L1d"); + check_cache_type(UCS_CPU_CACHE_L1i, "L1i"); + check_cache_type(UCS_CPU_CACHE_L2, "L2"); + check_cache_type(UCS_CPU_CACHE_L3, "L3"); } diff --git a/test/gtest/ucs/test_time.cc b/test/gtest/ucs/test_time.cc index 6491aa090df..36072609191 100644 --- a/test/gtest/ucs/test_time.cc +++ b/test/gtest/ucs/test_time.cc @@ -18,18 +18,15 @@ class test_time : public ucs::test { UCS_TEST_F(test_time, time_calc) { double value = ucs::rand() % UCS_USEC_PER_SEC; - EXPECT_NEAR(value * 1000, ucs_time_to_msec(ucs_time_from_sec (value)), 0.000001); - EXPECT_NEAR(value * 1000, ucs_time_to_usec(ucs_time_from_msec(value)), 0.001); - EXPECT_NEAR(value * 1000, ucs_time_to_nsec(ucs_time_from_usec(value)), 10.0); + EXPECT_NEAR(value * 1000ull, ucs_time_to_msec(ucs_time_from_sec (value)), 0.000001); + EXPECT_NEAR(value * 1000ull, ucs_time_to_usec(ucs_time_from_msec(value)), 0.01); + EXPECT_NEAR(value * 1000ull, ucs_time_to_nsec(ucs_time_from_usec(value)), 20.0); } /* This test is only useful when used with high-precision timers */ #if HAVE_HW_TIMER -UCS_TEST_F(test_time, get_time) { - if (ucs::test_time_multiplier() > 1) { - UCS_TEST_SKIP; - } - +UCS_TEST_SKIP_COND_F(test_time, get_time, + (ucs::test_time_multiplier() > 1)) { ucs_time_t time1 = ucs_get_time(); ucs_time_t time2 = ucs_get_time(); EXPECT_GE(time2, time1); @@ -63,7 +60,6 @@ UCS_TEST_F(test_time, timerq) { 
ucs_timer_queue_t timerq; ucs_status_t status; - ::srand(::time(NULL)); for (unsigned test_count = 0; test_count < 500; ++test_count) { const ucs_time_t interval1 = (ucs::rand() % 20) + 1; diff --git a/test/gtest/ucs/test_topo.cc b/test/gtest/ucs/test_topo.cc new file mode 100644 index 00000000000..61b5fafc782 --- /dev/null +++ b/test/gtest/ucs/test_topo.cc @@ -0,0 +1,47 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#include +extern "C" { +#include +} + +class test_topo : public ucs::test { +}; + +UCS_TEST_F(test_topo, find_device_by_bus_id) { + ucs_status_t status; + ucs_sys_device_t dev1; + ucs_sys_device_t dev2; + ucs_sys_bus_id_t dummy_bus_id; + + dummy_bus_id.domain = 0xffff; + dummy_bus_id.bus = 0xff; + dummy_bus_id.slot = 0xff; + dummy_bus_id.function = 1; + + status = ucs_topo_find_device_by_bus_id(&dummy_bus_id, &dev1); + ASSERT_UCS_OK(status); + + dummy_bus_id.function = 2; + + status = ucs_topo_find_device_by_bus_id(&dummy_bus_id, &dev2); + ASSERT_UCS_OK(status); + ASSERT_EQ(dev2, ((unsigned)dev1 + 1)); +} + +UCS_TEST_F(test_topo, get_distance) { + ucs_status_t status; + ucs_sys_dev_distance_t distance; + + status = ucs_topo_get_distance(UCS_SYS_DEVICE_ID_UNKNOWN, + UCS_SYS_DEVICE_ID_UNKNOWN, &distance); + ASSERT_EQ(UCS_ERR_IO_ERROR, status); +} + +UCS_TEST_F(test_topo, print_info) { + ucs_topo_print_info(NULL); +} diff --git a/test/gtest/ucs/test_twheel.cc b/test/gtest/ucs/test_twheel.cc index ecef1c2c183..3c8b89a175a 100644 --- a/test/gtest/ucs/test_twheel.cc +++ b/test/gtest/ucs/test_twheel.cc @@ -49,7 +49,6 @@ void twheel::init() { ucs_twheel_init(&m_wheel, ucs_time_from_usec(32) * ucs::test_time_multiplier(), ucs_get_time()); - ::srand(::time(NULL)); } void twheel::cleanup() @@ -126,9 +125,8 @@ void twheel::set_timer_delta(struct hr_timer *t, int how) #define N_LOOPS 20 -UCS_TEST_F(twheel, precision_single) { - UCS_TEST_SKIP; // Test is broken - 
+UCS_TEST_SKIP_COND_F(twheel, precision_single, true) { + // Test is broken #if 0 struct hr_timer t; ucs_time_t now; @@ -158,12 +156,10 @@ UCS_TEST_F(twheel, precision_single) { #define N_TIMERS 10000 -UCS_TEST_F(twheel, precision_multi) { - std::vector t(N_TIMERS); - - UCS_TEST_SKIP; // Test is broken - +UCS_TEST_SKIP_COND_F(twheel, precision_multi, true) { + // Test is broken #if 0 + std::vector t(N_TIMERS); ucs_time_t start, now, eps; init_timerv(&t[0], N_TIMERS); for (int i = 0; i < N_TIMERS; i++) { @@ -206,14 +202,11 @@ UCS_TEST_F(twheel, add_twice) { } -UCS_TEST_F(twheel, add_overflow) { - - UCS_TEST_SKIP; // Test is broken - +UCS_TEST_SKIP_COND_F(twheel, add_overflow, true) { + // Test is broken #if 0 struct hr_timer t; init_timer(&t, 0); - ::srand(::time(NULL)); t.total_time = 0; set_timer_delta(&t, -2); diff --git a/test/gtest/ucs/test_type.cc b/test/gtest/ucs/test_type.cc index 1f8790f2096..333fea488fb 100644 --- a/test/gtest/ucs/test_type.cc +++ b/test/gtest/ucs/test_type.cc @@ -8,6 +8,7 @@ extern "C" { #include #include +#include } #include @@ -40,6 +41,13 @@ UCS_TEST_F(test_type, cpu_set) { EXPECT_EQ(0, ucs_cpu_set_find_lcs(&cpu_mask)); } +UCS_TEST_F(test_type, status) { + void *ptr = (void*)0xff00000000ul; + EXPECT_TRUE(UCS_PTR_IS_PTR(ptr)); + EXPECT_FALSE(UCS_PTR_IS_PTR(NULL)); + EXPECT_NE(UCS_OK, UCS_PTR_STATUS(ptr)); +} + class test_init_once: public test_type { protected: test_init_once() : m_once(INIT_ONCE_INIT), m_count(0) {}; @@ -52,7 +60,7 @@ class test_init_once: public test_type { static const ucs_init_once_t INIT_ONCE_INIT; }; -const ucs_init_once_t test_init_once::INIT_ONCE_INIT = UCS_INIT_ONCE_INIITIALIZER; +const ucs_init_once_t test_init_once::INIT_ONCE_INIT = UCS_INIT_ONCE_INITIALIZER; UCS_MT_TEST_F(test_init_once, init_once, 10) { diff --git a/test/gtest/uct/ib/test_cq_moderation.cc b/test/gtest/uct/ib/test_cq_moderation.cc index 4aee357a367..90d65479ce5 100644 --- a/test/gtest/uct/ib/test_cq_moderation.cc +++ 
b/test/gtest/uct/ib/test_cq_moderation.cc @@ -4,46 +4,53 @@ * See file LICENSE for terms. */ -extern "C" { -#include -} #include #include #include #include -static const unsigned nsec_per_usec = (UCS_NSEC_PER_SEC / UCS_USEC_PER_SEC); - -/* wait for 3 usecs to get statistics */ -static const unsigned long test_period = (3ul * UCS_USEC_PER_SEC); -static const unsigned moderation_period = 1000; /* usecs */ -static const unsigned event_limit = (40 * test_period / moderation_period / nsec_per_usec); -static const unsigned max_repeats = 1000; +/* wait for 1 sec to get statistics */ +static const unsigned long test_period_usec = (1ul * UCS_USEC_PER_SEC); +static const unsigned moderation_period_usec = 1000; /* usecs */ +/* use multiplier 2 because we have same iface to send/recv which may produce 2x events */ +static const unsigned event_limit = (2 * test_period_usec / moderation_period_usec); +static const unsigned max_repeats = 60; /* max 3 minutes per test */ class test_uct_cq_moderation : public uct_test { protected: void init() { - uct_test::init(); - if (RUNNING_ON_VALGRIND) { UCS_TEST_SKIP_R("skipping on valgrind"); } - set_config("IB_TX_CQ_MODERATION=1"); - if ((GetParam()->tl_name == "rc") || (GetParam()->tl_name == "rc_mlx5") || - (GetParam()->tl_name == "dc") || (GetParam()->tl_name == "dc_mlx5")) { + if (!has_rc() && !has_ud()) { + UCS_TEST_SKIP_R("unsupported"); + } + + uct_test::init(); + + if (has_rc()) { set_config("RC_FC_ENABLE=n"); } - set_config(std::string("IB_TX_EVENT_MOD_PERIOD=") + ucs::to_string(moderation_period) + "us"); - set_config(std::string("IB_RX_EVENT_MOD_PERIOD=") + ucs::to_string(moderation_period) + "us"); + set_config(std::string("IB_TX_EVENT_MOD_PERIOD=") + + ucs::to_string(moderation_period_usec) + "us"); + set_config(std::string("IB_RX_EVENT_MOD_PERIOD=") + + ucs::to_string(moderation_period_usec) + "us"); - m_sender = uct_test::create_entity(0); + m_sender = uct_test::create_entity(0, NULL, NULL, NULL, NULL, NULL, + 
send_async_event_handler, this); m_entities.push_back(m_sender); - m_receiver = uct_test::create_entity(0); + m_receiver = uct_test::create_entity(0, NULL, NULL, NULL, NULL, NULL, + recv_async_event_handler, this); m_entities.push_back(m_receiver); + + check_skip_test(); + + m_send_async_event_ctx.wait_for_event(*m_sender, 0); + m_recv_async_event_ctx.wait_for_event(*m_receiver, 0); } void connect() { @@ -59,24 +66,26 @@ class test_uct_cq_moderation : public uct_test { m_sender->destroy_ep(0); } - void iface_arm(uct_iface_h iface) { - struct pollfd pfd; - int fd; + static void send_async_event_handler(void *arg, unsigned flags) { + test_uct_cq_moderation *self = static_cast(arg); + self->m_send_async_event_ctx.signal(); + } + + static void recv_async_event_handler(void *arg, unsigned flags) { + test_uct_cq_moderation *self = static_cast(arg); + self->m_recv_async_event_ctx.signal(); + } + void iface_arm(entity &test_e, async_event_ctx &ctx) { /* wait for all messages are arrived */ while (m_recv < m_send) { progress(); } - uct_iface_event_fd_get(iface, &fd); - - pfd.fd = fd; - pfd.events = POLLIN; - do { /* arm all event types */ while (1) { - if (uct_iface_event_arm(iface, + if (uct_iface_event_arm(test_e.iface(), UCT_EVENT_SEND_COMP | UCT_EVENT_RECV | UCT_EVENT_RECV_SIG) != UCS_ERR_BUSY) { @@ -84,8 +93,8 @@ class test_uct_cq_moderation : public uct_test { } progress(); } - /* repeat till FD is in active state */ - } while (poll(&pfd, 1, 0) > 0); + /* repeat till there are events */ + } while (ctx.wait_for_event(test_e, 0)); } static ucs_status_t am_cb(void *arg, void *data, size_t len, unsigned flags) { @@ -97,26 +106,22 @@ class test_uct_cq_moderation : public uct_test { return UCS_OK; } - void run_test(uct_iface_h iface); + void run_test(entity &test_e, async_event_ctx &ctx); entity * m_sender; entity * m_receiver; unsigned m_send; unsigned m_recv; + + uct_test::async_event_ctx m_send_async_event_ctx; + uct_test::async_event_ctx m_recv_async_event_ctx; }; -void 
test_uct_cq_moderation::run_test(uct_iface_h iface) { - unsigned events; - int fd; - unsigned i; - int polled; - struct pollfd pfd; +void test_uct_cq_moderation::run_test(entity &test_e, async_event_ctx &ctx) { + unsigned events, i; ucs_status_t status; - check_caps(UCT_IFACE_FLAG_EVENT_SEND_COMP); - check_caps(UCT_IFACE_FLAG_EVENT_RECV); - uct_iface_set_am_handler(m_receiver->iface(), 0, am_cb, this, 0); connect(); @@ -124,23 +129,19 @@ void test_uct_cq_moderation::run_test(uct_iface_h iface) { m_send = 0; m_recv = 0; - uct_iface_event_fd_get(iface, &fd); - pfd.fd = fd; - pfd.events = POLLIN; - /* repeat test till at least one iteration is successful * to exclude random fluctuations */ for (i = 0; i < max_repeats; i++) { events = 0; - iface_arm(iface); + iface_arm(test_e, ctx); ucs_time_t tm = ucs_get_time(); - while ((ucs_get_time() - tm) < test_period) { - polled = poll(&pfd, 1, 0); - if (polled > 0) { + while ((ucs_time_to_usec(ucs_get_time()) - + ucs_time_to_usec(tm)) < test_period_usec) { + if (ctx.wait_for_event(test_e, 0)) { events++; - iface_arm(iface); + iface_arm(test_e, ctx); } do { @@ -151,6 +152,8 @@ void test_uct_cq_moderation::run_test(uct_iface_h iface) { ASSERT_UCS_OK(status); } m_sender->flush(); + UCS_TEST_MESSAGE << "iteration: " << i + 1 << ", events: " << events + << ", limit: " << event_limit; if (events <= event_limit) { break; } @@ -161,12 +164,14 @@ void test_uct_cq_moderation::run_test(uct_iface_h iface) { EXPECT_LE(events, event_limit); } -UCS_TEST_P(test_uct_cq_moderation, send_period) { - run_test(m_sender->iface()); +UCS_TEST_SKIP_COND_P(test_uct_cq_moderation, send_period, + !check_event_caps(UCT_IFACE_FLAG_EVENT_SEND_COMP)) { + run_test(*m_sender, m_send_async_event_ctx); } -UCS_TEST_P(test_uct_cq_moderation, recv_period) { - run_test(m_receiver->iface()); +UCS_TEST_SKIP_COND_P(test_uct_cq_moderation, recv_period, + !check_event_caps(UCT_IFACE_FLAG_EVENT_RECV)) { + run_test(*m_receiver, m_recv_async_event_ctx); } #if 
HAVE_DECL_IBV_EXP_CQ_MODERATION diff --git a/test/gtest/uct/ib/test_dc.cc b/test/gtest/uct/ib/test_dc.cc index b296fbe28af..e1b5b48a92c 100644 --- a/test/gtest/uct/ib/test_dc.cc +++ b/test/gtest/uct/ib/test_dc.cc @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2016. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016.All rights reserved. * See file LICENSE for terms. @@ -61,7 +61,9 @@ class test_dc : public test_rc { static ucs_log_func_rc_t log_ep_destroy(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap) + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) { if (level != UCS_LOG_LEVEL_WARN) { /* debug messages are ignored */ @@ -133,10 +135,8 @@ class test_dc : public test_rc { { struct dcs_pending *preq = (struct dcs_pending *)uct_req; uct_dc_mlx5_ep_t *ep; - uct_dc_mlx5_iface_t *iface; - ep = dc_ep(preq->e, 0); - iface = dc_iface(preq->e); + ep = dc_ep(preq->e, 0); EXPECT_NE(UCT_DC_MLX5_EP_NO_DCI, ep->dci); @@ -144,20 +144,14 @@ class test_dc : public test_rc { * operation still stands on pending */ preq->is_done = 1; - iface->super.super.tx.cq_available = 0; return UCS_INPROGRESS; } static void purge_cb(uct_pending_req_t *uct_req, void *arg) { struct dcs_pending *preq = (struct dcs_pending *)uct_req; - uct_dc_mlx5_ep_t *ep; - uct_dc_mlx5_iface_t *iface; - ep = dc_ep(preq->e, 0); - iface = dc_iface(preq->e); - EXPECT_NE(UCT_DC_MLX5_EP_NO_DCI, ep->dci); - iface->super.super.tx.cq_available = 8; + EXPECT_NE(UCT_DC_MLX5_EP_NO_DCI, dc_ep(preq->e, 0)->dci); } static void purge_count_cb(uct_pending_req_t *uct_req, void *arg) @@ -239,11 +233,9 @@ UCS_TEST_P(test_dc, dcs_multi) { */ UCS_TEST_P(test_dc, dcs_ep_destroy) { - ucs_status_t status; uct_dc_mlx5_ep_t *ep; uct_dc_mlx5_iface_t *iface; - 
ucs_log_push_handler(log_ep_destroy); UCS_TEST_SCOPE_EXIT() { ucs_log_pop_handler(); } UCS_TEST_SCOPE_EXIT_END @@ -252,8 +244,7 @@ UCS_TEST_P(test_dc, dcs_ep_destroy) { iface = dc_iface(m_e1); n_warnings = 0; EXPECT_EQ(UCT_DC_MLX5_EP_NO_DCI, ep->dci); - status = uct_ep_am_short(m_e1->ep(0), 0, 0, NULL, 0); - EXPECT_UCS_OK(status); + send_am_messages(m_e1, 2, UCS_OK); /* dci 0 must be assigned to the ep */ EXPECT_EQ(iface->tx.dcis_stack[0], ep->dci); EXPECT_EQ(1, iface->tx.stack_top); @@ -299,25 +290,24 @@ UCS_TEST_P(test_dc, dcs_ep_flush_destroy) { EXPECT_EQ(0, iface->tx.stack_top); } -/* Check that flushing ep from pending releases dci */ -UCS_TEST_P(test_dc, dcs_ep_flush_pending) { - +UCS_TEST_P(test_dc, dcs_ep_flush_pending, "DC_NUM_DCI=1") { ucs_status_t status; uct_dc_mlx5_iface_t *iface; m_e1->connect_to_iface(0, *m_e2); m_e1->connect_to_iface(1, *m_e2); - /* use all iface resources */ iface = dc_iface(m_e1); - iface->super.super.tx.cq_available = 8; + + /* shorten test time by reducing dci QP resources */ + iface->tx.dcis[0].txqp.available = 8; do { status = uct_ep_am_short(m_e1->ep(1), 0, 0, NULL, 0); } while (status == UCS_OK); EXPECT_EQ(UCS_ERR_NO_RESOURCE, status); - /* flush another ep. Flush fails because there are no cqes */ + /* flush another ep. 
Flush fails because there is no free dci */ status = uct_ep_flush(m_e1->ep(0), 0, NULL); EXPECT_EQ(UCS_ERR_NO_RESOURCE, status); @@ -328,6 +318,9 @@ UCS_TEST_P(test_dc, dcs_ep_flush_pending) { status = uct_ep_pending_add(m_e1->ep(0), &preq.uct_req, 0); EXPECT_UCS_OK(status); + status = uct_ep_am_short(m_e1->ep(0), 0, 0, NULL, 0); + EXPECT_EQ(UCS_ERR_NO_RESOURCE, status); + /* progress till ep is flushed */ do { progress(); @@ -336,95 +329,47 @@ UCS_TEST_P(test_dc, dcs_ep_flush_pending) { /* flush the other active ep */ flush(); - /* check that ep does not hold dci */ - EXPECT_EQ(0, iface->tx.stack_top); -} - -/* Check that the following sequnce works ok: - * - Add some pending request to DCI wait queue - * - Try to send something from this ep. This will force ep to take free DCI - * (the send will not succeed anyway) - * - Progress all pendings - * - Make sure that there is no any assertion and everyting is ok - * (just send something). - * */ -UCS_TEST_P(test_dc, dcs_ep_am_pending) { - - ucs_status_t status; - uct_dc_mlx5_iface_t *iface; - - m_e1->connect_to_iface(0, *m_e2); - m_e1->connect_to_iface(1, *m_e2); - - /* use all iface resources */ - iface = dc_iface(m_e1); - iface->super.super.tx.cq_available = 8; - do { - status = uct_ep_am_short(m_e1->ep(1), 0, 0, NULL, 0); - } while (status == UCS_OK); - - EXPECT_EQ(UCS_ERR_NO_RESOURCE, status); - - /* put AM op on pending */ - preq.e = m_e1; - preq.uct_req.func = uct_pending_flush; - status = uct_ep_pending_add(m_e1->ep(0), &preq.uct_req, 0); - EXPECT_UCS_OK(status); - - status = uct_ep_am_short(m_e1->ep(0), 0, 0, NULL, 0); - EXPECT_EQ(UCS_ERR_NO_RESOURCE, status); - - flush(); - status = uct_ep_am_short(m_e1->ep(0), 0, 0, NULL, 0); EXPECT_EQ(UCS_OK, status); - flush(); + + /* check that ep does not hold dci */ + EXPECT_EQ(0, iface->tx.stack_top); } -/* check that ep does not hold dci after - * purge +/* check that ep does not hold dci after purge */ -UCS_TEST_P(test_dc, dcs_ep_purge_pending) { 
+UCS_TEST_P(test_dc, dcs_ep_purge_pending, "DC_NUM_DCI=1") { ucs_status_t status; uct_dc_mlx5_iface_t *iface; uct_dc_mlx5_ep_t *ep; m_e1->connect_to_iface(0, *m_e2); - m_e1->connect_to_iface(1, *m_e2); - /* use all iface resources */ iface = dc_iface(m_e1); - ep = dc_ep(m_e1, 0); - iface->super.super.tx.cq_available = 8; + ep = dc_ep(m_e1, 0); + iface->tx.dcis[0].txqp.available = 8; do { - status = uct_ep_am_short(m_e1->ep(1), 0, 0, NULL, 0); + status = uct_ep_am_short(m_e1->ep(0), 0, 0, NULL, 0); } while (status == UCS_OK); EXPECT_EQ(UCS_ERR_NO_RESOURCE, status); - /* flush another ep. Flush fails because there are no cqes */ status = uct_ep_flush(m_e1->ep(0), 0, NULL); EXPECT_EQ(UCS_ERR_NO_RESOURCE, status); /* put flush op on pending */ preq.is_done = 0; preq.e = m_e1; - preq.uct_req.func = uct_pending_dummy; + preq.uct_req.func = uct_pending_flush; status = uct_ep_pending_add(m_e1->ep(0), &preq.uct_req, 0); EXPECT_UCS_OK(status); - do { - progress(); - } while (!preq.is_done); - - EXPECT_LE(1, iface->tx.stack_top); uct_ep_pending_purge(m_e1->ep(0), purge_cb, NULL); - EXPECT_EQ(UCT_DC_MLX5_EP_NO_DCI, ep->dci); flush(); - EXPECT_EQ(0, iface->tx.stack_top); + EXPECT_EQ(UCT_DC_MLX5_EP_NO_DCI, ep->dci); } UCS_TEST_P(test_dc, rand_dci_many_eps) { @@ -445,7 +390,7 @@ UCS_TEST_P(test_dc, rand_dci_many_eps) { } /* Try to send on all eps (taking into account available resources) */ - uint32_t num_sends = ucs_min(num_eps, iface->super.super.tx.cq_available); + uint32_t num_sends = num_eps; for (unsigned i = 0; i < num_sends; i++) { ucs_status_t status = uct_ep_am_short(rand_e->ep(i), 0, 0, NULL, 0); @@ -469,19 +414,20 @@ UCS_TEST_P(test_dc, rand_dci_pending_purge) { int dci_id; uct_dc_mlx5_ep_t *ep; - iface->super.super.tx.cq_available = 0; - for (dci_id = 0; dci_id < ndci; ++dci_id) { for (int i = 0; i < num_eps; i++) { int ep_id = i + dci_id*ndci; rand_e->connect_to_iface(ep_id, *m_e2); ep = dc_ep(rand_e, ep_id); EXPECT_NE(UCT_DC_MLX5_EP_NO_DCI, ep->dci); + int 
available = iface->tx.dcis[ep->dci].txqp.available; + iface->tx.dcis[ep->dci].txqp.available = 0; for (int j = 0; j < num_reqs; ++j, ++idx) { preq[idx].func = NULL; ASSERT_UCS_OK(uct_ep_pending_add(rand_e->ep(ep_id), &preq[idx], 0)); } + iface->tx.dcis[ep->dci].txqp.available = available; } } @@ -497,8 +443,10 @@ UCS_TEST_P(test_dc, rand_dci_pending_purge) { flush(); } -UCS_TEST_P(test_dc, stress_iface_ops) { - test_iface_ops(); +UCS_TEST_SKIP_COND_P(test_dc, stress_iface_ops, + !check_caps(UCT_IFACE_FLAG_PUT_ZCOPY), "DC_NUM_DCI=1") { + + test_iface_ops(dc_iface(m_e1)->tx.dcis[0].txqp.available); } UCT_DC_INSTANTIATE_TEST_CASE(test_dc) @@ -511,6 +459,27 @@ class test_dc_flow_control : public test_rc_flow_control { uct_rc_fc_t* get_fc_ptr(entity *e, int ep_idx = 0) { return &ucs_derived_of(e->ep(ep_idx), uct_dc_mlx5_ep_t)->fc; } + + virtual void disable_entity(entity *e) { + uct_dc_mlx5_iface_t *iface = ucs_derived_of(e->iface(), + uct_dc_mlx5_iface_t); + + for (int i = 0; i < iface->tx.ndci; ++i) { + uct_rc_txqp_available_set(&iface->tx.dcis[i].txqp, 0); + } + iface->tx.stack_top = iface->tx.ndci; + } + + virtual void enable_entity(entity *e, unsigned cq_num = 128) { + uct_dc_mlx5_iface_t *iface = ucs_derived_of(e->iface(), + uct_dc_mlx5_iface_t); + + for (int i = 0; i < iface->tx.ndci; ++i) { + uct_rc_txqp_available_set(&iface->tx.dcis[i].txqp, + iface->tx.dcis[i].txwq.bb_max); + } + iface->tx.stack_top = 0; + } }; UCS_TEST_P(test_dc_flow_control, general_enabled) @@ -643,7 +612,7 @@ UCS_TEST_P(test_dc_flow_control, dci_leak) /* Make sure that ep does not hold dci when sends completed */ uct_dc_mlx5_iface_t *iface = ucs_derived_of(m_e1->iface(), uct_dc_mlx5_iface_t); - ucs_time_t deadline = ucs::get_deadline(); + ucs_time_t deadline = ucs::get_deadline(); while (iface->tx.stack_top && (ucs_get_time() < deadline)) { progress(); } @@ -661,8 +630,81 @@ UCS_TEST_P(test_dc_flow_control, dci_leak) UCT_DC_INSTANTIATE_TEST_CASE(test_dc_flow_control) +class 
test_dc_iface_attrs : public test_rc_iface_attrs { +public: + attr_map_t get_num_iov() { + return get_num_iov_mlx5_common(UCT_IB_MLX5_AV_FULL_SIZE); + } +}; + +UCS_TEST_P(test_dc_iface_attrs, iface_attrs) +{ + basic_iov_test(); +} + +UCT_DC_INSTANTIATE_TEST_CASE(test_dc_iface_attrs) + +class test_dc_fc_deadlock : public test_dc_flow_control { +public: + test_dc_fc_deadlock() { + modify_config("IB_TX_QUEUE_LEN", "8"); + modify_config("RC_FC_WND_SIZE", "128"); + modify_config("DC_TX_POLICY", "rand"); + } + +protected: + struct dc_pending { + uct_pending_req_t uct_req; + entity *e; + }; + + static ucs_status_t am_pending(uct_pending_req_t *req) { + struct dc_pending *pr = reinterpret_cast(req); + return uct_ep_am_short(pr->e->ep(0), 0, 0, NULL, 0); + } +}; + +UCS_TEST_P(test_dc_fc_deadlock, basic, "DC_NUM_DCI=1") +{ + // Send to m_e2 until dci resources are exhausted. + // Also set FC window to 0 emulating lack of all TX resources + ucs_status_t status; + do { + status = uct_ep_am_short(m_e1->ep(0), 0, 0, NULL, 0); + } while (status == UCS_OK); + send_am_messages(m_e1, 1, UCS_ERR_NO_RESOURCE); + get_fc_ptr(m_e1)->fc_wnd = 0; + + // Add am send to pending + struct dc_pending preq; + preq.e = m_e1; + preq.uct_req.func = am_pending; + EXPECT_UCS_OK(uct_ep_pending_add(m_e1->ep(0), &preq.uct_req, 0)); + + // Send whole FC window to m_e1, which will force sending grant request. + // This grant request will be added to pending on m_e1, because it has no + // resources. + int wnd = 5; + set_fc_attributes(m_e2, true, wnd, + ucs_max((int)(wnd*0.5), 1), + ucs_max((int)(wnd*0.25), 1)); + send_am_and_flush(m_e2, wnd); + + // Now, make sure that m_e1 will send grant to m_e2 even though FC window + // is still empty (dci resources will be restored during progression). + // If grant was not sent, this would be a deadlock situation due to lack + // of FC resources. 
+ validate_grant(m_e2); + + // Restore m_e1 for proper cleanup + ucs_derived_of(m_e1->iface(), uct_dc_mlx5_iface_t)->tx.fc_grants = 0; + uct_ep_pending_purge(m_e1->ep(0), NULL, NULL); +} + +UCT_DC_INSTANTIATE_TEST_CASE(test_dc_fc_deadlock) + -#if ENABLE_STATS +#ifdef ENABLE_STATS class test_dc_flow_control_stats : public test_rc_flow_control_stats { public: diff --git a/test/gtest/uct/ib/test_devx.cc b/test/gtest/uct/ib/test_devx.cc new file mode 100644 index 00000000000..653dfa419bc --- /dev/null +++ b/test/gtest/uct/ib/test_devx.cc @@ -0,0 +1,51 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#include +extern "C" { +#include +} +#include +#include +#include + +class test_devx : public uct_test { +public: + entity* m_e; + + void init() { + uct_test::init(); + + m_e = create_entity(0); + m_entities.push_back(m_e); + + if (!(md()->super.dev.flags & UCT_IB_DEVICE_FLAG_MLX5_PRM && + md()->flags & UCT_IB_MLX5_MD_FLAG_DEVX)) { + std::stringstream ss; + ss << "DEVX is not supported by " << GetParam(); + UCS_TEST_SKIP_R(ss.str()); + } + } + + uct_ib_mlx5_md_t *md() { + return ucs_derived_of(m_e->md(), uct_ib_mlx5_md_t); + } + + uct_priv_worker_t *worker() { + return ucs_derived_of(m_e->worker(), uct_priv_worker_t); + } +}; + +UCS_TEST_P(test_devx, dbrec) +{ + uct_ib_mlx5_dbrec_t *dbrec; + + dbrec = (uct_ib_mlx5_dbrec_t *)ucs_mpool_get_inline(&md()->dbrec_pool); + ASSERT_FALSE(dbrec == NULL); + ucs_mpool_put_inline(dbrec); +} + +UCT_INSTANTIATE_IB_TEST_CASE(test_devx); diff --git a/test/gtest/uct/ib/test_ib.cc b/test/gtest/uct/ib/test_ib.cc index f6482f85615..5c1a3433c66 100644 --- a/test/gtest/uct/ib/test_ib.cc +++ b/test/gtest/uct/ib/test_ib.cc @@ -4,361 +4,470 @@ * See file LICENSE for terms. 
*/ -#include -#include - -extern "C" { -#include -#include -#include -#include -#include -#include -} +#include +test_uct_ib::test_uct_ib() : m_e1(NULL), m_e2(NULL) { } -class test_uct_ib : public uct_test { -public: - void initialize() { - uct_test::init(); +void test_uct_ib::create_connected_entities() { + m_e1 = uct_test::create_entity(0); + m_e2 = uct_test::create_entity(0); + + m_entities.push_back(m_e1); + m_entities.push_back(m_e2); - m_e1 = uct_test::create_entity(0); - m_entities.push_back(m_e1); + m_e1->connect(0, *m_e2, 0); + m_e2->connect(0, *m_e1, 0); +} - m_e2 = uct_test::create_entity(0); - m_entities.push_back(m_e2); +void test_uct_ib::init() { + uct_test::init(); + create_connected_entities(); + test_uct_ib::m_ib_am_handler_counter = 0; +} - m_e1->connect(0, *m_e2, 0); - m_e2->connect(0, *m_e1, 0); +ucs_status_t test_uct_ib::ib_am_handler(void *arg, void *data, + size_t length, unsigned flags) { + recv_desc_t *my_desc = (recv_desc_t *) arg; + uint64_t *test_ib_hdr = (uint64_t *) data; + uint64_t *actual_data = (uint64_t *) test_ib_hdr + 1; + unsigned data_length = length - sizeof(test_ib_hdr); - test_uct_ib::ib_am_handler_counter = 0; + my_desc->length = data_length; + if (*test_ib_hdr == 0xbeef) { + memcpy(my_desc + 1, actual_data , data_length); } + ++test_uct_ib::m_ib_am_handler_counter; + return UCS_OK; +} - typedef struct { - unsigned length; - /* data follows */ - } recv_desc_t; - - typedef struct { - unsigned have_pkey; /* if 1 - means that the configured pkey was found */ - unsigned have_lmc; /* if 1 - means that the lmc is higher than zero */ -#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET - unsigned have_valid_gid_idx; -#endif - } ib_port_desc_t; - - static ucs_status_t ib_am_handler(void *arg, void *data, size_t length, - unsigned flags) { - recv_desc_t *my_desc = (recv_desc_t *) arg; - uint64_t *test_ib_hdr = (uint64_t *) data; - uint64_t *actual_data = (uint64_t *) test_ib_hdr + 1; - unsigned data_length = length - sizeof(test_ib_hdr); - - 
my_desc->length = data_length; - if (*test_ib_hdr == 0xbeef) { - memcpy(my_desc + 1, actual_data , data_length); - } - ++test_uct_ib::ib_am_handler_counter; - return UCS_OK; - } +void test_uct_ib::send_recv_short() { + size_t start_am_counter = test_uct_ib::m_ib_am_handler_counter; + uint64_t send_data = 0xdeadbeef; + uint64_t test_ib_hdr = 0xbeef; + recv_desc_t *recv_buffer; + ucs_status_t status; - void pkey_find(const char *dev_name, unsigned port_num, struct ibv_port_attr port_attr, - struct ibv_context *ibctx, ib_port_desc_t *port_desc) { - uint16_t table_idx, pkey, pkey_partition; - uct_ib_iface_config_t *ib_config = ucs_derived_of(m_iface_config, uct_ib_iface_config_t); - - /* check if the configured pkey exists in the port's pkey table */ - for (table_idx = 0; table_idx < port_attr.pkey_tbl_len; table_idx++) { - if(ibv_query_pkey(ibctx, port_num, table_idx, &pkey)) { - UCS_TEST_ABORT("Failed to query pkey on port " << port_num << " on device: " << dev_name); - } - pkey_partition = ntohs(pkey) & UCT_IB_PKEY_PARTITION_MASK; - if (pkey_partition == (ib_config->pkey_value & UCT_IB_PKEY_PARTITION_MASK)) { - port_desc->have_pkey = 1; - break; - } - } - } + check_caps_skip(UCT_IFACE_FLAG_AM_SHORT); -#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET - void test_eth_port(struct ibv_device *device, struct ibv_port_attr port_attr, - struct ibv_context *ibctx, unsigned port_num, - ib_port_desc_t *port_desc) { + recv_buffer = (recv_desc_t *) malloc(sizeof(*recv_buffer) + sizeof(uint64_t)); + recv_buffer->length = 0; /* Initialize length to 0 */ - union ibv_gid gid; - uct_ib_md_config_t *md_config = ucs_derived_of(m_md_config, uct_ib_md_config_t); - char md_name[UCT_MD_NAME_MAX]; - uct_md_h uct_md; - uct_ib_md_t *ib_md; - ucs_status_t status; - uint8_t gid_index; + /* set a callback for the uct to invoke for receiving the data */ + uct_iface_set_am_handler(m_e2->iface(), 0, ib_am_handler, recv_buffer, 0); - /* no pkeys for Ethernet */ - port_desc->have_pkey = 0; + /* send the data 
*/ + status = uct_ep_am_short(m_e1->ep(0), 0, test_ib_hdr, + &send_data, sizeof(send_data)); + EXPECT_TRUE((status == UCS_OK) || (status == UCS_INPROGRESS)); - uct_ib_make_md_name(md_name, device); + flush(); + wait_for_value(&test_uct_ib::m_ib_am_handler_counter, + start_am_counter + 1, true); - status = uct_ib_md_open(md_name, m_md_config, &uct_md); - ASSERT_UCS_OK(status); + ASSERT_EQ(sizeof(send_data), recv_buffer->length); + EXPECT_EQ(send_data, *(uint64_t*)(recv_buffer+1)); - ib_md = ucs_derived_of(uct_md, uct_ib_md_t); - status = uct_ib_device_select_gid_index(&ib_md->dev, - port_num, md_config->ext.gid_index, - &gid_index); - ASSERT_UCS_OK(status); + free(recv_buffer); +} - /* check the gid index */ - if (ibv_query_gid(ibctx, port_num, gid_index, &gid) != 0) { - UCS_TEST_ABORT("Failed to query gid (index=" << gid_index << ")"); - } - if (uct_ib_device_is_gid_raw_empty(gid.raw)) { - port_desc->have_valid_gid_idx = 0; - } else { - port_desc->have_valid_gid_idx = 1; - } +size_t test_uct_ib::m_ib_am_handler_counter = 0; - uct_ib_md_close(uct_md); +class test_uct_ib_addr : public test_uct_ib { +public: + uct_ib_iface_config_t *ib_config() { + return ucs_derived_of(m_iface_config, uct_ib_iface_config_t); } -#endif - void lmc_find(struct ibv_port_attr port_attr, ib_port_desc_t *port_desc) { + void test_address_pack(uint64_t subnet_prefix) { + uct_ib_iface_t *iface = ucs_derived_of(m_e1->iface(), uct_ib_iface_t); + static const uint16_t lid_in = 0x1ee7; + union ibv_gid gid_in; + uct_ib_address_t *ib_addr; + size_t address_size; - if (port_attr.lmc > 0) { - port_desc->have_lmc = 1; - } - } + gid_in.global.subnet_prefix = subnet_prefix; + gid_in.global.interface_id = 0xdeadbeef; - void port_attr_test(const char *dev_name, unsigned port_num, ib_port_desc_t *port_desc) { - struct ibv_device **device_list; - struct ibv_context *ibctx = NULL; - struct ibv_port_attr port_attr; - int num_devices, i, found = 0; + uct_ib_address_pack_params_t pack_params; + pack_params.flags 
= uct_ib_iface_address_pack_flags(iface); + pack_params.gid = gid_in; + pack_params.lid = lid_in; + pack_params.roce_info = iface->gid_info.roce_info; + /* to suppress gcc 4.3.4 warning */ + pack_params.path_mtu = (enum ibv_mtu)0; + pack_params.gid_index = std::numeric_limits::max(); + pack_params.pkey = iface->pkey; + address_size = uct_ib_address_size(&pack_params); + ib_addr = (uct_ib_address_t*)malloc(address_size); + uct_ib_address_pack(&pack_params, ib_addr); + + uct_ib_address_pack_params_t unpack_params; + uct_ib_address_unpack(ib_addr, &unpack_params); - /* get device list */ - device_list = ibv_get_device_list(&num_devices); - if (device_list == NULL) { - UCS_TEST_ABORT("Failed to get the device list."); + if (uct_ib_iface_is_roce(iface)) { + EXPECT_TRUE(iface->config.force_global_addr); + EXPECT_TRUE((unpack_params.flags & UCT_IB_ADDRESS_PACK_FLAG_ETH) != 0); + EXPECT_EQ(iface->gid_info.roce_info.addr_family, + unpack_params.roce_info.addr_family); + EXPECT_EQ(iface->gid_info.roce_info.ver, + unpack_params.roce_info.ver); + } else { + EXPECT_EQ(lid_in, unpack_params.lid); } - /* search for the given device in the device list */ - for (i = 0; i < num_devices; ++i) { - if (strcmp(device_list[i]->name, dev_name)) { - continue; - } - /* found this dev_name on the host - open it */ - ibctx = ibv_open_device(device_list[i]); - if (ibctx == NULL) { - UCS_TEST_ABORT("Failed to open the device."); - } - found = 1; - break; - } - if (found != 1) { - UCS_TEST_ABORT("The requested device: " << dev_name << ", wasn't found in the device list."); - } + if (ib_config()->is_global && + !(unpack_params.flags & UCT_IB_ADDRESS_PACK_FLAG_ETH)) { + EXPECT_TRUE(unpack_params.flags & UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX); + EXPECT_EQ(gid_in.global.subnet_prefix, unpack_params.gid.global.subnet_prefix); - if (ibv_query_port(ibctx, port_num, &port_attr) != 0) { - UCS_TEST_ABORT("Failed to query port " << port_num << " on device: " << dev_name); + EXPECT_TRUE(unpack_params.flags 
& UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID); + EXPECT_EQ(gid_in.global.interface_id, unpack_params.gid.global.interface_id); } - /* check the lmc value in the port */ - lmc_find(port_attr, port_desc); + EXPECT_TRUE(!(unpack_params.flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU)); + EXPECT_EQ(UCT_IB_ADDRESS_INVALID_PATH_MTU, unpack_params.path_mtu); - if (IBV_PORT_IS_LINK_LAYER_ETHERNET(&port_attr)) { - test_eth_port(device_list[i], port_attr, ibctx, port_num, port_desc); - goto out; - } + EXPECT_TRUE(!(unpack_params.flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX)); + EXPECT_EQ(UCT_IB_ADDRESS_INVALID_GID_INDEX, unpack_params.gid_index); - /* find the configured pkey */ - pkey_find(dev_name, port_num, port_attr, ibctx, port_desc); + EXPECT_TRUE((unpack_params.flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) != 0); + EXPECT_EQ(iface->pkey, unpack_params.pkey); -out: - ibv_close_device(ibctx); - ibv_free_device_list(device_list); + free(ib_addr); } - void test_port_avail(ib_port_desc_t *port_desc) { - char *p, *dev_name; - unsigned port_num; + void test_fill_ah_attr(uint64_t subnet_prefix) { + uct_ib_iface_t *iface = ucs_derived_of(m_e1->iface(), uct_ib_iface_t); + static const uint16_t lid = 0x1ee7; + union ibv_gid gid; + struct ibv_ah_attr ah_attr; - dev_name = strdup(GetParam()->dev_name.c_str()); /* device name and port number */ - /* split dev_name */ - p = strchr(dev_name, ':'); - EXPECT_TRUE(p != NULL); - *p = 0; + ASSERT_EQ(iface->config.force_global_addr, + ib_config()->is_global || uct_ib_iface_is_roce(iface)); - /* dev_name holds the device name */ - /* port number */ - if (sscanf(p + 1, "%d", &port_num) != 1) { - UCS_TEST_ABORT("Failed to get the port number on device: " << dev_name); - } - port_attr_test(dev_name, port_num, port_desc); + gid.global.subnet_prefix = subnet_prefix ?: iface->gid_info.gid.global.subnet_prefix; + gid.global.interface_id = 0xdeadbeef; - free(dev_name); + uct_ib_iface_fill_ah_attr_from_gid_lid(iface, lid, &gid, + iface->gid_info.gid_index, 0, + 
&ah_attr); + + if (uct_ib_iface_is_roce(iface)) { + /* in case of roce, should be global */ + EXPECT_TRUE(ah_attr.is_global); + } else if (ib_config()->is_global) { + /* in case of global address is forced - ah_attr should use GRH */ + EXPECT_TRUE(ah_attr.is_global); + } else if (iface->gid_info.gid.global.subnet_prefix == gid.global.subnet_prefix) { + /* in case of subnets are same - ah_attr depend from forced/nonforced GRH */ + EXPECT_FALSE(ah_attr.is_global); + } else if (iface->gid_info.gid.global.subnet_prefix != gid.global.subnet_prefix) { + /* in case of subnets are different - ah_attr should use GRH */ + EXPECT_TRUE(ah_attr.is_global); + } } +}; - void test_address_pack(uint64_t subnet_prefix) { - uct_ib_iface_t *iface = ucs_derived_of(m_e1->iface(), uct_ib_iface_t); - static const uint16_t lid_in = 0x1ee7; - union ibv_gid gid_in, gid_out; - uct_ib_address_t *ib_addr; - uint16_t lid_out; +UCS_TEST_P(test_uct_ib_addr, address_pack) { + test_address_pack(UCT_IB_LINK_LOCAL_PREFIX); + test_address_pack(UCT_IB_SITE_LOCAL_PREFIX | htobe64(0x7200)); + test_address_pack(0xdeadfeedbeefa880ul); +} - ib_addr = (uct_ib_address_t*)malloc(uct_ib_address_size(iface)); +UCS_TEST_P(test_uct_ib_addr, fill_ah_attr) { + test_fill_ah_attr(UCT_IB_LINK_LOCAL_PREFIX); + test_fill_ah_attr(UCT_IB_SITE_LOCAL_PREFIX | htobe64(0x7200)); + test_fill_ah_attr(0xdeadfeedbeefa880ul); + test_fill_ah_attr(0l); +} - gid_in.global.subnet_prefix = subnet_prefix; - gid_in.global.interface_id = 0xdeadbeef; - uct_ib_address_pack(iface, &gid_in, lid_in, ib_addr); +UCS_TEST_P(test_uct_ib_addr, address_pack_global, "IB_IS_GLOBAL=y") { + test_address_pack(UCT_IB_LINK_LOCAL_PREFIX); + test_address_pack(UCT_IB_SITE_LOCAL_PREFIX | htobe64(0x7200)); + test_address_pack(0xdeadfeedbeefa880ul); +} - uct_ib_address_unpack(ib_addr, &lid_out, &gid_out); +UCS_TEST_P(test_uct_ib_addr, fill_ah_attr_global, "IB_IS_GLOBAL=y") { + test_fill_ah_attr(UCT_IB_LINK_LOCAL_PREFIX); + 
test_fill_ah_attr(UCT_IB_SITE_LOCAL_PREFIX | htobe64(0x7200)); + test_fill_ah_attr(0xdeadfeedbeefa880ul); + test_fill_ah_attr(0l); +} - if (uct_ib_iface_is_roce(iface)) { - EXPECT_TRUE(iface->is_global_addr); - } else { - EXPECT_EQ(lid_in, lid_out); - } +UCT_INSTANTIATE_IB_TEST_CASE(test_uct_ib_addr); - if (iface->is_global_addr) { - EXPECT_EQ(gid_in.global.subnet_prefix, gid_out.global.subnet_prefix); - EXPECT_EQ(gid_in.global.interface_id, gid_out.global.interface_id); - } - free(ib_addr); - } +test_uct_ib_with_specific_port::test_uct_ib_with_specific_port() { + m_ibctx = NULL; + m_port = 0; + m_dev_name = ""; - void send_recv_short() { - uint64_t send_data = 0xdeadbeef; - uint64_t test_ib_hdr = 0xbeef; - recv_desc_t *recv_buffer; + memset(&m_port_attr, 0, sizeof(m_port_attr)); +} - initialize(); - check_caps(UCT_IFACE_FLAG_AM_SHORT); +void test_uct_ib_with_specific_port::init() { + size_t colon_pos = GetParam()->dev_name.find(":"); + std::string port_num_str; - recv_buffer = (recv_desc_t *) malloc(sizeof(*recv_buffer) + sizeof(uint64_t)); - recv_buffer->length = 0; /* Initialize length to 0 */ + m_dev_name = GetParam()->dev_name.substr(0, colon_pos); + port_num_str = GetParam()->dev_name.substr(colon_pos + 1); - /* set a callback for the uct to invoke for receiving the data */ - uct_iface_set_am_handler(m_e2->iface(), 0, ib_am_handler , recv_buffer, 0); + /* port number */ + if (sscanf(port_num_str.c_str(), "%d", &m_port) != 1) { + UCS_TEST_ABORT("Failed to get the port number on device: " << m_dev_name); + } - /* send the data */ - uct_ep_am_short(m_e1->ep(0), 0, test_ib_hdr, &send_data, sizeof(send_data)); + std::string abort_reason = + "The requested device " + m_dev_name + + " wasn't found in the device list."; + struct ibv_device **device_list; + int i, num_devices; + + /* get device list */ + device_list = ibv_get_device_list(&num_devices); + if (device_list == NULL) { + abort_reason = "Failed to get the device list."; + num_devices = 0; + } - 
short_progress_loop(100.0); + /* search for the given device in the device list */ + for (i = 0; i < num_devices; ++i) { + if (strcmp(device_list[i]->name, m_dev_name.c_str())) { + continue; + } - ASSERT_EQ(sizeof(send_data), recv_buffer->length); - EXPECT_EQ(send_data, *(uint64_t*)(recv_buffer+1)); + /* found this dev_name on the host - open it */ + m_ibctx = ibv_open_device(device_list[i]); + if (m_ibctx == NULL) { + abort_reason = "Failed to open the device."; + } + break; + } - free(recv_buffer); + ibv_free_device_list(device_list); + if (m_ibctx == NULL) { + UCS_TEST_ABORT(abort_reason); } - uct_ib_device_t *ib_device(entity *entity) { - uct_ib_iface_t *iface = ucs_derived_of(entity->iface(), uct_ib_iface_t); - return uct_ib_iface_device(iface); + if (ibv_query_port(m_ibctx, m_port, &m_port_attr) != 0) { + UCS_TEST_ABORT("Failed to query port " << m_port << + "on device: " << m_dev_name); } -protected: - entity *m_e1, *m_e2; - static size_t ib_am_handler_counter; -}; + try { + check_port_attr(); + } catch (...) { + test_uct_ib_with_specific_port::cleanup(); + throw; + } +} + +void test_uct_ib_with_specific_port::cleanup() { + if (m_ibctx != NULL) { + ibv_close_device(m_ibctx); + m_ibctx = NULL; + } +} -size_t test_uct_ib::ib_am_handler_counter = 0; +class test_uct_ib_lmc : public test_uct_ib_with_specific_port { +public: + void init() { + test_uct_ib_with_specific_port::init(); + test_uct_ib::init(); + } -UCS_TEST_P(test_uct_ib, non_default_pkey, "IB_PKEY=0x2") -{ - ib_port_desc_t *port_desc; - - /* check if the configured pkey exists in the port's pkey table. - * skip this test if it doesn't. 
*/ - port_desc = (ib_port_desc_t *) calloc(1, sizeof(*port_desc)); - test_port_avail(port_desc); - - if (port_desc->have_pkey) { - free(port_desc); - } else { - free(port_desc); - UCS_TEST_SKIP_R("pkey not found or not an IB port"); + void cleanup() { + test_uct_ib::cleanup(); + test_uct_ib_with_specific_port::cleanup(); } + void check_port_attr() { + /* check if a non zero lmc is set on the port */ + if (!m_port_attr.lmc) { + UCS_TEST_SKIP_R("lmc is set to zero on an IB port"); + } + } +}; + +UCS_TEST_P(test_uct_ib_lmc, non_default_lmc, "IB_LID_PATH_BITS=1") { send_recv_short(); } -UCS_TEST_P(test_uct_ib, non_default_lmc, "IB_LID_PATH_BITS=1") -{ - ib_port_desc_t *port_desc; - - /* check if a non zero lmc is set on the port. - * skip this test if it isn't. */ - port_desc = (ib_port_desc_t *) calloc(1, sizeof(*port_desc)); - test_port_avail(port_desc); - - if (port_desc->have_lmc) { - free(port_desc); - } else { - free(port_desc); - UCS_TEST_SKIP_R("lmc is set to zero on an IB port"); +UCT_INSTANTIATE_IB_TEST_CASE(test_uct_ib_lmc); + +class test_uct_ib_gid_idx : public test_uct_ib_with_specific_port { +public: + void init() { + test_uct_ib_with_specific_port::init(); + test_uct_ib::init(); } - send_recv_short(); -} + void cleanup() { + test_uct_ib::cleanup(); + test_uct_ib_with_specific_port::cleanup(); + } -#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET -UCS_TEST_P(test_uct_ib, non_default_gid_idx, "GID_INDEX=1") -{ - ib_port_desc_t *port_desc; - - /* check if a non zero gid index can be used on the port. - * skip this test if it cannot. 
*/ - port_desc = (ib_port_desc_t *) calloc(1, sizeof(*port_desc)); - test_port_avail(port_desc); - - if (port_desc->have_valid_gid_idx) { - free(port_desc); - } else { - free(port_desc); - UCS_TEST_SKIP_R("the configured gid index (1) cannot be used on the port"); + void check_port_attr() { + std::stringstream device_str; + device_str << ibv_get_device_name(m_ibctx->device) << ":" << m_port; + + if (!IBV_PORT_IS_LINK_LAYER_ETHERNET(&m_port_attr)) { + UCS_TEST_SKIP_R(device_str.str() + " is not Ethernet"); + } + + union ibv_gid gid; + uct_ib_md_config_t *md_config = + ucs_derived_of(m_md_config, uct_ib_md_config_t); + ucs::handle uct_md; + uct_ib_iface_t dummy_ib_iface; + uct_ib_md_t *ib_md; + ucs_status_t status; + uint8_t gid_index; + + UCS_TEST_CREATE_HANDLE(uct_md_h, uct_md, uct_ib_md_close, uct_ib_md_open, + &uct_ib_component, + ibv_get_device_name(m_ibctx->device), m_md_config); + + ib_md = ucs_derived_of(uct_md, uct_ib_md_t); + + dummy_ib_iface.config.port_num = m_port; + dummy_ib_iface.super.md = &ib_md->super; + + ASSERT_EQ(&ib_md->dev, uct_ib_iface_device(&dummy_ib_iface)); + + /* uct_ib_iface_init_roce_gid_info() requires only the port from the + * ib_iface so we can use a dummy one here. + * this function will set the gid_index in the dummy ib_iface. 
*/ + status = uct_ib_iface_init_roce_gid_info(&dummy_ib_iface, + md_config->ext.gid_index); + ASSERT_UCS_OK(status); + + gid_index = dummy_ib_iface.gid_info.gid_index; + device_str << " gid index " << static_cast(gid_index); + + /* check the gid index */ + if (ibv_query_gid(m_ibctx, m_port, gid_index, &gid) != 0) { + UCS_TEST_ABORT("failed to query " + device_str.str()); + } + + /* check if the gid is valid to use */ + if (uct_ib_device_is_gid_raw_empty(gid.raw)) { + UCS_TEST_SKIP_R(device_str.str() + " is empty"); + } + + if (!uct_ib_device_test_roce_gid_index(&ib_md->dev, m_port, &gid, + gid_index)) { + UCS_TEST_SKIP_R("failed to create address handle on " + + device_str.str()); + } } +}; +UCS_TEST_P(test_uct_ib_gid_idx, non_default_gid_idx, "GID_INDEX=1") { send_recv_short(); } -#endif -UCS_TEST_P(test_uct_ib, address_pack) { - initialize(); - test_address_pack(UCT_IB_LINK_LOCAL_PREFIX); - test_address_pack(UCT_IB_SITE_LOCAL_PREFIX | htobe64(0x7200)); - test_address_pack(0xdeadfeedbeefa880ul); +UCT_INSTANTIATE_IB_TEST_CASE(test_uct_ib_gid_idx); + +class test_uct_ib_utils : public ucs::test { +}; + +UCS_TEST_F(test_uct_ib_utils, sec_to_qp_time) { + double avg; + uint8_t qp_val; + + // 0 sec + qp_val = uct_ib_to_qp_fabric_time(0); + EXPECT_EQ(1, qp_val); + + // the average time defined for the [0, 1st element] + qp_val = uct_ib_to_qp_fabric_time(4.096 * pow(2, 0) / UCS_USEC_PER_SEC); + EXPECT_EQ(1, qp_val); + + // the time defined for the 1st element + qp_val = uct_ib_to_qp_fabric_time(4.096 * pow(2, 1) / UCS_USEC_PER_SEC); + EXPECT_EQ(1, qp_val); + + for (uint8_t index = 2; index <= UCT_IB_FABRIC_TIME_MAX; index++) { + uint8_t prev_index = index - 1; + + // the time defined for the (i)th element + qp_val = uct_ib_to_qp_fabric_time(4.096 * pow(2, index) / UCS_USEC_PER_SEC); + EXPECT_EQ(index % UCT_IB_FABRIC_TIME_MAX, qp_val); + + // avg = (the average time defined for the [(i - 1)th element, (i)th element]) + avg = (4.096 * pow(2, prev_index) + 4.096 * pow(2, 
index)) * 0.5; + qp_val = uct_ib_to_qp_fabric_time(avg / UCS_USEC_PER_SEC); + EXPECT_EQ(index % UCT_IB_FABRIC_TIME_MAX, qp_val); + + // the average time defined for the [(i - 1)th element, avg] + qp_val = uct_ib_to_qp_fabric_time((4.096 * pow(2, prev_index) + avg) * 0.5 / UCS_USEC_PER_SEC); + EXPECT_EQ(prev_index, qp_val); + + // the average time defined for the [avg, (i)th element] + qp_val = uct_ib_to_qp_fabric_time((avg + 4.096 * pow(2, index)) * 0.5 / UCS_USEC_PER_SEC); + EXPECT_EQ(index % UCT_IB_FABRIC_TIME_MAX, qp_val); + } } +UCS_TEST_F(test_uct_ib_utils, sec_to_rnr_time) { + double avg; + uint8_t rnr_val; -UCT_INSTANTIATE_IB_TEST_CASE(test_uct_ib); + // 0 sec + rnr_val = uct_ib_to_rnr_fabric_time(0); + EXPECT_EQ(1, rnr_val); + // the average time defined for the [0, 1st element] + avg = uct_ib_qp_rnr_time_ms[1] * 0.5; + rnr_val = uct_ib_to_rnr_fabric_time(avg / UCS_MSEC_PER_SEC); + EXPECT_EQ(1, rnr_val); -class test_uct_event_ib : public test_uct_ib { -public: - test_uct_event_ib() { - length = 8; - wakeup_fd.revents = 0; - wakeup_fd.events = POLLIN; - wakeup_fd.fd = 0; - test_ib_hdr = 0xbeef; - m_buf1 = NULL; - m_buf2 = NULL; + for (uint8_t index = 1; index < UCT_IB_FABRIC_TIME_MAX; index++) { + uint8_t next_index = (index + 1) % UCT_IB_FABRIC_TIME_MAX; + + // the time defined for the (i)th element + rnr_val = uct_ib_to_rnr_fabric_time(uct_ib_qp_rnr_time_ms[index] / UCS_MSEC_PER_SEC); + EXPECT_EQ(index, rnr_val); + + // avg = (the average time defined for the [(i)th element, (i + 1)th element]) + avg = (uct_ib_qp_rnr_time_ms[index] + uct_ib_qp_rnr_time_ms[next_index]) * 0.5; + rnr_val = uct_ib_to_rnr_fabric_time(avg / UCS_MSEC_PER_SEC); + EXPECT_EQ(next_index, rnr_val); + + // the average time defined for the [(i)th element, avg] + rnr_val = uct_ib_to_rnr_fabric_time((uct_ib_qp_rnr_time_ms[index] + avg) * 0.5 / UCS_MSEC_PER_SEC); + EXPECT_EQ(index, rnr_val); + + // the average time defined for the [avg, (i + 1)th element] + rnr_val = 
uct_ib_to_rnr_fabric_time((avg + uct_ib_qp_rnr_time_ms[next_index]) * + 0.5 / UCS_MSEC_PER_SEC); + EXPECT_EQ(next_index, rnr_val); } - void initialize() { - ucs_status_t status; + // the time defined for the biggest value + rnr_val = uct_ib_to_rnr_fabric_time(uct_ib_qp_rnr_time_ms[0] / UCS_MSEC_PER_SEC); + EXPECT_EQ(0, rnr_val); - test_uct_ib::initialize(); + // 1 sec + rnr_val = uct_ib_to_rnr_fabric_time(1.); + EXPECT_EQ(0, rnr_val); +} - check_caps(UCT_IFACE_FLAG_PUT_SHORT | UCT_IFACE_FLAG_CB_SYNC | - UCT_IFACE_FLAG_EVENT_SEND_COMP | - UCT_IFACE_FLAG_EVENT_RECV); - /* create receiver wakeup */ - status = uct_iface_event_fd_get(m_e1->iface(), &wakeup_fd.fd); - ASSERT_EQ(status, UCS_OK); +class test_uct_event_ib : public test_uct_ib { +public: + test_uct_event_ib() { + length = 8; + test_ib_hdr = 0xbeef; + m_buf1 = NULL; + m_buf2 = NULL; + } + + void init() { + test_uct_ib::init(); - EXPECT_EQ(0, poll(&wakeup_fd, 1, 0)); + check_skip_test(); m_buf1 = new mapped_buffer(length, 0x1, *m_e1); m_buf2 = new mapped_buffer(length, 0x2, *m_e2); @@ -370,6 +479,26 @@ class test_uct_event_ib : public test_uct_ib { test_uct_event_ib::bcopy_pack_count = 0; } + /* overload `test_uct_ib` variant to pass the async event handler to + * the receive entity */ + void create_connected_entities() { + /* `m_e1` entity is used as a receiver in UCT IB Event tests */ + m_e1 = uct_test::create_entity(0, NULL, NULL, NULL, NULL, NULL, + async_event_handler, this); + m_e2 = uct_test::create_entity(0); + + m_entities.push_back(m_e1); + m_entities.push_back(m_e2); + + m_e1->connect(0, *m_e2, 0); + m_e2->connect(0, *m_e1, 0); + } + + static void async_event_handler(void *arg, unsigned flags) { + test_uct_event_ib *self = static_cast(arg); + self->m_async_event_ctx.signal(); + } + static size_t pack_cb(void *dest, void *arg) { const mapped_buffer *buf = (const mapped_buffer *)arg; memcpy(dest, buf->ptr(), buf->length()); @@ -431,27 +560,29 @@ class test_uct_event_ib : public test_uct_ib { protected: 
static const unsigned EVENTS = UCT_EVENT_RECV | UCT_EVENT_SEND_COMP; - struct pollfd wakeup_fd; size_t length; uint64_t test_ib_hdr; mapped_buffer *m_buf1, *m_buf2; static size_t bcopy_pack_count; + uct_test::async_event_ctx m_async_event_ctx; }; size_t test_uct_event_ib::bcopy_pack_count = 0; -UCS_TEST_P(test_uct_event_ib, tx_cq) +UCS_TEST_SKIP_COND_P(test_uct_event_ib, tx_cq, + !check_caps(UCT_IFACE_FLAG_PUT_BCOPY | + UCT_IFACE_FLAG_CB_SYNC) || + !check_event_caps(UCT_IFACE_FLAG_EVENT_SEND_COMP | + UCT_IFACE_FLAG_EVENT_RECV)) { ucs_status_t status; - initialize(); - status = uct_iface_event_arm(m_e1->iface(), EVENTS); ASSERT_EQ(status, UCS_OK); /* check initial state of the fd and [send|recv]_cq */ - EXPECT_EQ(0, poll(&wakeup_fd, 1, 0)); + EXPECT_FALSE(m_async_event_ctx.wait_for_event(*m_e1, 0)); check_send_cq(m_e1->iface(), 0); check_recv_cq(m_e1->iface(), 0); @@ -459,7 +590,9 @@ UCS_TEST_P(test_uct_event_ib, tx_cq) send_msg_e1_e2(); /* make sure the file descriptor is signaled once */ - ASSERT_EQ(1, poll(&wakeup_fd, 1, 1000*ucs::test_time_multiplier())); + EXPECT_TRUE(m_async_event_ctx.wait_for_event(*m_e1, + 1000 * + ucs::test_time_multiplier())); status = uct_iface_event_arm(m_e1->iface(), EVENTS); ASSERT_EQ(status, UCS_ERR_BUSY); @@ -472,18 +605,21 @@ UCS_TEST_P(test_uct_event_ib, tx_cq) } -UCS_TEST_P(test_uct_event_ib, txrx_cq) +UCS_TEST_SKIP_COND_P(test_uct_event_ib, txrx_cq, + !check_caps(UCT_IFACE_FLAG_PUT_BCOPY | + UCT_IFACE_FLAG_CB_SYNC | + UCT_IFACE_FLAG_AM_SHORT) || + !check_event_caps(UCT_IFACE_FLAG_EVENT_SEND_COMP | + UCT_IFACE_FLAG_EVENT_RECV)) { const size_t msg_count = 1; ucs_status_t status; - initialize(); - status = uct_iface_event_arm(m_e1->iface(), EVENTS); ASSERT_EQ(UCS_OK, status); /* check initial state of the fd and [send|recv]_cq */ - EXPECT_EQ(0, poll(&wakeup_fd, 1, 0)); + EXPECT_FALSE(m_async_event_ctx.wait_for_event(*m_e1, 0)); check_send_cq(m_e1->iface(), 0); check_recv_cq(m_e1->iface(), 0); @@ -494,13 +630,15 @@ 
UCS_TEST_P(test_uct_event_ib, txrx_cq) twait(150); /* Let completion to be generated */ /* Make sure all messages delivered */ - while ((test_uct_ib::ib_am_handler_counter < msg_count) || + while ((test_uct_ib::m_ib_am_handler_counter < msg_count) || (test_uct_event_ib::bcopy_pack_count < msg_count)) { progress(); } /* make sure the file descriptor is signaled */ - ASSERT_EQ(1, poll(&wakeup_fd, 1, 1000*ucs::test_time_multiplier())); + EXPECT_TRUE(m_async_event_ctx.wait_for_event(*m_e1, + 1000 * + ucs::test_time_multiplier())); /* Acknowledge all the requests */ short_progress_loop(); diff --git a/test/gtest/uct/ib/test_ib.h b/test/gtest/uct/ib/test_ib.h new file mode 100644 index 00000000000..1cad4814ffb --- /dev/null +++ b/test/gtest/uct/ib/test_ib.h @@ -0,0 +1,48 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* See file LICENSE for terms. +*/ + +#include + +extern "C" { +#include +#include +#include +#include +#include +} + + +class test_uct_ib : public uct_test { +public: + typedef struct { + unsigned length; + /* data follows */ + } recv_desc_t; + + test_uct_ib(); + void init(); + virtual void create_connected_entities(); + static ucs_status_t ib_am_handler(void *arg, void *data, + size_t length, unsigned flags); + virtual void send_recv_short(); + +protected: + entity *m_e1, *m_e2; + static size_t m_ib_am_handler_counter; +}; + +class test_uct_ib_with_specific_port : public test_uct_ib { +public: + test_uct_ib_with_specific_port(); + void init(); + void cleanup(); + virtual void check_port_attr() = 0; + +protected: + std::string m_dev_name; + unsigned m_port; + struct ibv_context *m_ibctx; + struct ibv_port_attr m_port_attr; +}; diff --git a/test/gtest/uct/ib/test_ib_event.cc b/test/gtest/uct/ib/test_ib_event.cc new file mode 100644 index 00000000000..87cda262cce --- /dev/null +++ b/test/gtest/uct/ib/test_ib_event.cc @@ -0,0 +1,121 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. 
+* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include + +extern "C" { +#if HAVE_MLX5_HW +#include +#include +#endif +#include +} + +#include + +class uct_p2p_test_event : public uct_p2p_test { +private: + void rc_mlx5_ep_to_err(entity &e, uint32_t *qp_num_p) { +#if HAVE_MLX5_HW + uct_ib_mlx5_md_t *md = (uct_ib_mlx5_md_t *)e.md(); + uct_rc_mlx5_ep_t *ep = (uct_rc_mlx5_ep_t *)e.ep(0); + uct_ib_mlx5_qp_t *qp = &ep->tx.wq.super; + + uct_ib_mlx5_modify_qp_state(md, qp, IBV_QPS_ERR); + + *qp_num_p = qp->qp_num; +#endif + } + + void rc_verbs_ep_to_err(entity &e, uint32_t *qp_num_p) { + uct_rc_verbs_ep_t *ep = (uct_rc_verbs_ep_t *)e.ep(0); + + uct_ib_modify_qp(ep->qp, IBV_QPS_ERR); + + *qp_num_p = ep->qp->qp_num; + } + +public: + uct_p2p_test_event(): uct_p2p_test(0) {} + + static ucs_log_level_t orig_log_level; + static volatile unsigned flushed_qp_num; + + static ucs_log_func_rc_t + last_wqe_check_log(const char *file, unsigned line, const char *function, + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) + { + std::string msg = format_message(message, ap); + + UCS_TEST_MESSAGE << msg.c_str(); + sscanf(msg.c_str(), + "IB Async event on %*s SRQ-attached QP 0x%x was flushed", + &flushed_qp_num); + + return (level <= orig_log_level) ? 
UCS_LOG_FUNC_RC_CONTINUE + : UCS_LOG_FUNC_RC_STOP; + } + + int wait_for_last_wqe_event(entity &e) { + const resource *r = dynamic_cast(GetParam()); + flushed_qp_num = -1; + uint32_t qp_num = 0; + + if (r->tl_name == "rc_mlx5") { + rc_mlx5_ep_to_err(e, &qp_num); + } else { + rc_verbs_ep_to_err(e, &qp_num); + } + + ucs_time_t deadline = ucs_get_time() + + ucs_time_from_sec(ucs::test_time_multiplier()); + while (ucs_get_time() < deadline) { + if (flushed_qp_num == qp_num) { + return 1; + } + usleep(1000); + } + + return 0; + } +}; + +UCS_TEST_P(uct_p2p_test_event, last_wqe, "ASYNC_EVENTS=y") +{ + const p2p_resource *r = dynamic_cast(GetParam()); + ucs_assert_always(r != NULL); + + mapped_buffer sendbuf(0, 0, sender()); + mapped_buffer recvbuf(0, 0, receiver()); + + ucs_log_push_handler(last_wqe_check_log); + orig_log_level = ucs_global_opts.log_component.log_level; + ucs_global_opts.log_component.log_level = UCS_LOG_LEVEL_DEBUG; + if (!ucs_log_is_enabled(UCS_LOG_LEVEL_DEBUG)) { + UCS_TEST_SKIP_R("Debug logging is disabled"); + } + + UCS_TEST_SCOPE_EXIT() { + ucs_global_opts.log_component.log_level = orig_log_level; + ucs_log_pop_handler(); + } UCS_TEST_SCOPE_EXIT_END + + ASSERT_TRUE(wait_for_last_wqe_event(sender())); + if (!r->loopback) { + ASSERT_TRUE(wait_for_last_wqe_event(receiver())); + } +} + +ucs_log_level_t uct_p2p_test_event::orig_log_level; +volatile unsigned uct_p2p_test_event::flushed_qp_num; + +UCT_INSTANTIATE_RC_TEST_CASE(uct_p2p_test_event); diff --git a/test/gtest/uct/ib/test_ib_md.cc b/test/gtest/uct/ib/test_ib_md.cc index 39c4f6f45cc..d7bf8026589 100644 --- a/test/gtest/uct/ib/test_ib_md.cc +++ b/test/gtest/uct/ib/test_ib_md.cc @@ -7,9 +7,10 @@ #include #include -#include #include +#ifdef HAVE_MLX5_HW #include +#endif #include #include @@ -60,31 +61,34 @@ void test_ib_md::ib_md_umr_check(void *rkey_buffer, ASSERT_TRUE(memh != UCT_MEM_HANDLE_NULL); uct_ib_mem_t *ib_memh = (uct_ib_mem_t *)memh; - uct_ib_md_t *ib_md = (uct_ib_md_t *)md(); if 
(amo_access) { EXPECT_TRUE(ib_memh->flags & UCT_IB_MEM_ACCESS_REMOTE_ATOMIC); - EXPECT_FALSE(ib_memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR); } else { EXPECT_FALSE(ib_memh->flags & UCT_IB_MEM_ACCESS_REMOTE_ATOMIC); - EXPECT_FALSE(ib_memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR); } +#ifdef HAVE_MLX5_HW + EXPECT_FALSE(ib_memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR); +#endif + status = uct_md_mkey_pack(md(), memh, rkey_buffer); EXPECT_UCS_OK(status); - if (amo_access) { - if (check_umr(ib_md)) { - EXPECT_TRUE(ib_memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR); - EXPECT_TRUE(ib_memh->atomic_rkey != 0); - } else { - EXPECT_FALSE(ib_memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR); - EXPECT_TRUE(ib_memh->atomic_rkey == 0); - } + status = uct_md_mkey_pack(md(), memh, rkey_buffer); + EXPECT_UCS_OK(status); + +#ifdef HAVE_MLX5_HW + uct_ib_md_t *ib_md = (uct_ib_md_t *)md(); + + if ((amo_access && check_umr(ib_md)) || ib_md->relaxed_order) { + EXPECT_TRUE(ib_memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR); + EXPECT_TRUE(ib_memh->atomic_rkey != 0); } else { EXPECT_FALSE(ib_memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR); EXPECT_TRUE(ib_memh->atomic_rkey == 0); } +#endif status = uct_md_mem_dereg(md(), memh); EXPECT_UCS_OK(status); @@ -93,7 +97,7 @@ void test_ib_md::ib_md_umr_check(void *rkey_buffer, } bool test_ib_md::has_ksm() const { -#if HAVE_DECL_MLX5DV_CONTEXT_FLAGS_DEVX +#if HAVE_DEVX return (ucs_derived_of(md(), uct_ib_md_t)->dev.flags & UCT_IB_DEVICE_FLAG_MLX5_PRM) && (ucs_derived_of(md(), uct_ib_mlx5_md_t)->flags & UCT_IB_MLX5_MD_FLAG_KSM); #elif defined(HAVE_EXP_UMR_KSM) @@ -105,10 +109,16 @@ bool test_ib_md::has_ksm() const { } bool test_ib_md::check_umr(uct_ib_md_t *ib_md) const { -#if HAVE_DECL_MLX5DV_CONTEXT_FLAGS_DEVX +#if HAVE_DEVX return has_ksm(); +#elif HAVE_EXP_UMR + if (ib_md->dev.flags & UCT_IB_DEVICE_FLAG_MLX5_PRM) { + uct_ib_mlx5_md_t *mlx5_md = ucs_derived_of(ib_md, uct_ib_mlx5_md_t); + return mlx5_md->umr_qp != NULL; + } + return false; #else - return ib_md->umr_qp != NULL; + return false; 
#endif } @@ -138,6 +148,13 @@ UCS_TEST_P(test_ib_md, ib_md_umr_ksm) { ib_md_umr_check(&rkey_buffer[0], has_ksm(), UCT_IB_MD_MAX_MR_SIZE + 0x1000); } +UCS_TEST_P(test_ib_md, relaxed_order, "PCI_RELAXED_ORDERING=on") { + std::string rkey_buffer(md_attr().rkey_packed_size, '\0'); + + ib_md_umr_check(&rkey_buffer[0], false); + ib_md_umr_check(&rkey_buffer[0], true); +} + #if HAVE_UMR_KSM UCS_TEST_P(test_ib_md, umr_noninline_klm, "MAX_INLINE_KLM_LIST=1") { @@ -149,43 +166,4 @@ UCS_TEST_P(test_ib_md, umr_noninline_klm, "MAX_INLINE_KLM_LIST=1") { } #endif -UCS_TEST_P(test_ib_md, alloc_dm) { - void *address; - size_t size; - ucs_status_t status; - uct_ib_device_mem_h dev_mem; - uct_mem_h dm_memh; - - for (unsigned i = 1; i < 300; ++i) { - const size_t orig_size = i * 100; - size = orig_size; - - address = NULL; - - status = uct_ib_md_alloc_device_mem(md(), &size, &address, UCT_MD_MEM_ACCESS_ALL, - "test DM", &dev_mem); - if ((status == UCS_ERR_NO_RESOURCE) || (status == UCS_ERR_UNSUPPORTED)) { - continue; - } - - ASSERT_UCS_OK(status); - EXPECT_GT(size, 0ul); - - EXPECT_GE(size, orig_size); - EXPECT_TRUE(address != NULL); - EXPECT_TRUE(dev_mem != NULL); - - memset(address, 0xBB, size); - - status = uct_md_mem_reg(md(), address, size, UCT_MD_MEM_ACCESS_ALL, - &dm_memh); - ASSERT_UCS_OK(status); - - status = uct_md_mem_dereg(md(), dm_memh); - ASSERT_UCS_OK(status); - - uct_ib_md_release_device_mem(dev_mem); - } -} - _UCT_MD_INSTANTIATE_TEST_CASE(test_ib_md, ib) diff --git a/test/gtest/uct/ib/test_ib_pkey.cc b/test/gtest/uct/ib/test_ib_pkey.cc new file mode 100644 index 00000000000..80a9264512c --- /dev/null +++ b/test/gtest/uct/ib/test_ib_pkey.cc @@ -0,0 +1,226 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* See file LICENSE for terms. 
+*/ + +#include + + +class test_uct_ib_pkey : public test_uct_ib_with_specific_port { +protected: + test_uct_ib_pkey() { + m_pkey[0] = UCT_IB_ADDRESS_INVALID_PKEY; + m_pkey[1] = UCT_IB_ADDRESS_INVALID_PKEY; + m_pkey_index[0] = 0; + m_pkey_index[1] = 0; + } + + void check_port_attr() { + if (IBV_PORT_IS_LINK_LAYER_ETHERNET(&m_port_attr)) { + /* no pkeys for Ethernet */ + UCS_TEST_SKIP_R("skip pkey test for port with Ethernet link type"); + } + } + + void check_pkeys() { + EXPECT_TRUE(check_pkey(m_e1->iface(), m_pkey[0], m_pkey_index[0])); + EXPECT_TRUE(check_pkey(m_e2->iface(), m_pkey[1], m_pkey_index[1])); + } + + void cleanup_entities() { + m_e1->destroy_eps(); + m_e2->destroy_eps(); + m_entities.remove(m_e1); + m_entities.remove(m_e2); + m_e1 = NULL; + m_e2 = NULL; + } + + void send_recv_short() { + create_connected_entities(); + check_pkeys(); + + test_uct_ib::send_recv_short(); + + cleanup_entities(); + } + + uint16_t query_pkey(uint16_t pkey_idx) const { + uint16_t pkey; + + if (ibv_query_pkey(m_ibctx, m_port, pkey_idx, &pkey)) { + UCS_TEST_ABORT("Failed to query pkey on port " << m_port << + " on device: " << m_dev_name); + } + return ntohs(pkey); + } + + bool check_pkey(const uct_iface_t *iface, uint16_t pkey, + uint16_t pkey_index) const { + const uct_ib_iface_t *ib_iface = ucs_derived_of(iface, uct_ib_iface_t); + return ((pkey == ib_iface->pkey) && + (pkey_index == ib_iface->pkey_index)); + } + + bool find_default_pkey(uint16_t &pkey, uint16_t &pkey_index) const { + for (uint16_t table_idx = 0; table_idx < m_port_attr.pkey_tbl_len; table_idx++) { + uint16_t pkey_value = query_pkey(table_idx); + if (can_use_pkey(pkey_value)) { + /* found the first valid pkey with full membership */ + pkey = pkey_value; + pkey_index = table_idx; + return true; + } + } + + return false; + } + + bool can_use_pkey(uint16_t pkey) const { + return ((pkey != UCT_IB_ADDRESS_INVALID_PKEY) && + ((pkey & UCT_IB_PKEY_MEMBERSHIP_MASK) != 0)); + } + + typedef std::pair< + /* PKEY values 
*/ + std::vector >, + /* PKEY indices */ + std::vector > > ib_pkey_pairs_t; + + ib_pkey_pairs_t supported_pkey_pairs(bool full_membership_only = true) { + static std::vector > supported_pkey_pairs; + static std::vector > supported_pkey_idx_pairs; + static ib_pkey_pairs_t result; + + if (result.first.empty()) { + std::vector supported_pkeys; + std::vector supported_pkeys_idx; + for (uint16_t table_idx = 0; + table_idx < m_port_attr.pkey_tbl_len; table_idx++) { + uint16_t pkey = query_pkey(table_idx); + if (pkey == UCT_IB_ADDRESS_INVALID_PKEY) { + continue; + } + + supported_pkeys.push_back(pkey); + supported_pkeys_idx.push_back(table_idx); + } + + supported_pkey_pairs = ucs::make_pairs(supported_pkeys); + supported_pkey_idx_pairs = ucs::make_pairs(supported_pkeys_idx); + + result = std::make_pair(supported_pkey_pairs, + supported_pkey_idx_pairs); + } + + return result; + } + + uint16_t test_pack_unpack_ib_address(uct_ib_iface_t *iface, + uct_ib_address_t *ib_addr) { + uct_ib_address_pack_params_t params; + + uct_ib_iface_address_pack(iface, ib_addr); + uct_ib_address_unpack(ib_addr, ¶ms); + EXPECT_TRUE((params.flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) != 0); + EXPECT_EQ(m_pkey[0], params.pkey); + + return params.pkey; + } + +public: + uint16_t m_pkey[2]; + uint16_t m_pkey_index[2]; +}; + +UCS_TEST_P(test_uct_ib_pkey, default_pkey) { + if (!find_default_pkey(m_pkey[0], m_pkey_index[0])) { + UCS_TEST_SKIP_R("unable to find a valid pkey with full membership"); + } + + m_pkey[1] = m_pkey[0]; + m_pkey_index[1] = m_pkey_index[0]; + + send_recv_short(); +} + +UCS_TEST_P(test_uct_ib_pkey, all_avail_pkeys) { + /* test all pkeys that are configured for the device */ + for (uint16_t table_idx = 0; table_idx < m_port_attr.pkey_tbl_len; table_idx++) { + m_pkey[0] = m_pkey[1] = query_pkey(table_idx); + if (!can_use_pkey(m_pkey[0])) { + continue; + } + modify_config("IB_PKEY", "0x" + + ucs::to_hex_string(m_pkey[0] & + UCT_IB_PKEY_PARTITION_MASK)); + m_pkey_index[0] = m_pkey_index[1] 
= table_idx; + send_recv_short(); + } +} + +UCS_TEST_P(test_uct_ib_pkey, test_pkey_pairs) { + /* test all pkeys (even with limited membership) that are configured + * for the device */ + ib_pkey_pairs_t pairs = supported_pkey_pairs(false); + + for (size_t i = 0; i < pairs.first.size(); i++) { + m_pkey[0] = pairs.first[i][0]; + m_pkey[1] = pairs.first[i][1]; + m_pkey_index[0] = pairs.second[i][0]; + m_pkey_index[1] = pairs.second[i][1]; + + modify_config("IB_PKEY", "0x" + + ucs::to_hex_string(m_pkey[0] & + UCT_IB_PKEY_PARTITION_MASK)); + m_e1 = uct_test::create_entity(0); + m_entities.push_back(m_e1); + + modify_config("IB_PKEY", "0x" + + ucs::to_hex_string(m_pkey[1] & + UCT_IB_PKEY_PARTITION_MASK)); + m_e2 = uct_test::create_entity(0); + m_entities.push_back(m_e2); + + m_e1->connect(0, *m_e2, 0); + m_e2->connect(0, *m_e1, 0); + + check_pkeys(); + + /* pack-unpack the first IB iface address */ + uct_ib_iface_t *iface1 = ucs_derived_of(m_e1->iface(), + uct_ib_iface_t); + uct_ib_address_t *ib_addr1 = + (uct_ib_address_t*)ucs_alloca(uct_ib_iface_address_size(iface1)); + uint16_t pkey1 = test_pack_unpack_ib_address(iface1, + ib_addr1); + + /* pack-unpack the second IB iface address */ + uct_ib_iface_t *iface2 = ucs_derived_of(m_e2->iface(), + uct_ib_iface_t); + uct_ib_address_t *ib_addr2 = + (uct_ib_address_t*)ucs_alloca(uct_ib_iface_address_size(iface2)); + uint16_t pkey2 = test_pack_unpack_ib_address(iface2, + ib_addr2); + + int res = !(/* both PKEYs are with limited membership */ + !((pkey1 | pkey2) & UCT_IB_PKEY_MEMBERSHIP_MASK) || + /* the PKEYs are not equal */ + ((pkey1 ^ pkey2) & UCT_IB_PKEY_PARTITION_MASK)); + EXPECT_EQ(res, uct_ib_iface_is_reachable(m_e1->iface(), + (uct_device_addr_t*)ib_addr2, + NULL)); + EXPECT_EQ(res, uct_ib_iface_is_reachable(m_e2->iface(), + (uct_device_addr_t*)ib_addr1, + NULL)); + + if (res) { + test_uct_ib::send_recv_short(); + } + + cleanup_entities(); + } +} + + +UCT_INSTANTIATE_IB_TEST_CASE(test_uct_ib_pkey); diff --git 
a/test/gtest/uct/ib/test_ib_xfer.cc b/test/gtest/uct/ib/test_ib_xfer.cc index 670fe86e68f..a6d6471d9bc 100644 --- a/test/gtest/uct/ib/test_ib_xfer.cc +++ b/test/gtest/uct/ib/test_ib_xfer.cc @@ -6,20 +6,21 @@ #include #include -#include class uct_p2p_rma_test_inlresp : public uct_p2p_rma_test {}; -UCS_TEST_P(uct_p2p_rma_test_inlresp, get_bcopy_inlresp0, "IB_TX_INLINE_RESP=0") { - check_caps(UCT_IFACE_FLAG_GET_BCOPY); +UCS_TEST_SKIP_COND_P(uct_p2p_rma_test_inlresp, get_bcopy_inlresp0, + !check_caps(UCT_IFACE_FLAG_GET_BCOPY), + "IB_TX_INLINE_RESP=0") { test_xfer_multi(static_cast(&uct_p2p_rma_test::get_bcopy), 1ul, sender().iface_attr().cap.get.max_bcopy, TEST_UCT_FLAG_RECV_ZCOPY); } -UCS_TEST_P(uct_p2p_rma_test_inlresp, get_bcopy_inlresp64, "IB_TX_INLINE_RESP=64") { - check_caps(UCT_IFACE_FLAG_GET_BCOPY); +UCS_TEST_SKIP_COND_P(uct_p2p_rma_test_inlresp, get_bcopy_inlresp64, + !check_caps(UCT_IFACE_FLAG_GET_BCOPY), + "IB_TX_INLINE_RESP=64") { test_xfer_multi(static_cast(&uct_p2p_rma_test::get_bcopy), 1ul, sender().iface_attr().cap.get.max_bcopy, TEST_UCT_FLAG_RECV_ZCOPY); @@ -31,14 +32,12 @@ UCT_INSTANTIATE_IB_TEST_CASE(uct_p2p_rma_test_inlresp) class uct_p2p_rma_test_alloc_methods : public uct_p2p_rma_test { protected: void test_put_zcopy() { - check_caps(UCT_IFACE_FLAG_PUT_ZCOPY); test_xfer_multi(static_cast(&uct_p2p_rma_test::put_zcopy), 0, sender().iface_attr().cap.put.max_zcopy, TEST_UCT_FLAG_SEND_ZCOPY); } void test_get_zcopy() { - check_caps(UCT_IFACE_FLAG_GET_ZCOPY); test_xfer_multi(static_cast(&uct_p2p_rma_test::get_zcopy), sender().iface_attr().cap.get.min_zcopy, sender().iface_attr().cap.get.max_zcopy, @@ -46,22 +45,40 @@ class uct_p2p_rma_test_alloc_methods : public uct_p2p_rma_test { } }; -UCS_TEST_P(uct_p2p_rma_test_alloc_methods, xfer_reg_odp, - "REG_METHODS=odp,direct") +#ifdef IMPLICIT_ODP_FIXED +UCS_TEST_SKIP_COND_P(uct_p2p_rma_test_alloc_methods, xfer_reg_odp, + !check_caps(UCT_IFACE_FLAG_PUT_ZCOPY | + UCT_IFACE_FLAG_GET_ZCOPY), + 
"REG_METHODS=odp,direct", + "MLX5_DEVX_OBJECTS=dct,dcsrq") { test_put_zcopy(); test_get_zcopy(); } +#endif -UCS_TEST_P(uct_p2p_rma_test_alloc_methods, xfer_reg_rcache, - "REG_METHODS=rcache,direct") +UCS_TEST_SKIP_COND_P(uct_p2p_rma_test_alloc_methods, xfer_reg_rcache, + !check_caps(UCT_IFACE_FLAG_PUT_ZCOPY | + UCT_IFACE_FLAG_GET_ZCOPY), + "REG_METHODS=rcache,direct") { test_put_zcopy(); test_get_zcopy(); } -UCS_TEST_P(uct_p2p_rma_test_alloc_methods, xfer_reg_direct, - "REG_METHODS=direct") +UCS_TEST_SKIP_COND_P(uct_p2p_rma_test_alloc_methods, xfer_reg_direct, + !check_caps(UCT_IFACE_FLAG_PUT_ZCOPY | + UCT_IFACE_FLAG_GET_ZCOPY), + "REG_METHODS=direct") +{ + test_put_zcopy(); + test_get_zcopy(); +} + +UCS_TEST_SKIP_COND_P(uct_p2p_rma_test_alloc_methods, xfer_reg_multithreaded, + !check_caps(UCT_IFACE_FLAG_PUT_ZCOPY | + UCT_IFACE_FLAG_GET_ZCOPY), + "REG_MT_THRESH=1", "REG_MT_CHUNK=1G", "REG_MT_BIND=y") { test_put_zcopy(); test_get_zcopy(); @@ -72,11 +89,13 @@ UCT_INSTANTIATE_IB_TEST_CASE(uct_p2p_rma_test_alloc_methods) class uct_p2p_mix_test_alloc_methods : public uct_p2p_mix_test {}; +#ifdef IMPLICIT_ODP_FIXED UCS_TEST_P(uct_p2p_mix_test_alloc_methods, mix1000_odp, - "REG_METHODS=odp,direct") + "REG_METHODS=odp,direct", "MLX5_DEVX_OBJECTS=dct,dcsrq") { run(1000); } +#endif UCS_TEST_P(uct_p2p_mix_test_alloc_methods, mix1000_rcache, "REG_METHODS=rcache,direct") @@ -84,64 +103,22 @@ UCS_TEST_P(uct_p2p_mix_test_alloc_methods, mix1000_rcache, run(1000); } -UCT_INSTANTIATE_IB_TEST_CASE(uct_p2p_mix_test_alloc_methods) - - -class uct_p2p_mix_test_indirect_atomic : public uct_p2p_mix_test {}; - -UCS_TEST_P(uct_p2p_mix_test_indirect_atomic, mix1000_indirect_atomic, - "INDIRECT_ATOMIC=n") +UCS_TEST_P(uct_p2p_mix_test_alloc_methods, mix1000_multithreaded, + "REG_MT_THRESH=1", "REG_MT_CHUNK=1K", "REG_MT_BIND=y") { run(1000); } -UCT_INSTANTIATE_IB_TEST_CASE(uct_p2p_mix_test_indirect_atomic) - - -class uct_p2p_mix_test_dm : public uct_p2p_mix_test { -public: - virtual void 
run(unsigned count) { - - check_run_conditions(); - - size_t size = m_send_size; - uct_ib_device_mem_h dev_mem; - ucs_status_t status; - uct_mem_h dm_memh; - void *dm_ptr; - - status = uct_ib_md_alloc_device_mem(receiver().md(), &size, &dm_ptr, - UCT_MD_MEM_ACCESS_ALL, "test DM", - &dev_mem); - if ((status == UCS_ERR_NO_RESOURCE) || (status == UCS_ERR_UNSUPPORTED)) { - UCS_TEST_SKIP_R("Device memory is not available"); - } - ASSERT_UCS_OK(status); - - status = uct_md_mem_reg(receiver().md(), dm_ptr, m_send_size, - UCT_MD_MEM_ACCESS_ALL, &dm_memh); - ASSERT_UCS_OK(status); - - mapped_buffer sendbuf(m_send_size, 1, sender()); - mapped_buffer recvbuf(dm_ptr, m_send_size, dm_memh, 2, receiver()); - - for (unsigned i = 0; i < count; ++i) { - random_op(sendbuf, recvbuf); - } - - sender().flush(); +UCT_INSTANTIATE_IB_TEST_CASE(uct_p2p_mix_test_alloc_methods) - status = uct_md_mem_dereg(receiver().md(), dm_memh); - ASSERT_UCS_OK(status); - uct_ib_md_release_device_mem(dev_mem); - } -}; +class uct_p2p_mix_test_indirect_atomic : public uct_p2p_mix_test {}; -UCS_TEST_P(uct_p2p_mix_test_dm, mix1000) +UCS_TEST_P(uct_p2p_mix_test_indirect_atomic, mix1000_indirect_atomic, + "INDIRECT_ATOMIC=n") { run(1000); } -UCT_INSTANTIATE_IB_TEST_CASE(uct_p2p_mix_test_dm) +UCT_INSTANTIATE_IB_TEST_CASE(uct_p2p_mix_test_indirect_atomic) diff --git a/test/gtest/uct/ib/test_rc.cc b/test/gtest/uct/ib/test_rc.cc index ee8f870c00e..bba5ff93320 100644 --- a/test/gtest/uct/ib/test_rc.cc +++ b/test/gtest/uct/ib/test_rc.cc @@ -1,16 +1,12 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2016. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016.All rights reserved. * See file LICENSE for terms. 
*/ #include "test_rc.h" - - -#define UCT_RC_INSTANTIATE_TEST_CASE(_test_case) \ - _UCT_INSTANTIATE_TEST_CASE(_test_case, rc) \ - _UCT_INSTANTIATE_TEST_CASE(_test_case, rc_mlx5) +#include void test_rc::init() @@ -20,6 +16,8 @@ void test_rc::init() m_e1 = uct_test::create_entity(0); m_entities.push_back(m_e1); + check_skip_test(); + m_e2 = uct_test::create_entity(0); m_entities.push_back(m_e2); @@ -37,16 +35,8 @@ void test_rc::connect() // Check that iface tx ops buffer and flush comp memory pool are moderated // properly when we have communication ops + lots of flushes -void test_rc::test_iface_ops() +void test_rc::test_iface_ops(int cq_len) { - check_caps(UCT_IFACE_FLAG_PUT_ZCOPY); - int cq_len = 16; - - if (UCS_OK != uct_config_modify(m_iface_config, "RC_TX_CQ_LEN", - ucs::to_string(cq_len).c_str())) { - UCS_TEST_ABORT("Error: cannot enable random DCI policy"); - } - entity *e = uct_test::create_entity(0); m_entities.push_back(e); e->connect(0, *m_e2, 0); @@ -58,12 +48,11 @@ void test_rc::test_iface_ops() comp.func = NULL; UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, sendbuf.ptr(), sendbuf.length(), - sendbuf.memh(), - m_e1->iface_attr().cap.am.max_iov); + sendbuf.memh(), m_e1->iface_attr().cap.put.max_iov); // For _x transports several CQEs can be consumed per WQE, post less put zcopy // ops, so that flush would be sucessfull (otherwise flush will return // NO_RESOURCES and completion will not be added for it). 
- for (int i = 0; i < cq_len / 3; i++) { + for (int i = 0; i < cq_len / 5; i++) { ASSERT_UCS_OK_OR_INPROGRESS(uct_ep_put_zcopy(e->ep(0), iov, iovcnt, recvbuf.addr(), recvbuf.rkey(), &comp)); @@ -78,11 +67,38 @@ void test_rc::test_iface_ops() flush(); } -UCS_TEST_P(test_rc, stress_iface_ops) { - test_iface_ops(); +UCS_TEST_SKIP_COND_P(test_rc, stress_iface_ops, + !check_caps(UCT_IFACE_FLAG_PUT_ZCOPY)) { + int cq_len = 16; + + if (UCS_OK != uct_config_modify(m_iface_config, "RC_TX_CQ_LEN", + ucs::to_string(cq_len).c_str())) { + UCS_TEST_ABORT("Error: cannot modify RC_TX_CQ_LEN"); + } + + test_iface_ops(cq_len); +} + +UCS_TEST_P(test_rc, tx_cq_moderation) { + unsigned tx_mod = ucs_min(rc_iface(m_e1)->config.tx_moderation / 4, 8); + int16_t init_rsc = rc_ep(m_e1)->txqp.available; + + send_am_messages(m_e1, tx_mod, UCS_OK); + + int16_t rsc = rc_ep(m_e1)->txqp.available; + + EXPECT_LE(rsc, init_rsc); + + short_progress_loop(100); + + EXPECT_EQ(rsc, rc_ep(m_e1)->txqp.available); + + flush(); + + EXPECT_EQ(init_rsc, rc_ep(m_e1)->txqp.available); } -UCT_RC_INSTANTIATE_TEST_CASE(test_rc) +UCT_INSTANTIATE_RC_TEST_CASE(test_rc) class test_rc_max_wr : public test_rc { @@ -111,7 +127,282 @@ UCS_TEST_P(test_rc_max_wr, send_limit) send_am_messages(m_e1, 1, UCS_OK); } -UCT_RC_INSTANTIATE_TEST_CASE(test_rc_max_wr) +UCT_INSTANTIATE_RC_TEST_CASE(test_rc_max_wr) + +class test_rc_get_limit : public test_rc { +public: + test_rc_get_limit() { + m_num_get_bytes = 8 * UCS_KBYTE + 557; // some non power of 2 value + modify_config("RC_TX_NUM_GET_BYTES", + ucs::to_string(m_num_get_bytes).c_str()); + + m_max_get_zcopy = 4096; + modify_config("RC_MAX_GET_ZCOPY", + ucs::to_string(m_max_get_zcopy).c_str()); + + modify_config("RC_TX_QUEUE_LEN", "32"); + modify_config("RC_TM_ENABLE", "y", true); + + m_comp.count = 300000; // some big value to avoid func invocation + m_comp.func = NULL; + } + + void init() { +#ifdef ENABLE_STATS + stats_activate(); +#endif + test_rc::init(); + } + +#ifdef 
ENABLE_STATS + void cleanup() { + uct_test::cleanup(); + stats_restore(); + } + + uint64_t get_no_reads_stat_counter(entity *e) { + uct_rc_iface_t *iface = ucs_derived_of(e->iface(), uct_rc_iface_t); + + return UCS_STATS_GET_COUNTER(iface->stats, UCT_RC_IFACE_STAT_NO_READS); + } +#endif + + ssize_t reads_available(entity *e) { + return rc_iface(e)->tx.reads_available; + } + + void post_max_reads(entity *e, const mapped_buffer &sendbuf, + const mapped_buffer &recvbuf) { + UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, sendbuf.ptr(), sendbuf.length(), + sendbuf.memh(), e->iface_attr().cap.get.max_iov); + + int i = 0; + ucs_status_t status; + do { + if (i++ % 2) { + status = uct_ep_get_zcopy(e->ep(0), iov, iovcnt, recvbuf.addr(), + recvbuf.rkey(), &m_comp); + } else { + status = uct_ep_get_bcopy(e->ep(0), (uct_unpack_callback_t)memcpy, + sendbuf.ptr(), sendbuf.length(), + recvbuf.addr(), recvbuf.rkey(), &m_comp); + } + } while (status == UCS_INPROGRESS); + + EXPECT_EQ(UCS_ERR_NO_RESOURCE, status); + EXPECT_GE(0u, reads_available(e)); + } + + static size_t empty_pack_cb(void *dest, void *arg) { + return 0ul; + } + +protected: + unsigned m_num_get_bytes; + unsigned m_max_get_zcopy; + uct_completion_t m_comp; +}; + +UCS_TEST_SKIP_COND_P(test_rc_get_limit, get_ops_limit, + !check_caps(UCT_IFACE_FLAG_GET_ZCOPY | + UCT_IFACE_FLAG_GET_BCOPY)) +{ + mapped_buffer sendbuf(1024, 0ul, *m_e1); + mapped_buffer recvbuf(1024, 0ul, *m_e2); + + post_max_reads(m_e1, sendbuf, recvbuf); + +#ifdef ENABLE_STATS + EXPECT_GT(get_no_reads_stat_counter(m_e1), 0ul); +#endif + + // Check that it is possible to add to pending if get returns NO_RESOURCE + // due to lack of get credits + uct_pending_req_t pend_req; + pend_req.func = NULL; // Make valgrind happy + EXPECT_EQ(UCS_OK, uct_ep_pending_add(m_e1->ep(0), &pend_req, 0)); + uct_ep_pending_purge(m_e1->ep(0), NULL, NULL); + + flush(); + EXPECT_EQ(m_num_get_bytes, reads_available(m_e1)); +} + +// Check that get function fails for messages bigger than 
MAX_GET_ZCOPY value +UCS_TEST_SKIP_COND_P(test_rc_get_limit, get_size_limit, + !check_caps(UCT_IFACE_FLAG_GET_ZCOPY)) +{ + EXPECT_EQ(m_max_get_zcopy, m_e1->iface_attr().cap.get.max_zcopy); + + mapped_buffer buf(m_max_get_zcopy + 1, 0ul, *m_e1); + + UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, buf.ptr(), buf.length(), buf.memh(), + m_e1->iface_attr().cap.get.max_iov); + + scoped_log_handler wrap_err(wrap_errors_logger); + ucs_status_t status = uct_ep_get_zcopy(m_e1->ep(0), iov, iovcnt, + buf.addr(), buf.rkey(), &m_comp); + EXPECT_EQ(UCS_ERR_INVALID_PARAM, status); + + flush(); + EXPECT_EQ(m_num_get_bytes, reads_available(m_e1)); +} + +// Check that get size value is trimmed by the actual maximum IB msg size +UCS_TEST_SKIP_COND_P(test_rc_get_limit, invalid_get_size, + !check_caps(UCT_IFACE_FLAG_GET_ZCOPY)) +{ + size_t max_ib_msg = uct_ib_iface_port_attr(&rc_iface(m_e1)->super)->max_msg_sz; + + modify_config("RC_MAX_GET_ZCOPY", ucs::to_string(max_ib_msg + 1).c_str()); + + scoped_log_handler wrap_warn(hide_warns_logger); + entity *e = uct_test::create_entity(0); + m_entities.push_back(e); + + EXPECT_EQ(m_max_get_zcopy, m_e1->iface_attr().cap.get.max_zcopy); +} + +// Check that gets resource counter is not affected/changed when the get +// function fails due to lack of some other resources. 
+UCS_TEST_SKIP_COND_P(test_rc_get_limit, post_get_no_res, + !check_caps(UCT_IFACE_FLAG_GET_ZCOPY | + UCT_IFACE_FLAG_AM_BCOPY)) +{ + unsigned max_get_bytes = reads_available(m_e1); + ucs_status_t status; + + do { + status = send_am_message(m_e1, 0, 0); + } while (status == UCS_OK); + + EXPECT_EQ(UCS_ERR_NO_RESOURCE, status); + EXPECT_EQ(max_get_bytes, reads_available(m_e1)); + + mapped_buffer buf(1024, 0ul, *m_e1); + UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, buf.ptr(), buf.length(), buf.memh(), + m_e1->iface_attr().cap.get.max_iov); + + status = uct_ep_get_zcopy(m_e1->ep(0), iov, iovcnt, buf.addr(), buf.rkey(), + &m_comp); + EXPECT_EQ(UCS_ERR_NO_RESOURCE, status); + EXPECT_EQ(max_get_bytes, reads_available(m_e1)); +#ifdef ENABLE_STATS + EXPECT_EQ(get_no_reads_stat_counter(m_e1), 0ul); +#endif + + flush(); +} + +UCS_TEST_SKIP_COND_P(test_rc_get_limit, check_rma_ops, + !check_caps(UCT_IFACE_FLAG_GET_ZCOPY | + UCT_IFACE_FLAG_GET_BCOPY | + UCT_IFACE_FLAG_PUT_SHORT | + UCT_IFACE_FLAG_PUT_BCOPY | + UCT_IFACE_FLAG_PUT_ZCOPY | + UCT_IFACE_FLAG_AM_SHORT | + UCT_IFACE_FLAG_AM_BCOPY | + UCT_IFACE_FLAG_AM_ZCOPY)) + +{ + mapped_buffer sendbuf(1024, 0ul, *m_e1); + mapped_buffer recvbuf(1024, 0ul, *m_e2); + + post_max_reads(m_e1, sendbuf, recvbuf); + + UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, sendbuf.ptr(), 1, sendbuf.memh(), 1); + uct_ep_h ep = m_e1->ep(0); + + EXPECT_EQ(UCS_ERR_NO_RESOURCE, uct_ep_put_short(ep, NULL, 0, 0, 0)); + EXPECT_EQ(UCS_ERR_NO_RESOURCE, uct_ep_put_bcopy(ep, NULL, NULL, 0, 0)); + EXPECT_EQ(UCS_ERR_NO_RESOURCE, uct_ep_put_zcopy(ep, iov, iovcnt, 0, 0, + NULL)); + + if (check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP64)) { + ASSERT_TRUE(check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), OP64)); + ASSERT_TRUE(check_atomics(UCS_BIT(UCT_ATOMIC_OP_CSWAP), FOP64)); + EXPECT_EQ(UCS_ERR_NO_RESOURCE, + uct_ep_atomic64_post(ep, UCT_ATOMIC_OP_ADD, 0, 0, 0)); + EXPECT_EQ(UCS_ERR_NO_RESOURCE, + uct_ep_atomic64_fetch(ep, UCT_ATOMIC_OP_ADD, 0, NULL, 0, 0, + NULL)); + 
EXPECT_EQ(UCS_ERR_NO_RESOURCE, + uct_ep_atomic_cswap64(ep, 0, 0, 0, 0, NULL, NULL)); + } + + if (check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP32)) { + ASSERT_TRUE(check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), OP32)); + ASSERT_TRUE(check_atomics(UCS_BIT(UCT_ATOMIC_OP_CSWAP), FOP32)); + EXPECT_EQ(UCS_ERR_NO_RESOURCE, + uct_ep_atomic32_post(ep, UCT_ATOMIC_OP_ADD, 0, 0, 0)); + EXPECT_EQ(UCS_ERR_NO_RESOURCE, + uct_ep_atomic32_fetch(ep, UCT_ATOMIC_OP_ADD, 0, NULL, 0, 0, + NULL)); + EXPECT_EQ(UCS_ERR_NO_RESOURCE, + uct_ep_atomic_cswap32(ep, 0, 0, 0, 0, NULL, NULL)); + } + + EXPECT_UCS_OK(uct_ep_am_short(ep, 0, 0, NULL, 0)); + EXPECT_EQ(0l, uct_ep_am_bcopy(ep, 0, empty_pack_cb, NULL, 0)); + EXPECT_FALSE(UCS_STATUS_IS_ERR(uct_ep_am_zcopy(ep, 0, NULL, 0, iov, iovcnt, + 0, NULL))); + + if (check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)) { + // we do not have partial tag offload support + ASSERT_TRUE(check_caps(UCT_IFACE_FLAG_TAG_EAGER_SHORT | + UCT_IFACE_FLAG_TAG_EAGER_ZCOPY | + UCT_IFACE_FLAG_TAG_RNDV_ZCOPY)); + + EXPECT_UCS_OK(uct_ep_tag_eager_short(ep, 0ul, NULL, 0)); + EXPECT_EQ(0l, uct_ep_tag_eager_bcopy(ep, 0ul, 0ul, empty_pack_cb, + NULL, 0)); + EXPECT_FALSE(UCS_STATUS_IS_ERR(uct_ep_tag_eager_zcopy(ep, 0ul, 0ul, iov, + iovcnt, 0u, + NULL))); + void *rndv_op = uct_ep_tag_rndv_zcopy(ep, 0ul, NULL, 0u, iov, iovcnt, + 0u, NULL); + EXPECT_FALSE(UCS_PTR_IS_ERR(rndv_op)); + EXPECT_UCS_OK(uct_ep_tag_rndv_cancel(ep, rndv_op)); + EXPECT_UCS_OK(uct_ep_tag_rndv_request(ep, 0ul, NULL, 0u, 0u)); + } + + flush(); + EXPECT_EQ(m_num_get_bytes, reads_available(m_e1)); +} + +// Check that outstanding get ops purged gracefully when ep is closed. +// Also check that get resources taken by those ops are released. 
+UCS_TEST_SKIP_COND_P(test_rc_get_limit, get_zcopy_purge, + !check_caps(UCT_IFACE_FLAG_GET_ZCOPY | + UCT_IFACE_FLAG_GET_BCOPY)) +{ + mapped_buffer sendbuf(1024, 0ul, *m_e1); + mapped_buffer recvbuf(1024, 0ul, *m_e2); + + post_max_reads(m_e1, sendbuf, recvbuf); + + scoped_log_handler hide_warn(hide_warns_logger); + + unsigned flags = UCT_FLUSH_FLAG_CANCEL; + ucs_time_t deadline = ucs::get_deadline(); + ucs_status_t status; + do { + ASSERT_EQ(1ul, m_e1->num_eps()); + status = uct_ep_flush(m_e1->ep(0), flags, NULL); + progress(); + if (flags & UCT_FLUSH_FLAG_CANCEL) { + ASSERT_UCS_OK_OR_INPROGRESS(status); + flags = UCT_FLUSH_FLAG_LOCAL; + continue; + } + } while (((status == UCS_ERR_NO_RESOURCE) || (status == UCS_INPROGRESS)) && + (ucs_get_time() < deadline)); + + m_e1->destroy_eps(); + flush(); + EXPECT_EQ(m_num_get_bytes, reads_available(m_e1)); +} + +UCT_INSTANTIATE_RC_DC_TEST_CASE(test_rc_get_limit) uint32_t test_rc_flow_control::m_am_rx_count = 0; @@ -213,7 +504,7 @@ void test_rc_flow_control::test_flush_fc_disabled() /* send active message should be OK */ get_fc_ptr(m_e1)->fc_wnd = 1; - send_am_message(m_e1, 1, UCS_OK); + send_am_messages(m_e1, 1, UCS_OK); EXPECT_EQ(0, get_fc_ptr(m_e1)->fc_wnd); /* flush must have resources */ @@ -268,7 +559,7 @@ UCS_TEST_P(test_rc_flow_control, pending_only_fc) send_am_and_flush(m_e1, wnd); m_e2->destroy_ep(0); - ASSERT_TRUE(rc_iface(m_e2)->tx.arbiter.current == NULL); + ASSERT_TRUE(ucs_arbiter_is_empty(&rc_iface(m_e2)->tx.arbiter)); } /* Check that user callback passed to uct_ep_pending_purge is not @@ -288,10 +579,10 @@ UCS_TEST_P(test_rc_flow_control, fc_disabled_flush) test_flush_fc_disabled(); } -UCT_RC_INSTANTIATE_TEST_CASE(test_rc_flow_control) +UCT_INSTANTIATE_RC_TEST_CASE(test_rc_flow_control) -#if ENABLE_STATS +#ifdef ENABLE_STATS void test_rc_flow_control_stats::test_general(int wnd, int soft_thresh, int hard_thresh) @@ -345,6 +636,73 @@ UCS_TEST_P(test_rc_flow_control_stats, soft_request) EXPECT_EQ(1ul, v); } 
-UCT_RC_INSTANTIATE_TEST_CASE(test_rc_flow_control_stats) +UCT_INSTANTIATE_RC_TEST_CASE(test_rc_flow_control_stats) #endif + +#ifdef HAVE_MLX5_HW +extern "C" { +#include +} +#endif + +test_uct_iface_attrs::attr_map_t test_rc_iface_attrs::get_num_iov() { + if (has_transport("rc_mlx5")) { + return get_num_iov_mlx5_common(0ul); + } else { + EXPECT_TRUE(has_transport("rc_verbs")); + m_e->connect(0, *m_e, 0); + uct_rc_verbs_ep_t *ep = ucs_derived_of(m_e->ep(0), uct_rc_verbs_ep_t); + uint32_t max_sge; + ASSERT_UCS_OK(uct_ib_qp_max_send_sge(ep->qp, &max_sge)); + + attr_map_t iov_map; + iov_map["put"] = iov_map["get"] = max_sge; + iov_map["am"] = max_sge - 1; // 1 iov reserved for am header + return iov_map; + } +} + +test_uct_iface_attrs::attr_map_t +test_rc_iface_attrs::get_num_iov_mlx5_common(size_t av_size) +{ + attr_map_t iov_map; + +#ifdef HAVE_MLX5_HW + // For RMA iovs can use all WQE space, remainig from control and + // remote address segments (and AV if relevant) + size_t rma_iov = (UCT_IB_MLX5_MAX_SEND_WQE_SIZE - + (sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_ctrl_seg) + av_size)) / + sizeof(struct mlx5_wqe_data_seg); + + iov_map["put"] = iov_map["get"] = rma_iov; + + // For am zcopy just small constant number of iovs is allowed + // (to preserve some inline space for AM zcopy header) + iov_map["am"] = UCT_IB_MLX5_AM_ZCOPY_MAX_IOV; + +#if IBV_HW_TM + if (UCT_RC_MLX5_TM_ENABLED(ucs_derived_of(m_e->iface(), + uct_rc_mlx5_iface_common_t))) { + // For TAG eager zcopy iovs can use all WQE space, remainig from control + // segment, TMH header (+ inline data segment) and AV (if relevant) + iov_map["tag"] = (UCT_IB_MLX5_MAX_SEND_WQE_SIZE - + (sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_inl_data_seg) + + sizeof(struct ibv_tmh) + av_size)) / + sizeof(struct mlx5_wqe_data_seg); + } +#endif // IBV_HW_TM +#endif // HAVE_MLX5_HW + + return iov_map; +} + +UCS_TEST_P(test_rc_iface_attrs, iface_attrs) +{ + basic_iov_test(); +} + 
+UCT_INSTANTIATE_RC_TEST_CASE(test_rc_iface_attrs) + diff --git a/test/gtest/uct/ib/test_rc.h b/test/gtest/uct/ib/test_rc.h index 9731145f219..e3273574623 100644 --- a/test/gtest/uct/ib/test_rc.h +++ b/test/gtest/uct/ib/test_rc.h @@ -33,7 +33,7 @@ class test_rc : public uct_test { void send_am_messages(entity *e, int wnd, ucs_status_t expected, uint8_t am_id = 0, int ep_idx = 0) { for (int i = 0; i < wnd; i++) { - EXPECT_EQ(expected, send_am_message(e, wnd, am_id, ep_idx)); + EXPECT_EQ(expected, send_am_message(e, am_id, ep_idx)); } } @@ -41,7 +41,7 @@ class test_rc : public uct_test { uct_test::short_progress_loop(delta_ms); } - void test_iface_ops(); + void test_iface_ops(int cq_len); static ucs_status_t am_dummy_handler(void *arg, void *data, size_t length, unsigned flags) { @@ -141,7 +141,7 @@ class test_rc_flow_control : public test_rc { }; -#if ENABLE_STATS +#ifdef ENABLE_STATS class test_rc_flow_control_stats : public test_rc_flow_control { public: void init() { @@ -158,4 +158,19 @@ class test_rc_flow_control_stats : public test_rc_flow_control { }; #endif + +class test_rc_iface_attrs : public test_uct_iface_attrs { +public: + test_rc_iface_attrs() { + ucs_status_t status = uct_config_modify(m_iface_config, + "RC_TM_ENABLE", "y"); + EXPECT_TRUE((status == UCS_OK) || (status == UCS_ERR_NO_ELEM)); + } + + attr_map_t get_num_iov_mlx5_common(size_t av_size); + + attr_map_t get_num_iov(); +}; + + #endif diff --git a/test/gtest/uct/ib/test_sockaddr.cc b/test/gtest/uct/ib/test_sockaddr.cc index ddcaffedc70..a3c31d85bff 100644 --- a/test/gtest/uct/ib/test_sockaddr.cc +++ b/test/gtest/uct/ib/test_sockaddr.cc @@ -11,6 +11,7 @@ extern "C" { #include #include #include +#include } #include @@ -41,35 +42,50 @@ class test_uct_sockaddr : public uct_test { test_uct_sockaddr() : server(NULL), client(NULL), err_count(0), server_recv_req(0), delay_conn_reply(false) { - memset(&listen_sock_addr, 0, sizeof(listen_sock_addr)); - memset(&connect_sock_addr, 0, 
sizeof(connect_sock_addr)); } - void init() { - uct_test::init(); + void check_md_usability() { + uct_md_attr_t md_attr; + uct_md_config_t *md_config; + ucs_status_t status; + uct_md_h md; - uct_iface_params_t server_params, client_params; - struct sockaddr_in *listen_addr_in, *connect_addr_in; + status = uct_md_config_read(GetParam()->component, NULL, NULL, &md_config); + EXPECT_TRUE(status == UCS_OK); + + status = uct_md_open(GetParam()->component, GetParam()->md_name.c_str(), + md_config, &md); + EXPECT_TRUE(status == UCS_OK); + uct_config_release(md_config); - /* If we reached here, the interface is active, as it was tested at the - * resource creation */ - if (!ucs::is_inet_addr((struct sockaddr *)&(GetParam()->connect_if_addr))) { - UCS_TEST_SKIP_R("There is no IP on the interface"); + status = uct_md_query(md, &md_attr); + ASSERT_UCS_OK(status); + + uct_md_close(md); + + if (!(md_attr.cap.flags & UCT_MD_FLAG_SOCKADDR)) { + UCS_TEST_SKIP_R(GetParam()->md_name.c_str() + + std::string(" does not support client-server " + "connection establishment via sockaddr " + "without a cm")); } + } - /* This address is accessible, as it was tested at the resource creation */ - listen_sock_addr.addr = (struct sockaddr *)&(GetParam()->listen_if_addr); - ASSERT_TRUE(listen_sock_addr.addr != NULL); + void init() { + check_md_usability(); + + uct_iface_params_t server_params, client_params; + uint16_t port; - listen_addr_in = (struct sockaddr_in *) (listen_sock_addr.addr); + uct_test::init(); - /* Get a usable port on the host */ - listen_addr_in->sin_port = ucs::get_port(); + /* This address is accessible, as it was tested at the resource creation */ + m_listen_addr = GetParam()->listen_sock_addr; + m_connect_addr = GetParam()->connect_sock_addr; - connect_sock_addr.addr = (struct sockaddr *)&(GetParam()->connect_if_addr); - ASSERT_TRUE(connect_sock_addr.addr != NULL); - connect_addr_in = (struct sockaddr_in *)connect_sock_addr.addr; - connect_addr_in->sin_port = 
listen_addr_in->sin_port; + port = ucs::get_port(); + m_listen_addr.set_port(port); + m_connect_addr.set_port(port); /* open iface for the server side */ server_params.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE | @@ -81,14 +97,23 @@ class test_uct_sockaddr : public uct_test { server_params.err_handler = err_handler; server_params.err_handler_arg = reinterpret_cast(this); server_params.err_handler_flags = 0; - server_params.mode.sockaddr.listen_sockaddr = listen_sock_addr; + server_params.mode.sockaddr.listen_sockaddr = m_listen_addr.to_ucs_sock_addr(); server_params.mode.sockaddr.cb_flags = UCT_CB_FLAG_ASYNC; server_params.mode.sockaddr.conn_request_cb = conn_request_cb; server_params.mode.sockaddr.conn_request_arg = reinterpret_cast(this); + /* if origin port is busy, create_entity will retry with another one */ server = uct_test::create_entity(server_params); m_entities.push_back(server); + check_skip_test(); + + port = ucs::sock_addr_storage(server->iface_params().mode.sockaddr + .listen_sockaddr) + .get_port(); + m_listen_addr.set_port(port); + m_connect_addr.set_port(port); + /* open iface for the client side */ client_params.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE | UCT_IFACE_PARAM_FIELD_ERR_HANDLER | @@ -103,7 +128,34 @@ class test_uct_sockaddr : public uct_test { m_entities.push_back(client); /* initiate the client's private data callback argument */ - client->client_cb_arg = server->iface_attr().max_conn_priv; + client->max_conn_priv = server->iface_attr().max_conn_priv; + + UCS_TEST_MESSAGE << "Testing " << m_listen_addr + << " Interface: " << GetParam()->dev_name; + } + + size_t iface_priv_data_do_pack(void *priv_data) + { + size_t priv_data_len; + + client_priv_data = "Client private data"; + priv_data_len = 1 + client_priv_data.length(); + + memcpy(priv_data, client_priv_data.c_str(), priv_data_len); + return priv_data_len; + } + + static ssize_t client_iface_priv_data_cb(void *arg, + const uct_cm_ep_priv_data_pack_args_t + *pack_args, void 
*priv_data) + { + test_uct_sockaddr *self = reinterpret_cast(arg); + size_t priv_data_len; + + priv_data_len = self->iface_priv_data_do_pack(priv_data); + EXPECT_LE(priv_data_len, self->client->max_conn_priv); + + return priv_data_len; } static void conn_request_cb(uct_iface_h iface, void *arg, @@ -112,40 +164,41 @@ class test_uct_sockaddr : public uct_test { { test_uct_sockaddr *self = reinterpret_cast(arg); - EXPECT_EQ(std::string(reinterpret_cast - (uct_test::entity::client_priv_data.c_str())), + EXPECT_EQ(self->client_priv_data, std::string(reinterpret_cast(conn_priv_data))); - EXPECT_EQ(1 + uct_test::entity::client_priv_data.length(), length); + EXPECT_EQ(1 + self->client_priv_data.length(), length); + if (self->delay_conn_reply) { self->delayed_conn_reqs.push(conn_request); } else { uct_iface_accept(iface, conn_request); } + ucs_memory_cpu_store_fence(); self->server_recv_req++; } static ucs_status_t err_handler(void *arg, uct_ep_h ep, ucs_status_t status) { test_uct_sockaddr *self = reinterpret_cast(arg); - self->err_count++; + ucs_atomic_add32(&self->err_count, 1); return UCS_OK; } protected: entity *server, *client; - ucs_sock_addr_t listen_sock_addr, connect_sock_addr; - volatile int err_count, server_recv_req; + ucs::sock_addr_storage m_listen_addr, m_connect_addr; + volatile uint32_t err_count; + volatile int server_recv_req; std::queue delayed_conn_reqs; bool delay_conn_reply; + std::string client_priv_data; }; UCS_TEST_P(test_uct_sockaddr, connect_client_to_server) { - UCS_TEST_MESSAGE << "Testing " << ucs::sockaddr_to_str(listen_sock_addr.addr) - << " Interface: " << GetParam()->dev_name.c_str(); - - client->connect(0, *server, 0, &connect_sock_addr); + client->connect(0, *server, 0, m_connect_addr, client_iface_priv_data_cb, + NULL, NULL, this); /* wait for the server to connect */ while (server_recv_req == 0) { @@ -155,7 +208,7 @@ UCS_TEST_P(test_uct_sockaddr, connect_client_to_server) /* since the transport may support a graceful exit in case of 
an error, * make sure that the error handling flow wasn't invoked (there were no * errors) */ - EXPECT_EQ(0, err_count); + EXPECT_EQ(0ul, err_count); /* the test may end before the client's ep got connected. * it should also pass in this case as well - the client's * ep shouldn't be accessed (for connection reply from the server) after the @@ -164,18 +217,18 @@ UCS_TEST_P(test_uct_sockaddr, connect_client_to_server) UCS_TEST_P(test_uct_sockaddr, connect_client_to_server_with_delay) { - UCS_TEST_MESSAGE << "Testing " << ucs::sockaddr_to_str(listen_sock_addr.addr) - << " Interface: " << GetParam()->dev_name.c_str(); delay_conn_reply = true; - client->connect(0, *server, 0, &connect_sock_addr); + client->connect(0, *server, 0, m_connect_addr, client_iface_priv_data_cb, + NULL, NULL, this); /* wait for the server to connect */ while (server_recv_req == 0) { progress(); } ASSERT_EQ(1, server_recv_req); + ucs_memory_cpu_load_fence(); ASSERT_EQ(1ul, delayed_conn_reqs.size()); - EXPECT_EQ(0, err_count); + EXPECT_EQ(0ul, err_count); while (!delayed_conn_reqs.empty()) { uct_iface_accept(server->iface(), delayed_conn_reqs.front()); delayed_conn_reqs.pop(); @@ -189,23 +242,23 @@ UCS_TEST_P(test_uct_sockaddr, connect_client_to_server_with_delay) } else { EXPECT_EQ(UCS_OK, status); } - EXPECT_EQ(0, err_count); + EXPECT_EQ(0ul, err_count); } UCS_TEST_P(test_uct_sockaddr, connect_client_to_server_reject_with_delay) { - UCS_TEST_MESSAGE << "Testing " << ucs::sockaddr_to_str(listen_sock_addr.addr) - << " Interface: " << GetParam()->dev_name.c_str(); delay_conn_reply = true; - client->connect(0, *server, 0, &connect_sock_addr); + client->connect(0, *server, 0, m_connect_addr, client_iface_priv_data_cb, + NULL, NULL, this); /* wait for the server to connect */ while (server_recv_req == 0) { progress(); } ASSERT_EQ(1, server_recv_req); + ucs_memory_cpu_load_fence(); ASSERT_EQ(1ul, delayed_conn_reqs.size()); - EXPECT_EQ(0, err_count); + EXPECT_EQ(0ul, err_count); while 
(!delayed_conn_reqs.empty()) { uct_iface_reject(server->iface(), delayed_conn_reqs.front()); delayed_conn_reqs.pop(); @@ -213,20 +266,17 @@ UCS_TEST_P(test_uct_sockaddr, connect_client_to_server_reject_with_delay) while (err_count == 0) { progress(); } - EXPECT_EQ(1, err_count); + EXPECT_EQ(1ul, err_count); } UCS_TEST_P(test_uct_sockaddr, many_clients_to_one_server) { - UCS_TEST_MESSAGE << "Testing " << ucs::sockaddr_to_str(listen_sock_addr.addr) - << " Interface: " << GetParam()->dev_name.c_str(); - + int num_clients = ucs_max(2, 100 / ucs::test_time_multiplier()); uct_iface_params_t client_params; entity *client_test; - int i, num_clients = 100; /* multiple clients, each on an iface of its own, connecting to the same server */ - for (i = 0; i < num_clients; ++i) { + for (int i = 0; i < num_clients; ++i) { /* open iface for the client side */ client_params.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE | UCT_IFACE_PARAM_FIELD_ERR_HANDLER | @@ -240,43 +290,40 @@ UCS_TEST_P(test_uct_sockaddr, many_clients_to_one_server) client_test = uct_test::create_entity(client_params); m_entities.push_back(client_test); - client_test->client_cb_arg = server->iface_attr().max_conn_priv; - client_test->connect(i, *server, 0, &connect_sock_addr); + client_test->max_conn_priv = server->iface_attr().max_conn_priv; + client_test->connect(i, *server, 0, m_connect_addr, + client_iface_priv_data_cb, NULL, NULL, this); } while (server_recv_req < num_clients){ progress(); } ASSERT_TRUE(server_recv_req == num_clients); - EXPECT_EQ(0, err_count); + EXPECT_EQ(0ul, err_count); } UCS_TEST_P(test_uct_sockaddr, many_conns_on_client) { - UCS_TEST_MESSAGE << "Testing " << ucs::sockaddr_to_str(listen_sock_addr.addr) - << " Interface: " << GetParam()->dev_name.c_str(); - - int i, num_conns_on_client = 100; + int num_conns_on_client = ucs_max(2, 100 / ucs::test_time_multiplier()); /* multiple clients, on the same iface, connecting to the same server */ - for (i = 0; i < num_conns_on_client; ++i) { - 
client->connect(i, *server, 0, &connect_sock_addr); + for (int i = 0; i < num_conns_on_client; ++i) { + client->connect(i, *server, 0, m_connect_addr, client_iface_priv_data_cb, + NULL, NULL, this); } while (server_recv_req < num_conns_on_client) { progress(); } ASSERT_TRUE(server_recv_req == num_conns_on_client); - EXPECT_EQ(0, err_count); + EXPECT_EQ(0ul, err_count); } -UCS_TEST_P(test_uct_sockaddr, err_handle) +UCS_TEST_SKIP_COND_P(test_uct_sockaddr, err_handle, + !check_caps(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { - check_caps(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE); - UCS_TEST_MESSAGE << "Testing " << ucs::sockaddr_to_str(listen_sock_addr.addr) - << " Interface: " << GetParam()->dev_name.c_str(); - - client->connect(0, *server, 0, &connect_sock_addr); + client->connect(0, *server, 0, m_connect_addr, client_iface_priv_data_cb, + NULL, NULL, this); scoped_log_handler slh(wrap_errors_logger); /* kill the server */ @@ -286,29 +333,24 @@ UCS_TEST_P(test_uct_sockaddr, err_handle) * test error handling */ if (server_recv_req == 0) { wait_for_flag(&err_count); - EXPECT_EQ(1, err_count); + /* Double check for server_recv_req if it's not delivered from NIC to + * host memory under hight load */ + EXPECT_TRUE((err_count == 1) || (server_recv_req == 1)); } } -UCS_TEST_P(test_uct_sockaddr, conn_to_non_exist_server) +UCS_TEST_SKIP_COND_P(test_uct_sockaddr, conn_to_non_exist_server, + !check_caps(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { - check_caps(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE); - - struct sockaddr_in *connect_addr_in; - connect_addr_in = (struct sockaddr_in *) (connect_sock_addr.addr); - in_port_t orig_port = connect_addr_in->sin_port; - - connect_addr_in->sin_port = 1; - UCS_TEST_MESSAGE << "Testing " << ucs::sockaddr_to_str(listen_sock_addr.addr) - << " Interface: " << GetParam()->dev_name.c_str(); - + m_connect_addr.set_port(htons(1)); err_count = 0; /* wrap errors now since the client will try to connect to a non existing port */ { scoped_log_handler 
slh(wrap_errors_logger); /* client - try to connect to a non-existing port on the server side */ - client->connect(0, *server, 0, &connect_sock_addr); + client->connect(0, *server, 0, m_connect_addr, client_iface_priv_data_cb, + NULL, NULL, this); completion comp; ucs_status_t status = uct_ep_flush(client->ep(0), 0, &comp); if (status == UCS_INPROGRESS) { @@ -320,8 +362,1070 @@ UCS_TEST_P(test_uct_sockaddr, conn_to_non_exist_server) /* destroy the client's ep. this ep shouldn't be accessed anymore */ client->destroy_ep(0); } - /* restore the previous existing port */ - connect_addr_in->sin_port = orig_port; } UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_sockaddr) + +class test_uct_cm_sockaddr : public uct_test { + friend class uct_test::entity; +protected: + enum { + TEST_STATE_CONNECT_REQUESTED = UCS_BIT(0), + TEST_STATE_CLIENT_CONNECTED = UCS_BIT(1), + TEST_STATE_SERVER_CONNECTED = UCS_BIT(2), + TEST_STATE_CLIENT_DISCONNECTED = UCS_BIT(3), + TEST_STATE_SERVER_DISCONNECTED = UCS_BIT(4), + TEST_STATE_SERVER_REJECTED = UCS_BIT(5), + TEST_STATE_CLIENT_GOT_REJECT = UCS_BIT(6), + TEST_STATE_CLIENT_GOT_ERROR = UCS_BIT(7) + }; + + enum { + TEST_EP_FLAG_DISCONNECT_INITIATOR = UCS_BIT(0), + TEST_EP_FLAG_DISCONNECT_CB_INVOKED = UCS_BIT(1) + }; + +public: + test_uct_cm_sockaddr() : m_state(0), m_server(NULL), m_client(NULL), + m_server_recv_req_cnt(0), m_client_connect_cb_cnt(0), + m_server_connect_cb_cnt(0), + m_server_disconnect_cnt(0), m_client_disconnect_cnt(0), + m_reject_conn_request(false), + m_server_start_disconnect(false), + m_delay_conn_reply(false), + m_short_priv_data_len(0), m_long_priv_data_len(0) { + } + + void init() { + uct_test::init(); + + /* This address is accessible, as it was tested at the resource creation */ + m_listen_addr = GetParam()->listen_sock_addr; + m_connect_addr = GetParam()->connect_sock_addr; + + uint16_t port = ucs::get_port(); + m_listen_addr.set_port(port); + m_connect_addr.set_port(port); + + m_server = uct_test::create_entity(); + 
m_entities.push_back(m_server); + m_client = uct_test::create_entity(); + m_entities.push_back(m_client); + + m_client->max_conn_priv = m_client->cm_attr().max_conn_priv; + m_server->max_conn_priv = m_server->cm_attr().max_conn_priv; + + m_short_priv_data_len = 20; + m_long_priv_data_len = 420 * UCS_KBYTE; + + m_short_priv_data.resize(m_short_priv_data_len); + ucs::fill_random(m_short_priv_data); + + m_long_priv_data.resize(m_long_priv_data_len); + ucs::fill_random(m_long_priv_data); + + UCS_TEST_MESSAGE << "Testing " << m_listen_addr + << " Interface: " << GetParam()->dev_name; + } + +protected: + + void start_listen(uct_cm_listener_conn_request_callback_t server_conn_req_cb) { + uct_listener_params_t params; + + params.field_mask = UCT_LISTENER_PARAM_FIELD_CONN_REQUEST_CB | + UCT_LISTENER_PARAM_FIELD_USER_DATA; + params.conn_request_cb = server_conn_req_cb; + params.user_data = static_cast(this); + /* if origin port set in init() is busy, listen() will retry with another one */ + m_server->listen(m_listen_addr, params); + + /* the listen function may have changed the initial port on the listener's + * address. 
update this port for the address to connect to */ + m_connect_addr.set_port(m_listen_addr.get_port()); + } + + void listen_and_connect() { + start_listen(test_uct_cm_sockaddr::conn_request_cb); + m_client->connect(0, *m_server, 0, m_connect_addr, client_priv_data_cb, + client_connect_cb, client_disconnect_cb, this); + + wait_for_bits(&m_state, TEST_STATE_CONNECT_REQUESTED); + EXPECT_TRUE(m_state & TEST_STATE_CONNECT_REQUESTED); + } + + size_t priv_data_do_pack(size_t pack_limit, void *priv_data) { + if (pack_limit < m_long_priv_data_len) { + /* small private data length */ + memcpy(priv_data, m_short_priv_data.data(), m_short_priv_data_len); + return m_short_priv_data_len; + } else { + /* large private data length (tcp_sockcm) */ + memcpy(priv_data, m_long_priv_data.data(), m_long_priv_data_len); + return m_long_priv_data_len; + } + } + + ssize_t common_priv_data_cb(void *arg, size_t pack_limit, void *priv_data) { + test_uct_cm_sockaddr *self = reinterpret_cast(arg); + size_t priv_data_len; + + priv_data_len = self->priv_data_do_pack(pack_limit, priv_data); + EXPECT_LE(priv_data_len, pack_limit); + return priv_data_len; + } + + static ssize_t client_priv_data_cb(void *arg, + const uct_cm_ep_priv_data_pack_args_t + *pack_args, void *priv_data) + { + test_uct_cm_sockaddr *self = reinterpret_cast(arg); + return self->common_priv_data_cb(arg, self->m_client->max_conn_priv, priv_data); + } + + static ssize_t server_priv_data_cb(void *arg, + const uct_cm_ep_priv_data_pack_args_t + *pack_args, void *priv_data) + { + test_uct_cm_sockaddr *self = reinterpret_cast(arg); + return self->common_priv_data_cb(arg, self->m_server->max_conn_priv, priv_data); + } + + void accept(uct_cm_h cm, uct_conn_request_h conn_request, + uct_cm_ep_server_conn_notify_callback_t notify_cb, + uct_ep_disconnect_cb_t disconnect_cb, + void *user_data) + { + uct_ep_params_t ep_params; + ucs_status_t status; + uct_ep_h ep; + + ASSERT_TRUE(m_server->listener()); + 
m_server->reserve_ep(m_server->num_eps()); + + ep_params.field_mask = UCT_EP_PARAM_FIELD_CM | + UCT_EP_PARAM_FIELD_CONN_REQUEST | + UCT_EP_PARAM_FIELD_USER_DATA | + UCT_EP_PARAM_FIELD_SOCKADDR_NOTIFY_CB_SERVER | + UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB | + UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS | + UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB; + + ep_params.cm = cm; + ep_params.conn_request = conn_request; + ep_params.sockaddr_cb_flags = UCT_CB_FLAG_ASYNC; + ep_params.sockaddr_pack_cb = server_priv_data_cb; + ep_params.sockaddr_cb_server = notify_cb; + ep_params.disconnect_cb = disconnect_cb; + ep_params.user_data = user_data; + + status = uct_ep_create(&ep_params, &ep); + ASSERT_UCS_OK(status); + m_server->eps().back().reset(ep, uct_ep_destroy); + } + + virtual void server_accept(entity *server, uct_conn_request_h conn_request, + uct_cm_ep_server_conn_notify_callback_t notify_cb, + uct_ep_disconnect_cb_t disconnect_cb, + void *user_data) + { + accept(server->cm(), conn_request, notify_cb, disconnect_cb, user_data); + } + + void verify_remote_data(const void *remote_data, size_t remote_length) + { + std::vector r_data((char*)(remote_data), (char*)(remote_data) + remote_length); + + if (remote_length == m_short_priv_data_len) { + EXPECT_EQ(m_short_priv_data, r_data); + } else if (remote_length == m_long_priv_data_len) { + EXPECT_EQ(m_long_priv_data, r_data); + } else { + UCS_TEST_ABORT("wrong data length received " << remote_length); + } + } + + /* + * Common section for the server's handling of a connection request. + * Process the connection request and check if the server's accept is + * required for the calling test. + * + * return true if the server should accept the connection request and + * false if not. 
+ */ + static bool common_conn_request(uct_listener_h listener, void *arg, + const uct_cm_listener_conn_request_args_t + *conn_req_args) { + test_uct_cm_sockaddr *self = reinterpret_cast(arg); + ucs_sock_addr_t m_connect_addr_sock_addr = + self->m_connect_addr.to_ucs_sock_addr(); + uct_conn_request_h conn_request; + const uct_cm_remote_data_t *remote_data; + uint16_t client_port; + ucs_status_t status; + + EXPECT_TRUE(ucs_test_all_flags(conn_req_args->field_mask, + (UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CONN_REQUEST | + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_REMOTE_DATA | + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CLIENT_ADDR))); + + conn_request = conn_req_args->conn_request; + remote_data = conn_req_args->remote_data; + + /* check the address of the remote client */ + EXPECT_EQ(0, ucs_sockaddr_ip_cmp(m_connect_addr_sock_addr.addr, + conn_req_args->client_address.addr)); + + status = ucs_sockaddr_get_port(conn_req_args->client_address.addr, &client_port); + ASSERT_UCS_OK(status); + EXPECT_GT(client_port, 0); + + self->verify_remote_data(remote_data->conn_priv_data, remote_data->conn_priv_data_length); + + self->m_state |= TEST_STATE_CONNECT_REQUESTED; + + if (self->m_delay_conn_reply) { + self->m_delayed_conn_reqs.push(conn_request); + } else if (self->m_reject_conn_request) { + status = uct_listener_reject(listener, conn_request); + ASSERT_UCS_OK(status); + self->m_state |= TEST_STATE_SERVER_REJECTED; + } else { + /* do regular server accept */ + return true; + } + + return false; + } + + static void + conn_request_cb(uct_listener_h listener, void *arg, + const uct_cm_listener_conn_request_args_t *conn_req_args) { + test_uct_cm_sockaddr *self = reinterpret_cast(arg); + + if (self->common_conn_request(listener, arg, conn_req_args)) { + EXPECT_TRUE(conn_req_args->field_mask & + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CONN_REQUEST); + self->server_accept(self->m_server, conn_req_args->conn_request, + server_connect_cb, server_disconnect_cb, self); + } + + 
ucs_memory_cpu_store_fence(); + self->m_server_recv_req_cnt++; + } + + + static void + server_connect_cb(uct_ep_h ep, void *arg, + const uct_cm_ep_server_conn_notify_args_t *notify_args) { + test_uct_cm_sockaddr *self = reinterpret_cast(arg); + + if (notify_args->field_mask & UCT_CM_EP_SERVER_CONN_NOTIFY_ARGS_FIELD_STATUS) { + EXPECT_EQ(UCS_OK, notify_args->status); + } + + self->m_state |= TEST_STATE_SERVER_CONNECTED; + self->m_server_connect_cb_cnt++; + } + + static void + client_connect_cb(uct_ep_h ep, void *arg, + const uct_cm_ep_client_connect_args_t *connect_args) { + test_uct_cm_sockaddr *self = reinterpret_cast(arg); + const uct_cm_remote_data_t *remote_data; + ucs_status_t status; + + EXPECT_TRUE(ucs_test_all_flags(connect_args->field_mask, + (UCT_CM_EP_CLIENT_CONNECT_ARGS_FIELD_REMOTE_DATA | + UCT_CM_EP_CLIENT_CONNECT_ARGS_FIELD_STATUS))); + + remote_data = connect_args->remote_data; + status = connect_args->status; + + if (status == UCS_ERR_REJECTED) { + self->m_state |= TEST_STATE_CLIENT_GOT_REJECT; + } else if (status != UCS_OK) { + self->m_state |= TEST_STATE_CLIENT_GOT_ERROR; + } else { + EXPECT_TRUE(ucs_test_all_flags(remote_data->field_mask, + (UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA_LENGTH | + UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA))); + + self->verify_remote_data(remote_data->conn_priv_data, remote_data->conn_priv_data_length); + + status = uct_cm_client_ep_conn_notify(ep); + ASSERT_UCS_OK(status); + + self->m_state |= TEST_STATE_CLIENT_CONNECTED; + self->m_client_connect_cb_cnt++; + } + } + + static void + server_disconnect_cb(uct_ep_h ep, void *arg) { + test_uct_cm_sockaddr *self = reinterpret_cast(arg); + + if (!(self->m_server_start_disconnect)) { + self->m_server->disconnect(ep); + } + + self->m_state |= TEST_STATE_SERVER_DISCONNECTED; + self->m_server_disconnect_cnt++; + } + + static void client_disconnect_cb(uct_ep_h ep, void *arg) { + test_uct_cm_sockaddr *self = reinterpret_cast(arg); + + if (self->m_server_start_disconnect) { + /* if 
the server was the one who initiated the disconnect flow, + * the client should also disconnect its ep from the server in + * its disconnect cb */ + self->m_client->disconnect(ep); + } + + self->m_state |= TEST_STATE_CLIENT_DISCONNECTED; + self->m_client_disconnect_cnt++; + } + + void cm_disconnect(entity *ent) { + size_t i; + + /* Disconnect all the existing endpoints */ + for (i = 0; i < ent->num_eps(); ++i) { + ent->disconnect(ent->ep(i)); + } + + wait_for_bits(&m_state, TEST_STATE_CLIENT_DISCONNECTED | + TEST_STATE_SERVER_DISCONNECTED); + EXPECT_TRUE(ucs_test_all_flags(m_state, (TEST_STATE_SERVER_DISCONNECTED | + TEST_STATE_CLIENT_DISCONNECTED))); + } + + void wait_for_client_server_counters(volatile int *server_cnt, + volatile int *client_cnt, int val, + double timeout = 10 * DEFAULT_TIMEOUT_SEC) { + ucs_time_t deadline; + + deadline = ucs_get_time() + ucs_time_from_sec(timeout) * + ucs::test_time_multiplier(); + + while (((*server_cnt < val) || (*client_cnt < val)) && + (ucs_get_time() < deadline)) { + progress(); + } + } + + void test_delayed_server_response(bool reject) + { + ucs_status_t status; + ucs_time_t deadline; + + m_delay_conn_reply = true; + + listen_and_connect(); + + EXPECT_FALSE(m_state & + (TEST_STATE_SERVER_CONNECTED | TEST_STATE_CLIENT_CONNECTED | + TEST_STATE_CLIENT_GOT_REJECT | TEST_STATE_CLIENT_GOT_ERROR)); + + deadline = ucs_get_time() + ucs_time_from_sec(DEFAULT_TIMEOUT_SEC) * + ucs::test_time_multiplier(); + + while ((m_server_recv_req_cnt == 0) && (ucs_get_time() < deadline)) { + progress(); + } + ASSERT_EQ(1, m_server_recv_req_cnt); + ucs_memory_cpu_load_fence(); + + if (reject) { + /* wrap errors since a reject is expected */ + scoped_log_handler slh(detect_reject_error_logger); + + status = uct_listener_reject(m_server->listener(), + m_delayed_conn_reqs.front()); + ASSERT_UCS_OK(status); + + wait_for_bits(&m_state, TEST_STATE_CLIENT_GOT_REJECT); + EXPECT_TRUE(m_state & TEST_STATE_CLIENT_GOT_REJECT); + } else { + 
server_accept(m_server, m_delayed_conn_reqs.front(), + server_connect_cb, server_disconnect_cb, this); + + wait_for_bits(&m_state, TEST_STATE_SERVER_CONNECTED | + TEST_STATE_CLIENT_CONNECTED); + EXPECT_TRUE(ucs_test_all_flags(m_state, TEST_STATE_SERVER_CONNECTED | + TEST_STATE_CLIENT_CONNECTED)); + } + + m_delayed_conn_reqs.pop(); + } + + static ucs_log_func_rc_t + detect_addr_route_error_logger(const char *file, unsigned line, const char *function, + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) + { + if (level == UCS_LOG_LEVEL_ERROR) { + std::string err_str = format_message(message, ap); + if ((strstr(err_str.c_str(), "client: got error event RDMA_CM_EVENT_ADDR_ERROR")) || + (strstr(err_str.c_str(), "client: got error event RDMA_CM_EVENT_ROUTE_ERROR")) || + (strstr(err_str.c_str(), "rdma_resolve_route(to addr=240.0.0.0")) || + (strstr(err_str.c_str(), "error event on client ep"))) { + UCS_TEST_MESSAGE << err_str; + return UCS_LOG_FUNC_RC_STOP; + } + } + return UCS_LOG_FUNC_RC_CONTINUE; + } + + static ucs_log_func_rc_t + detect_reject_error_logger(const char *file, unsigned line, const char *function, + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) + { + if (level == UCS_LOG_LEVEL_ERROR) { + std::string err_str = format_message(message, ap); + if (strstr(err_str.c_str(), "client: got error event RDMA_CM_EVENT_REJECTED")) { + UCS_TEST_MESSAGE << err_str; + return UCS_LOG_FUNC_RC_STOP; + } + } + return UCS_LOG_FUNC_RC_CONTINUE; + } + + static ucs_log_func_rc_t + detect_double_disconnect_error_logger(const char *file, unsigned line, + const char *function, ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) + { + if (level == UCS_LOG_LEVEL_ERROR) { + std::string err_str = format_message(message, ap); + if (err_str.find("duplicate call of uct_ep_disconnect") != + std::string::npos) { + 
UCS_TEST_MESSAGE << err_str; + return UCS_LOG_FUNC_RC_STOP; + } + } + return UCS_LOG_FUNC_RC_CONTINUE; + } + + void basic_listen_connect_disconnect() { + listen_and_connect(); + + wait_for_bits(&m_state, TEST_STATE_SERVER_CONNECTED | + TEST_STATE_CLIENT_CONNECTED); + EXPECT_TRUE(ucs_test_all_flags(m_state, (TEST_STATE_SERVER_CONNECTED | + TEST_STATE_CLIENT_CONNECTED))); + + cm_disconnect(m_client); + } + +protected: + ucs::sock_addr_storage m_listen_addr, m_connect_addr; + uint64_t m_state; + entity *m_server; + entity *m_client; + volatile int m_server_recv_req_cnt, m_client_connect_cb_cnt, + m_server_connect_cb_cnt; + volatile int m_server_disconnect_cnt, m_client_disconnect_cnt; + bool m_reject_conn_request; + bool m_server_start_disconnect; + bool m_delay_conn_reply; + std::queue m_delayed_conn_reqs; + size_t m_short_priv_data_len, m_long_priv_data_len; + std::vector m_short_priv_data; + std::vector m_long_priv_data; +}; + + +UCS_TEST_P(test_uct_cm_sockaddr, cm_query) +{ + ucs_status_t status; + size_t i; + + for (i = 0; i < m_entities.size(); ++i) { + uct_cm_attr_t attr; + attr.field_mask = UCT_CM_ATTR_FIELD_MAX_CONN_PRIV; + status = uct_cm_query(m_entities.at(i).cm(), &attr); + ASSERT_UCS_OK(status); + EXPECT_LT(0ul, attr.max_conn_priv); + } +} + +UCS_TEST_P(test_uct_cm_sockaddr, listener_query) +{ + uct_listener_attr_t attr; + ucs_status_t status; + uint16_t port; + char m_listener_ip_port_str[UCS_SOCKADDR_STRING_LEN]; + char attr_addr_ip_port_str[UCS_SOCKADDR_STRING_LEN]; + + start_listen(test_uct_cm_sockaddr::conn_request_cb); + + attr.field_mask = UCT_LISTENER_ATTR_FIELD_SOCKADDR; + status = uct_listener_query(m_server->listener(), &attr); + ASSERT_UCS_OK(status); + + ucs_sockaddr_str(m_listen_addr.get_sock_addr_ptr(), m_listener_ip_port_str, + UCS_SOCKADDR_STRING_LEN); + ucs_sockaddr_str((struct sockaddr*)&attr.sockaddr, attr_addr_ip_port_str, + UCS_SOCKADDR_STRING_LEN); + EXPECT_EQ(strcmp(m_listener_ip_port_str, attr_addr_ip_port_str), 0); + + status = 
ucs_sockaddr_get_port((struct sockaddr*)&attr.sockaddr, &port); + ASSERT_UCS_OK(status); + + EXPECT_EQ(m_listen_addr.get_port(), port); +} + +UCS_TEST_P(test_uct_cm_sockaddr, cm_open_listen_close) +{ + basic_listen_connect_disconnect(); +} + +UCS_TEST_P(test_uct_cm_sockaddr, cm_open_listen_close_large_priv_data) +{ + m_entities.clear(); + + /* Set the values for max send/recv socket buffers (for tcp_sockcm) to + * small enough values to have the send/recv of a large data buffer in + * batches, and not all data at once. + * Set the value of the transport's private data length to a large enough + * value to be able to send/recv the batches. + * A transport for which these values are not configurable, like rdmacm, + * these operations will fail and have no effect. */ + if (m_cm_config) { + /* coverity[check_return] */ + uct_config_modify(m_cm_config, "PRIV_DATA_LEN", "900KB"); + uct_config_modify(m_cm_config, "SNDBUF", "100KB"); + uct_config_modify(m_cm_config, "RCVBUF", "100KB"); + } + + /* recreate m_server and m_client with the above env parameters changed */ + init(); + basic_listen_connect_disconnect(); +} + +UCS_TEST_P(test_uct_cm_sockaddr, cm_open_listen_kill_server) +{ + listen_and_connect(); + + wait_for_bits(&m_state, TEST_STATE_SERVER_CONNECTED | + TEST_STATE_CLIENT_CONNECTED); + EXPECT_TRUE(ucs_test_all_flags(m_state, (TEST_STATE_SERVER_CONNECTED | + TEST_STATE_CLIENT_CONNECTED))); + + EXPECT_EQ(1ul, m_entities.remove(m_server)); + m_server = NULL; + + wait_for_bits(&m_state, TEST_STATE_CLIENT_DISCONNECTED); + EXPECT_TRUE(m_state & TEST_STATE_CLIENT_DISCONNECTED); +} + +UCS_TEST_P(test_uct_cm_sockaddr, cm_server_reject) +{ + m_reject_conn_request = true; + + /* wrap errors since a reject is expected */ + scoped_log_handler slh(detect_reject_error_logger); + + listen_and_connect(); + + wait_for_bits(&m_state, TEST_STATE_SERVER_REJECTED | + TEST_STATE_CLIENT_GOT_REJECT); + EXPECT_TRUE(ucs_test_all_flags(m_state, (TEST_STATE_SERVER_REJECTED | + 
TEST_STATE_CLIENT_GOT_REJECT))); + + EXPECT_FALSE((m_state & + (TEST_STATE_SERVER_CONNECTED | TEST_STATE_CLIENT_CONNECTED))); +} + +UCS_TEST_P(test_uct_cm_sockaddr, many_conns_on_client) +{ + int num_conns_on_client = ucs_max(2, 100 / ucs::test_time_multiplier()); + + m_server_start_disconnect = true; + + /* Listen */ + start_listen(conn_request_cb); + + /* Connect */ + /* multiple clients, on the same cm, connecting to the same server */ + for (int i = 0; i < num_conns_on_client; ++i) { + m_client->connect(i, *m_server, 0, m_connect_addr, client_priv_data_cb, + client_connect_cb, client_disconnect_cb, this); + } + + /* wait for the server to connect to all the endpoints on the cm */ + wait_for_client_server_counters(&m_server_connect_cb_cnt, + &m_client_connect_cb_cnt, + num_conns_on_client); + + EXPECT_EQ(num_conns_on_client, m_server_recv_req_cnt); + EXPECT_EQ(num_conns_on_client, m_client_connect_cb_cnt); + EXPECT_EQ(num_conns_on_client, m_server_connect_cb_cnt); + EXPECT_EQ(num_conns_on_client, (int)m_client->num_eps()); + EXPECT_EQ(num_conns_on_client, (int)m_server->num_eps()); + + /* Disconnect */ + cm_disconnect(m_server); + + /* wait for disconnect to complete */ + wait_for_client_server_counters(&m_server_disconnect_cnt, + &m_client_disconnect_cnt, + num_conns_on_client); + + EXPECT_EQ(num_conns_on_client, m_server_disconnect_cnt); + EXPECT_EQ(num_conns_on_client, m_client_disconnect_cnt); +} + +UCS_TEST_P(test_uct_cm_sockaddr, err_handle) +{ + /* client - try to connect to a server that isn't listening */ + m_client->connect(0, *m_server, 0, m_connect_addr, client_priv_data_cb, + client_connect_cb, client_disconnect_cb, this); + + EXPECT_FALSE(m_state & TEST_STATE_CONNECT_REQUESTED); + + /* with the TCP port space (which is currently tested with rdmacm), + * a REJECT event will be generated on the client side. + * with tcp_sockcm, an EPOLLERR event will be generated and transformed + * to an error code. 
*/ + wait_for_bits(&m_state, TEST_STATE_CLIENT_GOT_REJECT); + EXPECT_TRUE(ucs_test_all_flags(m_state, TEST_STATE_CLIENT_GOT_REJECT)); +} + +UCS_TEST_P(test_uct_cm_sockaddr, conn_to_non_exist_server_port) +{ + /* Listen */ + start_listen(test_uct_cm_sockaddr::conn_request_cb); + + m_connect_addr.set_port(htons(1)); + + /* wrap errors since a reject is expected */ + scoped_log_handler slh(detect_reject_error_logger); + + /* client - try to connect to a non-existing port on the server side. */ + m_client->connect(0, *m_server, 0, m_connect_addr, client_priv_data_cb, + client_connect_cb, client_disconnect_cb, this); + + /* with the TCP port space (which is currently tested with rdmacm), + * a REJECT event will be generated on the client side. + * with tcp_sockcm, an EPOLLERR event will be generated and transformed + * to an error code. */ + wait_for_bits(&m_state, TEST_STATE_CLIENT_GOT_REJECT); + EXPECT_TRUE(ucs_test_all_flags(m_state, TEST_STATE_CLIENT_GOT_REJECT)); +} + +UCS_TEST_P(test_uct_cm_sockaddr, connect_client_to_server_with_delay) +{ + test_delayed_server_response(false); + + cm_disconnect(m_client); +} + +UCS_TEST_P(test_uct_cm_sockaddr, connect_client_to_server_reject_with_delay) +{ + test_delayed_server_response(true); +} + +UCS_TEST_P(test_uct_cm_sockaddr, ep_disconnect_err_codes) +{ + bool disconnecting = false; + + listen_and_connect(); + + { + entity::scoped_async_lock lock(*m_client); + if (m_state & TEST_STATE_CLIENT_CONNECTED) { + UCS_TEST_MESSAGE << "EXP: " << ucs_status_string(UCS_OK); + EXPECT_EQ(UCS_OK, uct_ep_disconnect(m_client->ep(0), 0)); + disconnecting = true; + } else { + UCS_TEST_MESSAGE << "EXP: " << ucs_status_string(UCS_ERR_BUSY); + EXPECT_EQ(UCS_ERR_BUSY, uct_ep_disconnect(m_client->ep(0), 0)); + } + } + + wait_for_bits(&m_state, TEST_STATE_SERVER_CONNECTED | + TEST_STATE_CLIENT_CONNECTED); + EXPECT_TRUE(ucs_test_all_flags(m_state, (TEST_STATE_SERVER_CONNECTED | + TEST_STATE_CLIENT_CONNECTED))); + + { + entity::scoped_async_lock 
lock(*m_client); + if (disconnecting) { + scoped_log_handler slh(detect_double_disconnect_error_logger); + if (m_state & TEST_STATE_CLIENT_DISCONNECTED) { + UCS_TEST_MESSAGE << "EXP: " + << ucs_status_string(UCS_ERR_NOT_CONNECTED); + EXPECT_EQ(UCS_ERR_NOT_CONNECTED, + uct_ep_disconnect(m_client->ep(0), 0)); + } else { + UCS_TEST_MESSAGE << "EXP: " + << ucs_status_string(UCS_INPROGRESS); + EXPECT_EQ(UCS_INPROGRESS, + uct_ep_disconnect(m_client->ep(0), 0)); + } + } else { + UCS_TEST_MESSAGE << "EXP: " << ucs_status_string(UCS_OK); + ASSERT_UCS_OK(uct_ep_disconnect(m_client->ep(0), 0)); + disconnecting = true; + } + } + + ASSERT_TRUE(disconnecting); + wait_for_bits(&m_state, TEST_STATE_CLIENT_DISCONNECTED); + EXPECT_TRUE(m_state & TEST_STATE_CLIENT_DISCONNECTED); + + /* wrap errors since the client will call uct_ep_disconnect the second time + * on the same endpoint. this ep may not be disconnected yet */ + { + scoped_log_handler slh(detect_double_disconnect_error_logger); + UCS_TEST_MESSAGE << "EXP: " << ucs_status_string(UCS_ERR_NOT_CONNECTED); + EXPECT_EQ(UCS_ERR_NOT_CONNECTED, uct_ep_disconnect(m_client->ep(0), 0)); + } +} + +UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_cm_sockaddr) + + +class test_uct_cm_sockaddr_err_handle_non_exist_ip : public test_uct_cm_sockaddr { +public: + void init() { + /* tcp_sockcm requires setting this parameter to shorten the time of waiting + * for the connect() to fail when connecting to a non-existing ip. + * A transport for which this value is not configurable, like rdmacm, + * will have no effect. */ + modify_config("SYN_CNT", "1", true); + + test_uct_cm_sockaddr::init(); + } +}; + +UCS_TEST_P(test_uct_cm_sockaddr_err_handle_non_exist_ip, conn_to_non_exist_ip) +{ + struct sockaddr_in addr; + ucs_status_t status; + size_t size; + + /* Listen */ + start_listen(test_uct_cm_sockaddr::conn_request_cb); + + /* 240.0.0.0/4 - This block, formerly known as the Class E address + space, is reserved for future use; see [RFC1112], Section 4. 
+ therefore, this value can be used as a non-existing IP for this test */ + memset(&addr, 0, sizeof(struct sockaddr_in)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr("240.0.0.0"); + addr.sin_port = m_listen_addr.get_port(); + + status = ucs_sockaddr_sizeof((struct sockaddr*)&addr, &size); + ASSERT_UCS_OK(status); + + m_connect_addr.set_sock_addr(*(struct sockaddr*)&addr, size); + + /* wrap errors now since the client will try to connect to a non existing IP */ + { + scoped_log_handler slh(detect_addr_route_error_logger); + /* client - try to connect to a non-existing IP */ + m_client->connect(0, *m_server, 0, m_connect_addr, client_priv_data_cb, + client_connect_cb, client_disconnect_cb, this); + + wait_for_bits(&m_state, TEST_STATE_CLIENT_GOT_ERROR, 300); + EXPECT_TRUE(m_state & TEST_STATE_CLIENT_GOT_ERROR); + + EXPECT_FALSE(m_state & TEST_STATE_CONNECT_REQUESTED); + EXPECT_FALSE(m_state & + (TEST_STATE_SERVER_CONNECTED | TEST_STATE_CLIENT_CONNECTED)); + } +} + +UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_cm_sockaddr_err_handle_non_exist_ip) + + +class test_uct_cm_sockaddr_stress : public test_uct_cm_sockaddr { +public: + test_uct_cm_sockaddr_stress() : m_clients_num(0), + m_ep_init_disconnect_cnt(0) { + } + + typedef struct { + uct_ep_h ep; + volatile uint8_t state; + } ep_state_t; + + void init() { + test_uct_cm_sockaddr::init(); + + m_clients_num = ucs_max(2, 100 / ucs::test_time_multiplier()); + pthread_mutex_init(&m_lock, NULL); + } + + void cleanup() { + pthread_mutex_destroy(&m_lock); + test_uct_cm_sockaddr::cleanup(); + } + + int get_ep_index(uct_ep_h ep) { + for (int i = 0; i < (2 * m_clients_num); i++) { + if (m_all_eps[i].ep == ep) { + return i; + } + } + + return -1; + } + + void common_test_disconnect(uct_ep_h ep) { + int index; + + index = get_ep_index(ep); + ASSERT_GE(index, 0); + EXPECT_LT(index, (2 * m_clients_num)); + + pthread_mutex_lock(&m_lock); + m_all_eps[index].state |= TEST_EP_FLAG_DISCONNECT_CB_INVOKED; + + if 
(m_all_eps[index].state & TEST_EP_FLAG_DISCONNECT_INITIATOR) { + m_ep_init_disconnect_cnt--; + pthread_mutex_unlock(&m_lock); + } else { + pthread_mutex_unlock(&m_lock); + ASSERT_UCS_OK(uct_ep_disconnect(ep, 0)); + } + } + + void disconnect_cnt_increment(volatile int *cnt) { + pthread_mutex_lock(&m_lock); + (*cnt)++; + pthread_mutex_unlock(&m_lock); + } + + static void server_disconnect_cb(uct_ep_h ep, void *arg) { + test_uct_cm_sockaddr_stress *self = + reinterpret_cast(arg); + + self->common_test_disconnect(ep); + self->disconnect_cnt_increment(&self->m_server_disconnect_cnt); + } + + static void client_disconnect_cb(uct_ep_h ep, void *arg) { + test_uct_cm_sockaddr_stress *self = + reinterpret_cast(arg); + + self->common_test_disconnect(ep); + self->disconnect_cnt_increment(&self->m_client_disconnect_cnt); + } + + void server_accept(entity *server, uct_conn_request_h conn_request, + uct_cm_ep_server_conn_notify_callback_t notify_cb, + uct_ep_disconnect_cb_t disconnect_cb, + void *user_data) { + test_uct_cm_sockaddr::accept(server->cm(), conn_request, notify_cb, + disconnect_cb, user_data); + } + + static void + conn_request_cb(uct_listener_h listener, void *arg, + const uct_cm_listener_conn_request_args_t *conn_req_args) { + test_uct_cm_sockaddr_stress *self = + reinterpret_cast(arg); + + if (test_uct_cm_sockaddr::common_conn_request(listener, arg, conn_req_args)) { + EXPECT_TRUE(conn_req_args->field_mask & + UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CONN_REQUEST); + self->server_accept(self->m_server, conn_req_args->conn_request, + server_connect_cb, server_disconnect_cb, self); + } + + ucs_memory_cpu_store_fence(); + self->m_server_recv_req_cnt++; + } + +protected: + int m_clients_num; + std::vector m_all_eps; + int m_ep_init_disconnect_cnt; + pthread_mutex_t m_lock; +}; + +UCS_TEST_P(test_uct_cm_sockaddr_stress, many_clients_to_one_server) +{ + int i, disconnected_eps_on_each_side, no_disconnect_eps_cnt = 0; + entity *client_test; + time_t seed = time(0); + 
ucs_time_t deadline; + + /* Listen */ + start_listen(test_uct_cm_sockaddr_stress::conn_request_cb); + + /* Connect */ + /* multiple clients, each on a cm of its own, connecting to the same server */ + for (i = 0; i < m_clients_num; ++i) { + client_test = uct_test::create_entity(); + m_entities.push_back(client_test); + + client_test->max_conn_priv = client_test->cm_attr().max_conn_priv; + client_test->connect(0, *m_server, 0, m_connect_addr, client_priv_data_cb, + client_connect_cb, client_disconnect_cb, this); + } + + /* wait for the server to connect to all the clients */ + wait_for_client_server_counters(&m_server_connect_cb_cnt, + &m_client_connect_cb_cnt, m_clients_num); + + EXPECT_EQ(m_clients_num, m_server_recv_req_cnt); + EXPECT_EQ(m_clients_num, m_client_connect_cb_cnt); + EXPECT_EQ(m_clients_num, m_server_connect_cb_cnt); + EXPECT_EQ(m_clients_num, (int)m_server->num_eps()); + + /* Disconnect */ + srand(seed); + UCS_TEST_MESSAGE << "Using random seed: " << seed; + + m_all_eps.resize(2 * m_clients_num); + + /* save all the clients' and server's eps in the m_all_eps array */ + for (i = 0; i < m_clients_num; ++i) { + /* first 2 entities are m_server and m_client */ + m_all_eps[i].ep = m_entities.at(2 + i).ep(0); + m_all_eps[i].state = 0; + m_all_eps[m_clients_num + i].ep = m_server->ep(i); + m_all_eps[m_clients_num + i].state = 0; + } + + /* Disconnect */ + /* go over the eps array and for each ep - use rand() to decide whether or + * not it should initiate a disconnect */ + for (i = 0; i < (2 * m_clients_num); ++i) { + if ((ucs::rand() % 2) == 0) { + continue; + } + + /* don't start a disconnect on an ep that was already disconnected */ + pthread_mutex_lock(&m_lock); + if (!(m_all_eps[i].state & TEST_EP_FLAG_DISCONNECT_CB_INVOKED)) { + m_all_eps[i].state |= TEST_EP_FLAG_DISCONNECT_INITIATOR; + pthread_mutex_unlock(&m_lock); + /* uct_ep_disconnect cannot be called when m_lock is taken + * in order to prevent abba deadlock since uct will try taking + * the 
async lock inside this function */ + ASSERT_UCS_OK(uct_ep_disconnect(m_all_eps[i].ep, 0)); + /* count the number of eps that initiated a disconnect */ + pthread_mutex_lock(&m_lock); + m_ep_init_disconnect_cnt++; + } + pthread_mutex_unlock(&m_lock); + } + + /* wait for all the disconnect flows that began, to complete. + * if an ep initiated a disconnect, its disconnect callback should have been + * called, and so is the disconnect callback of its remote peer ep. + * every ep that initiated a disconnect is counted. this counter is + * decremented in its disconnect cb, therefore once all eps that initiated + * a disconnect are disconnected, this counter should be equal to zero */ + deadline = ucs_get_time() + ucs_time_from_sec(10 * DEFAULT_TIMEOUT_SEC) * + ucs::test_time_multiplier(); + + while ((m_ep_init_disconnect_cnt != 0) && (ucs_get_time() < deadline)) { + progress(); + } + EXPECT_EQ(0, m_ep_init_disconnect_cnt); + + /* count and print the number of eps that were not disconnected */ + for (i = 0; i < (2 * m_clients_num); i++) { + if (m_all_eps[i].state == 0) { + no_disconnect_eps_cnt++; + } else { + EXPECT_TRUE((m_all_eps[i].state & ~TEST_EP_FLAG_DISCONNECT_INITIATOR) == + TEST_EP_FLAG_DISCONNECT_CB_INVOKED); + } + } + + UCS_TEST_MESSAGE << no_disconnect_eps_cnt << + " (out of " << (2 * m_clients_num) << ") " + "eps were not disconnected during the test."; + + disconnected_eps_on_each_side = ((2 * m_clients_num) - no_disconnect_eps_cnt) / 2; + wait_for_client_server_counters(&m_server_disconnect_cnt, + &m_client_disconnect_cnt, + disconnected_eps_on_each_side); + + EXPECT_EQ(disconnected_eps_on_each_side, m_server_disconnect_cnt); + EXPECT_EQ(disconnected_eps_on_each_side, m_client_disconnect_cnt); + + /* destroy all the eps here (and not in the test's destruction flow) so that + * no disconnect callbacks are invoked after the test ends */ + m_entities.clear(); +} + +UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_cm_sockaddr_stress) + + +class 
test_uct_cm_sockaddr_multiple_cms : public test_uct_cm_sockaddr { +public: + void init() { + ucs_status_t status; + + test_uct_cm_sockaddr::init(); + + status = ucs_async_context_create(UCS_ASYNC_MODE_THREAD_SPINLOCK, + &m_test_async); + ASSERT_UCS_OK(status); + + status = uct_cm_config_read(GetParam()->component, NULL, NULL, &m_test_config); + ASSERT_UCS_OK(status); + + UCS_TEST_CREATE_HANDLE(uct_worker_h, m_test_worker, uct_worker_destroy, + uct_worker_create, m_test_async, + UCS_THREAD_MODE_SINGLE) + + UCS_TEST_CREATE_HANDLE(uct_cm_h, m_test_cm, uct_cm_close, + uct_cm_open, GetParam()->component, + m_test_worker, m_test_config); + } + + void cleanup() { + m_test_cm.reset(); + uct_config_release(m_test_config); + m_test_worker.reset(); + ucs_async_context_destroy(m_test_async); + test_uct_cm_sockaddr::cleanup(); + } + + void server_accept(entity *server, uct_conn_request_h conn_request, + uct_cm_ep_server_conn_notify_callback_t notify_cb, + uct_ep_disconnect_cb_t disconnect_cb, + void *user_data) + { + accept(m_test_cm, conn_request, notify_cb, disconnect_cb, user_data); + } + +protected: + ucs::handle m_test_worker; + ucs::handle m_test_cm; + ucs_async_context_t *m_test_async; + uct_cm_config_t *m_test_config; +}; + +UCS_TEST_P(test_uct_cm_sockaddr_multiple_cms, server_switch_cm) +{ + listen_and_connect(); + + wait_for_bits(&m_state, TEST_STATE_SERVER_CONNECTED | + TEST_STATE_CLIENT_CONNECTED); + EXPECT_TRUE(ucs_test_all_flags(m_state, (TEST_STATE_SERVER_CONNECTED | + TEST_STATE_CLIENT_CONNECTED))); + + cm_disconnect(m_client); + + /* destroy the server's ep here so that it would be destroyed before the cm + * it is using */ + m_server->destroy_ep(0); +} + +UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_cm_sockaddr_multiple_cms) diff --git a/test/gtest/uct/ib/test_ud.cc b/test/gtest/uct/ib/test_ud.cc index ebe5ad64314..9554e5e61c3 100644 --- a/test/gtest/uct/ib/test_ud.cc +++ b/test/gtest/uct/ib/test_ud.cc @@ -11,8 +11,10 @@ extern "C" { #include #include +#include 
#include #include +#include } @@ -33,13 +35,10 @@ class test_ud : public ud_base_test { return UCS_OK; } - static int rx_ack_count; - static int tx_ackreq_psn; - static ucs_status_t count_rx_acks(uct_ud_ep_t *ep, uct_ud_neth_t *neth) { if (UCT_UD_PSN_COMPARE(neth->ack_psn, >, ep->tx.acked_psn)) { - rx_ack_count++; + ucs_atomic_add32(&rx_ack_count, 1); } return UCS_OK; } @@ -52,36 +51,28 @@ class test_ud : public ud_base_test { return UCS_OK; } - static int rx_drop_count; - static ucs_status_t drop_rx(uct_ud_ep_t *ep, uct_ud_neth_t *neth) { - rx_drop_count++; + ucs_atomic_add32(&rx_drop_count, 1); if (neth->packet_type & UCT_UD_PACKET_FLAG_ACK_REQ) { tx_ack_psn = neth->psn; - ack_req_tx_cnt++; + ucs_atomic_add32(&ack_req_tx_cnt, 1); ucs_debug("RX: psn %u ack_req", neth->psn); } return UCS_ERR_BUSY; } - static int ack_req_tx_cnt; - - static uct_ud_psn_t tx_ack_psn; - static ucs_status_t ack_req_count_tx(uct_ud_ep_t *ep, uct_ud_neth_t *neth) { if (neth->packet_type & UCT_UD_PACKET_FLAG_ACK_REQ) { tx_ack_psn = neth->psn; - ack_req_tx_cnt++; + ucs_atomic_add32(&ack_req_tx_cnt, 1); } return UCS_OK; } - static int tx_count; - static ucs_status_t count_tx(uct_ud_ep_t *ep, uct_ud_neth_t *neth) { - tx_count++; + ucs_atomic_add32(&tx_count, 1); return UCS_OK; } @@ -170,20 +161,26 @@ class test_ud : public ud_base_test { EXPECT_EQ(4, ep(m_e1, 0)->tx.psn); EXPECT_EQ(3, ep(m_e1)->tx.acked_psn); } -}; -int test_ud::ack_req_tx_cnt = 0; -int test_ud::rx_ack_count = 0; -int test_ud::tx_ackreq_psn = 0; -int test_ud::rx_drop_count = 0; -int test_ud::tx_count = 0; -uct_ud_psn_t test_ud::tx_ack_psn = 0; + static volatile uint32_t rx_ack_count; + static volatile uint32_t rx_drop_count; + static volatile uint32_t ack_req_tx_cnt; + static volatile uint32_t tx_count; + static volatile uct_ud_psn_t tx_ackreq_psn; + static volatile uct_ud_psn_t tx_ack_psn; +}; -UCS_TEST_P(test_ud, basic_tx) { - unsigned i, N=13; +volatile uint32_t test_ud::ack_req_tx_cnt = 0; +volatile uint32_t 
test_ud::rx_ack_count = 0; +volatile uint32_t test_ud::rx_drop_count = 0; +volatile uint32_t test_ud::tx_count = 0; +volatile uct_ud_psn_t test_ud::tx_ackreq_psn = 0; +volatile uct_ud_psn_t test_ud::tx_ack_psn = 0; - check_caps(UCT_IFACE_FLAG_PUT_SHORT); +UCS_TEST_SKIP_COND_P(test_ud, basic_tx, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { + unsigned i, N = 13; disable_async(m_e1); disable_async(m_e2); @@ -207,10 +204,9 @@ UCS_TEST_P(test_ud, basic_tx) { EXPECT_EQ(0, ep(m_e2)->rx.acked_psn); } -UCS_TEST_P(test_ud, duplex_tx) { - unsigned i, N=5; - - check_caps(UCT_IFACE_FLAG_PUT_SHORT); +UCS_TEST_SKIP_COND_P(test_ud, duplex_tx, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { + unsigned i, N = 5; disable_async(m_e1); disable_async(m_e2); @@ -241,10 +237,9 @@ UCS_TEST_P(test_ud, duplex_tx) { } /* send full window, rcv ack after progreess, send some more */ -UCS_TEST_P(test_ud, tx_window1) { - unsigned i, N=13; - - check_caps(UCT_IFACE_FLAG_PUT_SHORT); +UCS_TEST_SKIP_COND_P(test_ud, tx_window1, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { + unsigned i, N = 13; disable_async(m_e1); disable_async(m_e2); @@ -269,10 +264,8 @@ UCS_TEST_P(test_ud, tx_window1) { /* basic flush */ /* send packet, flush, wait till flush ended */ -UCS_TEST_P(test_ud, flush_ep) { - - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - +UCS_TEST_SKIP_COND_P(test_ud, flush_ep, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { connect(); EXPECT_UCS_OK(tx(m_e1)); EXPECT_UCS_OK(ep_flush_b(m_e1)); @@ -280,10 +273,8 @@ UCS_TEST_P(test_ud, flush_ep) { validate_flush(); } -UCS_TEST_P(test_ud, flush_iface) { - - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - +UCS_TEST_SKIP_COND_P(test_ud, flush_iface, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { connect(); EXPECT_UCS_OK(tx(m_e1)); EXPECT_UCS_OK(iface_flush_b(m_e1)); @@ -297,10 +288,9 @@ UCS_TEST_P(test_ud, flush_iface) { * send full window, * should not be able to send some more */ -UCS_TEST_P(test_ud, tx_window2) { - unsigned i, N=13; - - check_caps(UCT_IFACE_FLAG_PUT_SHORT); 
+UCS_TEST_SKIP_COND_P(test_ud, tx_window2, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { + unsigned i, N = 13; disable_async(m_e1); disable_async(m_e2); @@ -323,10 +313,8 @@ UCS_TEST_P(test_ud, tx_window2) { /* last packet in window must have ack_req * answered with ack control message */ -UCS_TEST_P(test_ud, ack_req_single) { - - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - +UCS_TEST_SKIP_COND_P(test_ud, ack_req_single, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { connect(); disable_async(m_e1); disable_async(m_e2); @@ -349,10 +337,9 @@ UCS_TEST_P(test_ud, ack_req_single) { } /* test that ack request is sent on 1/4 of window */ -UCS_TEST_P(test_ud, ack_req_window) { - unsigned i, N=16; - - check_caps(UCT_IFACE_FLAG_PUT_SHORT); +UCS_TEST_SKIP_COND_P(test_ud, ack_req_window, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { + unsigned i, N = 16; disable_async(m_e1); disable_async(m_e2); @@ -378,9 +365,8 @@ UCS_TEST_P(test_ud, ack_req_window) { } /* simulate retransmission of the CREQ packet */ -UCS_TEST_P(test_ud, crep_drop1) { - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - +UCS_TEST_SKIP_COND_P(test_ud, crep_drop1, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { m_e1->connect_to_iface(0, *m_e2); /* setup filter to drop crep */ ep(m_e1, 0)->rx.rx_hook = drop_ctl; @@ -401,9 +387,8 @@ UCS_TEST_P(test_ud, crep_drop1) { /* check that creq is not left on tx window if * both sides connect simultaniously. 
*/ -UCS_TEST_P(test_ud, crep_drop2) { - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - +UCS_TEST_SKIP_COND_P(test_ud, crep_drop2, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { connect_to_iface(); ep(m_e1)->rx.rx_hook = drop_ctl; @@ -461,7 +446,7 @@ UCS_TEST_P(test_ud, crep_ack_drop) { set_tx_win(m_e1, 10); do { - status = send_am_message(m_e1, 1, 0); + status = send_am_message(m_e1); progress(); } while (status == UCS_ERR_NO_RESOURCE); ASSERT_UCS_OK(status); @@ -477,7 +462,7 @@ UCS_TEST_P(test_ud, crep_ack_drop) { twait(500); short_progress_loop(); - status = send_am_message(m_e1, 1, 0); + status = send_am_message(m_e1); ASSERT_UCS_OK(status); short_progress_loop(); @@ -499,13 +484,12 @@ UCS_TEST_P(test_ud, creq_flush) { EXPECT_EQ(UCS_INPROGRESS, status); } -UCS_TEST_P(test_ud, ca_ai) { +UCS_TEST_SKIP_COND_P(test_ud, ca_ai, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { ucs_status_t status; int prev_cwnd; int max_window; - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - /* check initial window */ disable_async(m_e1); disable_async(m_e2); @@ -548,18 +532,16 @@ UCS_TEST_P(test_ud, ca_ai) { } } -UCS_TEST_P(test_ud, ca_md, "IB_TX_QUEUE_LEN=" UCS_PP_MAKE_STRING(UCT_UD_CA_MAX_WINDOW)) { +/* skip valgrind for now */ +UCS_TEST_SKIP_COND_P(test_ud, ca_md, + (RUNNING_ON_VALGRIND || + !check_caps(UCT_IFACE_FLAG_AM_SHORT)), + "IB_TX_QUEUE_LEN=" UCS_PP_MAKE_STRING(UCT_UD_CA_MAX_WINDOW)) { + unsigned prev_cwnd, new_cwnd; + uint32_t new_tx_count; ucs_status_t status; - int prev_cwnd, new_cwnd; - int i; - - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - - if (RUNNING_ON_VALGRIND) { - /* skip valgrind for now */ - UCS_TEST_SKIP_R("skipping on valgrind"); - } + unsigned num_sent; connect(); @@ -569,52 +551,63 @@ UCS_TEST_P(test_ud, ca_md, "IB_TX_QUEUE_LEN=" UCS_PP_MAKE_STRING(UCT_UD_CA_MAX_W * on receive drop all packets. 
After several retransmission * attempts the window will be reduced to the minimum */ + uct_ud_enter(iface(m_e1)); set_tx_win(m_e1, UCT_UD_CA_MAX_WINDOW); ep(m_e2, 0)->rx.rx_hook = drop_rx; - for (i = 1; i < UCT_UD_CA_MAX_WINDOW; i++) { + uct_ud_leave(iface(m_e1)); + + num_sent = 0; + while (num_sent < UCT_UD_CA_MAX_WINDOW) { status = tx(m_e1); if (status == UCS_ERR_NO_RESOURCE) { // the congestion window can shrink by async timer if ACKs are // not received fast enough - EXPECT_GT(i, 1); /* at least one packet should be sent */ break; } - EXPECT_UCS_OK(status); + ASSERT_UCS_OK(status); progress(); + ++num_sent; } short_progress_loop(); + UCS_TEST_MESSAGE << "sent " << num_sent << " packets"; + EXPECT_GE(num_sent, 1u); /* at least one packet should be sent */ + ep(m_e1)->tx.tx_hook = count_tx; do { + uct_ud_enter(iface(m_e1)); + tx_count = 0; prev_cwnd = ep(m_e1, 0)->ca.cwnd; - tx_count = 0; + uct_ud_leave(iface(m_e1)); + do { progress(); } while (ep(m_e1, 0)->ca.cwnd > (prev_cwnd / UCT_UD_CA_MD_FACTOR)); short_progress_loop(); - new_cwnd = ep(m_e1, 0)->ca.cwnd; - EXPECT_GE(tx_count, new_cwnd - 1); + uct_ud_enter(iface(m_e1)); + new_cwnd = ep(m_e1, 0)->ca.cwnd; + new_tx_count = tx_count; + uct_ud_leave(iface(m_e1)); + + EXPECT_GE(new_tx_count, ucs_min(new_cwnd - 1, num_sent)); if (new_cwnd > UCT_UD_CA_MIN_WINDOW) { /* up to 3 additional ack_reqs per each resend */ - EXPECT_LE(tx_count, (prev_cwnd - new_cwnd) + - (int)(3 * ucs_ilog2(prev_cwnd/new_cwnd))); - } + int order = ucs_ilog2(prev_cwnd / new_cwnd); + EXPECT_LE(new_tx_count, (prev_cwnd - new_cwnd + 3) * order); + } } while (ep(m_e1, 0)->ca.cwnd > UCT_UD_CA_MIN_WINDOW); } -UCS_TEST_P(test_ud, ca_resend) { +UCS_TEST_SKIP_COND_P(test_ud, ca_resend, + (RUNNING_ON_VALGRIND || + !check_caps(UCT_IFACE_FLAG_AM_SHORT))) { - int max_window = 10; + int max_window = 9; int i; ucs_status_t status; - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - - if (RUNNING_ON_VALGRIND) { - UCS_TEST_SKIP_R("skipping on valgrind"); - } connect(); 
set_tx_win(m_e1, max_window); @@ -629,20 +622,10 @@ UCS_TEST_P(test_ud, ca_resend) { do { progress(); } while(ep(m_e1)->ca.cwnd > max_window/2); - /* expect that: - * 4 packets will be retransmitted - * first packet will have ack_req, - * there will 2 ack_reqs - * in addition there may be up to two - * standalone ack_reqs - */ - disable_async(m_e1); - disable_async(m_e2); + /* expect at least 1 drop and 1 ack req */ short_progress_loop(100); - EXPECT_LE(0, rx_drop_count); - EXPECT_GE(4+2, rx_drop_count); - EXPECT_LE(0, ack_req_tx_cnt); - EXPECT_GE(2+2, ack_req_tx_cnt); + EXPECT_GE(rx_drop_count, 1u); + EXPECT_GE(ack_req_tx_cnt, 1u); } UCS_TEST_P(test_ud, connect_iface_single_drop_creq) { @@ -658,9 +641,8 @@ UCS_TEST_P(test_ud, connect_iface_single_drop_creq) { } #endif -UCS_TEST_P(test_ud, connect_iface_single) { - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - +UCS_TEST_SKIP_COND_P(test_ud, connect_iface_single, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { /* single connect */ m_e1->connect_to_iface(0, *m_e2); short_progress_loop(TEST_UD_PROGRESS_TIMEOUT); @@ -687,9 +669,8 @@ UCS_TEST_P(test_ud, connect_iface_2to1) { EXPECT_EQ(1, ucs_frag_list_sn(&ep(m_e1, 1)->rx.ooo_pkts)); } -UCS_TEST_P(test_ud, connect_iface_seq) { - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - +UCS_TEST_SKIP_COND_P(test_ud, connect_iface_seq, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { /* sequential connect from both sides */ m_e1->connect_to_iface(0, *m_e2); validate_connect(ep(m_e1), 0U); @@ -782,17 +763,18 @@ UCS_TEST_P(test_ud, ep_destroy_simple) { EXPECT_EQ(1U, ud_ep2->ep_id); } -UCS_TEST_P(test_ud, ep_destroy_flush) { +UCS_TEST_SKIP_COND_P(test_ud, ep_destroy_flush, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { uct_ep_h ep; ucs_status_t status; uct_ud_ep_t *ud_ep1; uct_ep_params_t ep_params; - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - connect(); EXPECT_UCS_OK(tx(m_e1)); short_progress_loop(); + + /* m_e1::ep[0] has to be revoked at the end of the testing */ uct_ep_destroy(m_e1->ep(0)); /* ep destroy should try 
to flush outstanding packets */ short_progress_loop(); @@ -806,18 +788,26 @@ UCS_TEST_P(test_ud, ep_destroy_flush) { ud_ep1 = ucs_derived_of(ep, uct_ud_ep_t); EXPECT_EQ(1U, ud_ep1->ep_id); uct_ep_destroy(ep); -} -UCS_TEST_P(test_ud, ep_destroy_passive) { - check_caps(UCT_IFACE_FLAG_PUT_SHORT); + /* revoke m_e1::ep[0] as it was destroyed manually */ + m_e1->revoke_ep(0); +} +UCS_TEST_SKIP_COND_P(test_ud, ep_destroy_passive, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { connect(); + + /* m_e2::ep[0] has to be revoked at the end of the testing */ uct_ep_destroy(m_e2->ep(0)); + /* destroyed ep must still accept data */ EXPECT_UCS_OK(tx(m_e1)); EXPECT_UCS_OK(ep_flush_b(m_e1)); validate_flush(); + + /* revoke m_e2::ep[0] as it was destroyed manually */ + m_e2->revoke_ep(0); } UCS_TEST_P(test_ud, ep_destroy_creq) { @@ -830,7 +820,7 @@ UCS_TEST_P(test_ud, ep_destroy_creq) { m_e1->connect_to_iface(0, *m_e2); short_progress_loop(TEST_UD_PROGRESS_TIMEOUT); - uct_ep_destroy(m_e1->ep(0)); + m_e1->destroy_ep(0); /* check that ep id are not reused on both sides */ ep_params.field_mask = UCT_EP_PARAM_FIELD_IFACE; @@ -850,93 +840,12 @@ UCS_TEST_P(test_ud, ep_destroy_creq) { EXPECT_EQ(1U, ud_ep->ep_id); } -/* check that the amount of reserved skbs is not less than - * iface tx queue len - */ -UCS_TEST_P(test_ud, res_skb_basic) { - uct_ud_send_skb_t *skb; - uct_ud_iface_t *ud_if; - int i, tx_qlen; - - connect(); - - ud_if = iface(m_e1); - tx_qlen = ud_if->tx.available; - - uct_ud_send_skb_t *used_skbs[tx_qlen]; - - for (i = 0; i < tx_qlen; i++) { - skb = uct_ud_iface_resend_skb_get(ud_if); - ASSERT_TRUE(skb); - used_skbs[i] = skb; - } - - for (i = 0; i < tx_qlen; i++) { - uct_ud_iface_resend_skb_put(ud_if, used_skbs[i]); - } -} - -/* test that reserved skb is not being reused while it is still in flight - */ -UCS_TEST_P(test_ud, res_skb_tx) { - - uct_ud_iface_t *ud_if; - int poll_sn; - uct_ud_send_skb_t *skb; - int n, tx_count; - - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - - 
disable_async(m_e1); - disable_async(m_e2); - connect(); - EXPECT_UCS_OK(tx(m_e1)); - short_progress_loop(); - - ud_if = iface(m_e1); - n = tx_count = 0; - poll_sn = 1; - while(n < 100) { - while(uct_ud_iface_can_tx(ud_if)) { - uct_ud_put_hdr_t *put_hdr; - uct_ud_neth_t *neth; - - skb = uct_ud_iface_resend_skb_get(ud_if); - ASSERT_TRUE(skb); - VALGRIND_MAKE_MEM_DEFINED(skb, sizeof *skb); - ASSERT_LT(skb->flags, poll_sn); - skb->flags = poll_sn; - - /* simulate put */ - neth = skb->neth; - uct_ud_neth_init_data(ep(m_e1), neth); - uct_ud_neth_set_type_put(ep(m_e1), neth); - uct_ud_neth_ack_req(ep(m_e1), neth); - - put_hdr = (uct_ud_put_hdr_t *)(neth+1); - put_hdr->rva = (uint64_t)&m_dummy; - memcpy(put_hdr+1, &m_dummy, sizeof(m_dummy)); - skb->len = sizeof(*neth) + sizeof(*put_hdr) + sizeof(m_dummy); - - ucs_derived_of(ud_if->super.ops, uct_ud_iface_ops_t)->tx_skb(ep(m_e1), - skb, 0); - uct_ud_iface_resend_skb_put(ud_if, skb); - tx_count++; - } - short_progress_loop(1); - poll_sn++; - n++; - } -} - #if UCT_UD_EP_DEBUG_HOOKS /* Simulate loss of ctl packets during simultaneous CREQs. * Use-case: CREQ and CREP packets from m_e2 to m_e1 are lost. 
* Check: that both eps (m_e1 and m_e2) are connected finally */ -UCS_TEST_P(test_ud, ctls_loss) { - - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - +UCS_TEST_SKIP_COND_P(test_ud, ctls_loss, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { iface(m_e2)->tx.available = 0; connect_to_iface(); @@ -965,6 +874,43 @@ UCS_TEST_P(test_ud, ctls_loss) { } #endif -_UCT_INSTANTIATE_TEST_CASE(test_ud, ud) -_UCT_INSTANTIATE_TEST_CASE(test_ud, ud_mlx5) +UCT_INSTANTIATE_UD_TEST_CASE(test_ud) + +#ifdef HAVE_MLX5_HW +extern "C" { +#include +} +#endif + +class test_ud_iface_attrs : public test_uct_iface_attrs { +public: + attr_map_t get_num_iov() { + attr_map_t iov_map; +#ifdef HAVE_MLX5_HW + if (has_transport("ud_mlx5")) { + // For am zcopy just small constant number of iovs is allowed + // (to preserve some inline space for AM zcopy header) + iov_map["am"] = UCT_IB_MLX5_AM_ZCOPY_MAX_IOV; + + } else +#endif + { + EXPECT_TRUE(has_transport("ud_verbs")); + uct_ud_verbs_iface_t *iface = ucs_derived_of(m_e->iface(), + uct_ud_verbs_iface_t); + size_t max_sge = 0; + EXPECT_UCS_OK(uct_ud_verbs_qp_max_send_sge(iface, &max_sge)); + iov_map["am"] = max_sge; + } + + return iov_map; + } +}; + +UCS_TEST_P(test_ud_iface_attrs, iface_attrs) +{ + basic_iov_test(); +} + +UCT_INSTANTIATE_UD_TEST_CASE(test_ud_iface_attrs) diff --git a/test/gtest/uct/ib/test_ud_ds.cc b/test/gtest/uct/ib/test_ud_ds.cc index 3c3cef25463..0dabb05f032 100644 --- a/test/gtest/uct/ib/test_ud_ds.cc +++ b/test/gtest/uct/ib/test_ud_ds.cc @@ -4,7 +4,7 @@ * See file LICENSE for terms. 
*/ -#include +#include "ud_base.h" extern "C" { #include @@ -63,15 +63,32 @@ class test_ud_ds : public uct_test { unsigned test_ud_ds::N = 1000; UCS_TEST_P(test_ud_ds, if_addr) { - union ibv_gid gid1, gid2; - uint16_t lid1, lid2; - uct_ib_address_unpack(ib_adr1, &lid1, &gid1); - uct_ib_address_unpack(ib_adr2, &lid2, &gid2); - EXPECT_EQ(lid1, lid2); - EXPECT_EQ(gid1.global.subnet_prefix, gid2.global.subnet_prefix); - EXPECT_EQ(gid1.global.interface_id, gid2.global.interface_id); + uct_ib_address_pack_params_t unpack_params1, unpack_params2; + + uct_ib_address_unpack(ib_adr1, &unpack_params1); + uct_ib_address_unpack(ib_adr2, &unpack_params2); + EXPECT_EQ(unpack_params1.lid, unpack_params2.lid); + EXPECT_EQ(unpack_params1.gid.global.subnet_prefix, + unpack_params2.gid.global.subnet_prefix); + EXPECT_EQ(unpack_params1.gid.global.interface_id, + unpack_params2.gid.global.interface_id); EXPECT_NE(uct_ib_unpack_uint24(if_adr1.qp_num), uct_ib_unpack_uint24(if_adr2.qp_num)); + + EXPECT_TRUE(!(unpack_params1.flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU)); + EXPECT_EQ(UCT_IB_ADDRESS_INVALID_PATH_MTU, unpack_params1.path_mtu); + EXPECT_TRUE(!(unpack_params2.flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU)); + EXPECT_EQ(UCT_IB_ADDRESS_INVALID_PATH_MTU, unpack_params2.path_mtu); + + EXPECT_TRUE(!(unpack_params1.flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX)); + EXPECT_EQ(UCT_IB_ADDRESS_INVALID_GID_INDEX, unpack_params1.gid_index); + EXPECT_TRUE(!(unpack_params2.flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX)); + EXPECT_EQ(UCT_IB_ADDRESS_INVALID_GID_INDEX, unpack_params2.gid_index); + + EXPECT_TRUE((unpack_params1.flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) != 0); + EXPECT_EQ(UCT_IB_ADDRESS_DEFAULT_PKEY, unpack_params1.pkey); + EXPECT_TRUE((unpack_params2.flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) != 0); + EXPECT_EQ(UCT_IB_ADDRESS_DEFAULT_PKEY, unpack_params2.pkey); } void test_ud_ds::test_cep_insert(entity *e, uct_ib_address_t *ib_addr, @@ -84,14 +101,16 @@ void test_ud_ds::test_cep_insert(entity *e, 
uct_ib_address_t *ib_addr, e->create_ep(i + base); EXPECT_EQ(i+base, ep(e, i + base)->ep_id); EXPECT_EQ((unsigned)UCT_UD_EP_NULL_ID, ep(e, i + base)->dest_ep_id); - EXPECT_UCS_OK(uct_ud_iface_cep_insert(iface(e), ib_addr, if_addr, ep(e, i + base), UCT_UD_EP_CONN_ID_MAX)); + EXPECT_UCS_OK(uct_ud_iface_cep_insert(iface(e), ib_addr, if_addr, + ep(e, i + base), + UCT_UD_EP_CONN_ID_MAX, 0)); EXPECT_EQ(i, ep(e, i + base)->conn_id); } /* lookup non existing ep */ - my_ep = uct_ud_iface_cep_lookup(iface(e), ib_addr, if_addr, 3333); + my_ep = uct_ud_iface_cep_lookup(iface(e), ib_addr, if_addr, 3333, 0); EXPECT_TRUE(my_ep == NULL); for (i = 0; i < N; i++) { - my_ep = uct_ud_iface_cep_lookup(iface(e), ib_addr, if_addr, i); + my_ep = uct_ud_iface_cep_lookup(iface(e), ib_addr, if_addr, i, 0); EXPECT_TRUE(my_ep != NULL); EXPECT_EQ(i+base, ep(e, i + base)->ep_id); EXPECT_EQ(i, ep(e, i + base)->conn_id); @@ -109,12 +128,15 @@ UCS_TEST_P(test_ud_ds, cep_rollback) { m_e1->create_ep(0); EXPECT_EQ(0U, ep(m_e1, 0)->ep_id); EXPECT_EQ((unsigned)UCT_UD_EP_NULL_ID, ep(m_e1, 0)->dest_ep_id); - EXPECT_UCS_OK(uct_ud_iface_cep_insert(iface(m_e1), ib_adr1, &if_adr1, ep(m_e1, 0), UCT_UD_EP_CONN_ID_MAX)); + EXPECT_UCS_OK(uct_ud_iface_cep_insert(iface(m_e1), ib_adr1, &if_adr1, + ep(m_e1, 0), UCT_UD_EP_CONN_ID_MAX, 0)); EXPECT_EQ(0U, ep(m_e1, 0)->conn_id); uct_ud_iface_cep_rollback(iface(m_e1), ib_adr1, &if_adr1, ep(m_e1, 0)); - EXPECT_UCS_OK(uct_ud_iface_cep_insert(iface(m_e1), ib_adr1, &if_adr1, ep(m_e1, 0), UCT_UD_EP_CONN_ID_MAX)); + EXPECT_UCS_OK(uct_ud_iface_cep_insert(iface(m_e1), ib_adr1, + &if_adr1, ep(m_e1, 0), + UCT_UD_EP_CONN_ID_MAX, 0)); EXPECT_EQ(0U, ep(m_e1, 0)->conn_id); } @@ -127,31 +149,36 @@ UCS_TEST_P(test_ud_ds, cep_replace) { /* Assume that we have 5 connections pending and 3 CREQs received */ m_e1->create_ep(N); - EXPECT_UCS_OK(uct_ud_iface_cep_insert(iface(m_e1), ib_adr1, &if_adr1, ep(m_e1, N), N+1)); + EXPECT_UCS_OK(uct_ud_iface_cep_insert(iface(m_e1), ib_adr1, &if_adr1, + 
ep(m_e1, N), N + 1, 0)); EXPECT_EQ(N+1, ep(m_e1, N)->conn_id); m_e1->create_ep(N+1); - EXPECT_UCS_OK(uct_ud_iface_cep_insert(iface(m_e1), ib_adr1, &if_adr1, ep(m_e1, N+1), N+4)); + EXPECT_UCS_OK(uct_ud_iface_cep_insert(iface(m_e1), ib_adr1, &if_adr1, + ep(m_e1, N + 1), N + 4, 0)); EXPECT_EQ(N+4, ep(m_e1, N+1)->conn_id); m_e1->create_ep(N+2); - EXPECT_UCS_OK(uct_ud_iface_cep_insert(iface(m_e1), ib_adr1, &if_adr1, ep(m_e1, N+2), N+5)); + EXPECT_UCS_OK(uct_ud_iface_cep_insert(iface(m_e1), ib_adr1, &if_adr1, + ep(m_e1, N + 2), N + 5, 0)); EXPECT_EQ(N+5, ep(m_e1, N+2)->conn_id); /* we initiate 2 connections */ - my_ep = uct_ud_iface_cep_lookup(iface(m_e1), ib_adr1, &if_adr1, UCT_UD_EP_CONN_ID_MAX); + my_ep = uct_ud_iface_cep_lookup(iface(m_e1), ib_adr1, &if_adr1, + UCT_UD_EP_CONN_ID_MAX, 0); EXPECT_TRUE(my_ep == NULL); m_e1->create_ep(N+3); /* slot N must be free. conn_id will be N+1 when inserting ep with no id */ - EXPECT_UCS_OK(uct_ud_iface_cep_insert(iface(m_e1), ib_adr1, &if_adr1, ep(m_e1, N+3), UCT_UD_EP_CONN_ID_MAX)); + EXPECT_UCS_OK(uct_ud_iface_cep_insert(iface(m_e1), ib_adr1, &if_adr1, + ep(m_e1, N + 3), UCT_UD_EP_CONN_ID_MAX, + 0)); EXPECT_EQ(N, ep(m_e1, N+3)->conn_id); /* slot N+1 already occupied */ - my_ep = uct_ud_iface_cep_lookup(iface(m_e1), ib_adr1, &if_adr1, UCT_UD_EP_CONN_ID_MAX); + my_ep = uct_ud_iface_cep_lookup(iface(m_e1), ib_adr1, &if_adr1, + UCT_UD_EP_CONN_ID_MAX, 0); EXPECT_TRUE(my_ep != NULL); EXPECT_EQ(N+1, my_ep->conn_id); } -_UCT_INSTANTIATE_TEST_CASE(test_ud_ds, ud) -_UCT_INSTANTIATE_TEST_CASE(test_ud_ds, ud_mlx5) - +UCT_INSTANTIATE_UD_TEST_CASE(test_ud_ds) diff --git a/test/gtest/uct/ib/test_ud_pending.cc b/test/gtest/uct/ib/test_ud_pending.cc index a48904d65b6..e41e18a6063 100644 --- a/test/gtest/uct/ib/test_ud_pending.cc +++ b/test/gtest/uct/ib/test_ud_pending.cc @@ -31,6 +31,8 @@ class test_ud_pending : public ud_base_test { req_count = 0; me = this; m_e1->connect_to_iface(0, *m_e2); + disable_async(m_e1); + disable_async(m_e2); 
set_tx_win(m_e1, UCT_UD_CA_MAX_WINDOW); /* ep is not connected yet */ EXPECT_EQ(UCS_ERR_NO_RESOURCE, tx(m_e1)); @@ -91,12 +93,11 @@ int test_ud_pending::req_count = 0; test_ud_pending *test_ud_pending::me = 0; /* add/purge requests */ -UCS_TEST_P(test_ud_pending, async_progress) { +UCS_TEST_SKIP_COND_P(test_ud_pending, async_progress, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { uct_pending_req_t r[N]; int i; - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - req_count = 0; connect(); @@ -113,12 +114,11 @@ UCS_TEST_P(test_ud_pending, async_progress) { EXPECT_EQ(N, req_count); } -UCS_TEST_P(test_ud_pending, sync_progress) { +UCS_TEST_SKIP_COND_P(test_ud_pending, sync_progress, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { uct_pending_req_t r[N]; int i; - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - req_count = 0; connect(); @@ -136,12 +136,11 @@ UCS_TEST_P(test_ud_pending, sync_progress) { EXPECT_EQ(N, req_count); } -UCS_TEST_P(test_ud_pending, err_busy) { +UCS_TEST_SKIP_COND_P(test_ud_pending, err_busy, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { uct_pending_req_t r[N]; int i; - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - req_count = 0; connect(); @@ -159,34 +158,27 @@ UCS_TEST_P(test_ud_pending, err_busy) { EXPECT_EQ(N, req_count); } -UCS_TEST_P(test_ud_pending, connect) +UCS_TEST_SKIP_COND_P(test_ud_pending, connect, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - - disable_async(m_e1); - disable_async(m_e2); post_pending_reqs(); check_pending_reqs(true); } -UCS_TEST_P(test_ud_pending, flush) +UCS_TEST_SKIP_COND_P(test_ud_pending, flush, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - - disable_async(m_e1); - disable_async(m_e2); post_pending_reqs(); flush(); check_pending_reqs(false); } -UCS_TEST_P(test_ud_pending, window) +UCS_TEST_SKIP_COND_P(test_ud_pending, window, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { int i; uct_pending_req_t r; - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - req_count = 0; me = this; 
connect(); @@ -202,14 +194,13 @@ UCS_TEST_P(test_ud_pending, window) uct_ep_pending_purge(m_e1->ep(0), purge_cb, NULL); } -UCS_TEST_P(test_ud_pending, tx_wqe) +UCS_TEST_SKIP_COND_P(test_ud_pending, tx_wqe, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { int i; uct_pending_req_t r; ucs_status_t status; - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - req_count = 0; me = this; disable_async(m_e1); @@ -231,6 +222,4 @@ UCS_TEST_P(test_ud_pending, tx_wqe) uct_ep_pending_purge(m_e1->ep(0), purge_cb, NULL); } -_UCT_INSTANTIATE_TEST_CASE(test_ud_pending, ud) -_UCT_INSTANTIATE_TEST_CASE(test_ud_pending, ud_mlx5) - +UCT_INSTANTIATE_UD_TEST_CASE(test_ud_pending) diff --git a/test/gtest/uct/ib/test_ud_slow_timer.cc b/test/gtest/uct/ib/test_ud_timer.cc similarity index 69% rename from test/gtest/uct/ib/test_ud_slow_timer.cc rename to test/gtest/uct/ib/test_ud_timer.cc index d2555d6e638..de0a4ccd640 100644 --- a/test/gtest/uct/ib/test_ud_slow_timer.cc +++ b/test/gtest/uct/ib/test_ud_timer.cc @@ -17,7 +17,7 @@ extern "C" { } -class test_ud_slow_timer : public ud_base_test { +class test_ud_timer : public ud_base_test { public: /* ack while doing retransmit */ static int packet_count, rx_limit; @@ -39,7 +39,7 @@ class test_ud_slow_timer : public ud_base_test { uct_ud_iface_t); /* hack to disable retransmit */ - ep->tx.send_time = ucs_twheel_get_time(&iface->async.slow_timer); + ep->tx.send_time = ucs_twheel_get_time(&iface->tx.timer); tick_count++; return UCS_OK; } @@ -62,7 +62,7 @@ class test_ud_slow_timer : public ud_base_test { { ucs_time_t deadline = ucs_get_time() + ucs_time_from_sec(60) * ucs::test_time_multiplier(); - void *ud_ep_tmp; + void *ud_ep_tmp GTEST_ATTRIBUTE_UNUSED_; while ((ucs_get_time() < deadline) && ucs_ptr_array_lookup(&iface->eps, ep_idx, ud_ep_tmp)) { @@ -71,15 +71,14 @@ class test_ud_slow_timer : public ud_base_test { } }; -int test_ud_slow_timer::rx_limit = 10; -int test_ud_slow_timer::packet_count = 0; -int test_ud_slow_timer::tick_count = 0; +int 
test_ud_timer::rx_limit = 10; +int test_ud_timer::packet_count = 0; +int test_ud_timer::tick_count = 0; /* single packet received without progress */ -UCS_TEST_P(test_ud_slow_timer, tx1) { - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - +UCS_TEST_SKIP_COND_P(test_ud_timer, tx1, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { connect(); EXPECT_UCS_OK(tx(m_e1)); wait_for_rx_sn(1); @@ -88,10 +87,9 @@ UCS_TEST_P(test_ud_slow_timer, tx1) { } /* multiple packets received without progress */ -UCS_TEST_P(test_ud_slow_timer, txn) { - unsigned i, N=42; - - check_caps(UCT_IFACE_FLAG_PUT_SHORT); +UCS_TEST_SKIP_COND_P(test_ud_timer, txn, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { + unsigned i, N = 42; connect(); set_tx_win(m_e1, 1024); @@ -103,8 +101,8 @@ UCS_TEST_P(test_ud_slow_timer, txn) { EXPECT_EQ(N, ucs_frag_list_sn(&ep(m_e2)->rx.ooo_pkts)); } -UCS_TEST_P(test_ud_slow_timer, ep_destroy, "UD_TIMEOUT=1s") { - void *ud_ep_tmp; +UCS_TEST_P(test_ud_timer, ep_destroy, "UD_TIMEOUT=1s") { + void *ud_ep_tmp GTEST_ATTRIBUTE_UNUSED_; connect(); uct_ud_ep_t *ud_ep = ep(m_e1); @@ -118,9 +116,33 @@ UCS_TEST_P(test_ud_slow_timer, ep_destroy, "UD_TIMEOUT=1s") { EXPECT_FALSE(ucs_ptr_array_lookup(&iface->eps, ep_idx, ud_ep_tmp)); } +UCS_TEST_P(test_ud_timer, backoff_config) { + /* check minimum allowed value */ + ASSERT_UCS_OK(uct_config_modify(m_iface_config, + "UD_TIMER_BACKOFF", + ucs::to_string(UCT_UD_MIN_TIMER_TIMER_BACKOFF).c_str())); + entity *e = uct_test::create_entity(0); + m_entities.push_back(e); + + { + /* iface creation should fail with back off value less than + * UCT_UD_MIN_TIMER_TIMER_BACKOFF */ + ASSERT_UCS_OK(uct_config_modify(m_iface_config, + "UD_TIMER_BACKOFF", + ucs::to_string(UCT_UD_MIN_TIMER_TIMER_BACKOFF - 0.1).c_str())); + scoped_log_handler wrap_err(wrap_errors_logger); + uct_iface_h iface; + ucs_status_t status = uct_iface_open(e->md(), e->worker(), + &e->iface_params(), + m_iface_config, &iface); + EXPECT_EQ(UCS_ERR_INVALID_PARAM, status); + EXPECT_EQ(NULL, iface); + } 
+} + #if UCT_UD_EP_DEBUG_HOOKS /* no traffic - no ticks */ -UCS_TEST_P(test_ud_slow_timer, tick1) { +UCS_TEST_P(test_ud_timer, tick1) { connect(); tick_count = 0; ep(m_e1)->timer_hook = tick_counter; @@ -129,10 +151,8 @@ UCS_TEST_P(test_ud_slow_timer, tick1) { } /* ticks while tx window is not empty */ -UCS_TEST_P(test_ud_slow_timer, tick2) { - - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - +UCS_TEST_SKIP_COND_P(test_ud_timer, tick2, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { connect(); tick_count = 0; ep(m_e1)->timer_hook = tick_counter; @@ -143,9 +163,8 @@ UCS_TEST_P(test_ud_slow_timer, tick2) { /* retransmit one packet */ -UCS_TEST_P(test_ud_slow_timer, retransmit1) { - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - +UCS_TEST_SKIP_COND_P(test_ud_timer, retransmit1, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { connect(); ep(m_e2)->rx.rx_hook = drop_packet; EXPECT_UCS_OK(tx(m_e1)); @@ -159,11 +178,10 @@ UCS_TEST_P(test_ud_slow_timer, retransmit1) { } /* retransmit many packets */ -UCS_TEST_P(test_ud_slow_timer, retransmitn) { +UCS_TEST_SKIP_COND_P(test_ud_timer, retransmitn, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { - unsigned i, N=42; - - check_caps(UCT_IFACE_FLAG_PUT_SHORT); + unsigned i, N = 42; connect(); set_tx_win(m_e1, 1024); @@ -181,13 +199,12 @@ UCS_TEST_P(test_ud_slow_timer, retransmitn) { } -UCS_TEST_P(test_ud_slow_timer, partial_drop) { +UCS_TEST_SKIP_COND_P(test_ud_timer, partial_drop, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { - unsigned i, N=24; + unsigned i, N = 24; int orig_avail; - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - connect(); set_tx_win(m_e1, 1024); packet_count = 0; @@ -215,6 +232,4 @@ UCS_TEST_P(test_ud_slow_timer, partial_drop) { } #endif -_UCT_INSTANTIATE_TEST_CASE(test_ud_slow_timer, ud) -_UCT_INSTANTIATE_TEST_CASE(test_ud_slow_timer, ud_mlx5) - +UCT_INSTANTIATE_UD_TEST_CASE(test_ud_timer) diff --git a/test/gtest/uct/ib/ud_base.cc b/test/gtest/uct/ib/ud_base.cc index e67f48bb0d2..b919bc130c6 100644 --- a/test/gtest/uct/ib/ud_base.cc +++ 
b/test/gtest/uct/ib/ud_base.cc @@ -9,6 +9,8 @@ void ud_base_test::init() m_e1 = uct_test::create_entity(0); m_entities.push_back(m_e1); + check_skip_test(); + m_e2 = uct_test::create_entity(0); m_entities.push_back(m_e2); } @@ -85,7 +87,5 @@ void ud_base_test::set_tx_win(entity *e, uct_ud_psn_t size) void ud_base_test::disable_async(entity *e) { - ucs_async_remove_handler(iface(e)->async.timer_id, 1); + iface(e)->async.disable = 1; } - - diff --git a/test/gtest/uct/ib/ud_base.h b/test/gtest/uct/ib/ud_base.h index a898cafaaf7..a6e00a2fc9c 100644 --- a/test/gtest/uct/ib/ud_base.h +++ b/test/gtest/uct/ib/ud_base.h @@ -10,8 +10,10 @@ #include #include +extern "C" { #include #include +} #define TEST_UD_PROGRESS_TIMEOUT 300.0 @@ -49,4 +51,10 @@ class ud_base_test : public uct_test { uint64_t m_dummy; }; + +#define UCT_INSTANTIATE_UD_TEST_CASE(_test_case) \ + _UCT_INSTANTIATE_TEST_CASE(_test_case, ud_verbs) \ + _UCT_INSTANTIATE_TEST_CASE(_test_case, ud_mlx5) + + #endif diff --git a/test/gtest/uct/tcp/test_tcp.cc b/test/gtest/uct/tcp/test_tcp.cc new file mode 100644 index 00000000000..f5560ea1c72 --- /dev/null +++ b/test/gtest/uct/tcp/test_tcp.cc @@ -0,0 +1,257 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#include +#include + +extern "C" { +#include +#include +} + +class test_uct_tcp : public uct_test { +public: + void init() { + if (RUNNING_ON_VALGRIND) { + modify_config("TX_SEG_SIZE", "1kb"); + modify_config("RX_SEG_SIZE", "1kb"); + } + + uct_test::init(); + m_ent = uct_test::create_entity(0); + m_entities.push_back(m_ent); + m_tcp_iface = (uct_tcp_iface*)m_ent->iface(); + } + + size_t get_accepted_conn_num(entity& ent) { + size_t num = 0; + uct_tcp_ep_t *ep; + + ucs_list_for_each(ep, &m_tcp_iface->ep_list, list) { + num += (ep->conn_state == UCT_TCP_EP_CONN_STATE_RECV_MAGIC_NUMBER); + } + + return num; + } + + ucs_status_t post_recv(int fd, bool nb = false) { + uint8_t msg; + size_t msg_size = sizeof(msg); + ucs_status_t status; + + scoped_log_handler slh(wrap_errors_logger); + if (nb) { + status = ucs_socket_recv_nb(fd, &msg, &msg_size, NULL, NULL); + } else { + status = ucs_socket_recv(fd, &msg, msg_size, NULL, NULL); + } + + return status; + } + + void post_send(int fd, const std::vector &buf) { + scoped_log_handler slh(wrap_errors_logger); + ucs_status_t status = ucs_socket_send(fd, &buf[0], + buf.size(), NULL, NULL); + // send can be OK or fail when a connection was closed by a peer + // before all data were sent + ASSERT_TRUE((status == UCS_OK) || + (status == UCS_ERR_IO_ERROR)); + } + + void detect_conn_reset(int fd) { + // Try to receive something on this socket fd - it has to be failed + ucs_status_t status = post_recv(fd); + ASSERT_TRUE(status == UCS_ERR_CONNECTION_RESET); + EXPECT_EQ(0, ucs_socket_is_connected(fd)); + } + + void test_listener_flood(entity& test_entity, size_t max_conn, + size_t msg_size) { + std::vector fds; + std::vector buf; + + if (msg_size > 0) { + buf.resize(msg_size + sizeof(uct_tcp_am_hdr_t)); + std::fill(buf.begin(), buf.end(), 0); + init_data(&buf[0], buf.size()); + } + + setup_conns_to_entity(test_entity, max_conn, fds); + + size_t handled = 0; + for (std::vector::const_iterator iter = fds.begin(); + iter != 
fds.end(); ++iter) { + size_t sent_length = 0; + do { + if (msg_size > 0) { + post_send(*iter, buf); + sent_length += buf.size(); + } else { + close(*iter); + } + + // If it was sent >= the length of the magic number or sending + // is not required by the current test, wait until connection + // is destroyed. Otherwise, need to send more data + if ((msg_size == 0) || (sent_length >= sizeof(uint64_t))) { + handled++; + + while (get_accepted_conn_num(test_entity) != (max_conn - handled)) { + sched_yield(); + progress(); + } + } else { + // Peers still have to be connected + ucs_status_t status = post_recv(*iter, true); + EXPECT_TRUE((status == UCS_OK) || + (status == UCS_ERR_NO_PROGRESS)); + EXPECT_EQ(1, ucs_socket_is_connected(*iter)); + } + } while ((msg_size != 0) && (sent_length < sizeof(uint64_t))); + } + + // give a chance to close all connections + while (!ucs_list_is_empty(&m_tcp_iface->ep_list)) { + sched_yield(); + progress(); + } + + // TCP has to reject all connections and forget EPs that were + // created after accept(): + // - EP list has to be empty + EXPECT_EQ(1, ucs_list_is_empty(&m_tcp_iface->ep_list)); + // - all connections have to be destroyed (if wasn't closed + // yet by the clients) + if (msg_size > 0) { + // if we sent data during the test, close socket fd here + while (!fds.empty()) { + int fd = fds.back(); + fds.pop_back(); + detect_conn_reset(fd); + close(fd); + } + } + } + + void setup_conns_to_entity(entity& to, size_t max_conn, + std::vector &fds) { + for (size_t i = 0; i < max_conn; i++) { + int fd = setup_conn_to_entity(to, i + 1lu); + fds.push_back(fd); + + // give a chance to finish all connections + while (get_accepted_conn_num(to) != (i + 1lu)) { + sched_yield(); + progress(); + } + + EXPECT_EQ(1, ucs_socket_is_connected(fd)); + } + } + +private: + void init_data(void *buf, size_t msg_size) { + uct_tcp_am_hdr_t *tcp_am_hdr; + ASSERT_TRUE(msg_size >= sizeof(*tcp_am_hdr)); + tcp_am_hdr = static_cast(buf); + tcp_am_hdr->am_id = 
std::numeric_limits::max(); + tcp_am_hdr->length = msg_size; + } + + int connect_to_entity(entity& to) { + uct_device_addr_t *dev_addr; + uct_iface_addr_t *iface_addr; + ucs_status_t status; + + dev_addr = (uct_device_addr_t*)malloc(to.iface_attr().device_addr_len); + iface_addr = (uct_iface_addr_t*)malloc(to.iface_attr().iface_addr_len); + + status = uct_iface_get_device_address(to.iface(), dev_addr); + ASSERT_UCS_OK(status); + + status = uct_iface_get_address(to.iface(), iface_addr); + ASSERT_UCS_OK(status); + + struct sockaddr_in dest_addr; + dest_addr.sin_family = AF_INET; + dest_addr.sin_port = *(in_port_t*)iface_addr; + dest_addr.sin_addr = *(struct in_addr*)dev_addr; + + int fd; + status = ucs_socket_create(AF_INET, SOCK_STREAM, &fd); + ASSERT_UCS_OK(status); + + status = ucs_socket_connect(fd, (const struct sockaddr*)&dest_addr); + ASSERT_UCS_OK(status); + + status = ucs_sys_fcntl_modfl(fd, O_NONBLOCK, 0); + ASSERT_UCS_OK(status); + + free(iface_addr); + free(dev_addr); + + return fd; + } + + int setup_conn_to_entity(entity &to, size_t sn = 1) { + int fd = -1; + + do { + if (fd != -1) { + close(fd); + } + + fd = connect_to_entity(to); + EXPECT_NE(-1, fd); + + // give a chance to finish the connection + while (get_accepted_conn_num(to) != sn) { + sched_yield(); + progress(); + + ucs_status_t status = post_recv(fd, true); + if ((status != UCS_OK) && + (status != UCS_ERR_NO_PROGRESS)) { + break; + } + } + } while (!ucs_socket_is_connected(fd)); + + EXPECT_EQ(1, ucs_socket_is_connected(fd)); + + return fd; + } + +protected: + uct_tcp_iface *m_tcp_iface; + entity *m_ent; +}; + +UCS_TEST_P(test_uct_tcp, listener_flood_connect_and_send_large) { + const size_t max_conn = + ucs_min(static_cast(max_connections()), 128lu) / + ucs::test_time_multiplier(); + const size_t msg_size = m_tcp_iface->config.rx_seg_size * 4; + test_listener_flood(*m_ent, max_conn, msg_size); +} + +UCS_TEST_P(test_uct_tcp, listener_flood_connect_and_send_small) { + const size_t max_conn = + 
ucs_min(static_cast(max_connections()), 128lu) / + ucs::test_time_multiplier(); + // It should be less than length of the expected magic number by TCP + const size_t msg_size = 1; + test_listener_flood(*m_ent, max_conn, msg_size); +} + +UCS_TEST_P(test_uct_tcp, listener_flood_connect_and_close) { + const size_t max_conn = + ucs_min(static_cast(max_connections()), 128lu) / + ucs::test_time_multiplier(); + test_listener_flood(*m_ent, max_conn, 0); +} + +_UCT_INSTANTIATE_TEST_CASE(test_uct_tcp, tcp) diff --git a/test/gtest/uct/test_amo.cc b/test/gtest/uct/test_amo.cc index b898610d26f..5dce707cd12 100644 --- a/test/gtest/uct/test_amo.cc +++ b/test/gtest/uct/test_amo.cc @@ -21,6 +21,8 @@ void uct_amo_test::init() { entity *receiver = uct_test::create_entity(0); m_entities.push_back(receiver); + check_skip_test(); + for (unsigned i = 0; i < num_senders(); ++i) { entity *sender = uct_test::create_entity(0); m_entities.push_back(sender); diff --git a/test/gtest/uct/test_amo_add_xor.cc b/test/gtest/uct/test_amo_add_xor.cc index f89477b1915..f09ab0dfb86 100644 --- a/test/gtest/uct/test_amo_add_xor.cc +++ b/test/gtest/uct/test_amo_add_xor.cc @@ -42,23 +42,23 @@ class uct_amo_add_xor_test : public uct_amo_test { } }; -UCS_TEST_P(uct_amo_add_xor_test, add32) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), OP32); +UCS_TEST_SKIP_COND_P(uct_amo_add_xor_test, add32, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), OP32)) { test_op(add_op); } -UCS_TEST_P(uct_amo_add_xor_test, add64) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), OP64); +UCS_TEST_SKIP_COND_P(uct_amo_add_xor_test, add64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), OP64)) { test_op(add_op); } -UCS_TEST_P(uct_amo_add_xor_test, xor32) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), OP32); +UCS_TEST_SKIP_COND_P(uct_amo_add_xor_test, xor32, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), OP32)) { test_op(xor_op); } -UCS_TEST_P(uct_amo_add_xor_test, xor64) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), OP64); 
+UCS_TEST_SKIP_COND_P(uct_amo_add_xor_test, xor64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), OP64)) { test_op(xor_op); } diff --git a/test/gtest/uct/test_amo_and_or.cc b/test/gtest/uct/test_amo_and_or.cc index 1cf0edbe630..f410d1b4644 100644 --- a/test/gtest/uct/test_amo_and_or.cc +++ b/test/gtest/uct/test_amo_and_or.cc @@ -19,8 +19,6 @@ class uct_amo_and_or_test : public uct_amo_test { * for every worker to eliminate result to 0 or MAX_INT */ - check_atomics(UCS_BIT(opcode), sizeof(T) == sizeof(uint64_t) ? OP64 : OP32); - mapped_buffer recvbuf(sizeof(T), 0, receiver()); T value = 0x0ff0f00f; @@ -45,19 +43,23 @@ class uct_amo_and_or_test : public uct_amo_test { } }; -UCS_TEST_P(uct_amo_and_or_test, and32) { +UCS_TEST_SKIP_COND_P(uct_amo_and_or_test, and32, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), OP32)) { test_op(and_op, and_val); } -UCS_TEST_P(uct_amo_and_or_test, add64) { +UCS_TEST_SKIP_COND_P(uct_amo_and_or_test, add64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), OP64)) { test_op(and_op, and_val); } -UCS_TEST_P(uct_amo_and_or_test, or32) { +UCS_TEST_SKIP_COND_P(uct_amo_and_or_test, or32, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), OP32)) { test_op(or_op, or_val); } -UCS_TEST_P(uct_amo_and_or_test, or64) { +UCS_TEST_SKIP_COND_P(uct_amo_and_or_test, or64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), OP64)) { test_op(or_op, or_val); } diff --git a/test/gtest/uct/test_amo_cswap.cc b/test/gtest/uct/test_amo_cswap.cc index 5bb7f8edec2..cb7df33b07c 100644 --- a/test/gtest/uct/test_amo_cswap.cc +++ b/test/gtest/uct/test_amo_cswap.cc @@ -94,13 +94,13 @@ class uct_amo_cswap_test : public uct_amo_test { }; -UCS_TEST_P(uct_amo_cswap_test, cswap32) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_CSWAP), FOP32); +UCS_TEST_SKIP_COND_P(uct_amo_cswap_test, cswap32, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_CSWAP), FOP32)) { test_cswap(static_cast(&uct_amo_cswap_test::cswap32)); } -UCS_TEST_P(uct_amo_cswap_test, cswap64) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_CSWAP), FOP64); 
+UCS_TEST_SKIP_COND_P(uct_amo_cswap_test, cswap64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_CSWAP), FOP64)) { test_cswap(static_cast(&uct_amo_cswap_test::cswap64)); } diff --git a/test/gtest/uct/test_amo_fadd_fxor.cc b/test/gtest/uct/test_amo_fadd_fxor.cc index b4ebfb518b9..86b6143fe87 100644 --- a/test/gtest/uct/test_amo_fadd_fxor.cc +++ b/test/gtest/uct/test_amo_fadd_fxor.cc @@ -40,23 +40,23 @@ class uct_amo_fadd_fxor_test : public uct_amo_test { } }; -UCS_TEST_P(uct_amo_fadd_fxor_test, fadd32) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP32); +UCS_TEST_SKIP_COND_P(uct_amo_fadd_fxor_test, fadd32, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP32)) { test_fop(add_op); } -UCS_TEST_P(uct_amo_fadd_fxor_test, fadd64) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP64); +UCS_TEST_SKIP_COND_P(uct_amo_fadd_fxor_test, fadd64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP64)) { test_fop(add_op); } -UCS_TEST_P(uct_amo_fadd_fxor_test, fxor32) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), FOP32); +UCS_TEST_SKIP_COND_P(uct_amo_fadd_fxor_test, fxor32, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), FOP32)) { test_fop(xor_op); } -UCS_TEST_P(uct_amo_fadd_fxor_test, fxor64) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), FOP64); +UCS_TEST_SKIP_COND_P(uct_amo_fadd_fxor_test, fxor64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), FOP64)) { test_fop(xor_op); } @@ -64,33 +64,39 @@ UCT_INSTANTIATE_TEST_CASE(uct_amo_fadd_fxor_test) class uct_amo_fadd_fxor_test_inlresp : public uct_amo_fadd_fxor_test {}; -UCS_TEST_P(uct_amo_fadd_fxor_test_inlresp, fadd64_inlresp0, "IB_TX_INLINE_RESP=0") { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP64); +UCS_TEST_SKIP_COND_P(uct_amo_fadd_fxor_test_inlresp, fadd64_inlresp0, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP64), + "IB_TX_INLINE_RESP=0") { test_fop(add_op); } -UCS_TEST_P(uct_amo_fadd_fxor_test_inlresp, fadd64_inlresp32, "IB_TX_INLINE_RESP=32") { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP64); 
+UCS_TEST_SKIP_COND_P(uct_amo_fadd_fxor_test_inlresp, fadd64_inlresp32, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP64), + "IB_TX_INLINE_RESP=32") { test_fop(add_op); } -UCS_TEST_P(uct_amo_fadd_fxor_test_inlresp, fadd64_inlresp64, "IB_TX_INLINE_RESP=64") { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP64); +UCS_TEST_SKIP_COND_P(uct_amo_fadd_fxor_test_inlresp, fadd64_inlresp64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP64), + "IB_TX_INLINE_RESP=64") { test_fop(add_op); } -UCS_TEST_P(uct_amo_fadd_fxor_test_inlresp, fxor64_inlresp0, "IB_TX_INLINE_RESP=0") { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), FOP64); +UCS_TEST_SKIP_COND_P(uct_amo_fadd_fxor_test_inlresp, fxor64_inlresp0, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), FOP64), + "IB_TX_INLINE_RESP=0") { test_fop(xor_op); } -UCS_TEST_P(uct_amo_fadd_fxor_test_inlresp, fxor64_inlresp32, "IB_TX_INLINE_RESP=32") { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), FOP64); +UCS_TEST_SKIP_COND_P(uct_amo_fadd_fxor_test_inlresp, fxor64_inlresp32, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), FOP64), + "IB_TX_INLINE_RESP=32") { test_fop(xor_op); } -UCS_TEST_P(uct_amo_fadd_fxor_test_inlresp, fxor64_inlresp64, "IB_TX_INLINE_RESP=64") { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), FOP64); +UCS_TEST_SKIP_COND_P(uct_amo_fadd_fxor_test_inlresp, fxor64_inlresp64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), FOP64), + "IB_TX_INLINE_RESP=64") { test_fop(xor_op); } diff --git a/test/gtest/uct/test_amo_fand_for.cc b/test/gtest/uct/test_amo_fand_for.cc index 4c8dc2a2133..b4778e19b50 100644 --- a/test/gtest/uct/test_amo_fand_for.cc +++ b/test/gtest/uct/test_amo_fand_for.cc @@ -18,8 +18,6 @@ class uct_amo_fand_for_test : public uct_amo_test { * and the final value of atomic variable is the and/or of all. */ - check_atomics(UCS_BIT(opcode), sizeof(T) == sizeof(uint64_t) ? 
FOP64 : FOP32); - mapped_buffer recvbuf(sizeof(T), 0, receiver()); T value = rand64(); @@ -42,19 +40,23 @@ class uct_amo_fand_for_test : public uct_amo_test { } }; -UCS_TEST_P(uct_amo_fand_for_test, fand32) { +UCS_TEST_SKIP_COND_P(uct_amo_fand_for_test, fand32, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), FOP32)) { test_fop(and_op); } -UCS_TEST_P(uct_amo_fand_for_test, fand64) { +UCS_TEST_SKIP_COND_P(uct_amo_fand_for_test, fand64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), FOP64)) { test_fop(and_op); } -UCS_TEST_P(uct_amo_fand_for_test, for32) { +UCS_TEST_SKIP_COND_P(uct_amo_fand_for_test, for32, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), FOP32)) { test_fop(or_op); } -UCS_TEST_P(uct_amo_fand_for_test, for64) { +UCS_TEST_SKIP_COND_P(uct_amo_fand_for_test, for64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), FOP64)) { test_fop(or_op); } @@ -62,27 +64,39 @@ UCT_INSTANTIATE_TEST_CASE(uct_amo_fand_for_test) class uct_amo_fand_for_test_inlresp : public uct_amo_fand_for_test {}; -UCS_TEST_P(uct_amo_fand_for_test_inlresp, fand64_inlresp0, "IB_TX_INLINE_RESP=0") { +UCS_TEST_SKIP_COND_P(uct_amo_fand_for_test_inlresp, fand64_inlresp0, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), FOP64), + "IB_TX_INLINE_RESP=0") { test_fop(and_op); } -UCS_TEST_P(uct_amo_fand_for_test_inlresp, fand64_inlresp32, "IB_TX_INLINE_RESP=32") { +UCS_TEST_SKIP_COND_P(uct_amo_fand_for_test_inlresp, fand64_inlresp32, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), FOP64), + "IB_TX_INLINE_RESP=32") { test_fop(and_op); } -UCS_TEST_P(uct_amo_fand_for_test_inlresp, fand64_inlresp64, "IB_TX_INLINE_RESP=64") { +UCS_TEST_SKIP_COND_P(uct_amo_fand_for_test_inlresp, fand64_inlresp64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), FOP64), + "IB_TX_INLINE_RESP=64") { test_fop(and_op); } -UCS_TEST_P(uct_amo_fand_for_test_inlresp, for64_inlresp0, "IB_TX_INLINE_RESP=0") { +UCS_TEST_SKIP_COND_P(uct_amo_fand_for_test_inlresp, for64_inlresp0, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), FOP64), + "IB_TX_INLINE_RESP=0") { 
test_fop(or_op); } -UCS_TEST_P(uct_amo_fand_for_test_inlresp, for64_inlresp32, "IB_TX_INLINE_RESP=32") { +UCS_TEST_SKIP_COND_P(uct_amo_fand_for_test_inlresp, for64_inlresp32, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), FOP64), + "IB_TX_INLINE_RESP=32") { test_fop(or_op); } -UCS_TEST_P(uct_amo_fand_for_test_inlresp, for64_inlresp64, "IB_TX_INLINE_RESP=64") { +UCS_TEST_SKIP_COND_P(uct_amo_fand_for_test_inlresp, for64_inlresp64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), FOP64), + "IB_TX_INLINE_RESP=64") { test_fop(or_op); } diff --git a/test/gtest/uct/test_amo_swap.cc b/test/gtest/uct/test_amo_swap.cc index 10f23341dc1..7367dc8a382 100644 --- a/test/gtest/uct/test_amo_swap.cc +++ b/test/gtest/uct/test_amo_swap.cc @@ -64,13 +64,13 @@ class uct_amo_swap_test : public uct_amo_test { }; -UCS_TEST_P(uct_amo_swap_test, swap32) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_SWAP), FOP32); +UCS_TEST_SKIP_COND_P(uct_amo_swap_test, swap32, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_SWAP), FOP32)) { test_swap(static_cast(&uct_amo_swap_test::swap32)); } -UCS_TEST_P(uct_amo_swap_test, swap64) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_SWAP), FOP64); +UCS_TEST_SKIP_COND_P(uct_amo_swap_test, swap64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_SWAP), FOP64)) { test_swap(static_cast(&uct_amo_swap_test::swap64)); } @@ -78,18 +78,21 @@ UCT_INSTANTIATE_TEST_CASE(uct_amo_swap_test) class uct_amo_swap_test_inlresp : public uct_amo_swap_test {}; -UCS_TEST_P(uct_amo_swap_test_inlresp, swap32_inlresp0, "IB_TX_INLINE_RESP=0") { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_SWAP), FOP32); +UCS_TEST_SKIP_COND_P(uct_amo_swap_test_inlresp, swap32_inlresp0, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_SWAP), FOP32), + "IB_TX_INLINE_RESP=0") { test_swap(static_cast(&uct_amo_swap_test::swap32)); } -UCS_TEST_P(uct_amo_swap_test_inlresp, swap32_inlresp32, "IB_TX_INLINE_RESP=32") { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_SWAP), FOP32); +UCS_TEST_SKIP_COND_P(uct_amo_swap_test_inlresp, swap32_inlresp32, + 
!check_atomics(UCS_BIT(UCT_ATOMIC_OP_SWAP), FOP32), + "IB_TX_INLINE_RESP=32") { test_swap(static_cast(&uct_amo_swap_test::swap32)); } -UCS_TEST_P(uct_amo_swap_test_inlresp, swap32_inlresp64, "IB_TX_INLINE_RESP=64") { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_SWAP), FOP32); +UCS_TEST_SKIP_COND_P(uct_amo_swap_test_inlresp, swap32_inlresp64, + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_SWAP), FOP32), + "IB_TX_INLINE_RESP=64") { test_swap(static_cast(&uct_amo_swap_test::swap32)); } diff --git a/test/gtest/uct/test_event.cc b/test/gtest/uct/test_event.cc index aa0d169b5e6..96d9e4b196c 100644 --- a/test/gtest/uct/test_event.cc +++ b/test/gtest/uct/test_event.cc @@ -6,27 +6,31 @@ */ extern "C" { -#include -#include #include } #include #include "uct_test.h" -class test_uct_event_fd : public uct_test { +class test_uct_event : public uct_test { public: - void initialize() { + void init() { uct_test::init(); m_e1 = uct_test::create_entity(0); m_entities.push_back(m_e1); - m_e2 = uct_test::create_entity(0); + m_e2 = uct_test::create_entity(0, NULL, NULL, NULL, NULL, NULL, + async_event_handler, this); m_entities.push_back(m_e2); + check_skip_test(); + m_e1->connect(0, *m_e2, 0); m_e2->connect(0, *m_e1, 0); + /* give a chance to finish connection for some transports (ib/ud, tcp) */ + flush(); + m_am_count = 0; } @@ -35,6 +39,11 @@ class test_uct_event_fd : public uct_test { /* data follows */ } recv_desc_t; + static void async_event_handler(void *arg, unsigned flags) { + test_uct_event *self = static_cast(arg); + self->m_async_event_ctx.signal(); + } + static ucs_status_t am_handler(void *arg, void *data, size_t length, unsigned flags) { recv_desc_t *my_desc = (recv_desc_t *) arg; @@ -51,11 +60,21 @@ class test_uct_event_fd : public uct_test { return UCS_OK; } - void cleanup() { - uct_test::cleanup(); + void send_am_data(unsigned send_flags, int &am_send_count) { + ssize_t res; + + m_send_data = 0xdeadbeef; + do { + res = uct_ep_am_bcopy(m_e1->ep(0), 0, pack_u64, + &m_send_data, 
send_flags); + m_e1->progress(); + } while (res == UCS_ERR_NO_RESOURCE); + ASSERT_EQ((ssize_t)sizeof(m_send_data), res); + + ++am_send_count; } - void test_recv_am(bool signaled); + void test_recv_am(unsigned arm_flags, unsigned send_flags); static size_t pack_u64(void *dest, void *arg) { @@ -79,61 +98,34 @@ class test_uct_event_fd : public uct_test { protected: entity *m_e1, *m_e2; static int m_am_count; + uct_test::async_event_ctx m_async_event_ctx; + uint64_t m_send_data; }; -int test_uct_event_fd::m_am_count = 0; +int test_uct_event::m_am_count = 0; -void test_uct_event_fd::test_recv_am(bool signaled) +void test_uct_event::test_recv_am(unsigned arm_flags, unsigned send_flags) { - uint64_t send_data = 0xdeadbeef; int am_send_count = 0; - ssize_t res; recv_desc_t *recv_buffer; - struct pollfd wakeup_fd; ucs_status_t status; - unsigned send_flags; - unsigned arm_flags; - - initialize(); - if (signaled) { - check_caps(UCT_IFACE_FLAG_EVENT_RECV_SIG | UCT_IFACE_FLAG_CB_SYNC | - UCT_IFACE_FLAG_AM_BCOPY); - arm_flags = UCT_EVENT_RECV_SIG; - send_flags = UCT_SEND_FLAG_SIGNALED; - } else { - check_caps(UCT_IFACE_FLAG_EVENT_RECV | UCT_IFACE_FLAG_CB_SYNC | - UCT_IFACE_FLAG_AM_BCOPY); - arm_flags = UCT_EVENT_RECV; - send_flags = 0; - } - recv_buffer = (recv_desc_t *) malloc(sizeof(*recv_buffer) + sizeof(send_data)); + recv_buffer = (recv_desc_t *)malloc(sizeof(*recv_buffer) + + sizeof(m_send_data)); recv_buffer->length = 0; /* Initialize length to 0 */ - /* give a chance to finish connection for some transports (ib/ud, tcp) */ - flush(); - /* set a callback for the uct to invoke for receiving the data */ uct_iface_set_am_handler(m_e2->iface(), 0, am_handler, recv_buffer, 0); - - /* create receiver wakeup */ - status = uct_iface_event_fd_get(m_e2->iface(), &wakeup_fd.fd); - ASSERT_EQ(UCS_OK, status); - - wakeup_fd.events = POLLIN; - EXPECT_EQ(0, poll(&wakeup_fd, 1, 0)); + EXPECT_FALSE(m_async_event_ctx.wait_for_event(*m_e2, 0)); arm(m_e2, arm_flags); - - EXPECT_EQ(0, 
poll(&wakeup_fd, 1, 0)); + EXPECT_FALSE(m_async_event_ctx.wait_for_event(*m_e2, 0)); /* send the data */ - res = uct_ep_am_bcopy(m_e1->ep(0), 0, pack_u64, &send_data, send_flags); - ASSERT_EQ((ssize_t)sizeof(send_data), res); - ++am_send_count; - - /* make sure the file descriptor IS signaled ONCE */ - ASSERT_EQ(1, poll(&wakeup_fd, 1, 1000*ucs::test_time_multiplier())); + send_am_data(send_flags, am_send_count); + EXPECT_TRUE(m_async_event_ctx.wait_for_event(*m_e2, + 1000 * + ucs::test_time_multiplier())); for (;;) { if ((progress() == 0) && (m_am_count == am_send_count)) { @@ -148,11 +140,10 @@ void test_uct_event_fd::test_recv_am(bool signaled) arm(m_e2, arm_flags); /* send the data again */ - uct_ep_am_bcopy(m_e1->ep(0), 0, pack_u64, &send_data, send_flags); - ++am_send_count; - - /* make sure the file descriptor IS signaled */ - ASSERT_EQ(1, poll(&wakeup_fd, 1, 1000*ucs::test_time_multiplier())); + send_am_data(send_flags, am_send_count); + EXPECT_TRUE(m_async_event_ctx.wait_for_event(*m_e2, + 1000 * + ucs::test_time_multiplier())); while (m_am_count < am_send_count) { progress(); @@ -163,14 +154,20 @@ void test_uct_event_fd::test_recv_am(bool signaled) free(recv_buffer); } -UCS_TEST_P(test_uct_event_fd, am) +UCS_TEST_SKIP_COND_P(test_uct_event, am, + !check_caps(UCT_IFACE_FLAG_CB_SYNC | + UCT_IFACE_FLAG_AM_BCOPY) || + !check_event_caps(UCT_IFACE_FLAG_EVENT_RECV)) { - test_recv_am(false); + test_recv_am(UCT_EVENT_RECV, 0); } -UCS_TEST_P(test_uct_event_fd, sig_am) +UCS_TEST_SKIP_COND_P(test_uct_event, sig_am, + !check_caps(UCT_IFACE_FLAG_CB_SYNC | + UCT_IFACE_FLAG_AM_BCOPY) || + !check_event_caps(UCT_IFACE_FLAG_EVENT_RECV_SIG)) { - test_recv_am(true); + test_recv_am(UCT_EVENT_RECV_SIG, UCT_SEND_FLAG_SIGNALED); } -UCT_INSTANTIATE_NO_SELF_TEST_CASE(test_uct_event_fd); +UCT_INSTANTIATE_NO_SELF_TEST_CASE(test_uct_event); diff --git a/test/gtest/uct/test_fence.cc b/test/gtest/uct/test_fence.cc index b568cb18673..8644a7281a6 100644 --- a/test/gtest/uct/test_fence.cc 
+++ b/test/gtest/uct/test_fence.cc @@ -33,6 +33,8 @@ class uct_fence_test : public uct_test { entity *receiver = uct_test::create_entity(0); m_entities.push_back(receiver); + check_skip_test(); + entity *sender = uct_test::create_entity(0); m_entities.push_back(sender); @@ -195,51 +197,51 @@ class uct_fence_test : public uct_test { } }; -UCS_TEST_P(uct_fence_test, add32) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), OP32); - check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP32); +UCS_TEST_SKIP_COND_P(uct_fence_test, add32, + (!check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), OP32) || + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP32))) { test_fence(); } -UCS_TEST_P(uct_fence_test, add64) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), OP64); - check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP64); +UCS_TEST_SKIP_COND_P(uct_fence_test, add64, + (!check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), OP64) || + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_ADD), FOP64))) { test_fence(); } -UCS_TEST_P(uct_fence_test, and32) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), OP32); - check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), FOP32); +UCS_TEST_SKIP_COND_P(uct_fence_test, and32, + (!check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), OP32) || + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), FOP32))) { test_fence(); } -UCS_TEST_P(uct_fence_test, and64) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), OP64); - check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), FOP64); +UCS_TEST_SKIP_COND_P(uct_fence_test, and64, + (!check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), OP64) || + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_AND), FOP64))) { test_fence(); } -UCS_TEST_P(uct_fence_test, or32) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), OP32); - check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), FOP32); +UCS_TEST_SKIP_COND_P(uct_fence_test, or32, + (!check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), OP32) || + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), FOP32))) { test_fence(); } -UCS_TEST_P(uct_fence_test, or64) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), OP64); - 
check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), FOP64); +UCS_TEST_SKIP_COND_P(uct_fence_test, or64, + (!check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), OP64) || + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_OR), FOP64))) { test_fence(); } -UCS_TEST_P(uct_fence_test, xor32) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), OP32); - check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), FOP32); +UCS_TEST_SKIP_COND_P(uct_fence_test, xor32, + (!check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), OP32) || + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), FOP32))) { test_fence(); } -UCS_TEST_P(uct_fence_test, xor64) { - check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), OP64); - check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), FOP64); +UCS_TEST_SKIP_COND_P(uct_fence_test, xor64, + (!check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), OP64) || + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_XOR), FOP64))) { test_fence(); } diff --git a/test/gtest/uct/test_flush.cc b/test/gtest/uct/test_flush.cc index 31d5fcbad3a..de7c85cad07 100644 --- a/test/gtest/uct/test_flush.cc +++ b/test/gtest/uct/test_flush.cc @@ -28,15 +28,15 @@ class uct_flush_test : public uct_test { void init() { uct_test::init(); - if (UCT_DEVICE_TYPE_SELF == GetParam()->dev_type) { - entity *e = uct_test::create_entity(0); - m_entities.push_back(e); - e->connect(0, *e, 0); - } else { - entity *m_sender = uct_test::create_entity(0); - m_entities.push_back(m_sender); + entity *m_sender = uct_test::create_entity(0); + m_entities.push_back(m_sender); + + check_skip_test(); + if (UCT_DEVICE_TYPE_SELF == GetParam()->dev_type) { + m_sender->connect(0, *m_sender, 0); + } else { entity *m_receiver = uct_test::create_entity(0); m_entities.push_back(m_receiver); @@ -142,7 +142,6 @@ class uct_flush_test : public uct_test { void test_flush_put_bcopy(flush_func_t flush) { const size_t length = 8; - check_caps(UCT_IFACE_FLAG_PUT_BCOPY); mapped_buffer sendbuf(length, SEED1, sender()); mapped_buffer recvbuf(length, SEED2, receiver()); sendbuf.pattern_fill(SEED3); @@ -165,7 +164,9 @@ class uct_flush_test : public 
uct_test { void test_flush_am_zcopy(flush_func_t flush, bool destroy_ep) { const size_t length = 8; - check_caps(UCT_IFACE_FLAG_AM_ZCOPY); + if (is_flush_cancel()) { + ASSERT_TRUE(destroy_ep); + } mapped_buffer sendbuf(length, SEED1, sender()); mapped_buffer recvbuf(length, SEED2, receiver()); sendbuf.pattern_fill(SEED3); @@ -185,6 +186,7 @@ class uct_flush_test : public uct_test { do { status = uct_ep_am_zcopy(sender().ep(0), get_am_id(), NULL, 0, iov, iovcnt, 0, &zcomp); + progress(); } while (status == UCS_ERR_NO_RESOURCE); ASSERT_UCS_OK_OR_INPROGRESS(status); if (status == UCS_OK) { @@ -213,7 +215,9 @@ class uct_flush_test : public uct_test { void test_flush_am_disconnect(flush_func_t flush, bool destroy_ep) { const size_t length = 8; - check_caps(UCT_IFACE_FLAG_AM_BCOPY); + if (is_flush_cancel()) { + ASSERT_TRUE(destroy_ep); + } mapped_buffer sendbuf(length, SEED1, sender()); mapped_buffer recvbuf(length, SEED2, receiver()); sendbuf.pattern_fill(SEED3); @@ -302,8 +306,10 @@ uint32_t uct_flush_test::am_rx_count = 0; void uct_flush_test::test_flush_am_pending(flush_func_t flush, bool destroy_ep) { + if (is_flush_cancel()) { + ASSERT_TRUE(destroy_ep); + } const size_t length = 8; - check_caps(UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_PENDING); mapped_buffer sendbuf(length, SEED1, sender()); mapped_buffer recvbuf(length, SEED2, receiver()); sendbuf.pattern_fill(SEED3); @@ -368,9 +374,6 @@ void uct_flush_test::test_flush_am_pending(flush_func_t flush, bool destroy_ep) if (status == UCS_OK) { --flush_req.comp.count; } else if (status == UCS_ERR_NO_RESOURCE) { - if (is_flush_cancel()) { - continue; - } /* If flush returned NO_RESOURCE, add to pending must succeed */ flush_req.test = this; flush_req.uct.func = flush_progress; @@ -416,143 +419,122 @@ void uct_flush_test::test_flush_am_pending(flush_func_t flush, bool destroy_ep) recvbuf.pattern_check(SEED3); } -UCS_TEST_P(uct_flush_test, put_bcopy_flush_ep_no_comp) { +UCS_TEST_SKIP_COND_P(uct_flush_test, 
put_bcopy_flush_ep_no_comp, + !check_caps(UCT_IFACE_FLAG_PUT_BCOPY)) { am_rx_count = 0; m_flush_flags = UCT_FLUSH_FLAG_LOCAL; - test_flush_put_bcopy(&uct_flush_test::flush_ep_no_comp); - - if (!is_caps_supported(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { - return; - } - am_rx_count = 0; - m_flush_flags |= UCT_FLUSH_FLAG_CANCEL; test_flush_put_bcopy(&uct_flush_test::flush_ep_no_comp); - am_rx_count = 0; - m_flush_flags &= ~UCT_FLUSH_FLAG_CANCEL; - test_flush_put_bcopy(&uct_flush_test::flush_ep_no_comp); + if (is_caps_supported(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { + am_rx_count = 0; + m_flush_flags |= UCT_FLUSH_FLAG_CANCEL; + test_flush_put_bcopy(&uct_flush_test::flush_ep_no_comp); + } } -UCS_TEST_P(uct_flush_test, put_bcopy_flush_iface_no_comp) { +UCS_TEST_SKIP_COND_P(uct_flush_test, put_bcopy_flush_iface_no_comp, + !check_caps(UCT_IFACE_FLAG_PUT_BCOPY)) { test_flush_put_bcopy(&uct_flush_test::flush_iface_no_comp); } -UCS_TEST_P(uct_flush_test, put_bcopy_flush_ep_nb) { +UCS_TEST_SKIP_COND_P(uct_flush_test, put_bcopy_flush_ep_nb, + !check_caps(UCT_IFACE_FLAG_PUT_BCOPY)) { am_rx_count = 0; m_flush_flags = UCT_FLUSH_FLAG_LOCAL; - test_flush_put_bcopy(&uct_flush_test::flush_ep_nb); - - if (!is_caps_supported(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { - return; - } - am_rx_count = 0; - m_flush_flags |= UCT_FLUSH_FLAG_CANCEL; test_flush_put_bcopy(&uct_flush_test::flush_ep_nb); - am_rx_count = 0; - m_flush_flags &= ~UCT_FLUSH_FLAG_CANCEL; - test_flush_put_bcopy(&uct_flush_test::flush_ep_nb); + if (is_caps_supported(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { + am_rx_count = 0; + m_flush_flags |= UCT_FLUSH_FLAG_CANCEL; + test_flush_put_bcopy(&uct_flush_test::flush_ep_nb); + } } -UCS_TEST_P(uct_flush_test, am_zcopy_flush_ep_no_comp) { +UCS_TEST_SKIP_COND_P(uct_flush_test, am_zcopy_flush_ep_no_comp, + !check_caps(UCT_IFACE_FLAG_AM_ZCOPY), + "UD_TIMER_TICK?=100ms") { am_rx_count = 0; m_flush_flags = UCT_FLUSH_FLAG_LOCAL; - if 
(is_caps_supported(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { - - test_flush_am_zcopy(&uct_flush_test::flush_ep_no_comp, false); + test_flush_am_zcopy(&uct_flush_test::flush_ep_no_comp, false); - am_rx_count = 0; + if (is_caps_supported(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { + am_rx_count = 0; m_flush_flags |= UCT_FLUSH_FLAG_CANCEL; - test_flush_am_zcopy(&uct_flush_test::flush_ep_no_comp, false); - - am_rx_count = 0; - m_flush_flags &= ~UCT_FLUSH_FLAG_CANCEL; + test_flush_am_zcopy(&uct_flush_test::flush_ep_no_comp, true); } - - test_flush_am_zcopy(&uct_flush_test::flush_ep_no_comp, true); } -UCS_TEST_P(uct_flush_test, am_zcopy_flush_iface_no_comp) { +UCS_TEST_SKIP_COND_P(uct_flush_test, am_zcopy_flush_iface_no_comp, + !check_caps(UCT_IFACE_FLAG_AM_ZCOPY), + "UD_TIMER_TICK?=100ms") { test_flush_am_zcopy(&uct_flush_test::flush_iface_no_comp, true); } -UCS_TEST_P(uct_flush_test, am_zcopy_flush_ep_nb) { +UCS_TEST_SKIP_COND_P(uct_flush_test, am_zcopy_flush_ep_nb, + !check_caps(UCT_IFACE_FLAG_AM_ZCOPY), + "UD_TIMER_TICK?=100ms") { am_rx_count = 0; m_flush_flags = UCT_FLUSH_FLAG_LOCAL; - if (is_caps_supported(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { - test_flush_am_zcopy(&uct_flush_test::flush_ep_nb, false); + test_flush_am_zcopy(&uct_flush_test::flush_ep_nb, false); - am_rx_count = 0; + if (is_caps_supported(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { + am_rx_count = 0; m_flush_flags |= UCT_FLUSH_FLAG_CANCEL; - test_flush_am_zcopy(&uct_flush_test::flush_ep_nb, false); - - am_rx_count = 0; - m_flush_flags &= ~UCT_FLUSH_FLAG_CANCEL; + test_flush_am_zcopy(&uct_flush_test::flush_ep_nb, true); } - - test_flush_am_zcopy(&uct_flush_test::flush_ep_nb, true); } -UCS_TEST_P(uct_flush_test, am_flush_ep_no_comp) { +UCS_TEST_SKIP_COND_P(uct_flush_test, am_flush_ep_no_comp, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY)) { am_rx_count = 0; m_flush_flags = UCT_FLUSH_FLAG_LOCAL; - if (is_caps_supported(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { - 
test_flush_am_disconnect(&uct_flush_test::flush_ep_no_comp, false); + test_flush_am_disconnect(&uct_flush_test::flush_ep_no_comp, false); - am_rx_count = 0; + if (is_caps_supported(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { + am_rx_count = 0; m_flush_flags |= UCT_FLUSH_FLAG_CANCEL; - test_flush_am_disconnect(&uct_flush_test::flush_ep_no_comp, false); - - am_rx_count = 0; - m_flush_flags &= ~UCT_FLUSH_FLAG_CANCEL; + test_flush_am_disconnect(&uct_flush_test::flush_ep_no_comp, true); } - - test_flush_am_disconnect(&uct_flush_test::flush_ep_no_comp, true); } -UCS_TEST_P(uct_flush_test, am_flush_iface_no_comp) { +UCS_TEST_SKIP_COND_P(uct_flush_test, am_flush_iface_no_comp, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY)) { m_flush_flags = UCT_FLUSH_FLAG_LOCAL; test_flush_am_disconnect(&uct_flush_test::flush_iface_no_comp, true); } -UCS_TEST_P(uct_flush_test, am_flush_ep_nb) { +UCS_TEST_SKIP_COND_P(uct_flush_test, am_flush_ep_nb, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY)) { am_rx_count = 0; m_flush_flags = UCT_FLUSH_FLAG_LOCAL; - if (is_caps_supported(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { - test_flush_am_disconnect(&uct_flush_test::flush_ep_nb, false); - am_rx_count = 0; - m_flush_flags |= UCT_FLUSH_FLAG_CANCEL; - test_flush_am_disconnect(&uct_flush_test::flush_ep_nb, false); + test_flush_am_disconnect(&uct_flush_test::flush_ep_nb, false); - am_rx_count = 0; - m_flush_flags &= ~UCT_FLUSH_FLAG_CANCEL; + if (is_caps_supported(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { + am_rx_count = 0; + m_flush_flags |= UCT_FLUSH_FLAG_CANCEL; + test_flush_am_disconnect(&uct_flush_test::flush_ep_nb, true); } - - test_flush_am_disconnect(&uct_flush_test::flush_ep_nb, true); } -UCS_TEST_P(uct_flush_test, am_pending_flush_nb) { +UCS_TEST_SKIP_COND_P(uct_flush_test, am_pending_flush_nb, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY | + UCT_IFACE_FLAG_PENDING)) { am_rx_count = 0; m_flush_flags = UCT_FLUSH_FLAG_LOCAL; - if (is_caps_supported(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { - 
test_flush_am_pending(&uct_flush_test::flush_ep_nb, false); + test_flush_am_pending(&uct_flush_test::flush_ep_nb, false); + if (is_caps_supported(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { am_rx_count = 0; m_flush_flags |= UCT_FLUSH_FLAG_CANCEL; - test_flush_am_pending(&uct_flush_test::flush_ep_nb, false); - - am_rx_count = 0; - m_flush_flags &= ~UCT_FLUSH_FLAG_CANCEL; + test_flush_am_pending(&uct_flush_test::flush_ep_nb, true); } - - test_flush_am_pending(&uct_flush_test::flush_ep_nb, false); } UCT_INSTANTIATE_TEST_CASE(uct_flush_test) diff --git a/test/gtest/uct/test_many2one_am.cc b/test/gtest/uct/test_many2one_am.cc index 4d74aa122fa..652d8e2b720 100644 --- a/test/gtest/uct/test_many2one_am.cc +++ b/test/gtest/uct/test_many2one_am.cc @@ -21,7 +21,37 @@ class test_many2one_am : public uct_test { unsigned length; } receive_desc_t; - test_many2one_am() : m_am_count(0) { + test_many2one_am() : m_am_count(0), m_receiver(NULL) { + } + + void init() { + std::string val = "16k"; + std::string tx_name, rx_name; + + if (has_ib()) { + tx_name = "IB_SEG_SIZE"; + } else if (has_transport("tcp")) { + tx_name = "TX_SEG_SIZE"; + rx_name = "RX_SEG_SIZE"; + } else if (has_transport("mm") || + has_transport("self")) { + tx_name = "SEG_SIZE"; + } + + if (!tx_name.empty()) { + modify_config(tx_name, val); + } + + if (!rx_name.empty()) { + modify_config(rx_name, val); + } + + uct_test::init(); + + m_receiver = create_entity(sizeof(receive_desc_t)); + m_entities.push_back(m_receiver); + + check_skip_test(); } static ucs_status_t am_handler(void *arg, void *data, size_t length, @@ -49,7 +79,7 @@ class test_many2one_am : public uct_test { ucs_atomic_add32(&m_am_count, 1); return (flags & UCT_CB_PARAM_FLAG_DESC) ? 
UCS_INPROGRESS : UCS_OK; } - mapped_buffer::pattern_check(data, length); + mem_buffer::pattern_check(data, length); ucs_atomic_add32(&m_am_count, 1); return UCS_OK; } @@ -58,7 +88,7 @@ class test_many2one_am : public uct_test { while (!m_backlog.empty()) { receive_desc_t *my_desc = m_backlog.back(); m_backlog.pop_back(); - mapped_buffer::pattern_check(my_desc + 1, my_desc->length); + mem_buffer::pattern_check(my_desc + 1, my_desc->length); if (my_desc->magic == MAGIC_DESC) { uct_iface_release_desc(my_desc); } else { @@ -71,35 +101,32 @@ class test_many2one_am : public uct_test { static const size_t NUM_SENDERS = 10; protected: - volatile uint32_t m_am_count; - std::vector m_backlog; + volatile uint32_t m_am_count; + std::vector m_backlog; + entity *m_receiver; }; -UCS_TEST_P(test_many2one_am, am_bcopy, "MAX_BCOPY=16384") +UCS_TEST_SKIP_COND_P(test_many2one_am, am_bcopy, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY | + UCT_IFACE_FLAG_CB_SYNC)) { const unsigned num_sends = 1000 / ucs::test_time_multiplier(); ucs_status_t status; - entity *receiver = create_entity(sizeof(receive_desc_t)); - m_entities.push_back(receiver); - - check_caps(UCT_IFACE_FLAG_AM_BCOPY); - check_caps(UCT_IFACE_FLAG_CB_SYNC); - ucs::ptr_vector buffers; for (unsigned i = 0; i < NUM_SENDERS; ++i) { entity *sender = create_entity(0); mapped_buffer *buffer = new mapped_buffer( sender->iface_attr().cap.am.max_bcopy, 0, *sender); - sender->connect(0, *receiver, i); + sender->connect(0, *m_receiver, i); m_entities.push_back(sender); buffers.push_back(buffer); } m_am_count = 0; - status = uct_iface_set_am_handler(receiver->iface(), AM_ID, am_handler, + status = uct_iface_set_am_handler(m_receiver->iface(), AM_ID, am_handler, (void*)this, 0); ASSERT_UCS_OK(status); @@ -112,13 +139,14 @@ UCS_TEST_P(test_many2one_am, am_bcopy, "MAX_BCOPY=16384") ssize_t packed_len; for (;;) { const entity& sender = ent(sender_num + 1); - packed_len = uct_ep_am_bcopy(sender.ep(0), AM_ID, mapped_buffer::pack, + packed_len = 
uct_ep_am_bcopy(sender.ep(0), AM_ID, + mapped_buffer::pack, (void*)&buffer, 0); if (packed_len != UCS_ERR_NO_RESOURCE) { break; } sender.progress(); - receiver->progress(); + m_receiver->progress(); } if (packed_len < 0) { ASSERT_UCS_OK((ucs_status_t)packed_len); @@ -129,7 +157,8 @@ UCS_TEST_P(test_many2one_am, am_bcopy, "MAX_BCOPY=16384") progress(); } - status = uct_iface_set_am_handler(receiver->iface(), AM_ID, NULL, NULL, 0); + status = uct_iface_set_am_handler(m_receiver->iface(), AM_ID, + NULL, NULL, 0); ASSERT_UCS_OK(status); check_backlog(); diff --git a/test/gtest/uct/test_md.cc b/test/gtest/uct/test_md.cc index 2f009dfa7ff..03ab61fe51d 100644 --- a/test/gtest/uct/test_md.cc +++ b/test/gtest/uct/test_md.cc @@ -6,11 +6,16 @@ #include "test_md.h" +#include + #include extern "C" { #include #include #include +#include +#include +#include } #include #include @@ -19,13 +24,6 @@ extern "C" { #include -#if HAVE_CUDA -#include -#include -#endif - -std::string const test_md::mem_types[] = {"host", "cuda", "cuda-managed", "rocm", "rocm-managed"}; - void* test_md::alloc_thread(void *arg) { volatile int *stop_flag = (int*)arg; @@ -34,39 +32,27 @@ void* test_md::alloc_thread(void *arg) int count = ucs::rand() % 100; std::vector buffers; for (int i = 0; i < count; ++i) { - buffers.push_back(malloc(ucs::rand() % (256*1024))); + buffers.push_back(malloc(ucs::rand() % (256 * UCS_KBYTE))); } std::for_each(buffers.begin(), buffers.end(), free); } return NULL; } -std::vector test_md::enum_mds(const std::string& mdc_name) { - static std::vector all_pds; - std::vector result; +std::vector test_md::enum_mds(const std::string& cmpt_name) { - if (all_pds.empty()) { - uct_md_resource_desc_t *md_resources; - unsigned num_md_resources; - ucs_status_t status; - - status = uct_query_md_resources(&md_resources, &num_md_resources); - ASSERT_UCS_OK(status); + std::vector md_resources = enum_md_resources(); - for (unsigned i = 0; i < num_md_resources; ++i) { - 
all_pds.push_back(md_resources[i].md_name); + std::vector result; + for (std::vector::iterator iter = md_resources.begin(); + iter != md_resources.end(); ++iter) { + if (iter->cmpt_attr.name == cmpt_name) { + result.push_back(test_md_param()); + result.back().component = iter->cmpt; + result.back().md_name = iter->rsc_desc.md_name; } - - uct_release_md_resource_list(md_resources); } - for (std::vector::iterator iter = all_pds.begin(); - iter != all_pds.end(); ++iter) - { - if (iter->substr(0, mdc_name.length()) == mdc_name) { - result.push_back(*iter); - } - } return result; } @@ -74,7 +60,7 @@ test_md::test_md() { UCS_TEST_CREATE_HANDLE(uct_md_config_t*, m_md_config, (void (*)(uct_md_config_t*))uct_config_release, - uct_md_config_read, GetParam().c_str(), NULL, NULL); + uct_md_config_read, GetParam().component, NULL, NULL); memset(&m_md_attr, 0, sizeof(m_md_attr)); } @@ -82,10 +68,13 @@ void test_md::init() { ucs::test_base::init(); UCS_TEST_CREATE_HANDLE(uct_md_h, m_md, uct_md_close, uct_md_open, - GetParam().c_str(), m_md_config); + GetParam().component, GetParam().md_name.c_str(), + m_md_config); ucs_status_t status = uct_md_query(m_md, &m_md_attr); ASSERT_UCS_OK(status); + + check_skip_test(); } void test_md::cleanup() @@ -105,81 +94,34 @@ void test_md::modify_config(const std::string& name, const std::string& value, } } -void test_md::check_caps(uint64_t flags, const std::string& name) +bool test_md::check_caps(uint64_t flags) { - uct_md_attr_t md_attr; - ucs_status_t status = uct_md_query(md(), &md_attr); - ASSERT_UCS_OK(status); - if (!ucs_test_all_flags(md_attr.cap.flags, flags)) { - std::stringstream ss; - ss << name << " is not supported by " << GetParam(); - UCS_TEST_SKIP_R(ss.str()); - } + return ((md() == NULL) || ucs_test_all_flags(m_md_attr.cap.flags, flags)); } -void test_md::alloc_memory(void **address, size_t size, char *fill_buffer, int mem_type) +void test_md::alloc_memory(void **address, size_t size, char *fill_buffer, + ucs_memory_type_t 
mem_type) { - if (mem_type == UCT_MD_MEM_TYPE_HOST) { - *address = malloc(size); - ASSERT_TRUE(*address != NULL); - if (fill_buffer) { - memcpy(*address, fill_buffer, size); - } -#if HAVE_CUDA - } else if (mem_type == UCT_MD_MEM_TYPE_CUDA) { - cudaError_t cerr; - - cerr = cudaMalloc(address, size); - ASSERT_TRUE(cerr == cudaSuccess); - - if(fill_buffer) { - cerr = cudaMemcpy(*address, fill_buffer, size, cudaMemcpyHostToDevice); - ASSERT_TRUE(cerr == cudaSuccess); - } -#endif - } else { - std::stringstream ss; - ss << "can't allocate " << mem_types[mem_type] - << " memory for " << GetParam(); - UCS_TEST_SKIP_R(ss.str()); + *address = mem_buffer::allocate(size, mem_type); + if (fill_buffer) { + mem_buffer::copy_to(*address, fill_buffer, size, mem_type); } } -void test_md::check_memory(void *address, void *expect, size_t size, int mem_type) +void test_md::check_memory(void *address, void *expect, size_t size, + ucs_memory_type_t mem_type) { - int ret; - if (mem_type == UCT_MD_MEM_TYPE_HOST) { - ret = memcmp(address, expect, size); - EXPECT_EQ(0, ret); - } else if (mem_type == UCT_MD_MEM_TYPE_CUDA) { -#if HAVE_CUDA - void *temp; - cudaError_t cerr; - - temp = malloc(size); - ASSERT_TRUE(temp != NULL); - cerr = cudaMemcpy(temp, address, size, cudaMemcpyDeviceToHost); - ASSERT_TRUE(cerr == cudaSuccess); - ret = memcmp(temp, expect, size); - EXPECT_EQ(0, ret); - free(temp); -#endif - } + EXPECT_TRUE(mem_buffer::compare(expect, address, size, mem_type)); } -void test_md::free_memory(void *address, int mem_type) +void test_md::free_memory(void *address, ucs_memory_type_t mem_type) { - if (mem_type == UCT_MD_MEM_TYPE_HOST) { - free(address); - } else if (mem_type == UCT_MD_MEM_TYPE_CUDA) { -#if HAVE_CUDA - cudaFree(address); -#endif - } + mem_buffer::release(address, mem_type); } -UCS_TEST_P(test_md, rkey_ptr) { - +UCS_TEST_SKIP_COND_P(test_md, rkey_ptr, + !check_caps(UCT_MD_FLAG_ALLOC | + UCT_MD_FLAG_RKEY_PTR)) { size_t size; uct_md_attr_t md_attr; void *rkey_buffer; @@ 
-189,14 +131,14 @@ UCS_TEST_P(test_md, rkey_ptr) { uct_rkey_bundle_t rkey_bundle; unsigned i; - check_caps(UCT_MD_FLAG_ALLOC|UCT_MD_FLAG_RKEY_PTR, "allocation+direct access"); // alloc (should work with both sysv and xpmem - size = 1024 * 1024 * sizeof(unsigned); + size = sizeof(unsigned) * UCS_MBYTE; + rva = NULL; status = uct_md_mem_alloc(md(), &size, (void **)&rva, UCT_MD_MEM_ACCESS_ALL, "test", &memh); ASSERT_UCS_OK(status); - EXPECT_LE(1024 * 1024 * sizeof(unsigned), size); + EXPECT_LE(sizeof(unsigned) * UCS_MBYTE, size); // pack status = uct_md_query(md(), &md_attr); @@ -211,11 +153,12 @@ UCS_TEST_P(test_md, rkey_ptr) { status = uct_md_mkey_pack(md(), memh, rkey_buffer); // unpack - status = uct_rkey_unpack(rkey_buffer, &rkey_bundle); + status = uct_rkey_unpack(GetParam().component, rkey_buffer, &rkey_bundle); ASSERT_UCS_OK(status); // get direct ptr - status = uct_rkey_ptr(&rkey_bundle, (uintptr_t)rva, (void **)&lva); + status = uct_rkey_ptr(GetParam().component, &rkey_bundle, (uintptr_t)rva, + (void **)&lva); ASSERT_UCS_OK(status); // check direct access // read @@ -232,31 +175,35 @@ UCS_TEST_P(test_md, rkey_ptr) { // check bounds // - status = uct_rkey_ptr(&rkey_bundle, (uintptr_t)(rva-1), (void **)&lva); - EXPECT_EQ(UCS_ERR_INVALID_ADDR, status); + status = uct_rkey_ptr(GetParam().component, &rkey_bundle, (uintptr_t)(rva-1), + (void **)&lva); + UCS_TEST_MESSAGE << "rkey_ptr of invalid address returned " + << ucs_status_string(status); - status = uct_rkey_ptr(&rkey_bundle, (uintptr_t)rva+size, (void **)&lva); - EXPECT_EQ(UCS_ERR_INVALID_ADDR, status); + status = uct_rkey_ptr(GetParam().component, &rkey_bundle, (uintptr_t)rva+size, + (void **)&lva); + UCS_TEST_MESSAGE << "rkey_ptr of invalid address returned " + << ucs_status_string(status); free(rkey_buffer); uct_md_mem_free(md(), memh); - uct_rkey_release(&rkey_bundle); + uct_rkey_release(GetParam().component, &rkey_bundle); } -UCS_TEST_P(test_md, alloc) { +UCS_TEST_SKIP_COND_P(test_md, alloc, + 
!check_caps(UCT_MD_FLAG_ALLOC)) { size_t size, orig_size; ucs_status_t status; void *address; uct_mem_h memh; - check_caps(UCT_MD_FLAG_ALLOC, "allocation"); - for (unsigned i = 0; i < 300; ++i) { size = orig_size = ucs::rand() % 65536; if (size == 0) { continue; } + address = NULL; status = uct_md_mem_alloc(md(), &size, &address, UCT_MD_MEM_ACCESS_ALL, "test", &memh); EXPECT_GT(size, 0ul); @@ -271,43 +218,50 @@ UCS_TEST_P(test_md, alloc) { } } -UCS_TEST_P(test_md, mem_type_owned) { +UCS_TEST_P(test_md, mem_type_detect_mds) { uct_md_attr_t md_attr; ucs_status_t status; - int ret; + ucs_memory_type_t mem_type; + int mem_type_id; void *address; status = uct_md_query(md(), &md_attr); ASSERT_UCS_OK(status); - if (md_attr.cap.mem_type == UCT_MD_MEM_TYPE_HOST) { - UCS_TEST_SKIP_R("MD owns only host memory"); + if (!md_attr.cap.detect_mem_types) { + UCS_TEST_SKIP_R("MD can't detect any memory types"); } - alloc_memory(&address, 1024, NULL, md_attr.cap.mem_type); - - ret = uct_md_is_mem_type_owned(md(), address, 1024); - EXPECT_TRUE(ret > 0); + ucs_for_each_bit(mem_type_id, md_attr.cap.detect_mem_types) { + alloc_memory(&address, UCS_KBYTE, NULL, + static_cast(mem_type_id)); + status = uct_md_detect_memory_type(md(), address, 1024, &mem_type); + ASSERT_UCS_OK(status); + EXPECT_TRUE(mem_type == mem_type_id); + } } -UCS_TEST_P(test_md, reg) { +UCS_TEST_SKIP_COND_P(test_md, reg, + !check_caps(UCT_MD_FLAG_REG)) { size_t size; uct_md_attr_t md_attr; ucs_status_t status; void *address; uct_mem_h memh; - check_caps(UCT_MD_FLAG_REG, "registration"); - status = uct_md_query(md(), &md_attr); ASSERT_UCS_OK(status); - for (unsigned mem_type = 0; mem_type < UCT_MD_MEM_TYPE_LAST; mem_type++) { - if (!(md_attr.cap.reg_mem_types & UCS_BIT(mem_type))) { - UCS_TEST_MESSAGE << mem_types[mem_type] << " memory " - << "registration is not supported by " << GetParam(); + for (unsigned mem_type_id = 0; mem_type_id < UCS_MEMORY_TYPE_LAST; mem_type_id++) { + ucs_memory_type_t mem_type = 
static_cast(mem_type_id); + + if (!(md_attr.cap.reg_mem_types & UCS_BIT(mem_type_id))) { + UCS_TEST_MESSAGE << mem_buffer::mem_type_name(mem_type) << " memory " + << "registration is not supported by " + << GetParam().md_name; continue; } + for (unsigned i = 0; i < 300; ++i) { size = ucs::rand() % 65536; if (size == 0) { @@ -335,24 +289,26 @@ UCS_TEST_P(test_md, reg) { } } -UCS_TEST_P(test_md, reg_perf) { +UCS_TEST_SKIP_COND_P(test_md, reg_perf, + !check_caps(UCT_MD_FLAG_REG)) { static const unsigned count = 10000; ucs_status_t status; uct_md_attr_t md_attr; void *ptr; - check_caps(UCT_MD_FLAG_REG, "registration"); - status = uct_md_query(md(), &md_attr); ASSERT_UCS_OK(status); - for (unsigned mem_type = 0; mem_type < UCT_MD_MEM_TYPE_LAST; mem_type++) { - if (!(md_attr.cap.reg_mem_types & UCS_BIT(mem_type))) { - UCS_TEST_MESSAGE << mem_types[mem_type] << " memory " - << " registration is not supported by " << GetParam(); + for (unsigned mem_type_id = 0; mem_type_id < UCS_MEMORY_TYPE_LAST; mem_type_id++) { + ucs_memory_type_t mem_type = static_cast(mem_type_id); + if (!(md_attr.cap.reg_mem_types & UCS_BIT(mem_type_id))) { + UCS_TEST_MESSAGE << mem_buffer::mem_type_name(mem_type) << " memory " + << " registration is not supported by " + << GetParam().md_name; continue; } - for (size_t size = 4096; size <= 4 * 1024 * 1024; size *= 2) { - alloc_memory(&ptr, size, NULL, mem_type); + for (size_t size = 4 * UCS_KBYTE; size <= 4 * UCS_MBYTE; size *= 2) { + alloc_memory(&ptr, size, NULL, + static_cast(mem_type_id)); ucs_time_t start_time = ucs_get_time(); ucs_time_t end_time = start_time; @@ -376,8 +332,8 @@ UCS_TEST_P(test_md, reg_perf) { } } - UCS_TEST_MESSAGE << GetParam() << ": Registration time for " << - mem_types[mem_type] << " memory " << size << " bytes: " << + UCS_TEST_MESSAGE << GetParam().md_name << ": Registration time for " << + ucs_memory_type_names[mem_type] << " memory " << size << " bytes: " << long(ucs_time_to_nsec(end_time - start_time) / n) << " ns"; 
free_memory(ptr, mem_type); @@ -385,15 +341,15 @@ UCS_TEST_P(test_md, reg_perf) { } } -UCS_TEST_P(test_md, reg_advise) { +UCS_TEST_SKIP_COND_P(test_md, reg_advise, + !check_caps(UCT_MD_FLAG_REG | + UCT_MD_FLAG_ADVISE)) { size_t size; ucs_status_t status; void *address; uct_mem_h memh; - check_caps(UCT_MD_FLAG_REG|UCT_MD_FLAG_ADVISE, "registration&advise"); - - size = 128 * 1024 * 1024; + size = 128 * UCS_MBYTE; address = malloc(size); ASSERT_TRUE(address != NULL); @@ -403,7 +359,8 @@ UCS_TEST_P(test_md, reg_advise) { ASSERT_UCS_OK(status); ASSERT_TRUE(memh != UCT_MEM_HANDLE_NULL); - status = uct_md_mem_advise(md(), memh, (char *)address + 7, 32*1024, UCT_MADV_WILLNEED); + status = uct_md_mem_advise(md(), memh, (char *)address + 7, + 32 * UCS_KBYTE, UCT_MADV_WILLNEED); EXPECT_UCS_OK(status); status = uct_md_mem_dereg(md(), memh); @@ -411,15 +368,16 @@ UCS_TEST_P(test_md, reg_advise) { free(address); } -UCS_TEST_P(test_md, alloc_advise) { +UCS_TEST_SKIP_COND_P(test_md, alloc_advise, + !check_caps(UCT_MD_FLAG_ALLOC | + UCT_MD_FLAG_ADVISE)) { size_t size, orig_size; ucs_status_t status; void *address; uct_mem_h memh; - check_caps(UCT_MD_FLAG_ALLOC|UCT_MD_FLAG_ADVISE, "allocation&advise"); - - orig_size = size = 128 * 1024 * 1024; + orig_size = size = 128 * UCS_MBYTE; + address = NULL; status = uct_md_mem_alloc(md(), &size, &address, UCT_MD_MEM_FLAG_NONBLOCK| @@ -430,7 +388,8 @@ UCS_TEST_P(test_md, alloc_advise) { EXPECT_TRUE(address != NULL); EXPECT_TRUE(memh != UCT_MEM_HANDLE_NULL); - status = uct_md_mem_advise(md(), memh, (char *)address + 7, 32*1024, UCT_MADV_WILLNEED); + status = uct_md_mem_advise(md(), memh, (char *)address + 7, + 32 * UCS_KBYTE, UCT_MADV_WILLNEED); EXPECT_UCS_OK(status); memset(address, 0xBB, size); @@ -441,16 +400,15 @@ UCS_TEST_P(test_md, alloc_advise) { * reproduce issue #1284, main thread is registering memory while another thread * allocates and releases memory. 
*/ -UCS_TEST_P(test_md, reg_multi_thread) { +UCS_TEST_SKIP_COND_P(test_md, reg_multi_thread, + !check_caps(UCT_MD_FLAG_REG)) { ucs_status_t status; uct_md_attr_t md_attr; - check_caps(UCT_MD_FLAG_REG, "registration"); - status = uct_md_query(md(), &md_attr); ASSERT_UCS_OK(status); - if (!(md_attr.cap.reg_mem_types & UCS_BIT(UCT_MD_MEM_TYPE_HOST))) { + if (!(md_attr.cap.reg_mem_types & UCS_BIT(UCS_MEMORY_TYPE_HOST))) { UCS_TEST_SKIP_R("not host memory type"); } @@ -484,21 +442,24 @@ UCS_TEST_P(test_md, reg_multi_thread) { pthread_join(thread_id, NULL); } -UCS_TEST_P(test_md, sockaddr_accessibility) { +UCS_TEST_SKIP_COND_P(test_md, sockaddr_accessibility, + !check_caps(UCT_MD_FLAG_SOCKADDR)) { ucs_sock_addr_t sock_addr; struct ifaddrs *ifaddr, *ifa; - int found_ipoib = 0; - - check_caps(UCT_MD_FLAG_SOCKADDR, "sockaddr"); + bool found_rdma = false; + bool found_ip = false; ASSERT_TRUE(getifaddrs(&ifaddr) != -1); /* go through a linked list of available interfaces */ for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { - if (ucs::is_inet_addr(ifa->ifa_addr) && ucs_netif_is_active(ifa->ifa_name)) { + if (ucs::is_inet_addr(ifa->ifa_addr) && + ucs_netif_flags_is_active(ifa->ifa_flags)) { sock_addr.addr = ifa->ifa_addr; - if (!strcmp(GetParam().c_str(), "rdmacm")) { + found_ip = true; + + if (GetParam().md_name == "rdmacm") { if (ucs::is_rdmacm_netdev(ifa->ifa_name)) { UCS_TEST_MESSAGE << "Testing " << ifa->ifa_name << " with " << ucs::sockaddr_to_str(ifa->ifa_addr); @@ -506,7 +467,7 @@ UCS_TEST_P(test_md, sockaddr_accessibility) { UCT_SOCKADDR_ACC_LOCAL)); ASSERT_TRUE(uct_md_is_sockaddr_accessible(md(), &sock_addr, UCT_SOCKADDR_ACC_REMOTE)); - found_ipoib = 1; + found_rdma = true; } } else { UCS_TEST_MESSAGE << "Testing " << ifa->ifa_name << " with " << @@ -519,8 +480,13 @@ UCS_TEST_P(test_md, sockaddr_accessibility) { } } - if ((!strcmp(GetParam().c_str(), "rdmacm")) && (!found_ipoib)) { - UCS_TEST_MESSAGE << "Cannot find an IPoIB interface with an IPv4 address on the 
host"; + if (GetParam().md_name == "rdmacm") { + if (!found_rdma) { + UCS_TEST_MESSAGE << + "Cannot find an IPoIB/RoCE interface with an IPv4 address on the host"; + } + } else if (!found_ip) { + UCS_TEST_MESSAGE << "Cannot find an IPv4/IPv6 interface on the host"; } freeifaddrs(ifaddr); @@ -535,8 +501,11 @@ UCS_TEST_P(test_md, sockaddr_accessibility) { xpmem, \ cuda_cpy, \ cuda_ipc, \ + rocm_cpy, \ + rocm_ipc, \ ib, \ ugni, \ + sockcm, \ rdmacm \ ) diff --git a/test/gtest/uct/test_md.h b/test/gtest/uct/test_md.h index 7d9aa98bf94..7bd0a366087 100644 --- a/test/gtest/uct/test_md.h +++ b/test/gtest/uct/test_md.h @@ -8,17 +8,25 @@ #ifndef UCT_TEST_MD #define UCT_TEST_MD -#include -#include +#include "uct_test.h" -class test_md : public testing::TestWithParam, - public ucs::test_base +struct test_md_param { + uct_component_h component; + std::string md_name; +}; + +static std::ostream& operator<<(std::ostream& os, const test_md_param& md_param) { + return os << md_param.md_name; +} + +class test_md : public testing::TestWithParam, + public uct_test_base { public: UCS_TEST_BASE_IMPL; - static std::vector enum_mds(const std::string& mdc_name); + static std::vector enum_mds(const std::string& cmpt_name); test_md(); @@ -27,10 +35,12 @@ class test_md : public testing::TestWithParam, virtual void cleanup(); virtual void modify_config(const std::string& name, const std::string& value, bool optional); - void check_caps(uint64_t flags, const std::string& name); - void alloc_memory(void **address, size_t size, char *fill, int mem_type); - void check_memory(void *address, void *expect, size_t size, int mem_type); - void free_memory(void *address, int mem_type); + bool check_caps(uint64_t flags); + void alloc_memory(void **address, size_t size, char *fill, + ucs_memory_type_t mem_type); + void check_memory(void *address, void *expect, size_t size, + ucs_memory_type_t mem_type); + void free_memory(void *address, ucs_memory_type_t mem_type); void test_registration(); @@ -44,7 +54,6 
@@ class test_md : public testing::TestWithParam, static void* alloc_thread(void *arg); - static std::string const mem_types[]; private: ucs::handle m_md_config; @@ -53,7 +62,7 @@ class test_md : public testing::TestWithParam, }; -#define _UCT_MD_INSTANTIATE_TEST_CASE(_test_case, _mdc_name) \ - INSTANTIATE_TEST_CASE_P(_mdc_name, _test_case, \ - testing::ValuesIn(_test_case::enum_mds(#_mdc_name))); +#define _UCT_MD_INSTANTIATE_TEST_CASE(_test_case, _cmpt_name) \ + INSTANTIATE_TEST_CASE_P(_cmpt_name, _test_case, \ + testing::ValuesIn(_test_case::enum_mds(#_cmpt_name))); #endif diff --git a/test/gtest/uct/test_mem.cc b/test/gtest/uct/test_mem.cc index a4fb58f5c22..f9c1107156d 100644 --- a/test/gtest/uct/test_mem.cc +++ b/test/gtest/uct/test_mem.cc @@ -4,17 +4,21 @@ * See file LICENSE for terms. */ -extern "C" { -#include +#include "uct_test.h" + #include -} -#include + class test_mem : public testing::TestWithParam, -public ucs::test_base { + public uct_test_base { public: UCS_TEST_BASE_IMPL; + virtual void init() { + ucs::skip_on_address_sanitizer(); + uct_test_base::init(); + } + protected: void check_mem(const uct_allocated_memory &mem, size_t min_length) { @@ -33,7 +37,7 @@ public ucs::test_base { }; -UCS_TEST_P(test_mem, nopd_alloc) { +UCS_TEST_P(test_mem, nomd_alloc) { uct_alloc_method_t methods[2]; uct_allocated_memory mem; ucs_status_t status; @@ -50,40 +54,38 @@ UCS_TEST_P(test_mem, nopd_alloc) { uct_mem_free(&mem); } -UCS_TEST_P(test_mem, pd_alloc) { +UCS_TEST_P(test_mem, md_alloc) { uct_alloc_method_t methods[3]; uct_allocated_memory mem; - uct_md_resource_desc_t *md_resources; + std::vector md_resources; uct_md_attr_t md_attr; - unsigned i, num_md_resources; ucs_status_t status; - uct_md_h pd; + uct_md_h md; uct_md_config_t *md_config; int nonblock; - status = uct_query_md_resources(&md_resources, &num_md_resources); - ASSERT_UCS_OK(status); - methods[0] = UCT_ALLOC_METHOD_MD; methods[1] = GetParam(); methods[2] = UCT_ALLOC_METHOD_HEAP; - for (i = 0; i < 
num_md_resources; ++i) { + md_resources = enum_md_resources(); + for (std::vector::iterator iter = md_resources.begin(); + iter != md_resources.end(); ++iter) { - status = uct_md_config_read(md_resources[i].md_name, NULL, NULL, &md_config); + status = uct_md_config_read(iter->cmpt, NULL, NULL, &md_config); ASSERT_UCS_OK(status); - status = uct_md_open(md_resources[i].md_name, md_config, &pd); + status = uct_md_open(iter->cmpt, iter->rsc_desc.md_name, md_config, &md); uct_config_release(md_config); ASSERT_UCS_OK(status); - status = uct_md_query(pd, &md_attr); + status = uct_md_query(md, &md_attr); ASSERT_UCS_OK(status); for (nonblock = 0; nonblock <= 1; ++nonblock) { int flags = nonblock ? UCT_MD_MEM_FLAG_NONBLOCK : 0; flags |= UCT_MD_MEM_ACCESS_ALL; - status = uct_mem_alloc(NULL, min_length, flags, methods, 3, &pd, 1, + status = uct_mem_alloc(NULL, min_length, flags, methods, 3, &md, 1, "test", &mem); ASSERT_UCS_OK(status); @@ -98,18 +100,16 @@ UCS_TEST_P(test_mem, pd_alloc) { uct_mem_free(&mem); } - uct_md_close(pd); + uct_md_close(md); } - - uct_release_md_resource_list(md_resources); } UCS_TEST_P(test_mem, md_fixed) { - uct_md_resource_desc_t *md_resources; + std::vector md_resources; uct_md_attr_t md_attr; uct_md_config_t *md_config; - uct_md_h pd; - unsigned num_md_resources, i, j; + uct_md_h md; + unsigned j; const size_t page_size = ucs_get_page_size(); const size_t n_tryes = 101; @@ -120,18 +120,18 @@ UCS_TEST_P(test_mem, md_fixed) { uct_allocated_memory_t uct_mem; ucs_status_t status; - status = uct_query_md_resources(&md_resources, &num_md_resources); - ASSERT_UCS_OK(status); + md_resources = enum_md_resources(); + for (std::vector::iterator iter = md_resources.begin(); + iter != md_resources.end(); ++iter) { - for (i = 0; i < num_md_resources; ++i) { - status = uct_md_config_read(md_resources[i].md_name, NULL, NULL, &md_config); + status = uct_md_config_read(iter->cmpt, NULL, NULL, &md_config); ASSERT_UCS_OK(status); - status = 
uct_md_open(md_resources[i].md_name, md_config, &pd); + status = uct_md_open(iter->cmpt, iter->rsc_desc.md_name, md_config, &md); uct_config_release(md_config); ASSERT_UCS_OK(status); - status = uct_md_query(pd, &md_attr); + status = uct_md_query(md, &md_attr); ASSERT_UCS_OK(status); if ((md_attr.cap.flags & UCT_MD_FLAG_ALLOC) && @@ -144,7 +144,7 @@ UCS_TEST_P(test_mem, md_fixed) { status = uct_mem_alloc(p_addr, 1, UCT_MD_MEM_FLAG_FIXED| UCT_MD_MEM_ACCESS_ALL, - &meth, 1, &pd, 1, "test", &uct_mem); + &meth, 1, &md, 1, "test", &uct_mem); if (status == UCS_OK) { ++n_success; EXPECT_EQ(meth, uct_mem.method); @@ -164,10 +164,8 @@ UCS_TEST_P(test_mem, md_fixed) { EXPECT_GT(n_success, (size_t)0); } - uct_md_close(pd); + uct_md_close(md); } - - uct_release_md_resource_list(md_resources); } diff --git a/test/gtest/uct/test_mm.cc b/test/gtest/uct/test_mm.cc index af0f7215340..49ab93fd0dd 100644 --- a/test/gtest/uct/test_mm.cc +++ b/test/gtest/uct/test_mm.cc @@ -6,24 +6,87 @@ extern "C" { #include +#include #include } #include "uct_p2p_test.h" #include #include "uct_test.h" + class test_uct_mm : public uct_test { public: - void initialize() { - if (GetParam()->dev_name == "posix") { - set_config("USE_SHM_OPEN=no"); + struct mm_resource : public resource { + std::string shm_dir; + + mm_resource(const resource& res, const std::string& shm_dir = "") : + resource(res.component, res.md_name, res.local_cpus, res.tl_name, + res.dev_name, res.dev_type), + shm_dir(shm_dir) + { + } + + virtual std::string name() const { + std::string name = resource::name(); + if (!shm_dir.empty()) { + name += ",dir=" + shm_dir; + } + return name; + } + }; + + typedef struct { + unsigned length; + /* data follows */ + } recv_desc_t; + + static std::vector enum_resources(const std::string& tl_name) { + static std::vector all_resources; + + if (all_resources.empty()) { + std::vector r = uct_test::enum_resources(""); + for (std::vector::iterator iter = r.begin(); + iter != r.end(); ++iter) { + if 
((*iter)->tl_name == "posix") { + enum_posix_variants(**iter, all_resources); + } else { + all_resources.push_back(mm_resource(**iter)); + } + } + } + + return filter_resources(all_resources, tl_name); + } + + test_uct_mm() : m_e1(NULL), m_e2(NULL) { + if (GetParam()->tl_name == "posix") { + set_posix_config(); } + } + + const mm_resource* GetParam() { + return dynamic_cast(uct_test::GetParam()); + } + + static void enum_posix_variants(const resource &res, + std::vector &variants) { + variants.push_back(mm_resource(res, "." )); + variants.push_back(mm_resource(res, "/dev/shm")); + } + + void set_posix_config() { + set_config("DIR=" + GetParam()->shm_dir); + } + + virtual void init() { uct_test::init(); m_e1 = uct_test::create_entity(0); m_entities.push_back(m_e1); + check_skip_test(); + m_e2 = uct_test::create_entity(0); m_entities.push_back(m_e2); @@ -31,11 +94,6 @@ class test_uct_mm : public uct_test { m_e2->connect(0, *m_e1, 0); } - typedef struct { - unsigned length; - /* data follows */ - } recv_desc_t; - static ucs_status_t mm_am_handler(void *arg, void *data, size_t length, unsigned flags) { recv_desc_t *my_desc = (recv_desc_t *) arg; @@ -51,52 +109,152 @@ class test_uct_mm : public uct_test { return UCS_OK; } - void cleanup() { - uct_test::cleanup(); + bool check_md_caps(uint64_t flags) { + FOR_EACH_ENTITY(iter) { + if (!(ucs_test_all_flags((*iter)->md_attr().cap.flags, flags))) { + return false; + } + } + return true; + } + + void test_attach_ptr(void *ptr, void *attach_ptr, uint64_t magic) + { + *(uint64_t*)attach_ptr = 0; + ucs_memory_cpu_store_fence(); + + *(uint64_t*)ptr = magic; + ucs_memory_cpu_load_fence(); + + /* Writing to *ptr should also update *attach_ptr */ + EXPECT_EQ(magic, *(uint64_t*)attach_ptr) + << "ptr=" << ptr << " attach_ptr=" << attach_ptr; + + UCS_TEST_MESSAGE << std::hex << *(uint64_t*)attach_ptr; + } + + uct_mm_md_t *md(entity *e) { + return ucs_derived_of(e->md(), uct_mm_md_t); + } + + void test_attach(void *ptr, uct_mem_h memh, 
size_t size) + { + uct_mm_seg_t *seg = (uct_mm_seg_t*)memh; + ucs_status_t status; + + size_t iface_addr_len = uct_mm_md_mapper_call(md(m_e1), iface_addr_length); + std::vector iface_addr(iface_addr_len); + + status = uct_mm_md_mapper_call(md(m_e1), iface_addr_pack, &iface_addr[0]); + ASSERT_UCS_OK(status); + + uct_mm_remote_seg_t rseg; + status = uct_mm_md_mapper_call(md(m_e2), mem_attach, seg->seg_id, size, + &iface_addr[0], &rseg); + ASSERT_UCS_OK(status); + + test_attach_ptr(ptr, rseg.address, 0xdeadbeef11111); + + uct_mm_md_mapper_call(md(m_e2), mem_detach, &rseg); + } + + void test_rkey(void *ptr, uct_mem_h memh, size_t size) + { + ucs_status_t status; + + std::vector rkey_buffer(m_e1->md_attr().rkey_packed_size); + + status = uct_md_mkey_pack(m_e1->md(), memh, &rkey_buffer[0]); + ASSERT_UCS_OK(status); + + uct_rkey_bundle_t rkey_ob; + status = uct_rkey_unpack(GetParam()->component, &rkey_buffer[0], &rkey_ob); + ASSERT_UCS_OK(status); + + /* For shared memory transports, rkey is the offset between local and + * remote pointers. 
+ */ + test_attach_ptr(ptr, UCS_PTR_BYTE_OFFSET(ptr, rkey_ob.rkey), + 0xdeadbeef22222); + + uct_rkey_release(GetParam()->component, &rkey_ob); + } + + void test_memh(void *ptr, uct_mem_h memh, size_t size) { + test_attach(ptr, memh, size); + test_attach(ptr, memh, size); + test_rkey(ptr, memh, size); } protected: entity *m_e1, *m_e2; }; -UCS_TEST_P(test_uct_mm, open_for_posix) { +UCS_TEST_SKIP_COND_P(test_uct_mm, open_for_posix, + check_caps(UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CB_SYNC)) +{ uint64_t send_data = 0xdeadbeef; uint64_t test_mm_hdr = 0xbeef; recv_desc_t *recv_buffer; - for (int i = 0; i < 2; i++) { + recv_buffer = (recv_desc_t *)malloc(sizeof(*recv_buffer) + + sizeof(uint64_t)); + recv_buffer->length = 0; /* Initialize length to 0 */ - if (i == 1) { - /* first loop tests USE_PROC_LINK==yes (default), - * second loop tests USE_PROC_LINK==no */ - if (GetParam()->dev_name == "posix") { - set_config("USE_PROC_LINK=no"); - } else { - break; - } - } + /* set a callback for the uct to invoke for receiving the data */ + uct_iface_set_am_handler(m_e2->iface(), 0, mm_am_handler , recv_buffer, + 0); + + /* send the data */ + uct_ep_am_short(m_e1->ep(0), 0, test_mm_hdr, &send_data, sizeof(send_data)); - initialize(); - check_caps(UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CB_SYNC); + /* progress sender and receiver until the receiver gets the message */ + wait_for_flag(&recv_buffer->length); - recv_buffer = (recv_desc_t *) malloc(sizeof(*recv_buffer) + sizeof(uint64_t)); - recv_buffer->length = 0; /* Initialize length to 0 */ + ASSERT_EQ(sizeof(send_data), recv_buffer->length); + EXPECT_EQ(send_data, *(uint64_t*)(recv_buffer+1)); - /* set a callback for the uct to invoke for receiving the data */ - uct_iface_set_am_handler(m_e2->iface(), 0, mm_am_handler , recv_buffer, - 0); + free(recv_buffer); +} - /* send the data */ - uct_ep_am_short(m_e1->ep(0), 0, test_mm_hdr, &send_data, sizeof(send_data)); +UCS_TEST_SKIP_COND_P(test_uct_mm, alloc, + 
!check_md_caps(UCT_MD_FLAG_ALLOC)) { - /* progress sender and receiver until the receiver gets the message */ - wait_for_flag(&recv_buffer->length); + size_t size = ucs_min(100000u, m_e1->md_attr().cap.max_alloc); + ucs_status_t status; - ASSERT_EQ(sizeof(send_data), recv_buffer->length); - EXPECT_EQ(send_data, *(uint64_t*)(recv_buffer+1)); + void *address = NULL; + size_t alloc_length = size; + uct_mem_h memh; + status = uct_md_mem_alloc(m_e1->md(), &alloc_length, &address, + UCT_MD_MEM_ACCESS_ALL, "test_mm", &memh); + ASSERT_UCS_OK(status); - free(recv_buffer); - } + test_memh(address, memh, size); + + status = uct_md_mem_free(m_e1->md(), memh); + ASSERT_UCS_OK(status); +} + +UCS_TEST_SKIP_COND_P(test_uct_mm, reg, + !check_md_caps(UCT_MD_FLAG_REG)) { + + size_t size = ucs_min(100000u, m_e1->md_attr().cap.max_reg); + ucs_status_t status; + + std::vector buffer(size); + + uct_mem_h memh; + status = uct_md_mem_reg(m_e1->md(), &buffer[0], size, UCT_MD_MEM_ACCESS_ALL, + &memh); + ASSERT_UCS_OK(status); + + test_memh(&buffer[0], memh, size); + + status = uct_md_mem_dereg(m_e1->md(), memh); + ASSERT_UCS_OK(status); } -_UCT_INSTANTIATE_TEST_CASE(test_uct_mm, mm) +_UCT_INSTANTIATE_TEST_CASE(test_uct_mm, posix) +_UCT_INSTANTIATE_TEST_CASE(test_uct_mm, sysv) +_UCT_INSTANTIATE_TEST_CASE(test_uct_mm, xpmem) diff --git a/test/gtest/uct/test_p2p_am.cc b/test/gtest/uct/test_p2p_am.cc index ff4253fba3d..303b3b6a78a 100644 --- a/test/gtest/uct/test_p2p_am.cc +++ b/test/gtest/uct/test_p2p_am.cc @@ -76,9 +76,11 @@ class uct_p2p_am_test : public uct_p2p_test (char*)resp_req->sendbuf->ptr() + sizeof(hdr), resp_req->sendbuf->length() - sizeof(hdr)); if (status == UCS_OK) { + pthread_mutex_lock(&test->m_lock); ++test->m_am_posted; resp_req->posted = true; delete resp_req->sendbuf; + pthread_mutex_unlock(&test->m_lock); } return status; } @@ -123,7 +125,7 @@ class uct_p2p_am_test : public uct_p2p_test tracer_ctx_t *ctx = (tracer_ctx_t *)arg; EXPECT_EQ(uint8_t(AM_ID), id); - 
mapped_buffer::pattern_check(data, length, SEED1); + mem_buffer::pattern_check(data, length, SEED1); *buffer = '\0'; ++ctx->count; } @@ -150,7 +152,7 @@ class uct_p2p_am_test : public uct_p2p_test pthread_mutex_unlock(&m_lock); return (my_desc->magic == MAGIC_DESC) ? UCS_INPROGRESS : UCS_OK; } - mapped_buffer::pattern_check(data, length, SEED1); + mem_buffer::pattern_check(data, length, SEED1); return UCS_OK; } @@ -159,7 +161,7 @@ class uct_p2p_am_test : public uct_p2p_test while (!m_backlog.empty()) { receive_desc_t *my_desc = m_backlog.back(); m_backlog.pop_back(); - mapped_buffer::pattern_check(my_desc + 1, my_desc->length, SEED1); + mem_buffer::pattern_check(my_desc + 1, my_desc->length, SEED1); pthread_mutex_unlock(&m_lock); if (my_desc->magic == MAGIC_DESC) { uct_iface_release_desc(my_desc); @@ -216,7 +218,7 @@ class uct_p2p_am_test : public uct_p2p_test } void test_xfer_do(send_func_t send, size_t length, unsigned flags, - uint32_t am_mode, uct_memory_type_t mem_type) + uint32_t am_mode, ucs_memory_type_t mem_type) { ucs_status_t status; @@ -262,7 +264,7 @@ class uct_p2p_am_test : public uct_p2p_test } virtual void test_xfer(send_func_t send, size_t length, unsigned flags, - uct_memory_type_t mem_type) { + ucs_memory_type_t mem_type) { if (receiver().iface_attr().cap.flags & UCT_IFACE_FLAG_CB_SYNC) { test_xfer_do(send, length, flags, 0, mem_type); @@ -306,25 +308,21 @@ class uct_p2p_am_test : public uct_p2p_test uct_p2p_am_test *test; bool posted; } m_pending_req; + pthread_mutex_t m_lock; private: bool m_keep_data; std::vector m_backlog; - pthread_mutex_t m_lock; tracer_ctx_t m_send_tracer; tracer_ctx_t m_recv_tracer; }; -UCS_TEST_P(uct_p2p_am_test, am_sync) { +UCS_TEST_SKIP_COND_P(uct_p2p_am_test, am_sync, + ((UCT_DEVICE_TYPE_SELF == GetParam()->dev_type) || + !check_caps(UCT_IFACE_FLAG_CB_SYNC, + UCT_IFACE_FLAG_AM_DUP))) { ucs_status_t status; - - if (UCT_DEVICE_TYPE_SELF == GetParam()->dev_type) { - UCS_TEST_SKIP_R("SELF doesn't use progress"); - } - - 
check_caps(UCT_IFACE_FLAG_CB_SYNC, UCT_IFACE_FLAG_AM_DUP); - mapped_buffer recvbuf(0, 0, sender()); /* dummy */ unsigned am_count = m_am_count = 0; @@ -363,11 +361,11 @@ UCS_TEST_P(uct_p2p_am_test, am_sync) { ASSERT_UCS_OK(status); } -UCS_TEST_P(uct_p2p_am_test, am_async) { +UCS_TEST_SKIP_COND_P(uct_p2p_am_test, am_async, + !check_caps(UCT_IFACE_FLAG_CB_ASYNC, + UCT_IFACE_FLAG_AM_DUP)) { ucs_status_t status; - check_caps(UCT_IFACE_FLAG_CB_ASYNC, UCT_IFACE_FLAG_AM_DUP); - mapped_buffer recvbuf(0, 0, sender()); /* dummy */ unsigned am_count = m_am_count = 0; @@ -407,13 +405,13 @@ UCS_TEST_P(uct_p2p_am_test, am_async) { ASSERT_UCS_OK(status); } -UCS_TEST_P(uct_p2p_am_test, am_async_response) { +UCS_TEST_SKIP_COND_P(uct_p2p_am_test, am_async_response, + !check_caps(UCT_IFACE_FLAG_CB_SYNC | + UCT_IFACE_FLAG_CB_ASYNC, + UCT_IFACE_FLAG_AM_DUP)) { ucs_status_t status; - - check_caps(UCT_IFACE_FLAG_CB_SYNC | UCT_IFACE_FLAG_CB_ASYNC, - UCT_IFACE_FLAG_AM_DUP); - mapped_buffer recvbuf(0, 0, sender()); /* dummy */ + m_am_posted = m_am_count = 0; m_pending_req.posted = false; @@ -439,12 +437,17 @@ UCS_TEST_P(uct_p2p_am_test, am_async_response) { ++m_am_posted; deadline = ucs_get_time() + ucs_time_from_sec(timeout); + pthread_mutex_lock(&m_lock); while ((!m_pending_req.posted || (m_am_count != m_am_posted)) && (ucs_get_time() < deadline)) { + pthread_mutex_unlock(&m_lock); sender().progress(); + pthread_mutex_lock(&m_lock); } + UCS_TEST_MESSAGE << "posted: " << m_am_posted << " am_count: " << m_am_count; EXPECT_TRUE(m_pending_req.posted); EXPECT_EQ(m_am_posted, m_am_count); + pthread_mutex_unlock(&m_lock); } status = uct_iface_set_am_handler(receiver().iface(), AM_ID, NULL, NULL, 0); @@ -492,7 +495,9 @@ class uct_p2p_am_misc : public uct_p2p_am_test static ucs_log_func_rc_t no_rx_buffs_log_handler(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap) + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, 
+ const char *message, va_list ap) { /* Ignore warnings about empty memory pool */ if ((level == UCS_LOG_LEVEL_WARN) && @@ -509,16 +514,18 @@ class uct_p2p_am_misc : public uct_p2p_am_test bool m_rx_buf_limit_failed; }; -UCS_TEST_P(uct_p2p_am_test, am_bcopy) { - check_caps(UCT_IFACE_FLAG_AM_BCOPY, UCT_IFACE_FLAG_AM_DUP); +UCS_TEST_SKIP_COND_P(uct_p2p_am_test, am_bcopy, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY, + UCT_IFACE_FLAG_AM_DUP)) { test_xfer_multi(static_cast(&uct_p2p_am_test::am_bcopy), 0ul, sender().iface_attr().cap.am.max_bcopy, TEST_UCT_FLAG_DIR_SEND_TO_RECV); } -UCS_TEST_P(uct_p2p_am_test, am_short_keep_data) { - check_caps(UCT_IFACE_FLAG_AM_SHORT, UCT_IFACE_FLAG_AM_DUP); +UCS_TEST_SKIP_COND_P(uct_p2p_am_test, am_short_keep_data, + !check_caps(UCT_IFACE_FLAG_AM_SHORT, + UCT_IFACE_FLAG_AM_DUP)) { set_keep_data(true); test_xfer_multi(static_cast(&uct_p2p_am_test::am_short), sizeof(uint64_t), @@ -526,8 +533,9 @@ UCS_TEST_P(uct_p2p_am_test, am_short_keep_data) { TEST_UCT_FLAG_DIR_SEND_TO_RECV); } -UCS_TEST_P(uct_p2p_am_test, am_bcopy_keep_data) { - check_caps(UCT_IFACE_FLAG_AM_BCOPY, UCT_IFACE_FLAG_AM_DUP); +UCS_TEST_SKIP_COND_P(uct_p2p_am_test, am_bcopy_keep_data, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY, + UCT_IFACE_FLAG_AM_DUP)) { set_keep_data(true); test_xfer_multi(static_cast(&uct_p2p_am_test::am_bcopy), sizeof(uint64_t), @@ -535,8 +543,9 @@ UCS_TEST_P(uct_p2p_am_test, am_bcopy_keep_data) { TEST_UCT_FLAG_DIR_SEND_TO_RECV); } -UCS_TEST_P(uct_p2p_am_test, am_zcopy) { - check_caps(UCT_IFACE_FLAG_AM_ZCOPY, UCT_IFACE_FLAG_AM_DUP); +UCS_TEST_SKIP_COND_P(uct_p2p_am_test, am_zcopy, + !check_caps(UCT_IFACE_FLAG_AM_ZCOPY, + UCT_IFACE_FLAG_AM_DUP)) { test_xfer_multi(static_cast(&uct_p2p_am_test::am_zcopy), 0ul, sender().iface_attr().cap.am.max_zcopy, @@ -545,30 +554,25 @@ UCS_TEST_P(uct_p2p_am_test, am_zcopy) { UCT_INSTANTIATE_TEST_CASE(uct_p2p_am_test) -const unsigned uct_p2p_am_misc::RX_MAX_BUFS = 1024; /* due to hard coded 'grow' - parameter in 
uct_ib_iface_recv_mpool_init */ +const unsigned uct_p2p_am_misc::RX_MAX_BUFS = 1024; /* due to hard coded 'grow' + parameter in uct_ib_iface_recv_mpool_init */ const unsigned uct_p2p_am_misc::RX_QUEUE_LEN = 64; -UCS_TEST_P(uct_p2p_am_misc, no_rx_buffs) { - - mapped_buffer sendbuf(10 * sizeof(uint64_t), SEED1, sender()); +UCS_TEST_SKIP_COND_P(uct_p2p_am_misc, no_rx_buffs, + (RUNNING_ON_VALGRIND || m_rx_buf_limit_failed || + !check_caps(UCT_IFACE_FLAG_AM_SHORT | + UCT_IFACE_FLAG_CB_SYNC))) +{ + mapped_buffer sendbuf(ucs_min(sender().iface_attr().cap.am.max_short, + 10 * sizeof(uint64_t)), + SEED1, sender()); mapped_buffer recvbuf(0, 0, sender()); /* dummy */ ucs_status_t status; - if (RUNNING_ON_VALGRIND) { - UCS_TEST_SKIP_R("skipping on valgrind"); - } - if (&sender() == &receiver()) { UCS_TEST_SKIP_R("skipping on loopback"); } - if (m_rx_buf_limit_failed) { - UCS_TEST_SKIP_R("Current transport doesn't have rx memory pool"); - } - - check_caps(UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CB_SYNC); - /* set a callback for the uct to invoke for receiving the data */ status = uct_iface_set_am_handler(receiver().iface(), AM_ID, am_handler, (void*)this, 0); @@ -590,13 +594,13 @@ UCS_TEST_P(uct_p2p_am_misc, no_rx_buffs) { short_progress_loop(); ucs_log_pop_handler(); - /* check that now the sender is able to send */ - EXPECT_EQ(UCS_OK, send_with_timeout(sender_ep(), sendbuf, recvbuf, 6)); + /* check that now the sender is able to send. 
+ * for UD time to recover depends on UCX_UD_TIMER_TICK */ + EXPECT_EQ(UCS_OK, send_with_timeout(sender_ep(), sendbuf, recvbuf, 100)); } -UCS_TEST_P(uct_p2p_am_misc, am_max_short_multi) { - check_caps(UCT_IFACE_FLAG_AM_SHORT); - +UCS_TEST_SKIP_COND_P(uct_p2p_am_misc, am_max_short_multi, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { ucs_status_t status; m_am_count = 0; @@ -608,7 +612,7 @@ UCS_TEST_P(uct_p2p_am_misc, am_max_short_multi) { size_t size = ucs_min(sender().iface_attr().cap.am.max_short, 8192ul); std::string sendbuf(size, 0); - mapped_buffer::pattern_fill(&sendbuf[0], sendbuf.size(), SEED1); + mem_buffer::pattern_fill(&sendbuf[0], sendbuf.size(), SEED1); ucs_assert(SEED1 == *(uint64_t*)&sendbuf[0]); /* exhaust all resources or time out 1sec */ @@ -633,33 +637,29 @@ UCS_TEST_P(uct_p2p_am_misc, am_max_short_multi) { EXPECT_EQ(UCS_OK, status); } -UCS_TEST_P(uct_p2p_am_misc, am_short, "MAX_SHORT=" + ucs::to_string(USHRT_MAX + 1)) { - check_caps(UCT_IFACE_FLAG_AM_SHORT, UCT_IFACE_FLAG_AM_DUP); - test_xfer_multi(static_cast(&uct_p2p_am_test::am_short), - sizeof(uint64_t), - sender().iface_attr().cap.am.max_short, - TEST_UCT_FLAG_DIR_SEND_TO_RECV); -} - UCT_INSTANTIATE_TEST_CASE(uct_p2p_am_misc) class uct_p2p_am_tx_bufs : public uct_p2p_am_test { public: uct_p2p_am_tx_bufs() : uct_p2p_am_test() { + std::string cfg_prefix = ""; ucs_status_t status1, status2; - /* can not reduce mpool size below retransmission window - * for ud - */ - if ((GetParam()->tl_name.compare("ud") == 0) || - (GetParam()->tl_name.compare("ud_mlx5") == 0)) { + /* can not reduce mpool size below retransmission window for ud */ + if (has_ud()) { m_inited = false; return; } - status1 = uct_config_modify(m_iface_config, "IB_TX_MAX_BUFS" , "32"); - status2 = uct_config_modify(m_iface_config, "IB_TX_BUFS_GROW" , "32"); + if (has_ib()) { + cfg_prefix = "IB_"; + } + + status1 = uct_config_modify(m_iface_config, + (cfg_prefix + "TX_MAX_BUFS").c_str() , "32"); + status2 = 
uct_config_modify(m_iface_config, + (cfg_prefix + "TX_BUFS_GROW").c_str(), "32"); if ((status1 != UCS_OK) || (status2 != UCS_OK)) { m_inited = false; } else { @@ -673,7 +673,7 @@ UCS_TEST_P(uct_p2p_am_tx_bufs, am_tx_max_bufs) { ucs_status_t status; mapped_buffer recvbuf(0, 0, sender()); /* dummy */ mapped_buffer sendbuf_bcopy(sender().iface_attr().cap.am.max_bcopy, - SEED1, sender()); + SEED1, sender()); status = uct_iface_set_am_handler(receiver().iface(), AM_ID, am_handler, this, UCT_CB_FLAG_ASYNC); @@ -682,17 +682,14 @@ UCS_TEST_P(uct_p2p_am_tx_bufs, am_tx_max_bufs) { if (!m_inited) { UCS_TEST_SKIP_R("Test does not apply to the current transport"); } - if (GetParam()->tl_name.compare("cm") == 0) { + if (has_transport("cm")) { UCS_TEST_SKIP_R("Test does not work with IB CM transport"); } - if ((GetParam()->tl_name.compare("rc") == 0) || - (GetParam()->tl_name.compare("rc_mlx5") == 0)) { + if (has_rc()) { UCS_TEST_SKIP_R("Test does not work with IB RC transports"); } do { status = am_bcopy(sender_ep(), sendbuf_bcopy, recvbuf); - if (status == UCS_OK) { - } } while (status == UCS_OK); /* short progress shall release tx buffers and diff --git a/test/gtest/uct/test_p2p_err.cc b/test/gtest/uct/test_p2p_err.cc index 5c721c6c332..fb8bc593353 100644 --- a/test/gtest/uct/test_p2p_err.cc +++ b/test/gtest/uct/test_p2p_err.cc @@ -85,6 +85,8 @@ class uct_p2p_err_test : public uct_p2p_test { } break; } + + progress(); } while (status == UCS_ERR_NO_RESOURCE); if (status != UCS_OK && status != UCS_INPROGRESS) { @@ -147,8 +149,9 @@ class uct_p2p_err_test : public uct_p2p_test { ucs_status_t uct_p2p_err_test::last_error = UCS_OK; -UCS_TEST_P(uct_p2p_err_test, local_access_error) { - check_caps(UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_ERRHANDLE_ZCOPY_BUF); +UCS_TEST_SKIP_COND_P(uct_p2p_err_test, local_access_error, + !check_caps(UCT_IFACE_FLAG_PUT_ZCOPY | + UCT_IFACE_FLAG_ERRHANDLE_ZCOPY_BUF)) { mapped_buffer sendbuf(16, 1, sender()); mapped_buffer recvbuf(16, 2, receiver()); @@ 
-160,8 +163,9 @@ UCS_TEST_P(uct_p2p_err_test, local_access_error) { recvbuf.pattern_check(2); } -UCS_TEST_P(uct_p2p_err_test, remote_access_error) { - check_caps(UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_ERRHANDLE_REMOTE_MEM); +UCS_TEST_SKIP_COND_P(uct_p2p_err_test, remote_access_error, + !check_caps(UCT_IFACE_FLAG_PUT_ZCOPY | + UCT_IFACE_FLAG_ERRHANDLE_REMOTE_MEM)) { mapped_buffer sendbuf(16, 1, sender()); mapped_buffer recvbuf(16, 2, receiver()); @@ -174,10 +178,10 @@ UCS_TEST_P(uct_p2p_err_test, remote_access_error) { } #if ENABLE_PARAMS_CHECK -UCS_TEST_P(uct_p2p_err_test, invalid_put_short_length) { - check_caps(UCT_IFACE_FLAG_PUT_SHORT); +UCS_TEST_SKIP_COND_P(uct_p2p_err_test, invalid_put_short_length, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { size_t max_short = sender().iface_attr().cap.put.max_short; - if (max_short > (2 * 1024 * 1024)) { + if (max_short > (2 * UCS_MBYTE)) { UCS_TEST_SKIP_R("max_short too large"); } @@ -191,10 +195,11 @@ UCS_TEST_P(uct_p2p_err_test, invalid_put_short_length) { recvbuf.pattern_check(2); } -UCS_TEST_P(uct_p2p_err_test, invalid_put_bcopy_length) { - check_caps(UCT_IFACE_FLAG_PUT_BCOPY | UCT_IFACE_FLAG_ERRHANDLE_BCOPY_LEN); +UCS_TEST_SKIP_COND_P(uct_p2p_err_test, invalid_put_bcopy_length, + !check_caps(UCT_IFACE_FLAG_PUT_BCOPY | + UCT_IFACE_FLAG_ERRHANDLE_BCOPY_LEN)) { size_t max_bcopy = sender().iface_attr().cap.put.max_bcopy; - if (max_bcopy > (2 * 1024 * 1024)) { + if (max_bcopy > (2 * UCS_MBYTE)) { UCS_TEST_SKIP_R("max_bcopy too large"); } @@ -208,10 +213,10 @@ UCS_TEST_P(uct_p2p_err_test, invalid_put_bcopy_length) { recvbuf.pattern_check(2); } -UCS_TEST_P(uct_p2p_err_test, invalid_am_short_length) { - check_caps(UCT_IFACE_FLAG_AM_SHORT); +UCS_TEST_SKIP_COND_P(uct_p2p_err_test, invalid_am_short_length, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { size_t max_short = sender().iface_attr().cap.am.max_short; - if (max_short > (2 * 1024 * 1024)) { + if (max_short > (2 * UCS_MBYTE)) { UCS_TEST_SKIP_R("max_short too large"); } @@ 
-225,10 +230,11 @@ UCS_TEST_P(uct_p2p_err_test, invalid_am_short_length) { recvbuf.pattern_check(2); } -UCS_TEST_P(uct_p2p_err_test, invalid_am_bcopy_length) { - check_caps(UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_ERRHANDLE_BCOPY_LEN); +UCS_TEST_SKIP_COND_P(uct_p2p_err_test, invalid_am_bcopy_length, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY | + UCT_IFACE_FLAG_ERRHANDLE_BCOPY_LEN)) { size_t max_bcopy = sender().iface_attr().cap.am.max_bcopy; - if (max_bcopy > (2 * 1024 * 1024)) { + if (max_bcopy > (2 * UCS_MBYTE)) { UCS_TEST_SKIP_R("max_bcopy too large"); } @@ -242,10 +248,10 @@ UCS_TEST_P(uct_p2p_err_test, invalid_am_bcopy_length) { recvbuf.pattern_check(2); } -UCS_TEST_P(uct_p2p_err_test, invalid_am_zcopy_hdr_length) { - check_caps(UCT_IFACE_FLAG_AM_ZCOPY); +UCS_TEST_SKIP_COND_P(uct_p2p_err_test, invalid_am_zcopy_hdr_length, + !check_caps(UCT_IFACE_FLAG_AM_ZCOPY)) { size_t max_hdr = sender().iface_attr().cap.am.max_hdr; - if (max_hdr > (2 * 1024 * 1024)) { + if (max_hdr > (2 * UCS_MBYTE)) { UCS_TEST_SKIP_R("max_hdr too large"); } if (max_hdr + 2 > sender().iface_attr().cap.am.max_bcopy) { @@ -263,9 +269,8 @@ UCS_TEST_P(uct_p2p_err_test, invalid_am_zcopy_hdr_length) { recvbuf.pattern_check(2); } -UCS_TEST_P(uct_p2p_err_test, short_invalid_am_id) { - check_caps(UCT_IFACE_FLAG_AM_SHORT); - +UCS_TEST_SKIP_COND_P(uct_p2p_err_test, short_invalid_am_id, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { mapped_buffer sendbuf(4, 2, sender()); test_error_run(OP_AM_SHORT, UCT_AM_ID_MAX, sendbuf.ptr(), sendbuf.length(), @@ -273,9 +278,8 @@ UCS_TEST_P(uct_p2p_err_test, short_invalid_am_id) { "active message id"); } -UCS_TEST_P(uct_p2p_err_test, bcopy_invalid_am_id) { - check_caps(UCT_IFACE_FLAG_AM_BCOPY); - +UCS_TEST_SKIP_COND_P(uct_p2p_err_test, bcopy_invalid_am_id, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY)) { mapped_buffer sendbuf(4, 2, sender()); test_error_run(OP_AM_BCOPY, UCT_AM_ID_MAX, sendbuf.ptr(), sendbuf.length(), @@ -283,9 +287,8 @@ UCS_TEST_P(uct_p2p_err_test, 
bcopy_invalid_am_id) { "active message id"); } -UCS_TEST_P(uct_p2p_err_test, zcopy_invalid_am_id) { - check_caps(UCT_IFACE_FLAG_AM_ZCOPY); - +UCS_TEST_SKIP_COND_P(uct_p2p_err_test, zcopy_invalid_am_id, + !check_caps(UCT_IFACE_FLAG_AM_ZCOPY)) { mapped_buffer sendbuf(4, 2, sender()); test_error_run(OP_AM_ZCOPY, UCT_AM_ID_MAX, sendbuf.ptr(), sendbuf.length(), diff --git a/test/gtest/uct/test_p2p_mix.cc b/test/gtest/uct/test_p2p_mix.cc index 56747ace835..4eb51a7cd9d 100644 --- a/test/gtest/uct/test_p2p_mix.cc +++ b/test/gtest/uct/test_p2p_mix.cc @@ -18,7 +18,7 @@ uct_p2p_mix_test::uct_p2p_mix_test() : uct_p2p_test(0), m_send_size(0) { ucs_status_t uct_p2p_mix_test::am_callback(void *arg, void *data, size_t length, unsigned flags) { - ucs_atomic_add32(&am_pending, -1); + ucs_atomic_sub32(&am_pending, 1); return UCS_OK; } @@ -124,9 +124,7 @@ void uct_p2p_mix_test::random_op(const mapped_buffer &sendbuf, for (;;) { status = (this->*m_avail_send_funcs[op])(sendbuf, recvbuf, &comp); - if (status == UCS_OK) { - break; - } else if (status == UCS_INPROGRESS) { + if (status == UCS_INPROGRESS) { /* coverity[loop_condition] */ while (comp.count > 0) { progress(); @@ -137,22 +135,18 @@ void uct_p2p_mix_test::random_op(const mapped_buffer &sendbuf, continue; } else { ASSERT_UCS_OK(status); + break; } } } -void uct_p2p_mix_test::check_run_conditions() { +void uct_p2p_mix_test::run(unsigned count) { if (m_avail_send_funcs.size() == 0) { UCS_TEST_SKIP_R("unsupported"); } - if (sender().md_attr().cap.mem_type != UCT_MD_MEM_TYPE_HOST) { + if (sender().md_attr().cap.access_mem_type != UCS_MEMORY_TYPE_HOST) { UCS_TEST_SKIP_R("skipping on non-host memory"); } -} - -void uct_p2p_mix_test::run(unsigned count) { - - check_run_conditions(); mapped_buffer sendbuf(m_send_size, 0, sender()); mapped_buffer recvbuf(m_send_size, 0, receiver()); @@ -161,7 +155,7 @@ void uct_p2p_mix_test::run(unsigned count) { random_op(sendbuf, recvbuf); } - sender().flush(); + flush(); } void 
uct_p2p_mix_test::init() { diff --git a/test/gtest/uct/test_p2p_mix.h b/test/gtest/uct/test_p2p_mix.h index b0e86be278d..4588095259e 100644 --- a/test/gtest/uct/test_p2p_mix.h +++ b/test/gtest/uct/test_p2p_mix.h @@ -55,19 +55,15 @@ class uct_p2p_mix_test : public uct_p2p_test { void random_op(const mapped_buffer &sendbuf, const mapped_buffer &recvbuf); - virtual void run(unsigned count); + void run(unsigned count); virtual void init(); virtual void cleanup(); -protected: - void check_run_conditions(); - - size_t m_send_size; - private: std::vector m_avail_send_funcs; + size_t m_send_size; static uint32_t am_pending; }; diff --git a/test/gtest/uct/test_p2p_rma.cc b/test/gtest/uct/test_p2p_rma.cc index 832f02ef174..3c925005f3d 100644 --- a/test/gtest/uct/test_p2p_rma.cc +++ b/test/gtest/uct/test_p2p_rma.cc @@ -67,11 +67,11 @@ ucs_status_t uct_p2p_rma_test::get_zcopy(uct_ep_h ep, const mapped_buffer &sendb } void uct_p2p_rma_test::test_xfer(send_func_t send, size_t length, - unsigned flags, uct_memory_type_t mem_type) + unsigned flags, ucs_memory_type_t mem_type) { - uct_memory_type_t src_mem_type = UCT_MD_MEM_TYPE_HOST; + ucs_memory_type_t src_mem_type = UCS_MEMORY_TYPE_HOST; - if ((GetParam()->tl_name.compare("cuda_ipc") == 0)) { + if (has_transport("cuda_ipc")) { src_mem_type = mem_type; } @@ -90,47 +90,65 @@ void uct_p2p_rma_test::test_xfer(send_func_t send, size_t length, } } -UCS_TEST_P(uct_p2p_rma_test, put_short) { - check_caps(UCT_IFACE_FLAG_PUT_SHORT); +UCS_TEST_SKIP_COND_P(uct_p2p_rma_test, put_short, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { test_xfer_multi(static_cast(&uct_p2p_rma_test::put_short), 0ul, sender().iface_attr().cap.put.max_short, TEST_UCT_FLAG_SEND_ZCOPY); } -UCS_TEST_P(uct_p2p_rma_test, put_bcopy) { - check_caps(UCT_IFACE_FLAG_PUT_BCOPY); +UCS_TEST_SKIP_COND_P(uct_p2p_rma_test, put_bcopy, + !check_caps(UCT_IFACE_FLAG_PUT_BCOPY)) { test_xfer_multi(static_cast(&uct_p2p_rma_test::put_bcopy), 0ul, sender().iface_attr().cap.put.max_bcopy, 
TEST_UCT_FLAG_SEND_ZCOPY); } -UCS_TEST_P(uct_p2p_rma_test, put_zcopy) { - check_caps(UCT_IFACE_FLAG_PUT_ZCOPY); +UCS_TEST_SKIP_COND_P(uct_p2p_rma_test, put_zcopy, + !check_caps(UCT_IFACE_FLAG_PUT_ZCOPY)) { test_xfer_multi(static_cast(&uct_p2p_rma_test::put_zcopy), 0ul, sender().iface_attr().cap.put.max_zcopy, TEST_UCT_FLAG_SEND_ZCOPY); } -UCS_TEST_P(uct_p2p_rma_test, get_short) { - check_caps(UCT_IFACE_FLAG_GET_SHORT); +UCS_TEST_SKIP_COND_P(uct_p2p_rma_test, get_short, + !check_caps(UCT_IFACE_FLAG_GET_SHORT)) { test_xfer_multi(static_cast(&uct_p2p_rma_test::get_short), 0ul, sender().iface_attr().cap.get.max_short, TEST_UCT_FLAG_RECV_ZCOPY); } -UCS_TEST_P(uct_p2p_rma_test, get_bcopy) { - check_caps(UCT_IFACE_FLAG_GET_BCOPY); +UCS_TEST_SKIP_COND_P(uct_p2p_rma_test, get_bcopy, + !check_caps(UCT_IFACE_FLAG_GET_BCOPY)) { test_xfer_multi(static_cast(&uct_p2p_rma_test::get_bcopy), 1ul, sender().iface_attr().cap.get.max_bcopy, TEST_UCT_FLAG_RECV_ZCOPY); } -UCS_TEST_P(uct_p2p_rma_test, get_zcopy) { - check_caps(UCT_IFACE_FLAG_GET_ZCOPY); +UCS_TEST_SKIP_COND_P(uct_p2p_rma_test, get_zcopy, + !check_caps(UCT_IFACE_FLAG_GET_ZCOPY)) { test_xfer_multi(static_cast(&uct_p2p_rma_test::get_zcopy), ucs_max(1ull, sender().iface_attr().cap.get.min_zcopy), sender().iface_attr().cap.get.max_zcopy, TEST_UCT_FLAG_RECV_ZCOPY); } +UCS_TEST_SKIP_COND_P(uct_p2p_rma_test, madvise, + !check_caps(UCT_IFACE_FLAG_GET_ZCOPY)) +{ + mapped_buffer sendbuf(4096, 0, sender()); + mapped_buffer recvbuf(4096, 0, receiver()); + char cmd_str[] = "/bin/true"; + + blocking_send(static_cast(&uct_p2p_rma_test::get_zcopy), + sender_ep(), sendbuf, recvbuf, true); + flush(); + + EXPECT_EQ(0, system(cmd_str)); + + blocking_send(static_cast(&uct_p2p_rma_test::get_zcopy), + sender_ep(), sendbuf, recvbuf, true); + flush(); +} + UCT_INSTANTIATE_TEST_CASE(uct_p2p_rma_test) diff --git a/test/gtest/uct/test_p2p_rma.h b/test/gtest/uct/test_p2p_rma.h index 599abe9d155..e9b0062e9be 100644 --- a/test/gtest/uct/test_p2p_rma.h +++ 
b/test/gtest/uct/test_p2p_rma.h @@ -36,7 +36,7 @@ class uct_p2p_rma_test : public uct_p2p_test { const mapped_buffer &recvbuf); virtual void test_xfer(send_func_t send, size_t length, - unsigned flags, uct_memory_type_t mem_type); + unsigned flags, ucs_memory_type_t mem_type); }; #endif diff --git a/test/gtest/uct/test_peer_failure.cc b/test/gtest/uct/test_peer_failure.cc index d33b14040a1..743e9a3b3e2 100644 --- a/test/gtest/uct/test_peer_failure.cc +++ b/test/gtest/uct/test_peer_failure.cc @@ -56,15 +56,33 @@ class test_uct_peer_failure : public uct_test { return UCS_OK; } + typedef struct { + uct_pending_req_t uct; + uct_ep_h ep; + } pending_send_request_t; + static ucs_status_t pending_cb(uct_pending_req_t *self) { - m_req_count++; - return UCS_OK; + const uint64_t send_data = 0; + pending_send_request_t *req = ucs_container_of(self, + pending_send_request_t, + uct); + + ucs_status_t status; + do { + /* Block in the pending handler (sending AM Short to fill UCT + * resources) to keep the pending requests in pending queue + * to purge them */ + status = uct_ep_am_short(req->ep, 0, 0, &send_data, + sizeof(send_data)); + } while (status == UCS_OK); + + return status; } static void purge_cb(uct_pending_req_t *self, void *arg) { - m_req_count++; + m_req_purge_count++; } static ucs_status_t err_cb(void *arg, uct_ep_h ep, ucs_status_t status) @@ -91,11 +109,6 @@ class test_uct_peer_failure : public uct_test { m_entities.push_back(m_receivers.back()); m_sender->connect(m_receivers.size() - 1, *m_receivers.back(), 0); - m_entities.back()->check_caps(UCT_IFACE_FLAG_AM_SHORT | - UCT_IFACE_FLAG_PENDING | - UCT_IFACE_FLAG_CB_SYNC | - UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE); - am_handler_setter(this)(m_receivers.back()); /* Make sure that TL is up and has resources */ send_recv_am(m_receivers.size() - 1); @@ -103,7 +116,7 @@ class test_uct_peer_failure : public uct_test { void set_am_handlers() { - check_caps(UCT_IFACE_FLAG_CB_SYNC); + check_caps_skip(UCT_IFACE_FLAG_CB_SYNC); 
std::for_each(m_receivers.begin(), m_receivers.end(), am_handler_setter(this)); } @@ -175,24 +188,27 @@ class test_uct_peer_failure : public uct_test { size_t m_tx_window; size_t m_err_count; size_t m_am_count; - static size_t m_req_count; + static size_t m_req_purge_count; + static const uint64_t m_required_caps; }; -size_t test_uct_peer_failure::m_req_count = 0ul; +size_t test_uct_peer_failure::m_req_purge_count = 0ul; +const uint64_t test_uct_peer_failure::m_required_caps = UCT_IFACE_FLAG_AM_SHORT | + UCT_IFACE_FLAG_PENDING | + UCT_IFACE_FLAG_CB_SYNC | + UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE; void test_uct_peer_failure::init() { uct_test::init(); + reduce_tl_send_queues(); + /* To reduce test execution time decrease retransmition timeouts * where it is relevant */ - if (GetParam()->tl_name == "rc" || GetParam()->tl_name == "rc_mlx5" || - GetParam()->tl_name == "dc" || GetParam()->tl_name == "dc_mlx5") { - set_config("RC_TIMEOUT=100us"); /* 100 us should be enough */ - set_config("RC_RETRY_COUNT=4"); - } else if (GetParam()->tl_name == "ud" || GetParam()->tl_name == "ud_mlx5") { - set_config("UD_TIMEOUT=3s"); - } + set_config("RC_TIMEOUT?=100us"); /* 100 us should be enough */ + set_config("RC_RETRY_COUNT?=4"); + set_config("UD_TIMEOUT?=3s"); uct_iface_params_t p = entity_params(); p.field_mask |= UCT_IFACE_PARAM_FIELD_OPEN_MODE; @@ -200,19 +216,20 @@ void test_uct_peer_failure::init() m_sender = uct_test::create_entity(p); m_entities.push_back(m_sender); + check_skip_test(); for (size_t i = 0; i < 2; ++i) { new_receiver(); } - m_err_count = 0; - m_req_count = 0; - m_am_count = 0; + m_err_count = 0; + m_req_purge_count = 0; + m_am_count = 0; } -UCS_TEST_P(test_uct_peer_failure, peer_failure) +UCS_TEST_SKIP_COND_P(test_uct_peer_failure, peer_failure, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT | + m_required_caps)) { - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - { scoped_log_handler slh(wrap_errors_logger); @@ -249,52 +266,61 @@ UCS_TEST_P(test_uct_peer_failure, 
peer_failure) UCS_ERR_ENDPOINT_TIMEOUT); EXPECT_EQ(uct_ep_flush(ep0(), 0, NULL), UCS_ERR_ENDPOINT_TIMEOUT); EXPECT_EQ(uct_ep_get_address(ep0(), NULL), UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_pending_add(ep0(), NULL, 0), UCS_ERR_ENDPOINT_TIMEOUT); + EXPECT_EQ(uct_ep_pending_add(ep0(), NULL, 0), UCS_ERR_BUSY); EXPECT_EQ(uct_ep_connect_to_ep(ep0(), NULL, NULL), UCS_ERR_ENDPOINT_TIMEOUT); EXPECT_GT(m_err_count, 0ul); } -UCS_TEST_P(test_uct_peer_failure, purge_failed_peer) +UCS_TEST_SKIP_COND_P(test_uct_peer_failure, purge_failed_peer, + !check_caps(m_required_caps)) { - check_caps(UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_PENDING); - set_am_handlers(); send_recv_am(0); send_recv_am(1); - const size_t num_pend_sends = 3ul; - uct_pending_req_t reqs[num_pend_sends]; + const ucs_time_t loop_end_limit = ucs::get_deadline(); + const size_t num_pend_sends = 64ul; + const uint64_t send_data = 0; + std::vector reqs(num_pend_sends); + { scoped_log_handler slh(wrap_errors_logger); - kill_receiver(); - ucs_status_t status; do { - status = uct_ep_am_short(ep0(), 0, 0, NULL, 0); - } while (status == UCS_OK); + status = uct_ep_am_short(ep0(), 0, 0, &send_data, + sizeof(send_data)); + } while ((status == UCS_OK) && (ucs_get_time() < loop_end_limit)); + + if (status == UCS_OK) { + UCS_TEST_SKIP_R("unable to fill the UCT resources"); + } else if (status != UCS_ERR_NO_RESOURCE) { + UCS_TEST_ABORT("AM Short failed with " << ucs_status_string(status)); + } + + kill_receiver(); for (size_t i = 0; i < num_pend_sends; i ++) { - reqs[i].func = pending_cb; - EXPECT_EQ(uct_ep_pending_add(ep0(), &reqs[i], 0), UCS_OK); + reqs[i].ep = ep0(); + reqs[i].uct.func = pending_cb; + EXPECT_EQ(UCS_OK, uct_ep_pending_add(ep0(), &reqs[i].uct, 0)); } flush(); } - EXPECT_EQ(uct_ep_am_short(ep0(), 0, 0, NULL, 0), UCS_ERR_ENDPOINT_TIMEOUT); + EXPECT_EQ(UCS_ERR_ENDPOINT_TIMEOUT, uct_ep_am_short(ep0(), 0, 0, NULL, 0)); uct_ep_pending_purge(ep0(), purge_cb, NULL); - EXPECT_EQ(num_pend_sends, m_req_count); + 
EXPECT_EQ(num_pend_sends, m_req_purge_count); EXPECT_GE(m_err_count, 0ul); } -UCS_TEST_P(test_uct_peer_failure, two_pairs_send) +UCS_TEST_SKIP_COND_P(test_uct_peer_failure, two_pairs_send, + !check_caps(m_required_caps)) { - check_caps(UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_PENDING); - set_am_handlers(); /* queue sends on 1st pair */ @@ -325,10 +351,9 @@ UCS_TEST_P(test_uct_peer_failure, two_pairs_send) } -UCS_TEST_P(test_uct_peer_failure, two_pairs_send_after) +UCS_TEST_SKIP_COND_P(test_uct_peer_failure, two_pairs_send_after, + !check_caps(m_required_caps)) { - check_caps(UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_PENDING); - set_am_handlers(); { @@ -367,10 +392,10 @@ class test_uct_peer_failure_cb : public test_uct_peer_failure { } }; -UCS_TEST_P(test_uct_peer_failure_cb, desproy_ep_cb) +UCS_TEST_SKIP_COND_P(test_uct_peer_failure_cb, desproy_ep_cb, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT | + m_required_caps)) { - check_caps(UCT_IFACE_FLAG_PUT_SHORT); - scoped_log_handler slh(wrap_errors_logger); kill_receiver(); EXPECT_EQ(uct_ep_put_short(ep0(), NULL, 0, 0, 0), UCS_OK); @@ -390,11 +415,6 @@ class test_uct_peer_failure_multiple : public test_uct_peer_failure void test_uct_peer_failure_multiple::init() { - if (RUNNING_ON_VALGRIND) { - /* See https://bugs.kde.org/show_bug.cgi?id=352742 */ - UCS_TEST_SKIP_R("skipping on valgrind because \"brk segment overflow\""); - } - size_t tx_queue_len = get_tx_queue_len(); if (ucs_get_page_size() > 4096) { @@ -416,17 +436,18 @@ void test_uct_peer_failure_multiple::init() size_t test_uct_peer_failure_multiple::get_tx_queue_len() const { - const std::string &tl_name = GetParam()->tl_name; - std::string name, val; - size_t tx_queue_len; + bool set = true; + std::string name, val; + size_t tx_queue_len; - if ((tl_name == "rc") || (tl_name == "rc_mlx5")) { - name = "RC_IB_TX_QUEUE_LEN"; - } else if ((tl_name == "dc") || (tl_name == "dc_mlx5")) { + if (has_rc()) { + name = "RC_RC_IB_TX_QUEUE_LEN"; + } else if (has_transport("dc_mlx5")) 
{ name = "DC_RC_IB_TX_QUEUE_LEN"; - } else if ((tl_name == "ud") || (tl_name == "ud_mlx5")) { + } else if (has_ud()) { name = "UD_IB_TX_QUEUE_LEN"; } else { + set = false; name = "TX_QUEUE_LEN"; } @@ -437,12 +458,21 @@ size_t test_uct_peer_failure_multiple::get_tx_queue_len() const tx_queue_len = 256; UCS_TEST_MESSAGE << name << " setting not found, " << "taken test default value: " << tx_queue_len; + if (set) { + UCS_TEST_ABORT(name + " config name must be found for %s transport" + + GetParam()->tl_name); + } } return tx_queue_len; } -UCS_TEST_P(test_uct_peer_failure_multiple, test, "RC_TM_ENABLE?=n") +/* Skip under valgrind due to brk segment overflow. + * See https://bugs.kde.org/show_bug.cgi?id=352742 */ +UCS_TEST_SKIP_COND_P(test_uct_peer_failure_multiple, test, + (RUNNING_ON_VALGRIND || + !check_caps(m_required_caps)), + "RC_TM_ENABLE?=n") { ucs_time_t timeout = ucs_get_time() + ucs_time_from_sec(200 * ucs::test_time_multiplier()); diff --git a/test/gtest/uct/test_pending.cc b/test/gtest/uct/test_pending.cc index d9c026875f8..c447b766b9a 100644 --- a/test/gtest/uct/test_pending.cc +++ b/test/gtest/uct/test_pending.cc @@ -14,6 +14,13 @@ extern "C" { class test_uct_pending : public uct_test { public: + test_uct_pending() : uct_test() { + m_e1 = NULL; + m_e2 = NULL; + + reduce_tl_send_queues(); + } + virtual void init() { uct_test::init(); @@ -22,153 +29,223 @@ class test_uct_pending : public uct_test { m_e2 = uct_test::create_entity(0); m_entities.push_back(m_e2); + + check_skip_test(); } void initialize() { m_e1->connect(0, *m_e2, 0); m_e2->connect(0, *m_e1, 0); + flush(); } typedef struct pending_send_request { + uct_pending_req_t uct; uct_ep_h ep; uint64_t data; int countdown; /* Actually send after X calls */ - uct_pending_req_t uct; - int active; - int id; - mapped_buffer *buf; + int send_count; /* Used by fairness test */ + bool pending; + bool delete_me; } pending_send_request_t; - void send_am_fill_resources(uct_ep_h ep) { - uint64_t send_data = 
0xdeadbeef; - ucs_time_t loop_end_limit = ucs_get_time() + ucs_time_from_sec(2); - ucs_status_t status; + struct am_completion_t { + uct_completion_t uct; + uct_ep_h ep; + }; + + bool send_am_or_add_pending(uint64_t *send_data, uint64_t header, + unsigned idx, pending_send_request_t *preq) { + ucs_time_t loop_end_limit = ucs::get_deadline(); + ucs_status_t status, status_pend; + + do { + status = uct_ep_am_short(m_e1->ep(idx), AM_ID, header, send_data, + sizeof(*send_data)); + if (status != UCS_OK) { + EXPECT_EQ(UCS_ERR_NO_RESOURCE, status); + pending_send_request_t *req = (preq != NULL) ? preq : + pending_alloc(*send_data, idx); + status_pend = uct_ep_pending_add(m_e1->ep(idx), + &req->uct, 0); + if (status_pend == UCS_ERR_BUSY) { /* retry */ + if (preq == NULL) { + pending_delete(req); + } + continue; + } + ASSERT_UCS_OK(status_pend); + ++n_pending; + req->pending = true; + /* coverity[leaked_storage] */ + } else if (preq != NULL) { + ++preq->send_count; /* used by fairness test */ + } + ++(*send_data); + return true; + } while (ucs_get_time() < loop_end_limit); + + return false; + } - do { - status = uct_ep_am_short(ep, 0, test_pending_hdr, &send_data, - sizeof(send_data)); - if (status == UCS_ERR_NO_RESOURCE) { + unsigned send_ams_and_add_pending(uint64_t *send_data, + uint64_t header = PENDING_HDR, + bool add_single_pend = true, + bool change_ep = false, + unsigned ep_idx = 0, + unsigned iters = 10000) { + ucs_time_t loop_end_limit = ucs_get_time() + ucs_time_from_sec(3); + unsigned i = 0; + int init_pending = n_pending; + int added_pending = 0; + unsigned idx; + + do { + idx = change_ep ? 
i : ep_idx; + if (!send_am_or_add_pending(send_data, header, idx, NULL)) { break; } - } while (ucs_get_time() < loop_end_limit); + ++i; + added_pending = n_pending - init_pending; + if ((added_pending != 0) && add_single_pend) { + EXPECT_EQ(1, added_pending); + break; + } + } while ((i < iters) && (ucs_get_time() < loop_end_limit)); - if (status != UCS_ERR_NO_RESOURCE) { + if (added_pending == 0) { UCS_TEST_SKIP_R("Can't fill UCT resources in the given time."); } + + return i; } static ucs_status_t am_handler(void *arg, void *data, size_t length, unsigned flags) { volatile unsigned *counter = (volatile unsigned*) arg; - uint64_t test_hdr = *(uint64_t *) data; - uint64_t actual_data = *(unsigned*)((char*)data + sizeof(test_hdr)); + uint64_t test_hdr = *(uint64_t *) data; + uint64_t actual_data = *(unsigned*)((char*)data + sizeof(test_hdr)); - if ((test_hdr == 0xabcd) && (actual_data == (0xdeadbeef + *counter))) { + if ((test_hdr == PENDING_HDR) && + (actual_data == (0xdeadbeef + *counter))) { ucs_atomic_add32(counter, 1); } else { - UCS_TEST_ABORT("Error in comparison in pending_am_handler. Counter: " << counter); + UCS_TEST_ABORT("Error in comparison in pending_am_handler. 
Counter: " + << counter << ", header: " << test_hdr + << ", data: " << actual_data); } return UCS_OK; } + static ucs_status_t am_handler_count(void *arg, void *data, size_t length, + unsigned flags) { + volatile unsigned *counter = (volatile unsigned*) arg; + ucs_atomic_add32(counter, 1); + return UCS_OK; + } + static ucs_status_t am_handler_simple(void *arg, void *data, size_t length, unsigned flags) { return UCS_OK; } - static ucs_status_t pending_send_op(uct_pending_req_t *self) { - - pending_send_request_t *req = ucs_container_of(self, pending_send_request_t, uct); - ucs_status_t status; + static ucs_status_t am_handler_check_rx_order(void *arg, void *data, + size_t length, unsigned flags) { + volatile bool *comp_received = (volatile bool*)arg; + uint64_t hdr = *(uint64_t*)data; - if (req->countdown > 0) { - --req->countdown; - return UCS_INPROGRESS; + /* We expect that message sent from pending callback will arrive + * before the one sent from the completion callback. */ + if (hdr == PENDING_HDR) { + pend_received = true; + EXPECT_FALSE(*comp_received); + } else if (hdr == COMP_HDR) { + *comp_received = true; + EXPECT_TRUE(pend_received); + } else { + EXPECT_EQ(AM_HDR, hdr); } - status = uct_ep_am_short(req->ep, 0, test_pending_hdr, &req->data, - sizeof(req->data)); - if (status == UCS_OK) { - pending_delete(req); - } - return status; + return UCS_OK; } - static ucs_status_t pending_send_op_simple(uct_pending_req_t *self) { + static void completion_cb(uct_completion_t *self, ucs_status_t c_status) { + am_completion_t *comp = ucs_container_of(self, am_completion_t, uct); - pending_send_request_t *req = ucs_container_of(self, pending_send_request_t, uct); - ucs_status_t status; + EXPECT_UCS_OK(c_status); + + ucs_status_t status = uct_ep_am_short(comp->ep, AM_ID, COMP_HDR, + NULL, 0); + EXPECT_TRUE(!UCS_STATUS_IS_ERR(status) || + (status == UCS_ERR_NO_RESOURCE)); + } + + static ucs_status_t pending_send_op(uct_pending_req_t *self) { + + pending_send_request_t 
*req = ucs_container_of(self, + pending_send_request_t, + uct); + if (req->countdown > 0) { + --req->countdown; + return UCS_INPROGRESS; + } - status = uct_ep_am_short(req->ep, 0, test_pending_hdr, &req->data, - sizeof(req->data)); + ucs_status_t status = uct_ep_am_short(req->ep, AM_ID, PENDING_HDR, + &req->data, sizeof(req->data)); if (status == UCS_OK) { - req->countdown ++; + req->pending = false; + req->send_count++; n_pending--; - req->active = 0; - //ucs_warn("dispatched %p idx %d total %d", req->ep, req->id, req->countdown); + if (req->delete_me) { + pending_delete(req); + } } + return status; } - static ucs_status_t pending_send_op_bcopy(uct_pending_req_t *self) { - - pending_send_request_t *req = ucs_container_of(self, pending_send_request_t, uct); - ssize_t packed_len; + static ucs_status_t pending_send_op_add_pending(uct_pending_req_t *self) { + ucs_status_t status = pending_send_op(self); + if (status == UCS_ERR_NO_RESOURCE) { + pending_send_request_t *req = ucs_container_of(self, + pending_send_request_t, + uct); + /* replace with the callback that just do sends and return + * `UCS_ERR_NO_RESOURCE` in case of no resources on the given EP */ + req->uct.func = pending_send_op; - packed_len = uct_ep_am_bcopy(req->ep, 0, mapped_buffer::pack, req->buf, 0); - if (packed_len > 0) { - req->countdown ++; - n_pending--; - req->active = 0; + status = uct_ep_pending_add(req->ep, &req->uct, 0); + ASSERT_UCS_OK(status); return UCS_OK; } - return (ucs_status_t)packed_len; - } - - static ucs_status_t pending_send_op_ok(uct_pending_req_t *self) { - pending_send_request_t *req = ucs_container_of(self, pending_send_request_t, uct); - pending_delete(req); - n_pending--; - return UCS_OK; + return status; } - static void purge_cb(uct_pending_req_t *uct_req, void *arg) + static void purge_cb(uct_pending_req_t *self, void *arg) { + pending_send_request_t *req = ucs_container_of(self, + pending_send_request_t, + uct); + pending_delete(req); ++n_purge; } - 
pending_send_request_t* pending_alloc(uint64_t send_data) { - pending_send_request_t *req = new pending_send_request_t(); - req->ep = m_e1->ep(0); - req->data = send_data; - req->countdown = 5; - req->uct.func = pending_send_op; - return req; - } + pending_send_request_t* pending_alloc(uint64_t send_data, int ep_idx = 0, + int count = 5, bool delete_me = true, + uct_pending_callback_t cb = pending_send_op) { + pending_send_request_t *req = new pending_send_request_t(); + req->ep = m_e1->ep(ep_idx); + req->data = send_data; + req->pending = false; + req->countdown = count; + req->uct.func = cb; + req->delete_me = delete_me; + req->send_count = 0; - pending_send_request_t* pending_alloc_simple(uint64_t send_data, int idx) { - pending_send_request_t *req = new pending_send_request_t(); - req->ep = m_e1->ep(idx); - req->data = send_data; - req->countdown = 0; - req->uct.func = pending_send_op_simple; - req->active = 0; - req->id = idx; - return req; - } - - pending_send_request_t* pending_alloc_simple(mapped_buffer *sbuf, int idx) { - pending_send_request_t *req = new pending_send_request_t(); - req->ep = m_e1->ep(idx); - req->buf = sbuf; - req->countdown = 0; - req->uct.func = pending_send_op_bcopy; - req->active = 0; - req->id = idx; return req; } @@ -177,16 +254,26 @@ class test_uct_pending : public uct_test { } protected: - static const uint64_t test_pending_hdr = 0xabcd; + static const uint64_t AM_HDR; + static const uint64_t PENDING_HDR; + static const uint64_t COMP_HDR; + static const uint8_t AM_ID; entity *m_e1, *m_e2; static int n_pending; static int n_purge; + static bool pend_received; }; -int test_uct_pending::n_pending = 0; -int test_uct_pending::n_purge = 0; +int test_uct_pending::n_pending = 0; +int test_uct_pending::n_purge = 0; +bool test_uct_pending::pend_received = false; +const uint64_t test_uct_pending::AM_HDR = 0x0ul; +const uint64_t test_uct_pending::PENDING_HDR = 0x1ul; +const uint64_t test_uct_pending::COMP_HDR = 0x2ul; +const uint8_t 
test_uct_pending::AM_ID = 0; -void install_handler_sync_or_async(uct_iface_t *iface, uint8_t id, uct_am_callback_t cb, void *arg) +void install_handler_sync_or_async(uct_iface_t *iface, uint8_t id, + uct_am_callback_t cb, void *arg) { ucs_status_t status; uct_iface_attr_t attr; @@ -202,140 +289,152 @@ void install_handler_sync_or_async(uct_iface_t *iface, uint8_t id, uct_am_callba } } -UCS_TEST_P(test_uct_pending, pending_op) +UCS_TEST_SKIP_COND_P(test_uct_pending, pending_op, + !check_caps(UCT_IFACE_FLAG_AM_SHORT | + UCT_IFACE_FLAG_PENDING)) { uint64_t send_data = 0xdeadbeef; - ucs_status_t status; - unsigned i, iters, counter = 0; + unsigned counter = 0; initialize(); - check_caps(UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_PENDING); - - iters = 1000000/ucs::test_time_multiplier(); /* set a callback for the uct to invoke for receiving the data */ - install_handler_sync_or_async(m_e2->iface(), 0, am_handler, &counter); + install_handler_sync_or_async(m_e2->iface(), AM_ID, am_handler, &counter); /* send the data until the resources run out */ - i = 0; - while (i < iters) { - status = uct_ep_am_short(m_e1->ep(0), 0, test_pending_hdr, &send_data, - sizeof(send_data)); - if (status != UCS_OK) { - if (status == UCS_ERR_NO_RESOURCE) { - - pending_send_request_t *req = pending_alloc(send_data); - - status = uct_ep_pending_add(m_e1->ep(0), &req->uct, 0); - if (status != UCS_OK) { - /* the request wasn't added to the pending data structure - * since resources became available. 
retry sending this message */ - pending_delete(req); - } else { - /* the request was added to the pending data structure */ - send_data += 1; - i++; - } - /* coverity[leaked_storage] */ - } else { - UCS_TEST_ABORT("Error: " << ucs_status_string(status)); - } - } else { - send_data += 1; - i++; - } - } + unsigned n_sends = send_ams_and_add_pending(&send_data, PENDING_HDR, false); + /* coverity[loop_condition] */ - while (counter != iters) { + while (counter != n_sends) { progress(); } + flush(); - ASSERT_EQ(counter, iters); + ASSERT_EQ(counter, n_sends); } -UCS_TEST_P(test_uct_pending, send_ooo_with_pending) +UCS_TEST_SKIP_COND_P(test_uct_pending, send_ooo_with_pending, + !check_caps(UCT_IFACE_FLAG_AM_SHORT | + UCT_IFACE_FLAG_PENDING)) { uint64_t send_data = 0xdeadbeef; - ucs_status_t status_send, status_pend = UCS_ERR_LAST; - ucs_time_t loop_end_limit; - unsigned i, counter = 0; + unsigned counter = 0; + ucs_status_t status; initialize(); - check_caps(UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_PENDING); /* set a callback for the uct to invoke when receiving the data */ - install_handler_sync_or_async(m_e2->iface(), 0, am_handler, &counter); - - loop_end_limit = ucs_get_time() + ucs_time_from_sec(2); - /* send while resources are available. 
try to add a request to pending */ - do { - status_send = uct_ep_am_short(m_e1->ep(0), 0, test_pending_hdr, &send_data, - sizeof(send_data)); - if (status_send == UCS_ERR_NO_RESOURCE) { - - pending_send_request_t *req = pending_alloc(send_data); - - status_pend = uct_ep_pending_add(m_e1->ep(0), &req->uct, 0); - if (status_pend == UCS_ERR_BUSY) { - pending_delete(req); - } else { - /* coverity[leaked_storage] */ - ++send_data; - break; - } - } else { - ASSERT_UCS_OK(status_send); - ++send_data; - } - } while (ucs_get_time() < loop_end_limit); + install_handler_sync_or_async(m_e2->iface(), AM_ID, am_handler, &counter); - if ((status_send == UCS_OK) || (status_pend == UCS_ERR_BUSY)) { - /* got here due to reaching the time limit in the above loop. - * couldn't add a request to pending. all sends were successful. */ - UCS_TEST_MESSAGE << "Can't create out-of-order in the given time."; - return; - } - /* there is one pending request */ - EXPECT_EQ(UCS_OK, status_pend); + unsigned n_sends = send_ams_and_add_pending(&send_data); /* progress the receiver a bit to release resources */ - for (i = 0; i < 1000; i++) { + for (unsigned i = 0; i < 1000; i++) { m_e2->progress(); } /* send a new message. the transport should make sure that this new message - * isn't sent before the one in pending, thus preventing out-of-order in sending. */ + * isn't sent before the one in pending, thus preventing out-of-order in + * sending. */ do { - status_send = uct_ep_am_short(m_e1->ep(0), 0, test_pending_hdr, - &send_data, sizeof(send_data)); + status = uct_ep_am_short(m_e1->ep(0), AM_ID, PENDING_HDR, &send_data, + sizeof(send_data)); short_progress_loop(); - } while (status_send == UCS_ERR_NO_RESOURCE); - ASSERT_UCS_OK(status_send); - ++send_data; + } while (status == UCS_ERR_NO_RESOURCE); + ASSERT_UCS_OK(status); + ++n_sends; /* the receive side checks that the messages were received in order. * check the last message here. 
(counter was raised by one for next iteration) */ - unsigned exp_counter = send_data - 0xdeadbeefUL; - wait_for_value(&counter, exp_counter, true); - EXPECT_EQ(exp_counter, counter); + wait_for_value(&counter, n_sends, true); + EXPECT_EQ(n_sends, counter); } -UCS_TEST_P(test_uct_pending, pending_purge) +UCS_TEST_SKIP_COND_P(test_uct_pending, send_ooo_with_pending_another_ep, + !check_caps(UCT_IFACE_FLAG_AM_SHORT | + UCT_IFACE_FLAG_PENDING)) { - const int num_eps = 5; - uct_pending_req_t reqs[num_eps]; + const int num_eps = 2; + uint64_t send_data = 0xdeadbeefUL; + unsigned counter = 0; + unsigned n_sends = 0; + bool ep_pending_idx[num_eps]; + + /* set a callback for the uct to invoke when receiving the data */ + install_handler_sync_or_async(m_e2->iface(), AM_ID, am_handler_count, + &counter); - check_caps(UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_PENDING); + for (unsigned idx = 0; idx < num_eps; ++idx) { + m_e1->connect(idx, *m_e2, idx); + ep_pending_idx[idx] = false; + } + + ucs_time_t loop_end_limit = ucs_get_time() + ucs_time_from_sec(3); + unsigned n_iters = 10000; + unsigned i = 0; + unsigned num_ep_pending = 0; + + n_pending = 0; + + do { + ucs_status_t status; + + for (unsigned idx = 0; idx < num_eps; ++idx) { + if (ep_pending_idx[idx]) { + continue; + } + + /* try to user all transport's resources */ + status = uct_ep_am_short(m_e1->ep(idx), AM_ID, PENDING_HDR, + &send_data, sizeof(send_data)); + if (status != UCS_OK) { + ASSERT_EQ(UCS_ERR_NO_RESOURCE, status); + ep_pending_idx[idx] = true; + num_ep_pending++; + + /* schedule pending req to send data on the another EP */ + pending_send_request_t *preq = + pending_alloc(send_data, num_eps - idx - 1, + 0, true, pending_send_op_add_pending); + status = uct_ep_pending_add(m_e1->ep(idx), &preq->uct, 0); + ASSERT_UCS_OK(status); + ++n_pending; + preq->pending = true; + /* coverity[leaked_storage] */ + } + ++n_sends; + } + + ++i; + } while ((num_ep_pending < num_eps) && + (i < n_iters) && (ucs_get_time() < 
loop_end_limit)); + + UCS_TEST_MESSAGE << "eps with pending: " << num_ep_pending << "/" << num_eps + << ", current pending: " << n_pending; + + flush(); + + wait_for_value(&n_pending, 0, true); + EXPECT_EQ(0, n_pending); + + wait_for_value(&counter, n_sends, true); + EXPECT_EQ(n_sends, counter); +} + +UCS_TEST_SKIP_COND_P(test_uct_pending, pending_purge, + !check_caps(UCT_IFACE_FLAG_AM_SHORT | + UCT_IFACE_FLAG_PENDING)) +{ + const int num_eps = 5; + uint64_t send_data = 0xdeadbeefUL; /* set a callback for the uct to invoke when receiving the data */ - install_handler_sync_or_async(m_e2->iface(), 0, am_handler_simple, NULL); + install_handler_sync_or_async(m_e2->iface(), AM_ID, am_handler_simple, NULL); for (int i = 0; i < num_eps; ++i) { m_e1->connect(i, *m_e2, i); - send_am_fill_resources(m_e1->ep(i)); - reqs[i].func = NULL; - EXPECT_UCS_OK(uct_ep_pending_add(m_e1->ep(i), &reqs[i], 0)); + send_ams_and_add_pending(&send_data, PENDING_HDR, true, false, i); } for (int i = 0; i < num_eps; ++i) { @@ -348,48 +447,37 @@ UCS_TEST_P(test_uct_pending, pending_purge) /* * test that the pending op callback is only called from the progress() */ -UCS_TEST_P(test_uct_pending, pending_async) +UCS_TEST_SKIP_COND_P(test_uct_pending, pending_async, + !check_caps(UCT_IFACE_FLAG_AM_SHORT | + UCT_IFACE_FLAG_AM_BCOPY | + UCT_IFACE_FLAG_PENDING | + UCT_IFACE_FLAG_CB_ASYNC)) { - pending_send_request_t *req = NULL; - ucs_status_t status; - ssize_t packed_len; - initialize(); - check_caps(UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_PENDING); - - mapped_buffer sbuf(ucs_min(64ul, m_e1->iface_attr().cap.am.max_bcopy), 0, - *m_e1); - - req = pending_alloc_simple(&sbuf, 0); /* set a callback for the uct to invoke when receiving the data */ - install_handler_sync_or_async(m_e2->iface(), 0, am_handler_simple, 0); + install_handler_sync_or_async(m_e2->iface(), AM_ID, am_handler_simple, 0); /* send while resources are available */ - n_pending = 0; - do { - packed_len = uct_ep_am_bcopy(m_e1->ep(0), 0, 
mapped_buffer::pack, - &sbuf, 0); - } while (packed_len >= 0); - - EXPECT_TRUE(packed_len == UCS_ERR_NO_RESOURCE); - - status = uct_ep_pending_add(m_e1->ep(0), &req->uct, 0); - EXPECT_UCS_OK(status); - n_pending++; + uint64_t send_data = 0xABC; + n_pending = 0; + send_ams_and_add_pending(&send_data); /* pending op must not be called either asynchronously or from the - * uct_ep_am_bcopy() */ + * uct_ep_am_bcopy/short() */ twait(300); EXPECT_EQ(1, n_pending); - packed_len = uct_ep_am_bcopy(m_e1->ep(0), 0, mapped_buffer::pack, &sbuf, 0); + /* send should fail, because we have pending op */ + mapped_buffer sbuf(ucs_min(64ul, m_e1->iface_attr().cap.am.max_bcopy), + 0, *m_e1); + ssize_t packed_len = uct_ep_am_bcopy(m_e1->ep(0), AM_ID, + mapped_buffer::pack, &sbuf, 0); EXPECT_EQ(1, n_pending); - EXPECT_GT(0, packed_len); + EXPECT_EQ((ssize_t)UCS_ERR_NO_RESOURCE, packed_len); wait_for_value(&n_pending, 0, true); EXPECT_EQ(0, n_pending); - pending_delete(req); } /* @@ -397,21 +485,20 @@ UCS_TEST_P(test_uct_pending, pending_async) * The issue is a dc transport specific but test may be also useful * for other transports */ -UCS_TEST_P(test_uct_pending, pending_ucs_ok_dc_arbiter_bug) +UCS_TEST_SKIP_COND_P(test_uct_pending, pending_ucs_ok_dc_arbiter_bug, + !check_caps(UCT_IFACE_FLAG_AM_SHORT | + UCT_IFACE_FLAG_PENDING) || + has_transport("cm")) { - ucs_status_t status; - ssize_t packed_len; - int N; - int i; + int N, max_listen_conn; initialize(); - check_caps(UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_PENDING); mapped_buffer sbuf(ucs_min(64ul, m_e1->iface_attr().cap.am.max_bcopy), 0, *m_e1); /* set a callback for the uct to invoke when receiving the data */ - install_handler_sync_or_async(m_e2->iface(), 0, am_handler_simple, 0); + install_handler_sync_or_async(m_e2->iface(), AM_ID, am_handler_simple, 0); if (RUNNING_ON_VALGRIND) { N = 64; @@ -424,30 +511,22 @@ UCS_TEST_P(test_uct_pending, pending_ucs_ok_dc_arbiter_bug) N = ucs_min(N, max_connections()); /* idx 0 is setup in 
initialize(). only need to alloc request */ - for (i = 1; i < N; i++) { - m_e1->connect(i, *m_e2, i); - } + for (int j, i = 1; i < N; i += j) { + max_listen_conn = ucs_min(max_connect_batch(), N - i); - /* give a chance to finish connection for some transports (ib/ud, tcp) */ - flush(); + for (j = 0; j < max_listen_conn; j++) { + int idx = i + j; + m_e1->connect(idx, *m_e2, idx); + } + /* give a chance to finish connection for some transports (ib/ud, tcp) */ + flush(); + } n_pending = 0; - /* try to exaust global resources and create a pending queue */ - for (i = 0; i < N; i++) { - packed_len = uct_ep_am_bcopy(m_e1->ep(i), 0, mapped_buffer::pack, - &sbuf, 0); - - if (packed_len == UCS_ERR_NO_RESOURCE) { - pending_send_request_t *req = pending_alloc(i); - - req->uct.func = pending_send_op_ok; - status = uct_ep_pending_add(m_e1->ep(i), &req->uct, 0); - EXPECT_UCS_OK(status); - n_pending++; - /* coverity[leaked_storage] */ - } - } + /* try to exhaust global resources and create a pending queue */ + uint64_t send_data = 0xBEEBEE; + send_ams_and_add_pending(&send_data, PENDING_HDR, false, true,0, N); UCS_TEST_MESSAGE << "pending queue len: " << n_pending; @@ -455,69 +534,43 @@ UCS_TEST_P(test_uct_pending, pending_ucs_ok_dc_arbiter_bug) EXPECT_EQ(0, n_pending); } -UCS_TEST_P(test_uct_pending, pending_fairness) +UCS_TEST_SKIP_COND_P(test_uct_pending, pending_fairness, + (RUNNING_ON_VALGRIND || + !check_caps(UCT_IFACE_FLAG_AM_SHORT | + UCT_IFACE_FLAG_PENDING))) { - int N = 16; + int N = 16; uint64_t send_data = 0xdeadbeef; int i, iters; - ucs_status_t status; - - if (RUNNING_ON_VALGRIND) { - UCS_TEST_SKIP_R("skipping on valgrind"); - } - - /* TODO: need to investigate the slowness of the test with TCP */ - if (GetParam()->tl_name == "tcp") { - ucs::watchdog_set(ucs::watchdog_timeout_default * 2.0); - } initialize(); - check_caps(UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_PENDING); + if (m_e1->iface_attr().cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { - N = 128; + N = 
ucs_min(128, max_connect_batch()); } pending_send_request_t *reqs[N]; - install_handler_sync_or_async(m_e2->iface(), 0, am_handler_simple, 0); + install_handler_sync_or_async(m_e2->iface(), AM_ID, am_handler_simple, 0); /* idx 0 is setup in initialize(). only need to alloc request */ - reqs[0] = pending_alloc_simple(send_data, 0); + reqs[0] = pending_alloc(send_data, 0, 0, false); for (i = 1; i < N; i++) { m_e1->connect(i, *m_e2, i); - reqs[i] = pending_alloc_simple(send_data, i); + reqs[i] = pending_alloc(send_data, i, 0, false); } /* give a chance to finish connection for some transports (ib/ud, tcp) */ flush(); n_pending = 0; - for (iters = 0; iters < 10000; iters++) { - /* send until resources of all eps are exausted */ + for (iters = 0; iters < 10000; iters++) { + /* send until resources of all eps are exhausted */ while (n_pending < N) { for (i = 0; i < N; ++i) { /* TODO: change to list */ - if (reqs[i]->active) { + if (reqs[i]->pending) { continue; } - for (;;) { - status = uct_ep_am_short(m_e1->ep(i), 0, test_pending_hdr, - &send_data, sizeof(send_data)); - if (status == UCS_ERR_NO_RESOURCE) { - /* schedule pending */ - status = uct_ep_pending_add(m_e1->ep(i), &reqs[i]->uct, - 0); - if (status == UCS_ERR_BUSY) { - continue; /* retry */ - } - ASSERT_UCS_OK(status); - - n_pending++; - reqs[i]->active = 1; - break; - } else { - ASSERT_UCS_OK(status); - /* sent */ - reqs[i]->countdown++; - break; - } + if (!send_am_or_add_pending(&send_data, PENDING_HDR, i, reqs[i])) { + UCS_TEST_SKIP_R("Can't fill UCT resources in the given time."); } } } @@ -525,26 +578,25 @@ UCS_TEST_P(test_uct_pending, pending_fairness) while(n_pending == N) { progress(); } - /* repeat the cycle. + /* repeat the cycle. * it is expected that every ep will send about - * the same number of messages. + * the same number of messages. 
*/ } - /* check fairness: */ + /* check fairness: */ int min_sends = INT_MAX; int max_sends = 0; for (i = 0; i < N; i++) { - min_sends = ucs_min(min_sends, reqs[i]->countdown); - max_sends = ucs_max(max_sends, reqs[i]->countdown); - //printf("%d: send %d\n", i, reqs[i]->countdown); + min_sends = ucs_min(min_sends, reqs[i]->send_count); + max_sends = ucs_max(max_sends, reqs[i]->send_count); } - UCS_TEST_MESSAGE << " min_sends: " << min_sends - << " max_sends: " << max_sends + UCS_TEST_MESSAGE << " min_sends: " << min_sends + << " max_sends: " << max_sends << " still pending: " << n_pending; while(n_pending > 0) { - progress(); + progress(); } flush(); @@ -561,4 +613,40 @@ UCS_TEST_P(test_uct_pending, pending_fairness) } } +/* Check that pending requests are processed before the sends from + * completion callbacks */ +UCS_TEST_SKIP_COND_P(test_uct_pending, send_ooo_with_comp, + !check_caps(UCT_IFACE_FLAG_AM_SHORT | + UCT_IFACE_FLAG_AM_ZCOPY | + UCT_IFACE_FLAG_PENDING)) +{ + initialize(); + + bool comp_received = false; + pend_received = false; + + uct_iface_set_am_handler(m_e2->iface(), AM_ID, am_handler_check_rx_order, + &comp_received, 0); + + mapped_buffer sendbuf(32, 0, *m_e1); + UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, sendbuf.ptr(), sendbuf.length(), + sendbuf.memh(), 1); + am_completion_t comp; + comp.uct.func = completion_cb; + comp.uct.count = 1; + comp.ep = m_e1->ep(0); + ucs_status_t status = uct_ep_am_zcopy(m_e1->ep(0), AM_ID, &AM_HDR, + sizeof(AM_HDR), iov, iovcnt, 0, + &comp.uct); + ASSERT_FALSE(UCS_STATUS_IS_ERR(status)); + + uint64_t send_data = 0xFAFAul; + send_ams_and_add_pending(&send_data, AM_HDR); + + wait_for_flag(&n_pending); + EXPECT_TRUE(n_pending); + + flush(); +} + UCT_INSTANTIATE_NO_SELF_TEST_CASE(test_uct_pending); diff --git a/test/gtest/uct/test_stats.cc b/test/gtest/uct/test_stats.cc index af4f3232c2b..b2c8983e137 100644 --- a/test/gtest/uct/test_stats.cc +++ b/test/gtest/uct/test_stats.cc @@ -14,7 +14,7 @@ extern "C" { #include 
"uct_test.h" #include "uct_p2p_test.h" -#if ENABLE_STATS +#ifdef ENABLE_STATS #define EXPECT_STAT(_side, _uct_obj, _stat, _exp_val) \ do { \ @@ -90,11 +90,12 @@ class test_uct_stats : public uct_p2p_test { void init_bufs(size_t min, size_t max) { size_t size = ucs_max(min, ucs_min(64ul, max)); - lbuf = new mapped_buffer(size, 0, sender(), 0, sender().md_attr().cap.mem_type); - rbuf = new mapped_buffer(size, 0, receiver(), 0, sender().md_attr().cap.mem_type); + lbuf = new mapped_buffer(size, 0, sender(), 0, sender().md_attr().cap.access_mem_type); + rbuf = new mapped_buffer(size, 0, receiver(), 0, sender().md_attr().cap.access_mem_type); } virtual void cleanup() { + flush(); delete lbuf; delete rbuf; uct_p2p_test::cleanup(); @@ -189,12 +190,12 @@ class test_uct_stats : public uct_p2p_test { /* test basic stat counters: * am, put, get, amo, flush and fence */ -UCS_TEST_P(test_uct_stats, am_short) +UCS_TEST_SKIP_COND_P(test_uct_stats, am_short, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { uint64_t hdr=0xdeadbeef, send_data=0xfeedf00d; ucs_status_t status; - check_caps(UCT_IFACE_FLAG_AM_SHORT); init_bufs(0, sender().iface_attr().cap.am.max_short); status = uct_iface_set_am_handler(receiver().iface(), 0, am_handler, @@ -211,12 +212,12 @@ UCS_TEST_P(test_uct_stats, am_short) check_am_rx_counters(sizeof(hdr) + sizeof(send_data)); } -UCS_TEST_P(test_uct_stats, am_bcopy) +UCS_TEST_SKIP_COND_P(test_uct_stats, am_bcopy, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY)) { ssize_t v; ucs_status_t status; - check_caps(UCT_IFACE_FLAG_AM_BCOPY); init_bufs(0, sender().iface_attr().cap.am.max_bcopy); status = uct_iface_set_am_handler(receiver().iface(), 0, am_handler, 0, UCT_CB_FLAG_ASYNC); @@ -232,11 +233,11 @@ UCS_TEST_P(test_uct_stats, am_bcopy) check_am_rx_counters(lbuf->length()); } -UCS_TEST_P(test_uct_stats, am_zcopy) +UCS_TEST_SKIP_COND_P(test_uct_stats, am_zcopy, + !check_caps(UCT_IFACE_FLAG_AM_ZCOPY)) { ucs_status_t status; - check_caps(UCT_IFACE_FLAG_AM_ZCOPY); init_bufs(0, 
sender().iface_attr().cap.am.max_zcopy); status = uct_iface_set_am_handler(receiver().iface(), 0, am_handler, 0, UCT_CB_FLAG_ASYNC); @@ -256,12 +257,12 @@ UCS_TEST_P(test_uct_stats, am_zcopy) } -UCS_TEST_P(test_uct_stats, put_short) +UCS_TEST_SKIP_COND_P(test_uct_stats, put_short, + !check_caps(UCT_IFACE_FLAG_PUT_SHORT)) { uint64_t send_data=0xfeedf00d; ucs_status_t status; - check_caps(UCT_IFACE_FLAG_PUT_SHORT); init_bufs(0, sender().iface_attr().cap.put.max_short); UCT_TEST_CALL_AND_TRY_AGAIN(uct_ep_put_short(sender_ep(), &send_data, sizeof(send_data), @@ -273,11 +274,11 @@ UCS_TEST_P(test_uct_stats, put_short) sizeof(send_data)); } -UCS_TEST_P(test_uct_stats, put_bcopy) +UCS_TEST_SKIP_COND_P(test_uct_stats, put_bcopy, + !check_caps(UCT_IFACE_FLAG_PUT_BCOPY)) { ssize_t v; - check_caps(UCT_IFACE_FLAG_PUT_BCOPY); init_bufs(0, sender().iface_attr().cap.put.max_bcopy); UCT_TEST_CALL_AND_TRY_AGAIN(uct_ep_put_bcopy(sender_ep(), mapped_buffer::pack, lbuf, @@ -289,11 +290,11 @@ UCS_TEST_P(test_uct_stats, put_bcopy) lbuf->length()); } -UCS_TEST_P(test_uct_stats, put_zcopy) +UCS_TEST_SKIP_COND_P(test_uct_stats, put_zcopy, + !check_caps(UCT_IFACE_FLAG_PUT_ZCOPY)) { ucs_status_t status; - check_caps(UCT_IFACE_FLAG_PUT_ZCOPY); init_bufs(0, sender().iface_attr().cap.put.max_zcopy); UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, lbuf->ptr(), lbuf->length(), lbuf->memh(), @@ -310,11 +311,11 @@ UCS_TEST_P(test_uct_stats, put_zcopy) } -UCS_TEST_P(test_uct_stats, get_bcopy) +UCS_TEST_SKIP_COND_P(test_uct_stats, get_bcopy, + !check_caps(UCT_IFACE_FLAG_GET_BCOPY)) { ucs_status_t status; - check_caps(UCT_IFACE_FLAG_GET_BCOPY); init_bufs(0, sender().iface_attr().cap.get.max_bcopy); init_completion(); @@ -330,11 +331,11 @@ UCS_TEST_P(test_uct_stats, get_bcopy) lbuf->length()); } -UCS_TEST_P(test_uct_stats, get_zcopy) +UCS_TEST_SKIP_COND_P(test_uct_stats, get_zcopy, + !check_caps(UCT_IFACE_FLAG_GET_ZCOPY)) { ucs_status_t status; - check_caps(UCT_IFACE_FLAG_GET_ZCOPY); 
init_bufs(sender().iface_attr().cap.get.min_zcopy, sender().iface_attr().cap.get.max_zcopy); @@ -353,16 +354,16 @@ UCS_TEST_P(test_uct_stats, get_zcopy) lbuf->length()); } -#define TEST_STATS_ATOMIC_POST(_op, _val) \ -UCS_TEST_P(test_uct_stats, atomic_post_ ## _op ## _val) \ -{ \ - ucs_status_t status; \ - check_atomics(UCS_BIT(UCT_ATOMIC_OP_ ## _op), OP ## _val); \ - init_bufs(sizeof(uint##_val##_t), sizeof(uint##_val##_t)); \ +#define TEST_STATS_ATOMIC_POST(_op, _val) \ +UCS_TEST_SKIP_COND_P(test_uct_stats, atomic_post_ ## _op ## _val, \ + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_ ## _op), OP ## _val)) \ +{ \ + ucs_status_t status; \ + init_bufs(sizeof(uint##_val##_t), sizeof(uint##_val##_t)); \ status = uct_ep_atomic ##_val##_post(sender_ep(), (UCT_ATOMIC_OP_ ## _op), \ - 1, rbuf->addr(), rbuf->rkey()); \ - EXPECT_UCS_OK(status); \ - check_atomic_counters(); \ + 1, rbuf->addr(), rbuf->rkey()); \ + EXPECT_UCS_OK(status); \ + check_atomic_counters(); \ } TEST_STATS_ATOMIC_POST(ADD, 32) @@ -375,21 +376,21 @@ TEST_STATS_ATOMIC_POST(XOR, 32) TEST_STATS_ATOMIC_POST(XOR, 64) -#define TEST_STATS_ATOMIC_FETCH(_op, _val) \ -UCS_TEST_P(test_uct_stats, atomic_fetch_## _op ## _val) \ -{ \ - ucs_status_t status; \ - uint##_val##_t result; \ - \ - check_atomics(UCS_BIT(UCT_ATOMIC_OP_ ## _op), FOP ## _val); \ - init_bufs(sizeof(result), sizeof(result)); \ - \ - init_completion(); \ - status = uct_ep_atomic##_val##_fetch(sender_ep(), (UCT_ATOMIC_OP_ ## _op), 1, \ +#define TEST_STATS_ATOMIC_FETCH(_op, _val) \ +UCS_TEST_SKIP_COND_P(test_uct_stats, atomic_fetch_## _op ## _val, \ + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_ ## _op), FOP ## _val)) \ +{ \ + ucs_status_t status; \ + uint##_val##_t result; \ + \ + init_bufs(sizeof(result), sizeof(result)); \ + \ + init_completion(); \ + status = uct_ep_atomic##_val##_fetch(sender_ep(), (UCT_ATOMIC_OP_ ## _op), 1, \ &result, rbuf->addr(), rbuf->rkey(), &m_comp); \ - wait_for_completion(status); \ - \ - check_atomic_counters(); \ + 
wait_for_completion(status); \ + \ + check_atomic_counters(); \ } TEST_STATS_ATOMIC_FETCH(ADD, 32) @@ -404,21 +405,21 @@ TEST_STATS_ATOMIC_FETCH(SWAP, 32) TEST_STATS_ATOMIC_FETCH(SWAP, 64) #define TEST_STATS_ATOMIC_CSWAP(val) \ -UCS_TEST_P(test_uct_stats, atomic_cswap##val) \ +UCS_TEST_SKIP_COND_P(test_uct_stats, atomic_cswap##val, \ + !check_atomics(UCS_BIT(UCT_ATOMIC_OP_CSWAP), FOP ## val)) \ { \ ucs_status_t status; \ uint##val##_t result; \ -\ - check_atomics(UCS_BIT(UCT_ATOMIC_OP_CSWAP), FOP ## val); \ + \ init_bufs(sizeof(result), sizeof(result)); \ -\ + \ init_completion(); \ UCT_TEST_CALL_AND_TRY_AGAIN( \ uct_ep_atomic_cswap##val (sender_ep(), 1, 2, rbuf->addr(), \ rbuf->rkey(), &result, &m_comp), \ status); \ wait_for_completion(status); \ -\ + \ check_atomic_counters(); \ } @@ -460,12 +461,12 @@ UCS_TEST_P(test_uct_stats, fence) /* flush test only check stats on tls with am_bcopy * TODO: full test matrix */ -UCS_TEST_P(test_uct_stats, flush_wait_iface) +UCS_TEST_SKIP_COND_P(test_uct_stats, flush_wait_iface, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY)) { uint64_t count_wait; ucs_status_t status; - check_caps(UCT_IFACE_FLAG_AM_BCOPY); init_bufs(0, sender().iface_attr().cap.am.max_bcopy); status = uct_iface_set_am_handler(receiver().iface(), 0, am_handler, 0, UCT_CB_FLAG_ASYNC); @@ -485,12 +486,12 @@ UCS_TEST_P(test_uct_stats, flush_wait_iface) EXPECT_STAT(sender, uct_iface, UCT_IFACE_STAT_FLUSH_WAIT, count_wait); } -UCS_TEST_P(test_uct_stats, flush_wait_ep) +UCS_TEST_SKIP_COND_P(test_uct_stats, flush_wait_ep, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY)) { uint64_t count_wait; ucs_status_t status; - check_caps(UCT_IFACE_FLAG_AM_BCOPY); init_bufs(0, sender().iface_attr().cap.am.max_bcopy); status = uct_iface_set_am_handler(receiver().iface(), 0, am_handler, 0, UCT_CB_FLAG_ASYNC); @@ -513,11 +514,11 @@ UCS_TEST_P(test_uct_stats, flush_wait_ep) /* fence test only check stats on tls with am_bcopy * TODO: full test matrix */ -UCS_TEST_P(test_uct_stats, fence_iface) 
+UCS_TEST_SKIP_COND_P(test_uct_stats, fence_iface, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY)) { ucs_status_t status; - check_caps(UCT_IFACE_FLAG_AM_BCOPY); init_bufs(0, sender().iface_attr().cap.am.max_bcopy); status = uct_iface_set_am_handler(receiver().iface(), 0, am_handler, 0, UCT_CB_FLAG_ASYNC); @@ -533,11 +534,11 @@ UCS_TEST_P(test_uct_stats, fence_iface) EXPECT_STAT(sender, uct_iface, UCT_IFACE_STAT_FENCE, 1UL); } -UCS_TEST_P(test_uct_stats, fence_ep) +UCS_TEST_SKIP_COND_P(test_uct_stats, fence_ep, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY)) { ucs_status_t status; - check_caps(UCT_IFACE_FLAG_AM_BCOPY); init_bufs(0, sender().iface_attr().cap.am.max_bcopy); status = uct_iface_set_am_handler(receiver().iface(), 0, am_handler, 0, UCT_CB_FLAG_ASYNC); @@ -553,12 +554,12 @@ UCS_TEST_P(test_uct_stats, fence_ep) EXPECT_STAT(sender, uct_ep, UCT_EP_STAT_FENCE, 1UL); } -UCS_TEST_P(test_uct_stats, tx_no_res) +UCS_TEST_SKIP_COND_P(test_uct_stats, tx_no_res, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY)) { uint64_t count; ucs_status_t status; - check_caps(UCT_IFACE_FLAG_AM_BCOPY); init_bufs(0, sender().iface_attr().cap.am.max_bcopy); status = uct_iface_set_am_handler(receiver().iface(), 0, am_handler, 0, UCT_CB_FLAG_ASYNC); @@ -569,13 +570,14 @@ UCS_TEST_P(test_uct_stats, tx_no_res) EXPECT_STAT(sender, uct_ep, UCT_EP_STAT_AM, 1024 - count); } -UCS_TEST_P(test_uct_stats, pending_add) +UCS_TEST_SKIP_COND_P(test_uct_stats, pending_add, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY | + UCT_IFACE_FLAG_PENDING)) { const size_t num_reqs = 5; uct_pending_req_t p_reqs[num_reqs]; ssize_t len; - check_caps(UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_PENDING); init_bufs(0, sender().iface_attr().cap.am.max_bcopy); EXPECT_UCS_OK(uct_iface_set_am_handler(receiver().iface(), 0, am_handler, 0, diff --git a/test/gtest/uct/test_tag.cc b/test/gtest/uct/test_tag.cc index 70a306d4e9c..45ba9902317 100644 --- a/test/gtest/uct/test_tag.cc +++ b/test/gtest/uct/test_tag.cc @@ -11,7 +11,6 @@ extern "C" { #include 
"uct_test.h" #define UCT_TAG_INSTANTIATE_TEST_CASE(_test_case) \ - _UCT_INSTANTIATE_TEST_CASE(_test_case, rc) \ _UCT_INSTANTIATE_TEST_CASE(_test_case, rc_mlx5) \ _UCT_INSTANTIATE_TEST_CASE(_test_case, dc_mlx5) @@ -59,35 +58,26 @@ class test_tag : public uct_test { "RC_TM_ENABLE", "y"); ASSERT_TRUE((status == UCS_OK) || (status == UCS_ERR_NO_ELEM)); + status = uct_config_modify(m_iface_config, "RC_TM_MP_SRQ_ENABLE", "no"); + ASSERT_TRUE((status == UCS_OK) || (status == UCS_ERR_NO_ELEM)); + uct_test::init(); - uct_iface_params params; - params.field_mask = UCT_IFACE_PARAM_FIELD_RX_HEADROOM | - UCT_IFACE_PARAM_FIELD_OPEN_MODE | - UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_CB | - UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_ARG | - UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_CB | - UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_ARG; - - // tl and dev names are taken from resources via GetParam, no need - // to fill it here - params.rx_headroom = 0; - params.open_mode = UCT_IFACE_OPEN_MODE_DEVICE; - params.eager_cb = unexp_eager; - params.eager_arg = reinterpret_cast(this); - params.rndv_cb = unexp_rndv; - params.rndv_arg = reinterpret_cast(this); + entity *sender = uct_test::create_entity(0ul, NULL, unexp_eager, + unexp_rndv, + reinterpret_cast(this), + reinterpret_cast(this)); + m_entities.push_back(sender); - if (UCT_DEVICE_TYPE_SELF == GetParam()->dev_type) { - entity *e = uct_test::create_entity(params); - m_entities.push_back(e); + check_skip_test(); - e->connect(0, *e, 0); + if (UCT_DEVICE_TYPE_SELF == GetParam()->dev_type) { + sender->connect(0, *sender, 0); } else { - entity *sender = uct_test::create_entity(params); - m_entities.push_back(sender); - - entity *receiver = uct_test::create_entity(params); + entity *receiver = uct_test::create_entity(0ul, NULL, unexp_eager, + unexp_rndv, + reinterpret_cast(this), + reinterpret_cast(this)); m_entities.push_back(receiver); sender->connect(0, *receiver, 0); @@ -147,7 +137,7 @@ class test_tag : public uct_test { { UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, 
ctx.mbuf->ptr(), ctx.mbuf->length(), ctx.mbuf->memh(), - sender().iface_attr().cap.tag.eager.max_iov); + e.iface_attr().cap.tag.eager.max_iov); ucs_status_t status = uct_ep_tag_eager_zcopy(e.ep(0), ctx.tag, ctx.imm_data, iov, iovcnt, @@ -332,6 +322,7 @@ class test_tag : public uct_test { // Message should be reported as unexpected and filled with // recv seed (unchanged), as the incoming tag does not match the expected check_rx_completion(r_ctx, false, RECV_SEED); + ASSERT_UCS_OK(tag_cancel(receiver(), r_ctx, 1)); flush(); } @@ -401,7 +392,8 @@ class test_tag : public uct_test { } static ucs_status_t unexp_eager(void *arg, void *data, size_t length, - unsigned flags, uct_tag_t stag, uint64_t imm) + unsigned flags, uct_tag_t stag, + uint64_t imm, void **context) { recv_ctx *user_ctx = reinterpret_cast(imm); user_ctx->unexp = true; @@ -447,7 +439,9 @@ class test_tag : public uct_test { static ucs_log_func_rc_t log_ep_destroy(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap) + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) { if (level == UCS_LOG_LEVEL_WARN) { // Ignore warnings about uncompleted operations during ep destroy @@ -480,94 +474,94 @@ class test_tag : public uct_test { bool test_tag::is_am_received = false; -UCS_TEST_P(test_tag, tag_eager_short_expected) +UCS_TEST_SKIP_COND_P(test_tag, tag_eager_short_expected, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_SHORT)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_SHORT); test_tag_expected(static_cast(&test_tag::tag_eager_short), sender().iface_attr().cap.tag.eager.max_short); } -UCS_TEST_P(test_tag, tag_eager_bcopy_expected) +UCS_TEST_SKIP_COND_P(test_tag, tag_eager_bcopy_expected, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY); test_tag_expected(static_cast(&test_tag::tag_eager_bcopy), sender().iface_attr().cap.tag.eager.max_bcopy); } -UCS_TEST_P(test_tag, 
tag_eager_zcopy_expected) +UCS_TEST_SKIP_COND_P(test_tag, tag_eager_zcopy_expected, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_ZCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_ZCOPY); test_tag_expected(static_cast(&test_tag::tag_eager_zcopy), sender().iface_attr().cap.tag.eager.max_zcopy); } -UCS_TEST_P(test_tag, tag_rndv_zcopy_expected) +UCS_TEST_SKIP_COND_P(test_tag, tag_rndv_zcopy_expected, + !check_caps(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY); test_tag_expected(static_cast(&test_tag::tag_rndv_zcopy), sender().iface_attr().cap.tag.rndv.max_zcopy); } -UCS_TEST_P(test_tag, tag_eager_bcopy_unexpected) +UCS_TEST_SKIP_COND_P(test_tag, tag_eager_bcopy_unexpected, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY); test_tag_unexpected(static_cast(&test_tag::tag_eager_bcopy), sender().iface_attr().cap.tag.eager.max_bcopy); } -UCS_TEST_P(test_tag, tag_eager_zcopy_unexpected) +UCS_TEST_SKIP_COND_P(test_tag, tag_eager_zcopy_unexpected, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_ZCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_ZCOPY); test_tag_unexpected(static_cast(&test_tag::tag_eager_zcopy), sender().iface_attr().cap.tag.eager.max_bcopy); } -UCS_TEST_P(test_tag, tag_rndv_zcopy_unexpected) +UCS_TEST_SKIP_COND_P(test_tag, tag_rndv_zcopy_unexpected, + !check_caps(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY); test_tag_unexpected(static_cast(&test_tag::tag_rndv_zcopy)); } -UCS_TEST_P(test_tag, tag_eager_bcopy_wrong_tag) +UCS_TEST_SKIP_COND_P(test_tag, tag_eager_bcopy_wrong_tag, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY); test_tag_wrong_tag(static_cast(&test_tag::tag_eager_bcopy)); } -UCS_TEST_P(test_tag, tag_eager_zcopy_wrong_tag) +UCS_TEST_SKIP_COND_P(test_tag, tag_eager_zcopy_wrong_tag, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_ZCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_ZCOPY); 
test_tag_wrong_tag(static_cast(&test_tag::tag_eager_zcopy)); } -UCS_TEST_P(test_tag, tag_eager_short_tag_mask) +UCS_TEST_SKIP_COND_P(test_tag, tag_eager_short_tag_mask, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_SHORT)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_SHORT); test_tag_mask(static_cast(&test_tag::tag_eager_short)); } -UCS_TEST_P(test_tag, tag_eager_bcopy_tag_mask) +UCS_TEST_SKIP_COND_P(test_tag, tag_eager_bcopy_tag_mask, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY); test_tag_mask(static_cast(&test_tag::tag_eager_bcopy)); } -UCS_TEST_P(test_tag, tag_eager_zcopy_tag_mask) +UCS_TEST_SKIP_COND_P(test_tag, tag_eager_zcopy_tag_mask, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_ZCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_ZCOPY); test_tag_mask(static_cast(&test_tag::tag_eager_zcopy)); } -UCS_TEST_P(test_tag, tag_rndv_zcopy_tag_mask) +UCS_TEST_SKIP_COND_P(test_tag, tag_rndv_zcopy_tag_mask, + !check_caps(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY); test_tag_mask(static_cast(&test_tag::tag_rndv_zcopy)); } -UCS_TEST_P(test_tag, tag_hold_uct_desc) +UCS_TEST_SKIP_COND_P(test_tag, tag_hold_uct_desc, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY | + UCT_IFACE_FLAG_TAG_RNDV_ZCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY | UCT_IFACE_FLAG_TAG_RNDV_ZCOPY); - int n = 10; int msg_size = ucs_min(sender().iface_attr().cap.tag.eager.max_bcopy, sender().iface_attr().cap.tag.rndv.max_zcopy); @@ -587,23 +581,21 @@ UCS_TEST_P(test_tag, tag_hold_uct_desc) } -UCS_TEST_P(test_tag, tag_send_no_tag) +UCS_TEST_SKIP_COND_P(test_tag, tag_send_no_tag, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY); - - uct_iface_set_am_handler(receiver().iface(), 0, am_handler, NULL, 0); - mapped_buffer lbuf(200, SEND_SEED, sender()); - ssize_t len = uct_ep_am_bcopy(sender().ep(0), 0, mapped_buffer::pack, - reinterpret_cast(&lbuf), 0); - EXPECT_EQ(lbuf.length(), static_cast(len)); - 
wait_for_flag(&is_am_received); - EXPECT_TRUE(is_am_received); + uct_iface_set_am_handler(receiver().iface(), 0, am_handler, NULL, 0); + mapped_buffer lbuf(200, SEND_SEED, sender()); + ssize_t len = uct_ep_am_bcopy(sender().ep(0), 0, mapped_buffer::pack, + reinterpret_cast(&lbuf), 0); + EXPECT_EQ(lbuf.length(), static_cast(len)); + wait_for_flag(&is_am_received); + EXPECT_TRUE(is_am_received); } -UCS_TEST_P(test_tag, tag_cancel_force) +UCS_TEST_SKIP_COND_P(test_tag, tag_cancel_force, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY); - const size_t length = 128; mapped_buffer recvbuf(length, RECV_SEED, receiver()); recv_ctx r_ctx; @@ -626,10 +618,9 @@ UCS_TEST_P(test_tag, tag_cancel_force) check_rx_completion(r_ctx, false, SEND_SEED); } -UCS_TEST_P(test_tag, tag_cancel_noforce) +UCS_TEST_SKIP_COND_P(test_tag, tag_cancel_noforce, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY); - const size_t length = 128; mapped_buffer recvbuf(length, RECV_SEED, receiver()); recv_ctx r_ctx; @@ -647,21 +638,20 @@ UCS_TEST_P(test_tag, tag_cancel_noforce) EXPECT_EQ(r_ctx.status, UCS_ERR_CANCELED); } -UCS_TEST_P(test_tag, tag_limit, "TM_SYNC_RATIO?=0.0") +UCS_TEST_SKIP_COND_P(test_tag, tag_limit, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY); - const size_t length = 32; - mapped_buffer recvbuf(length, RECV_SEED, receiver()); ucs::ptr_vector rctxs; - recv_ctx *rctx_p; + ucs::ptr_vector rbufs; ucs_status_t status; do { - // Can use the same recv buffer, as no sends will be issued. 
- rctx_p = (new recv_ctx()); - init_recv_ctx(*rctx_p, &recvbuf, 1); + recv_ctx *rctx_p = new recv_ctx(); + mapped_buffer *buf_p = new mapped_buffer(length, RECV_SEED, receiver()); + init_recv_ctx(*rctx_p, buf_p, 1); rctxs.push_back(rctx_p); + rbufs.push_back(buf_p); status = tag_post(receiver(), *rctx_p); // Make sure send resources are acknowledged, as we // awaiting for tag space exhaustion. @@ -680,20 +670,64 @@ UCS_TEST_P(test_tag, tag_limit, "TM_SYNC_RATIO?=0.0") status = tag_post(receiver(), rctxs.at(0)); } while ((ucs_get_time() < deadline) && (status == UCS_ERR_EXCEEDS_LIMIT)); ASSERT_UCS_OK(status); + + // remove posted tags from HW + for (ucs::ptr_vector::const_iterator iter = rctxs.begin(); + iter != rctxs.end() - 1; ++iter) { + ASSERT_UCS_OK(tag_cancel(receiver(), **iter, 1)); + } } -UCS_TEST_P(test_tag, sw_rndv_expected) +UCS_TEST_SKIP_COND_P(test_tag, tag_post_same, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY | UCT_IFACE_FLAG_TAG_RNDV_ZCOPY); + const size_t length = 128; + mapped_buffer recvbuf(length, RECV_SEED, receiver()); + recv_ctx r_ctx; + init_recv_ctx(r_ctx, &recvbuf, 1); + + ASSERT_UCS_OK(tag_post(receiver(), r_ctx)); + + // Can't post the same buffer until it is completed/cancelled + ucs_status_t status = tag_post(receiver(), r_ctx); + EXPECT_EQ(status, UCS_ERR_ALREADY_EXISTS); + + // Cancel with force, should be able to re-post immediately + ASSERT_UCS_OK(tag_cancel(receiver(), r_ctx, 1)); + ASSERT_UCS_OK(tag_post(receiver(), r_ctx)); + + // Cancel without force, should be able to re-post when receive completion + ASSERT_UCS_OK(tag_cancel(receiver(), r_ctx, 0)); + status = tag_post(receiver(), r_ctx); + EXPECT_EQ(status, UCS_ERR_ALREADY_EXISTS); // no completion yet + + wait_for_flag(&r_ctx.comp); // cancel completed, should be able to post + ASSERT_UCS_OK(tag_post(receiver(), r_ctx)); + + // Now send something to trigger rx completion + init_recv_ctx(r_ctx, &recvbuf, 1); // reinit rx to 
clear completed states + mapped_buffer sendbuf(length, SEND_SEED, sender()); + send_ctx s_ctx; + init_send_ctx(s_ctx, &sendbuf, 1, reinterpret_cast(&r_ctx)); + ASSERT_UCS_OK(tag_eager_bcopy(sender(), s_ctx)); + + wait_for_flag(&r_ctx.comp); // message consumed, should be able to post + ASSERT_UCS_OK(tag_post(receiver(), r_ctx)); + + ASSERT_UCS_OK(tag_cancel(receiver(), r_ctx, 1)); +} +UCS_TEST_SKIP_COND_P(test_tag, sw_rndv_expected, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY | + UCT_IFACE_FLAG_TAG_RNDV_ZCOPY)) +{ test_tag_expected(static_cast(&test_tag::tag_rndv_request), sender().iface_attr().cap.tag.rndv.max_hdr, true); } -UCS_TEST_P(test_tag, rndv_limit) +UCS_TEST_SKIP_COND_P(test_tag, rndv_limit, + !check_caps(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY); - mapped_buffer sendbuf(8, SEND_SEED, sender()); ucs::ptr_vector sctxs; ucs_status_t status; @@ -723,16 +757,17 @@ UCS_TEST_P(test_tag, rndv_limit) ucs_log_pop_handler(); } -UCS_TEST_P(test_tag, sw_rndv_unexpected) +UCS_TEST_SKIP_COND_P(test_tag, sw_rndv_unexpected, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY | + UCT_IFACE_FLAG_TAG_RNDV_ZCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY | UCT_IFACE_FLAG_TAG_RNDV_ZCOPY); test_tag_unexpected(static_cast(&test_tag::tag_rndv_request)); } UCT_TAG_INSTANTIATE_TEST_CASE(test_tag) -#if ENABLE_STATS && IBV_HW_TM +#if defined (ENABLE_STATS) && IBV_HW_TM extern "C" { #include #include @@ -793,12 +828,11 @@ class test_tag_stats : public test_tag { } }; -UCS_TEST_P(test_tag_stats, tag_expected_eager) +UCS_TEST_SKIP_COND_P(test_tag_stats, tag_expected_eager, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_SHORT | + UCT_IFACE_FLAG_TAG_EAGER_BCOPY | + UCT_IFACE_FLAG_TAG_EAGER_ZCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_SHORT | - UCT_IFACE_FLAG_TAG_EAGER_BCOPY | - UCT_IFACE_FLAG_TAG_EAGER_ZCOPY); - std::pair > sfuncs[3] = { std::make_pair(static_cast(&test_tag::tag_eager_short), std::make_pair(sender().iface_attr().cap.tag.eager.max_short, @@ 
-822,10 +856,10 @@ UCS_TEST_P(test_tag_stats, tag_expected_eager) } } -UCS_TEST_P(test_tag_stats, tag_unexpected_eager) +UCS_TEST_SKIP_COND_P(test_tag_stats, tag_unexpected_eager, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY | + UCT_IFACE_FLAG_TAG_EAGER_ZCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY | UCT_IFACE_FLAG_TAG_EAGER_ZCOPY); - std::pair > sfuncs[2] = { std::make_pair(static_cast(&test_tag::tag_eager_bcopy), std::make_pair(sender().iface_attr().cap.tag.eager.max_bcopy, @@ -845,9 +879,9 @@ UCS_TEST_P(test_tag_stats, tag_unexpected_eager) } } -UCS_TEST_P(test_tag_stats, tag_list_ops) +UCS_TEST_SKIP_COND_P(test_tag_stats, tag_list_ops, + !check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_EAGER_BCOPY); mapped_buffer recvbuf(32, RECV_SEED, receiver()); recv_ctx rctx; @@ -871,10 +905,10 @@ UCS_TEST_P(test_tag_stats, tag_list_ops) } -UCS_TEST_P(test_tag_stats, tag_rndv) +UCS_TEST_SKIP_COND_P(test_tag_stats, tag_rndv, + !check_caps(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY | + UCT_IFACE_FLAG_TAG_EAGER_BCOPY)) { - check_caps(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY | UCT_IFACE_FLAG_TAG_EAGER_BCOPY); - size_t len = sender().iface_attr().cap.tag.rndv.max_zcopy / 8; // Check UNEXP_RNDV on the receiver @@ -899,3 +933,359 @@ UCS_TEST_P(test_tag_stats, tag_rndv) UCT_TAG_INSTANTIATE_TEST_CASE(test_tag_stats) #endif + + +#if IBV_HW_TM + +extern "C" { +#include +} + +// TODO: Unite with test_tag + add GRH testing for DC +class test_tag_mp_xrq : public uct_test { +public: + static const uint64_t SEND_SEED = 0xa1a1a1a1a1a1a1a1ul; + static const uint64_t AM_ID = 1; + typedef void (test_tag_mp_xrq::*send_func)(mapped_buffer*); + + virtual void init(); + test_tag_mp_xrq(); + uct_rc_mlx5_iface_common_t* rc_mlx5_iface(entity &e); + void send_eager_bcopy(mapped_buffer *buf); + void send_eager_zcopy(mapped_buffer *buf); + void send_rndv_zcopy(mapped_buffer *buf); + void send_rndv_request(mapped_buffer *buf); + void send_am_bcopy(mapped_buffer *buf); + void 
test_common(send_func sfunc, size_t num_segs, size_t exp_segs = 1, + bool is_eager = true); + + static ucs_status_t am_handler(void *arg, void *data, size_t length, + unsigned flags); + + static ucs_status_t unexp_eager(void *arg, void *data, size_t length, + unsigned flags, uct_tag_t stag, + uint64_t imm, void **context); + + static ucs_status_t unexp_rndv(void *arg, unsigned flags, uint64_t stag, + const void *header, unsigned header_length, + uint64_t remote_addr, size_t length, + const void *rkey_buf); + +protected: + static size_t m_rx_counter; + std::vector m_uct_descs; + bool m_hold_uct_desc; + + uct_test::entity& sender() { + return **m_entities.begin(); + } + + uct_test::entity& receiver() { + return **(m_entities.end() - 1); + } + +private: + ucs_status_t unexp_handler(void *data, unsigned flags, uint64_t imm, + void **context); + ucs_status_t handle_uct_desc(void *data, unsigned flags); + void set_env_var_or_skip(void *config, const char *var, const char *val); + size_t m_max_hdr; + bool m_first_received; + bool m_last_received; + uct_completion_t m_uct_comp; +}; + +size_t test_tag_mp_xrq::m_rx_counter = 0; + +test_tag_mp_xrq::test_tag_mp_xrq() : m_hold_uct_desc(false), + m_first_received(false), + m_last_received(false) +{ + m_max_hdr = sizeof(ibv_tmh) + sizeof(ibv_rvh); + m_uct_comp.count = 512; // We do not need completion func to be invoked + m_uct_comp.func = NULL; +} + +uct_rc_mlx5_iface_common_t* test_tag_mp_xrq::rc_mlx5_iface(entity &e) +{ + return ucs_derived_of(e.iface(), uct_rc_mlx5_iface_common_t); +} + +void test_tag_mp_xrq::set_env_var_or_skip(void *config, const char *var, + const char *val) +{ + ucs_status_t status = uct_config_modify(config, var, val); + if (status != UCS_OK) { + ucs_warn("%s", ucs_status_string(status)); + UCS_TEST_SKIP_R(std::string("Can't set ") + var); + } +} + +void test_tag_mp_xrq::init() +{ + set_env_var_or_skip(m_iface_config, "RC_TM_ENABLE", "y"); + set_env_var_or_skip(m_iface_config, "RC_TM_MP_SRQ_ENABLE", 
"try"); + set_env_var_or_skip(m_iface_config, "RC_TM_MP_NUM_STRIDES", "8"); + set_env_var_or_skip(m_md_config, "MLX5_DEVX_OBJECTS", "dct,dcsrq,rcsrq,rcqp"); + + uct_test::init(); + + entity *sender = uct_test::create_entity(0ul, NULL, unexp_eager, unexp_rndv, + reinterpret_cast(this), + reinterpret_cast(this)); + m_entities.push_back(sender); + + entity *receiver = uct_test::create_entity(0ul, NULL, unexp_eager, unexp_rndv, + reinterpret_cast(this), + reinterpret_cast(this)); + m_entities.push_back(receiver); + + if (!UCT_RC_MLX5_MP_ENABLED(rc_mlx5_iface(test_tag_mp_xrq::sender()))) { + UCS_TEST_SKIP_R("No MP XRQ support"); + } + + sender->connect(0, *receiver, 0); + + uct_iface_set_am_handler(receiver->iface(), AM_ID, am_handler, this, 0); +} + +void test_tag_mp_xrq::send_eager_bcopy(mapped_buffer *buf) +{ + ssize_t len = uct_ep_tag_eager_bcopy(sender().ep(0), 0x11, + reinterpret_cast(this), + mapped_buffer::pack, + reinterpret_cast(buf), 0); + + EXPECT_EQ(buf->length(), static_cast(len)); +} + +void test_tag_mp_xrq::send_eager_zcopy(mapped_buffer *buf) +{ + UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, buf->ptr(), buf->length(), buf->memh(), + sender().iface_attr().cap.tag.eager.max_iov); + + ucs_status_t status = uct_ep_tag_eager_zcopy(sender().ep(0), 0x11, + reinterpret_cast(this), + iov, iovcnt, 0, &m_uct_comp); + ASSERT_UCS_OK_OR_INPROGRESS(status); +} + +void test_tag_mp_xrq::send_rndv_zcopy(mapped_buffer *buf) +{ + UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, buf->ptr(), buf->length(), buf->memh(), + sender().iface_attr().cap.tag.rndv.max_iov); + + uint64_t dummy_hdr = 0xFAFA; + ucs_status_ptr_t rndv_op = uct_ep_tag_rndv_zcopy(sender().ep(0), 0x11, &dummy_hdr, + sizeof(dummy_hdr), iov, + iovcnt, 0, &m_uct_comp); + ASSERT_FALSE(UCS_PTR_IS_ERR(rndv_op)); + + // There will be no real RNDV performed, cancel the op to avoid mpool + // warning on exit + ASSERT_UCS_OK(uct_ep_tag_rndv_cancel(sender().ep(0),rndv_op)); +} + +void test_tag_mp_xrq::send_rndv_request(mapped_buffer *buf) 
+{ + size_t size = sender().iface_attr().cap.tag.rndv.max_hdr; + void *hdr = alloca(size); + + ASSERT_UCS_OK(uct_ep_tag_rndv_request(sender().ep(0), 0x11, hdr, size, 0)); +} + +void test_tag_mp_xrq::send_am_bcopy(mapped_buffer *buf) +{ + ssize_t len = uct_ep_am_bcopy(sender().ep(0), AM_ID, mapped_buffer::pack, + reinterpret_cast(buf), 0); + + EXPECT_EQ(buf->length(), static_cast(len)); +} + +void test_tag_mp_xrq::test_common(send_func sfunc, size_t num_segs, + size_t exp_segs, bool is_eager) +{ + size_t seg_size = rc_mlx5_iface(sender())->super.super.config.seg_size; + size_t seg_num = is_eager ? num_segs : 1; + size_t exp_val = is_eager ? exp_segs : 1; + size_t size = (seg_size * seg_num) - m_max_hdr; + m_rx_counter = 0; + m_first_received = m_last_received = false; + + EXPECT_TRUE(size <= sender().iface_attr().cap.tag.eager.max_bcopy); + mapped_buffer buf(size, SEND_SEED, sender()); + + (this->*sfunc)(&buf); + + wait_for_value(&m_rx_counter, exp_val, true); + EXPECT_EQ(exp_val, m_rx_counter); + EXPECT_EQ(is_eager, m_first_received); // relevant for eager only + EXPECT_EQ(is_eager, m_last_received); // relevant for eager only +} + +ucs_status_t test_tag_mp_xrq::handle_uct_desc(void *data, unsigned flags) +{ + if ((flags & UCT_CB_PARAM_FLAG_DESC) && m_hold_uct_desc) { + m_uct_descs.push_back(data); + return UCS_INPROGRESS; + } + + return UCS_OK; +} + +ucs_status_t test_tag_mp_xrq::am_handler(void *arg, void *data, size_t length, + unsigned flags) +{ + EXPECT_TRUE(flags & UCT_CB_PARAM_FLAG_FIRST); + EXPECT_FALSE(flags & UCT_CB_PARAM_FLAG_MORE); + + m_rx_counter++; + + test_tag_mp_xrq *self = reinterpret_cast(arg); + return self->handle_uct_desc(data, flags); +} + +ucs_status_t test_tag_mp_xrq::unexp_handler(void *data, unsigned flags, + uint64_t imm, void **context) +{ + void *self = reinterpret_cast(this); + + if (flags & UCT_CB_PARAM_FLAG_FIRST) { + // Set the message context which will be passed back with the rest of + // message fragments + *context = self; + 
m_first_received = true; + + } else { + // Check that the correct message context is passed with all fragments + EXPECT_EQ(self, *context); + } + + if (!(flags & UCT_CB_PARAM_FLAG_MORE)) { + // Last message should contain valid immediate value + EXPECT_EQ(reinterpret_cast(this), imm); + m_last_received = true; + } else { + // Immediate value is passed with the last message only + EXPECT_EQ(0ul, imm); + } + + + return handle_uct_desc(data, flags); +} + +ucs_status_t test_tag_mp_xrq::unexp_eager(void *arg, void *data, size_t length, + unsigned flags, uct_tag_t stag, + uint64_t imm, void **context) +{ + test_tag_mp_xrq *self = reinterpret_cast(arg); + + m_rx_counter++; + + return self->unexp_handler(data, flags, imm, context); +} + +ucs_status_t test_tag_mp_xrq::unexp_rndv(void *arg, unsigned flags, + uint64_t stag, const void *header, + unsigned header_length, + uint64_t remote_addr, size_t length, + const void *rkey_buf) +{ + EXPECT_FALSE(flags & UCT_CB_PARAM_FLAG_FIRST); + EXPECT_FALSE(flags & UCT_CB_PARAM_FLAG_MORE); + + m_rx_counter++; + + return UCS_OK; +} + +UCS_TEST_P(test_tag_mp_xrq, config) +{ + uct_rc_mlx5_iface_common_t *iface = rc_mlx5_iface(sender()); + + // MP XRQ is supported with tag offload only + EXPECT_TRUE(UCT_RC_MLX5_TM_ENABLED(iface)); + + // With MP XRQ segment size should be equal to MTU, because HW generates + // CQE per each received MTU + size_t mtu = uct_ib_mtu_value(uct_ib_iface_port_attr(&(iface)->super.super)->active_mtu); + EXPECT_EQ(mtu, iface->super.super.config.seg_size); + + const uct_iface_attr *attrs = &sender().iface_attr(); + + // Max tag bcopy is limited by tag tx memory pool + EXPECT_EQ(iface->tm.max_bcopy - sizeof(ibv_tmh), + attrs->cap.tag.eager.max_bcopy); + EXPECT_GT(attrs->cap.tag.eager.max_bcopy, + iface->super.super.config.seg_size); + + // Max tag zcopy is limited by maximal IB message size + EXPECT_EQ(uct_ib_iface_port_attr(&iface->super.super)->max_msg_sz - sizeof(ibv_tmh), + attrs->cap.tag.eager.max_zcopy); + + // 
Maximal AM size should not exceed segment size, so it would always + // arrive in one-fragment packet (with header it should be strictly less) + EXPECT_LT(attrs->cap.am.max_bcopy, iface->super.super.config.seg_size); + EXPECT_LT(attrs->cap.am.max_zcopy, iface->super.super.config.seg_size); +} + +UCS_TEST_P(test_tag_mp_xrq, desc_release) +{ + m_hold_uct_desc = true; // We want to "hold" UCT memory descriptors + std::pair sfuncs[5] = { + std::make_pair(&test_tag_mp_xrq::send_eager_bcopy, true), + std::make_pair(&test_tag_mp_xrq::send_eager_zcopy, true), + std::make_pair(&test_tag_mp_xrq::send_rndv_zcopy, false), + std::make_pair(&test_tag_mp_xrq::send_rndv_request, false), + std::make_pair(&test_tag_mp_xrq::send_am_bcopy, false) + }; + + for (int i = 0; i < 5; ++i) { + test_common(sfuncs[i].first, 3, 3, sfuncs[i].second); + } + + for (ucs::ptr_vector::const_iterator iter = m_uct_descs.begin(); + iter != m_uct_descs.end(); ++iter) + { + uct_iface_release_desc(*iter); + } +} + +UCS_TEST_P(test_tag_mp_xrq, am) +{ + test_common(&test_tag_mp_xrq::send_am_bcopy, 1, 1, false); +} + +UCS_TEST_P(test_tag_mp_xrq, bcopy_eager_only) +{ + test_common(&test_tag_mp_xrq::send_eager_bcopy, 1); +} + +UCS_TEST_P(test_tag_mp_xrq, zcopy_eager_only) +{ + test_common(&test_tag_mp_xrq::send_eager_zcopy, 1); +} + +UCS_TEST_P(test_tag_mp_xrq, bcopy_eager) +{ + test_common(&test_tag_mp_xrq::send_eager_bcopy, 5, 5); +} + +UCS_TEST_P(test_tag_mp_xrq, zcopy_eager) +{ + test_common(&test_tag_mp_xrq::send_eager_zcopy, 5, 5); +} + +UCS_TEST_P(test_tag_mp_xrq, rndv_zcopy) +{ + test_common(&test_tag_mp_xrq::send_rndv_zcopy, 1, 1, false); +} + +UCS_TEST_P(test_tag_mp_xrq, rndv_request) +{ + test_common(&test_tag_mp_xrq::send_rndv_request, 1, 1, false); +} + +UCT_TAG_INSTANTIATE_TEST_CASE(test_tag_mp_xrq) + +#endif diff --git a/test/gtest/uct/test_uct_ep.cc b/test/gtest/uct/test_uct_ep.cc index a642c8ebb64..81ba4079e7b 100644 --- a/test/gtest/uct/test_uct_ep.cc +++ b/test/gtest/uct/test_uct_ep.cc @@ 
-9,6 +9,7 @@ extern "C" { } #include "uct_test.h" + class test_uct_ep : public uct_test { protected: @@ -17,6 +18,8 @@ class test_uct_ep : public uct_test { m_sender = uct_test::create_entity(0); m_entities.push_back(m_sender); + check_skip_test(); + m_receiver = uct_test::create_entity(0); m_entities.push_back(m_receiver); @@ -38,21 +41,22 @@ class test_uct_ep : public uct_test { m_sender->destroy_ep(0); } + bool skip_on_ib_dc() { +#ifdef HAVE_DC_DV /* skip due to DCI stuck bug */ + return has_transport("dc_mlx5"); +#else + return false; +#endif + } + entity * m_sender; entity * m_receiver; }; -UCS_TEST_P(test_uct_ep, disconnect_after_send) { +UCS_TEST_SKIP_COND_P(test_uct_ep, disconnect_after_send, + (!check_caps(UCT_IFACE_FLAG_AM_ZCOPY) || + skip_on_ib_dc())) { ucs_status_t status; - unsigned count; - -#if HAVE_DC_DV - if (GetParam()->tl_name.compare("dc_mlx5") == 0) { - UCS_TEST_SKIP_R("DCI stuck bug"); - } -#endif - - check_caps(UCT_IFACE_FLAG_AM_ZCOPY); mapped_buffer buffer(256, 0, *m_sender); UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, buffer.ptr(), @@ -60,10 +64,10 @@ UCS_TEST_P(test_uct_ep, disconnect_after_send) { buffer.memh(), m_sender->iface_attr().cap.am.max_iov); - for (int i = 0; i < 300 / ucs::test_time_multiplier(); ++i) { + unsigned max_iter = 300 / ucs::test_time_multiplier(); + for (unsigned i = 0; i < max_iter; ++i) { connect(); - count = 0; - for (;;) { + for (unsigned count = 0; count < max_iter; ) { status = uct_ep_am_zcopy(m_sender->ep(0), 1, NULL, 0, iov, iovcnt, 0, NULL); if (status == UCS_ERR_NO_RESOURCE) { diff --git a/test/gtest/uct/test_uct_perf.cc b/test/gtest/uct/test_uct_perf.cc index fdcb46f6e39..81e8dde05e6 100644 --- a/test/gtest/uct/test_uct_perf.cc +++ b/test/gtest/uct/test_uct_perf.cc @@ -18,110 +18,116 @@ extern "C" { class test_uct_perf : public uct_test, public test_perf { protected: - static test_spec tests[]; + const static test_spec tests[]; }; -test_perf::test_spec test_uct_perf::tests[] = +const test_perf::test_spec 
test_uct_perf::tests[] = { { "am latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG, - UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000l, + UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 2.5, 0 }, { "am rate", "Mpps", UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 2000000l, + UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.8, 80.0, 0 }, { "am rate64", "Mpps", UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 64 }, 1, 2000000l, + UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 64 }, 1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.8, 80.0, 0 }, { "am bcopy latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG, - UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 8 }, 1, 100000l, + UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 2.5}, { "am bcopy bw", "MB/sec", UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 1000 }, 1, 100000l, + UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 1000 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 620.0, 15000.0, 0 }, { "am zcopy bw", "MB/sec", UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 1000 }, 32, 100000l, + UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 1000 }, 32, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 600.0, 15000.0, 0 }, + { "am zcopy bw flush ep", "MB/sec", + UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 1000 }, 32, 100000lu, + ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 600.0, 15000.0, + 
UCX_PERF_TEST_FLAG_FLUSH_EP }, + { "put latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG, - UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000l, + UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 1.5, 0 }, { "put rate", "Mpps", UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 2000000l, + UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.8, 80.0, 0 }, { "put bcopy bw", "MB/sec", UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 2048 }, 1, 100000l, + UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 2048 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 620.0, 50000.0, 0 }, { "put zcopy bw", "MB/sec", UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 2048 }, 32, 100000l, + UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 2048 }, 32, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 620.0, 50000.0, 0 }, { "get latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_GET, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 8 }, 1, 100000l, + UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5, 0 }, { "atomic add latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_PINGPONG, - UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000l, + UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5, 0 }, { "atomic add rate", "Mpps", UCX_PERF_API_UCT, UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 2000000l, + UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 2000000lu, ucs_offsetof(ucx_perf_result_t, 
msgrate.total_average), 1e-6, 0.5, 50.0, 0 }, { "atomic fadd latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_FADD, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000l, + UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5, 0 }, { "atomic cswap latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_CSWAP, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000l, + UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5, 0 }, { "atomic swap latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_SWAP, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000l, + UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5, 0 }, { "am iov bw", "MB/sec", UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, - UCT_PERF_DATA_LAYOUT_ZCOPY, 8192, 3, { 256, 256, 512 }, 32, 100000l, + UCT_PERF_DATA_LAYOUT_ZCOPY, 8192, 3, { 256, 256, 512 }, 32, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 600.0, 15000.0, 0 }, @@ -130,18 +136,19 @@ test_perf::test_spec test_uct_perf::tests[] = UCS_TEST_P(test_uct_perf, envelope) { - bool check_perf; - - if (GetParam()->tl_name == "cm" || GetParam()->tl_name == "ugni_udt" || GetParam()->tl_name == "cuda_ipc") { + if (has_transport("cm") || + has_transport("ugni_udt")) { UCS_TEST_SKIP; } /* For SandyBridge CPUs, don't check performance of far-socket devices */ std::vector cpus = get_affinity(); - check_perf = true; + bool check_perf = true; + size_t max_iter = std::numeric_limits::max(); + if (ucs_arch_get_cpu_model() == UCS_CPU_MODEL_INTEL_SANDYBRIDGE) { for (std::vector::iterator iter = cpus.begin(); iter != cpus.end(); ++iter) { - if (!CPU_ISSET(*iter, &GetParam()->local_cpus)) { + if (!ucs_cpu_is_set(*iter, 
&GetParam()->local_cpus)) { UCS_TEST_MESSAGE << "Not enforcing performance on SandyBridge far socket"; check_perf = false; break; @@ -149,20 +156,24 @@ UCS_TEST_P(test_uct_perf, envelope) { } } - if (GetParam()->tl_name == "tcp") { + if (has_transport("tcp")) { check_perf = false; /* TODO calibrate expected performance based on transport */ + max_iter = 1000lu; } /* Run all tests */ - for (test_spec *test = tests; test->title != NULL; ++test) { + for (const test_spec *test_iter = tests; test_iter->title != NULL; ++test_iter) { + test_spec test = *test_iter; + if (ucs_arch_get_cpu_model() == UCS_CPU_MODEL_ARM_AARCH64) { - test->max *= UCT_ARM_PERF_TEST_MULTIPLIER; - test->min /= UCT_ARM_PERF_TEST_MULTIPLIER; + test.max *= UCT_ARM_PERF_TEST_MULTIPLIER; + test.min /= UCT_ARM_PERF_TEST_MULTIPLIER; } else { - test->max *= UCT_PERF_TEST_MULTIPLIER; - test->min /= UCT_PERF_TEST_MULTIPLIER; + test.max *= UCT_PERF_TEST_MULTIPLIER; + test.min /= UCT_PERF_TEST_MULTIPLIER; } - run_test(*test, 0, check_perf, GetParam()->tl_name, GetParam()->dev_name); + test.iters = ucs_min(test.iters, max_iter); + run_test(test, 0, check_perf, GetParam()->tl_name, GetParam()->dev_name); } } diff --git a/test/gtest/uct/test_zcopy_comp.cc b/test/gtest/uct/test_zcopy_comp.cc index be8f901a8ec..f53ad9f75b2 100644 --- a/test/gtest/uct/test_zcopy_comp.cc +++ b/test/gtest/uct/test_zcopy_comp.cc @@ -8,37 +8,44 @@ class test_zcopy_comp : public uct_test { + void init() { + uct_test::init(); + + m_sender = create_entity(0); + m_entities.push_back(m_sender); + + check_skip_test(); + } + +protected: + entity *m_sender; }; -UCS_TEST_P(test_zcopy_comp, issue1440) +UCS_TEST_SKIP_COND_P(test_zcopy_comp, issue1440, + !check_caps(UCT_IFACE_FLAG_PUT_ZCOPY)) { - entity *sender = create_entity(0); - m_entities.push_back(sender); - entity *receiver_small = create_entity(0); m_entities.push_back(receiver_small); entity *receiver_large = create_entity(0); m_entities.push_back(receiver_large); - sender->connect(0, 
*receiver_small, 0); - sender->connect(1, *receiver_large, 0); - - check_caps(UCT_IFACE_FLAG_PUT_ZCOPY); + m_sender->connect(0, *receiver_small, 0); + m_sender->connect(1, *receiver_large, 0); - size_t size_small = ucs_max(8ul, sender->iface_attr().cap.put.min_zcopy); - size_t size_large = ucs_min(65536ul, sender->iface_attr().cap.put.max_zcopy); + size_t size_small = ucs_max(8ul, m_sender->iface_attr().cap.put.min_zcopy); + size_t size_large = ucs_min(65536ul, m_sender->iface_attr().cap.put.max_zcopy); ucs_assert(size_large > size_small); - if (sender->md_attr().cap.mem_type != UCT_MD_MEM_TYPE_HOST) { + if (m_sender->md_attr().cap.access_mem_type != UCS_MEMORY_TYPE_HOST) { std::stringstream ss; ss << "test_zcopy_comp is not supported by " << GetParam(); UCS_TEST_SKIP_R(ss.str()); } - mapped_buffer sendbuf_small(size_small, 0, *sender); - mapped_buffer sendbuf_large(size_large, 0, *sender); + mapped_buffer sendbuf_small(size_small, 0, *m_sender); + mapped_buffer sendbuf_large(size_large, 0, *m_sender); mapped_buffer recvbuf_small(size_small, 0, *receiver_small); mapped_buffer recvbuf_large(size_large, 0, *receiver_large); @@ -52,7 +59,7 @@ UCS_TEST_P(test_zcopy_comp, issue1440) while (num_small_sends || num_large_sends) { if (num_small_sends) { ucs_status_t status; - status = uct_ep_put_zcopy(sender->ep(0), sendbuf_small.iov(), 1, + status = uct_ep_put_zcopy(m_sender->ep(0), sendbuf_small.iov(), 1, (uintptr_t)recvbuf_small.ptr(), recvbuf_small.rkey(), &dummy_comp); if ((status == UCS_OK) || (status == UCS_INPROGRESS)) { @@ -61,7 +68,7 @@ UCS_TEST_P(test_zcopy_comp, issue1440) } if (num_large_sends) { ucs_status_t status; - status = uct_ep_put_zcopy(sender->ep(1), sendbuf_large.iov(), 1, + status = uct_ep_put_zcopy(m_sender->ep(1), sendbuf_large.iov(), 1, (uintptr_t)recvbuf_large.ptr(), recvbuf_large.rkey(), &dummy_comp); if ((status == UCS_OK) || (status == UCS_INPROGRESS)) { @@ -71,7 +78,10 @@ UCS_TEST_P(test_zcopy_comp, issue1440) progress(); } - sender->flush(); 
+ /* Call flush on local and remote ifaces to progress data + * (e.g. if call flush only on local iface, a target side may + * not be able to send PUT ACK to an initiator in case of TCP) */ + flush(); } diff --git a/test/gtest/uct/uct_p2p_test.cc b/test/gtest/uct/uct_p2p_test.cc index 71789b105c3..0c7afe342cb 100644 --- a/test/gtest/uct/uct_p2p_test.cc +++ b/test/gtest/uct/uct_p2p_test.cc @@ -31,12 +31,7 @@ std::vector uct_p2p_test::enum_resources(const std::string& tl_ if (all_resources.empty()) { std::vector r = uct_test::enum_resources(""); for (std::vector::iterator iter = r.begin(); iter != r.end(); ++iter) { - p2p_resource res; - res.md_name = (*iter)->md_name; - res.local_cpus = (*iter)->local_cpus; - res.tl_name = (*iter)->tl_name; - res.dev_name = (*iter)->dev_name; - res.dev_type = (*iter)->dev_type; + p2p_resource res(**iter); if (UCT_DEVICE_TYPE_SELF != res.dev_type) { res.loopback = false; @@ -70,14 +65,14 @@ void uct_p2p_test::init() { ucs_assert_always(r != NULL); /* Create 2 connected endpoints */ + entity *e1 = uct_test::create_entity(m_rx_headroom, m_err_handler); + m_entities.push_back(e1); + + check_skip_test(); + if (r->loopback) { - entity *e = uct_test::create_entity(m_rx_headroom, m_err_handler); - m_entities.push_back(e); - e->connect(0, *e, 0); + e1->connect(0, *e1, 0); } else { - entity *e1 = uct_test::create_entity(m_rx_headroom, m_err_handler); - m_entities.push_back(e1); - entity *e2 = uct_test::create_entity(m_rx_headroom, m_err_handler); m_entities.push_back(e2); @@ -98,13 +93,15 @@ void uct_p2p_test::cleanup() { } void uct_p2p_test::test_xfer(send_func_t send, size_t length, unsigned flags, - uct_memory_type_t mem_type) { + ucs_memory_type_t mem_type) { UCS_TEST_SKIP; } ucs_log_func_rc_t uct_p2p_test::log_handler(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *message, va_list ap) + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) { 
if (level == UCS_LOG_LEVEL_TRACE_DATA) { ++log_data_count; @@ -117,7 +114,7 @@ uct_p2p_test::log_handler(const char *file, unsigned line, const char *function, template void uct_p2p_test::test_xfer_print(O& os, send_func_t send, size_t length, - unsigned flags, uct_memory_type_t mem_type) + unsigned flags, ucs_memory_type_t mem_type) { if (!ucs_log_is_enabled(UCS_LOG_LEVEL_TRACE_DATA)) { os << ucs::size_value(length) << " " << std::flush; @@ -129,13 +126,13 @@ void uct_p2p_test::test_xfer_print(O& os, send_func_t send, size_t length, */ int count_before = log_data_count; ucs_log_push_handler(log_handler); - orig_log_level = ucs_global_opts.log_level; - ucs_global_opts.log_level = UCS_LOG_LEVEL_TRACE_DATA; + orig_log_level = ucs_global_opts.log_component.log_level; + ucs_global_opts.log_component.log_level = UCS_LOG_LEVEL_TRACE_DATA; bool expect_log = ucs_log_is_enabled(UCS_LOG_LEVEL_TRACE_DATA); UCS_TEST_SCOPE_EXIT() { /* Restore logging */ - ucs_global_opts.log_level = orig_log_level; + ucs_global_opts.log_component.log_level = orig_log_level; ucs_log_pop_handler(); } UCS_TEST_SCOPE_EXIT_END @@ -150,35 +147,35 @@ void uct_p2p_test::test_xfer_multi(send_func_t send, size_t min_length, size_t max_length, unsigned flags) { - for (int mem_type = 0; mem_type < UCT_MD_MEM_TYPE_LAST; mem_type++) { + for (int mem_type = 0; mem_type < UCS_MEMORY_TYPE_LAST; mem_type++) { /* test mem type if md supports mem type * (or) if HOST MD can register mem type */ - if (!((sender().md_attr().cap.mem_type == mem_type) || - (sender().md_attr().cap.mem_type == UCT_MD_MEM_TYPE_HOST && + if (!((sender().md_attr().cap.access_mem_type == mem_type) || + (sender().md_attr().cap.access_mem_type == UCS_MEMORY_TYPE_HOST && sender().md_attr().cap.reg_mem_types & UCS_BIT(mem_type)))) { continue; } - if (mem_type == UCT_MD_MEM_TYPE_CUDA) { + if (mem_type == UCS_MEMORY_TYPE_CUDA) { if (!(flags & (TEST_UCT_FLAG_RECV_ZCOPY | TEST_UCT_FLAG_SEND_ZCOPY))) { continue; } } test_xfer_multi_mem_type(send, 
min_length, max_length, flags, - (uct_memory_type_t) mem_type); + (ucs_memory_type_t) mem_type); } } void uct_p2p_test::test_xfer_multi_mem_type(send_func_t send, size_t min_length, size_t max_length, unsigned flags, - uct_memory_type_t mem_type) { + ucs_memory_type_t mem_type) { ucs::detail::message_stream ms("INFO"); - ms << "memory_type:" << uct_test::uct_mem_type_names[mem_type] << " " << std::flush; + ms << "memory_type:" << ucs_memory_type_names[mem_type] << " " << std::flush; - /* Trim at 4GB */ - max_length = ucs_min(max_length, 4ull * 1124 * 1024 * 1024); + /* Trim at 4.1 GB */ + max_length = ucs_min(max_length, (size_t)(4.1 * (double)UCS_GBYTE)); /* Trim at max. phys memory */ max_length = ucs_min(max_length, ucs_get_phys_mem_size() / 8); @@ -190,10 +187,10 @@ void uct_p2p_test::test_xfer_multi_mem_type(send_func_t send, size_t min_length, max_length = ucs_min(max_length, ucs_get_memfree_size() / 4); /* For large size, slow down if needed */ - if (max_length > 1 * 1024 * 1024) { + if (max_length > UCS_MBYTE) { max_length = max_length / ucs::test_time_multiplier(); if (RUNNING_ON_VALGRIND) { - max_length = ucs_min(max_length, 20u * 1024 * 1024); + max_length = ucs_min(max_length, 20u * UCS_MBYTE); } } @@ -215,7 +212,7 @@ void uct_p2p_test::test_xfer_multi_mem_type(send_func_t send, size_t min_length, /* How many times to repeat */ int repeat_count; - repeat_count = (256 * 1024) / ((max_length + min_length) / 2); + repeat_count = (256 * UCS_KBYTE) / ((max_length + min_length) / 2); if (repeat_count > 1000) { repeat_count = 1000; } @@ -252,12 +249,23 @@ void uct_p2p_test::blocking_send(send_func_t send, uct_ep_h ep, { unsigned prev_comp_count = m_completion_count; + ucs_assert(m_completion.uct.count == 0); + ucs_status_t status; do { + if (!m_null_completion) { + ++m_completion.uct.count; + } status = (this->*send)(ep, sendbuf, recvbuf); if (status == UCS_OK) { + if (!m_null_completion) { + --m_completion.uct.count; + } return; } else if (status == 
UCS_ERR_NO_RESOURCE) { + if (!m_null_completion) { + --m_completion.uct.count; + } progress(); } else if (status == UCS_INPROGRESS) { break; @@ -271,10 +279,12 @@ void uct_p2p_test::blocking_send(send_func_t send, uct_ep_h ep, if (wait_for_completion) { if (comp() == NULL) { /* implicit non-blocking mode */ - sender().flush(); + /* Call flush on local and remote ifaces to progress data + * (e.g. if call flush only on local iface, a target side may + * not be able to send PUT ACK to an initiator in case of TCP) */ + flush(); } else { /* explicit non-blocking mode */ - ++m_completion.uct.count; while (m_completion_count <= prev_comp_count) { progress(); } @@ -284,7 +294,10 @@ void uct_p2p_test::blocking_send(send_func_t send, uct_ep_h ep, } void uct_p2p_test::wait_for_remote() { - sender().flush(); + /* Call flush on local and remote ifaces to progress data + * (e.g. if call flush only on local iface, a target side may + * not be able to send PUT ACK to an initiator in case of TCP) */ + flush(); } uct_test::entity& uct_p2p_test::sender() { diff --git a/test/gtest/uct/uct_p2p_test.h b/test/gtest/uct/uct_p2p_test.h index aa212bd5e39..10cea91bba8 100644 --- a/test/gtest/uct/uct_p2p_test.h +++ b/test/gtest/uct/uct_p2p_test.h @@ -41,14 +41,19 @@ class uct_p2p_test : public uct_test { struct p2p_resource : public resource { virtual std::string name() const; bool loopback; + + p2p_resource(const resource& res) : + resource(res.component, res.md_name, res.local_cpus, + res.tl_name, res.dev_name, res.dev_type), + loopback(false) { } }; virtual void test_xfer(send_func_t send, size_t length, unsigned flags, - uct_memory_type_t mem_type); + ucs_memory_type_t mem_type); void test_xfer_multi(send_func_t send, size_t min_length, size_t max_length, unsigned flags); void test_xfer_multi_mem_type(send_func_t send, size_t min_length, size_t max_length, - unsigned flags, uct_memory_type_t mem_type); + unsigned flags, ucs_memory_type_t mem_type); void blocking_send(send_func_t send, 
uct_ep_h ep, const mapped_buffer &sendbuf, const mapped_buffer &recvbuf, bool wait_for_completion); void wait_for_remote(); @@ -60,13 +65,15 @@ class uct_p2p_test : public uct_test { private: template void test_xfer_print(O& os, send_func_t send, size_t length, - unsigned flags, uct_memory_type_t mem_type); + unsigned flags, ucs_memory_type_t mem_type); static void completion_cb(uct_completion_t *self, ucs_status_t status); static ucs_log_func_rc_t log_handler(const char *file, unsigned line, const char *function, - ucs_log_level_t level, const char *prefix, va_list ap); + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *prefix, va_list ap); static int log_data_count; static ucs_log_level_t orig_log_level; diff --git a/test/gtest/uct/uct_test.cc b/test/gtest/uct/uct_test.cc index 73084a2329a..326e59bd650 100644 --- a/test/gtest/uct/uct_test.cc +++ b/test/gtest/uct/uct_test.cc @@ -8,6 +8,7 @@ #include "uct/api/uct_def.h" #include +#include #include #include #include @@ -15,59 +16,193 @@ #include -#define FOR_EACH_ENTITY(_iter) \ - for (ucs::ptr_vector::const_iterator _iter = m_entities.begin(); \ - _iter != m_entities.end(); ++_iter) \ - - std::string resource::name() const { std::stringstream ss; ss << tl_name << "/" << dev_name; + if (!variant_name.empty()) { + ss << "/" << variant_name; + } return ss.str(); } -const char *uct_test::uct_mem_type_names[] = {"host", "cuda"}; +resource::resource() : component(NULL), md_name(""), tl_name(""), dev_name(""), + variant_name(""), dev_type(UCT_DEVICE_TYPE_LAST), + variant(DEFAULT_VARIANT) +{ + CPU_ZERO(&local_cpus); +} + +resource::resource(uct_component_h component, const std::string& md_name, + const ucs_cpu_set_t& local_cpus, const std::string& tl_name, + const std::string& dev_name, uct_device_type_t dev_type) : + component(component), md_name(md_name), local_cpus(local_cpus), + tl_name(tl_name), dev_name(dev_name), variant_name(""), + dev_type(dev_type), variant(DEFAULT_VARIANT) +{ 
+} + +resource::resource(uct_component_h component, const uct_md_attr_t& md_attr, + const uct_md_resource_desc_t& md_resource, + const uct_tl_resource_desc_t& tl_resource) : + component(component), + md_name(md_resource.md_name), + local_cpus(md_attr.local_cpus), + tl_name(tl_resource.tl_name), + dev_name(tl_resource.dev_name), + variant_name(""), + dev_type(tl_resource.dev_type), + variant(DEFAULT_VARIANT) +{ +} + +resource_speed::resource_speed(uct_component_h component, const uct_worker_h& worker, + const uct_md_h& md, const uct_md_attr_t& md_attr, + const uct_md_resource_desc_t& md_resource, + const uct_tl_resource_desc_t& tl_resource) : + resource(component, md_attr, md_resource, + tl_resource) { + ucs_status_t status; + uct_iface_params_t iface_params = { 0 }; + uct_iface_config_t *iface_config; + uct_iface_attr_t iface_attr; + uct_iface_h iface; + + status = uct_md_iface_config_read(md, tl_name.c_str(), NULL, + NULL, &iface_config); + ASSERT_UCS_OK(status); + + iface_params.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE | + UCT_IFACE_PARAM_FIELD_DEVICE; + iface_params.open_mode = UCT_IFACE_OPEN_MODE_DEVICE; + iface_params.mode.device.tl_name = tl_name.c_str(); + iface_params.mode.device.dev_name = dev_name.c_str(); + + status = uct_iface_open(md, worker, &iface_params, iface_config, &iface); + ASSERT_UCS_OK(status); + + status = uct_iface_query(iface, &iface_attr); + ASSERT_UCS_OK(status); + + bw = ucs_max(iface_attr.bandwidth.dedicated, iface_attr.bandwidth.shared); + + uct_iface_close(iface); + uct_config_release(iface_config); +} + +std::vector uct_test_base::enum_md_resources() { + + static std::vector all_md_resources; + + if (all_md_resources.empty()) { + uct_component_h *uct_components; + unsigned num_components; + ucs_status_t status; + + status = uct_query_components(&uct_components, &num_components); + ASSERT_UCS_OK(status); + + /* for RAII */ + ucs::handle cmpt_list(uct_components, + uct_release_component_list); + + for (unsigned cmpt_index = 0; 
cmpt_index < num_components; ++cmpt_index) { + uct_component_attr_t component_attr = {0}; + + component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME | + UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT | + UCT_COMPONENT_ATTR_FIELD_FLAGS; + /* coverity[var_deref_model] */ + status = uct_component_query(uct_components[cmpt_index], &component_attr); + ASSERT_UCS_OK(status); + + /* Save attributes before asking for MD resource list */ + md_resource md_rsc; + md_rsc.cmpt = uct_components[cmpt_index]; + md_rsc.cmpt_attr = component_attr; + + std::vector md_resources; + uct_component_attr_t component_attr_resouces = {0}; + md_resources.resize(md_rsc.cmpt_attr.md_resource_count); + component_attr_resouces.field_mask = UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES; + component_attr_resouces.md_resources = &md_resources[0]; + status = uct_component_query(uct_components[cmpt_index], + &component_attr_resouces); + ASSERT_UCS_OK(status); + + for (unsigned md_index = 0; + md_index < md_rsc.cmpt_attr.md_resource_count; ++md_index) { + md_rsc.rsc_desc = md_resources[md_index]; + all_md_resources.push_back(md_rsc); + } + } + } + + return all_md_resources; +} uct_test::uct_test() { + uct_component_attr_t component_attr = {0}; ucs_status_t status; - uct_md_attr_t pd_attr; - uct_md_h pd; + uct_md_attr_t md_attr; + uct_md_h md; - status = uct_md_config_read(GetParam()->md_name.c_str(), NULL, NULL, - &m_md_config); + status = uct_md_config_read(GetParam()->component, NULL, NULL, &m_md_config); ASSERT_UCS_OK(status); - status = uct_md_open(GetParam()->md_name.c_str(), m_md_config, &pd); + status = uct_md_open(GetParam()->component, GetParam()->md_name.c_str(), + m_md_config, &md); ASSERT_UCS_OK(status); - status = uct_md_query(pd, &pd_attr); + status = uct_md_query(md, &md_attr); ASSERT_UCS_OK(status); - if (pd_attr.cap.flags & UCT_MD_FLAG_SOCKADDR) { - status = uct_md_iface_config_read(pd, NULL, NULL, NULL, &m_iface_config); + if (md_attr.cap.flags & UCT_MD_FLAG_SOCKADDR) { + status = 
uct_md_iface_config_read(md, NULL, NULL, NULL, &m_iface_config); + } else if (!strcmp(GetParam()->tl_name.c_str(), "sockaddr")) { + m_iface_config = NULL; } else { - status = uct_md_iface_config_read(pd, GetParam()->tl_name.c_str(), NULL, + status = uct_md_iface_config_read(md, GetParam()->tl_name.c_str(), NULL, NULL, &m_iface_config); } ASSERT_UCS_OK(status); - uct_md_close(pd); + uct_md_close(md); + + component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME | + UCT_COMPONENT_ATTR_FIELD_FLAGS; + /* coverity[var_deref_model] */ + status = uct_component_query(GetParam()->component, &component_attr); + ASSERT_UCS_OK(status); + + UCS_TEST_MESSAGE << "Testing component: " << component_attr.name; + + if (component_attr.flags & UCT_COMPONENT_FLAG_CM) { + status = uct_cm_config_read(GetParam()->component, NULL, NULL, &m_cm_config); + ASSERT_UCS_OK(status); + } else { + m_cm_config = NULL; + } } uct_test::~uct_test() { - uct_config_release(m_iface_config); + if (m_cm_config != NULL) { + uct_config_release(m_cm_config); + } + if (m_iface_config != NULL) { + uct_config_release(m_iface_config); + } uct_config_release(m_md_config); } void uct_test::init_sockaddr_rsc(resource *rsc, struct sockaddr *listen_addr, struct sockaddr *connect_addr, size_t size) { - memcpy(&rsc->listen_if_addr, listen_addr, size); - memcpy(&rsc->connect_if_addr, connect_addr, size); + rsc->listen_sock_addr.set_sock_addr(*listen_addr, size); + rsc->connect_sock_addr.set_sock_addr(*connect_addr, size); } -void uct_test::set_interface_rscs(char *md_name, cpu_set_t local_cpus, - struct ifaddrs *ifa, +void uct_test::set_interface_rscs(uct_component_h cmpt, const char *name, + ucs_cpu_set_t local_cpus, struct ifaddrs *ifa, std::vector& all_resources) { int i; @@ -75,12 +210,8 @@ void uct_test::set_interface_rscs(char *md_name, cpu_set_t local_cpus, /* Create two resources on the same interface. 
the first one will have the * ip of the interface and the second one will have INADDR_ANY */ for (i = 0; i < 2; i++) { - resource rsc; - rsc.md_name = md_name, - rsc.local_cpus = local_cpus, - rsc.tl_name = "sockaddr", - rsc.dev_name = ifa->ifa_name; - rsc.dev_type = UCT_DEVICE_TYPE_NET; + resource rsc(cmpt, std::string(name), local_cpus, "sockaddr", + std::string(ifa->ifa_name), UCT_DEVICE_TYPE_NET); if (i == 0) { /* first rsc */ @@ -118,57 +249,125 @@ void uct_test::set_interface_rscs(char *md_name, cpu_set_t local_cpus, } } -void uct_test::set_sockaddr_resources(uct_md_h md, char *md_name, cpu_set_t local_cpus, - std::vector& all_resources) { +bool uct_test::is_interface_usable(struct ifaddrs *ifa, const char *name) { + if (!(ucs_netif_flags_is_active(ifa->ifa_flags)) || + !(ucs::is_inet_addr(ifa->ifa_addr))) { + return false; + } + + /* If rdmacm is tested, make sure that this is an IPoIB or RoCE interface */ + if (!strcmp(name, "rdmacm") && !ucs::is_rdmacm_netdev(ifa->ifa_name)) { + return false; + } + + return true; +} + +void uct_test::set_md_sockaddr_resources(const md_resource& md_rsc, uct_md_h md, + ucs_cpu_set_t local_cpus, + std::vector& all_resources) { struct ifaddrs *ifaddr, *ifa; ucs_sock_addr_t sock_addr; - EXPECT_TRUE(getifaddrs(&ifaddr) != -1); + EXPECT_EQ(0, getifaddrs(&ifaddr)) << strerror(errno); for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { sock_addr.addr = ifa->ifa_addr; - /* If rdmacm is tested, make sure that this is an IPoIB or RoCE interface */ - if (!strcmp(md_name, "rdmacm") && (!ucs::is_rdmacm_netdev(ifa->ifa_name))) { + if (!uct_test::is_interface_usable(ifa, md_rsc.rsc_desc.md_name)) { continue; } if (uct_md_is_sockaddr_accessible(md, &sock_addr, UCT_SOCKADDR_ACC_LOCAL) && - uct_md_is_sockaddr_accessible(md, &sock_addr, UCT_SOCKADDR_ACC_REMOTE) && - ucs_netif_is_active(ifa->ifa_name)) { + uct_md_is_sockaddr_accessible(md, &sock_addr, UCT_SOCKADDR_ACC_REMOTE)) + { + uct_test::set_interface_rscs(md_rsc.cmpt, 
md_rsc.rsc_desc.md_name, + local_cpus, ifa, all_resources); + } + } + + freeifaddrs(ifaddr); +} + +void uct_test::set_cm_sockaddr_resources(uct_component_h cmpt, const char *cmpt_name, + ucs_cpu_set_t local_cpus, + std::vector& all_resources) { - uct_test::set_interface_rscs(md_name, local_cpus, ifa, all_resources); + struct ifaddrs *ifaddr, *ifa; + + EXPECT_EQ(0, getifaddrs(&ifaddr)) << strerror(errno); + + for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { + if (!uct_test::is_interface_usable(ifa, cmpt_name)) { + continue; } + + uct_test::set_interface_rscs(cmpt, cmpt_name, local_cpus, ifa, all_resources); } freeifaddrs(ifaddr); } -std::vector uct_test::enum_resources(const std::string& tl_name, - bool loopback) { +void uct_test::set_cm_resources(std::vector& all_resources) +{ + uct_component_h *uct_components; + unsigned num_components; + ucs_status_t status; + + status = uct_query_components(&uct_components, &num_components); + ASSERT_UCS_OK(status); + + for (unsigned cmpt_index = 0; cmpt_index < num_components; ++cmpt_index) { + uct_component_attr_t component_attr = {0}; + + component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME | + UCT_COMPONENT_ATTR_FIELD_FLAGS; + /* coverity[var_deref_model] */ + status = uct_component_query(uct_components[cmpt_index], &component_attr); + ASSERT_UCS_OK(status); + + if (component_attr.flags & UCT_COMPONENT_FLAG_CM) { + ucs_cpu_set_t local_cpus; + CPU_ZERO(&local_cpus); + uct_test::set_cm_sockaddr_resources(uct_components[cmpt_index], + component_attr.name, local_cpus, + all_resources); + } + } + + uct_release_component_list(uct_components); +} + +std::vector uct_test::enum_resources(const std::string& tl_name) +{ + static bool tcp_fastest_dev = (getenv("GTEST_UCT_TCP_FASTEST_DEV") != NULL); static std::vector all_resources; if (all_resources.empty()) { - uct_md_resource_desc_t *md_resources; - unsigned num_md_resources; - uct_tl_resource_desc_t *tl_resources; - unsigned num_tl_resources; + ucs_async_context_t *async; 
+ uct_worker_h worker; ucs_status_t status; - status = uct_query_md_resources(&md_resources, &num_md_resources); + status = ucs_async_context_create(UCS_ASYNC_MODE_THREAD_SPINLOCK, &async); + ASSERT_UCS_OK(status); + + status = uct_worker_create(async, UCS_THREAD_MODE_SINGLE, &worker); ASSERT_UCS_OK(status); - for (unsigned i = 0; i < num_md_resources; ++i) { - uct_md_h pd; + std::vector md_resources = enum_md_resources(); + + for (std::vector::iterator iter = md_resources.begin(); + iter != md_resources.end(); ++iter) { + uct_md_h md; uct_md_config_t *md_config; - status = uct_md_config_read(md_resources[i].md_name, NULL, NULL, - &md_config); + status = uct_md_config_read(iter->cmpt, NULL, NULL, &md_config); ASSERT_UCS_OK(status); { scoped_log_handler slh(hide_errors_logger); - status = uct_md_open(md_resources[i].md_name, md_config, &pd); + status = uct_md_open(iter->cmpt, iter->rsc_desc.md_name, + md_config, &md); } uct_config_release(md_config); if (status != UCS_OK) { @@ -176,37 +375,73 @@ std::vector uct_test::enum_resources(const std::string& tl_name } uct_md_attr_t md_attr; - status = uct_md_query(pd, &md_attr); + status = uct_md_query(md, &md_attr); ASSERT_UCS_OK(status); - status = uct_md_query_tl_resources(pd, &tl_resources, &num_tl_resources); + uct_tl_resource_desc_t *tl_resources; + unsigned num_tl_resources; + status = uct_md_query_tl_resources(md, &tl_resources, &num_tl_resources); ASSERT_UCS_OK(status); + resource_speed tcp_fastest_rsc; + for (unsigned j = 0; j < num_tl_resources; ++j) { - resource rsc; - rsc.md_name = md_resources[i].md_name; - rsc.local_cpus = md_attr.local_cpus; - rsc.tl_name = tl_resources[j].tl_name; - rsc.dev_name = tl_resources[j].dev_name; - rsc.dev_type = tl_resources[j].dev_type; - all_resources.push_back(rsc); + if (tcp_fastest_dev && (std::string("tcp") == tl_resources[j].tl_name)) { + resource_speed rsc(iter->cmpt, worker, md, md_attr, + iter->rsc_desc, tl_resources[j]); + if (!tcp_fastest_rsc.bw || (rsc.bw > 
tcp_fastest_rsc.bw)) { + tcp_fastest_rsc = rsc; + } + } else { + resource rsc(iter->cmpt, md_attr, iter->rsc_desc, + tl_resources[j]); + all_resources.push_back(rsc); + } } - if (md_attr.cap.flags & UCT_MD_FLAG_SOCKADDR) { - uct_test::set_sockaddr_resources(pd, md_resources[i].md_name, - md_attr.local_cpus, all_resources); + if (tcp_fastest_dev && tcp_fastest_rsc.bw) { + all_resources.push_back(tcp_fastest_rsc); + } + + if ((md_attr.cap.flags & UCT_MD_FLAG_SOCKADDR) && + !(iter->cmpt_attr.flags & UCT_COMPONENT_FLAG_CM)) { + uct_test::set_md_sockaddr_resources(*iter, md, md_attr.local_cpus, + all_resources); } uct_release_tl_resource_list(tl_resources); - uct_md_close(pd); + uct_md_close(md); } - uct_release_md_resource_list(md_resources); + uct_worker_destroy(worker); + ucs_async_context_destroy(async); + + set_cm_resources(all_resources); } return filter_resources(all_resources, tl_name); } +void uct_test::generate_test_variant(int variant, + const std::string &variant_name, + std::vector& test_res, + const std::string &tl_name) +{ + std::vector r = uct_test::enum_resources(""); + + for (std::vector::iterator iter = r.begin(); + iter != r.end(); ++iter) { + if (tl_name.empty() || ((*iter)->tl_name == tl_name)) { + resource rsc((*iter)->component, (*iter)->md_name, + (*iter)->local_cpus, (*iter)->tl_name, + (*iter)->dev_name, (*iter)->dev_type); + rsc.variant = variant; + rsc.variant_name = variant_name; + test_res.push_back(rsc); + } + } +} + void uct_test::init() { } @@ -227,34 +462,72 @@ bool uct_test::is_caps_supported(uint64_t required_flags) { return ret; } -void uct_test::check_caps(uint64_t required_flags, uint64_t invalid_flags) { +bool uct_test::check_caps(uint64_t required_flags, uint64_t invalid_flags) { FOR_EACH_ENTITY(iter) { - (*iter)->check_caps(required_flags, invalid_flags); + if (!(*iter)->check_caps(required_flags, invalid_flags)) { + return false; + } + } + return true; +} + +void uct_test::check_caps_skip(uint64_t required_flags, uint64_t 
invalid_flags) { + if (!check_caps(required_flags, invalid_flags)) { + UCS_TEST_SKIP_R("unsupported"); + } +} + +bool uct_test::check_event_caps(uint64_t required_flags, uint64_t invalid_flags) { + FOR_EACH_ENTITY(iter) { + if (!(*iter)->check_event_caps(required_flags, invalid_flags)) { + return false; + } } + return true; } -void uct_test::check_atomics(uint64_t required_ops, atomic_mode mode) { +bool uct_test::check_atomics(uint64_t required_ops, atomic_mode mode) { FOR_EACH_ENTITY(iter) { - (*iter)->check_atomics(required_ops, mode); + if (!(*iter)->check_atomics(required_ops, mode)) { + return false; + } } + return true; } +/* modify the config of all the matching environment parameters */ void uct_test::modify_config(const std::string& name, const std::string& value, bool optional) { - ucs_status_t status; + ucs_status_t status = UCS_OK; + + if (m_cm_config != NULL) { + status = uct_config_modify(m_cm_config, name.c_str(), value.c_str()); + if (status == UCS_OK) { + optional = true; + } else if (status != UCS_ERR_NO_ELEM) { + UCS_TEST_ABORT("Couldn't modify cm config parameter: " << name.c_str() << + " to " << value.c_str() << ": " << ucs_status_string(status)); + } + } - status = uct_config_modify(m_iface_config, name.c_str(), value.c_str()); - if (status == UCS_ERR_NO_ELEM) { - status = uct_config_modify(m_md_config, name.c_str(), value.c_str()); - if (status == UCS_ERR_NO_ELEM) { - test_base::modify_config(name, value, optional); - } else if (status != UCS_OK) { - UCS_TEST_ABORT("Couldn't modify pd config parameter: " << name.c_str() << + if (m_iface_config != NULL) { + status = uct_config_modify(m_iface_config, name.c_str(), value.c_str()); + if (status == UCS_OK) { + optional = true; + } else if (status != UCS_ERR_NO_ELEM) { + UCS_TEST_ABORT("Couldn't modify iface config parameter: " << name.c_str() << " to " << value.c_str() << ": " << ucs_status_string(status)); } + } + status = uct_config_modify(m_md_config, name.c_str(), value.c_str()); + if (status 
== UCS_OK) { + optional = true; + } + if ((status == UCS_OK) || (status == UCS_ERR_NO_ELEM)) { + test_base::modify_config(name, value, optional); } else if (status != UCS_OK) { - UCS_TEST_ABORT("Couldn't modify iface config parameter: " << name.c_str() << + UCS_TEST_ABORT("Couldn't modify md config parameter: " << name.c_str() << " to " << value.c_str() << ": " << ucs_status_string(status)); } } @@ -265,15 +538,50 @@ bool uct_test::get_config(const std::string& name, std::string& value) const const size_t max = 1024; value.resize(max); - status = uct_config_get(m_iface_config, name.c_str(), - const_cast(value.c_str()), max); - if (status == UCS_ERR_NO_ELEM) { - status = uct_config_get(m_md_config, name.c_str(), + if (m_cm_config != NULL) { + status = uct_config_get(m_cm_config, name.c_str(), const_cast(value.c_str()), max); + if (status == UCS_OK) { + return true; + } } - return (status == UCS_OK); + if (m_iface_config != NULL) { + status = uct_config_get(m_iface_config, name.c_str(), + const_cast(value.c_str()), max); + if (status == UCS_OK) { + return true; + } + } + + status = uct_config_get(m_md_config, name.c_str(), + const_cast(value.c_str()), max); + if (status == UCS_OK) { + return true; + } + + return false; +} + +bool uct_test::has_transport(const std::string& tl_name) const { + return (GetParam()->tl_name == tl_name); +} + +bool uct_test::has_ud() const { + return (has_transport("ud_verbs") || has_transport("ud_mlx5")); +} + +bool uct_test::has_rc() const { + return (has_transport("rc_verbs") || has_transport("rc_mlx5")); +} + +bool uct_test::has_rc_or_dc() const { + return (has_rc() || has_transport("dc_mlx5")); +} + +bool uct_test::has_ib() const { + return (has_rc_or_dc() || has_ud()); } void uct_test::stats_activate() @@ -294,22 +602,44 @@ void uct_test::stats_restore() } uct_test::entity* uct_test::create_entity(size_t rx_headroom, - uct_error_handler_t err_handler) { + uct_error_handler_t err_handler, + uct_tag_unexp_eager_cb_t eager_cb, + 
uct_tag_unexp_rndv_cb_t rndv_cb, + void *eager_arg, void *rndv_arg, + uct_async_event_cb_t async_event_cb, + void *async_event_arg) { uct_iface_params_t iface_params; - iface_params.field_mask = UCT_IFACE_PARAM_FIELD_RX_HEADROOM | - UCT_IFACE_PARAM_FIELD_OPEN_MODE | - UCT_IFACE_PARAM_FIELD_ERR_HANDLER | - UCT_IFACE_PARAM_FIELD_ERR_HANDLER_ARG | - UCT_IFACE_PARAM_FIELD_ERR_HANDLER_FLAGS; + iface_params.field_mask = UCT_IFACE_PARAM_FIELD_RX_HEADROOM | + UCT_IFACE_PARAM_FIELD_OPEN_MODE | + UCT_IFACE_PARAM_FIELD_ERR_HANDLER | + UCT_IFACE_PARAM_FIELD_ERR_HANDLER_ARG | + UCT_IFACE_PARAM_FIELD_ERR_HANDLER_FLAGS | + UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_CB | + UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_ARG | + UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_CB | + UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_ARG | + UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_CB | + UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_ARG; iface_params.rx_headroom = rx_headroom; iface_params.open_mode = UCT_IFACE_OPEN_MODE_DEVICE; iface_params.err_handler = err_handler; iface_params.err_handler_arg = this; iface_params.err_handler_flags = 0; - entity *new_ent = new entity(*GetParam(), m_iface_config, &iface_params, - m_md_config); - return new_ent; + iface_params.eager_cb = (eager_cb == NULL) ? + reinterpret_cast + (ucs_empty_function_return_success) : + eager_cb; + iface_params.eager_arg = eager_arg; + iface_params.rndv_cb = (rndv_cb == NULL) ? 
+ reinterpret_cast + (ucs_empty_function_return_success) : + rndv_cb; + iface_params.rndv_arg = rndv_arg; + iface_params.async_event_cb = async_event_cb; + iface_params.async_event_arg = async_event_arg; + + return new entity(*GetParam(), m_iface_config, &iface_params, m_md_config); } uct_test::entity* uct_test::create_entity(uct_iface_params_t ¶ms) { @@ -318,6 +648,10 @@ uct_test::entity* uct_test::create_entity(uct_iface_params_t ¶ms) { return new_ent; } +uct_test::entity* uct_test::create_entity() { + return new entity(*GetParam(), m_md_config, m_cm_config); +} + const uct_test::entity& uct_test::ent(unsigned index) const { return m_entities.at(index); } @@ -372,19 +706,37 @@ void uct_test::twait(int delta_ms) const { int uct_test::max_connections() { - if (GetParam()->tl_name == "tcp") { + if (has_transport("tcp")) { return ucs::max_tcp_connections(); } else { return std::numeric_limits::max(); } } -std::string uct_test::entity::client_priv_data = ""; -size_t uct_test::entity::client_cb_arg = 0; +int uct_test::max_connect_batch() +{ + if (has_transport("tcp")) { + /* TCP connection listener is limited by Accept queue */ + return ucs_socket_max_conn(); + } else { + return std::numeric_limits::max(); + } +} -uct_test::entity::entity(const resource& resource, uct_iface_config_t *iface_config, - uct_iface_params_t *params, uct_md_config_t *md_config) { +void uct_test::reduce_tl_send_queues() +{ + /* To reduce send queues of UCT TLs to fill the resources faster */ + set_config("RC_TX_QUEUE_LEN?=32"); + set_config("UD_TX_QUEUE_LEN?=128"); + set_config("RC_FC_ENABLE?=n"); + set_config("SNDBUF?=1k"); + set_config("RCVBUF?=128"); +} +uct_test::entity::entity(const resource& resource, uct_iface_config_t *iface_config, + uct_iface_params_t *params, uct_md_config_t *md_config) : + m_resource(resource) +{ ucs_status_t status; if (params->open_mode == UCT_IFACE_OPEN_MODE_DEVICE) { @@ -393,141 +745,176 @@ uct_test::entity::entity(const resource& resource, uct_iface_config_t 
*iface_con params->mode.device.dev_name = resource.dev_name.c_str(); } - params->field_mask |= UCT_IFACE_PARAM_FIELD_STATS_ROOT | - UCT_IFACE_PARAM_FIELD_CPU_MASK; - params->stats_root = ucs_stats_get_root(); + params->field_mask |= UCT_IFACE_PARAM_FIELD_STATS_ROOT | + UCT_IFACE_PARAM_FIELD_CPU_MASK; + params->stats_root = ucs_stats_get_root(); UCS_CPU_ZERO(¶ms->cpu_mask); UCS_TEST_CREATE_HANDLE(uct_worker_h, m_worker, uct_worker_destroy, - uct_worker_create, &m_async.m_async, UCS_THREAD_MODE_SINGLE); + uct_worker_create, &m_async.m_async, + UCS_THREAD_MODE_SINGLE); - UCS_TEST_CREATE_HANDLE(uct_md_h, m_md, uct_md_close, - uct_md_open, resource.md_name.c_str(), md_config); + UCS_TEST_CREATE_HANDLE(uct_md_h, m_md, uct_md_close, uct_md_open, + resource.component, resource.md_name.c_str(), + md_config); status = uct_md_query(m_md, &m_md_attr); ASSERT_UCS_OK(status); - UCS_TEST_CREATE_HANDLE(uct_iface_h, m_iface, uct_iface_close, - uct_iface_open, m_md, m_worker, params, iface_config); + for (;;) { + { + scoped_log_handler slh(wrap_errors_logger); + status = UCS_TEST_TRY_CREATE_HANDLE(uct_iface_h, m_iface, + uct_iface_close, uct_iface_open, + m_md, m_worker, params, + iface_config); + if (status == UCS_OK) { + break; + } + } + EXPECT_EQ(UCS_ERR_BUSY, status); + if (params->open_mode != UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER) { + UCS_TEST_ABORT("any mode different from UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER must go with status UCS_OK"); + } + + const struct sockaddr* c_ifa_addr = + params->mode.sockaddr.listen_sockaddr.addr; + struct sockaddr* ifa_addr = const_cast(c_ifa_addr); + if (ifa_addr->sa_family == AF_INET) { + struct sockaddr_in *addr = + reinterpret_cast(ifa_addr); + addr->sin_port = ntohs(ucs::get_port()); + } else { + struct sockaddr_in6 *addr = + reinterpret_cast(ifa_addr); + addr->sin6_port = ntohs(ucs::get_port()); + } + } status = uct_iface_query(m_iface, &m_iface_attr); ASSERT_UCS_OK(status); uct_iface_progress_enable(m_iface, UCT_PROGRESS_SEND | 
UCT_PROGRESS_RECV); m_iface_params = *params; + + memset(&m_cm_attr, 0, sizeof(m_cm_attr)); + max_conn_priv = 0; } +uct_test::entity::entity(const resource& resource, uct_md_config_t *md_config, + uct_cm_config_t *cm_config) { + ucs_status_t status; + uct_component_attr_t comp_attr; -void uct_test::entity::cuda_mem_alloc(size_t length, uct_allocated_memory_t *mem) const { -#if HAVE_CUDA - ucs_status_t status; - cudaError_t cerr; + memset(&m_iface_attr, 0, sizeof(m_iface_attr)); + memset(&m_iface_params, 0, sizeof(m_iface_params)); - mem->length = length; - mem->md = m_md; - mem->mem_type = UCT_MD_MEM_TYPE_CUDA; - mem->memh = UCT_MEM_HANDLE_NULL; + UCS_TEST_CREATE_HANDLE(uct_worker_h, m_worker, uct_worker_destroy, + uct_worker_create, &m_async.m_async, + UCS_THREAD_MODE_SINGLE); - cerr = cudaMalloc(&mem->address, mem->length); - EXPECT_TRUE(cerr == cudaSuccess); + UCS_TEST_CREATE_HANDLE(uct_md_h, m_md, uct_md_close, + uct_md_open, resource.component, + resource.md_name.c_str(), md_config); - if (md_attr().cap.reg_mem_types & UCS_BIT(UCT_MD_MEM_TYPE_CUDA)) { - status = uct_md_mem_reg(m_md, mem->address, mem->length, - UCT_MD_MEM_ACCESS_ALL, &mem->memh); - ASSERT_UCS_OK(status); - } -#else - UCS_TEST_SKIP_R("can't allocate cuda memory"); -#endif -} + status = uct_md_query(m_md, &m_md_attr); + ASSERT_UCS_OK(status); -void uct_test::entity::get_rkey(uct_mem_h memh, uct_rkey_bundle *rkey_bundle, - int mem_type) const -{ - if ((md_attr().cap.flags & UCT_MD_FLAG_NEED_RKEY) && - (md_attr().cap.reg_mem_types & UCS_BIT(mem_type))) { - void *rkey_buffer = malloc(md_attr().rkey_packed_size); - if (rkey_buffer == NULL) { - UCS_TEST_ABORT("Failed to allocake rkey buffer"); - } - ucs_status_t status = uct_md_mkey_pack(m_md, memh, rkey_buffer); - ASSERT_UCS_OK(status); + comp_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME | + UCT_COMPONENT_ATTR_FIELD_FLAGS; + status = uct_component_query(resource.component, &comp_attr); + ASSERT_UCS_OK(status); - status = 
uct_rkey_unpack(rkey_buffer, rkey_bundle); + if (comp_attr.flags & UCT_COMPONENT_FLAG_CM) { + UCS_TEST_CREATE_HANDLE(uct_cm_h, m_cm, uct_cm_close, uct_cm_open, + resource.component, m_worker, cm_config); + + m_cm_attr.field_mask = UCT_CM_ATTR_FIELD_MAX_CONN_PRIV; + status = uct_cm_query(m_cm, &m_cm_attr); ASSERT_UCS_OK(status); - free(rkey_buffer); + max_conn_priv = 0; } else { - rkey_bundle->handle = NULL; - rkey_bundle->rkey = UCT_INVALID_RKEY; - rkey_bundle->type = NULL; + UCS_TEST_SKIP_R(std::string("cm is not supported on component ") + + comp_attr.name ); } } -void uct_test::entity::mem_alloc(size_t length, uct_allocated_memory_t *mem, - uct_rkey_bundle *rkey_bundle, int mem_type) const { - static const char *alloc_name = "uct_test"; +void uct_test::entity::mem_alloc_host(size_t length, + uct_allocated_memory_t *mem) const { + ucs_status_t status; if (md_attr().cap.flags & (UCT_MD_FLAG_ALLOC|UCT_MD_FLAG_REG)) { - if (mem_type == UCT_MD_MEM_TYPE_HOST) { - status = uct_iface_mem_alloc(m_iface, length, UCT_MD_MEM_ACCESS_ALL, - alloc_name, mem); - ASSERT_UCS_OK(status); - } else if (mem_type == UCT_MD_MEM_TYPE_CUDA) { - cuda_mem_alloc(length, mem); - } else { - UCS_TEST_ABORT("wrong memory type"); - } - - get_rkey(mem->memh, rkey_bundle, mem_type); - + status = uct_iface_mem_alloc(m_iface, length, UCT_MD_MEM_ACCESS_ALL, + "uct_test", mem); + ASSERT_UCS_OK(status); } else { uct_alloc_method_t method = UCT_ALLOC_METHOD_MMAP; - status = uct_mem_alloc(NULL, length, UCT_MD_MEM_ACCESS_ALL, - &method, 1, NULL, 0, alloc_name, - mem); + status = uct_mem_alloc(NULL, length, UCT_MD_MEM_ACCESS_ALL, &method, 1, + NULL, 0, "uct_test", mem); ASSERT_UCS_OK(status); - ucs_assert(mem->memh == UCT_MEM_HANDLE_NULL); + } + ucs_assert(mem->mem_type == UCS_MEMORY_TYPE_HOST); +} - rkey_bundle->rkey = UCT_INVALID_RKEY; - rkey_bundle->handle = NULL; - rkey_bundle->type = NULL; +void uct_test::entity::mem_free_host(const uct_allocated_memory_t *mem) const { + if (mem->method != 
UCT_ALLOC_METHOD_LAST) { + uct_iface_mem_free(mem); } } -void uct_test::entity::cuda_mem_free(const uct_allocated_memory_t *mem) const { -#if HAVE_CUDA - ucs_status_t status; - cudaError_t cerr; +void uct_test::entity::mem_type_reg(uct_allocated_memory_t *mem) const { + if (md_attr().cap.reg_mem_types & UCS_BIT(mem->mem_type)) { + ucs_status_t status = uct_md_mem_reg(m_md, mem->address, mem->length, + UCT_MD_MEM_ACCESS_ALL, &mem->memh); + ASSERT_UCS_OK(status); + mem->md = m_md; + } +} - if (mem->memh != UCT_MEM_HANDLE_NULL) { - status = uct_md_mem_dereg(m_md, mem->memh); +void uct_test::entity::mem_type_dereg(uct_allocated_memory_t *mem) const { + if ((mem->memh != UCT_MEM_HANDLE_NULL) && + (md_attr().cap.reg_mem_types & UCS_BIT(mem->mem_type))) { + ucs_status_t status = uct_md_mem_dereg(m_md, mem->memh); ASSERT_UCS_OK(status); + mem->memh = UCT_MEM_HANDLE_NULL; + mem->md = NULL; } - cerr = cudaFree(mem->address); - ASSERT_TRUE(cerr == cudaSuccess); -#endif } -void uct_test::entity::mem_free(const uct_allocated_memory_t *mem, - const uct_rkey_bundle_t& rkey, - const uct_memory_type_t mem_type) const { - ucs_status_t status; +void uct_test::entity::rkey_unpack(const uct_allocated_memory_t *mem, + uct_rkey_bundle *rkey_bundle) const +{ + if ((mem->memh != UCT_MEM_HANDLE_NULL) && + (md_attr().cap.flags & UCT_MD_FLAG_NEED_RKEY)) { + + void *rkey_buffer = malloc(md_attr().rkey_packed_size); + if (rkey_buffer == NULL) { + UCS_TEST_ABORT("Failed to allocate rkey buffer"); + } - if (rkey.rkey != UCT_INVALID_RKEY) { - status = uct_rkey_release(&rkey); + ucs_status_t status = uct_md_mkey_pack(m_md, mem->memh, rkey_buffer); ASSERT_UCS_OK(status); + + status = uct_rkey_unpack(m_resource.component, rkey_buffer, + rkey_bundle); + ASSERT_UCS_OK(status); + + free(rkey_buffer); + } else { + rkey_bundle->handle = NULL; + rkey_bundle->rkey = UCT_INVALID_RKEY; } +} - if (mem_type == UCT_MD_MEM_TYPE_HOST) { - if (mem->method != UCT_ALLOC_METHOD_LAST) { - uct_iface_mem_free(mem); - } - 
} else if(mem_type == UCT_MD_MEM_TYPE_CUDA) { - cuda_mem_free(mem); +void uct_test::entity::rkey_release(const uct_rkey_bundle *rkey_bundle) const +{ + if (rkey_bundle->rkey != UCT_INVALID_RKEY) { + ucs_status_t status = uct_rkey_release(m_resource.component, rkey_bundle); + ASSERT_UCS_OK(status); } } @@ -542,19 +929,27 @@ bool uct_test::entity::is_caps_supported(uint64_t required_flags) { return ucs_test_all_flags(iface_flags, required_flags); } -void uct_test::entity::check_caps(uint64_t required_flags, +bool uct_test::entity::check_caps(uint64_t required_flags, uint64_t invalid_flags) { uint64_t iface_flags = iface_attr().cap.flags; - if (!ucs_test_all_flags(iface_flags, required_flags)) { - UCS_TEST_SKIP_R("unsupported"); - } - if (iface_flags & invalid_flags) { - UCS_TEST_SKIP_R("unsupported"); - } + return (ucs_test_all_flags(iface_flags, required_flags) && + !(iface_flags & invalid_flags)); +} + +bool uct_test::entity::check_event_caps(uint64_t required_flags, + uint64_t invalid_flags) +{ + uint64_t iface_event_flags = iface_attr().cap.event_flags; + return (ucs_test_all_flags(iface_event_flags, required_flags) && + !(iface_event_flags & invalid_flags) && + /* iface has to support either event fd or event async + * callback notification mechanism */ + ((iface_event_flags & UCT_IFACE_FLAG_EVENT_FD) || + (iface_event_flags & UCT_IFACE_FLAG_EVENT_ASYNC_CB))); } -void uct_test::entity::check_atomics(uint64_t required_ops, atomic_mode mode) +bool uct_test::entity::check_atomics(uint64_t required_ops, atomic_mode mode) { uint64_t amo; @@ -573,12 +968,9 @@ void uct_test::entity::check_atomics(uint64_t required_ops, atomic_mode mode) break; default: UCS_TEST_ABORT("Incorrect atomic mode: " << mode); - break; } - if (!ucs_test_all_flags(amo, required_ops)) { - UCS_TEST_SKIP_R("unsupported"); - } + return ucs_test_all_flags(amo, required_ops); } uct_md_h uct_test::entity::md() const { @@ -593,6 +985,18 @@ uct_worker_h uct_test::entity::worker() const { return 
m_worker; } +uct_cm_h uct_test::entity::cm() const { + return m_cm; +} + +const uct_cm_attr_t& uct_test::entity::cm_attr() const { + return m_cm_attr; +} + +uct_listener_h uct_test::entity::listener() const { + return m_listener; +} + uct_iface_h uct_test::entity::iface() const { return m_iface; } @@ -609,6 +1013,14 @@ uct_ep_h uct_test::entity::ep(unsigned index) const { return m_eps.at(index); } +size_t uct_test::entity::num_eps() const { + return m_eps.size(); +} + +uct_test::entity::eps_vec_t& uct_test::entity::eps() { + return m_eps; +} + void uct_test::entity::reserve_ep(unsigned index) { if (index >= m_eps.size()) { m_eps.resize(index + 1); @@ -667,6 +1079,14 @@ void uct_test::entity::destroy_ep(unsigned index) { m_eps[index].reset(); } +void uct_test::entity::revoke_ep(unsigned index) { + if (!m_eps[index]) { + UCS_TEST_ABORT("ep[" << index << "] does not exist"); + } + + m_eps[index].revoke(); +} + void uct_test::entity::destroy_eps() { for (unsigned index = 0; index < m_eps.size(); ++index) { if (!m_eps[index]) { @@ -676,24 +1096,16 @@ void uct_test::entity::destroy_eps() { } } -ssize_t uct_test::entity::client_priv_data_cb(void *arg, const char *dev_name, - void *priv_data) -{ - size_t *max_conn_priv = (size_t*)arg; - size_t priv_data_len; - - client_priv_data = "Client private data"; - priv_data_len = 1 + client_priv_data.length(); - - memcpy(priv_data, client_priv_data.c_str(), priv_data_len); - EXPECT_LE(priv_data_len, (*max_conn_priv)); - - return priv_data_len; -} - -void uct_test::entity::connect_to_sockaddr(unsigned index, entity& other, - ucs_sock_addr_t *remote_addr) +void +uct_test::entity::connect_to_sockaddr(unsigned index, entity& other, + const ucs::sock_addr_storage &remote_addr, + uct_cm_ep_priv_data_pack_callback_t pack_cb, + uct_cm_ep_client_connect_callback_t connect_cb, + uct_ep_disconnect_cb_t disconnect_cb, + void *user_data) { + ucs_sock_addr_t ucs_remote_addr = remote_addr.to_ucs_sock_addr(); + uct_ep_params_t params; uct_ep_h ep; 
ucs_status_t status; @@ -703,17 +1115,27 @@ void uct_test::entity::connect_to_sockaddr(unsigned index, entity& other, } /* Connect to the server */ - uct_ep_params_t params; - params.field_mask = UCT_EP_PARAM_FIELD_IFACE | - UCT_EP_PARAM_FIELD_USER_DATA | + if (m_cm) { + params.field_mask = UCT_EP_PARAM_FIELD_CM | + UCT_EP_PARAM_FIELD_SOCKADDR_CONNECT_CB_CLIENT | + UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB | + UCT_EP_PARAM_FIELD_USER_DATA; + params.cm = m_cm; + params.sockaddr_cb_client = connect_cb; + params.disconnect_cb = disconnect_cb; + } else { + params.field_mask = UCT_EP_PARAM_FIELD_IFACE; + params.iface = m_iface; + } + + params.field_mask |= UCT_EP_PARAM_FIELD_USER_DATA | UCT_EP_PARAM_FIELD_SOCKADDR | UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS | UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB; - params.iface = iface(); - params.user_data = &client_cb_arg; - params.sockaddr = remote_addr; + params.user_data = user_data; + params.sockaddr = &ucs_remote_addr; params.sockaddr_cb_flags = UCT_CB_FLAG_ASYNC; - params.sockaddr_pack_cb = client_priv_data_cb; + params.sockaddr_pack_cb = pack_cb; status = uct_ep_create(¶ms, &ep); ASSERT_UCS_OK(status); @@ -792,22 +1214,68 @@ void uct_test::entity::connect_to_iface(unsigned index, entity& other) { void uct_test::entity::connect(unsigned index, entity& other, unsigned other_index, - ucs_sock_addr_t *remote_addr) + const ucs::sock_addr_storage &remote_addr, + uct_cm_ep_priv_data_pack_callback_t pack_cb, + uct_cm_ep_client_connect_callback_t connect_cb, + uct_ep_disconnect_cb_t disconnect_cb, + void *user_data) +{ + if (m_cm || + iface_attr().cap.flags & UCT_IFACE_FLAG_CONNECT_TO_SOCKADDR) { + connect_to_sockaddr(index, other, remote_addr, pack_cb, connect_cb, + disconnect_cb, user_data); + } else { + UCS_TEST_SKIP_R("cannot connect"); + } +} + +void uct_test::entity::connect(unsigned index, entity& other, unsigned other_index) { if (iface_attr().cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) { connect_to_ep(index, other, other_index); } else 
if (iface_attr().cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { connect_to_iface(index, other); - } else if (iface_attr().cap.flags & UCT_IFACE_FLAG_CONNECT_TO_SOCKADDR) { - connect_to_sockaddr(index, other, remote_addr); } else { UCS_TEST_SKIP_R("cannot connect"); } } -void uct_test::entity::connect(unsigned index, entity& other, unsigned other_index) +void uct_test::entity::listen(const ucs::sock_addr_storage &listen_addr, + const uct_listener_params_t ¶ms) { - connect(index, other, other_index, NULL); + ucs_status_t status; + + for (;;) { + { + scoped_log_handler slh(wrap_errors_logger); + status = UCS_TEST_TRY_CREATE_HANDLE(uct_listener_h, m_listener, + uct_listener_destroy, + uct_listener_create, m_cm, + listen_addr.get_sock_addr_ptr(), + listen_addr.get_addr_size(), + ¶ms); + if (status == UCS_OK) { + break; + } + } + EXPECT_EQ(UCS_ERR_BUSY, status); + + const struct sockaddr* c_ifa_addr = listen_addr.get_sock_addr_ptr(); + struct sockaddr* ifa_addr = const_cast(c_ifa_addr); + if (ifa_addr->sa_family == AF_INET) { + struct sockaddr_in *addr = + reinterpret_cast(ifa_addr); + addr->sin_port = ntohs(ucs::get_port()); + } else { + struct sockaddr_in6 *addr = + reinterpret_cast(ifa_addr); + addr->sin6_port = ntohs(ucs::get_port()); + } + } +} + +void uct_test::entity::disconnect(uct_ep_h ep) { + ASSERT_UCS_OK(uct_ep_disconnect(ep, 0)); } void uct_test::entity::flush() const { @@ -825,12 +1293,22 @@ std::ostream& operator<<(std::ostream& os, const uct_tl_resource_desc_t& resourc uct_test::mapped_buffer::mapped_buffer(size_t size, uint64_t seed, const entity& entity, size_t offset, - uct_memory_type_t mem_type) : + ucs_memory_type_t mem_type) : m_entity(entity) { if (size > 0) { size_t alloc_size = size + offset; - m_entity.mem_alloc(alloc_size, &m_mem, &m_rkey, mem_type); + if (mem_type == UCS_MEMORY_TYPE_HOST) { + m_entity.mem_alloc_host(alloc_size, &m_mem); + } else { + m_mem.method = UCT_ALLOC_METHOD_LAST; + m_mem.address = mem_buffer::allocate(alloc_size, 
mem_type); + m_mem.length = alloc_size; + m_mem.mem_type = mem_type; + m_mem.memh = UCT_MEM_HANDLE_NULL; + m_mem.md = NULL; + m_entity.mem_type_reg(&m_mem); + } m_buf = (char*)m_mem.address + offset; m_end = (char*)m_buf + size; pattern_fill(seed); @@ -839,158 +1317,40 @@ uct_test::mapped_buffer::mapped_buffer(size_t size, uint64_t seed, m_mem.address = NULL; m_mem.md = NULL; m_mem.memh = UCT_MEM_HANDLE_NULL; - m_mem.mem_type= UCT_MD_MEM_TYPE_HOST; + m_mem.mem_type= UCS_MEMORY_TYPE_HOST; m_mem.length = 0; m_buf = NULL; m_end = NULL; m_rkey.rkey = UCT_INVALID_RKEY; m_rkey.handle = NULL; - m_rkey.type = NULL; } - set_iov(); -} - -uct_test::mapped_buffer::mapped_buffer(void *ptr, size_t size, uct_mem_h memh, - uint64_t seed, const entity& entity, - uct_memory_type_t mem_type) : - m_entity(entity) -{ - m_mem.method = UCT_ALLOC_METHOD_LAST; - m_mem.address = NULL; - m_mem.md = NULL; - m_mem.memh = memh; - m_mem.mem_type = mem_type; - m_mem.length = 0; - m_buf = ptr; - m_end = (char*)ptr + size; - m_entity.get_rkey(memh, &m_rkey, mem_type); - set_iov(); -} - -void uct_test::mapped_buffer::set_iov() { m_iov.buffer = ptr(); m_iov.length = length(); m_iov.count = 1; m_iov.stride = 0; m_iov.memh = memh(); -} - -uct_test::mapped_buffer::~mapped_buffer() { - m_entity.mem_free(&m_mem, m_rkey, m_mem.mem_type); -} -void uct_test::mapped_buffer::pattern_fill(uint64_t seed) { - switch(m_mem.mem_type) { - case UCT_MD_MEM_TYPE_HOST: - pattern_fill(m_buf, (char*)m_end - (char*)m_buf, seed); - break; - case UCT_MD_MEM_TYPE_CUDA: - pattern_fill_cuda(m_buf, (char*)m_end - (char*)m_buf, seed); - break; - default: - UCS_TEST_ABORT("Wrong buffer memory type"); - } + m_entity.rkey_unpack(&m_mem, &m_rkey); + m_rkey.type = NULL; } - -void uct_test::mapped_buffer::pattern_fill(void *buffer, size_t length, uint64_t seed) -{ - uint64_t *ptr = (uint64_t*)buffer; - char *end = (char *)buffer + length; - - while ((char*)(ptr + 1) <= end) { - *ptr = seed; - seed = pat(seed); - ++ptr; 
+uct_test::mapped_buffer::~mapped_buffer() { + m_entity.rkey_release(&m_rkey); + if (m_mem.mem_type == UCS_MEMORY_TYPE_HOST) { + m_entity.mem_free_host(&m_mem); + } else { + ucs_assert(m_mem.method == UCT_ALLOC_METHOD_LAST); + m_entity.mem_type_dereg(&m_mem); + mem_buffer::release(m_mem.address, m_mem.mem_type); } - memcpy(ptr, &seed, end - (char*)ptr); } -void uct_test::mapped_buffer::pattern_fill_cuda(void *start, size_t length, uint64_t seed) -{ -#if HAVE_CUDA - void *temp; - cudaError_t cerr; - - temp = malloc(length); - ASSERT_TRUE(temp != NULL); - - pattern_fill(temp, length, seed); - - cerr = cudaMemcpy(start, temp, length, cudaMemcpyHostToDevice); - ASSERT_TRUE(cerr == cudaSuccess); - cerr = cudaDeviceSynchronize(); - ASSERT_TRUE(cerr == cudaSuccess); - free(temp); -#endif +void uct_test::mapped_buffer::pattern_fill(uint64_t seed) { + mem_buffer::pattern_fill(ptr(), length(), seed, m_mem.mem_type); } void uct_test::mapped_buffer::pattern_check(uint64_t seed) { - switch(m_mem.mem_type) { - case UCT_MD_MEM_TYPE_HOST: - pattern_check(ptr(), length(), seed); - break; - case UCT_MD_MEM_TYPE_CUDA: - pattern_check_cuda(ptr(), length(), seed); - break; - default: - UCS_TEST_ABORT("Wrong buffer memory type"); - } -} - -void uct_test::mapped_buffer::pattern_check(const void *buffer, size_t length) { - if (length > sizeof(uint64_t)) { - pattern_check(buffer, length, *(const uint64_t*)buffer); - } -} - -void uct_test::mapped_buffer::pattern_check(const void *buffer, size_t length, - uint64_t seed) { - const char* end = (const char*)buffer + length; - const uint64_t *ptr = (const uint64_t*)buffer; - - while ((const char*)(ptr + 1) <= end) { - if (*ptr != seed) { - UCS_TEST_ABORT("At offset " << ((const char*)ptr - (const char*)buffer) << ": " << - "Expected: 0x" << std::hex << seed << " " << - "Got: 0x" << std::hex << (*ptr) << std::dec); - } - seed = pat(seed); - ++ptr; - } - - size_t remainder = (end - (const char*)ptr); - if (remainder > 0) { - ucs_assert(remainder < 
sizeof(*ptr)); - uint64_t mask = UCS_MASK_SAFE(remainder * 8 * sizeof(char)); - uint64_t value = 0; - memcpy(&value, ptr, remainder); - if (value != (seed & mask)) { - UCS_TEST_ABORT("At offset " << ((const char*)ptr - (const char*)buffer) << - " (remainder " << remainder << ") : " << - "Expected: 0x" << std::hex << (seed & mask) << " " << - "Mask: 0x" << std::hex << mask << " " << - "Got: 0x" << std::hex << value << std::dec); - } - - } -} - -void uct_test::mapped_buffer::pattern_check_cuda(const void *buffer, size_t length, - uint64_t seed) { -#if HAVE_CUDA - void *temp = NULL; - cudaError_t cerr; - - temp = malloc(length); - ASSERT_TRUE(temp != NULL); - - cerr = cudaMemcpy(temp, buffer, length, cudaMemcpyDeviceToHost); - ASSERT_TRUE(cerr == cudaSuccess); - - pattern_check(temp, length, seed); - free(temp); -#endif + mem_buffer::pattern_check(ptr(), length(), seed, m_mem.mem_type); } void *uct_test::mapped_buffer::ptr() const { @@ -1005,12 +1365,6 @@ size_t uct_test::mapped_buffer::length() const { return (char*)m_end - (char*)m_buf; } -uint64_t uct_test::mapped_buffer::pat(uint64_t prev) { - /* LFSR pattern */ - static const uint64_t polynom = 1337; - return (prev << 1) | (__builtin_parityl(prev & polynom) & 1); -} - uct_mem_h uct_test::mapped_buffer::memh() const { return m_mem.memh; } @@ -1025,7 +1379,7 @@ const uct_iov_t* uct_test::mapped_buffer::iov() const { size_t uct_test::mapped_buffer::pack(void *dest, void *arg) { const mapped_buffer* buf = (const mapped_buffer*)arg; - memcpy(dest, buf->ptr(), buf->length()); + mem_buffer::copy_from(dest, buf->ptr(), buf->length(), buf->m_mem.mem_type); return buf->length(); } @@ -1055,7 +1409,15 @@ void uct_test::entity::async_wrapper::check_miss() ucs_async_check_miss(&m_async); } -ucs_status_t uct_test::send_am_message(entity *e, int wnd, uint8_t am_id, int ep_idx) +uct_test::entity::scoped_async_lock::scoped_async_lock(entity &e) : m_entity(e) { + UCS_ASYNC_BLOCK(&m_entity.m_async.m_async); +} + 
+uct_test::entity::scoped_async_lock::~scoped_async_lock() { + UCS_ASYNC_UNBLOCK(&m_entity.m_async.m_async); +} + +ucs_status_t uct_test::send_am_message(entity *e, uint8_t am_id, int ep_idx) { ssize_t res; @@ -1068,3 +1430,65 @@ ucs_status_t uct_test::send_am_message(entity *e, int wnd, uint8_t am_id, int ep return (ucs_status_t)(res >= 0 ? UCS_OK : res); } } + +void uct_test::async_event_ctx::signal() { + ASSERT_TRUE(aux_pipe_init); + ucs_async_pipe_push(&aux_pipe); +} + +bool uct_test::async_event_ctx::wait_for_event(entity &e, int timeout) { + if (wakeup_fd.fd == -1) { + /* create wakeup */ + if (e.iface_attr().cap.event_flags & UCT_IFACE_FLAG_EVENT_FD) { + ucs_status_t status = + uct_iface_event_fd_get(e.iface(), &wakeup_fd.fd); + ASSERT_UCS_OK(status); + } else { + ucs_status_t status = + ucs_async_pipe_create(&aux_pipe); + ASSERT_UCS_OK(status); + aux_pipe_init = true; + wakeup_fd.fd = ucs_async_pipe_rfd(&aux_pipe); + } + } + + int ret = poll(&wakeup_fd, 1, timeout); + EXPECT_TRUE((ret == 0) || (ret == 1)); + if (ret > 0) { + if (e.iface_attr().cap.event_flags & + UCT_IFACE_FLAG_EVENT_ASYNC_CB) { + ucs_async_pipe_drain(&aux_pipe); + } + return true; + } + + return false; +} + +void test_uct_iface_attrs::init() +{ + uct_test::init(); + m_e = uct_test::create_entity(0ul); + m_entities.push_back(m_e); +} + +void test_uct_iface_attrs::basic_iov_test() +{ + attr_map_t max_iov_map = get_num_iov(); + + EXPECT_FALSE(max_iov_map.empty()); + + if (max_iov_map.find("am") != max_iov_map.end()) { + EXPECT_EQ(max_iov_map.at("am"), m_e->iface_attr().cap.am.max_iov); + } + if (max_iov_map.find("tag") != max_iov_map.end()) { + EXPECT_EQ(max_iov_map.at("tag"), m_e->iface_attr().cap.tag.eager.max_iov); + } + if (max_iov_map.find("put") != max_iov_map.end()) { + EXPECT_EQ(max_iov_map.at("put"), m_e->iface_attr().cap.put.max_iov); + } + if (max_iov_map.find("get") != max_iov_map.end()) { + EXPECT_EQ(max_iov_map.at("get"), m_e->iface_attr().cap.get.max_iov); + } +} + diff --git 
a/test/gtest/uct/uct_test.h b/test/gtest/uct/uct_test.h index a144fc7ca6d..2f5b53dc958 100644 --- a/test/gtest/uct/uct_test.h +++ b/test/gtest/uct/uct_test.h @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. * * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2017. ALL RIGHTS RESERVED @@ -10,19 +10,22 @@ #ifndef UCT_TEST_H_ #define UCT_TEST_H_ +#include + +#include #include #include #include +#include +#include #include #include -#if HAVE_CUDA -#include -#include -#endif + #define DEFAULT_DELAY_MS 1.0 #define DEFAULT_TIMEOUT_SEC 10.0 +#define DEFAULT_VARIANT 0 #define UCT_TEST_CALL_AND_TRY_AGAIN(_func, _res) \ do { \ @@ -33,17 +36,58 @@ } while (_res == UCS_ERR_NO_RESOURCE) +#define FOR_EACH_ENTITY(_iter) \ + for (ucs::ptr_vector::const_iterator _iter = m_entities.begin(); \ + _iter != m_entities.end(); ++_iter) \ + + /* Testing resource */ struct resource { virtual ~resource() {}; virtual std::string name() const; + uct_component_h component; std::string md_name; - cpu_set_t local_cpus; + ucs_cpu_set_t local_cpus; std::string tl_name; std::string dev_name; + std::string variant_name; uct_device_type_t dev_type; - struct sockaddr_storage listen_if_addr; /* sockaddr to listen on */ - struct sockaddr_storage connect_if_addr; /* sockaddr to connect to */ + ucs::sock_addr_storage listen_sock_addr; /* sockaddr to listen on */ + ucs::sock_addr_storage connect_sock_addr; /* sockaddr to connect to */ + int variant; + + resource(); + resource(uct_component_h component, const std::string& md_name, + const ucs_cpu_set_t& local_cpus, const std::string& tl_name, + const std::string& dev_name, uct_device_type_t dev_type); + resource(uct_component_h component, const uct_md_attr_t& md_attr, + const uct_md_resource_desc_t& md_resource, + const uct_tl_resource_desc_t& tl_resource); +}; + +struct resource_speed : public resource 
{ + double bw; + + resource_speed() : resource(), bw(0) { } + resource_speed(uct_component_h component, const uct_worker_h& worker, + const uct_md_h& md, const uct_md_attr_t& md_attr, + const uct_md_resource_desc_t& md_resource, + const uct_tl_resource_desc_t& tl_resource); +}; + + +/** + * UCT test, without parameterization + */ +class uct_test_base : public ucs::test_base { +protected: + struct md_resource { + uct_component_h cmpt; + uct_component_attr_t cmpt_attr; + uct_md_resource_desc_t rsc_desc; + }; + + static std::vector enum_md_resources(); }; @@ -51,13 +95,21 @@ struct resource { * UCT test, parametrized on a transport/device. */ class uct_test : public testing::TestWithParam, - public ucs::test_base { + public uct_test_base { public: UCS_TEST_BASE_IMPL; - static std::vector enum_resources(const std::string& tl_name, - bool loopback = false); - + /* we return a vector of pointers to allow test fixtures to extend the + * resource structure. + */ + static std::vector enum_resources(const std::string& tl_name); + + /* By default generate test variant for all tls. 
If variant is specific to + * the particular transport tl_name need to be specified accordingly */ + static void generate_test_variant(int variant, + const std::string &variant_name, + std::vector& test_res, + const std::string &tl_name=""); uct_test(); virtual ~uct_test(); @@ -73,25 +125,33 @@ class uct_test : public testing::TestWithParam, class entity { public: typedef uct_test::atomic_mode atomic_mode; + typedef std::vector< ucs::handle > eps_vec_t; entity(const resource& resource, uct_iface_config_t *iface_config, uct_iface_params_t *params, uct_md_config_t *md_config); - void mem_alloc(size_t length, uct_allocated_memory_t *mem, - uct_rkey_bundle *rkey_bundle, int mem_type) const; + entity(const resource& resource, uct_md_config_t *md_config, + uct_cm_config_t *cm_config); + + void mem_alloc_host(size_t length, uct_allocated_memory_t *mem) const; + + void mem_free_host(const uct_allocated_memory_t *mem) const; + + void mem_type_reg(uct_allocated_memory_t *mem) const; + + void mem_type_dereg(uct_allocated_memory_t *mem) const; - void get_rkey(uct_mem_h memh, uct_rkey_bundle *rkey_bundle, - int mem_type) const; + void rkey_unpack(const uct_allocated_memory_t *mem, + uct_rkey_bundle *rkey_bundle) const; - void mem_free(const uct_allocated_memory_t *mem, - const uct_rkey_bundle_t& rkey, - const uct_memory_type_t mem_type) const; + void rkey_release(const uct_rkey_bundle *rkey_bundle) const; unsigned progress() const; bool is_caps_supported(uint64_t required_flags); - void check_caps(uint64_t required_flags, uint64_t invalid_flags = 0); - void check_atomics(uint64_t required_ops, atomic_mode mode); + bool check_caps(uint64_t required_flags, uint64_t invalid_flags = 0); + bool check_event_caps(uint64_t required_flags, uint64_t invalid_flags = 0); + bool check_atomics(uint64_t required_ops, atomic_mode mode); uct_md_h md() const; @@ -99,6 +159,12 @@ class uct_test : public testing::TestWithParam, uct_worker_h worker() const; + uct_cm_h cm() const; + + const 
uct_cm_attr_t& cm_attr() const; + + uct_listener_h listener() const; + uct_iface_h iface() const; const uct_iface_attr& iface_attr() const; @@ -107,21 +173,46 @@ class uct_test : public testing::TestWithParam, uct_ep_h ep(unsigned index) const; + eps_vec_t& eps(); + size_t num_eps() const; + void reserve_ep(unsigned index); + void create_ep(unsigned index); void destroy_ep(unsigned index); + void revoke_ep(unsigned index); void destroy_eps(); void connect(unsigned index, entity& other, unsigned other_index); void connect(unsigned index, entity& other, unsigned other_index, - ucs_sock_addr_t *remote_addr); + const ucs::sock_addr_storage &remote_addr, + uct_cm_ep_priv_data_pack_callback_t pack_cb, + uct_cm_ep_client_connect_callback_t connect_cb, + uct_ep_disconnect_cb_t disconnect_cb, + void *user_data); void connect_to_iface(unsigned index, entity& other); void connect_to_ep(unsigned index, entity& other, unsigned other_index); - void connect_to_sockaddr(unsigned index, entity& other, ucs_sock_addr_t *remote_addr); + void connect_to_sockaddr(unsigned index, entity& other, + const ucs::sock_addr_storage &remote_addr, + uct_cm_ep_priv_data_pack_callback_t pack_cb, + uct_cm_ep_client_connect_callback_t connect_cb, + uct_ep_disconnect_cb_t disconnect_cb, + void *user_sata); + + void listen(const ucs::sock_addr_storage &listen_addr, + const uct_listener_params_t ¶ms); + void disconnect(uct_ep_h ep); void flush() const; - static std::string client_priv_data; - static size_t client_cb_arg; + size_t max_conn_priv; + + class scoped_async_lock { + public: + scoped_async_lock(entity &e); + ~scoped_async_lock(); + private: + entity &m_entity; + }; private: class async_wrapper { @@ -133,35 +224,33 @@ class uct_test : public testing::TestWithParam, private: async_wrapper(const async_wrapper &); }; - typedef std::vector< ucs::handle > eps_vec_t; entity(const entity&); - void reserve_ep(unsigned index); void connect_p2p_ep(uct_ep_h from, uct_ep_h to); void cuda_mem_alloc(size_t 
length, uct_allocated_memory_t *mem) const; void cuda_mem_free(const uct_allocated_memory_t *mem) const; - static ssize_t client_priv_data_cb(void *arg, const char *dev_name, - void *priv_data); - - ucs::handle m_md; - uct_md_attr_t m_md_attr; - mutable async_wrapper m_async; - ucs::handle m_worker; - ucs::handle m_iface; - eps_vec_t m_eps; - uct_iface_attr_t m_iface_attr; - uct_iface_params_t m_iface_params; + + const resource m_resource; + ucs::handle m_md; + uct_md_attr_t m_md_attr; + mutable async_wrapper m_async; + ucs::handle m_worker; + ucs::handle m_cm; + uct_cm_attr_t m_cm_attr; + ucs::handle m_listener; + ucs::handle m_iface; + eps_vec_t m_eps; + uct_iface_attr_t m_iface_attr; + uct_iface_params_t m_iface_params; }; class mapped_buffer { public: - mapped_buffer(size_t size, uint64_t seed, const entity& entity, size_t offset = 0, - uct_memory_type_t mem_type = UCT_MD_MEM_TYPE_HOST); - mapped_buffer(void *ptr, size_t size, uct_mem_h memh, uint64_t seed, - const entity& entity, - uct_memory_type_t mem_type = UCT_MD_MEM_TYPE_HOST); + mapped_buffer(size_t size, uint64_t seed, const entity& entity, + size_t offset = 0, + ucs_memory_type_t mem_type = UCS_MEMORY_TYPE_HOST); virtual ~mapped_buffer(); void *ptr() const; @@ -175,14 +264,8 @@ class uct_test : public testing::TestWithParam, void pattern_check(uint64_t seed); static size_t pack(void *dest, void *arg); - static void pattern_fill(void *buffer, size_t length, uint64_t seed); - static void pattern_fill_cuda(void *buffer, size_t length, uint64_t seed); - static void pattern_check(const void *buffer, size_t length); - static void pattern_check(const void *buffer, size_t length, uint64_t seed); - static void pattern_check_cuda(const void *buffer, size_t length, uint64_t seed); + private: - static uint64_t pat(uint64_t prev); - void set_iov(); const uct_test::entity& m_entity; @@ -193,6 +276,33 @@ class uct_test : public testing::TestWithParam, uct_iov_t m_iov; }; + class async_event_ctx { + public: + 
async_event_ctx() { + wakeup_fd.fd = -1; + wakeup_fd.events = POLLIN; + wakeup_fd.revents = 0; + aux_pipe_init = false; + memset(&aux_pipe, 0, sizeof(aux_pipe)); + } + + ~async_event_ctx() { + if (aux_pipe_init) { + ucs_async_pipe_destroy(&aux_pipe); + } + } + + void signal(); + bool wait_for_event(entity &e, int timeout); + + private: + struct pollfd wakeup_fd; + /* this used for UCT TLs that support async event cb + * for event notification */ + ucs_async_pipe_t aux_pipe; + bool aux_pipe_init; + }; + template static std::vector filter_resources(const std::vector& resources, const std::string& tl_name) @@ -218,6 +328,18 @@ class uct_test : public testing::TestWithParam, } } + void wait_for_bits(volatile uint64_t *flag, uint64_t mask, + double timeout = DEFAULT_TIMEOUT_SEC) const + { + ucs_time_t deadline = ucs_get_time() + + ucs_time_from_sec(timeout) * + ucs::test_time_multiplier(); + while ((ucs_get_time() < deadline) && (!ucs_test_all_flags(*flag, mask))) { + /* Don't do short_progress_loop() to avoid extra timings */ + progress(); + } + } + template void wait_for_value(volatile T *var, T value, bool progress, double timeout = DEFAULT_TIMEOUT_SEC) const @@ -241,45 +363,79 @@ class uct_test : public testing::TestWithParam, void stats_activate(); void stats_restore(); + virtual bool has_transport(const std::string& tl_name) const; + virtual bool has_ud() const; + virtual bool has_rc() const; + virtual bool has_rc_or_dc() const; + virtual bool has_ib() const; + bool is_caps_supported(uint64_t required_flags); - void check_caps(uint64_t required_flags, uint64_t invalid_flags = 0); - void check_caps(const entity& e, uint64_t required_flags, uint64_t invalid_flags = 0); - void check_atomics(uint64_t required_ops, atomic_mode mode); + bool check_caps(uint64_t required_flags, uint64_t invalid_flags = 0); + void check_caps_skip(uint64_t required_flags, uint64_t invalid_flags = 0); + bool check_event_caps(uint64_t required_flags, uint64_t invalid_flags = 0); + bool 
check_atomics(uint64_t required_ops, atomic_mode mode); const entity& ent(unsigned index) const; unsigned progress() const; void flush(ucs_time_t deadline = ULONG_MAX) const; virtual void short_progress_loop(double delay_ms = DEFAULT_DELAY_MS) const; virtual void twait(int delta_ms = DEFAULT_DELAY_MS) const; - static void set_sockaddr_resources(uct_md_h pd, char *md_name, cpu_set_t local_cpus, - std::vector& all_resources); - static void set_interface_rscs(char *md_name, cpu_set_t local_cpus, - struct ifaddrs *ifa, + static void set_cm_resources(std::vector& all_resources); + static bool is_interface_usable(struct ifaddrs *ifa, const char *name); + static void set_md_sockaddr_resources(const md_resource& md_rsc, uct_md_h pm, + ucs_cpu_set_t local_cpus, + std::vector& all_resources); + static void set_cm_sockaddr_resources(uct_component_h cmpt, const char *cmpt_name, + ucs_cpu_set_t local_cpus, + std::vector& all_resources); + static void set_interface_rscs(uct_component_h comt, const char * name, + ucs_cpu_set_t local_cpus, struct ifaddrs *ifa, std::vector& all_resources); static void init_sockaddr_rsc(resource *rsc, struct sockaddr *listen_addr, struct sockaddr *connect_addr, size_t size); - static const char *uct_mem_type_names[]; - uct_test::entity* create_entity(size_t rx_headroom, - uct_error_handler_t err_handler = NULL); + uct_error_handler_t err_handler = NULL, + uct_tag_unexp_eager_cb_t eager_cb = NULL, + uct_tag_unexp_rndv_cb_t rndv_cb = NULL, + void *eager_arg = NULL, + void *rndv_arg = NULL, + uct_async_event_cb_t async_event_cb = NULL, + void *async_event_arg = NULL); uct_test::entity* create_entity(uct_iface_params_t ¶ms); + uct_test::entity* create_entity(); int max_connections(); + int max_connect_batch(); + + void reduce_tl_send_queues(); - ucs_status_t send_am_message(entity *e, int wnd, uint8_t am_id = 0, int ep_idx = 0); + ucs_status_t send_am_message(entity *e, uint8_t am_id = 0, int ep_idx = 0); ucs::ptr_vector m_entities; uct_iface_config_t 
*m_iface_config; uct_md_config_t *m_md_config; - + uct_cm_config_t *m_cm_config; }; std::ostream& operator<<(std::ostream& os, const resource* resource); +class test_uct_iface_attrs : public uct_test { +public: + typedef std::map attr_map_t; + + void init(); + virtual attr_map_t get_num_iov() = 0; + void basic_iov_test(); + +protected: + entity *m_e; +}; + + #define UCT_TEST_IB_TLS \ rc_mlx5, \ - rc, \ + rc_verbs, \ dc_mlx5, \ - ud, \ + ud_verbs, \ ud_mlx5, \ cm @@ -292,7 +448,9 @@ std::ostream& operator<<(std::ostream& os, const resource* resource); ugni_udt, \ ugni_smsg, \ tcp, \ - mm, \ + posix, \ + sysv, \ + xpmem, \ cma, \ knem @@ -300,9 +458,13 @@ std::ostream& operator<<(std::ostream& os, const resource* resource); cuda_copy, \ gdr_copy +#define UCT_TEST_ROCM_MEM_TYPE_TLS \ + rocm_copy + #define UCT_TEST_TLS \ UCT_TEST_NO_SELF_TLS, \ UCT_TEST_CUDA_MEM_TYPE_TLS, \ + UCT_TEST_ROCM_MEM_TYPE_TLS, \ self /** @@ -336,6 +498,19 @@ std::ostream& operator<<(std::ostream& os, const resource* resource); #define UCT_INSTANTIATE_SOCKADDR_TEST_CASE(_test_case) \ UCS_PP_FOREACH(_UCT_INSTANTIATE_TEST_CASE, _test_case, UCT_TEST_SOCKADDR_TLS) +/** + * Instantiate the parametrized test case for the RC/DC transports. + * + * @param _test_case Test case class, derived from uct_test. 
+ */ +#define UCT_INSTANTIATE_RC_TEST_CASE(_test_case) \ + _UCT_INSTANTIATE_TEST_CASE(_test_case, rc_verbs) \ + _UCT_INSTANTIATE_TEST_CASE(_test_case, rc_mlx5) + +#define UCT_INSTANTIATE_RC_DC_TEST_CASE(_test_case) \ + UCT_INSTANTIATE_RC_TEST_CASE(_test_case) \ + _UCT_INSTANTIATE_TEST_CASE(_test_case, dc_mlx5) + std::ostream& operator<<(std::ostream& os, const uct_tl_resource_desc_t& resource); #endif diff --git a/test/mpi/Makefile.am b/test/mpi/Makefile.am index 5cee0cef173..0bb8d5fcbcb 100644 --- a/test/mpi/Makefile.am +++ b/test/mpi/Makefile.am @@ -8,6 +8,8 @@ CC = $(MPICC) LD = $(MPICC) +if HAVE_MPICC + # Test application for memory hooks when running with MPI # (some MPIs have hooks of their own and we make sure ours still work) noinst_PROGRAMS = test_memhooks shmem_pingpong @@ -29,3 +31,5 @@ libtest_memhooks_la_LDFLAGS = -rpath /nowhere # Force shared library # SHMEM ping-pong test shmem_pingpong_LDFLAGS = -loshmem shmem_pingpong_SOURCES = shmem_pingpong.c + +endif diff --git a/test/mpi/test_memhooks.c b/test/mpi/test_memhooks.c index e6366b14089..d4568ceaa03 100644 --- a/test/mpi/test_memhooks.c +++ b/test/mpi/test_memhooks.c @@ -332,6 +332,7 @@ int malloc_hooks_run_flags(void *dl, ucm_event_type_t events) dlclose(dl_test); free(ptr_malloc_core); /* This should still work */ ptr_malloc_core = NULL; + malloc_trim(0); if (events & UCM_EVENT_VM_UNMAPPED) { CHKERR_JUMP(total_unmapped == 0, "No callback for munmap from malloc", fail); } diff --git a/ucx.spec.in b/ucx.spec.in index 767675cd9bf..955662501f5 100644 --- a/ucx.spec.in +++ b/ucx.spec.in @@ -29,7 +29,16 @@ BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) # UCX currently supports only the following architectures ExclusiveArch: aarch64 ppc64le x86_64 -BuildRequires: automake autoconf libtool gcc-c++ numactl-devel +%if %{defined extra_deps} +Requires: %{?extra_deps} +%endif + +BuildRequires: automake autoconf libtool gcc-c++ +%if "%{_vendor}" == "suse" +BuildRequires: libnuma-devel 
+%else +BuildRequires: numactl-devel +%endif %if %{with cma} BuildRequires: glibc-devel >= 2.15 %endif @@ -56,14 +65,20 @@ BuildRequires: xpmem-devel %endif %description -UCX stands for Unified Communication X. UCX provides an optimized communication -layer for Message Passing (MPI), PGAS/OpenSHMEM libraries and RPC/data-centric +UCX is an optimized communication framework for high-performance distributed applications. UCX utilizes high-speed networks, such as RDMA (InfiniBand, RoCE, etc), Cray Gemini or Aries, for inter-node communication. If no such network is available, TCP is used instead. UCX supports efficient transfer of data in -either main memory (RAM) or GPU memory (through CUDA and ROCm libraries). -In addition, UCX provides efficient intra-node communication, by leveraging the +either main memory (RAM) or GPU memory (through CUDA and ROCm libraries). In +addition, UCX provides efficient intra-node communication, by leveraging the following shared memory mechanisms: posix, sysv, cma, knem, and xpmem. +The acronym UCX stands for "Unified Communication X". + +This package was built from '@SCM_BRANCH@' branch, commit @SCM_VERSION@. + +%if "%{_vendor}" == "suse" +%debug_package +%endif %package devel Requires: %{name}%{?_isa} = %{version}-%{release} @@ -84,6 +99,7 @@ Provides header files and examples for developing with UCX. --disable-debug \ --disable-assertions \ --disable-params-check \ + --without-java \ %_enable_arg cma cma \ %_with_arg cuda cuda \ %_with_arg gdrcopy gdrcopy \ @@ -284,6 +300,12 @@ process to map the memory of another process into its virtual address space. %changelog +* Mon Feb 10 2020 Yossi Itigin 1.9.0-1 +- Bump version to 1.9.0 +* Sun Sep 22 2019 Yossi Itigin 1.8.0-1 +- Bump version to 1.8.0 +* Sun Mar 24 2019 Yossi Itigin 1.7.0-1 +- Bump version to 1.7.0 * Thu Jan 24 2019 Yossi Itigin 1.6.0-1 - Add cma, knem, and xpmem sub-packages * Tue Nov 20 2018 Yossi Itigin 1.6.0-1